From d2f62a85fc63a356138e9901d15db30283ecd32f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 08:54:03 -0800 Subject: [PATCH 001/321] Implement a map from to compression info Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.hpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 44ece671155..3fd252be73b 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -25,12 +25,27 @@ #include #include +#include +#include + namespace cudf::io::orc::detail { /** * @brief Struct to store file-level data that remains constant for all chunks being read. */ struct file_intermediate_data { + using chunk_index = std::tuple; + using chunk_comp_info = std::tuple; + + struct index_hash { + std::size_t operator()(chunk_index const& index) const + { + return std::hash()(std::get<0>(index)) ^ std::hash()(std::get<1>(index)) ^ + std::hash()(std::get<2>(index)); + } + }; + std::unordered_map compinfo_map; + std::vector> lvl_stripe_data; std::vector>> null_count_prefix_sums; From b8e1fd79860313b62f2c02ab7fbc595e812a7b29 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 10:49:28 -0800 Subject: [PATCH 002/321] Change benchmark Signed-off-by: Nghia Truong --- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/benchmarks/io/orc/orc_reader_input.cpp | 33 ++++++++-------------- 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 35b03fa33d0..84f583e8ed2 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -255,7 +255,7 @@ ConfigureNVBench( # ################################################################################################## # * orc reader benchmark -------------------------------------------------------------------------- -ConfigureNVBench(ORC_READER_NVBENCH io/orc/orc_reader_input.cpp io/orc/orc_reader_options.cpp) +ConfigureNVBench(ORC_READER_NVBENCH io/orc/orc_reader_input.cpp) # ################################################################################################## # * csv reader benchmark -------------------------------------------------------------------------- diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp index fdb7dbe59b8..fd27b56ef0e 100644 --- a/cpp/benchmarks/io/orc/orc_reader_input.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp @@ -88,9 +88,6 @@ void BM_orc_read_io_compression( nvbench::type_list, nvbench::enum_type>) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), - static_cast(data_type::FLOAT), - static_cast(data_type::DECIMAL), - static_cast(data_type::TIMESTAMP), static_cast(data_type::STRING), static_cast(data_type::LIST), static_cast(data_type::STRUCT)}); @@ -116,29 +113,23 @@ void BM_orc_read_io_compression( orc_read_common(num_rows_written, source_sink, state); } -using d_type_list = nvbench::enum_type_list; +using d_type_list = nvbench:: + enum_type_list; -using io_list = nvbench::enum_type_list; +using io_list = + nvbench::enum_type_list; using compression_list = nvbench::enum_type_list; -NVBENCH_BENCH_TYPES(BM_orc_read_data, - NVBENCH_TYPE_AXES(d_type_list, - nvbench::enum_type_list)) - .set_name("orc_read_decode") - .set_type_axes_names({"data_type", "io"}) - .set_min_samples(4) - .add_int64_axis("cardinality", {0, 1000}) - .add_int64_axis("run_length", {1, 32}); +// 
NVBENCH_BENCH_TYPES(BM_orc_read_data, +// NVBENCH_TYPE_AXES(d_type_list, +// nvbench::enum_type_list)) +// .set_name("orc_read_decode") +// .set_type_axes_names({"data_type", "io"}) +// .set_min_samples(4) +// .add_int64_axis("cardinality", {0, 1000}) +// .add_int64_axis("run_length", {1, 32}); NVBENCH_BENCH_TYPES(BM_orc_read_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list)) .set_name("orc_read_io_compression") From f6f479c758f9d613357e8ed0ee3577e853f08de5 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 10:49:39 -0800 Subject: [PATCH 003/321] Add comment Signed-off-by: Nghia Truong --- cpp/src/io/orc/aggregate_orc_metadata.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 8cae1ff5309..02bf74e9c01 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -187,6 +187,8 @@ aggregate_orc_metadata::select_stripes( "Invalid stripe index"); stripe_infos.push_back( std::pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); + + // TODO: check for overflow here. rows_to_read += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; } selected_stripes_mapping.push_back({static_cast(src_file_idx), stripe_infos}); From bd308e649506edc014883b27ec145f39caabf6c1 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 10:49:49 -0800 Subject: [PATCH 004/321] Implementing query for stripe sizes Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.hpp | 5 + cpp/src/io/orc/reader_impl_chunking.hpp | 1 + cpp/src/io/orc/reader_impl_preprocess.cu | 330 +++++++++++++++++++++++ 3 files changed, 336 insertions(+) diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 6561c08f2d9..b0869125fe9 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -77,6 +77,11 @@ class reader::impl { std::optional const& num_rows_opt, std::vector> const& stripes); + /** + * @brief Compute stripe sizes. + */ + void query_stripe_compression_info(); + /** * @brief Create the output table metadata from file metadata. * diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 3fd252be73b..43883b34077 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -45,6 +45,7 @@ struct file_intermediate_data { } }; std::unordered_map compinfo_map; + bool compinfo_ready{false}; std::vector> lvl_stripe_data; std::vector>> null_count_prefix_sums; diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 179afa12bd5..9af2bdb2aa8 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -646,6 +646,7 @@ void aggregate_child_meta(std::size_t level, for (size_type id = 0; id < p_col.num_children; id++) { auto const child_col_idx = index + id; + // TODO: Check for overflow here. 
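      // (Illustrative sketch only, not part of this patch: one way the TODO above
      // could be resolved, assuming the per-column counters are cudf::size_type:
      //   CUDF_EXPECTS(child_rows <= std::numeric_limits<size_type>::max() -
      //                  num_child_rows[child_col_idx],
      //                "total number of child rows exceeds column size limit");
      // placed before the accumulation would turn a silent wrap into an error.)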
num_child_rows[child_col_idx] += child_rows; num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows; // start row could be different for each column when there is nesting at each stripe level @@ -697,6 +698,332 @@ void generate_offsets_for_list(host_span buff_data, rmm::cuda_ } // namespace +void reader::impl::query_stripe_compression_info() +{ + if (_file_itm_data->compinfo_ready) { return; } + if (_selected_columns.num_levels() == 0) { return; } + + auto const rows_to_skip = _file_itm_data->rows_to_skip; + auto const rows_to_read = _file_itm_data->rows_to_read; + auto const& selected_stripes = _file_itm_data->selected_stripes; + + // If no rows or stripes to read, return empty columns + // TODO : remove? + if (rows_to_read == 0 || selected_stripes.empty()) { return; } + + // Set up table for converting timestamp columns from local to UTC time + auto const tz_table = [&, &selected_stripes = selected_stripes] { + auto const has_timestamp_column = std::any_of( + _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) { + return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) { + return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP; + }); + }); + + return has_timestamp_column + ? cudf::detail::make_timezone_transition_table( + {}, selected_stripes[0].stripe_info[0].second->writerTimezone, _stream) + : std::make_unique(); + }(); + + auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; + auto& null_count_prefix_sums = _file_itm_data->null_count_prefix_sums; + lvl_stripe_data.resize(_selected_columns.num_levels()); + + _out_buffers.resize(_selected_columns.num_levels()); + + // Iterates through levels of nested columns, child column will be one level down + // compared to parent column. + auto& col_meta = *_col_meta; + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + auto& columns_level = _selected_columns.levels[level]; + // Association between each ORC column and its cudf::column + col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); + std::vector nested_cols; + + // Get a list of column data types + std::vector column_types; + for (auto& col : columns_level) { + auto col_type = to_cudf_type(_metadata.get_col_type(col.id).kind, + _use_np_dtypes, + _timestamp_type.id(), + to_cudf_decimal_type(_decimal128_columns, _metadata, col.id)); + CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); + if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or + col_type == type_id::DECIMAL128) { + // sign of the scale is changed since cuDF follows c++ libraries like CNL + // which uses negative scaling, but liborc and other libraries + // follow positive scaling. + auto const scale = + -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); + column_types.emplace_back(col_type, scale); + } else { + column_types.emplace_back(col_type); + } + + // Map each ORC column to its column + col_meta.orc_col_map[level][col.id] = column_types.size() - 1; + if (col_type == type_id::LIST or col_type == type_id::STRUCT) { + nested_cols.emplace_back(col); + } + } + + // Get the total number of stripes across all input files. 
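    // (Note, not from the original change: std::accumulate deduces its result type
    // from the init argument, so a plain `0` keeps the running sum in int; an
    // explicit std::size_t{0} init would keep the whole accumulation in std::size_t.)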
+ std::size_t total_num_stripes = + std::accumulate(selected_stripes.begin(), + selected_stripes.end(), + 0, + [](std::size_t sum, auto& stripe_source_mapping) { + return sum + stripe_source_mapping.stripe_info.size(); + }); + auto const num_columns = columns_level.size(); + cudf::detail::hostdevice_2dvector chunks( + total_num_stripes, num_columns, _stream); + memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); + + const bool use_index = + _use_index && + // Do stripes have row group index + _metadata.is_row_grp_idx_present() && + // Only use if we don't have much work with complete columns & stripes + // TODO: Consider nrows, gpu, and tune the threshold + (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && + _metadata.get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) && + // Only use if first row is aligned to a stripe boundary + // TODO: Fix logic to handle unaligned rows + (rows_to_skip == 0); + + // Logically view streams as columns + std::vector stream_info; + + null_count_prefix_sums.emplace_back(); + null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); + std::generate_n(std::back_inserter(null_count_prefix_sums.back()), + _selected_columns.levels[level].size(), + [&]() { + return cudf::detail::make_zeroed_device_uvector_async( + total_num_stripes, _stream, rmm::mr::get_current_device_resource()); + }); + + // Tracker for eventually deallocating compressed and uncompressed data + auto& stripe_data = lvl_stripe_data[level]; + + std::size_t stripe_start_row = 0; + std::size_t num_dict_entries = 0; + std::size_t num_rowgroups = 0; + int stripe_idx = 0; + + std::vector, std::size_t>> read_tasks; + for (auto const& stripe_source_mapping : selected_stripes) { + // Iterate through the source files selected stripes + for (auto const& stripe : stripe_source_mapping.stripe_info) { + auto const stripe_info = stripe.first; + auto const stripe_footer = stripe.second; + + auto stream_count = stream_info.size(); + auto const total_data_size = gather_stream_info(stripe_idx, + stripe_info, + stripe_footer, + col_meta.orc_col_map[level], + _metadata.get_types(), + use_index, + level == 0, + &num_dict_entries, + stream_info, + chunks); + + auto const is_stripe_data_empty = total_data_size == 0; + CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, + "Invalid index rowgroup stream data"); + + // Buffer needs to be padded. + // Required by `copy_uncompressed_kernel`. 
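        // (Illustrative, not part of this patch: round_up_safe pads the byte count
        // to the next multiple of its second argument, e.g.
        //   round_up_safe(1001, 8) == 1008
        // so device copies may safely read whole words past the last valid byte.)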
+ stripe_data.emplace_back( + cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), _stream); + auto dst_base = static_cast(stripe_data.back().data()); + + // Coalesce consecutive streams into one read + while (not is_stripe_data_empty and stream_count < stream_info.size()) { + auto const d_dst = dst_base + stream_info[stream_count].dst_pos; + auto const offset = stream_info[stream_count].offset; + auto len = stream_info[stream_count].length; + stream_count++; + + while (stream_count < stream_info.size() && + stream_info[stream_count].offset == offset + len) { + len += stream_info[stream_count].length; + stream_count++; + } + if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] + .source->is_device_read_preferred(len)) { + read_tasks.push_back( + std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] + .source->device_read_async(offset, len, d_dst, _stream), + len)); + + } else { + auto const buffer = + _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read( + offset, len); + CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); + CUDF_CUDA_TRY( + cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); + _stream.synchronize(); + } + } + + auto const num_rows_per_stripe = stripe_info->numberOfRows; + auto const rowgroup_id = num_rowgroups; + auto stripe_num_rowgroups = 0; + if (use_index) { + stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / + _metadata.get_row_index_stride(); + } + // Update chunks to reference streams pointers + for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { + auto& chunk = chunks[stripe_idx][col_idx]; + // start row, number of rows in a each stripe and total number of rows + // may change in lower levels of nesting + chunk.start_row = (level == 0) + ? stripe_start_row + : col_meta.child_start_row[stripe_idx * num_columns + col_idx]; + chunk.num_rows = + (level == 0) ? stripe_info->numberOfRows + : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; + chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx]; + chunk.parent_validity_info = + (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx]; + chunk.parent_null_count_prefix_sums = + (level == 0) + ? nullptr + : null_count_prefix_sums[level - 1][col_meta.parent_column_index[col_idx]].data(); + chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind; + chunk.type_kind = _metadata.per_file_metadata[stripe_source_mapping.source_idx] + .ff.types[columns_level[col_idx].id] + .kind; + // num_child_rows for a struct column will be same, for other nested types it will be + // calculated. + chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; + chunk.dtype_id = column_types[col_idx].id(); + chunk.decimal_scale = _metadata.per_file_metadata[stripe_source_mapping.source_idx] + .ff.types[columns_level[col_idx].id] + .scale.value_or(0); + + chunk.rowgroup_id = rowgroup_id; + chunk.dtype_len = (column_types[col_idx].id() == type_id::STRING) + ? sizeof(string_index_pair) + : ((column_types[col_idx].id() == type_id::LIST) or + (column_types[col_idx].id() == type_id::STRUCT)) + ? 
sizeof(size_type) + : cudf::size_of(column_types[col_idx]); + chunk.num_rowgroups = stripe_num_rowgroups; + if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _timestamp_type.id(); } + if (not is_stripe_data_empty) { + for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { + chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; + } + } + } + stripe_start_row += num_rows_per_stripe; + num_rowgroups += stripe_num_rowgroups; + + stripe_idx++; + } + } + for (auto& task : read_tasks) { + CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); + } + + if (stripe_data.empty()) { continue; } + + // Process dataset chunk pages into output columns + auto row_groups = + cudf::detail::hostdevice_2dvector(num_rowgroups, num_columns, _stream); + if (level > 0 and row_groups.size().first) { + cudf::host_span row_groups_span(row_groups.base_host_ptr(), + num_rowgroups * num_columns); + auto& rw_grp_meta = col_meta.rwgrp_meta; + + // Update start row and num rows per row group + std::transform(rw_grp_meta.begin(), + rw_grp_meta.end(), + row_groups_span.begin(), + rw_grp_meta.begin(), + [&](auto meta, auto& row_grp) { + row_grp.num_rows = meta.num_rows; + row_grp.start_row = meta.start_row; + return meta; + }); + } + // Setup row group descriptors if using indexes + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + auto decomp_data = decompress_stripe_data(*_metadata.per_file_metadata[0].decompressor, + stripe_data, + stream_info, + chunks, + row_groups, + total_num_stripes, + _metadata.get_row_index_stride(), + level == 0, + _stream); + stripe_data.clear(); + stripe_data.push_back(std::move(decomp_data)); + } else { + // Set decompressed data size equal to the input size. + // TODO + } + + for (std::size_t i = 0; i < column_types.size(); ++i) { + bool is_nullable = false; + for (std::size_t j = 0; j < total_num_stripes; ++j) { + if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { + is_nullable = true; + break; + } + } + auto is_list_type = (column_types[i].id() == type_id::LIST); + auto n_rows = (level == 0) ? 
rows_to_read : col_meta.num_child_rows[i]; + // For list column, offset column will be always size + 1 + if (is_list_type) n_rows++; + _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr); + } + + decode_stream_data(num_dict_entries, + rows_to_skip, + _metadata.get_row_index_stride(), + level, + tz_table->view(), + chunks, + row_groups, + _out_buffers[level], + _stream, + _mr); + + if (nested_cols.size()) { + // Extract information to process nested child columns + scan_null_counts(chunks, null_count_prefix_sums[level], _stream); + + row_groups.device_to_host_sync(_stream); + aggregate_child_meta( + level, _selected_columns, chunks, row_groups, nested_cols, _out_buffers[level], col_meta); + + // ORC stores number of elements at each row, so we need to generate offsets from that + std::vector buff_data; + std::for_each( + _out_buffers[level].begin(), _out_buffers[level].end(), [&buff_data](auto& out_buffer) { + if (out_buffer.type.id() == type_id::LIST) { + auto data = static_cast(out_buffer.data()); + buff_data.emplace_back(list_buffer_data{data, out_buffer.size}); + } + }); + + if (not buff_data.empty()) { generate_offsets_for_list(buff_data, _stream); } + } + } // end loop level + + _file_itm_data->compinfo_ready = true; +} + void reader::impl::prepare_data(uint64_t skip_rows, std::optional const& num_rows_opt, std::vector> const& stripes) @@ -722,6 +1049,8 @@ void reader::impl::prepare_data(uint64_t skip_rows, // If no rows or stripes to read, return empty columns if (rows_to_read == 0 || selected_stripes.empty()) { return; } + // query_stripe_compression_info(); + // Set up table for converting timestamp columns from local to UTC time auto const tz_table = [&, &selected_stripes = selected_stripes] { auto const has_timestamp_column = std::any_of( @@ -996,6 +1325,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, } } + for (std::size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; for (std::size_t j = 0; j < total_num_stripes; ++j) { From 3ad2a6f5cecabee5e7d365002aeaf9b0acfb1d08 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 11:29:56 -0800 Subject: [PATCH 005/321] Remove redundant code Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_preprocess.cu | 270 ++++++++--------------- 1 file changed, 91 insertions(+), 179 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 9af2bdb2aa8..3c392d25d30 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -64,7 +64,7 @@ struct orc_stream_info { uint64_t offset; // offset in file std::size_t dst_pos; // offset in memory relative to start of compressed stripe data std::size_t length; // length in file - uint32_t stripe_idx; // stripe index + uint32_t stripe_idx; // stripe processing index, not stripe index in source }; /** @@ -74,12 +74,45 @@ std::size_t gather_stream_info(std::size_t stripe_index, orc::StripeInformation const* stripeinfo, orc::StripeFooter const* stripefooter, host_span orc2gdf, - host_span types, - bool use_index, - bool apply_struct_map, - std::size_t* num_dictionary_entries, - std::vector& stream_info, - cudf::detail::hostdevice_2dvector& chunks) + std::vector& stream_info) +{ + uint64_t src_offset = 0; + uint64_t dst_offset = 0; + + for (auto const& stream : stripefooter->streams) { + if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { + dst_offset += stream.length; + continue; + } + + auto const column_id = 
*stream.column_id; + auto col = orc2gdf[column_id]; + + if (col != -1) { + stream_info.emplace_back( + stripeinfo->offset + src_offset, dst_offset, stream.length, stripe_index); + dst_offset += stream.length; + } + src_offset += stream.length; + } + + return dst_offset; +} + +/** + * @brief Function that populates column descriptors stream/chunk + */ +std::size_t gather_stream_info_and_update_chunks( + std::size_t stripe_index, + orc::StripeInformation const* stripeinfo, + orc::StripeFooter const* stripefooter, + host_span orc2gdf, + host_span types, + bool use_index, + bool apply_struct_map, + std::size_t* num_dictionary_entries, + std::vector& stream_info, + cudf::detail::hostdevice_2dvector& chunks) { uint64_t src_offset = 0; uint64_t dst_offset = 0; @@ -726,12 +759,9 @@ void reader::impl::query_stripe_compression_info() : std::make_unique(); }(); - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; - auto& null_count_prefix_sums = _file_itm_data->null_count_prefix_sums; + auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; lvl_stripe_data.resize(_selected_columns.num_levels()); - _out_buffers.resize(_selected_columns.num_levels()); - // Iterates through levels of nested columns, child column will be one level down // compared to parent column. auto& col_meta = *_col_meta; @@ -796,22 +826,10 @@ void reader::impl::query_stripe_compression_info() // Logically view streams as columns std::vector stream_info; - null_count_prefix_sums.emplace_back(); - null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); - std::generate_n(std::back_inserter(null_count_prefix_sums.back()), - _selected_columns.levels[level].size(), - [&]() { - return cudf::detail::make_zeroed_device_uvector_async( - total_num_stripes, _stream, rmm::mr::get_current_device_resource()); - }); - // Tracker for eventually deallocating compressed and uncompressed data auto& stripe_data = lvl_stripe_data[level]; - std::size_t stripe_start_row = 0; - std::size_t num_dict_entries = 0; - std::size_t num_rowgroups = 0; - int stripe_idx = 0; + int stripe_idx = 0; std::vector, std::size_t>> read_tasks; for (auto const& stripe_source_mapping : selected_stripes) { @@ -821,16 +839,8 @@ void reader::impl::query_stripe_compression_info() auto const stripe_footer = stripe.second; auto stream_count = stream_info.size(); - auto const total_data_size = gather_stream_info(stripe_idx, - stripe_info, - stripe_footer, - col_meta.orc_col_map[level], - _metadata.get_types(), - use_index, - level == 0, - &num_dict_entries, - stream_info, - chunks); + auto const total_data_size = gather_stream_info( + stripe_idx, stripe_info, stripe_footer, col_meta.orc_col_map[level], stream_info); auto const is_stripe_data_empty = total_data_size == 0; CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, @@ -872,61 +882,6 @@ void reader::impl::query_stripe_compression_info() } } - auto const num_rows_per_stripe = stripe_info->numberOfRows; - auto const rowgroup_id = num_rowgroups; - auto stripe_num_rowgroups = 0; - if (use_index) { - stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / - _metadata.get_row_index_stride(); - } - // Update chunks to reference streams pointers - for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { - auto& chunk = chunks[stripe_idx][col_idx]; - // start row, number of rows in a each stripe and total number of rows - // may change in lower levels of nesting - chunk.start_row = (level == 0) - ? 
stripe_start_row - : col_meta.child_start_row[stripe_idx * num_columns + col_idx]; - chunk.num_rows = - (level == 0) ? stripe_info->numberOfRows - : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; - chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx]; - chunk.parent_validity_info = - (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx]; - chunk.parent_null_count_prefix_sums = - (level == 0) - ? nullptr - : null_count_prefix_sums[level - 1][col_meta.parent_column_index[col_idx]].data(); - chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind; - chunk.type_kind = _metadata.per_file_metadata[stripe_source_mapping.source_idx] - .ff.types[columns_level[col_idx].id] - .kind; - // num_child_rows for a struct column will be same, for other nested types it will be - // calculated. - chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; - chunk.dtype_id = column_types[col_idx].id(); - chunk.decimal_scale = _metadata.per_file_metadata[stripe_source_mapping.source_idx] - .ff.types[columns_level[col_idx].id] - .scale.value_or(0); - - chunk.rowgroup_id = rowgroup_id; - chunk.dtype_len = (column_types[col_idx].id() == type_id::STRING) - ? sizeof(string_index_pair) - : ((column_types[col_idx].id() == type_id::LIST) or - (column_types[col_idx].id() == type_id::STRUCT)) - ? sizeof(size_type) - : cudf::size_of(column_types[col_idx]); - chunk.num_rowgroups = stripe_num_rowgroups; - if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _timestamp_type.id(); } - if (not is_stripe_data_empty) { - for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { - chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; - } - } - } - stripe_start_row += num_rows_per_stripe; - num_rowgroups += stripe_num_rowgroups; - stripe_idx++; } } @@ -936,91 +891,48 @@ void reader::impl::query_stripe_compression_info() if (stripe_data.empty()) { continue; } - // Process dataset chunk pages into output columns - auto row_groups = - cudf::detail::hostdevice_2dvector(num_rowgroups, num_columns, _stream); - if (level > 0 and row_groups.size().first) { - cudf::host_span row_groups_span(row_groups.base_host_ptr(), - num_rowgroups * num_columns); - auto& rw_grp_meta = col_meta.rwgrp_meta; - - // Update start row and num rows per row group - std::transform(rw_grp_meta.begin(), - rw_grp_meta.end(), - row_groups_span.begin(), - rw_grp_meta.begin(), - [&](auto meta, auto& row_grp) { - row_grp.num_rows = meta.num_rows; - row_grp.start_row = meta.start_row; - return meta; - }); - } // Setup row group descriptors if using indexes if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto decomp_data = decompress_stripe_data(*_metadata.per_file_metadata[0].decompressor, - stripe_data, - stream_info, - chunks, - row_groups, - total_num_stripes, - _metadata.get_row_index_stride(), - level == 0, - _stream); - stripe_data.clear(); - stripe_data.push_back(std::move(decomp_data)); - } else { - // Set decompressed data size equal to the input size. 
- // TODO - } - - for (std::size_t i = 0; i < column_types.size(); ++i) { - bool is_nullable = false; - for (std::size_t j = 0; j < total_num_stripes; ++j) { - if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { - is_nullable = true; - break; - } + auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; + cudf::detail::hostdevice_vector compinfo( + 0, stream_info.size(), _stream); + for (auto const& info : stream_info) { + compinfo.push_back(gpu::CompressedStreamInfo( + static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, + info.length)); } - auto is_list_type = (column_types[i].id() == type_id::LIST); - auto n_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[i]; - // For list column, offset column will be always size + 1 - if (is_list_type) n_rows++; - _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr); - } - - decode_stream_data(num_dict_entries, - rows_to_skip, - _metadata.get_row_index_stride(), - level, - tz_table->view(), - chunks, - row_groups, - _out_buffers[level], - _stream, - _mr); - - if (nested_cols.size()) { - // Extract information to process nested child columns - scan_null_counts(chunks, null_count_prefix_sums[level], _stream); - - row_groups.device_to_host_sync(_stream); - aggregate_child_meta( - level, _selected_columns, chunks, row_groups, nested_cols, _out_buffers[level], col_meta); - - // ORC stores number of elements at each row, so we need to generate offsets from that - std::vector buff_data; - std::for_each( - _out_buffers[level].begin(), _out_buffers[level].end(), [&buff_data](auto& out_buffer) { - if (out_buffer.type.id() == type_id::LIST) { - auto data = static_cast(out_buffer.data()); - buff_data.emplace_back(list_buffer_data{data, out_buffer.size}); - } - }); + compinfo.host_to_device_async(_stream); + + gpu::ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + _stream); + compinfo.device_to_host_sync(_stream); + + // Count the exact number of compressed blocks + std::size_t num_compressed_blocks = 0; + std::size_t num_uncompressed_blocks = 0; + std::size_t total_decomp_size = 0; + for (std::size_t i = 0; i < compinfo.size(); ++i) { + num_compressed_blocks += compinfo[i].num_compressed_blocks; + num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks; + total_decomp_size += compinfo[i].max_uncompressed_size; + } + CUDF_EXPECTS( + not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)), + "Inconsistent info on compression blocks"); + printf("compression correct\n"); + fflush(stdout); - if (not buff_data.empty()) { generate_offsets_for_list(buff_data, _stream); } + } else { + // Set decompressed data size equal to the input size. 
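      // (Hedged sketch of what this branch might eventually record, not implemented
      // here: with no compression a stream's "decompressed" size equals its on-disk
      // length, so the corresponding stripe/level entry in compinfo_map could simply
      // accumulate info.length per stream, with zero compressed and uncompressed
      // block counts.)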
+ // TODO } + } // end loop level + lvl_stripe_data.clear(); _file_itm_data->compinfo_ready = true; } @@ -1160,17 +1072,18 @@ void reader::impl::prepare_data(uint64_t skip_rows, auto const stripe_info = stripe.first; auto const stripe_footer = stripe.second; - auto stream_count = stream_info.size(); - auto const total_data_size = gather_stream_info(stripe_idx, - stripe_info, - stripe_footer, - col_meta.orc_col_map[level], - _metadata.get_types(), - use_index, - level == 0, - &num_dict_entries, - stream_info, - chunks); + auto stream_count = stream_info.size(); + auto const total_data_size = + gather_stream_info_and_update_chunks(stripe_idx, + stripe_info, + stripe_footer, + col_meta.orc_col_map[level], + _metadata.get_types(), + use_index, + level == 0, + &num_dict_entries, + stream_info, + chunks); auto const is_stripe_data_empty = total_data_size == 0; CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, @@ -1325,7 +1238,6 @@ void reader::impl::prepare_data(uint64_t skip_rows, } } - for (std::size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; for (std::size_t j = 0; j < total_num_stripes; ++j) { From 47a66a3f38f67e7a9b39ad26de6c3663b2678888 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 11:41:41 -0800 Subject: [PATCH 006/321] Change comment Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_preprocess.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 3c392d25d30..067a32a06d7 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -922,10 +922,11 @@ void reader::impl::query_stripe_compression_info() CUDF_EXPECTS( not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)), "Inconsistent info on compression blocks"); - printf("compression correct\n"); - fflush(stdout); } else { + printf("no compression \n"); + fflush(stdout); + // Set decompressed data size equal to the input size. 
// TODO } @@ -961,7 +962,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, // If no rows or stripes to read, return empty columns if (rows_to_read == 0 || selected_stripes.empty()) { return; } - // query_stripe_compression_info(); + query_stripe_compression_info(); // Set up table for converting timestamp columns from local to UTC time auto const tz_table = [&, &selected_stripes = selected_stripes] { From 589a8423b79c97efa3f840210ce120ac05cc7a2b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 13:30:18 -0800 Subject: [PATCH 007/321] Cleanup Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_preprocess.cu | 38 ++---------------------- 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 067a32a06d7..b7c3fb3e732 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -769,33 +769,11 @@ void reader::impl::query_stripe_compression_info() auto& columns_level = _selected_columns.levels[level]; // Association between each ORC column and its cudf::column col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); - std::vector nested_cols; - // Get a list of column data types - std::vector column_types; + size_type col_id{0}; for (auto& col : columns_level) { - auto col_type = to_cudf_type(_metadata.get_col_type(col.id).kind, - _use_np_dtypes, - _timestamp_type.id(), - to_cudf_decimal_type(_decimal128_columns, _metadata, col.id)); - CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); - if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or - col_type == type_id::DECIMAL128) { - // sign of the scale is changed since cuDF follows c++ libraries like CNL - // which uses negative scaling, but liborc and other libraries - // follow positive scaling. - auto const scale = - -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); - column_types.emplace_back(col_type, scale); - } else { - column_types.emplace_back(col_type); - } - // Map each ORC column to its column - col_meta.orc_col_map[level][col.id] = column_types.size() - 1; - if (col_type == type_id::LIST or col_type == type_id::STRUCT) { - nested_cols.emplace_back(col); - } + col_meta.orc_col_map[level][col.id] = col_id++; } // Get the total number of stripes across all input files. 
@@ -811,18 +789,6 @@ void reader::impl::query_stripe_compression_info() total_num_stripes, num_columns, _stream); memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); - const bool use_index = - _use_index && - // Do stripes have row group index - _metadata.is_row_grp_idx_present() && - // Only use if we don't have much work with complete columns & stripes - // TODO: Consider nrows, gpu, and tune the threshold - (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && - _metadata.get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) && - // Only use if first row is aligned to a stripe boundary - // TODO: Fix logic to handle unaligned rows - (rows_to_skip == 0); - // Logically view streams as columns std::vector stream_info; From 9bc9ebfe5dea3baf4b0f8846abb33460f19959fd Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 14:25:15 -0800 Subject: [PATCH 008/321] Extend index Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.hpp | 32 ++++++++++++++++--------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 43883b34077..1c020d2ed16 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -30,21 +30,31 @@ namespace cudf::io::orc::detail { +using stream_index = struct { + std::size_t stripe_idx; + std::size_t level; + std::size_t col_idx; + std::size_t stream_idx; +}; +using stream_comp_info = struct { + std::size_t num_compressed_blocks; + std::size_t num_uncompressed_blocks; + std::size_t total_decomp_size; +}; +struct stream_index_hash { + std::size_t operator()(stream_index const& index) const + { + auto const hasher = std::hash{}; + return hasher(index.stripe_idx) ^ hasher(index.level) ^ hasher(index.col_idx) ^ + hasher(index.stream_idx); + } +}; + /** * @brief Struct to store file-level data that remains constant for all chunks being read. 
*/ struct file_intermediate_data { - using chunk_index = std::tuple; - using chunk_comp_info = std::tuple; - - struct index_hash { - std::size_t operator()(chunk_index const& index) const - { - return std::hash()(std::get<0>(index)) ^ std::hash()(std::get<1>(index)) ^ - std::hash()(std::get<2>(index)); - } - }; - std::unordered_map compinfo_map; + std::unordered_map compinfo_map; bool compinfo_ready{false}; std::vector> lvl_stripe_data; From 0298430e4accd2c286daf9aa647f75b27b224d5a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 14:57:44 -0800 Subject: [PATCH 009/321] Compute stripe-level comp info Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.hpp | 23 +++++--- cpp/src/io/orc/reader_impl_preprocess.cu | 71 ++++++++++++++++-------- 2 files changed, 63 insertions(+), 31 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 1c020d2ed16..69fcd4d0772 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -30,23 +30,26 @@ namespace cudf::io::orc::detail { -using stream_index = struct { +struct stripe_level_index { std::size_t stripe_idx; std::size_t level; - std::size_t col_idx; - std::size_t stream_idx; }; -using stream_comp_info = struct { +struct stripe_level_comp_info { std::size_t num_compressed_blocks; std::size_t num_uncompressed_blocks; std::size_t total_decomp_size; }; -struct stream_index_hash { - std::size_t operator()(stream_index const& index) const +struct stripe_level_equal { + bool operator()(stripe_level_index const& lhs, stripe_level_index const& rhs) const + { + return lhs.stripe_idx == rhs.stripe_idx && lhs.level == rhs.level; + } +}; +struct stripe_level_hash { + std::size_t operator()(stripe_level_index const& index) const { auto const hasher = std::hash{}; - return hasher(index.stripe_idx) ^ hasher(index.level) ^ hasher(index.col_idx) ^ - hasher(index.stream_idx); + return hasher(index.stripe_idx) ^ hasher(index.level); } }; @@ -54,7 +57,9 @@ struct stream_index_hash { * @brief Struct to store file-level data that remains constant for all chunks being read. 
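 *
 * compinfo_map caches, per (stripe, level) pair, the compression block counts and
 * the total decompressed size discovered by the sizing pass, so later reading steps
 * can reuse this information without re-parsing the compressed stream headers.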
*/ struct file_intermediate_data { - std::unordered_map compinfo_map; + std:: + unordered_map + compinfo_map; bool compinfo_ready{false}; std::vector> lvl_stripe_data; diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index b7c3fb3e732..dbc482c1725 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -67,14 +67,23 @@ struct orc_stream_info { uint32_t stripe_idx; // stripe processing index, not stripe index in source }; +struct stream_comp_info { + orc_stream_info* stream_info; + gpu::CompressedStreamInfo* comp_info; +}; + /** * @brief Function that populates column descriptors stream/chunk */ -std::size_t gather_stream_info(std::size_t stripe_index, - orc::StripeInformation const* stripeinfo, - orc::StripeFooter const* stripefooter, - host_span orc2gdf, - std::vector& stream_info) +std::size_t gather_stream_info( + std::size_t stripe_index, + std::size_t level, + orc::StripeInformation const* stripeinfo, + orc::StripeFooter const* stripefooter, + host_span orc2gdf, + std::vector& stream_info, + std::unordered_map& + stream_compinfo_map) { uint64_t src_offset = 0; uint64_t dst_offset = 0; @@ -86,11 +95,13 @@ std::size_t gather_stream_info(std::size_t stripe_index, } auto const column_id = *stream.column_id; - auto col = orc2gdf[column_id]; + auto const col_order = orc2gdf[column_id]; - if (col != -1) { + if (col_order != -1) { stream_info.emplace_back( stripeinfo->offset + src_offset, dst_offset, stream.length, stripe_index); + stream_compinfo_map[stripe_level_index{stripe_index, level}] = + stream_comp_info{&stream_info.back(), nullptr}; dst_offset += stream.length; } src_offset += stream.length; @@ -762,6 +773,9 @@ void reader::impl::query_stripe_compression_info() auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; lvl_stripe_data.resize(_selected_columns.num_levels()); + std::unordered_map + stream_compinfo_map; + // Iterates through levels of nested columns, child column will be one level down // compared to parent column. 
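  // (Orientation example, not from the original code: for a LIST<INT32> column the
  // list head sits at level 0 and its int32 child one level down, at level 1.)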
auto& col_meta = *_col_meta; @@ -805,8 +819,13 @@ void reader::impl::query_stripe_compression_info() auto const stripe_footer = stripe.second; auto stream_count = stream_info.size(); - auto const total_data_size = gather_stream_info( - stripe_idx, stripe_info, stripe_footer, col_meta.orc_col_map[level], stream_info); + auto const total_data_size = gather_stream_info(stripe_idx, + level, + stripe_info, + stripe_footer, + col_meta.orc_col_map[level], + stream_info, + stream_compinfo_map); auto const is_stripe_data_empty = total_data_size == 0; CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, @@ -862,11 +881,20 @@ void reader::impl::query_stripe_compression_info() auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; cudf::detail::hostdevice_vector compinfo( 0, stream_info.size(), _stream); - for (auto const& info : stream_info) { + + for (auto& [stripe_level, stripe_level_info] : stream_compinfo_map) { + auto const& info = *(stripe_level_info.stream_info); compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, + static_cast(stripe_data[stripe_level.stripe_idx].data()) + info.dst_pos, info.length)); + stripe_level_info.comp_info = &compinfo[compinfo.size() - 1]; } + + // for (auto const& info : stream_info) { + // compinfo.push_back(gpu::CompressedStreamInfo( + // static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, + // info.length)); + // } compinfo.host_to_device_async(_stream); gpu::ParseCompressedStripeData(compinfo.device_ptr(), @@ -876,18 +904,17 @@ void reader::impl::query_stripe_compression_info() _stream); compinfo.device_to_host_sync(_stream); - // Count the exact number of compressed blocks - std::size_t num_compressed_blocks = 0; - std::size_t num_uncompressed_blocks = 0; - std::size_t total_decomp_size = 0; - for (std::size_t i = 0; i < compinfo.size(); ++i) { - num_compressed_blocks += compinfo[i].num_compressed_blocks; - num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks; - total_decomp_size += compinfo[i].max_uncompressed_size; + auto& compinfo_map = _file_itm_data->compinfo_map; + for (auto& [stripe_level, stripe_level_info] : stream_compinfo_map) { + if (compinfo_map.find(stripe_level) == compinfo_map.end()) { + compinfo_map[stripe_level] = stripe_level_comp_info{0, 0}; + } + auto const& stream_compinfo = *stripe_level_info.comp_info; + compinfo_map[stripe_level].num_compressed_blocks += stream_compinfo.num_compressed_blocks; + compinfo_map[stripe_level].num_uncompressed_blocks += + stream_compinfo.num_uncompressed_blocks; + compinfo_map[stripe_level].total_decomp_size += stream_compinfo.max_uncompressed_size; } - CUDF_EXPECTS( - not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)), - "Inconsistent info on compression blocks"); } else { printf("no compression \n"); From 6d5e45fc4638054ce06b27c592c00f417024d92c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 15:20:55 -0800 Subject: [PATCH 010/321] Successfully compute stripe-level comp info Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_preprocess.cu | 70 ++++++++++-------------- 1 file changed, 28 insertions(+), 42 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index dbc482c1725..8fb1e9a9031 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -57,33 +57,27 @@ struct orc_stream_info { explicit orc_stream_info(uint64_t 
offset_, std::size_t dst_pos_, uint32_t length_, - uint32_t stripe_idx_) - : offset(offset_), dst_pos(dst_pos_), length(length_), stripe_idx(stripe_idx_) + uint32_t stripe_idx_, + std::size_t level_) + : offset(offset_), dst_pos(dst_pos_), length(length_), stripe_idx(stripe_idx_), level(level_) { } uint64_t offset; // offset in file std::size_t dst_pos; // offset in memory relative to start of compressed stripe data std::size_t length; // length in file uint32_t stripe_idx; // stripe processing index, not stripe index in source -}; - -struct stream_comp_info { - orc_stream_info* stream_info; - gpu::CompressedStreamInfo* comp_info; + std::size_t level; // TODO }; /** * @brief Function that populates column descriptors stream/chunk */ -std::size_t gather_stream_info( - std::size_t stripe_index, - std::size_t level, - orc::StripeInformation const* stripeinfo, - orc::StripeFooter const* stripefooter, - host_span orc2gdf, - std::vector& stream_info, - std::unordered_map& - stream_compinfo_map) +std::size_t gather_stream_info(std::size_t stripe_index, + std::size_t level, + orc::StripeInformation const* stripeinfo, + orc::StripeFooter const* stripefooter, + host_span orc2gdf, + std::vector& stream_info) { uint64_t src_offset = 0; uint64_t dst_offset = 0; @@ -99,9 +93,7 @@ std::size_t gather_stream_info( if (col_order != -1) { stream_info.emplace_back( - stripeinfo->offset + src_offset, dst_offset, stream.length, stripe_index); - stream_compinfo_map[stripe_level_index{stripe_index, level}] = - stream_comp_info{&stream_info.back(), nullptr}; + stripeinfo->offset + src_offset, dst_offset, stream.length, stripe_index, level); dst_offset += stream.length; } src_offset += stream.length; @@ -115,6 +107,7 @@ std::size_t gather_stream_info( */ std::size_t gather_stream_info_and_update_chunks( std::size_t stripe_index, + std::size_t level, orc::StripeInformation const* stripeinfo, orc::StripeFooter const* stripefooter, host_span orc2gdf, @@ -188,7 +181,7 @@ std::size_t gather_stream_info_and_update_chunks( } } stream_info.emplace_back( - stripeinfo->offset + src_offset, dst_offset, stream.length, stripe_index); + stripeinfo->offset + src_offset, dst_offset, stream.length, stripe_index, level); dst_offset += stream.length; } src_offset += stream.length; @@ -773,7 +766,10 @@ void reader::impl::query_stripe_compression_info() auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; lvl_stripe_data.resize(_selected_columns.num_levels()); - std::unordered_map + std::unordered_map stream_compinfo_map; // Iterates through levels of nested columns, child column will be one level down @@ -819,13 +815,8 @@ void reader::impl::query_stripe_compression_info() auto const stripe_footer = stripe.second; auto stream_count = stream_info.size(); - auto const total_data_size = gather_stream_info(stripe_idx, - level, - stripe_info, - stripe_footer, - col_meta.orc_col_map[level], - stream_info, - stream_compinfo_map); + auto const total_data_size = gather_stream_info( + stripe_idx, level, stripe_info, stripe_footer, col_meta.orc_col_map[level], stream_info); auto const is_stripe_data_empty = total_data_size == 0; CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, @@ -882,19 +873,14 @@ void reader::impl::query_stripe_compression_info() cudf::detail::hostdevice_vector compinfo( 0, stream_info.size(), _stream); - for (auto& [stripe_level, stripe_level_info] : stream_compinfo_map) { - auto const& info = *(stripe_level_info.stream_info); + for (auto const& info : stream_info) { 
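        // Build one CompressedStreamInfo entry per stream and remember where it
        // lives, so the block counts parsed on the device can be read back after
        // the device-to-host sync below.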
compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[stripe_level.stripe_idx].data()) + info.dst_pos, + static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, info.length)); - stripe_level_info.comp_info = &compinfo[compinfo.size() - 1]; + stream_compinfo_map[stripe_level_index{info.stripe_idx, info.level}] = + &compinfo[compinfo.size() - 1]; } - // for (auto const& info : stream_info) { - // compinfo.push_back(gpu::CompressedStreamInfo( - // static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, - // info.length)); - // } compinfo.host_to_device_async(_stream); gpu::ParseCompressedStripeData(compinfo.device_ptr(), @@ -905,15 +891,14 @@ void reader::impl::query_stripe_compression_info() compinfo.device_to_host_sync(_stream); auto& compinfo_map = _file_itm_data->compinfo_map; - for (auto& [stripe_level, stripe_level_info] : stream_compinfo_map) { + for (auto& [stripe_level, stream_compinfo] : stream_compinfo_map) { if (compinfo_map.find(stripe_level) == compinfo_map.end()) { compinfo_map[stripe_level] = stripe_level_comp_info{0, 0}; } - auto const& stream_compinfo = *stripe_level_info.comp_info; - compinfo_map[stripe_level].num_compressed_blocks += stream_compinfo.num_compressed_blocks; + compinfo_map[stripe_level].num_compressed_blocks += stream_compinfo->num_compressed_blocks; compinfo_map[stripe_level].num_uncompressed_blocks += - stream_compinfo.num_uncompressed_blocks; - compinfo_map[stripe_level].total_decomp_size += stream_compinfo.max_uncompressed_size; + stream_compinfo->num_uncompressed_blocks; + compinfo_map[stripe_level].total_decomp_size += stream_compinfo->max_uncompressed_size; } } else { @@ -1069,6 +1054,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, auto stream_count = stream_info.size(); auto const total_data_size = gather_stream_info_and_update_chunks(stripe_idx, + level, stripe_info, stripe_footer, col_meta.orc_col_map[level], From a701e2988b963c79506fbc74f2363309ffc50d18 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 15:31:58 -0800 Subject: [PATCH 011/321] Reuse cached comp info Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.hpp | 6 +-- cpp/src/io/orc/reader_impl_preprocess.cu | 55 +++++++++++++----------- 2 files changed, 33 insertions(+), 28 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 69fcd4d0772..d336ad30bc5 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -35,9 +35,9 @@ struct stripe_level_index { std::size_t level; }; struct stripe_level_comp_info { - std::size_t num_compressed_blocks; - std::size_t num_uncompressed_blocks; - std::size_t total_decomp_size; + std::size_t num_compressed_blocks{0}; + std::size_t num_uncompressed_blocks{0}; + std::size_t total_decomp_size{0}; }; struct stripe_level_equal { bool operator()(stripe_level_index const& lhs, stripe_level_index const& rhs) const diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 8fb1e9a9031..ad8eabd2ae8 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -206,6 +206,7 @@ std::size_t gather_stream_info_and_update_chunks( */ rmm::device_buffer decompress_stripe_data( OrcDecompressor const& decompressor, + stripe_level_comp_info comp_info, host_span stripe_data, host_span stream_info, cudf::detail::hostdevice_2dvector& chunks, @@ -215,32 +216,11 @@ rmm::device_buffer decompress_stripe_data( bool 
use_base_stride, rmm::cuda_stream_view stream) { - // Parse the columns' compressed info - cudf::detail::hostdevice_vector compinfo( - 0, stream_info.size(), stream); - for (auto const& info : stream_info) { - compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, - info.length)); - } - compinfo.host_to_device_async(stream); - - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - stream); - compinfo.device_to_host_sync(stream); - // Count the exact number of compressed blocks - std::size_t num_compressed_blocks = 0; - std::size_t num_uncompressed_blocks = 0; - std::size_t total_decomp_size = 0; - for (std::size_t i = 0; i < compinfo.size(); ++i) { - num_compressed_blocks += compinfo[i].num_compressed_blocks; - num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks; - total_decomp_size += compinfo[i].max_uncompressed_size; - } + std::size_t num_compressed_blocks = comp_info.num_compressed_blocks; + std::size_t num_uncompressed_blocks = comp_info.num_uncompressed_blocks; + std::size_t total_decomp_size = comp_info.total_decomp_size; + CUDF_EXPECTS( not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)), "Inconsistent info on compression blocks"); @@ -261,12 +241,25 @@ rmm::device_buffer decompress_stripe_data( inflate_res.end(), compression_result{0, compression_status::FAILURE}); + cudf::detail::hostdevice_vector compinfo( + 0, stream_info.size(), stream); + for (auto const& info : stream_info) { + compinfo.push_back(gpu::CompressedStreamInfo( + static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, + info.length)); + } + // Parse again to populate the decompression input/output buffers std::size_t decomp_offset = 0; uint32_t max_uncomp_block_size = 0; uint32_t start_pos = 0; auto start_pos_uncomp = (uint32_t)num_compressed_blocks; for (std::size_t i = 0; i < compinfo.size(); ++i) { + // TODO: need this? + compinfo[i].num_compressed_blocks = num_compressed_blocks; + compinfo[i].num_uncompressed_blocks = num_uncompressed_blocks; + compinfo[i].max_uncompressed_size = total_decomp_size; + auto dst_base = static_cast(decomp_data.data()); compinfo[i].uncompressed_data = dst_base + decomp_offset; compinfo[i].dec_in_ctl = inflate_in.data() + start_pos; @@ -766,6 +759,7 @@ void reader::impl::query_stripe_compression_info() auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; lvl_stripe_data.resize(_selected_columns.num_levels()); + // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. 
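  // Scratch map used only while this sizing pass populates
  // _file_itm_data->compinfo_map; it holds pointers into the per-level `compinfo`
  // vector built below.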
std::unordered_mapcompinfo_map; + for (auto const& info : stream_info) { + auto const& precomputed_info = + compinfo_map.at(stripe_level_index{info.stripe_idx, info.level}); + comp_info.num_compressed_blocks += precomputed_info.num_compressed_blocks; + comp_info.num_uncompressed_blocks += precomputed_info.num_uncompressed_blocks; + comp_info.total_decomp_size += precomputed_info.total_decomp_size; + } + auto decomp_data = decompress_stripe_data(*_metadata.per_file_metadata[0].decompressor, + comp_info, stripe_data, stream_info, chunks, From fbc976af5469e0c5171a0898018a8d71e3cb13cd Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 16:09:24 -0800 Subject: [PATCH 012/321] Implement stream identification Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.hpp | 25 +++--- cpp/src/io/orc/reader_impl_preprocess.cu | 107 +++++++++++++---------- 2 files changed, 74 insertions(+), 58 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index d336ad30bc5..c90f606da5a 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -30,26 +30,32 @@ namespace cudf::io::orc::detail { -struct stripe_level_index { +// unify this with orc_stream_info +struct stream_id_info { std::size_t stripe_idx; std::size_t level; + uint32_t orc_col_idx; + StreamKind kind; }; struct stripe_level_comp_info { std::size_t num_compressed_blocks{0}; std::size_t num_uncompressed_blocks{0}; std::size_t total_decomp_size{0}; }; -struct stripe_level_equal { - bool operator()(stripe_level_index const& lhs, stripe_level_index const& rhs) const +struct stream_id_equal { + bool operator()(stream_id_info const& lhs, stream_id_info const& rhs) const { - return lhs.stripe_idx == rhs.stripe_idx && lhs.level == rhs.level; + return lhs.stripe_idx == rhs.stripe_idx && lhs.level == rhs.level && + lhs.orc_col_idx == rhs.orc_col_idx && lhs.kind == rhs.kind; } }; -struct stripe_level_hash { - std::size_t operator()(stripe_level_index const& index) const +struct stream_id_hash { + std::size_t operator()(stream_id_info const& index) const { auto const hasher = std::hash{}; - return hasher(index.stripe_idx) ^ hasher(index.level); + return hasher(index.stripe_idx) ^ hasher(index.level) ^ + hasher(static_cast(index.orc_col_idx)) ^ + hasher(static_cast(index.kind)); } }; @@ -57,9 +63,8 @@ struct stripe_level_hash { * @brief Struct to store file-level data that remains constant for all chunks being read. 
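 *
 * compinfo_map is keyed by stream identity (stripe index, nesting level, ORC column
 * index, and stream kind) and stores, for each stream, the compressed and
 * uncompressed block counts plus the total decompressed size.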
*/ struct file_intermediate_data { - std:: - unordered_map - compinfo_map; + std::unordered_map + compinfo_map; bool compinfo_ready{false}; std::vector> lvl_stripe_data; diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index ad8eabd2ae8..e1702187e87 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -58,8 +58,16 @@ struct orc_stream_info { std::size_t dst_pos_, uint32_t length_, uint32_t stripe_idx_, - std::size_t level_) - : offset(offset_), dst_pos(dst_pos_), length(length_), stripe_idx(stripe_idx_), level(level_) + std::size_t level_, + uint32_t orc_col_idx_, + StreamKind kind_) + : offset(offset_), + dst_pos(dst_pos_), + length(length_), + stripe_idx(stripe_idx_), + level(level_), + orc_col_idx(orc_col_idx_), + kind(kind_) { } uint64_t offset; // offset in file @@ -67,6 +75,8 @@ struct orc_stream_info { std::size_t length; // length in file uint32_t stripe_idx; // stripe processing index, not stripe index in source std::size_t level; // TODO + uint32_t orc_col_idx; + StreamKind kind; }; /** @@ -92,8 +102,13 @@ std::size_t gather_stream_info(std::size_t stripe_index, auto const col_order = orc2gdf[column_id]; if (col_order != -1) { - stream_info.emplace_back( - stripeinfo->offset + src_offset, dst_offset, stream.length, stripe_index, level); + stream_info.emplace_back(stripeinfo->offset + src_offset, + dst_offset, + stream.length, + stripe_index, + level, + column_id, + stream.kind); dst_offset += stream.length; } src_offset += stream.length; @@ -180,8 +195,13 @@ std::size_t gather_stream_info_and_update_chunks( } } } - stream_info.emplace_back( - stripeinfo->offset + src_offset, dst_offset, stream.length, stripe_index, level); + stream_info.emplace_back(stripeinfo->offset + src_offset, + dst_offset, + stream.length, + stripe_index, + level, + column_id, + stream.kind); dst_offset += stream.length; } src_offset += stream.length; @@ -205,8 +225,9 @@ std::size_t gather_stream_info_and_update_chunks( * @return Device buffer to decompressed page data */ rmm::device_buffer decompress_stripe_data( + std::unordered_map const& + compinfo_map, OrcDecompressor const& decompressor, - stripe_level_comp_info comp_info, host_span stripe_data, host_span stream_info, cudf::detail::hostdevice_2dvector& chunks, @@ -217,9 +238,28 @@ rmm::device_buffer decompress_stripe_data( rmm::cuda_stream_view stream) { // Count the exact number of compressed blocks - std::size_t num_compressed_blocks = comp_info.num_compressed_blocks; - std::size_t num_uncompressed_blocks = comp_info.num_uncompressed_blocks; - std::size_t total_decomp_size = comp_info.total_decomp_size; + std::size_t num_compressed_blocks = 0; + std::size_t num_uncompressed_blocks = 0; + std::size_t total_decomp_size = 0; + + cudf::detail::hostdevice_vector compinfo( + 0, stream_info.size(), stream); + for (auto const& info : stream_info) { + compinfo.push_back(gpu::CompressedStreamInfo( + static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, + info.length)); + + auto const& cached_comp_info = + compinfo_map.at(stream_id_info{info.stripe_idx, info.level, info.orc_col_idx, info.kind}); + auto& stream_comp_info = compinfo[compinfo.size() - 1]; + stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; + stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks; + stream_comp_info.max_uncompressed_size = cached_comp_info.total_decomp_size; + + num_compressed_blocks += 
cached_comp_info.num_compressed_blocks; + num_uncompressed_blocks += cached_comp_info.num_uncompressed_blocks; + total_decomp_size += cached_comp_info.total_decomp_size; + } CUDF_EXPECTS( not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)), @@ -241,25 +281,12 @@ rmm::device_buffer decompress_stripe_data( inflate_res.end(), compression_result{0, compression_status::FAILURE}); - cudf::detail::hostdevice_vector compinfo( - 0, stream_info.size(), stream); - for (auto const& info : stream_info) { - compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, - info.length)); - } - // Parse again to populate the decompression input/output buffers std::size_t decomp_offset = 0; uint32_t max_uncomp_block_size = 0; uint32_t start_pos = 0; auto start_pos_uncomp = (uint32_t)num_compressed_blocks; for (std::size_t i = 0; i < compinfo.size(); ++i) { - // TODO: need this? - compinfo[i].num_compressed_blocks = num_compressed_blocks; - compinfo[i].num_uncompressed_blocks = num_uncompressed_blocks; - compinfo[i].max_uncompressed_size = total_decomp_size; - auto dst_base = static_cast(decomp_data.data()); compinfo[i].uncompressed_data = dst_base + decomp_offset; compinfo[i].dec_in_ctl = inflate_in.data() + start_pos; @@ -760,10 +787,7 @@ void reader::impl::query_stripe_compression_info() lvl_stripe_data.resize(_selected_columns.num_levels()); // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. - std::unordered_map + std::unordered_map stream_compinfo_map; // Iterates through levels of nested columns, child column will be one level down @@ -871,7 +895,8 @@ void reader::impl::query_stripe_compression_info() compinfo.push_back(gpu::CompressedStreamInfo( static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, info.length)); - stream_compinfo_map[stripe_level_index{info.stripe_idx, info.level}] = + stream_compinfo_map[stream_id_info{ + info.stripe_idx, info.level, info.orc_col_idx, info.kind}] = &compinfo[compinfo.size() - 1]; } @@ -885,14 +910,10 @@ void reader::impl::query_stripe_compression_info() compinfo.device_to_host_sync(_stream); auto& compinfo_map = _file_itm_data->compinfo_map; - for (auto& [stripe_level, stream_compinfo] : stream_compinfo_map) { - if (compinfo_map.find(stripe_level) == compinfo_map.end()) { - compinfo_map[stripe_level] = stripe_level_comp_info{0, 0}; - } - compinfo_map[stripe_level].num_compressed_blocks += stream_compinfo->num_compressed_blocks; - compinfo_map[stripe_level].num_uncompressed_blocks += - stream_compinfo->num_uncompressed_blocks; - compinfo_map[stripe_level].total_decomp_size += stream_compinfo->max_uncompressed_size; + for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) { + compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, + stream_compinfo->num_uncompressed_blocks, + stream_compinfo->max_uncompressed_size}; } } else { @@ -1184,18 +1205,8 @@ void reader::impl::prepare_data(uint64_t skip_rows, } // Setup row group descriptors if using indexes if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - stripe_level_comp_info comp_info; - auto& compinfo_map = _file_itm_data->compinfo_map; - for (auto const& info : stream_info) { - auto const& precomputed_info = - compinfo_map.at(stripe_level_index{info.stripe_idx, info.level}); - comp_info.num_compressed_blocks += precomputed_info.num_compressed_blocks; - comp_info.num_uncompressed_blocks += precomputed_info.num_uncompressed_blocks; - 
comp_info.total_decomp_size += precomputed_info.total_decomp_size; - } - - auto decomp_data = decompress_stripe_data(*_metadata.per_file_metadata[0].decompressor, - comp_info, + auto decomp_data = decompress_stripe_data(_file_itm_data->compinfo_map, + *_metadata.per_file_metadata[0].decompressor, stripe_data, stream_info, chunks, From 2070045a73b65a1745af8413c58e4d4e35ae3533 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 17:03:46 -0800 Subject: [PATCH 013/321] Fix bug Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_preprocess.cu | 100 +++++++++++++++++++---- 1 file changed, 85 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index e1702187e87..bfc073abdbe 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -69,6 +69,13 @@ struct orc_stream_info { orc_col_idx(orc_col_idx_), kind(kind_) { +#if 0 + printf(" construct stripe id [%d, %d, %d, %d]\n", + (int)stripe_idx, + (int)level, + (int)orc_col_idx, + (int)kind); +#endif } uint64_t offset; // offset in file std::size_t dst_pos; // offset in memory relative to start of compressed stripe data @@ -195,6 +202,7 @@ std::size_t gather_stream_info_and_update_chunks( } } } + stream_info.emplace_back(stripeinfo->offset + src_offset, dst_offset, stream.length, @@ -245,6 +253,17 @@ rmm::device_buffer decompress_stripe_data( cudf::detail::hostdevice_vector compinfo( 0, stream_info.size(), stream); for (auto const& info : stream_info) { +#if 0 + printf("collec stream again [%d, %d, %d, %d]: dst = %lu, length = %lu\n", + (int)info.stripe_idx, + (int)info.level, + (int)info.orc_col_idx, + (int)info.kind, + info.dst_pos, + info.length); + fflush(stdout); +#endif + compinfo.push_back(gpu::CompressedStreamInfo( static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, info.length)); @@ -265,6 +284,46 @@ rmm::device_buffer decompress_stripe_data( not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)), "Inconsistent info on compression blocks"); +#if 0 + std::size_t old_num_compressed_blocks = num_compressed_blocks; + std::size_t old_num_uncompressed_blocks = num_uncompressed_blocks; + std::size_t old_total_decomp_size = total_decomp_size; + + num_compressed_blocks = 0; + num_uncompressed_blocks = 0; + total_decomp_size = 0; + for (std::size_t i = 0; i < compinfo.size(); ++i) { + num_compressed_blocks += compinfo[i].num_compressed_blocks; + num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks; + total_decomp_size += compinfo[i].max_uncompressed_size; + + auto const& info = stream_info[i]; + printf("compute info [%d, %d, %d, %d]: %lu | %lu | %lu\n", + (int)info.stripe_idx, + (int)info.level, + (int)info.orc_col_idx, + (int)info.kind, + (size_t)compinfo[i].num_compressed_blocks, + (size_t)compinfo[i].num_uncompressed_blocks, + compinfo[i].max_uncompressed_size); + fflush(stdout); + } + + if (old_num_compressed_blocks != num_compressed_blocks || + old_num_uncompressed_blocks != num_uncompressed_blocks || + old_total_decomp_size != total_decomp_size) { + printf("invalid: %d - %d, %d - %d, %d - %d\n", + (int)old_num_compressed_blocks, + (int)num_compressed_blocks, + (int)old_num_uncompressed_blocks, + (int)num_uncompressed_blocks, + (int)old_total_decomp_size, + (int)total_decomp_size + + ); + } +#endif + // Buffer needs to be padded. // Required by `gpuDecodeOrcColumnData`. 
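  // (Sizing note: total_decomp_size is the sum of the cached per-stream
  // max_uncompressed_size values gathered by query_stripe_compression_info().)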
rmm::device_buffer decomp_data( @@ -768,21 +827,6 @@ void reader::impl::query_stripe_compression_info() // TODO : remove? if (rows_to_read == 0 || selected_stripes.empty()) { return; } - // Set up table for converting timestamp columns from local to UTC time - auto const tz_table = [&, &selected_stripes = selected_stripes] { - auto const has_timestamp_column = std::any_of( - _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) { - return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) { - return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP; - }); - }); - - return has_timestamp_column - ? cudf::detail::make_timezone_transition_table( - {}, selected_stripes[0].stripe_info[0].second->writerTimezone, _stream) - : std::make_unique(); - }(); - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; lvl_stripe_data.resize(_selected_columns.num_levels()); @@ -898,6 +942,16 @@ void reader::impl::query_stripe_compression_info() stream_compinfo_map[stream_id_info{ info.stripe_idx, info.level, info.orc_col_idx, info.kind}] = &compinfo[compinfo.size() - 1]; +#if 0 + printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n", + (int)info.stripe_idx, + (int)info.level, + (int)info.orc_col_idx, + (int)info.kind, + info.dst_pos, + info.length); + fflush(stdout); +#endif } compinfo.host_to_device_async(_stream); @@ -914,8 +968,22 @@ void reader::impl::query_stripe_compression_info() compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, stream_compinfo->num_uncompressed_blocks, stream_compinfo->max_uncompressed_size}; +#if 0 + printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n", + (int)stream_id.stripe_idx, + (int)stream_id.level, + (int)stream_id.orc_col_idx, + (int)stream_id.kind, + (size_t)stream_compinfo->num_compressed_blocks, + (size_t)stream_compinfo->num_uncompressed_blocks, + stream_compinfo->max_uncompressed_size); + fflush(stdout); +#endif } + // Must clear so we will not overwrite the old compression info stream_id. 
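+  // (stream_compinfo_map holds raw pointers into the per-level `compinfo`
+  // vector, so entries left behind would dangle once the next level rebuilds
+  // that vector.)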
+ stream_compinfo_map.clear(); + } else { printf("no compression \n"); fflush(stdout); @@ -924,6 +992,8 @@ void reader::impl::query_stripe_compression_info() // TODO } + printf(" end level %d\n\n", (int)level); + } // end loop level lvl_stripe_data.clear(); From 84813f4148aa0243557e6c97aeb469b4eae49796 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 17:08:05 -0800 Subject: [PATCH 014/321] Remove comment Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_preprocess.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index bfc073abdbe..56c518a9d6a 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -270,6 +270,8 @@ rmm::device_buffer decompress_stripe_data( auto const& cached_comp_info = compinfo_map.at(stream_id_info{info.stripe_idx, info.level, info.orc_col_idx, info.kind}); + // auto const& cached_comp_info = + // compinfo_map[stream_id_info{info.stripe_idx, info.level, info.orc_col_idx, info.kind}]; auto& stream_comp_info = compinfo[compinfo.size() - 1]; stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks; @@ -992,7 +994,7 @@ void reader::impl::query_stripe_compression_info() // TODO } - printf(" end level %d\n\n", (int)level); + // printf(" end level %d\n\n", (int)level); } // end loop level From cd4f719fa32072c558508113a052d1ef35a1d90e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 17:29:09 -0800 Subject: [PATCH 015/321] Disable comment Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_preprocess.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 56c518a9d6a..e7002a00579 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -987,8 +987,8 @@ void reader::impl::query_stripe_compression_info() stream_compinfo_map.clear(); } else { - printf("no compression \n"); - fflush(stdout); + // printf("no compression \n"); + // fflush(stdout); // Set decompressed data size equal to the input size. 
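      // A possible sketch for this TODO, assuming the uncompressed case can
      // reuse stripe_level_comp_info with zero block counts and the raw stream
      // length standing in for the decompressed size:
      //
      //   for (auto const& info : stream_info) {
      //     compinfo_map[stream_id_info{info.stripe_idx, info.level,
      //                                 info.orc_col_idx, info.kind}] =
      //       stripe_level_comp_info{0, 0, info.length};
      //   }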
// TODO From abe91180e427c5d575282c9a2c2c4425801ab5d5 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 17:49:41 -0800 Subject: [PATCH 016/321] Do not read data again Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_preprocess.cu | 72 +++++++++++------------- 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index e7002a00579..5f74daa8ac2 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -998,7 +998,7 @@ void reader::impl::query_stripe_compression_info() } // end loop level - lvl_stripe_data.clear(); + // lvl_stripe_data.clear(); _file_itm_data->compinfo_ready = true; } @@ -1131,7 +1131,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, std::size_t num_rowgroups = 0; int stripe_idx = 0; - std::vector, std::size_t>> read_tasks; + // std::vector, std::size_t>> read_tasks; for (auto const& stripe_source_mapping : selected_stripes) { // Iterate through the source files selected stripes for (auto const& stripe : stripe_source_mapping.stripe_info) { @@ -1156,41 +1156,37 @@ void reader::impl::prepare_data(uint64_t skip_rows, CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); - // Buffer needs to be padded. - // Required by `copy_uncompressed_kernel`. - stripe_data.emplace_back( - cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), _stream); - auto dst_base = static_cast(stripe_data.back().data()); + auto dst_base = static_cast(stripe_data[stripe_idx].data()); // Coalesce consecutive streams into one read - while (not is_stripe_data_empty and stream_count < stream_info.size()) { - auto const d_dst = dst_base + stream_info[stream_count].dst_pos; - auto const offset = stream_info[stream_count].offset; - auto len = stream_info[stream_count].length; - stream_count++; - - while (stream_count < stream_info.size() && - stream_info[stream_count].offset == offset + len) { - len += stream_info[stream_count].length; - stream_count++; - } - if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->is_device_read_preferred(len)) { - read_tasks.push_back( - std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->device_read_async(offset, len, d_dst, _stream), - len)); - - } else { - auto const buffer = - _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read( - offset, len); - CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); - CUDF_CUDA_TRY( - cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); - _stream.synchronize(); - } - } + // while (not is_stripe_data_empty and stream_count < stream_info.size()) { + // auto const d_dst = dst_base + stream_info[stream_count].dst_pos; + // auto const offset = stream_info[stream_count].offset; + // auto len = stream_info[stream_count].length; + // stream_count++; + + // while (stream_count < stream_info.size() && + // stream_info[stream_count].offset == offset + len) { + // len += stream_info[stream_count].length; + // stream_count++; + // } + // if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] + // .source->is_device_read_preferred(len)) { + // read_tasks.push_back( + // std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] + // .source->device_read_async(offset, len, d_dst, _stream), + // len)); + + // } else { + // auto const buffer = + // 
_metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read( + // offset, len); + // CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); + // CUDF_CUDA_TRY( + // cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); + // _stream.synchronize(); + // } + // } auto const num_rows_per_stripe = stripe_info->numberOfRows; auto const rowgroup_id = num_rowgroups; @@ -1250,9 +1246,9 @@ void reader::impl::prepare_data(uint64_t skip_rows, stripe_idx++; } } - for (auto& task : read_tasks) { - CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); - } + // for (auto& task : read_tasks) { + // CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); + // } if (stripe_data.empty()) { continue; } From c1cdf26b78589ee1f8a98bc2c2872a5155abf326 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 18:06:44 -0800 Subject: [PATCH 017/321] Fix header year Signed-off-by: Nghia Truong --- cpp/benchmarks/io/orc/orc_reader_input.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp index fd27b56ef0e..8254bf65fe2 100644 --- a/cpp/benchmarks/io/orc/orc_reader_input.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From aaf2c31d468db1144463938bc42b0a0c729df03a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 20:45:02 -0800 Subject: [PATCH 018/321] Cleanup Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_preprocess.cu | 46 ++++-------------------- 1 file changed, 7 insertions(+), 39 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 5f74daa8ac2..ea636fc4ddc 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -836,6 +836,10 @@ void reader::impl::query_stripe_compression_info() std::unordered_map stream_compinfo_map; + // Logically view streams as columns + std::vector stream_info; + stream_info.reserve(selected_stripes.size() * selected_stripes.front().stripe_info.size()); + // Iterates through levels of nested columns, child column will be one level down // compared to parent column. auto& col_meta = *_col_meta; @@ -849,7 +853,9 @@ void reader::impl::query_stripe_compression_info() // Map each ORC column to its column col_meta.orc_col_map[level][col.id] = col_id++; } + } + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { // Get the total number of stripes across all input files. 
std::size_t total_num_stripes = std::accumulate(selected_stripes.begin(), @@ -858,13 +864,6 @@ void reader::impl::query_stripe_compression_info() [](std::size_t sum, auto& stripe_source_mapping) { return sum + stripe_source_mapping.stripe_info.size(); }); - auto const num_columns = columns_level.size(); - cudf::detail::hostdevice_2dvector chunks( - total_num_stripes, num_columns, _stream); - memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); - - // Logically view streams as columns - std::vector stream_info; // Tracker for eventually deallocating compressed and uncompressed data auto& stripe_data = lvl_stripe_data[level]; @@ -984,6 +983,7 @@ void reader::impl::query_stripe_compression_info() } // Must clear so we will not overwrite the old compression info stream_id. + stream_info.clear(); stream_compinfo_map.clear(); } else { @@ -1056,7 +1056,6 @@ void reader::impl::prepare_data(uint64_t skip_rows, for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto& columns_level = _selected_columns.levels[level]; // Association between each ORC column and its cudf::column - col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); std::vector nested_cols; // Get a list of column data types @@ -1080,7 +1079,6 @@ void reader::impl::prepare_data(uint64_t skip_rows, } // Map each ORC column to its column - col_meta.orc_col_map[level][col.id] = column_types.size() - 1; if (col_type == type_id::LIST or col_type == type_id::STRUCT) { nested_cols.emplace_back(col); } @@ -1158,36 +1156,6 @@ void reader::impl::prepare_data(uint64_t skip_rows, auto dst_base = static_cast(stripe_data[stripe_idx].data()); - // Coalesce consecutive streams into one read - // while (not is_stripe_data_empty and stream_count < stream_info.size()) { - // auto const d_dst = dst_base + stream_info[stream_count].dst_pos; - // auto const offset = stream_info[stream_count].offset; - // auto len = stream_info[stream_count].length; - // stream_count++; - - // while (stream_count < stream_info.size() && - // stream_info[stream_count].offset == offset + len) { - // len += stream_info[stream_count].length; - // stream_count++; - // } - // if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] - // .source->is_device_read_preferred(len)) { - // read_tasks.push_back( - // std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] - // .source->device_read_async(offset, len, d_dst, _stream), - // len)); - - // } else { - // auto const buffer = - // _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read( - // offset, len); - // CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); - // CUDF_CUDA_TRY( - // cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); - // _stream.synchronize(); - // } - // } - auto const num_rows_per_stripe = stripe_info->numberOfRows; auto const rowgroup_id = num_rowgroups; auto stripe_num_rowgroups = 0; From 3da6ca794cd0e33a4e476e92210e13be1609570b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 22:28:19 -0800 Subject: [PATCH 019/321] Trying to read stripe-by-stripe Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.hpp | 3 +- cpp/src/io/orc/reader_impl_preprocess.cu | 285 ++++++++++++----------- 2 files changed, 150 insertions(+), 138 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index c90f606da5a..4bafc38dbdd 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ 
b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -67,7 +67,8 @@ struct file_intermediate_data { compinfo_map; bool compinfo_ready{false}; - std::vector> lvl_stripe_data; + // Tracker for eventually deallocating compressed and uncompressed data + std::vector stripe_data; std::vector>> null_count_prefix_sums; int64_t rows_to_skip; diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index ea636fc4ddc..7e8feb4ef49 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -108,6 +108,8 @@ std::size_t gather_stream_info(std::size_t stripe_index, auto const column_id = *stream.column_id; auto const col_order = orc2gdf[column_id]; + // TODO + // optimize this. if (col_order != -1) { stream_info.emplace_back(stripeinfo->offset + src_offset, dst_offset, @@ -829,8 +831,8 @@ void reader::impl::query_stripe_compression_info() // TODO : remove? if (rows_to_read == 0 || selected_stripes.empty()) { return; } - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; - lvl_stripe_data.resize(_selected_columns.num_levels()); + auto& stripe_data = _file_itm_data->stripe_data; + // lvl_stripe_data.resize(_selected_columns.num_levels()); // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. std::unordered_map @@ -855,94 +857,94 @@ void reader::impl::query_stripe_compression_info() } } - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - // Get the total number of stripes across all input files. - std::size_t total_num_stripes = - std::accumulate(selected_stripes.begin(), - selected_stripes.end(), - 0, - [](std::size_t sum, auto& stripe_source_mapping) { - return sum + stripe_source_mapping.stripe_info.size(); - }); + // Get the total number of stripes across all input files. + std::size_t total_num_stripes = + std::accumulate(selected_stripes.begin(), + selected_stripes.end(), + 0, + [](std::size_t sum, auto& stripe_source_mapping) { + return sum + stripe_source_mapping.stripe_info.size(); + }); + stripe_data.reserve(total_num_stripes); - // Tracker for eventually deallocating compressed and uncompressed data - auto& stripe_data = lvl_stripe_data[level]; + int stripe_idx = 0; - int stripe_idx = 0; + for (auto const& stripe_source_mapping : selected_stripes) { + // Iterate through the source files selected stripes + for (auto const& stripe : stripe_source_mapping.stripe_info) { + auto const stripe_info = stripe.first; + auto const stripe_footer = stripe.second; - std::vector, std::size_t>> read_tasks; - for (auto const& stripe_source_mapping : selected_stripes) { - // Iterate through the source files selected stripes - for (auto const& stripe : stripe_source_mapping.stripe_info) { - auto const stripe_info = stripe.first; - auto const stripe_footer = stripe.second; + auto stream_count = stream_info.size(); - auto stream_count = stream_info.size(); - auto const total_data_size = gather_stream_info( + std::vector, std::size_t>> read_tasks; + std::size_t total_data_size{0}; + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + total_data_size += gather_stream_info( stripe_idx, level, stripe_info, stripe_footer, col_meta.orc_col_map[level], stream_info); - - auto const is_stripe_data_empty = total_data_size == 0; - CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, - "Invalid index rowgroup stream data"); - - // Buffer needs to be padded. - // Required by `copy_uncompressed_kernel`. 
- stripe_data.emplace_back( - cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), _stream); - auto dst_base = static_cast(stripe_data.back().data()); - - // Coalesce consecutive streams into one read - while (not is_stripe_data_empty and stream_count < stream_info.size()) { - auto const d_dst = dst_base + stream_info[stream_count].dst_pos; - auto const offset = stream_info[stream_count].offset; - auto len = stream_info[stream_count].length; + } // end loop level + + auto const is_stripe_data_empty = total_data_size == 0; + CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, + "Invalid index rowgroup stream data"); + + // Buffer needs to be padded. + // Required by `copy_uncompressed_kernel`. + stripe_data.emplace_back(cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), + _stream); + auto dst_base = static_cast(stripe_data.back().data()); + + // Coalesce consecutive streams into one read + while (not is_stripe_data_empty and stream_count < stream_info.size()) { + auto const d_dst = dst_base + stream_info[stream_count].dst_pos; + auto const offset = stream_info[stream_count].offset; + auto len = stream_info[stream_count].length; + stream_count++; + + while (stream_count < stream_info.size() && + stream_info[stream_count].offset == offset + len) { + len += stream_info[stream_count].length; stream_count++; - - while (stream_count < stream_info.size() && - stream_info[stream_count].offset == offset + len) { - len += stream_info[stream_count].length; - stream_count++; - } - if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->is_device_read_preferred(len)) { - read_tasks.push_back( - std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->device_read_async(offset, len, d_dst, _stream), - len)); - - } else { - auto const buffer = - _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read( - offset, len); - CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); - CUDF_CUDA_TRY( - cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); - _stream.synchronize(); - } } + if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] + .source->is_device_read_preferred(len)) { + read_tasks.push_back(std::pair( + _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->device_read_async( + offset, len, d_dst, _stream), + len)); - stripe_idx++; + } else { + auto const buffer = + _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read(offset, + len); + CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); + CUDF_CUDA_TRY( + cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); + _stream.synchronize(); + } } + + stripe_idx++; } - for (auto& task : read_tasks) { - CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); - } + } + for (auto& task : read_tasks) { + CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); + } - if (stripe_data.empty()) { continue; } + if (stripe_data.empty()) { continue; } - // Setup row group descriptors if using indexes - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; - cudf::detail::hostdevice_vector compinfo( - 0, stream_info.size(), _stream); - - for (auto const& info : stream_info) { - compinfo.push_back(gpu::CompressedStreamInfo( - 
static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, - info.length)); - stream_compinfo_map[stream_id_info{ - info.stripe_idx, info.level, info.orc_col_idx, info.kind}] = - &compinfo[compinfo.size() - 1]; + // Setup row group descriptors if using indexes + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; + cudf::detail::hostdevice_vector compinfo( + 0, stream_info.size(), _stream); + + for (auto const& info : stream_info) { + compinfo.push_back(gpu::CompressedStreamInfo( + static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, + info.length)); + stream_compinfo_map[stream_id_info{ + info.stripe_idx, info.level, info.orc_col_idx, info.kind}] = &compinfo[compinfo.size() - 1]; #if 0 printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n", (int)info.stripe_idx, @@ -953,22 +955,22 @@ void reader::impl::query_stripe_compression_info() info.length); fflush(stdout); #endif - } + } - compinfo.host_to_device_async(_stream); + compinfo.host_to_device_async(_stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - _stream); - compinfo.device_to_host_sync(_stream); + gpu::ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + _stream); + compinfo.device_to_host_sync(_stream); - auto& compinfo_map = _file_itm_data->compinfo_map; - for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) { - compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, - stream_compinfo->num_uncompressed_blocks, - stream_compinfo->max_uncompressed_size}; + auto& compinfo_map = _file_itm_data->compinfo_map; + for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) { + compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, + stream_compinfo->num_uncompressed_blocks, + stream_compinfo->max_uncompressed_size}; #if 0 printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n", (int)stream_id.stripe_idx, @@ -980,23 +982,23 @@ void reader::impl::query_stripe_compression_info() stream_compinfo->max_uncompressed_size); fflush(stdout); #endif - } + } - // Must clear so we will not overwrite the old compression info stream_id. - stream_info.clear(); - stream_compinfo_map.clear(); + // Must clear so we will not overwrite the old compression info stream_id. + stream_info.clear(); + stream_compinfo_map.clear(); - } else { - // printf("no compression \n"); - // fflush(stdout); + } else { + // printf("no compression \n"); + // fflush(stdout); - // Set decompressed data size equal to the input size. - // TODO - } + // Set decompressed data size equal to the input size. + // TODO + } - // printf(" end level %d\n\n", (int)level); + // printf(" end level %d\n\n", (int)level); - } // end loop level + // } // end loop level // lvl_stripe_data.clear(); _file_itm_data->compinfo_ready = true; @@ -1044,22 +1046,37 @@ void reader::impl::prepare_data(uint64_t skip_rows, : std::make_unique(); }(); - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; + auto& stripe_data = _file_itm_data->stripe_data; auto& null_count_prefix_sums = _file_itm_data->null_count_prefix_sums; - lvl_stripe_data.resize(_selected_columns.num_levels()); _out_buffers.resize(_selected_columns.num_levels()); // Iterates through levels of nested columns, child column will be one level down // compared to parent column. 
auto& col_meta = *_col_meta; + + // Get a list of column data types + std::vector> column_types; + column_types.resize(_selected_columns.num_levels()); + + // Association between each ORC column and its cudf::column + std::vector> nested_cols; + nested_cols.resize(_selected_columns.num_levels()); + + // Get the total number of stripes across all input files. + std::size_t total_num_stripes = + std::accumulate(selected_stripes.begin(), + selected_stripes.end(), + 0, + [](std::size_t sum, auto& stripe_source_mapping) { + return sum + stripe_source_mapping.stripe_info.size(); + }); + + std::vector> lvl_chunks; + lvl_chunks.resize(_selected_columns.num_levels()); + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto& columns_level = _selected_columns.levels[level]; - // Association between each ORC column and its cudf::column - std::vector nested_cols; - - // Get a list of column data types - std::vector column_types; for (auto& col : columns_level) { auto col_type = to_cudf_type(_metadata.get_col_type(col.id).kind, _use_np_dtypes, @@ -1073,29 +1090,35 @@ void reader::impl::prepare_data(uint64_t skip_rows, // follow positive scaling. auto const scale = -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); - column_types.emplace_back(col_type, scale); + column_types[level].emplace_back(col_type, scale); } else { - column_types.emplace_back(col_type); + column_types[level].emplace_back(col_type); } // Map each ORC column to its column if (col_type == type_id::LIST or col_type == type_id::STRUCT) { - nested_cols.emplace_back(col); + nested_cols[level].emplace_back(col); } } - // Get the total number of stripes across all input files. - std::size_t total_num_stripes = - std::accumulate(selected_stripes.begin(), - selected_stripes.end(), - 0, - [](std::size_t sum, auto& stripe_source_mapping) { - return sum + stripe_source_mapping.stripe_info.size(); - }); + null_count_prefix_sums.emplace_back(); + null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); + std::generate_n(std::back_inserter(null_count_prefix_sums.back()), + _selected_columns.levels[level].size(), + [&]() { + return cudf::detail::make_zeroed_device_uvector_async( + total_num_stripes, _stream, rmm::mr::get_current_device_resource()); + }); + + auto const num_columns = columns_level.size(); + lvl_chunks[level] = + cudf::detail::hostdevice_2dvector(total_num_stripes, num_columns, _stream); + memset(lvl_chunks[level].base_host_ptr(), 0, lvl_chunks[level].size_bytes()); + } + + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + auto& columns_level = _selected_columns.levels[level]; auto const num_columns = columns_level.size(); - cudf::detail::hostdevice_2dvector chunks( - total_num_stripes, num_columns, _stream); - memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); const bool use_index = _use_index && @@ -1112,31 +1135,19 @@ void reader::impl::prepare_data(uint64_t skip_rows, // Logically view streams as columns std::vector stream_info; - null_count_prefix_sums.emplace_back(); - null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); - std::generate_n(std::back_inserter(null_count_prefix_sums.back()), - _selected_columns.levels[level].size(), - [&]() { - return cudf::detail::make_zeroed_device_uvector_async( - total_num_stripes, _stream, rmm::mr::get_current_device_resource()); - }); - - // Tracker for eventually deallocating compressed and uncompressed data - auto& stripe_data = lvl_stripe_data[level]; - std::size_t 
stripe_start_row = 0; std::size_t num_dict_entries = 0; std::size_t num_rowgroups = 0; int stripe_idx = 0; // std::vector, std::size_t>> read_tasks; + auto& chunks = lvl_chunks[level]; for (auto const& stripe_source_mapping : selected_stripes) { // Iterate through the source files selected stripes for (auto const& stripe : stripe_source_mapping.stripe_info) { auto const stripe_info = stripe.first; auto const stripe_footer = stripe.second; - auto stream_count = stream_info.size(); auto const total_data_size = gather_stream_info_and_update_chunks(stripe_idx, level, From f96b513edc044013567e54621c5eed5a371aa6c3 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 26 Jan 2024 22:28:23 -0800 Subject: [PATCH 020/321] Revert "Trying to read stripe-by-stripe" This reverts commit 3da6ca794cd0e33a4e476e92210e13be1609570b. --- cpp/src/io/orc/reader_impl_chunking.hpp | 3 +- cpp/src/io/orc/reader_impl_preprocess.cu | 285 +++++++++++------------ 2 files changed, 138 insertions(+), 150 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 4bafc38dbdd..c90f606da5a 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -67,8 +67,7 @@ struct file_intermediate_data { compinfo_map; bool compinfo_ready{false}; - // Tracker for eventually deallocating compressed and uncompressed data - std::vector stripe_data; + std::vector> lvl_stripe_data; std::vector>> null_count_prefix_sums; int64_t rows_to_skip; diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 7e8feb4ef49..ea636fc4ddc 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -108,8 +108,6 @@ std::size_t gather_stream_info(std::size_t stripe_index, auto const column_id = *stream.column_id; auto const col_order = orc2gdf[column_id]; - // TODO - // optimize this. if (col_order != -1) { stream_info.emplace_back(stripeinfo->offset + src_offset, dst_offset, @@ -831,8 +829,8 @@ void reader::impl::query_stripe_compression_info() // TODO : remove? if (rows_to_read == 0 || selected_stripes.empty()) { return; } - auto& stripe_data = _file_itm_data->stripe_data; - // lvl_stripe_data.resize(_selected_columns.num_levels()); + auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; + lvl_stripe_data.resize(_selected_columns.num_levels()); // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. std::unordered_map @@ -857,94 +855,94 @@ void reader::impl::query_stripe_compression_info() } } - // Get the total number of stripes across all input files. - std::size_t total_num_stripes = - std::accumulate(selected_stripes.begin(), - selected_stripes.end(), - 0, - [](std::size_t sum, auto& stripe_source_mapping) { - return sum + stripe_source_mapping.stripe_info.size(); - }); - stripe_data.reserve(total_num_stripes); + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + // Get the total number of stripes across all input files. 
+ std::size_t total_num_stripes = + std::accumulate(selected_stripes.begin(), + selected_stripes.end(), + 0, + [](std::size_t sum, auto& stripe_source_mapping) { + return sum + stripe_source_mapping.stripe_info.size(); + }); - int stripe_idx = 0; + // Tracker for eventually deallocating compressed and uncompressed data + auto& stripe_data = lvl_stripe_data[level]; - for (auto const& stripe_source_mapping : selected_stripes) { - // Iterate through the source files selected stripes - for (auto const& stripe : stripe_source_mapping.stripe_info) { - auto const stripe_info = stripe.first; - auto const stripe_footer = stripe.second; + int stripe_idx = 0; - auto stream_count = stream_info.size(); + std::vector, std::size_t>> read_tasks; + for (auto const& stripe_source_mapping : selected_stripes) { + // Iterate through the source files selected stripes + for (auto const& stripe : stripe_source_mapping.stripe_info) { + auto const stripe_info = stripe.first; + auto const stripe_footer = stripe.second; - std::vector, std::size_t>> read_tasks; - std::size_t total_data_size{0}; - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - total_data_size += gather_stream_info( + auto stream_count = stream_info.size(); + auto const total_data_size = gather_stream_info( stripe_idx, level, stripe_info, stripe_footer, col_meta.orc_col_map[level], stream_info); - } // end loop level - - auto const is_stripe_data_empty = total_data_size == 0; - CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, - "Invalid index rowgroup stream data"); - - // Buffer needs to be padded. - // Required by `copy_uncompressed_kernel`. - stripe_data.emplace_back(cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), - _stream); - auto dst_base = static_cast(stripe_data.back().data()); - - // Coalesce consecutive streams into one read - while (not is_stripe_data_empty and stream_count < stream_info.size()) { - auto const d_dst = dst_base + stream_info[stream_count].dst_pos; - auto const offset = stream_info[stream_count].offset; - auto len = stream_info[stream_count].length; - stream_count++; - - while (stream_count < stream_info.size() && - stream_info[stream_count].offset == offset + len) { - len += stream_info[stream_count].length; + + auto const is_stripe_data_empty = total_data_size == 0; + CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, + "Invalid index rowgroup stream data"); + + // Buffer needs to be padded. + // Required by `copy_uncompressed_kernel`. 
+ stripe_data.emplace_back( + cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), _stream); + auto dst_base = static_cast(stripe_data.back().data()); + + // Coalesce consecutive streams into one read + while (not is_stripe_data_empty and stream_count < stream_info.size()) { + auto const d_dst = dst_base + stream_info[stream_count].dst_pos; + auto const offset = stream_info[stream_count].offset; + auto len = stream_info[stream_count].length; stream_count++; - } - if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->is_device_read_preferred(len)) { - read_tasks.push_back(std::pair( - _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->device_read_async( - offset, len, d_dst, _stream), - len)); - } else { - auto const buffer = - _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read(offset, - len); - CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); - CUDF_CUDA_TRY( - cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); - _stream.synchronize(); + while (stream_count < stream_info.size() && + stream_info[stream_count].offset == offset + len) { + len += stream_info[stream_count].length; + stream_count++; + } + if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] + .source->is_device_read_preferred(len)) { + read_tasks.push_back( + std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] + .source->device_read_async(offset, len, d_dst, _stream), + len)); + + } else { + auto const buffer = + _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read( + offset, len); + CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); + CUDF_CUDA_TRY( + cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); + _stream.synchronize(); + } } - } - stripe_idx++; + stripe_idx++; + } + } + for (auto& task : read_tasks) { + CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); } - } - for (auto& task : read_tasks) { - CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); - } - - if (stripe_data.empty()) { continue; } - // Setup row group descriptors if using indexes - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; - cudf::detail::hostdevice_vector compinfo( - 0, stream_info.size(), _stream); + if (stripe_data.empty()) { continue; } - for (auto const& info : stream_info) { - compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, - info.length)); - stream_compinfo_map[stream_id_info{ - info.stripe_idx, info.level, info.orc_col_idx, info.kind}] = &compinfo[compinfo.size() - 1]; + // Setup row group descriptors if using indexes + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; + cudf::detail::hostdevice_vector compinfo( + 0, stream_info.size(), _stream); + + for (auto const& info : stream_info) { + compinfo.push_back(gpu::CompressedStreamInfo( + static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, + info.length)); + stream_compinfo_map[stream_id_info{ + info.stripe_idx, info.level, info.orc_col_idx, info.kind}] = + &compinfo[compinfo.size() - 1]; #if 0 printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n", (int)info.stripe_idx, @@ -955,22 +953,22 @@ void 
reader::impl::query_stripe_compression_info() info.length); fflush(stdout); #endif - } + } - compinfo.host_to_device_async(_stream); + compinfo.host_to_device_async(_stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - _stream); - compinfo.device_to_host_sync(_stream); + gpu::ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + _stream); + compinfo.device_to_host_sync(_stream); - auto& compinfo_map = _file_itm_data->compinfo_map; - for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) { - compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, - stream_compinfo->num_uncompressed_blocks, - stream_compinfo->max_uncompressed_size}; + auto& compinfo_map = _file_itm_data->compinfo_map; + for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) { + compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, + stream_compinfo->num_uncompressed_blocks, + stream_compinfo->max_uncompressed_size}; #if 0 printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n", (int)stream_id.stripe_idx, @@ -982,23 +980,23 @@ void reader::impl::query_stripe_compression_info() stream_compinfo->max_uncompressed_size); fflush(stdout); #endif - } + } - // Must clear so we will not overwrite the old compression info stream_id. - stream_info.clear(); - stream_compinfo_map.clear(); + // Must clear so we will not overwrite the old compression info stream_id. + stream_info.clear(); + stream_compinfo_map.clear(); - } else { - // printf("no compression \n"); - // fflush(stdout); + } else { + // printf("no compression \n"); + // fflush(stdout); - // Set decompressed data size equal to the input size. - // TODO - } + // Set decompressed data size equal to the input size. + // TODO + } - // printf(" end level %d\n\n", (int)level); + // printf(" end level %d\n\n", (int)level); - // } // end loop level + } // end loop level // lvl_stripe_data.clear(); _file_itm_data->compinfo_ready = true; @@ -1046,37 +1044,22 @@ void reader::impl::prepare_data(uint64_t skip_rows, : std::make_unique(); }(); - auto& stripe_data = _file_itm_data->stripe_data; + auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; auto& null_count_prefix_sums = _file_itm_data->null_count_prefix_sums; + lvl_stripe_data.resize(_selected_columns.num_levels()); _out_buffers.resize(_selected_columns.num_levels()); // Iterates through levels of nested columns, child column will be one level down // compared to parent column. auto& col_meta = *_col_meta; - - // Get a list of column data types - std::vector> column_types; - column_types.resize(_selected_columns.num_levels()); - - // Association between each ORC column and its cudf::column - std::vector> nested_cols; - nested_cols.resize(_selected_columns.num_levels()); - - // Get the total number of stripes across all input files. 
- std::size_t total_num_stripes = - std::accumulate(selected_stripes.begin(), - selected_stripes.end(), - 0, - [](std::size_t sum, auto& stripe_source_mapping) { - return sum + stripe_source_mapping.stripe_info.size(); - }); - - std::vector> lvl_chunks; - lvl_chunks.resize(_selected_columns.num_levels()); - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto& columns_level = _selected_columns.levels[level]; + // Association between each ORC column and its cudf::column + std::vector nested_cols; + + // Get a list of column data types + std::vector column_types; for (auto& col : columns_level) { auto col_type = to_cudf_type(_metadata.get_col_type(col.id).kind, _use_np_dtypes, @@ -1090,35 +1073,29 @@ void reader::impl::prepare_data(uint64_t skip_rows, // follow positive scaling. auto const scale = -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); - column_types[level].emplace_back(col_type, scale); + column_types.emplace_back(col_type, scale); } else { - column_types[level].emplace_back(col_type); + column_types.emplace_back(col_type); } // Map each ORC column to its column if (col_type == type_id::LIST or col_type == type_id::STRUCT) { - nested_cols[level].emplace_back(col); + nested_cols.emplace_back(col); } } - null_count_prefix_sums.emplace_back(); - null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); - std::generate_n(std::back_inserter(null_count_prefix_sums.back()), - _selected_columns.levels[level].size(), - [&]() { - return cudf::detail::make_zeroed_device_uvector_async( - total_num_stripes, _stream, rmm::mr::get_current_device_resource()); - }); - - auto const num_columns = columns_level.size(); - lvl_chunks[level] = - cudf::detail::hostdevice_2dvector(total_num_stripes, num_columns, _stream); - memset(lvl_chunks[level].base_host_ptr(), 0, lvl_chunks[level].size_bytes()); - } - - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - auto& columns_level = _selected_columns.levels[level]; + // Get the total number of stripes across all input files. 
+ std::size_t total_num_stripes = + std::accumulate(selected_stripes.begin(), + selected_stripes.end(), + 0, + [](std::size_t sum, auto& stripe_source_mapping) { + return sum + stripe_source_mapping.stripe_info.size(); + }); auto const num_columns = columns_level.size(); + cudf::detail::hostdevice_2dvector chunks( + total_num_stripes, num_columns, _stream); + memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); const bool use_index = _use_index && @@ -1135,19 +1112,31 @@ void reader::impl::prepare_data(uint64_t skip_rows, // Logically view streams as columns std::vector stream_info; + null_count_prefix_sums.emplace_back(); + null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); + std::generate_n(std::back_inserter(null_count_prefix_sums.back()), + _selected_columns.levels[level].size(), + [&]() { + return cudf::detail::make_zeroed_device_uvector_async( + total_num_stripes, _stream, rmm::mr::get_current_device_resource()); + }); + + // Tracker for eventually deallocating compressed and uncompressed data + auto& stripe_data = lvl_stripe_data[level]; + std::size_t stripe_start_row = 0; std::size_t num_dict_entries = 0; std::size_t num_rowgroups = 0; int stripe_idx = 0; // std::vector, std::size_t>> read_tasks; - auto& chunks = lvl_chunks[level]; for (auto const& stripe_source_mapping : selected_stripes) { // Iterate through the source files selected stripes for (auto const& stripe : stripe_source_mapping.stripe_info) { auto const stripe_info = stripe.first; auto const stripe_footer = stripe.second; + auto stream_count = stream_info.size(); auto const total_data_size = gather_stream_info_and_update_chunks(stripe_idx, level, From c0ac62cab5bc87a69777f68ea7ec6f2abd062813 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 27 Jan 2024 14:25:43 -0800 Subject: [PATCH 021/321] Add test Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_preprocess.cu | 16 +++++++---- cpp/tests/io/orc_test.cpp | 36 ++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index ea636fc4ddc..6d7d1e28b59 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#define PRINT_DEBUG + #include "reader_impl.hpp" #include "reader_impl_chunking.hpp" #include "reader_impl_helpers.hpp" @@ -69,7 +71,7 @@ struct orc_stream_info { orc_col_idx(orc_col_idx_), kind(kind_) { -#if 0 +#ifdef PRINT_DEBUG printf(" construct stripe id [%d, %d, %d, %d]\n", (int)stripe_idx, (int)level, @@ -253,7 +255,7 @@ rmm::device_buffer decompress_stripe_data( cudf::detail::hostdevice_vector compinfo( 0, stream_info.size(), stream); for (auto const& info : stream_info) { -#if 0 +#ifdef PRINT_DEBUG printf("collec stream again [%d, %d, %d, %d]: dst = %lu, length = %lu\n", (int)info.stripe_idx, (int)info.level, @@ -268,8 +270,12 @@ rmm::device_buffer decompress_stripe_data( static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, info.length)); + printf("line %d\n", __LINE__); + fflush(stdout); auto const& cached_comp_info = compinfo_map.at(stream_id_info{info.stripe_idx, info.level, info.orc_col_idx, info.kind}); + printf("line %d\n", __LINE__); + fflush(stdout); // auto const& cached_comp_info = // compinfo_map[stream_id_info{info.stripe_idx, info.level, info.orc_col_idx, info.kind}]; auto& stream_comp_info = compinfo[compinfo.size() - 1]; @@ -286,7 +292,7 @@ rmm::device_buffer decompress_stripe_data( not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)), "Inconsistent info on compression blocks"); -#if 0 +#ifdef XXX std::size_t old_num_compressed_blocks = num_compressed_blocks; std::size_t old_num_uncompressed_blocks = num_uncompressed_blocks; std::size_t old_total_decomp_size = total_decomp_size; @@ -943,7 +949,7 @@ void reader::impl::query_stripe_compression_info() stream_compinfo_map[stream_id_info{ info.stripe_idx, info.level, info.orc_col_idx, info.kind}] = &compinfo[compinfo.size() - 1]; -#if 0 +#ifdef PRINT_DEBUG printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n", (int)info.stripe_idx, (int)info.level, @@ -969,7 +975,7 @@ void reader::impl::query_stripe_compression_info() compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, stream_compinfo->num_uncompressed_blocks, stream_compinfo->max_uncompressed_size}; -#if 0 +#ifdef PRINT_DEBUG printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n", (int)stream_id.stripe_idx, (int)stream_id.level, diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 2ae6edc6c7d..8cdec659ce0 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1123,6 +1124,41 @@ TEST_F(OrcWriterTest, SlicedValidMask) cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } +TEST_F(OrcReaderTest, Test1) +{ + std::string filepath1 = + "/home/nghiat/Devel/cudf/1/python/cudf/cudf/tests/data/orc/" + "TestOrcFile.boolean_corruption_PR_6636.orc"; + + std::string filepath2 = + "/home/nghiat/Devel/cudf/1/python/cudf/cudf/tests/data/orc/" + "TestOrcFile.boolean_corruption_PR_6702.orc"; + + { + printf("test1\n"); + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{{filepath1}}); + auto result = cudf::io::read_orc(read_opts); + for (int i = 0; i < result.tbl->num_columns(); i++) { + auto& col = result.tbl->get_column(i); + cudf::test::print(col); + printf("\n"); + } + } + + { + printf("test2\n"); + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{{filepath2}}); + auto result = cudf::io::read_orc(read_opts); + for (int i = 0; i < result.tbl->num_columns(); i++) { + 
auto& col = result.tbl->get_column(i); + cudf::test::print(col); + printf("\n"); + } + } +} + TEST_F(OrcReaderTest, SingleInputs) { srand(31533); From 6049725b1418ecb8ef0807a5ba5cc3912ca3da33 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 27 Jan 2024 14:49:28 -0800 Subject: [PATCH 022/321] Fix bug Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_preprocess.cu | 36 ++++++++++++++++++++---- cpp/tests/io/orc_test.cpp | 2 ++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 6d7d1e28b59..2a0e2aef57b 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#define PRINT_DEBUG +// #define PRINT_DEBUG #include "reader_impl.hpp" #include "reader_impl_chunking.hpp" @@ -96,6 +96,8 @@ std::size_t gather_stream_info(std::size_t stripe_index, orc::StripeInformation const* stripeinfo, orc::StripeFooter const* stripefooter, host_span orc2gdf, + host_span types, + bool apply_struct_map, std::vector& stream_info) { uint64_t src_offset = 0; @@ -108,9 +110,25 @@ std::size_t gather_stream_info(std::size_t stripe_index, } auto const column_id = *stream.column_id; - auto const col_order = orc2gdf[column_id]; + auto col = orc2gdf[column_id]; + printf("first construct col id = %d, order = %d\n", (int)column_id, (int)col); + + if (col == -1 and apply_struct_map) { + // A struct-type column has no data itself, but rather child columns + // for each of its fields. There is only a PRESENT stream, which + // needs to be included for the reader. + auto const schema_type = types[column_id]; + if (not schema_type.subtypes.empty()) { + if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { + for (auto const& idx : schema_type.subtypes) { + auto child_idx = (idx < orc2gdf.size()) ? 
orc2gdf[idx] : -1; + if (child_idx >= 0) { col = child_idx; } + } + } + } + } - if (col_order != -1) { + if (col != -1) { stream_info.emplace_back(stripeinfo->offset + src_offset, dst_offset, stream.length, @@ -167,6 +185,7 @@ std::size_t gather_stream_info_and_update_chunks( auto const column_id = *stream.column_id; auto col = orc2gdf[column_id]; + printf("construct col id = %d, order = %d\n", (int)column_id, (int)col); if (col == -1 and apply_struct_map) { // A struct-type column has no data itself, but rather child columns @@ -205,6 +224,7 @@ std::size_t gather_stream_info_and_update_chunks( } } + printf("before construct col id = %d, order = %d\n", (int)column_id, (int)col); stream_info.emplace_back(stripeinfo->offset + src_offset, dst_offset, stream.length, @@ -884,8 +904,14 @@ void reader::impl::query_stripe_compression_info() auto const stripe_footer = stripe.second; auto stream_count = stream_info.size(); - auto const total_data_size = gather_stream_info( - stripe_idx, level, stripe_info, stripe_footer, col_meta.orc_col_map[level], stream_info); + auto const total_data_size = gather_stream_info(stripe_idx, + level, + stripe_info, + stripe_footer, + col_meta.orc_col_map[level], + _metadata.get_types(), + level == 0, + stream_info); auto const is_stripe_data_empty = total_data_size == 0; CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 8cdec659ce0..5ff27bd9e10 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1124,6 +1124,7 @@ TEST_F(OrcWriterTest, SlicedValidMask) cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } +#if 0 TEST_F(OrcReaderTest, Test1) { std::string filepath1 = @@ -1159,6 +1160,7 @@ TEST_F(OrcReaderTest, Test1) } } +#endif TEST_F(OrcReaderTest, SingleInputs) { srand(31533); From 1fa634b089121ae071bd39b3639785c0f794e564 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 27 Jan 2024 14:52:08 -0800 Subject: [PATCH 023/321] Remove debug info Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_preprocess.cu | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 2a0e2aef57b..4f9dac15d98 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -111,7 +111,6 @@ std::size_t gather_stream_info(std::size_t stripe_index, auto const column_id = *stream.column_id; auto col = orc2gdf[column_id]; - printf("first construct col id = %d, order = %d\n", (int)column_id, (int)col); if (col == -1 and apply_struct_map) { // A struct-type column has no data itself, but rather child columns @@ -185,7 +184,6 @@ std::size_t gather_stream_info_and_update_chunks( auto const column_id = *stream.column_id; auto col = orc2gdf[column_id]; - printf("construct col id = %d, order = %d\n", (int)column_id, (int)col); if (col == -1 and apply_struct_map) { // A struct-type column has no data itself, but rather child columns @@ -224,7 +222,6 @@ std::size_t gather_stream_info_and_update_chunks( } } - printf("before construct col id = %d, order = %d\n", (int)column_id, (int)col); stream_info.emplace_back(stripeinfo->offset + src_offset, dst_offset, stream.length, @@ -290,12 +287,12 @@ rmm::device_buffer decompress_stripe_data( static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, info.length)); - printf("line %d\n", __LINE__); - fflush(stdout); + // printf("line %d\n", __LINE__); + // 
fflush(stdout); auto const& cached_comp_info = compinfo_map.at(stream_id_info{info.stripe_idx, info.level, info.orc_col_idx, info.kind}); - printf("line %d\n", __LINE__); - fflush(stdout); + // printf("line %d\n", __LINE__); + // fflush(stdout); // auto const& cached_comp_info = // compinfo_map[stream_id_info{info.stripe_idx, info.level, info.orc_col_idx, info.kind}]; auto& stream_comp_info = compinfo[compinfo.size() - 1]; From 57bd6d512238f46229b09c424b6bc31afe2e5150 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 27 Jan 2024 15:04:52 -0800 Subject: [PATCH 024/321] Separate implementation Signed-off-by: Nghia Truong --- cpp/CMakeLists.txt | 1 + cpp/src/io/orc/reader_impl_chunking.cu | 304 +++++++++++++++++++++++ cpp/src/io/orc/reader_impl_chunking.hpp | 36 +++ cpp/src/io/orc/reader_impl_preprocess.cu | 283 --------------------- 4 files changed, 341 insertions(+), 283 deletions(-) create mode 100644 cpp/src/io/orc/reader_impl_chunking.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d4ed6c113b9..49c19596d23 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -387,6 +387,7 @@ add_library( src/io/orc/dict_enc.cu src/io/orc/orc.cpp src/io/orc/reader_impl.cu + src/io/orc/reader_impl_chunking.cu src/io/orc/reader_impl_helpers.cpp src/io/orc/reader_impl_preprocess.cu src/io/orc/stats_enc.cu diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu new file mode 100644 index 00000000000..028ea624749 --- /dev/null +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// #define PRINT_DEBUG + +#include "reader_impl.hpp" +#include "reader_impl_chunking.hpp" +#include "reader_impl_helpers.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::io::orc::detail { + +namespace { + +/** + * @brief Function that populates column descriptors stream/chunk + */ +std::size_t gather_stream_info(std::size_t stripe_index, + std::size_t level, + orc::StripeInformation const* stripeinfo, + orc::StripeFooter const* stripefooter, + host_span orc2gdf, + host_span types, + bool apply_struct_map, + std::vector& stream_info) +{ + uint64_t src_offset = 0; + uint64_t dst_offset = 0; + + for (auto const& stream : stripefooter->streams) { + if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { + dst_offset += stream.length; + continue; + } + + auto const column_id = *stream.column_id; + auto col = orc2gdf[column_id]; + + if (col == -1 and apply_struct_map) { + // A struct-type column has no data itself, but rather child columns + // for each of its fields. There is only a PRESENT stream, which + // needs to be included for the reader. 
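+      // For example (an illustrative schema, not one taken from this patch): with
+      // `struct<a:int, b:string>`, the STRUCT node itself maps to no output column
+      // (orc2gdf[column_id] == -1), yet its PRESENT stream encodes the struct's
+      // null mask, so the code below re-targets `col` to one of the child columns
+      // to make sure that stream is still collected.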
+      auto const schema_type = types[column_id];
+      if (not schema_type.subtypes.empty()) {
+        if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) {
+          for (auto const& idx : schema_type.subtypes) {
+            auto child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1;
+            if (child_idx >= 0) { col = child_idx; }
+          }
+        }
+      }
+    }
+
+    if (col != -1) {
+      stream_info.emplace_back(stripeinfo->offset + src_offset,
+                               dst_offset,
+                               stream.length,
+                               stripe_index,
+                               level,
+                               column_id,
+                               stream.kind);
+      dst_offset += stream.length;
+    }
+    src_offset += stream.length;
+  }
+
+  return dst_offset;
+}
+
+}  // namespace
+
+void reader::impl::query_stripe_compression_info()
+{
+  if (_file_itm_data->compinfo_ready) { return; }
+  if (_selected_columns.num_levels() == 0) { return; }
+
+  auto const rows_to_skip      = _file_itm_data->rows_to_skip;
+  auto const rows_to_read      = _file_itm_data->rows_to_read;
+  auto const& selected_stripes = _file_itm_data->selected_stripes;
+
+  // If no rows or stripes to read, return empty columns
+  // TODO : remove?
+  if (rows_to_read == 0 || selected_stripes.empty()) { return; }
+
+  auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data;
+  lvl_stripe_data.resize(_selected_columns.num_levels());
+
+  // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter.
+  std::unordered_map
+    stream_compinfo_map;
+
+  // Logically view streams as columns
+  std::vector<orc_stream_info> stream_info;
+  stream_info.reserve(selected_stripes.size() * selected_stripes.front().stripe_info.size());
+
+  // Iterates through levels of nested columns, child column will be one level down
+  // compared to parent column.
+  auto& col_meta = *_col_meta;
+  for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) {
+    auto& columns_level = _selected_columns.levels[level];
+    // Association between each ORC column and its cudf::column
+    col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1);
+
+    size_type col_id{0};
+    for (auto& col : columns_level) {
+      // Map each ORC column to its column
+      col_meta.orc_col_map[level][col.id] = col_id++;
+    }
+  }
+
+  for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) {
+    // Get the total number of stripes across all input files.
+    std::size_t total_num_stripes =
+      std::accumulate(selected_stripes.begin(),
+                      selected_stripes.end(),
+                      0,
+                      [](std::size_t sum, auto& stripe_source_mapping) {
+                        return sum + stripe_source_mapping.stripe_info.size();
+                      });
+
+    // Tracker for eventually deallocating compressed and uncompressed data
+    auto& stripe_data = lvl_stripe_data[level];
+
+    int stripe_idx = 0;
+
+    std::vector<std::pair<std::future<std::size_t>, std::size_t>> read_tasks;
+    for (auto const& stripe_source_mapping : selected_stripes) {
+      // Iterate through the source files selected stripes
+      for (auto const& stripe : stripe_source_mapping.stripe_info) {
+        auto const stripe_info   = stripe.first;
+        auto const stripe_footer = stripe.second;
+
+        auto stream_count          = stream_info.size();
+        auto const total_data_size = gather_stream_info(stripe_idx,
+                                                        level,
+                                                        stripe_info,
+                                                        stripe_footer,
+                                                        col_meta.orc_col_map[level],
+                                                        _metadata.get_types(),
+                                                        level == 0,
+                                                        stream_info);
+
+        auto const is_stripe_data_empty = total_data_size == 0;
+        CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0,
+                     "Invalid index rowgroup stream data");
+
+        // Buffer needs to be padded.
+        // Required by `copy_uncompressed_kernel`.
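+        // The allocation below is rounded up for padding, and the loop that
+        // follows coalesces streams that sit back-to-back in the file into single
+        // reads. For example (numbers are illustrative only), streams at offsets
+        // [1000,1100), [1100,1400) and [1400,1500) are fetched with one 500-byte
+        // read at offset 1000 instead of three separate calls.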
+        stripe_data.emplace_back(
+          cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), _stream);
+        auto dst_base = static_cast<uint8_t*>(stripe_data.back().data());
+
+        // Coalesce consecutive streams into one read
+        while (not is_stripe_data_empty and stream_count < stream_info.size()) {
+          auto const d_dst  = dst_base + stream_info[stream_count].dst_pos;
+          auto const offset = stream_info[stream_count].offset;
+          auto len          = stream_info[stream_count].length;
+          stream_count++;
+
+          while (stream_count < stream_info.size() &&
+                 stream_info[stream_count].offset == offset + len) {
+            len += stream_info[stream_count].length;
+            stream_count++;
+          }
+          if (_metadata.per_file_metadata[stripe_source_mapping.source_idx]
+                .source->is_device_read_preferred(len)) {
+            read_tasks.push_back(
+              std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx]
+                          .source->device_read_async(offset, len, d_dst, _stream),
+                        len));
+
+          } else {
+            auto const buffer =
+              _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read(
+                offset, len);
+            CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read.");
+            CUDF_CUDA_TRY(
+              cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value()));
+            _stream.synchronize();
+          }
+        }
+
+        stripe_idx++;
+      }
+    }
+    for (auto& task : read_tasks) {
+      CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read.");
+    }
+
+    if (stripe_data.empty()) { continue; }
+
+    // Setup row group descriptors if using indexes
+    if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
+      auto const& decompressor = *_metadata.per_file_metadata[0].decompressor;
+      cudf::detail::hostdevice_vector<gpu::CompressedStreamInfo> compinfo(
+        0, stream_info.size(), _stream);
+
+      for (auto const& info : stream_info) {
+        compinfo.push_back(gpu::CompressedStreamInfo(
+          static_cast<uint8_t const*>(stripe_data[info.stripe_idx].data()) + info.dst_pos,
+          info.length));
+        stream_compinfo_map[stream_id_info{
+          info.stripe_idx, info.level, info.orc_col_idx, info.kind}] =
+          &compinfo[compinfo.size() - 1];
+#ifdef PRINT_DEBUG
+        printf("collect stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n",
+               (int)info.stripe_idx,
+               (int)info.level,
+               (int)info.orc_col_idx,
+               (int)info.kind,
+               info.dst_pos,
+               info.length);
+        fflush(stdout);
+#endif
+      }
+
+      compinfo.host_to_device_async(_stream);
+
+      gpu::ParseCompressedStripeData(compinfo.device_ptr(),
+                                     compinfo.size(),
+                                     decompressor.GetBlockSize(),
+                                     decompressor.GetLog2MaxCompressionRatio(),
+                                     _stream);
+      compinfo.device_to_host_sync(_stream);
+
+      auto& compinfo_map = _file_itm_data->compinfo_map;
+      for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) {
+        compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks,
+                                   stream_compinfo->num_uncompressed_blocks,
+                                   stream_compinfo->max_uncompressed_size};
+#ifdef PRINT_DEBUG
+        printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n",
+               (int)stream_id.stripe_idx,
+               (int)stream_id.level,
+               (int)stream_id.orc_col_idx,
+               (int)stream_id.kind,
+               (size_t)stream_compinfo->num_compressed_blocks,
+               (size_t)stream_compinfo->num_uncompressed_blocks,
+               stream_compinfo->max_uncompressed_size);
+        fflush(stdout);
+#endif
+      }
+
+      // Must clear so we will not overwrite the old compression info stream_id.
+      stream_info.clear();
+      stream_compinfo_map.clear();
+
+    } else {
+      // printf("no compression \n");
+      // fflush(stdout);
+
+      // Set decompressed data size equal to the input size.
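+      // A minimal sketch of what this branch could cache (hypothetical code, not
+      // part of the original commit): with no compression, each stream's raw
+      // length already bounds its "decompressed" size, e.g.
+      //
+      //   for (auto const& info : stream_info) {
+      //     compinfo_map[stream_id_info{
+      //       info.stripe_idx, info.level, info.orc_col_idx, info.kind}] = {
+      //       0,             // num_compressed_blocks
+      //       0,             // num_uncompressed_blocks
+      //       info.length};  // max_uncompressed_size
+      //   }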
+      // TODO
+    }
+
+    // printf(" end level %d\n\n", (int)level);
+
+  }  // end loop level
+
+  // lvl_stripe_data.clear();
+  _file_itm_data->compinfo_ready = true;
+}
+
+}  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index c90f606da5a..c8743001928 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -30,6 +30,42 @@
 
 namespace cudf::io::orc::detail {
 
+/**
+ * @brief Struct that maps ORC streams to columns
+ */
+struct orc_stream_info {
+  explicit orc_stream_info(uint64_t offset_,
+                           std::size_t dst_pos_,
+                           uint32_t length_,
+                           uint32_t stripe_idx_,
+                           std::size_t level_,
+                           uint32_t orc_col_idx_,
+                           StreamKind kind_)
+    : offset(offset_),
+      dst_pos(dst_pos_),
+      length(length_),
+      stripe_idx(stripe_idx_),
+      level(level_),
+      orc_col_idx(orc_col_idx_),
+      kind(kind_)
+  {
+#ifdef PRINT_DEBUG
+    printf(" construct stripe id [%d, %d, %d, %d]\n",
+           (int)stripe_idx,
+           (int)level,
+           (int)orc_col_idx,
+           (int)kind);
+#endif
+  }
+  uint64_t offset;      // offset in file
+  std::size_t dst_pos;  // offset in memory relative to start of compressed stripe data
+  std::size_t length;   // length in file
+  uint32_t stripe_idx;  // stripe processing index, not stripe index in source
+  std::size_t level;    // TODO
+  uint32_t orc_col_idx;
+  StreamKind kind;
+};
+
 // unify this with orc_stream_info
 struct stream_id_info {
   std::size_t stripe_idx;
diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu
index 4f9dac15d98..c40b22e0b93 100644
--- a/cpp/src/io/orc/reader_impl_preprocess.cu
+++ b/cpp/src/io/orc/reader_impl_preprocess.cu
@@ -52,97 +52,6 @@ namespace cudf::io::orc::detail {
 
 namespace {
 
-/**
- * @brief Struct that maps ORC streams to columns
- */
-struct orc_stream_info {
-  explicit orc_stream_info(uint64_t offset_,
-                           std::size_t dst_pos_,
-                           uint32_t length_,
-                           uint32_t stripe_idx_,
-                           std::size_t level_,
-                           uint32_t orc_col_idx_,
-                           StreamKind kind_)
-    : offset(offset_),
-      dst_pos(dst_pos_),
-      length(length_),
-      stripe_idx(stripe_idx_),
-      level(level_),
-      orc_col_idx(orc_col_idx_),
-      kind(kind_)
-  {
-#ifdef PRINT_DEBUG
-    printf(" construct stripe id [%d, %d, %d, %d]\n",
-           (int)stripe_idx,
-           (int)level,
-           (int)orc_col_idx,
-           (int)kind);
-#endif
-  }
-  uint64_t offset;      // offset in file
-  std::size_t dst_pos;  // offset in memory relative to start of compressed stripe data
-  std::size_t length;   // length in file
-  uint32_t stripe_idx;  // stripe processing index, not stripe index in source
-  std::size_t level;    // TODO
-  uint32_t orc_col_idx;
-  StreamKind kind;
-};
-
-/**
- * @brief Function that populates column descriptors stream/chunk
- */
-std::size_t gather_stream_info(std::size_t stripe_index,
-                               std::size_t level,
-                               orc::StripeInformation const* stripeinfo,
-                               orc::StripeFooter const* stripefooter,
-                               host_span<int const> orc2gdf,
-                               host_span<orc::SchemaType const> types,
-                               bool apply_struct_map,
-                               std::vector<orc_stream_info>& stream_info)
-{
-  uint64_t src_offset = 0;
-  uint64_t dst_offset = 0;
-
-  for (auto const& stream : stripefooter->streams) {
-    if (!stream.column_id || *stream.column_id >= orc2gdf.size()) {
-      dst_offset += stream.length;
-      continue;
-    }
-
-    auto const column_id = *stream.column_id;
-    auto col             = orc2gdf[column_id];
-
-    if (col == -1 and apply_struct_map) {
-      // A struct-type column has no data itself, but rather child columns
-      // for each of its fields. There is only a PRESENT stream, which
-      // needs to be included for the reader.
- auto const schema_type = types[column_id]; - if (not schema_type.subtypes.empty()) { - if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { - for (auto const& idx : schema_type.subtypes) { - auto child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; - if (child_idx >= 0) { col = child_idx; } - } - } - } - } - - if (col != -1) { - stream_info.emplace_back(stripeinfo->offset + src_offset, - dst_offset, - stream.length, - stripe_index, - level, - column_id, - stream.kind); - dst_offset += stream.length; - } - src_offset += stream.length; - } - - return dst_offset; -} - /** * @brief Function that populates column descriptors stream/chunk */ @@ -839,198 +748,6 @@ void generate_offsets_for_list(host_span buff_data, rmm::cuda_ } // namespace -void reader::impl::query_stripe_compression_info() -{ - if (_file_itm_data->compinfo_ready) { return; } - if (_selected_columns.num_levels() == 0) { return; } - - auto const rows_to_skip = _file_itm_data->rows_to_skip; - auto const rows_to_read = _file_itm_data->rows_to_read; - auto const& selected_stripes = _file_itm_data->selected_stripes; - - // If no rows or stripes to read, return empty columns - // TODO : remove? - if (rows_to_read == 0 || selected_stripes.empty()) { return; } - - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; - lvl_stripe_data.resize(_selected_columns.num_levels()); - - // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. - std::unordered_map - stream_compinfo_map; - - // Logically view streams as columns - std::vector stream_info; - stream_info.reserve(selected_stripes.size() * selected_stripes.front().stripe_info.size()); - - // Iterates through levels of nested columns, child column will be one level down - // compared to parent column. - auto& col_meta = *_col_meta; - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - auto& columns_level = _selected_columns.levels[level]; - // Association between each ORC column and its cudf::column - col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); - - size_type col_id{0}; - for (auto& col : columns_level) { - // Map each ORC column to its column - col_meta.orc_col_map[level][col.id] = col_id++; - } - } - - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - // Get the total number of stripes across all input files. - std::size_t total_num_stripes = - std::accumulate(selected_stripes.begin(), - selected_stripes.end(), - 0, - [](std::size_t sum, auto& stripe_source_mapping) { - return sum + stripe_source_mapping.stripe_info.size(); - }); - - // Tracker for eventually deallocating compressed and uncompressed data - auto& stripe_data = lvl_stripe_data[level]; - - int stripe_idx = 0; - - std::vector, std::size_t>> read_tasks; - for (auto const& stripe_source_mapping : selected_stripes) { - // Iterate through the source files selected stripes - for (auto const& stripe : stripe_source_mapping.stripe_info) { - auto const stripe_info = stripe.first; - auto const stripe_footer = stripe.second; - - auto stream_count = stream_info.size(); - auto const total_data_size = gather_stream_info(stripe_idx, - level, - stripe_info, - stripe_footer, - col_meta.orc_col_map[level], - _metadata.get_types(), - level == 0, - stream_info); - - auto const is_stripe_data_empty = total_data_size == 0; - CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, - "Invalid index rowgroup stream data"); - - // Buffer needs to be padded. - // Required by `copy_uncompressed_kernel`. 
- stripe_data.emplace_back( - cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), _stream); - auto dst_base = static_cast(stripe_data.back().data()); - - // Coalesce consecutive streams into one read - while (not is_stripe_data_empty and stream_count < stream_info.size()) { - auto const d_dst = dst_base + stream_info[stream_count].dst_pos; - auto const offset = stream_info[stream_count].offset; - auto len = stream_info[stream_count].length; - stream_count++; - - while (stream_count < stream_info.size() && - stream_info[stream_count].offset == offset + len) { - len += stream_info[stream_count].length; - stream_count++; - } - if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->is_device_read_preferred(len)) { - read_tasks.push_back( - std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->device_read_async(offset, len, d_dst, _stream), - len)); - - } else { - auto const buffer = - _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read( - offset, len); - CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); - CUDF_CUDA_TRY( - cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); - _stream.synchronize(); - } - } - - stripe_idx++; - } - } - for (auto& task : read_tasks) { - CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); - } - - if (stripe_data.empty()) { continue; } - - // Setup row group descriptors if using indexes - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; - cudf::detail::hostdevice_vector compinfo( - 0, stream_info.size(), _stream); - - for (auto const& info : stream_info) { - compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, - info.length)); - stream_compinfo_map[stream_id_info{ - info.stripe_idx, info.level, info.orc_col_idx, info.kind}] = - &compinfo[compinfo.size() - 1]; -#ifdef PRINT_DEBUG - printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n", - (int)info.stripe_idx, - (int)info.level, - (int)info.orc_col_idx, - (int)info.kind, - info.dst_pos, - info.length); - fflush(stdout); -#endif - } - - compinfo.host_to_device_async(_stream); - - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - _stream); - compinfo.device_to_host_sync(_stream); - - auto& compinfo_map = _file_itm_data->compinfo_map; - for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) { - compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, - stream_compinfo->num_uncompressed_blocks, - stream_compinfo->max_uncompressed_size}; -#ifdef PRINT_DEBUG - printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n", - (int)stream_id.stripe_idx, - (int)stream_id.level, - (int)stream_id.orc_col_idx, - (int)stream_id.kind, - (size_t)stream_compinfo->num_compressed_blocks, - (size_t)stream_compinfo->num_uncompressed_blocks, - stream_compinfo->max_uncompressed_size); - fflush(stdout); -#endif - } - - // Must clear so we will not overwrite the old compression info stream_id. - stream_info.clear(); - stream_compinfo_map.clear(); - - } else { - // printf("no compression \n"); - // fflush(stdout); - - // Set decompressed data size equal to the input size. 
- // TODO - } - - // printf(" end level %d\n\n", (int)level); - - } // end loop level - - // lvl_stripe_data.clear(); - _file_itm_data->compinfo_ready = true; -} - void reader::impl::prepare_data(uint64_t skip_rows, std::optional const& num_rows_opt, std::vector> const& stripes) From 40af354fdf518418f110aeda7e651bbb7844339b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 27 Jan 2024 15:34:27 -0800 Subject: [PATCH 025/321] Cache stream_info Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.hpp | 5 +++ cpp/src/io/orc/reader_impl_chunking.cu | 7 ++-- cpp/src/io/orc/reader_impl_chunking.hpp | 2 + cpp/src/io/orc/reader_impl_preprocess.cu | 51 ++++++++++++++++-------- 4 files changed, 46 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index b0869125fe9..8130ac51f6d 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -77,6 +77,11 @@ class reader::impl { std::optional const& num_rows_opt, std::vector> const& stripes); + /** + * + */ + void create_pass_data(); + /** * @brief Compute stripe sizes. */ diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 028ea624749..5590f53858c 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -130,8 +130,7 @@ void reader::impl::query_stripe_compression_info() stream_compinfo_map; // Logically view streams as columns - std::vector stream_info; - stream_info.reserve(selected_stripes.size() * selected_stripes.front().stripe_info.size()); + _file_itm_data->lvl_stream_info.resize(_selected_columns.num_levels()); // Iterates through levels of nested columns, child column will be one level down // compared to parent column. @@ -149,6 +148,9 @@ void reader::impl::query_stripe_compression_info() } for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + auto& stream_info = _file_itm_data->lvl_stream_info[level]; + stream_info.reserve(selected_stripes.size() * selected_stripes.front().stripe_info.size()); + // Get the total number of stripes across all input files. std::size_t total_num_stripes = std::accumulate(selected_stripes.begin(), @@ -282,7 +284,6 @@ void reader::impl::query_stripe_compression_info() } // Must clear so we will not overwrite the old compression info stream_id. 
- stream_info.clear(); stream_compinfo_map.clear(); } else { diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index c8743001928..83954a7dd1b 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -106,6 +106,8 @@ struct file_intermediate_data { std::vector> lvl_stripe_data; std::vector>> null_count_prefix_sums; + std::vector> lvl_stream_info; + int64_t rows_to_skip; size_type rows_to_read; std::vector selected_stripes; diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index c40b22e0b93..44b3a138c69 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -65,7 +65,7 @@ std::size_t gather_stream_info_and_update_chunks( bool use_index, bool apply_struct_map, std::size_t* num_dictionary_entries, - std::vector& stream_info, + std::size_t* stream_idx, cudf::detail::hostdevice_2dvector& chunks) { uint64_t src_offset = 0; @@ -106,7 +106,7 @@ std::size_t gather_stream_info_and_update_chunks( if (child_idx >= 0) { col = child_idx; auto& chunk = chunks[stripe_index][col]; - chunk.strm_id[gpu::CI_PRESENT] = stream_info.size(); + chunk.strm_id[gpu::CI_PRESENT] = *stream_idx; chunk.strm_len[gpu::CI_PRESENT] = stream.length; } } @@ -118,7 +118,7 @@ std::size_t gather_stream_info_and_update_chunks( auto& chunk = chunks[stripe_index][col]; auto const index_type = get_stream_index_type(stream.kind); if (index_type < gpu::CI_NUM_STREAMS) { - chunk.strm_id[index_type] = stream_info.size(); + chunk.strm_id[index_type] = *stream_idx; chunk.strm_len[index_type] = stream.length; // NOTE: skip_count field is temporarily used to track the presence of index streams chunk.skip_count |= 1 << index_type; @@ -131,13 +131,7 @@ std::size_t gather_stream_info_and_update_chunks( } } - stream_info.emplace_back(stripeinfo->offset + src_offset, - dst_offset, - stream.length, - stripe_index, - level, - column_id, - stream.kind); + (*stream_idx)++; dst_offset += stream.length; } src_offset += stream.length; @@ -165,7 +159,7 @@ rmm::device_buffer decompress_stripe_data( compinfo_map, OrcDecompressor const& decompressor, host_span stripe_data, - host_span stream_info, + host_span stream_info, cudf::detail::hostdevice_2dvector& chunks, cudf::detail::hostdevice_2dvector& row_groups, std::size_t num_stripes, @@ -288,7 +282,7 @@ rmm::device_buffer decompress_stripe_data( compinfo[i].copy_in_ctl = inflate_in.data() + start_pos_uncomp; compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp; - stream_info[i].dst_pos = decomp_offset; + // stream_info[i].dst_pos = decomp_offset; decomp_offset += compinfo[i].max_uncompressed_size; start_pos += compinfo[i].num_compressed_blocks; start_pos_uncomp += compinfo[i].num_uncompressed_blocks; @@ -748,6 +742,31 @@ void generate_offsets_for_list(host_span buff_data, rmm::cuda_ } // namespace +void reader::impl::create_pass_data() +{ + auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; + lvl_stripe_data.resize(_selected_columns.num_levels()); + + auto const& selected_stripes = _file_itm_data->selected_stripes; + + // Logically view streams as columns + std::vector stream_info; + stream_info.reserve(selected_stripes.size() * selected_stripes.front().stripe_info.size()); + + auto& col_meta = *_col_meta; + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + auto& columns_level = _selected_columns.levels[level]; + // Association between each ORC column and its cudf::column + 
col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); + + size_type col_id{0}; + for (auto& col : columns_level) { + // Map each ORC column to its column + col_meta.orc_col_map[level][col.id] = col_id++; + } + } +} + void reader::impl::prepare_data(uint64_t skip_rows, std::optional const& num_rows_opt, std::vector> const& stripes) @@ -856,7 +875,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, (rows_to_skip == 0); // Logically view streams as columns - std::vector stream_info; + auto const& stream_info = _file_itm_data->lvl_stream_info[level]; null_count_prefix_sums.emplace_back(); null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); @@ -873,7 +892,8 @@ void reader::impl::prepare_data(uint64_t skip_rows, std::size_t stripe_start_row = 0; std::size_t num_dict_entries = 0; std::size_t num_rowgroups = 0; - int stripe_idx = 0; + std::size_t stripe_idx = 0; + std::size_t stream_idx = 0; // std::vector, std::size_t>> read_tasks; for (auto const& stripe_source_mapping : selected_stripes) { @@ -882,7 +902,6 @@ void reader::impl::prepare_data(uint64_t skip_rows, auto const stripe_info = stripe.first; auto const stripe_footer = stripe.second; - auto stream_count = stream_info.size(); auto const total_data_size = gather_stream_info_and_update_chunks(stripe_idx, level, @@ -893,7 +912,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, use_index, level == 0, &num_dict_entries, - stream_info, + &stream_idx, chunks); auto const is_stripe_data_empty = total_data_size == 0; From 10a598eeec539c9100dfd4bb2a9e28fc3b67ba43 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 27 Jan 2024 16:15:55 -0800 Subject: [PATCH 026/321] Cache everything Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 135 ++++++++++++++++---- cpp/src/io/orc/reader_impl_chunking.hpp | 4 +- cpp/src/io/orc/reader_impl_preprocess.cu | 155 ++++++----------------- 3 files changed, 151 insertions(+), 143 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 5590f53858c..19c76ada8f9 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -55,18 +55,36 @@ namespace { /** * @brief Function that populates column descriptors stream/chunk */ -std::size_t gather_stream_info(std::size_t stripe_index, - std::size_t level, - orc::StripeInformation const* stripeinfo, - orc::StripeFooter const* stripefooter, - host_span orc2gdf, - host_span types, - bool apply_struct_map, - std::vector& stream_info) +std::size_t gather_stream_info_and_update_chunks( + std::size_t stripe_index, + std::size_t level, + orc::StripeInformation const* stripeinfo, + orc::StripeFooter const* stripefooter, + host_span orc2gdf, + host_span types, + bool use_index, + bool apply_struct_map, + std::size_t* num_dictionary_entries, + std::vector& stream_info, + cudf::detail::hostdevice_2dvector& chunks) { uint64_t src_offset = 0; uint64_t dst_offset = 0; + auto const get_stream_index_type = [](orc::StreamKind kind) { + switch (kind) { + case orc::DATA: return gpu::CI_DATA; + case orc::LENGTH: + case orc::SECONDARY: return gpu::CI_DATA2; + case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY; + case orc::PRESENT: return gpu::CI_PRESENT; + case orc::ROW_INDEX: return gpu::CI_INDEX; + default: + // Skip this stream as it's not strictly required + return gpu::CI_NUM_STREAMS; + } + }; + for (auto const& stream : stripefooter->streams) { if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { dst_offset += 
stream.length; @@ -85,13 +103,34 @@ std::size_t gather_stream_info(std::size_t stripe_index, if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { for (auto const& idx : schema_type.subtypes) { auto child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; - if (child_idx >= 0) { col = child_idx; } + if (child_idx >= 0) { + col = child_idx; + auto& chunk = chunks[stripe_index][col]; + chunk.strm_id[gpu::CI_PRESENT] = stream_info.size(); + chunk.strm_len[gpu::CI_PRESENT] = stream.length; + } } } } } - if (col != -1) { + if (src_offset >= stripeinfo->indexLength || use_index) { + auto& chunk = chunks[stripe_index][col]; + auto const index_type = get_stream_index_type(stream.kind); + if (index_type < gpu::CI_NUM_STREAMS) { + chunk.strm_id[index_type] = stream_info.size(); + chunk.strm_len[index_type] = stream.length; + // NOTE: skip_count field is temporarily used to track the presence of index streams + chunk.skip_count |= 1 << index_type; + + if (index_type == gpu::CI_DICTIONARY) { + chunk.dictionary_start = *num_dictionary_entries; + chunk.dict_len = stripefooter->columns[column_id].dictionarySize; + *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize; + } + } + } + stream_info.emplace_back(stripeinfo->offset + src_offset, dst_offset, stream.length, @@ -122,8 +161,15 @@ void reader::impl::query_stripe_compression_info() // TODO : remove? if (rows_to_read == 0 || selected_stripes.empty()) { return; } - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; + auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; + auto& lvl_data_chunks = _file_itm_data->lvl_data_chunks; + auto& lvl_num_dict_entries = _file_itm_data->lvl_num_dict_entries; + auto& lvl_stripe_is_empty = _file_itm_data->lvl_stripe_is_empty; + lvl_stripe_data.resize(_selected_columns.num_levels()); + lvl_data_chunks.resize(_selected_columns.num_levels()); + lvl_num_dict_entries.resize(_selected_columns.num_levels()); + lvl_stripe_is_empty.resize(_selected_columns.num_levels()); // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. std::unordered_map @@ -148,9 +194,6 @@ void reader::impl::query_stripe_compression_info() } for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - auto& stream_info = _file_itm_data->lvl_stream_info[level]; - stream_info.reserve(selected_stripes.size() * selected_stripes.front().stripe_info.size()); - // Get the total number of stripes across all input files. 
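// For example (illustrative numbers), two input files contributing 3 and 5
// selected stripes give total_num_stripes == 8 below, and the per-level chunk
// descriptors are then allocated as an 8 x num_columns table.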
std::size_t total_num_stripes = std::accumulate(selected_stripes.begin(), @@ -159,11 +202,37 @@ void reader::impl::query_stripe_compression_info() [](std::size_t sum, auto& stripe_source_mapping) { return sum + stripe_source_mapping.stripe_info.size(); }); + auto& columns_level = _selected_columns.levels[level]; + auto const num_columns = columns_level.size(); + _file_itm_data->lvl_data_chunks[level] = + cudf::detail::hostdevice_2dvector(total_num_stripes, num_columns, _stream); + auto& chunks = _file_itm_data->lvl_data_chunks[level]; + memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); + + auto& stream_info = _file_itm_data->lvl_stream_info[level]; + stream_info.reserve(selected_stripes.size() * selected_stripes.front().stripe_info.size() * + num_columns); + + const bool use_index = + _use_index && + // Do stripes have row group index + _metadata.is_row_grp_idx_present() && + // Only use if we don't have much work with complete columns & stripes + // TODO: Consider nrows, gpu, and tune the threshold + (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && + _metadata.get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) && + // Only use if first row is aligned to a stripe boundary + // TODO: Fix logic to handle unaligned rows + (rows_to_skip == 0); // Tracker for eventually deallocating compressed and uncompressed data auto& stripe_data = lvl_stripe_data[level]; - int stripe_idx = 0; + lvl_stripe_is_empty[level].reserve(selected_stripes.size() * + selected_stripes.front().stripe_info.size()); + + std::size_t num_dict_entries = 0; + std::size_t stripe_idx = 0; std::vector, std::size_t>> read_tasks; for (auto const& stripe_source_mapping : selected_stripes) { @@ -172,15 +241,30 @@ void reader::impl::query_stripe_compression_info() auto const stripe_info = stripe.first; auto const stripe_footer = stripe.second; - auto stream_count = stream_info.size(); - auto const total_data_size = gather_stream_info(stripe_idx, - level, - stripe_info, - stripe_footer, - col_meta.orc_col_map[level], - _metadata.get_types(), - level == 0, - stream_info); + auto stream_count = stream_info.size(); + auto const total_data_size = + gather_stream_info_and_update_chunks(stripe_idx, + level, + stripe_info, + stripe_footer, + col_meta.orc_col_map[level], + _metadata.get_types(), + use_index, + level == 0, + &num_dict_entries, + stream_info, + chunks); + + lvl_stripe_is_empty[level].push_back(total_data_size == 0); + + // auto const total_data_size = gather_stream_info(stripe_idx, + // level, + // stripe_info, + // stripe_footer, + // col_meta.orc_col_map[level], + // _metadata.get_types(), + // level == 0, + // stream_info); auto const is_stripe_data_empty = total_data_size == 0; CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, @@ -225,12 +309,15 @@ void reader::impl::query_stripe_compression_info() stripe_idx++; } } + for (auto& task : read_tasks) { CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); } if (stripe_data.empty()) { continue; } + lvl_num_dict_entries[level] = num_dict_entries; + // Setup row group descriptors if using indexes if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 83954a7dd1b..fab218f0fde 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ 
b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -105,8 +105,10 @@ struct file_intermediate_data { std::vector> lvl_stripe_data; std::vector>> null_count_prefix_sums; - + std::vector> lvl_data_chunks; std::vector> lvl_stream_info; + std::vector lvl_num_dict_entries; + std::vector> lvl_stripe_is_empty; int64_t rows_to_skip; size_type rows_to_read; diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 44b3a138c69..328eb27b781 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -52,94 +52,6 @@ namespace cudf::io::orc::detail { namespace { -/** - * @brief Function that populates column descriptors stream/chunk - */ -std::size_t gather_stream_info_and_update_chunks( - std::size_t stripe_index, - std::size_t level, - orc::StripeInformation const* stripeinfo, - orc::StripeFooter const* stripefooter, - host_span orc2gdf, - host_span types, - bool use_index, - bool apply_struct_map, - std::size_t* num_dictionary_entries, - std::size_t* stream_idx, - cudf::detail::hostdevice_2dvector& chunks) -{ - uint64_t src_offset = 0; - uint64_t dst_offset = 0; - - auto const get_stream_index_type = [](orc::StreamKind kind) { - switch (kind) { - case orc::DATA: return gpu::CI_DATA; - case orc::LENGTH: - case orc::SECONDARY: return gpu::CI_DATA2; - case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY; - case orc::PRESENT: return gpu::CI_PRESENT; - case orc::ROW_INDEX: return gpu::CI_INDEX; - default: - // Skip this stream as it's not strictly required - return gpu::CI_NUM_STREAMS; - } - }; - - for (auto const& stream : stripefooter->streams) { - if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { - dst_offset += stream.length; - continue; - } - - auto const column_id = *stream.column_id; - auto col = orc2gdf[column_id]; - - if (col == -1 and apply_struct_map) { - // A struct-type column has no data itself, but rather child columns - // for each of its fields. There is only a PRESENT stream, which - // needs to be included for the reader. - auto const schema_type = types[column_id]; - if (not schema_type.subtypes.empty()) { - if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { - for (auto const& idx : schema_type.subtypes) { - auto child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; - if (child_idx >= 0) { - col = child_idx; - auto& chunk = chunks[stripe_index][col]; - chunk.strm_id[gpu::CI_PRESENT] = *stream_idx; - chunk.strm_len[gpu::CI_PRESENT] = stream.length; - } - } - } - } - } - if (col != -1) { - if (src_offset >= stripeinfo->indexLength || use_index) { - auto& chunk = chunks[stripe_index][col]; - auto const index_type = get_stream_index_type(stream.kind); - if (index_type < gpu::CI_NUM_STREAMS) { - chunk.strm_id[index_type] = *stream_idx; - chunk.strm_len[index_type] = stream.length; - // NOTE: skip_count field is temporarily used to track the presence of index streams - chunk.skip_count |= 1 << index_type; - - if (index_type == gpu::CI_DICTIONARY) { - chunk.dictionary_start = *num_dictionary_entries; - chunk.dict_len = stripefooter->columns[column_id].dictionarySize; - *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize; - } - } - } - - (*stream_idx)++; - dst_offset += stream.length; - } - src_offset += stream.length; - } - - return dst_offset; -} - /** * @brief Decompresses the stripe data, at stream granularity. 
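* As far as the surrounding changes show, the per-stream block counts cached in
* `compinfo_map` by query_stripe_compression_info() are looked up here by
* {stripe_idx, level, orc_col_idx, kind}, so the compressed streams do not need
* to be parsed a second time to size the decompression scratch buffer.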
* @@ -858,9 +770,10 @@ void reader::impl::prepare_data(uint64_t skip_rows, return sum + stripe_source_mapping.stripe_info.size(); }); auto const num_columns = columns_level.size(); - cudf::detail::hostdevice_2dvector chunks( - total_num_stripes, num_columns, _stream); - memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); + + auto& lvl_chunks = _file_itm_data->lvl_data_chunks[level]; + auto& lvl_num_dict_entries = _file_itm_data->lvl_num_dict_entries; + auto& lvl_stripe_is_empty = _file_itm_data->lvl_stripe_is_empty[level]; const bool use_index = _use_index && @@ -890,10 +803,11 @@ void reader::impl::prepare_data(uint64_t skip_rows, auto& stripe_data = lvl_stripe_data[level]; std::size_t stripe_start_row = 0; - std::size_t num_dict_entries = 0; - std::size_t num_rowgroups = 0; - std::size_t stripe_idx = 0; - std::size_t stream_idx = 0; + // std::size_t num_dict_entries = 0; + auto const num_dict_entries = lvl_num_dict_entries[level]; + std::size_t num_rowgroups = 0; + std::size_t stripe_idx = 0; + // std::size_t stream_idx = 0; // std::vector, std::size_t>> read_tasks; for (auto const& stripe_source_mapping : selected_stripes) { @@ -902,20 +816,20 @@ void reader::impl::prepare_data(uint64_t skip_rows, auto const stripe_info = stripe.first; auto const stripe_footer = stripe.second; - auto const total_data_size = - gather_stream_info_and_update_chunks(stripe_idx, - level, - stripe_info, - stripe_footer, - col_meta.orc_col_map[level], - _metadata.get_types(), - use_index, - level == 0, - &num_dict_entries, - &stream_idx, - chunks); - - auto const is_stripe_data_empty = total_data_size == 0; + // auto const total_data_size = + // gather_stream_info_and_update_chunks(stripe_idx, + // level, + // stripe_info, + // stripe_footer, + // col_meta.orc_col_map[level], + // _metadata.get_types(), + // use_index, + // level == 0, + // &num_dict_entries, + // &stream_idx, + // chunks); + + auto const is_stripe_data_empty = lvl_stripe_is_empty[stripe_idx]; CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); @@ -930,7 +844,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, } // Update chunks to reference streams pointers for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { - auto& chunk = chunks[stripe_idx][col_idx]; + auto& chunk = lvl_chunks[stripe_idx][col_idx]; // start row, number of rows in a each stripe and total number of rows // may change in lower levels of nesting chunk.start_row = (level == 0) @@ -1010,7 +924,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, *_metadata.per_file_metadata[0].decompressor, stripe_data, stream_info, - chunks, + lvl_chunks, row_groups, total_num_stripes, _metadata.get_row_index_stride(), @@ -1020,12 +934,12 @@ void reader::impl::prepare_data(uint64_t skip_rows, stripe_data.push_back(std::move(decomp_data)); } else { if (row_groups.size().first) { - chunks.host_to_device_async(_stream); + lvl_chunks.host_to_device_async(_stream); row_groups.host_to_device_async(_stream); row_groups.host_to_device_async(_stream); gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), nullptr, - chunks.base_device_ptr(), + lvl_chunks.base_device_ptr(), num_columns, total_num_stripes, num_rowgroups, @@ -1038,7 +952,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, for (std::size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; for (std::size_t j = 0; j < total_num_stripes; ++j) { - if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { + if 
(lvl_chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { is_nullable = true; break; } @@ -1055,7 +969,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, _metadata.get_row_index_stride(), level, tz_table->view(), - chunks, + lvl_chunks, row_groups, _out_buffers[level], _stream, @@ -1063,11 +977,16 @@ void reader::impl::prepare_data(uint64_t skip_rows, if (nested_cols.size()) { // Extract information to process nested child columns - scan_null_counts(chunks, null_count_prefix_sums[level], _stream); + scan_null_counts(lvl_chunks, null_count_prefix_sums[level], _stream); row_groups.device_to_host_sync(_stream); - aggregate_child_meta( - level, _selected_columns, chunks, row_groups, nested_cols, _out_buffers[level], col_meta); + aggregate_child_meta(level, + _selected_columns, + lvl_chunks, + row_groups, + nested_cols, + _out_buffers[level], + col_meta); // ORC stores number of elements at each row, so we need to generate offsets from that std::vector buff_data; From 80d41db68feebcd53bbc1ea48073977e539f6324 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 27 Jan 2024 18:15:01 -0800 Subject: [PATCH 027/321] Revert "Cache everything" This reverts commit 10a598eeec539c9100dfd4bb2a9e28fc3b67ba43. --- cpp/src/io/orc/reader_impl_chunking.cu | 135 ++++---------------- cpp/src/io/orc/reader_impl_chunking.hpp | 4 +- cpp/src/io/orc/reader_impl_preprocess.cu | 155 +++++++++++++++++------ 3 files changed, 143 insertions(+), 151 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 19c76ada8f9..5590f53858c 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -55,36 +55,18 @@ namespace { /** * @brief Function that populates column descriptors stream/chunk */ -std::size_t gather_stream_info_and_update_chunks( - std::size_t stripe_index, - std::size_t level, - orc::StripeInformation const* stripeinfo, - orc::StripeFooter const* stripefooter, - host_span orc2gdf, - host_span types, - bool use_index, - bool apply_struct_map, - std::size_t* num_dictionary_entries, - std::vector& stream_info, - cudf::detail::hostdevice_2dvector& chunks) +std::size_t gather_stream_info(std::size_t stripe_index, + std::size_t level, + orc::StripeInformation const* stripeinfo, + orc::StripeFooter const* stripefooter, + host_span orc2gdf, + host_span types, + bool apply_struct_map, + std::vector& stream_info) { uint64_t src_offset = 0; uint64_t dst_offset = 0; - auto const get_stream_index_type = [](orc::StreamKind kind) { - switch (kind) { - case orc::DATA: return gpu::CI_DATA; - case orc::LENGTH: - case orc::SECONDARY: return gpu::CI_DATA2; - case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY; - case orc::PRESENT: return gpu::CI_PRESENT; - case orc::ROW_INDEX: return gpu::CI_INDEX; - default: - // Skip this stream as it's not strictly required - return gpu::CI_NUM_STREAMS; - } - }; - for (auto const& stream : stripefooter->streams) { if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { dst_offset += stream.length; @@ -103,34 +85,13 @@ std::size_t gather_stream_info_and_update_chunks( if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { for (auto const& idx : schema_type.subtypes) { auto child_idx = (idx < orc2gdf.size()) ? 
orc2gdf[idx] : -1; - if (child_idx >= 0) { - col = child_idx; - auto& chunk = chunks[stripe_index][col]; - chunk.strm_id[gpu::CI_PRESENT] = stream_info.size(); - chunk.strm_len[gpu::CI_PRESENT] = stream.length; - } + if (child_idx >= 0) { col = child_idx; } } } } } - if (col != -1) { - if (src_offset >= stripeinfo->indexLength || use_index) { - auto& chunk = chunks[stripe_index][col]; - auto const index_type = get_stream_index_type(stream.kind); - if (index_type < gpu::CI_NUM_STREAMS) { - chunk.strm_id[index_type] = stream_info.size(); - chunk.strm_len[index_type] = stream.length; - // NOTE: skip_count field is temporarily used to track the presence of index streams - chunk.skip_count |= 1 << index_type; - - if (index_type == gpu::CI_DICTIONARY) { - chunk.dictionary_start = *num_dictionary_entries; - chunk.dict_len = stripefooter->columns[column_id].dictionarySize; - *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize; - } - } - } + if (col != -1) { stream_info.emplace_back(stripeinfo->offset + src_offset, dst_offset, stream.length, @@ -161,15 +122,8 @@ void reader::impl::query_stripe_compression_info() // TODO : remove? if (rows_to_read == 0 || selected_stripes.empty()) { return; } - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; - auto& lvl_data_chunks = _file_itm_data->lvl_data_chunks; - auto& lvl_num_dict_entries = _file_itm_data->lvl_num_dict_entries; - auto& lvl_stripe_is_empty = _file_itm_data->lvl_stripe_is_empty; - + auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; lvl_stripe_data.resize(_selected_columns.num_levels()); - lvl_data_chunks.resize(_selected_columns.num_levels()); - lvl_num_dict_entries.resize(_selected_columns.num_levels()); - lvl_stripe_is_empty.resize(_selected_columns.num_levels()); // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. std::unordered_map @@ -194,6 +148,9 @@ void reader::impl::query_stripe_compression_info() } for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + auto& stream_info = _file_itm_data->lvl_stream_info[level]; + stream_info.reserve(selected_stripes.size() * selected_stripes.front().stripe_info.size()); + // Get the total number of stripes across all input files. 
std::size_t total_num_stripes = std::accumulate(selected_stripes.begin(), @@ -202,37 +159,11 @@ void reader::impl::query_stripe_compression_info() [](std::size_t sum, auto& stripe_source_mapping) { return sum + stripe_source_mapping.stripe_info.size(); }); - auto& columns_level = _selected_columns.levels[level]; - auto const num_columns = columns_level.size(); - _file_itm_data->lvl_data_chunks[level] = - cudf::detail::hostdevice_2dvector(total_num_stripes, num_columns, _stream); - auto& chunks = _file_itm_data->lvl_data_chunks[level]; - memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); - - auto& stream_info = _file_itm_data->lvl_stream_info[level]; - stream_info.reserve(selected_stripes.size() * selected_stripes.front().stripe_info.size() * - num_columns); - - const bool use_index = - _use_index && - // Do stripes have row group index - _metadata.is_row_grp_idx_present() && - // Only use if we don't have much work with complete columns & stripes - // TODO: Consider nrows, gpu, and tune the threshold - (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && - _metadata.get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) && - // Only use if first row is aligned to a stripe boundary - // TODO: Fix logic to handle unaligned rows - (rows_to_skip == 0); // Tracker for eventually deallocating compressed and uncompressed data auto& stripe_data = lvl_stripe_data[level]; - lvl_stripe_is_empty[level].reserve(selected_stripes.size() * - selected_stripes.front().stripe_info.size()); - - std::size_t num_dict_entries = 0; - std::size_t stripe_idx = 0; + int stripe_idx = 0; std::vector, std::size_t>> read_tasks; for (auto const& stripe_source_mapping : selected_stripes) { @@ -241,30 +172,15 @@ void reader::impl::query_stripe_compression_info() auto const stripe_info = stripe.first; auto const stripe_footer = stripe.second; - auto stream_count = stream_info.size(); - auto const total_data_size = - gather_stream_info_and_update_chunks(stripe_idx, - level, - stripe_info, - stripe_footer, - col_meta.orc_col_map[level], - _metadata.get_types(), - use_index, - level == 0, - &num_dict_entries, - stream_info, - chunks); - - lvl_stripe_is_empty[level].push_back(total_data_size == 0); - - // auto const total_data_size = gather_stream_info(stripe_idx, - // level, - // stripe_info, - // stripe_footer, - // col_meta.orc_col_map[level], - // _metadata.get_types(), - // level == 0, - // stream_info); + auto stream_count = stream_info.size(); + auto const total_data_size = gather_stream_info(stripe_idx, + level, + stripe_info, + stripe_footer, + col_meta.orc_col_map[level], + _metadata.get_types(), + level == 0, + stream_info); auto const is_stripe_data_empty = total_data_size == 0; CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, @@ -309,15 +225,12 @@ void reader::impl::query_stripe_compression_info() stripe_idx++; } } - for (auto& task : read_tasks) { CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); } if (stripe_data.empty()) { continue; } - lvl_num_dict_entries[level] = num_dict_entries; - // Setup row group descriptors if using indexes if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index fab218f0fde..83954a7dd1b 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ 
b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -105,10 +105,8 @@ struct file_intermediate_data { std::vector> lvl_stripe_data; std::vector>> null_count_prefix_sums; - std::vector> lvl_data_chunks; + std::vector> lvl_stream_info; - std::vector lvl_num_dict_entries; - std::vector> lvl_stripe_is_empty; int64_t rows_to_skip; size_type rows_to_read; diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 328eb27b781..44b3a138c69 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -52,6 +52,94 @@ namespace cudf::io::orc::detail { namespace { +/** + * @brief Function that populates column descriptors stream/chunk + */ +std::size_t gather_stream_info_and_update_chunks( + std::size_t stripe_index, + std::size_t level, + orc::StripeInformation const* stripeinfo, + orc::StripeFooter const* stripefooter, + host_span orc2gdf, + host_span types, + bool use_index, + bool apply_struct_map, + std::size_t* num_dictionary_entries, + std::size_t* stream_idx, + cudf::detail::hostdevice_2dvector& chunks) +{ + uint64_t src_offset = 0; + uint64_t dst_offset = 0; + + auto const get_stream_index_type = [](orc::StreamKind kind) { + switch (kind) { + case orc::DATA: return gpu::CI_DATA; + case orc::LENGTH: + case orc::SECONDARY: return gpu::CI_DATA2; + case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY; + case orc::PRESENT: return gpu::CI_PRESENT; + case orc::ROW_INDEX: return gpu::CI_INDEX; + default: + // Skip this stream as it's not strictly required + return gpu::CI_NUM_STREAMS; + } + }; + + for (auto const& stream : stripefooter->streams) { + if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { + dst_offset += stream.length; + continue; + } + + auto const column_id = *stream.column_id; + auto col = orc2gdf[column_id]; + + if (col == -1 and apply_struct_map) { + // A struct-type column has no data itself, but rather child columns + // for each of its fields. There is only a PRESENT stream, which + // needs to be included for the reader. + auto const schema_type = types[column_id]; + if (not schema_type.subtypes.empty()) { + if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { + for (auto const& idx : schema_type.subtypes) { + auto child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; + if (child_idx >= 0) { + col = child_idx; + auto& chunk = chunks[stripe_index][col]; + chunk.strm_id[gpu::CI_PRESENT] = *stream_idx; + chunk.strm_len[gpu::CI_PRESENT] = stream.length; + } + } + } + } + } + if (col != -1) { + if (src_offset >= stripeinfo->indexLength || use_index) { + auto& chunk = chunks[stripe_index][col]; + auto const index_type = get_stream_index_type(stream.kind); + if (index_type < gpu::CI_NUM_STREAMS) { + chunk.strm_id[index_type] = *stream_idx; + chunk.strm_len[index_type] = stream.length; + // NOTE: skip_count field is temporarily used to track the presence of index streams + chunk.skip_count |= 1 << index_type; + + if (index_type == gpu::CI_DICTIONARY) { + chunk.dictionary_start = *num_dictionary_entries; + chunk.dict_len = stripefooter->columns[column_id].dictionarySize; + *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize; + } + } + } + + (*stream_idx)++; + dst_offset += stream.length; + } + src_offset += stream.length; + } + + return dst_offset; +} + /** * @brief Decompresses the stripe data, at stream granularity. 
* @@ -770,10 +858,9 @@ void reader::impl::prepare_data(uint64_t skip_rows, return sum + stripe_source_mapping.stripe_info.size(); }); auto const num_columns = columns_level.size(); - - auto& lvl_chunks = _file_itm_data->lvl_data_chunks[level]; - auto& lvl_num_dict_entries = _file_itm_data->lvl_num_dict_entries; - auto& lvl_stripe_is_empty = _file_itm_data->lvl_stripe_is_empty[level]; + cudf::detail::hostdevice_2dvector chunks( + total_num_stripes, num_columns, _stream); + memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); const bool use_index = _use_index && @@ -803,11 +890,10 @@ void reader::impl::prepare_data(uint64_t skip_rows, auto& stripe_data = lvl_stripe_data[level]; std::size_t stripe_start_row = 0; - // std::size_t num_dict_entries = 0; - auto const num_dict_entries = lvl_num_dict_entries[level]; - std::size_t num_rowgroups = 0; - std::size_t stripe_idx = 0; - // std::size_t stream_idx = 0; + std::size_t num_dict_entries = 0; + std::size_t num_rowgroups = 0; + std::size_t stripe_idx = 0; + std::size_t stream_idx = 0; // std::vector, std::size_t>> read_tasks; for (auto const& stripe_source_mapping : selected_stripes) { @@ -816,20 +902,20 @@ void reader::impl::prepare_data(uint64_t skip_rows, auto const stripe_info = stripe.first; auto const stripe_footer = stripe.second; - // auto const total_data_size = - // gather_stream_info_and_update_chunks(stripe_idx, - // level, - // stripe_info, - // stripe_footer, - // col_meta.orc_col_map[level], - // _metadata.get_types(), - // use_index, - // level == 0, - // &num_dict_entries, - // &stream_idx, - // chunks); - - auto const is_stripe_data_empty = lvl_stripe_is_empty[stripe_idx]; + auto const total_data_size = + gather_stream_info_and_update_chunks(stripe_idx, + level, + stripe_info, + stripe_footer, + col_meta.orc_col_map[level], + _metadata.get_types(), + use_index, + level == 0, + &num_dict_entries, + &stream_idx, + chunks); + + auto const is_stripe_data_empty = total_data_size == 0; CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); @@ -844,7 +930,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, } // Update chunks to reference streams pointers for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { - auto& chunk = lvl_chunks[stripe_idx][col_idx]; + auto& chunk = chunks[stripe_idx][col_idx]; // start row, number of rows in a each stripe and total number of rows // may change in lower levels of nesting chunk.start_row = (level == 0) @@ -924,7 +1010,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, *_metadata.per_file_metadata[0].decompressor, stripe_data, stream_info, - lvl_chunks, + chunks, row_groups, total_num_stripes, _metadata.get_row_index_stride(), @@ -934,12 +1020,12 @@ void reader::impl::prepare_data(uint64_t skip_rows, stripe_data.push_back(std::move(decomp_data)); } else { if (row_groups.size().first) { - lvl_chunks.host_to_device_async(_stream); + chunks.host_to_device_async(_stream); row_groups.host_to_device_async(_stream); row_groups.host_to_device_async(_stream); gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), nullptr, - lvl_chunks.base_device_ptr(), + chunks.base_device_ptr(), num_columns, total_num_stripes, num_rowgroups, @@ -952,7 +1038,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, for (std::size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; for (std::size_t j = 0; j < total_num_stripes; ++j) { - if (lvl_chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { + if 
(chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { is_nullable = true; break; } @@ -969,7 +1055,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, _metadata.get_row_index_stride(), level, tz_table->view(), - lvl_chunks, + chunks, row_groups, _out_buffers[level], _stream, @@ -977,16 +1063,11 @@ void reader::impl::prepare_data(uint64_t skip_rows, if (nested_cols.size()) { // Extract information to process nested child columns - scan_null_counts(lvl_chunks, null_count_prefix_sums[level], _stream); + scan_null_counts(chunks, null_count_prefix_sums[level], _stream); row_groups.device_to_host_sync(_stream); - aggregate_child_meta(level, - _selected_columns, - lvl_chunks, - row_groups, - nested_cols, - _out_buffers[level], - col_meta); + aggregate_child_meta( + level, _selected_columns, chunks, row_groups, nested_cols, _out_buffers[level], col_meta); // ORC stores number of elements at each row, so we need to generate offsets from that std::vector buff_data; From 2bbe9eef190447ecb4bb6868ad21b21700ff0e21 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 27 Jan 2024 20:26:01 -0800 Subject: [PATCH 028/321] Rewrite stripe selection Signed-off-by: Nghia Truong --- cpp/src/io/orc/aggregate_orc_metadata.cpp | 44 ++++-- cpp/src/io/orc/aggregate_orc_metadata.hpp | 12 +- cpp/src/io/orc/orc.hpp | 8 +- cpp/src/io/orc/reader_impl_chunking.cu | 119 +++++++-------- cpp/src/io/orc/reader_impl_chunking.hpp | 2 +- cpp/src/io/orc/reader_impl_preprocess.cu | 173 ++++++++++------------ 6 files changed, 177 insertions(+), 181 deletions(-) diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 02bf74e9c01..6be812d4604 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -152,7 +152,7 @@ aggregate_orc_metadata::aggregate_orc_metadata( } } -std::tuple> +std::tuple> aggregate_orc_metadata::select_stripes( std::vector> const& user_specified_stripes, uint64_t skip_rows, @@ -167,7 +167,16 @@ aggregate_orc_metadata::select_stripes( return cudf::io::detail::skip_rows_num_rows_from_options(skip_rows, num_rows, get_num_rows()); }(); - std::vector selected_stripes_mapping; + struct stripe_source_mapping { + stripe_source_mapping(int source_idx, std::vector&& stripe_info) + : source_idx(source_idx), stripe_info(std::move(stripe_info)) + { + } + int source_idx; + std::vector stripe_info; + }; + + std::vector selected_stripes_mapping; if (!user_specified_stripes.empty()) { CUDF_EXPECTS(user_specified_stripes.size() == per_file_metadata.size(), @@ -176,7 +185,7 @@ aggregate_orc_metadata::select_stripes( // Each vector entry represents a source file; each nested vector represents the // user_defined_stripes to get from that source file for (size_t src_file_idx = 0; src_file_idx < user_specified_stripes.size(); ++src_file_idx) { - std::vector stripe_infos; + std::vector stripe_infos; // Coalesce stripe info at the source file later since that makes downstream processing much // easier in impl::read @@ -185,13 +194,15 @@ aggregate_orc_metadata::select_stripes( stripe_idx >= 0 and stripe_idx < static_cast( per_file_metadata[src_file_idx].ff.stripes.size()), "Invalid stripe index"); - stripe_infos.push_back( - std::pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); + stripe_infos.push_back({&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], + nullptr, + static_cast(src_file_idx)}); // TODO: check for overflow here. 
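// NOTE (sketch only, not part of this patch): one possible form of the overflow check
// the TODO above asks for, assuming the row total must later fit in cudf::size_type
// (int32_t); the exact guard and message are illustrative:
//
//   auto const stripe_rows =
//     per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows;
//   CUDF_EXPECTS(rows_to_read + stripe_rows <=
//                  static_cast<uint64_t>(std::numeric_limits<cudf::size_type>::max()),
//                "Total number of rows to read exceeds the size_type limit");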
rows_to_read += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; } - selected_stripes_mapping.push_back({static_cast(src_file_idx), stripe_infos}); + selected_stripes_mapping.emplace_back(static_cast(src_file_idx), + std::move(stripe_infos)); } } else { uint64_t count = 0; @@ -200,33 +211,37 @@ aggregate_orc_metadata::select_stripes( for (size_t src_file_idx = 0; src_file_idx < per_file_metadata.size() && count < rows_to_skip + rows_to_read; ++src_file_idx) { - std::vector stripe_infos; + std::vector stripe_infos; for (size_t stripe_idx = 0; stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size() && count < rows_to_skip + rows_to_read; ++stripe_idx) { count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; if (count > rows_to_skip || count == 0) { - stripe_infos.push_back( - std::pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); + stripe_infos.push_back({&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], + nullptr, + static_cast(src_file_idx)}); } else { stripe_skip_rows = count; } } - selected_stripes_mapping.push_back({static_cast(src_file_idx), stripe_infos}); + selected_stripes_mapping.emplace_back(static_cast(src_file_idx), + std::move(stripe_infos)); } // Need to remove skipped rows from the stripes which are not selected. rows_to_skip -= stripe_skip_rows; } + std::vector output; + // Read each stripe's stripefooter metadata for (auto& mapping : selected_stripes_mapping) { // Resize to all stripe_info for the source level per_file_metadata[mapping.source_idx].stripefooters.resize(mapping.stripe_info.size()); for (size_t i = 0; i < mapping.stripe_info.size(); i++) { - auto const stripe = mapping.stripe_info[i].first; + auto const stripe = mapping.stripe_info[i].stripe_info; auto const sf_comp_offset = stripe->offset + stripe->indexLength + stripe->dataLength; auto const sf_comp_length = stripe->footerLength; CUDF_EXPECTS( @@ -238,12 +253,15 @@ aggregate_orc_metadata::select_stripes( {buffer->data(), buffer->size()}, stream); ProtobufReader(sf_data.data(), sf_data.size()) .read(per_file_metadata[mapping.source_idx].stripefooters[i]); - mapping.stripe_info[i].second = &per_file_metadata[mapping.source_idx].stripefooters[i]; + mapping.stripe_info[i].stripe_footer = + &per_file_metadata[mapping.source_idx].stripefooters[i]; if (stripe->indexLength == 0) { row_grp_idx_present = false; } } + + output.insert(output.end(), mapping.stripe_info.begin(), mapping.stripe_info.end()); } - return {rows_to_skip, rows_to_read, selected_stripes_mapping}; + return {rows_to_skip, rows_to_read, std::move(output)}; } column_hierarchy aggregate_orc_metadata::select_columns( diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp index f05946a4346..f6bba46b4c8 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.hpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp @@ -45,8 +45,6 @@ struct column_hierarchy { * to aggregate that metadata from all the files. */ class aggregate_orc_metadata { - using OrcStripeInfo = std::pair; - /** * @brief Sums up the number of rows of each source */ @@ -113,11 +111,11 @@ class aggregate_orc_metadata { * * Stripes are potentially selected from multiple files. 
*/ - [[nodiscard]] std::tuple> - select_stripes(std::vector> const& user_specified_stripes, - uint64_t skip_rows, - std::optional const& num_rows, - rmm::cuda_stream_view stream); + [[nodiscard]] std::tuple> select_stripes( + std::vector> const& user_specified_stripes, + uint64_t skip_rows, + std::optional const& num_rows, + rmm::cuda_stream_view stream); /** * @brief Filters ORC file to a selection of columns, based on their paths in the file. diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 4f3e0a82768..d17291d4acb 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -601,13 +601,13 @@ struct column_validity_info { * convenience methods for initializing and accessing metadata. */ class metadata { - using OrcStripeInfo = std::pair; - public: - struct stripe_source_mapping { + struct OrcStripeInfo { + StripeInformation const* stripe_info; + StripeFooter const* stripe_footer; int source_idx; - std::vector stripe_info; }; + std::vector stripe_info; public: explicit metadata(datasource* const src, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 5590f53858c..e58e804d449 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -148,17 +148,12 @@ void reader::impl::query_stripe_compression_info() } for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - auto& stream_info = _file_itm_data->lvl_stream_info[level]; - stream_info.reserve(selected_stripes.size() * selected_stripes.front().stripe_info.size()); + auto& stream_info = _file_itm_data->lvl_stream_info[level]; + auto const num_columns = _selected_columns.levels[level].size(); + stream_info.reserve(selected_stripes.size() * num_columns); // Get the total number of stripes across all input files. - std::size_t total_num_stripes = - std::accumulate(selected_stripes.begin(), - selected_stripes.end(), - 0, - [](std::size_t sum, auto& stripe_source_mapping) { - return sum + stripe_source_mapping.stripe_info.size(); - }); + std::size_t total_num_stripes = selected_stripes.size(); // Tracker for eventually deallocating compressed and uncompressed data auto& stripe_data = lvl_stripe_data[level]; @@ -166,65 +161,61 @@ void reader::impl::query_stripe_compression_info() int stripe_idx = 0; std::vector, std::size_t>> read_tasks; - for (auto const& stripe_source_mapping : selected_stripes) { - // Iterate through the source files selected stripes - for (auto const& stripe : stripe_source_mapping.stripe_info) { - auto const stripe_info = stripe.first; - auto const stripe_footer = stripe.second; - - auto stream_count = stream_info.size(); - auto const total_data_size = gather_stream_info(stripe_idx, - level, - stripe_info, - stripe_footer, - col_meta.orc_col_map[level], - _metadata.get_types(), - level == 0, - stream_info); - - auto const is_stripe_data_empty = total_data_size == 0; - CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, - "Invalid index rowgroup stream data"); - - // Buffer needs to be padded. - // Required by `copy_uncompressed_kernel`. 
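// NOTE (illustrative only): round_up_safe below pads each stripe buffer to a multiple of
// BUFFER_PADDING_MULTIPLE, presumably so copy_uncompressed_kernel can issue aligned,
// word-sized accesses without running off the end; e.g. with a hypothetical multiple of
// 8, round_up_safe(1001, 8) == 1008 bytes are allocated for a 1001-byte stripe.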
- stripe_data.emplace_back( - cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), _stream); - auto dst_base = static_cast(stripe_data.back().data()); - - // Coalesce consecutive streams into one read - while (not is_stripe_data_empty and stream_count < stream_info.size()) { - auto const d_dst = dst_base + stream_info[stream_count].dst_pos; - auto const offset = stream_info[stream_count].offset; - auto len = stream_info[stream_count].length; + for (auto const& stripe : selected_stripes) { + auto const stripe_info = stripe.stripe_info; + auto const stripe_footer = stripe.stripe_footer; + + auto stream_count = stream_info.size(); + auto const total_data_size = gather_stream_info(stripe_idx, + level, + stripe_info, + stripe_footer, + col_meta.orc_col_map[level], + _metadata.get_types(), + level == 0, + stream_info); + + auto const is_stripe_data_empty = total_data_size == 0; + CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, + "Invalid index rowgroup stream data"); + + // Buffer needs to be padded. + // Required by `copy_uncompressed_kernel`. + stripe_data.emplace_back(cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), + _stream); + auto dst_base = static_cast(stripe_data.back().data()); + + // Coalesce consecutive streams into one read + while (not is_stripe_data_empty and stream_count < stream_info.size()) { + auto const d_dst = dst_base + stream_info[stream_count].dst_pos; + auto const offset = stream_info[stream_count].offset; + auto len = stream_info[stream_count].length; + stream_count++; + + while (stream_count < stream_info.size() && + stream_info[stream_count].offset == offset + len) { + len += stream_info[stream_count].length; stream_count++; - - while (stream_count < stream_info.size() && - stream_info[stream_count].offset == offset + len) { - len += stream_info[stream_count].length; - stream_count++; - } - if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->is_device_read_preferred(len)) { - read_tasks.push_back( - std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->device_read_async(offset, len, d_dst, _stream), - len)); - - } else { - auto const buffer = - _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read( - offset, len); - CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); - CUDF_CUDA_TRY( - cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); - _stream.synchronize(); - } } - - stripe_idx++; + if (_metadata.per_file_metadata[stripe.source_idx].source->is_device_read_preferred(len)) { + read_tasks.push_back( + std::pair(_metadata.per_file_metadata[stripe.source_idx].source->device_read_async( + offset, len, d_dst, _stream), + len)); + + } else { + auto const buffer = + _metadata.per_file_metadata[stripe.source_idx].source->host_read(offset, len); + CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); + CUDF_CUDA_TRY( + cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); + _stream.synchronize(); + } } + + stripe_idx++; } + for (auto& task : read_tasks) { CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); } diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 83954a7dd1b..f6728f22688 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -110,7 +110,7 @@ struct file_intermediate_data { int64_t 
rows_to_skip; size_type rows_to_read; - std::vector selected_stripes; + std::vector selected_stripes; }; } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 44b3a138c69..e8f39b2d870 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -751,7 +751,7 @@ void reader::impl::create_pass_data() // Logically view streams as columns std::vector stream_info; - stream_info.reserve(selected_stripes.size() * selected_stripes.front().stripe_info.size()); + // stream_info.reserve(selected_stripes.size() * selected_stripes.front().stripe_info.size()); auto& col_meta = *_col_meta; for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { @@ -803,10 +803,9 @@ void reader::impl::prepare_data(uint64_t skip_rows, }); }); - return has_timestamp_column - ? cudf::detail::make_timezone_transition_table( - {}, selected_stripes[0].stripe_info[0].second->writerTimezone, _stream) - : std::make_unique(); + return has_timestamp_column ? cudf::detail::make_timezone_transition_table( + {}, selected_stripes[0].stripe_footer->writerTimezone, _stream) + : std::make_unique(); }(); auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; @@ -850,14 +849,8 @@ void reader::impl::prepare_data(uint64_t skip_rows, } // Get the total number of stripes across all input files. - std::size_t total_num_stripes = - std::accumulate(selected_stripes.begin(), - selected_stripes.end(), - 0, - [](std::size_t sum, auto& stripe_source_mapping) { - return sum + stripe_source_mapping.stripe_info.size(); - }); - auto const num_columns = columns_level.size(); + std::size_t total_num_stripes = selected_stripes.size(); + auto const num_columns = columns_level.size(); cudf::detail::hostdevice_2dvector chunks( total_num_stripes, num_columns, _stream); memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); @@ -896,89 +889,85 @@ void reader::impl::prepare_data(uint64_t skip_rows, std::size_t stream_idx = 0; // std::vector, std::size_t>> read_tasks; - for (auto const& stripe_source_mapping : selected_stripes) { - // Iterate through the source files selected stripes - for (auto const& stripe : stripe_source_mapping.stripe_info) { - auto const stripe_info = stripe.first; - auto const stripe_footer = stripe.second; - - auto const total_data_size = - gather_stream_info_and_update_chunks(stripe_idx, - level, - stripe_info, - stripe_footer, - col_meta.orc_col_map[level], - _metadata.get_types(), - use_index, - level == 0, - &num_dict_entries, - &stream_idx, - chunks); - - auto const is_stripe_data_empty = total_data_size == 0; - CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, - "Invalid index rowgroup stream data"); - - auto dst_base = static_cast(stripe_data[stripe_idx].data()); - - auto const num_rows_per_stripe = stripe_info->numberOfRows; - auto const rowgroup_id = num_rowgroups; - auto stripe_num_rowgroups = 0; - if (use_index) { - stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / - _metadata.get_row_index_stride(); - } - // Update chunks to reference streams pointers - for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { - auto& chunk = chunks[stripe_idx][col_idx]; - // start row, number of rows in a each stripe and total number of rows - // may change in lower levels of nesting - chunk.start_row = (level == 0) - ? 
stripe_start_row - : col_meta.child_start_row[stripe_idx * num_columns + col_idx]; - chunk.num_rows = - (level == 0) ? stripe_info->numberOfRows - : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; - chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx]; - chunk.parent_validity_info = - (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx]; - chunk.parent_null_count_prefix_sums = - (level == 0) - ? nullptr - : null_count_prefix_sums[level - 1][col_meta.parent_column_index[col_idx]].data(); - chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind; - chunk.type_kind = _metadata.per_file_metadata[stripe_source_mapping.source_idx] - .ff.types[columns_level[col_idx].id] - .kind; - // num_child_rows for a struct column will be same, for other nested types it will be - // calculated. - chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; - chunk.dtype_id = column_types[col_idx].id(); - chunk.decimal_scale = _metadata.per_file_metadata[stripe_source_mapping.source_idx] - .ff.types[columns_level[col_idx].id] - .scale.value_or(0); - - chunk.rowgroup_id = rowgroup_id; - chunk.dtype_len = (column_types[col_idx].id() == type_id::STRING) - ? sizeof(string_index_pair) - : ((column_types[col_idx].id() == type_id::LIST) or - (column_types[col_idx].id() == type_id::STRUCT)) - ? sizeof(size_type) - : cudf::size_of(column_types[col_idx]); - chunk.num_rowgroups = stripe_num_rowgroups; - if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _timestamp_type.id(); } - if (not is_stripe_data_empty) { - for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { - chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; - } + for (auto const& stripe : selected_stripes) { + auto const stripe_info = stripe.stripe_info; + auto const stripe_footer = stripe.stripe_footer; + + auto const total_data_size = gather_stream_info_and_update_chunks(stripe_idx, + level, + stripe_info, + stripe_footer, + col_meta.orc_col_map[level], + _metadata.get_types(), + use_index, + level == 0, + &num_dict_entries, + &stream_idx, + chunks); + + auto const is_stripe_data_empty = total_data_size == 0; + CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, + "Invalid index rowgroup stream data"); + + auto dst_base = static_cast(stripe_data[stripe_idx].data()); + + auto const num_rows_per_stripe = stripe_info->numberOfRows; + auto const rowgroup_id = num_rowgroups; + auto stripe_num_rowgroups = 0; + if (use_index) { + stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / + _metadata.get_row_index_stride(); + } + // Update chunks to reference streams pointers + for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { + auto& chunk = chunks[stripe_idx][col_idx]; + // start row, number of rows in a each stripe and total number of rows + // may change in lower levels of nesting + chunk.start_row = (level == 0) + ? stripe_start_row + : col_meta.child_start_row[stripe_idx * num_columns + col_idx]; + chunk.num_rows = (level == 0) + ? stripe_info->numberOfRows + : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; + chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx]; + chunk.parent_validity_info = + (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx]; + chunk.parent_null_count_prefix_sums = + (level == 0) + ? 
nullptr + : null_count_prefix_sums[level - 1][col_meta.parent_column_index[col_idx]].data(); + chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind; + chunk.type_kind = + _metadata.per_file_metadata[stripe.source_idx].ff.types[columns_level[col_idx].id].kind; + // num_child_rows for a struct column will be same, for other nested types it will be + // calculated. + chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; + chunk.dtype_id = column_types[col_idx].id(); + chunk.decimal_scale = _metadata.per_file_metadata[stripe.source_idx] + .ff.types[columns_level[col_idx].id] + .scale.value_or(0); + + chunk.rowgroup_id = rowgroup_id; + chunk.dtype_len = (column_types[col_idx].id() == type_id::STRING) + ? sizeof(string_index_pair) + : ((column_types[col_idx].id() == type_id::LIST) or + (column_types[col_idx].id() == type_id::STRUCT)) + ? sizeof(size_type) + : cudf::size_of(column_types[col_idx]); + chunk.num_rowgroups = stripe_num_rowgroups; + if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _timestamp_type.id(); } + if (not is_stripe_data_empty) { + for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { + chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; } } - stripe_start_row += num_rows_per_stripe; - num_rowgroups += stripe_num_rowgroups; - - stripe_idx++; } + stripe_start_row += num_rows_per_stripe; + num_rowgroups += stripe_num_rowgroups; + + stripe_idx++; } + // for (auto& task : read_tasks) { // CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); // } From 90ac38065138dca73dfe9fceecf39a8bfd2a6387 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 27 Jan 2024 20:53:09 -0800 Subject: [PATCH 029/321] Store data chunk descriptors Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.hpp | 2 +- cpp/src/io/orc/reader_impl_preprocess.cu | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index f6728f22688..f4f23a9382a 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -105,7 +105,7 @@ struct file_intermediate_data { std::vector> lvl_stripe_data; std::vector>> null_count_prefix_sums; - + std::vector> lvl_data_chunks; std::vector> lvl_stream_info; int64_t rows_to_skip; diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index e8f39b2d870..62f5c6be049 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -810,7 +810,9 @@ void reader::impl::prepare_data(uint64_t skip_rows, auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; auto& null_count_prefix_sums = _file_itm_data->null_count_prefix_sums; + auto& lvl_chunks = _file_itm_data->lvl_data_chunks; lvl_stripe_data.resize(_selected_columns.num_levels()); + lvl_chunks.resize(_selected_columns.num_levels()); _out_buffers.resize(_selected_columns.num_levels()); @@ -851,8 +853,9 @@ void reader::impl::prepare_data(uint64_t skip_rows, // Get the total number of stripes across all input files. 
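// NOTE (sketch only, not part of this patch): the usual round trip for the
// hostdevice_2dvector used just below, with the same calls this file relies on
// (chunks is indexed as chunks[stripe_idx][col_idx]):
//
//   cudf::detail::hostdevice_2dvector<gpu::ColumnDesc> chunks(num_stripes, num_cols, stream);
//   memset(chunks.base_host_ptr(), 0, chunks.size_bytes());  // zero-init on the host
//   // ... fill chunks[stripe][col] on the host ...
//   chunks.host_to_device_async(stream);                     // then mirror to the device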
std::size_t total_num_stripes = selected_stripes.size(); auto const num_columns = columns_level.size(); - cudf::detail::hostdevice_2dvector chunks( - total_num_stripes, num_columns, _stream); + auto& chunks = lvl_chunks[level]; + chunks = + cudf::detail::hostdevice_2dvector(total_num_stripes, num_columns, _stream); memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); const bool use_index = From c8eeaccb29bc6f1abe65f8c1307a467f35d1b992 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 27 Jan 2024 21:04:10 -0800 Subject: [PATCH 030/321] Create read_info vector Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 7 ++++++- cpp/src/io/orc/reader_impl_chunking.hpp | 11 +++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index e58e804d449..249a24fdd3c 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -123,7 +123,9 @@ void reader::impl::query_stripe_compression_info() if (rows_to_read == 0 || selected_stripes.empty()) { return; } auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; + auto& lvl_read_info = _file_itm_data->lvl_read_info; lvl_stripe_data.resize(_selected_columns.num_levels()); + lvl_read_info.resize(_selected_columns.num_levels()); // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. std::unordered_map @@ -150,7 +152,9 @@ void reader::impl::query_stripe_compression_info() for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto& stream_info = _file_itm_data->lvl_stream_info[level]; auto const num_columns = _selected_columns.levels[level].size(); - stream_info.reserve(selected_stripes.size() * num_columns); + auto& read_info = lvl_read_info[level]; + stream_info.reserve(selected_stripes.size() * num_columns); // final size is unknown + read_info.reserve(selected_stripes.size() * num_columns); // final size is unknown // Get the total number of stripes across all input files. 
std::size_t total_num_stripes = selected_stripes.size(); @@ -197,6 +201,7 @@ void reader::impl::query_stripe_compression_info() len += stream_info[stream_count].length; stream_count++; } + read_info.emplace_back(offset, len, d_dst); if (_metadata.per_file_metadata[stripe.source_idx].source->is_device_read_preferred(len)) { read_tasks.push_back( std::pair(_metadata.per_file_metadata[stripe.source_idx].source->device_read_async( diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index f4f23a9382a..f3e0b421843 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -108,6 +108,17 @@ struct file_intermediate_data { std::vector> lvl_data_chunks; std::vector> lvl_stream_info; + struct read_info { + read_info(uint64_t offset_, std::size_t length_, uint8_t* dst_pos_) + : offset(offset_), length(length_), dst_pos(dst_pos_) + { + } + uint64_t offset; + std::size_t length; + uint8_t* dst_pos; + }; + std::vector> lvl_read_info; + int64_t rows_to_skip; size_type rows_to_read; std::vector selected_stripes; From 9a8949d48b5b9bcd9aa4ccc2780c4a03a109140c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 27 Jan 2024 21:30:38 -0800 Subject: [PATCH 031/321] Store stripe sizes Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 22 ++++++++++++---------- cpp/src/io/orc/reader_impl_chunking.hpp | 1 + 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 249a24fdd3c..d866c9ea6d4 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -122,9 +122,11 @@ void reader::impl::query_stripe_compression_info() // TODO : remove? if (rows_to_read == 0 || selected_stripes.empty()) { return; } - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; - auto& lvl_read_info = _file_itm_data->lvl_read_info; + auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; + auto& lvl_stripe_sizes = _file_itm_data->lvl_stripe_sizes; + auto& lvl_read_info = _file_itm_data->lvl_read_info; lvl_stripe_data.resize(_selected_columns.num_levels()); + lvl_stripe_sizes.resize(_selected_columns.num_levels()); lvl_read_info.resize(_selected_columns.num_levels()); // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. @@ -153,19 +155,20 @@ void reader::impl::query_stripe_compression_info() auto& stream_info = _file_itm_data->lvl_stream_info[level]; auto const num_columns = _selected_columns.levels[level].size(); auto& read_info = lvl_read_info[level]; - stream_info.reserve(selected_stripes.size() * num_columns); // final size is unknown - read_info.reserve(selected_stripes.size() * num_columns); // final size is unknown + auto& stripe_sizes = lvl_stripe_sizes[level]; + stream_info.reserve(selected_stripes.size() * num_columns); // final size is unknown + read_info.reserve(selected_stripes.size() * num_columns); // final size is unknown + stripe_sizes.reserve(selected_stripes.size() * num_columns); // final size is unknown // Get the total number of stripes across all input files. 
- std::size_t total_num_stripes = selected_stripes.size(); + std::size_t num_stripes = selected_stripes.size(); // Tracker for eventually deallocating compressed and uncompressed data auto& stripe_data = lvl_stripe_data[level]; - int stripe_idx = 0; - std::vector, std::size_t>> read_tasks; - for (auto const& stripe : selected_stripes) { + for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; stripe_idx++) { + auto const& stripe = selected_stripes[stripe_idx]; auto const stripe_info = stripe.stripe_info; auto const stripe_footer = stripe.stripe_footer; @@ -178,6 +181,7 @@ void reader::impl::query_stripe_compression_info() _metadata.get_types(), level == 0, stream_info); + stripe_sizes.push_back(total_data_size); auto const is_stripe_data_empty = total_data_size == 0; CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, @@ -217,8 +221,6 @@ void reader::impl::query_stripe_compression_info() _stream.synchronize(); } } - - stripe_idx++; } for (auto& task : read_tasks) { diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index f3e0b421843..1b5ee86d107 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -103,6 +103,7 @@ struct file_intermediate_data { compinfo_map; bool compinfo_ready{false}; + std::vector> lvl_stripe_sizes; std::vector> lvl_stripe_data; std::vector>> null_count_prefix_sums; std::vector> lvl_data_chunks; From 06ee057f0088d36feaaf997b356bdee658ba1c73 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 27 Jan 2024 22:52:16 -0800 Subject: [PATCH 032/321] Read separately from parse stripe sizes Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 130 +++++++++++++++--------- cpp/src/io/orc/reader_impl_chunking.hpp | 22 +++- 2 files changed, 99 insertions(+), 53 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index d866c9ea6d4..fabe00dadf9 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -124,10 +124,10 @@ void reader::impl::query_stripe_compression_info() auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; auto& lvl_stripe_sizes = _file_itm_data->lvl_stripe_sizes; - auto& lvl_read_info = _file_itm_data->lvl_read_info; lvl_stripe_data.resize(_selected_columns.num_levels()); lvl_stripe_sizes.resize(_selected_columns.num_levels()); - lvl_read_info.resize(_selected_columns.num_levels()); + + auto& read_info = _file_itm_data->read_info; // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. std::unordered_map @@ -151,51 +151,46 @@ void reader::impl::query_stripe_compression_info() } } + // Get the total number of stripes across all input files. + std::size_t num_stripes = selected_stripes.size(); + + // Compute input size for each stripe. for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + lvl_stripe_data[level].resize(num_stripes); + auto& stream_info = _file_itm_data->lvl_stream_info[level]; auto const num_columns = _selected_columns.levels[level].size(); - auto& read_info = lvl_read_info[level]; auto& stripe_sizes = lvl_stripe_sizes[level]; - stream_info.reserve(selected_stripes.size() * num_columns); // final size is unknown - read_info.reserve(selected_stripes.size() * num_columns); // final size is unknown - stripe_sizes.reserve(selected_stripes.size() * num_columns); // final size is unknown - - // Get the total number of stripes across all input files. 
- std::size_t num_stripes = selected_stripes.size(); + stream_info.reserve(selected_stripes.size() * num_columns); // final size is unknown - // Tracker for eventually deallocating compressed and uncompressed data - auto& stripe_data = lvl_stripe_data[level]; + stripe_sizes.resize(selected_stripes.size()); + if (read_info.capacity() < selected_stripes.size()) { + read_info.reserve(selected_stripes.size() * num_columns); // final size is unknown + } - std::vector, std::size_t>> read_tasks; - for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; stripe_idx++) { + for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { auto const& stripe = selected_stripes[stripe_idx]; auto const stripe_info = stripe.stripe_info; auto const stripe_footer = stripe.stripe_footer; - auto stream_count = stream_info.size(); - auto const total_data_size = gather_stream_info(stripe_idx, - level, - stripe_info, - stripe_footer, - col_meta.orc_col_map[level], - _metadata.get_types(), - level == 0, - stream_info); - stripe_sizes.push_back(total_data_size); - - auto const is_stripe_data_empty = total_data_size == 0; + auto stream_count = stream_info.size(); + auto const stripe_size = gather_stream_info(stripe_idx, + level, + stripe_info, + stripe_footer, + col_meta.orc_col_map[level], + _metadata.get_types(), + level == 0, + stream_info); + stripe_sizes[stripe_idx] = stripe_size; + + auto const is_stripe_data_empty = stripe_size == 0; CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); - // Buffer needs to be padded. - // Required by `copy_uncompressed_kernel`. - stripe_data.emplace_back(cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), - _stream); - auto dst_base = static_cast(stripe_data.back().data()); - // Coalesce consecutive streams into one read while (not is_stripe_data_empty and stream_count < stream_info.size()) { - auto const d_dst = dst_base + stream_info[stream_count].dst_pos; + auto const d_dst = stream_info[stream_count].dst_pos; auto const offset = stream_info[stream_count].offset; auto len = stream_info[stream_count].length; stream_count++; @@ -205,28 +200,65 @@ void reader::impl::query_stripe_compression_info() len += stream_info[stream_count].length; stream_count++; } - read_info.emplace_back(offset, len, d_dst); - if (_metadata.per_file_metadata[stripe.source_idx].source->is_device_read_preferred(len)) { - read_tasks.push_back( - std::pair(_metadata.per_file_metadata[stripe.source_idx].source->device_read_async( - offset, len, d_dst, _stream), - len)); - - } else { - auto const buffer = - _metadata.per_file_metadata[stripe.source_idx].source->host_read(offset, len); - CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); - CUDF_CUDA_TRY( - cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); - _stream.synchronize(); - } + read_info.emplace_back(offset, len, d_dst, stripe.source_idx, stripe_idx, level); } } + } - for (auto& task : read_tasks) { - CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); + // Prepare the buffer to read raw data onto. 
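// NOTE (summary, not part of this patch): the restructuring here yields a three-phase
// I/O plan, using the surrounding names:
//   1. sizing (above): fill stripe_sizes and the coalesced read_info list;
//   2. allocation (below): one padded rmm::device_buffer per stripe and level;
//   3. reading (further below): issue every read_info entry into its stripe buffer.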
+ for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + auto& stripe_data = lvl_stripe_data[level]; + auto& stripe_sizes = lvl_stripe_sizes[level]; + for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { + stripe_data[stripe_idx] = rmm::device_buffer( + cudf::util::round_up_safe(stripe_sizes[stripe_idx], BUFFER_PADDING_MULTIPLE), _stream); } + } + + std::vector, std::size_t>> read_tasks; + // Should not read all, but read stripe by stripe. + for (auto const& read : read_info) { + auto& stripe_data = lvl_stripe_data[read.level]; + auto dst_base = static_cast(stripe_data[read.stripe_idx].data()); + if (_metadata.per_file_metadata[read.source_idx].source->is_device_read_preferred( + read.length)) { + read_tasks.push_back( + std::pair(_metadata.per_file_metadata[read.source_idx].source->device_read_async( + read.offset, read.length, dst_base + read.dst_pos, _stream), + read.length)); + + } else { + read_tasks.push_back( + std::pair(std::async(std::launch::async, + [&, read = read, dst_base = dst_base] { + auto const buffer = + _metadata.per_file_metadata[read.source_idx].source->host_read( + read.offset, read.length); + CUDF_EXPECTS(buffer->size() == read.length, + "Unexpected discrepancy in bytes read."); + CUDF_CUDA_TRY(cudaMemcpyAsync(dst_base + read.dst_pos, + buffer->data(), + read.length, + cudaMemcpyDefault, + _stream.value())); + _stream.synchronize(); + return read.length; + }), + read.length)); + } + } + for (auto& task : read_tasks) { + CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); + } + + // Parse the decompressed sizes for each stripe. + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + auto& stream_info = _file_itm_data->lvl_stream_info[level]; + auto const num_columns = _selected_columns.levels[level].size(); + + // Tracker for eventually deallocating compressed and uncompressed data + auto& stripe_data = lvl_stripe_data[level]; if (stripe_data.empty()) { continue; } // Setup row group descriptors if using indexes diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 1b5ee86d107..9797d113e9a 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -109,16 +109,30 @@ struct file_intermediate_data { std::vector> lvl_data_chunks; std::vector> lvl_stream_info; + // Each read correspond to one or more consecutive stream combined. 
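// NOTE (hypothetical example, not part of this patch): how the coalescing described
// above collapses back-to-back streams into a single entry of the struct below:
//
//   stream A: offset 1000, length 200
//   stream B: offset 1200, length 300   // 1200 == 1000 + 200, so it extends the read
//   stream C: offset 1500, length 100   // still contiguous
//   => read_info{offset = 1000, length = 600, dst_pos = 0,
//                source_idx = 0, stripe_idx = 0, level = 0}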
struct read_info { - read_info(uint64_t offset_, std::size_t length_, uint8_t* dst_pos_) - : offset(offset_), length(length_), dst_pos(dst_pos_) + read_info(uint64_t offset_, + std::size_t length_, + std::size_t dst_pos_, + std::size_t source_idx_, + std::size_t stripe_idx_, + std::size_t level_) + : offset(offset_), + length(length_), + dst_pos(dst_pos_), + source_idx(source_idx_), + stripe_idx(stripe_idx_), + level(level_) { } uint64_t offset; std::size_t length; - uint8_t* dst_pos; + std::size_t dst_pos; + std::size_t source_idx; + std::size_t stripe_idx; + std::size_t level; }; - std::vector> lvl_read_info; + std::vector read_info; int64_t rows_to_skip; size_type rows_to_read; From 746a2ef05fae3e0db57d641451195abed1922417 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 27 Jan 2024 23:10:31 -0800 Subject: [PATCH 033/321] Fix copyright year Signed-off-by: Nghia Truong --- cpp/src/io/orc/orc.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index d17291d4acb..a3fdef78a37 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 3f2a36efc2087fae99a686594b4eef1f2ef6bc21 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 27 Jan 2024 23:11:05 -0800 Subject: [PATCH 034/321] Compute input size stripe by stripe Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index fabe00dadf9..40b4ee7fc63 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -136,6 +136,9 @@ void reader::impl::query_stripe_compression_info() // Logically view streams as columns _file_itm_data->lvl_stream_info.resize(_selected_columns.num_levels()); + // Get the total number of stripes across all input files. + std::size_t num_stripes = selected_stripes.size(); + // Iterates through levels of nested columns, child column will be one level down // compared to parent column. auto& col_meta = *_col_meta; @@ -149,13 +152,7 @@ void reader::impl::query_stripe_compression_info() // Map each ORC column to its column col_meta.orc_col_map[level][col.id] = col_id++; } - } - - // Get the total number of stripes across all input files. - std::size_t num_stripes = selected_stripes.size(); - // Compute input size for each stripe. - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { lvl_stripe_data[level].resize(num_stripes); auto& stream_info = _file_itm_data->lvl_stream_info[level]; @@ -167,11 +164,17 @@ void reader::impl::query_stripe_compression_info() if (read_info.capacity() < selected_stripes.size()) { read_info.reserve(selected_stripes.size() * num_columns); // final size is unknown } + } - for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { - auto const& stripe = selected_stripes[stripe_idx]; - auto const stripe_info = stripe.stripe_info; - auto const stripe_footer = stripe.stripe_footer; + // Compute input size for each stripe. 
+ for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { + auto const& stripe = selected_stripes[stripe_idx]; + auto const stripe_info = stripe.stripe_info; + auto const stripe_footer = stripe.stripe_footer; + + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + auto& stream_info = _file_itm_data->lvl_stream_info[level]; + auto& stripe_sizes = lvl_stripe_sizes[level]; auto stream_count = stream_info.size(); auto const stripe_size = gather_stream_info(stripe_idx, @@ -217,6 +220,7 @@ void reader::impl::query_stripe_compression_info() std::vector, std::size_t>> read_tasks; // Should not read all, but read stripe by stripe. + // read_info should be limited by stripe. for (auto const& read : read_info) { auto& stripe_data = lvl_stripe_data[read.level]; auto dst_base = static_cast(stripe_data[read.stripe_idx].data()); From 5c2c5ea22b5859a18fbc076b1df9dc5c9885a3d0 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 27 Jan 2024 23:19:27 -0800 Subject: [PATCH 035/321] Read without async Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 40b4ee7fc63..0a41c735d33 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -233,6 +233,15 @@ void reader::impl::query_stripe_compression_info() read.length)); } else { + auto const buffer = + _metadata.per_file_metadata[read.source_idx].source->host_read(read.offset, read.length); + CUDF_EXPECTS(buffer->size() == read.length, "Unexpected discrepancy in bytes read."); + CUDF_CUDA_TRY(cudaMemcpyAsync( + dst_base + read.dst_pos, buffer->data(), read.length, cudaMemcpyDefault, _stream.value())); + _stream.synchronize(); + +#if 0 + // This in theory should be faster, but in practice it's slower. Why? 
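// NOTE (one plausible, unverified answer to the question above): every task below ends
// with _stream.synchronize() and all copies target the same CUDA stream, so the spawned
// threads largely serialize on that stream; host_read also returns pageable memory, for
// which cudaMemcpyAsync behaves synchronously. A sketch that keeps the reads but defers
// to a single synchronize once all copies are enqueued:
//
//   for (auto const& read : read_info) {
//     // host_read(...) + cudaMemcpyAsync(...) as below, without the per-read sync
//   }
//   _stream.synchronize();  // one synchronization for the whole batch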
read_tasks.push_back( std::pair(std::async(std::launch::async, [&, read = read, dst_base = dst_base] { @@ -250,6 +259,7 @@ void reader::impl::query_stripe_compression_info() return read.length; }), read.length)); +#endif } } for (auto& task : read_tasks) { From 0a1db0924a6c705d18dff57d7c780b8d2869cb8c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 27 Jan 2024 23:45:34 -0800 Subject: [PATCH 036/321] Find stripe split Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 83 ++++++++++++++++++++++++- cpp/src/io/orc/reader_impl_chunking.hpp | 7 +++ 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 0a41c735d33..a0e79959051 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -107,6 +107,63 @@ std::size_t gather_stream_info(std::size_t stripe_index, return dst_offset; } +struct cumulative_size { + std::size_t count; + std::size_t size_bytes; +}; + +struct cumulative_size_sum { + __device__ cumulative_size operator()(cumulative_size const& a, cumulative_size const& b) const + { + return cumulative_size{a.count + b.count, a.size_bytes + b.size_bytes}; + } +}; + +#if 0 +std::vector find_splits(host_span sizes, + size_type num_rows, + size_t size_limit) +{ + std::vector splits; + + uint32_t cur_count = 0; + int64_t cur_pos = 0; + size_t cur_cumulative_size = 0; + auto const start = thrust::make_transform_iterator( + sizes.begin(), [&](auto const& size) { return size.size_bytes - cur_cumulative_size; }); + auto const end = start + static_cast(sizes.size()); + while (cur_count < static_cast(num_rows)) { + int64_t split_pos = + thrust::distance(start, thrust::lower_bound(thrust::seq, start + cur_pos, end, size_limit)); + + // If we're past the end, or if the returned bucket is bigger than the chunk_read_limit, move + // back one. + if (static_cast(split_pos) >= sizes.size() || + (sizes[split_pos].size_bytes - cur_cumulative_size > size_limit)) { + split_pos--; + } + + // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in + // a loop because all of the cumulative sizes for all the pages are sorted into one big list. + // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in + // the list twice. so we have to iterate until we skip past all of them. The idea is that we + // either do this, or we have to call unique() on the input first. + while (split_pos < (static_cast(sizes.size()) - 1) && + (split_pos < 0 || sizes[split_pos].count == cur_count)) { + split_pos++; + } + + auto const start_row = cur_count; + cur_count = sizes[split_pos].count; + splits.emplace_back(chunk{start_row, static_cast(cur_count - start_row)}); + cur_pos = split_pos; + cur_cumulative_size = sizes[split_pos].size_bytes; + } + + return splits; +} +#endif + } // namespace void reader::impl::query_stripe_compression_info() @@ -114,7 +171,6 @@ void reader::impl::query_stripe_compression_info() if (_file_itm_data->compinfo_ready) { return; } if (_selected_columns.num_levels() == 0) { return; } - auto const rows_to_skip = _file_itm_data->rows_to_skip; auto const rows_to_read = _file_itm_data->rows_to_read; auto const& selected_stripes = _file_itm_data->selected_stripes; @@ -166,12 +222,15 @@ void reader::impl::query_stripe_compression_info() } } + cudf::detail::hostdevice_vector total_stripe_sizes(num_stripes, _stream); + // Compute input size for each stripe. 
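// NOTE (worked toy example, not part of this patch): running find_splits() as defined
// above on four stripes of {10, 20, 30, 40} bytes with size_limit = 50. After the
// inclusive scan further down, the cumulative sizes are:
//
//   count:      1   2   3    4
//   size_bytes: 10  30  60  100
//
// lower_bound first lands on 60 (> 50 relative to base 0), so the split steps back one
// and emits chunk{0, 2}; relative to the new base of 30 it then emits chunk{2, 1}
// (30 bytes) and finally chunk{3, 1} (40 bytes), each kept under the limit.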
for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { auto const& stripe = selected_stripes[stripe_idx]; auto const stripe_info = stripe.stripe_info; auto const stripe_footer = stripe.stripe_footer; + std::size_t total_stripe_size{0}; for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto& stream_info = _file_itm_data->lvl_stream_info[level]; auto& stripe_sizes = lvl_stripe_sizes[level]; @@ -186,6 +245,7 @@ void reader::impl::query_stripe_compression_info() level == 0, stream_info); stripe_sizes[stripe_idx] = stripe_size; + total_stripe_size += stripe_size; auto const is_stripe_data_empty = stripe_size == 0; CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, @@ -206,8 +266,26 @@ void reader::impl::query_stripe_compression_info() read_info.emplace_back(offset, len, d_dst, stripe.source_idx, stripe_idx, level); } } + total_stripe_sizes[stripe_idx] = {1, total_stripe_size}; } + // Compute the prefix sum of stripe data sizes. + total_stripe_sizes.host_to_device_async(_stream); + thrust::inclusive_scan(rmm::exec_policy(_stream), + total_stripe_sizes.d_begin(), + total_stripe_sizes.d_end(), + total_stripe_sizes.d_begin(), + cumulative_size_sum{}); + + total_stripe_sizes.device_to_host_sync(_stream); + + // fix this: + // _file_itm_data->stripe_chunks = + // find_splits(total_stripe_sizes, _file_itm_data->rows_to_read, /*chunk_size_limit*/ 0); + + // std::cout << " total rows: " << _file_itm_data.rows_to_read << std::endl; + // print_cumulative_row_info(stripe_size_bytes, " ", _chunk_read_info.chunks); + // Prepare the buffer to read raw data onto. for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto& stripe_data = lvl_stripe_data[level]; @@ -221,6 +299,9 @@ void reader::impl::query_stripe_compression_info() std::vector, std::size_t>> read_tasks; // Should not read all, but read stripe by stripe. // read_info should be limited by stripe. + // Read level-by-level. + // TODO: Test with read and parse/decode column by column. + // This is future work. for (auto const& read : read_info) { auto& stripe_data = lvl_stripe_data[read.level]; auto dst_base = static_cast(stripe_data[read.stripe_idx].data()); diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 9797d113e9a..bacdbc7933f 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -95,6 +95,11 @@ struct stream_id_hash { } }; +struct chunk { + int64_t start_idx; + int64_t count; +}; + /** * @brief Struct to store file-level data that remains constant for all chunks being read. */ @@ -109,6 +114,8 @@ struct file_intermediate_data { std::vector> lvl_data_chunks; std::vector> lvl_stream_info; + std::vector stripe_chunks; + // Each read correspond to one or more consecutive stream combined. 
struct read_info { read_info(uint64_t offset_, From 7c9867440430306273d37627fdcd09d3066359ec Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 28 Jan 2024 08:15:11 -0800 Subject: [PATCH 037/321] Fix bug Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index a0e79959051..01d0e12395b 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -119,9 +119,9 @@ struct cumulative_size_sum { } }; -#if 0 +#if 1 std::vector find_splits(host_span sizes, - size_type num_rows, + size_type total_count, size_t size_limit) { std::vector splits; @@ -132,7 +132,7 @@ std::vector find_splits(host_span sizes, auto const start = thrust::make_transform_iterator( sizes.begin(), [&](auto const& size) { return size.size_bytes - cur_cumulative_size; }); auto const end = start + static_cast(sizes.size()); - while (cur_count < static_cast(num_rows)) { + while (cur_count < static_cast(total_count)) { int64_t split_pos = thrust::distance(start, thrust::lower_bound(thrust::seq, start + cur_pos, end, size_limit)); @@ -153,9 +153,9 @@ std::vector find_splits(host_span sizes, split_pos++; } - auto const start_row = cur_count; + auto const start_idx = cur_count; cur_count = sizes[split_pos].count; - splits.emplace_back(chunk{start_row, static_cast(cur_count - start_row)}); + splits.emplace_back(chunk{start_idx, static_cast(cur_count - start_idx)}); cur_pos = split_pos; cur_cumulative_size = sizes[split_pos].size_bytes; } @@ -279,9 +279,17 @@ void reader::impl::query_stripe_compression_info() total_stripe_sizes.device_to_host_sync(_stream); - // fix this: - // _file_itm_data->stripe_chunks = - // find_splits(total_stripe_sizes, _file_itm_data->rows_to_read, /*chunk_size_limit*/ 0); + _file_itm_data->stripe_chunks = find_splits( + total_stripe_sizes, + total_stripe_sizes.size(), + /*chunk_size_limit/2*/ total_stripe_sizes[total_stripe_sizes.size() - 1].size_bytes / 3); + + auto& splits = _file_itm_data->stripe_chunks; + printf("------------\nSplits (/%d): \n", (int)num_stripes); + for (size_t idx = 0; idx < splits.size(); idx++) { + printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count); + } + fflush(stdout); // std::cout << " total rows: " << _file_itm_data.rows_to_read << std::endl; // print_cumulative_row_info(stripe_size_bytes, " ", _chunk_read_info.chunks); From 3b9aabb0564a5564485056326036399ce2204343 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 28 Jan 2024 08:20:24 -0800 Subject: [PATCH 038/321] Rename variables Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 6 +++--- cpp/src/io/orc/reader_impl_chunking.hpp | 12 ++++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 01d0e12395b..4cd807fff2e 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -183,7 +183,7 @@ void reader::impl::query_stripe_compression_info() lvl_stripe_data.resize(_selected_columns.num_levels()); lvl_stripe_sizes.resize(_selected_columns.num_levels()); - auto& read_info = _file_itm_data->read_info; + auto& read_info = _file_itm_data->stream_read_info; // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. 
std::unordered_map @@ -279,12 +279,12 @@ void reader::impl::query_stripe_compression_info() total_stripe_sizes.device_to_host_sync(_stream); - _file_itm_data->stripe_chunks = find_splits( + _file_itm_data->load_stripe_chunks = find_splits( total_stripe_sizes, total_stripe_sizes.size(), /*chunk_size_limit/2*/ total_stripe_sizes[total_stripe_sizes.size() - 1].size_bytes / 3); - auto& splits = _file_itm_data->stripe_chunks; + auto& splits = _file_itm_data->load_stripe_chunks; printf("------------\nSplits (/%d): \n", (int)num_stripes); for (size_t idx = 0; idx < splits.size(); idx++) { printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count); diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index bacdbc7933f..fae4e2c69ca 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -114,11 +114,15 @@ struct file_intermediate_data { std::vector> lvl_data_chunks; std::vector> lvl_stream_info; - std::vector stripe_chunks; + std::vector load_stripe_chunks; + std::size_t curr_load_stripe_chunk{0}; + + std::vector decode_stripe_chunks; + std::size_t curr_decode_stripe_chunk{0}; // Each read correspond to one or more consecutive stream combined. - struct read_info { - read_info(uint64_t offset_, + struct stream_read_info { + stream_read_info(uint64_t offset_, std::size_t length_, std::size_t dst_pos_, std::size_t source_idx_, @@ -139,7 +143,7 @@ struct file_intermediate_data { std::size_t stripe_idx; std::size_t level; }; - std::vector read_info; + std::vector stream_read_info; int64_t rows_to_skip; size_type rows_to_read; From ea1c94f3e46f955f7d09434dfcb6915033972821 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 28 Jan 2024 08:30:15 -0800 Subject: [PATCH 039/321] Add function interface Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.hpp | 10 ++++---- cpp/src/io/orc/reader_impl_preprocess.cu | 29 ++++++------------------ 2 files changed, 13 insertions(+), 26 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 8130ac51f6d..8cf13a13b4d 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -77,10 +77,12 @@ class reader::impl { std::optional const& num_rows_opt, std::vector> const& stripes); - /** - * - */ - void create_pass_data(); + // Do once for the entire file. + void global_preprocess(); + + void pass_preprocess(); + + void subpass_preprocess(); /** * @brief Compute stripe sizes. 
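For reference, the stripe-splitting logic that the "Fix bug" patch above iterates on boils down to a greedy pass over inclusive prefix sums. The following is a minimal host-side sketch, illustrative only: find_splits_cpu is a hypothetical name (the real version runs thrust::lower_bound on the device), while cumulative_size and chunk mirror the structs in reader_impl_chunking.

#include <cstddef>
#include <cstdint>
#include <vector>

struct cumulative_size { int64_t count; std::size_t size_bytes; };  // inclusive prefix sums
struct chunk { int64_t start_idx; int64_t count; };

// Cut chunks greedily so each stays within size_limit, always taking at least
// one stripe per chunk (so a single oversized stripe forms its own chunk).
std::vector<chunk> find_splits_cpu(std::vector<cumulative_size> const& sizes,
                                   int64_t total_count,
                                   std::size_t size_limit)
{
  std::vector<chunk> splits;
  int64_t cur_count          = 0;  // stripes consumed so far
  std::size_t cur_cumulative = 0;  // bytes consumed so far
  std::size_t pos            = 0;  // first stripe of the current chunk
  while (cur_count < total_count) {
    auto end = pos;  // take at least one stripe
    while (end + 1 < sizes.size() &&
           sizes[end + 1].size_bytes - cur_cumulative <= size_limit) {
      ++end;
    }
    splits.push_back(chunk{cur_count, sizes[end].count - cur_count});
    cur_count      = sizes[end].count;
    cur_cumulative = sizes[end].size_bytes;
    pos            = end + 1;
  }
  return splits;
}

For example, stripes of 10, 15 and 35 bytes give prefix sums {10, 25, 60}; with size_limit = 30 this produces chunks {0, 2} and {2, 1}.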
diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 62f5c6be049..eb4652d5498 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -742,30 +742,11 @@ void generate_offsets_for_list(host_span buff_data, rmm::cuda_ } // namespace -void reader::impl::create_pass_data() -{ - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; - lvl_stripe_data.resize(_selected_columns.num_levels()); - - auto const& selected_stripes = _file_itm_data->selected_stripes; +void reader::impl::global_preprocess() {} - // Logically view streams as columns - std::vector stream_info; - // stream_info.reserve(selected_stripes.size() * selected_stripes.front().stripe_info.size()); +void reader::impl::pass_preprocess() {} - auto& col_meta = *_col_meta; - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - auto& columns_level = _selected_columns.levels[level]; - // Association between each ORC column and its cudf::column - col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); - - size_type col_id{0}; - for (auto& col : columns_level) { - // Map each ORC column to its column - col_meta.orc_col_map[level][col.id] = col_id++; - } - } -} +void reader::impl::subpass_preprocess() {} void reader::impl::prepare_data(uint64_t skip_rows, std::optional const& num_rows_opt, @@ -779,6 +760,10 @@ void reader::impl::prepare_data(uint64_t skip_rows, // There are no columns in the table if (_selected_columns.num_levels() == 0) { return; } + global_preprocess(); + pass_preprocess(); + subpass_preprocess(); + _file_itm_data = std::make_unique(); // Select only stripes required (aka row groups) From a4776fa25907f24692f60310dd0ef61665fd64d6 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 28 Jan 2024 08:47:12 -0800 Subject: [PATCH 040/321] Add more interface Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.hpp | 4 +- cpp/src/io/orc/reader_impl_chunking.cu | 4 +- cpp/src/io/orc/reader_impl_chunking.hpp | 16 +++++--- cpp/src/io/orc/reader_impl_preprocess.cu | 48 ++++++++++++++++++------ 4 files changed, 51 insertions(+), 21 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 8cf13a13b4d..c439c44870a 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -78,7 +78,9 @@ class reader::impl { std::vector> const& stripes); // Do once for the entire file. 
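 // (Intended division of labor, judging from later patches in this series:
 // global_preprocess selects stripes and plans chunks once per file,
 // pass_preprocess reads one chunk of stripes from the source, and
 // subpass_preprocess parses that chunk's compression info before decoding.)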
- void global_preprocess(); + void global_preprocess(uint64_t skip_rows, + std::optional const& num_rows_opt, + std::vector> const& stripes); void pass_preprocess(); diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 4cd807fff2e..99511f2b320 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -168,7 +168,7 @@ std::vector find_splits(host_span sizes, void reader::impl::query_stripe_compression_info() { - if (_file_itm_data->compinfo_ready) { return; } + // if (_file_itm_data->compinfo_ready) { return; } if (_selected_columns.num_levels() == 0) { return; } auto const rows_to_read = _file_itm_data->rows_to_read; @@ -432,7 +432,7 @@ void reader::impl::query_stripe_compression_info() } // end loop level // lvl_stripe_data.clear(); - _file_itm_data->compinfo_ready = true; + // _file_itm_data->compinfo_ready = true; } } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index fae4e2c69ca..83a61703f20 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -106,7 +106,7 @@ struct chunk { struct file_intermediate_data { std::unordered_map compinfo_map; - bool compinfo_ready{false}; + // bool compinfo_ready{false}; std::vector> lvl_stripe_sizes; std::vector> lvl_stripe_data; @@ -123,11 +123,11 @@ struct file_intermediate_data { // Each read correspond to one or more consecutive stream combined. struct stream_read_info { stream_read_info(uint64_t offset_, - std::size_t length_, - std::size_t dst_pos_, - std::size_t source_idx_, - std::size_t stripe_idx_, - std::size_t level_) + std::size_t length_, + std::size_t dst_pos_, + std::size_t source_idx_, + std::size_t stripe_idx_, + std::size_t level_) : offset(offset_), length(length_), dst_pos(dst_pos_), @@ -148,6 +148,10 @@ struct file_intermediate_data { int64_t rows_to_skip; size_type rows_to_read; std::vector selected_stripes; + + bool global_preprocessed{false}; + bool pass_preprocessed{false}; + bool subpass_preprocessed{false}; }; } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index eb4652d5498..1f092243b74 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -742,11 +742,43 @@ void generate_offsets_for_list(host_span buff_data, rmm::cuda_ } // namespace -void reader::impl::global_preprocess() {} +void reader::impl::global_preprocess(uint64_t skip_rows, + std::optional const& num_rows_opt, + std::vector> const& stripes) +{ + if (_file_itm_data == nullptr) { _file_itm_data = std::make_unique(); } + if (_file_itm_data->global_preprocessed) { return; } -void reader::impl::pass_preprocess() {} + // TODO: move this to end of func. 
+ _file_itm_data->global_preprocessed = true; -void reader::impl::subpass_preprocess() {} + // Select only stripes required (aka row groups) + std::tie( + _file_itm_data->rows_to_skip, _file_itm_data->rows_to_read, _file_itm_data->selected_stripes) = + _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream); + auto const rows_to_skip = _file_itm_data->rows_to_skip; + auto const rows_to_read = _file_itm_data->rows_to_read; + auto const& selected_stripes = _file_itm_data->selected_stripes; + + // If no rows or stripes to read, return empty columns + if (rows_to_read == 0 || selected_stripes.empty()) { return; } + + query_stripe_compression_info(); +} + +void reader::impl::pass_preprocess() +{ + if (_file_itm_data->pass_preprocessed) { return; } + + _file_itm_data->pass_preprocessed = true; +} + +void reader::impl::subpass_preprocess() +{ + if (_file_itm_data->subpass_preprocessed) { return; } + + _file_itm_data->subpass_preprocessed = true; +} void reader::impl::prepare_data(uint64_t skip_rows, std::optional const& num_rows_opt, @@ -760,16 +792,10 @@ void reader::impl::prepare_data(uint64_t skip_rows, // There are no columns in the table if (_selected_columns.num_levels() == 0) { return; } - global_preprocess(); + global_preprocess(skip_rows, num_rows_opt, stripes); pass_preprocess(); subpass_preprocess(); - _file_itm_data = std::make_unique(); - - // Select only stripes required (aka row groups) - std::tie( - _file_itm_data->rows_to_skip, _file_itm_data->rows_to_read, _file_itm_data->selected_stripes) = - _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream); auto const rows_to_skip = _file_itm_data->rows_to_skip; auto const rows_to_read = _file_itm_data->rows_to_read; auto const& selected_stripes = _file_itm_data->selected_stripes; @@ -777,8 +803,6 @@ void reader::impl::prepare_data(uint64_t skip_rows, // If no rows or stripes to read, return empty columns if (rows_to_read == 0 || selected_stripes.empty()) { return; } - query_stripe_compression_info(); - // Set up table for converting timestamp columns from local to UTC time auto const tz_table = [&, &selected_stripes = selected_stripes] { auto const has_timestamp_column = std::any_of( From 7da761e7f3729f29ae15e18a0fde57af3447f588 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 28 Jan 2024 09:20:54 -0800 Subject: [PATCH 041/321] Separate preprocessing into global, pass and subpass steps Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.hpp | 5 --- cpp/src/io/orc/reader_impl_chunking.cu | 56 ++++++++++++++++++++++-- cpp/src/io/orc/reader_impl_preprocess.cu | 38 ---------------- 3 files changed, 52 insertions(+), 47 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index c439c44870a..1b5bef347d0 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -86,11 +86,6 @@ class reader::impl { void subpass_preprocess(); - /** - * @brief Compute stripe sizes. - */ - void query_stripe_compression_info(); - /** * @brief Create the output table metadata from file metadata. 
* diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 99511f2b320..ea763de3d4f 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -166,16 +166,25 @@ std::vector find_splits(host_span sizes, } // namespace -void reader::impl::query_stripe_compression_info() +void reader::impl::global_preprocess(uint64_t skip_rows, + std::optional const& num_rows_opt, + std::vector> const& stripes) { - // if (_file_itm_data->compinfo_ready) { return; } - if (_selected_columns.num_levels() == 0) { return; } + if (_file_itm_data == nullptr) { _file_itm_data = std::make_unique(); } + if (_file_itm_data->global_preprocessed) { return; } + // TODO: move this to end of func. + _file_itm_data->global_preprocessed = true; + + // Select only stripes required (aka row groups) + std::tie( + _file_itm_data->rows_to_skip, _file_itm_data->rows_to_read, _file_itm_data->selected_stripes) = + _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream); + auto const rows_to_skip = _file_itm_data->rows_to_skip; auto const rows_to_read = _file_itm_data->rows_to_read; auto const& selected_stripes = _file_itm_data->selected_stripes; // If no rows or stripes to read, return empty columns - // TODO : remove? if (rows_to_read == 0 || selected_stripes.empty()) { return; } auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; @@ -303,6 +312,32 @@ void reader::impl::query_stripe_compression_info() cudf::util::round_up_safe(stripe_sizes[stripe_idx], BUFFER_PADDING_MULTIPLE), _stream); } } +} + +void reader::impl::pass_preprocess() +{ + if (_file_itm_data->pass_preprocessed) { return; } + _file_itm_data->pass_preprocessed = true; + + auto const rows_to_read = _file_itm_data->rows_to_read; + auto const& selected_stripes = _file_itm_data->selected_stripes; + auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; + auto& lvl_stripe_sizes = _file_itm_data->lvl_stripe_sizes; + auto& read_info = _file_itm_data->stream_read_info; + + std::size_t num_stripes = selected_stripes.size(); + + // TODO: this is a pass + + // Prepare the buffer to read raw data onto. + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + auto& stripe_data = lvl_stripe_data[level]; + auto& stripe_sizes = lvl_stripe_sizes[level]; + for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { + stripe_data[stripe_idx] = rmm::device_buffer( + cudf::util::round_up_safe(stripe_sizes[stripe_idx], BUFFER_PADDING_MULTIPLE), _stream); + } + } std::vector, std::size_t>> read_tasks; // Should not read all, but read stripe by stripe. @@ -354,6 +389,19 @@ void reader::impl::query_stripe_compression_info() for (auto& task : read_tasks) { CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); } +} + +void reader::impl::subpass_preprocess() +{ + if (_file_itm_data->subpass_preprocessed) { return; } + _file_itm_data->subpass_preprocessed = true; + + auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; + + // TODO: This is subpass + // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. + std::unordered_map + stream_compinfo_map; // Parse the decompressed sizes for each stripe. 
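 // (For compressed inputs, gpu::ParseCompressedStripeData scans each stream's
 // block headers to fill in the compressed/uncompressed block counts and the
 // maximum uncompressed size, which are then cached in compinfo_map.)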
for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 1f092243b74..0a99e831f8f 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -742,44 +742,6 @@ void generate_offsets_for_list(host_span buff_data, rmm::cuda_ } // namespace -void reader::impl::global_preprocess(uint64_t skip_rows, - std::optional const& num_rows_opt, - std::vector> const& stripes) -{ - if (_file_itm_data == nullptr) { _file_itm_data = std::make_unique(); } - if (_file_itm_data->global_preprocessed) { return; } - - // TODO: move this to end of func. - _file_itm_data->global_preprocessed = true; - - // Select only stripes required (aka row groups) - std::tie( - _file_itm_data->rows_to_skip, _file_itm_data->rows_to_read, _file_itm_data->selected_stripes) = - _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream); - auto const rows_to_skip = _file_itm_data->rows_to_skip; - auto const rows_to_read = _file_itm_data->rows_to_read; - auto const& selected_stripes = _file_itm_data->selected_stripes; - - // If no rows or stripes to read, return empty columns - if (rows_to_read == 0 || selected_stripes.empty()) { return; } - - query_stripe_compression_info(); -} - -void reader::impl::pass_preprocess() -{ - if (_file_itm_data->pass_preprocessed) { return; } - - _file_itm_data->pass_preprocessed = true; -} - -void reader::impl::subpass_preprocess() -{ - if (_file_itm_data->subpass_preprocessed) { return; } - - _file_itm_data->subpass_preprocessed = true; -} - void reader::impl::prepare_data(uint64_t skip_rows, std::optional const& num_rows_opt, std::vector> const& stripes) From 7abe0cadf9caef9661e038ee7eb8a30bf64ca2f3 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 28 Jan 2024 11:09:07 -0800 Subject: [PATCH 042/321] Fix bug Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 30 +++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index ea763de3d4f..cc0fd46efaa 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -302,28 +302,22 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // std::cout << " total rows: " << _file_itm_data.rows_to_read << std::endl; // print_cumulative_row_info(stripe_size_bytes, " ", _chunk_read_info.chunks); - - // Prepare the buffer to read raw data onto. 
- for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - auto& stripe_data = lvl_stripe_data[level]; - auto& stripe_sizes = lvl_stripe_sizes[level]; - for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { - stripe_data[stripe_idx] = rmm::device_buffer( - cudf::util::round_up_safe(stripe_sizes[stripe_idx], BUFFER_PADDING_MULTIPLE), _stream); - } - } } void reader::impl::pass_preprocess() { + auto const rows_to_read = _file_itm_data->rows_to_read; + auto const& selected_stripes = _file_itm_data->selected_stripes; + + // If no rows or stripes to read, return empty columns + if (rows_to_read == 0 || selected_stripes.empty()) { return; } + if (_file_itm_data->pass_preprocessed) { return; } _file_itm_data->pass_preprocessed = true; - auto const rows_to_read = _file_itm_data->rows_to_read; - auto const& selected_stripes = _file_itm_data->selected_stripes; - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; - auto& lvl_stripe_sizes = _file_itm_data->lvl_stripe_sizes; - auto& read_info = _file_itm_data->stream_read_info; + auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; + auto& lvl_stripe_sizes = _file_itm_data->lvl_stripe_sizes; + auto& read_info = _file_itm_data->stream_read_info; std::size_t num_stripes = selected_stripes.size(); @@ -393,6 +387,12 @@ void reader::impl::pass_preprocess() void reader::impl::subpass_preprocess() { + auto const rows_to_read = _file_itm_data->rows_to_read; + auto const& selected_stripes = _file_itm_data->selected_stripes; + + // If no rows or stripes to read, return empty columns + if (rows_to_read == 0 || selected_stripes.empty()) { return; } + if (_file_itm_data->subpass_preprocessed) { return; } _file_itm_data->subpass_preprocessed = true; From ff8497783ecc02a693fbd63b0534d479378b946a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 28 Jan 2024 11:16:21 -0800 Subject: [PATCH 043/321] Cleanup Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 15 ++++++--------- cpp/src/io/orc/reader_impl_chunking.hpp | 3 +++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index cc0fd46efaa..bb1df98a6b3 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -171,6 +171,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, std::vector> const& stripes) { if (_file_itm_data == nullptr) { _file_itm_data = std::make_unique(); } + if (_file_itm_data->has_no_data()) { return; } if (_file_itm_data->global_preprocessed) { return; } // TODO: move this to end of func. @@ -184,9 +185,6 @@ void reader::impl::global_preprocess(uint64_t skip_rows, auto const rows_to_read = _file_itm_data->rows_to_read; auto const& selected_stripes = _file_itm_data->selected_stripes; - // If no rows or stripes to read, return empty columns - if (rows_to_read == 0 || selected_stripes.empty()) { return; } - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; auto& lvl_stripe_sizes = _file_itm_data->lvl_stripe_sizes; lvl_stripe_data.resize(_selected_columns.num_levels()); @@ -204,6 +202,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // Get the total number of stripes across all input files. std::size_t num_stripes = selected_stripes.size(); + // Prepare data. // Iterates through levels of nested columns, child column will be one level down // compared to parent column. 
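 // (orc_col_map, filled in below, maps each ORC column id to its cudf column
 // index per nesting level; entries are initialized to -1 for columns that
 // were not selected.)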
auto& col_meta = *_col_meta; @@ -306,12 +305,11 @@ void reader::impl::global_preprocess(uint64_t skip_rows, void reader::impl::pass_preprocess() { + if (_file_itm_data->has_no_data()) { return; } + auto const rows_to_read = _file_itm_data->rows_to_read; auto const& selected_stripes = _file_itm_data->selected_stripes; - // If no rows or stripes to read, return empty columns - if (rows_to_read == 0 || selected_stripes.empty()) { return; } - if (_file_itm_data->pass_preprocessed) { return; } _file_itm_data->pass_preprocessed = true; @@ -387,12 +385,11 @@ void reader::impl::pass_preprocess() void reader::impl::subpass_preprocess() { + if (_file_itm_data->has_no_data()) { return; } + auto const rows_to_read = _file_itm_data->rows_to_read; auto const& selected_stripes = _file_itm_data->selected_stripes; - // If no rows or stripes to read, return empty columns - if (rows_to_read == 0 || selected_stripes.empty()) { return; } - if (_file_itm_data->subpass_preprocessed) { return; } _file_itm_data->subpass_preprocessed = true; diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 83a61703f20..8142f2bf270 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -104,6 +104,9 @@ struct chunk { * @brief Struct to store file-level data that remains constant for all chunks being read. */ struct file_intermediate_data { + // If no rows or stripes to read, return empty columns + bool has_no_data() const { return rows_to_read > 0 && selected_stripes.empty(); } + std::unordered_map compinfo_map; // bool compinfo_ready{false}; From ddffedd4457aa53035434c42a0b2445bee15740b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 28 Jan 2024 11:32:59 -0800 Subject: [PATCH 044/321] Add chunk validation Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 44 ++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index bb1df98a6b3..050df586c66 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -108,7 +108,7 @@ std::size_t gather_stream_info(std::size_t stripe_index, } struct cumulative_size { - std::size_t count; + int64_t count; std::size_t size_bytes; }; @@ -164,6 +164,33 @@ std::vector find_splits(host_span sizes, } #endif +void verify_splits(host_span splits, + host_span sizes, + size_type total_count, + size_t size_limit) +{ + chunk last_split{0, 0}; + int64_t count{0}; + for (auto const& split : splits) { + CUDF_EXPECTS(split.count > 0, "Invalid split count."); + CUDF_EXPECTS(last_split.start_idx + last_split.count == split.start_idx, + "Invalid split start_idx."); + count += split.count; + last_split = split; + + if (split.count > 1) { + std::size_t size{0}; + for (int64_t i = split.start_idx; i < split.start_idx + split.count; ++i) { + size += sizes[i].size_bytes; + } + CUDF_EXPECTS(size < size_limit, "Chunk total size exceeds limit."); + } + } + CUDF_EXPECTS(last_split.start_idx + last_split.count == sizes[sizes.size() - 1].count, + "Invalid split start_idx."); + CUDF_EXPECTS(count == total_count, "Invalid total count."); +} + } // namespace void reader::impl::global_preprocess(uint64_t skip_rows, @@ -202,6 +229,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // Get the total number of stripes across all input files. 
std::size_t num_stripes = selected_stripes.size(); + // TODO: Check if these data depends on pass and subpass, instead of global pass. // Prepare data. // Iterates through levels of nested columns, child column will be one level down // compared to parent column. @@ -289,7 +317,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, _file_itm_data->load_stripe_chunks = find_splits( total_stripe_sizes, - total_stripe_sizes.size(), + num_stripes, /*chunk_size_limit/2*/ total_stripe_sizes[total_stripe_sizes.size() - 1].size_bytes / 3); auto& splits = _file_itm_data->load_stripe_chunks; @@ -301,6 +329,18 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // std::cout << " total rows: " << _file_itm_data.rows_to_read << std::endl; // print_cumulative_row_info(stripe_size_bytes, " ", _chunk_read_info.chunks); + + // We need to verify that: + // 1. All chunk must have count > 0 + // 2. Chunks are continuous. + // 3. sum(sizes of stripes in a chunk) < size_limit if chunk has more than 1 stripe + // 4. sum(number of stripes in all chunks) == total_num_stripes. + // TODO: enable only in debug. + verify_splits( + splits, + total_stripe_sizes, + num_stripes, + /*chunk_size_limit/2*/ total_stripe_sizes[total_stripe_sizes.size() - 1].size_bytes / 3); } void reader::impl::pass_preprocess() From 4efa52c28cabbdbac5b8aa756dae47b6039c1432 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 28 Jan 2024 11:53:00 -0800 Subject: [PATCH 045/321] Fix bug Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 5 +++-- cpp/src/io/orc/reader_impl_chunking.hpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 050df586c66..3555005f852 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -126,7 +126,7 @@ std::vector find_splits(host_span sizes, { std::vector splits; - uint32_t cur_count = 0; + int64_t cur_count = 0; int64_t cur_pos = 0; size_t cur_cumulative_size = 0; auto const start = thrust::make_transform_iterator( @@ -198,7 +198,6 @@ void reader::impl::global_preprocess(uint64_t skip_rows, std::vector> const& stripes) { if (_file_itm_data == nullptr) { _file_itm_data = std::make_unique(); } - if (_file_itm_data->has_no_data()) { return; } if (_file_itm_data->global_preprocessed) { return; } // TODO: move this to end of func. 
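The four invariants listed above can be exercised on a toy input, reusing the stripes from the earlier splitting sketch. This snippet is illustrative only: plain asserts stand in for the CUDF_EXPECTS checks in verify_splits.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

struct cumulative_size { int64_t count; std::size_t size_bytes; };
struct chunk { int64_t start_idx; int64_t count; };

int main()
{
  // Inclusive prefix sums over three stripes of 10, 15 and 35 bytes.
  std::vector<cumulative_size> sizes{{1, 10}, {2, 25}, {3, 60}};
  std::vector<chunk> splits{{0, 2}, {2, 1}};  // splits for size_limit = 30
  std::size_t const size_limit = 30;

  int64_t consumed = 0;  // stripes covered by the chunks seen so far
  std::size_t cum  = 0;  // cumulative size up to the previous chunk's end
  for (auto const& s : splits) {
    assert(s.count > 0);              // (1) every chunk is non-empty
    assert(s.start_idx == consumed);  // (2) chunks are contiguous
    if (s.count > 1) {                // (3) multi-stripe chunks fit the limit
      assert(sizes[s.start_idx + s.count - 1].size_bytes - cum <= size_limit);
    }
    cum = sizes[s.start_idx + s.count - 1].size_bytes;
    consumed += s.count;
  }
  assert(consumed == 3);              // (4) chunks cover every stripe
  return 0;
}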
@@ -208,6 +207,8 @@ void reader::impl::global_preprocess(uint64_t skip_rows, std::tie( _file_itm_data->rows_to_skip, _file_itm_data->rows_to_read, _file_itm_data->selected_stripes) = _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream); + if (_file_itm_data->has_no_data()) { return; } + auto const rows_to_skip = _file_itm_data->rows_to_skip; auto const rows_to_read = _file_itm_data->rows_to_read; auto const& selected_stripes = _file_itm_data->selected_stripes; diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 8142f2bf270..61d46d24025 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -105,7 +105,7 @@ struct chunk { */ struct file_intermediate_data { // If no rows or stripes to read, return empty columns - bool has_no_data() const { return rows_to_read > 0 && selected_stripes.empty(); } + bool has_no_data() const { return rows_to_read == 0 || selected_stripes.empty(); } std::unordered_map compinfo_map; From 4ac494fe971ba1f0bf7e949019432a94bd01bc0d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 28 Jan 2024 14:28:32 -0800 Subject: [PATCH 046/321] Fix bug Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 47 ++++++++++++++++++------- cpp/src/io/orc/reader_impl_chunking.hpp | 2 ++ 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 3555005f852..ce729af8d13 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -171,6 +171,7 @@ void verify_splits(host_span splits, { chunk last_split{0, 0}; int64_t count{0}; + size_t cur_cumulative_size{0}; for (auto const& split : splits) { CUDF_EXPECTS(split.count > 0, "Invalid split count."); CUDF_EXPECTS(last_split.start_idx + last_split.count == split.start_idx, @@ -179,12 +180,28 @@ void verify_splits(host_span splits, last_split = split; if (split.count > 1) { - std::size_t size{0}; - for (int64_t i = split.start_idx; i < split.start_idx + split.count; ++i) { - size += sizes[i].size_bytes; + // printf("split: %ld - %ld, size: %zu, limit: %zu\n", + // split.start_idx, + // split.count, + // sizes[split.start_idx + split.count - 1].size_bytes - cur_cumulative_size, + // size_limit); + // fflush(stdout); + CUDF_EXPECTS( + sizes[split.start_idx + split.count - 1].size_bytes - cur_cumulative_size <= size_limit, + "Chunk total size exceeds limit."); + if (split.start_idx + split.count < total_count) { + // printf("wrong split: %ld - %ld, size: %zu, limit: %zu\n", + // split.start_idx, + // split.count + 1, + // sizes[split.start_idx + split.count].size_bytes - cur_cumulative_size, + // size_limit); + + CUDF_EXPECTS( + sizes[split.start_idx + split.count].size_bytes - cur_cumulative_size > size_limit, + "Invalid split."); } - CUDF_EXPECTS(size < size_limit, "Chunk total size exceeds limit."); } + cur_cumulative_size = sizes[split.start_idx + split.count - 1].size_bytes; } CUDF_EXPECTS(last_split.start_idx + last_split.count == sizes[sizes.size() - 1].count, "Invalid split start_idx."); @@ -316,11 +333,17 @@ void reader::impl::global_preprocess(uint64_t skip_rows, total_stripe_sizes.device_to_host_sync(_stream); - _file_itm_data->load_stripe_chunks = find_splits( - total_stripe_sizes, - num_stripes, - /*chunk_size_limit/2*/ total_stripe_sizes[total_stripe_sizes.size() - 1].size_bytes / 3); + // for (auto& size : total_stripe_sizes) { + // printf("size: %ld, %zu\n", size.count, 
size.size_bytes); + // } + auto limit = total_stripe_sizes[total_stripe_sizes.size() - 1].size_bytes / 3; + + _file_itm_data->load_stripe_chunks = find_splits(total_stripe_sizes, + num_stripes, + /*chunk_size_limit/2*/ limit); + +#if 0 auto& splits = _file_itm_data->load_stripe_chunks; printf("------------\nSplits (/%d): \n", (int)num_stripes); for (size_t idx = 0; idx < splits.size(); idx++) { @@ -337,13 +360,11 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // 3. sum(sizes of stripes in a chunk) < size_limit if chunk has more than 1 stripe // 4. sum(number of stripes in all chunks) == total_num_stripes. // TODO: enable only in debug. - verify_splits( - splits, - total_stripe_sizes, - num_stripes, - /*chunk_size_limit/2*/ total_stripe_sizes[total_stripe_sizes.size() - 1].size_bytes / 3); + verify_splits(splits, total_stripe_sizes, num_stripes, limit); +#endif } +// Load each chunk from `load_stripe_chunks`. void reader::impl::pass_preprocess() { if (_file_itm_data->has_no_data()) { return; } diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 61d46d24025..5c8770655c9 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -117,9 +117,11 @@ struct file_intermediate_data { std::vector> lvl_data_chunks; std::vector> lvl_stream_info; + // Chunks of stripes that can be load such that total of their data size is within a limit. std::vector load_stripe_chunks; std::size_t curr_load_stripe_chunk{0}; + // Chunks of stripes such that total of their decompression size is within a limit. std::vector decode_stripe_chunks; std::size_t curr_decode_stripe_chunk{0}; From 71789814f436b7fbc39ee82a66f0508eb3f3b0e1 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 28 Jan 2024 15:03:38 -0800 Subject: [PATCH 047/321] Use limit Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 37 +++++++++++++++---------- cpp/src/io/orc/reader_impl_chunking.hpp | 3 ++ 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index ce729af8d13..850e7d12e6a 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -121,24 +121,28 @@ struct cumulative_size_sum { #if 1 std::vector find_splits(host_span sizes, - size_type total_count, + int64_t total_count, size_t size_limit) { + // if (size_limit == 0) { return {chunk{0, total_count}}; } + CUDF_EXPECTS(size_limit > 0, "Invalid size limit"); + std::vector splits; + int64_t cur_count{0}; + int64_t cur_pos{0}; + size_t cur_cumulative_size{0}; - int64_t cur_count = 0; - int64_t cur_pos = 0; - size_t cur_cumulative_size = 0; - auto const start = thrust::make_transform_iterator( + auto const start = thrust::make_transform_iterator( sizes.begin(), [&](auto const& size) { return size.size_bytes - cur_cumulative_size; }); auto const end = start + static_cast(sizes.size()); - while (cur_count < static_cast(total_count)) { + + while (cur_count < total_count) { int64_t split_pos = thrust::distance(start, thrust::lower_bound(thrust::seq, start + cur_pos, end, size_limit)); // If we're past the end, or if the returned bucket is bigger than the chunk_read_limit, move // back one. 
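 // (Worked example: with cumulative sizes {10, 25, 60} and size_limit = 30,
 // lower_bound returns the 60 bucket; since 60 - 0 > 30, split_pos steps back
 // one to the 25 bucket, ending the first chunk after the second stripe.)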
- if (static_cast(split_pos) >= sizes.size() || + if (static_cast(split_pos) >= sizes.size() || (sizes[split_pos].size_bytes - cur_cumulative_size > size_limit)) { split_pos--; } @@ -164,6 +168,7 @@ std::vector find_splits(host_span sizes, } #endif +#ifdef PRINT_DEBUG void verify_splits(host_span splits, host_span sizes, size_type total_count, @@ -207,6 +212,7 @@ void verify_splits(host_span splits, "Invalid split start_idx."); CUDF_EXPECTS(count == total_count, "Invalid total count."); } +#endif } // namespace @@ -323,6 +329,12 @@ void reader::impl::global_preprocess(uint64_t skip_rows, total_stripe_sizes[stripe_idx] = {1, total_stripe_size}; } + // Load all chunks if there is no read limit. + if (_file_itm_data->read_size_limit == 0) { + _file_itm_data->load_stripe_chunks = {chunk{0, static_cast(num_stripes)}}; + return; + } + // Compute the prefix sum of stripe data sizes. total_stripe_sizes.host_to_device_async(_stream); thrust::inclusive_scan(rmm::exec_policy(_stream), @@ -337,13 +349,10 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // printf("size: %ld, %zu\n", size.count, size.size_bytes); // } - auto limit = total_stripe_sizes[total_stripe_sizes.size() - 1].size_bytes / 3; - - _file_itm_data->load_stripe_chunks = find_splits(total_stripe_sizes, - num_stripes, - /*chunk_size_limit/2*/ limit); + _file_itm_data->load_stripe_chunks = + find_splits(total_stripe_sizes, num_stripes, _file_itm_data->read_size_limit); -#if 0 +#ifdef PRINT_DEBUG auto& splits = _file_itm_data->load_stripe_chunks; printf("------------\nSplits (/%d): \n", (int)num_stripes); for (size_t idx = 0; idx < splits.size(); idx++) { @@ -360,7 +369,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // 3. sum(sizes of stripes in a chunk) < size_limit if chunk has more than 1 stripe // 4. sum(number of stripes in all chunks) == total_num_stripes. // TODO: enable only in debug. 
- verify_splits(splits, total_stripe_sizes, num_stripes, limit); + verify_splits(splits, total_stripe_sizes, num_stripes, _file_itm_data->read_size_limit); #endif } diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 5c8770655c9..cd94240a208 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -154,6 +154,9 @@ struct file_intermediate_data { size_type rows_to_read; std::vector selected_stripes; + // TODO: Change this + std::size_t read_size_limit{0}; + bool global_preprocessed{false}; bool pass_preprocessed{false}; bool subpass_preprocessed{false}; From 8759a5451089fd7f62faa1479ee5920f331d9bad Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 28 Jan 2024 17:48:57 -0800 Subject: [PATCH 048/321] Load data using chunk Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 80 +++++++++++++++--------- cpp/src/io/orc/reader_impl_chunking.hpp | 4 +- cpp/src/io/orc/reader_impl_preprocess.cu | 4 +- 3 files changed, 55 insertions(+), 33 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 850e7d12e6a..4f23614fa6f 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -241,7 +241,8 @@ void reader::impl::global_preprocess(uint64_t skip_rows, lvl_stripe_data.resize(_selected_columns.num_levels()); lvl_stripe_sizes.resize(_selected_columns.num_levels()); - auto& read_info = _file_itm_data->stream_read_info; + auto& read_info = _file_itm_data->stream_read_info; + auto& stripe_stream_read_chunks = _file_itm_data->stripe_stream_read_chunks; // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. std::unordered_map @@ -253,6 +254,8 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // Get the total number of stripes across all input files. std::size_t num_stripes = selected_stripes.size(); + stripe_stream_read_chunks.resize(num_stripes); + // TODO: Check if these data depends on pass and subpass, instead of global pass. // Prepare data. // Iterates through levels of nested columns, child column will be one level down @@ -269,7 +272,8 @@ void reader::impl::global_preprocess(uint64_t skip_rows, col_meta.orc_col_map[level][col.id] = col_id++; } - lvl_stripe_data[level].resize(num_stripes); + auto& stripe_data = lvl_stripe_data[level]; + stripe_data.resize(num_stripes); auto& stream_info = _file_itm_data->lvl_stream_info[level]; auto const num_columns = _selected_columns.levels[level].size(); @@ -291,6 +295,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, auto const stripe_footer = stripe.stripe_footer; std::size_t total_stripe_size{0}; + auto const last_read_size = static_cast(read_info.size()); for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto& stream_info = _file_itm_data->lvl_stream_info[level]; auto& stripe_sizes = lvl_stripe_sizes[level]; @@ -327,8 +332,14 @@ void reader::impl::global_preprocess(uint64_t skip_rows, } } total_stripe_sizes[stripe_idx] = {1, total_stripe_size}; + stripe_stream_read_chunks[stripe_idx] = + chunk{last_read_size, static_cast(read_info.size() - last_read_size)}; } + // DEBUG only + _file_itm_data->read_size_limit = + total_stripe_sizes[total_stripe_sizes.size() - 1].size_bytes / 3; + // Load all chunks if there is no read limit. 
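 // (A limit of zero means unbounded: a single chunk covering all stripes.)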
if (_file_itm_data->read_size_limit == 0) { _file_itm_data->load_stripe_chunks = {chunk{0, static_cast(num_stripes)}}; @@ -381,51 +392,61 @@ void reader::impl::pass_preprocess() auto const rows_to_read = _file_itm_data->rows_to_read; auto const& selected_stripes = _file_itm_data->selected_stripes; - if (_file_itm_data->pass_preprocessed) { return; } - _file_itm_data->pass_preprocessed = true; - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; auto& lvl_stripe_sizes = _file_itm_data->lvl_stripe_sizes; auto& read_info = _file_itm_data->stream_read_info; - std::size_t num_stripes = selected_stripes.size(); - - // TODO: this is a pass + // std::size_t num_stripes = selected_stripes.size(); + auto const stripe_chunk = + _file_itm_data->load_stripe_chunks[_file_itm_data->curr_load_stripe_chunk++]; + auto const stripe_start = stripe_chunk.start_idx; + auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; // Prepare the buffer to read raw data onto. for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto& stripe_data = lvl_stripe_data[level]; auto& stripe_sizes = lvl_stripe_sizes[level]; - for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { + for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { stripe_data[stripe_idx] = rmm::device_buffer( cudf::util::round_up_safe(stripe_sizes[stripe_idx], BUFFER_PADDING_MULTIPLE), _stream); } } + auto const& stripe_stream_read_chunks = _file_itm_data->stripe_stream_read_chunks; + std::vector, std::size_t>> read_tasks; // Should not read all, but read stripe by stripe. // read_info should be limited by stripe. // Read level-by-level. // TODO: Test with read and parse/decode column by column. // This is future work. - for (auto const& read : read_info) { - auto& stripe_data = lvl_stripe_data[read.level]; - auto dst_base = static_cast(stripe_data[read.stripe_idx].data()); - - if (_metadata.per_file_metadata[read.source_idx].source->is_device_read_preferred( - read.length)) { - read_tasks.push_back( - std::pair(_metadata.per_file_metadata[read.source_idx].source->device_read_async( - read.offset, read.length, dst_base + read.dst_pos, _stream), - read.length)); - - } else { - auto const buffer = - _metadata.per_file_metadata[read.source_idx].source->host_read(read.offset, read.length); - CUDF_EXPECTS(buffer->size() == read.length, "Unexpected discrepancy in bytes read."); - CUDF_CUDA_TRY(cudaMemcpyAsync( - dst_base + read.dst_pos, buffer->data(), read.length, cudaMemcpyDefault, _stream.value())); - _stream.synchronize(); + for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { + auto const read_chunk = stripe_stream_read_chunks[stripe_idx]; + auto const read_begin = read_chunk.start_idx; + auto const read_end = read_chunk.start_idx + read_chunk.count; + + for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) { + auto const& read = read_info[read_idx]; + auto& stripe_data = lvl_stripe_data[read.level]; + auto dst_base = static_cast(stripe_data[read.stripe_idx].data()); + + if (_metadata.per_file_metadata[read.source_idx].source->is_device_read_preferred( + read.length)) { + read_tasks.push_back( + std::pair(_metadata.per_file_metadata[read.source_idx].source->device_read_async( + read.offset, read.length, dst_base + read.dst_pos, _stream), + read.length)); + + } else { + auto const buffer = + _metadata.per_file_metadata[read.source_idx].source->host_read(read.offset, read.length); + CUDF_EXPECTS(buffer->size() == read.length, 
"Unexpected discrepancy in bytes read."); + CUDF_CUDA_TRY(cudaMemcpyAsync(dst_base + read.dst_pos, + buffer->data(), + read.length, + cudaMemcpyDefault, + _stream.value())); + _stream.synchronize(); #if 0 // This in theory should be faster, but in practice it's slower. Why? @@ -447,8 +468,10 @@ void reader::impl::pass_preprocess() }), read.length)); #endif + } } } + for (auto& task : read_tasks) { CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); } @@ -461,9 +484,6 @@ void reader::impl::subpass_preprocess() auto const rows_to_read = _file_itm_data->rows_to_read; auto const& selected_stripes = _file_itm_data->selected_stripes; - if (_file_itm_data->subpass_preprocessed) { return; } - _file_itm_data->subpass_preprocessed = true; - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; // TODO: This is subpass diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index cd94240a208..784cec0ca29 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -120,6 +120,7 @@ struct file_intermediate_data { // Chunks of stripes that can be load such that total of their data size is within a limit. std::vector load_stripe_chunks; std::size_t curr_load_stripe_chunk{0}; + bool more_stripe_to_load() { return curr_load_stripe_chunk < load_stripe_chunks.size(); } // Chunks of stripes such that total of their decompression size is within a limit. std::vector decode_stripe_chunks; @@ -149,6 +150,7 @@ struct file_intermediate_data { std::size_t level; }; std::vector stream_read_info; + std::vector stripe_stream_read_chunks; int64_t rows_to_skip; size_type rows_to_read; @@ -158,8 +160,6 @@ struct file_intermediate_data { std::size_t read_size_limit{0}; bool global_preprocessed{false}; - bool pass_preprocessed{false}; - bool subpass_preprocessed{false}; }; } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 0a99e831f8f..6bcf012d7e6 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -755,7 +755,9 @@ void reader::impl::prepare_data(uint64_t skip_rows, if (_selected_columns.num_levels() == 0) { return; } global_preprocess(skip_rows, num_rows_opt, stripes); - pass_preprocess(); + while (_file_itm_data->more_stripe_to_load()) { + pass_preprocess(); + } subpass_preprocess(); auto const rows_to_skip = _file_itm_data->rows_to_skip; From d811e0f0088620e885df78585f7f6efe6cd7ff1b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 28 Jan 2024 18:42:54 -0800 Subject: [PATCH 049/321] Parse stream by chunk Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 143 ++++++++++++++--------- cpp/src/io/orc/reader_impl_chunking.hpp | 6 +- cpp/src/io/orc/reader_impl_preprocess.cu | 9 +- 3 files changed, 100 insertions(+), 58 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 4f23614fa6f..7a126c8f653 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -243,6 +243,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, auto& read_info = _file_itm_data->stream_read_info; auto& stripe_stream_read_chunks = _file_itm_data->stripe_stream_read_chunks; + auto& lvl_stripe_stream_chunks = _file_itm_data->lvl_stripe_stream_chunks; // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. 
std::unordered_map
 stream_compinfo_map;
+ // TODO: fix this, loop over only the current chunk
+ auto const stripe_chunk =
+ _file_itm_data->load_stripe_chunks[_file_itm_data->curr_load_stripe_chunk++];
+
 // Parse the decompressed sizes for each stripe.
for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto& stream_info = _file_itm_data->lvl_stream_info[level]; @@ -500,67 +517,81 @@ void reader::impl::subpass_preprocess() auto& stripe_data = lvl_stripe_data[level]; if (stripe_data.empty()) { continue; } - // Setup row group descriptors if using indexes - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; - cudf::detail::hostdevice_vector compinfo( - 0, stream_info.size(), _stream); - - for (auto const& info : stream_info) { - compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, - info.length)); - stream_compinfo_map[stream_id_info{ - info.stripe_idx, info.level, info.orc_col_idx, info.kind}] = - &compinfo[compinfo.size() - 1]; + auto const& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; + + auto const stripe_start = stripe_chunk.start_idx; + auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; + for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { + auto const stream_chunk = stripe_stream_chunks[stripe_idx]; + auto const stream_start = stream_chunk.start_idx; + auto const stream_end = stream_chunk.start_idx + stream_chunk.count; + + // Setup row group descriptors if using indexes + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; + cudf::detail::hostdevice_vector compinfo( + 0, /*stream_info.size()*/ stream_chunk.count, _stream); + + // TODO: Instead of all stream info, loop using read_chunk info to process + // only stream info of the curr_load_stripe_chunk. + + for (auto stream_idx = stream_start; stream_idx < stream_end; ++stream_idx) { + auto const& info = stream_info[stream_idx]; + compinfo.push_back(gpu::CompressedStreamInfo( + static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, + info.length)); + stream_compinfo_map[stream_id_info{ + info.stripe_idx, info.level, info.orc_col_idx, info.kind}] = + &compinfo[compinfo.size() - 1]; #ifdef PRINT_DEBUG - printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n", - (int)info.stripe_idx, - (int)info.level, - (int)info.orc_col_idx, - (int)info.kind, - info.dst_pos, - info.length); - fflush(stdout); + printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n", + (int)info.stripe_idx, + (int)info.level, + (int)info.orc_col_idx, + (int)info.kind, + info.dst_pos, + info.length); + fflush(stdout); #endif - } + } - compinfo.host_to_device_async(_stream); + compinfo.host_to_device_async(_stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - _stream); - compinfo.device_to_host_sync(_stream); + gpu::ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + _stream); + compinfo.device_to_host_sync(_stream); - auto& compinfo_map = _file_itm_data->compinfo_map; - for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) { - compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, - stream_compinfo->num_uncompressed_blocks, - stream_compinfo->max_uncompressed_size}; + auto& compinfo_map = _file_itm_data->compinfo_map; + for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) { + compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, + 
stream_compinfo->num_uncompressed_blocks, + stream_compinfo->max_uncompressed_size}; #ifdef PRINT_DEBUG - printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n", - (int)stream_id.stripe_idx, - (int)stream_id.level, - (int)stream_id.orc_col_idx, - (int)stream_id.kind, - (size_t)stream_compinfo->num_compressed_blocks, - (size_t)stream_compinfo->num_uncompressed_blocks, - stream_compinfo->max_uncompressed_size); - fflush(stdout); + printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n", + (int)stream_id.stripe_idx, + (int)stream_id.level, + (int)stream_id.orc_col_idx, + (int)stream_id.kind, + (size_t)stream_compinfo->num_compressed_blocks, + (size_t)stream_compinfo->num_uncompressed_blocks, + stream_compinfo->max_uncompressed_size); + fflush(stdout); #endif - } + } - // Must clear so we will not overwrite the old compression info stream_id. - stream_compinfo_map.clear(); + // Must clear so we will not overwrite the old compression info stream_id. + stream_compinfo_map.clear(); - } else { - // printf("no compression \n"); - // fflush(stdout); + } else { + // printf("no compression \n"); + // fflush(stdout); - // Set decompressed data size equal to the input size. - // TODO + // Set decompressed data size equal to the input size. + // TODO + } } // printf(" end level %d\n\n", (int)level); diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 784cec0ca29..b903948ecad 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -150,7 +150,11 @@ struct file_intermediate_data { std::size_t level; }; std::vector stream_read_info; - std::vector stripe_stream_read_chunks; + std::vector stripe_stream_read_chunks; // chunk identify the reading streams (multiple + // streams can be read once) for each stripe + std::vector> + lvl_stripe_stream_chunks; // chunk identify all processing streams for each stripe, need to be + // level-based int64_t rows_to_skip; size_type rows_to_read; diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 6bcf012d7e6..1e619dea571 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -755,10 +755,17 @@ void reader::impl::prepare_data(uint64_t skip_rows, if (_selected_columns.num_levels() == 0) { return; } global_preprocess(skip_rows, num_rows_opt, stripes); + + // TODO: fix this, should be called once while (_file_itm_data->more_stripe_to_load()) { pass_preprocess(); } - subpass_preprocess(); + + // Fix this, subpass should be call once + _file_itm_data->curr_load_stripe_chunk = 0; + while (_file_itm_data->more_stripe_to_load()) { + subpass_preprocess(); + } auto const rows_to_skip = _file_itm_data->rows_to_skip; auto const rows_to_read = _file_itm_data->rows_to_read; From 23e14de58032950693c06eb0b90b3dfbd8e4b7b4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 28 Jan 2024 19:57:05 -0800 Subject: [PATCH 050/321] Add interface functions Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index b903948ecad..f3e582266d7 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -125,6 +125,12 @@ struct file_intermediate_data { // Chunks of stripes such that total of their decompression size is within a limit. 
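 // (These complement load_stripe_chunks above: load chunks bound the raw bytes
 // read from the source per pass, while decode chunks bound the decompressed
 // footprint handled per decode step.)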
std::vector decode_stripe_chunks; std::size_t curr_decode_stripe_chunk{0}; + bool more_stripe_to_decode() { return curr_decode_stripe_chunk < decode_stripe_chunks.size(); } + + // Chunk of rows in the internal decoded table to output for each `read_chunk()`. + std::vector output_table_chunks; + std::size_t curr_output_table_chunk{0}; + bool more_table_chunk_to_output() { return curr_output_table_chunk < output_table_chunks.size(); } // Each read correspond to one or more consecutive stream combined. struct stream_read_info { From 55104115b420630f638f80b1ce158baa7f233146 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 28 Jan 2024 22:11:30 -0800 Subject: [PATCH 051/321] Implement `chunk_read_data` struct Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 2 +- cpp/src/io/orc/reader_impl.hpp | 5 +- cpp/src/io/orc/reader_impl_chunking.cu | 81 ++++++++++++------------ cpp/src/io/orc/reader_impl_chunking.hpp | 63 ++++++++++++------ cpp/src/io/orc/reader_impl_preprocess.cu | 22 +++---- 5 files changed, 100 insertions(+), 73 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index cf3121fe659..e5470df05a2 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -83,7 +83,7 @@ table_with_metadata reader::impl::read_chunk_internal() auto out_metadata = make_output_metadata(); // If no rows or stripes to read, return empty columns - if (_file_itm_data->rows_to_read == 0 || _file_itm_data->selected_stripes.empty()) { + if (_file_itm_data.has_no_data()) { std::transform(_selected_columns.levels[0].begin(), _selected_columns.levels[0].end(), std::back_inserter(out_columns), diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 1b5bef347d0..b2e22a16b85 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -17,6 +17,7 @@ #pragma once #include "aggregate_orc_metadata.hpp" +#include "reader_impl_chunking.hpp" #include @@ -33,7 +34,6 @@ namespace cudf::io::orc::detail { struct reader_column_meta; -struct file_intermediate_data; /** * @brief Implementation for ORC reader. @@ -116,7 +116,8 @@ class reader::impl { std::vector> const _sources; // Unused but owns data for `_metadata` aggregate_orc_metadata _metadata; column_hierarchy const _selected_columns; // Construct from `_metadata` thus declare after it - std::unique_ptr _file_itm_data; + file_intermediate_data _file_itm_data; + chunk_read_data _chunk_read_data; std::unique_ptr _output_metadata; std::vector> _out_buffers; }; diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 7a126c8f653..767b21dd959 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -220,37 +220,36 @@ void reader::impl::global_preprocess(uint64_t skip_rows, std::optional const& num_rows_opt, std::vector> const& stripes) { - if (_file_itm_data == nullptr) { _file_itm_data = std::make_unique(); } - if (_file_itm_data->global_preprocessed) { return; } + if (_file_itm_data.global_preprocessed) { return; } // TODO: move this to end of func. 
- _file_itm_data->global_preprocessed = true; + _file_itm_data.global_preprocessed = true; // Select only stripes required (aka row groups) std::tie( - _file_itm_data->rows_to_skip, _file_itm_data->rows_to_read, _file_itm_data->selected_stripes) = + _file_itm_data.rows_to_skip, _file_itm_data.rows_to_read, _file_itm_data.selected_stripes) = _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream); - if (_file_itm_data->has_no_data()) { return; } + if (_file_itm_data.has_no_data()) { return; } - auto const rows_to_skip = _file_itm_data->rows_to_skip; - auto const rows_to_read = _file_itm_data->rows_to_read; - auto const& selected_stripes = _file_itm_data->selected_stripes; + // auto const rows_to_skip = _file_itm_data.rows_to_skip; + // auto const rows_to_read = _file_itm_data.rows_to_read; + auto const& selected_stripes = _file_itm_data.selected_stripes; - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; - auto& lvl_stripe_sizes = _file_itm_data->lvl_stripe_sizes; + auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; + auto& lvl_stripe_sizes = _file_itm_data.lvl_stripe_sizes; lvl_stripe_data.resize(_selected_columns.num_levels()); lvl_stripe_sizes.resize(_selected_columns.num_levels()); - auto& read_info = _file_itm_data->stream_read_info; - auto& stripe_stream_read_chunks = _file_itm_data->stripe_stream_read_chunks; - auto& lvl_stripe_stream_chunks = _file_itm_data->lvl_stripe_stream_chunks; + auto& read_info = _file_itm_data.stream_read_info; + auto& stripe_stream_read_chunks = _file_itm_data.stripe_stream_read_chunks; + auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_chunks; // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. std::unordered_map stream_compinfo_map; // Logically view streams as columns - _file_itm_data->lvl_stream_info.resize(_selected_columns.num_levels()); + _file_itm_data.lvl_stream_info.resize(_selected_columns.num_levels()); // Get the total number of stripes across all input files. std::size_t num_stripes = selected_stripes.size(); @@ -277,7 +276,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, auto& stripe_data = lvl_stripe_data[level]; stripe_data.resize(num_stripes); - auto& stream_info = _file_itm_data->lvl_stream_info[level]; + auto& stream_info = _file_itm_data.lvl_stream_info[level]; auto const num_columns = _selected_columns.levels[level].size(); auto& stripe_sizes = lvl_stripe_sizes[level]; stream_info.reserve(selected_stripes.size() * num_columns); // final size is unknown @@ -302,7 +301,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, std::size_t total_stripe_size{0}; auto const last_read_size = static_cast(read_info.size()); for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - auto& stream_info = _file_itm_data->lvl_stream_info[level]; + auto& stream_info = _file_itm_data.lvl_stream_info[level]; auto& stripe_sizes = lvl_stripe_sizes[level]; auto stream_count = stream_info.size(); @@ -348,12 +347,12 @@ void reader::impl::global_preprocess(uint64_t skip_rows, } // DEBUG only - _file_itm_data->read_size_limit = + _chunk_read_data.read_size_limit = total_stripe_sizes[total_stripe_sizes.size() - 1].size_bytes / 3; // Load all chunks if there is no read limit. 
- if (_file_itm_data->read_size_limit == 0) { - _file_itm_data->load_stripe_chunks = {chunk{0, static_cast(num_stripes)}}; + if (_chunk_read_data.read_size_limit == 0) { + _chunk_read_data.load_stripe_chunks = {chunk{0, static_cast(num_stripes)}}; return; } @@ -371,11 +370,11 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // printf("size: %ld, %zu\n", size.count, size.size_bytes); // } - _file_itm_data->load_stripe_chunks = - find_splits(total_stripe_sizes, num_stripes, _file_itm_data->read_size_limit); + _chunk_read_data.load_stripe_chunks = + find_splits(total_stripe_sizes, num_stripes, _chunk_read_data.read_size_limit); #ifdef PRINT_DEBUG - auto& splits = _file_itm_data->load_stripe_chunks; + auto& splits = _file_itm_data.load_stripe_chunks; printf("------------\nSplits (/%d): \n", (int)num_stripes); for (size_t idx = 0; idx < splits.size(); idx++) { printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count); @@ -391,25 +390,25 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // 3. sum(sizes of stripes in a chunk) < size_limit if chunk has more than 1 stripe // 4. sum(number of stripes in all chunks) == total_num_stripes. // TODO: enable only in debug. - verify_splits(splits, total_stripe_sizes, num_stripes, _file_itm_data->read_size_limit); + verify_splits(splits, total_stripe_sizes, num_stripes, _file_itm_data.read_size_limit); #endif } // Load each chunk from `load_stripe_chunks`. void reader::impl::pass_preprocess() { - if (_file_itm_data->has_no_data()) { return; } + if (_file_itm_data.has_no_data()) { return; } - auto const rows_to_read = _file_itm_data->rows_to_read; - auto const& selected_stripes = _file_itm_data->selected_stripes; + // auto const rows_to_read = _file_itm_data.rows_to_read; + // auto const& selected_stripes = _file_itm_data.selected_stripes; - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; - auto& lvl_stripe_sizes = _file_itm_data->lvl_stripe_sizes; - auto& read_info = _file_itm_data->stream_read_info; + auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; + auto& lvl_stripe_sizes = _file_itm_data.lvl_stripe_sizes; + auto& read_info = _file_itm_data.stream_read_info; // std::size_t num_stripes = selected_stripes.size(); auto const stripe_chunk = - _file_itm_data->load_stripe_chunks[_file_itm_data->curr_load_stripe_chunk++]; + _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk++]; auto const stripe_start = stripe_chunk.start_idx; auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; @@ -423,7 +422,7 @@ void reader::impl::pass_preprocess() } } - auto const& stripe_stream_read_chunks = _file_itm_data->stripe_stream_read_chunks; + auto const& stripe_stream_read_chunks = _file_itm_data.stripe_stream_read_chunks; std::vector, std::size_t>> read_tasks; // Should not read all, but read stripe by stripe. 
@@ -491,13 +490,13 @@ void reader::impl::pass_preprocess() void reader::impl::subpass_preprocess() { - if (_file_itm_data->has_no_data()) { return; } + if (_file_itm_data.has_no_data()) { return; } - auto const rows_to_read = _file_itm_data->rows_to_read; - auto const& selected_stripes = _file_itm_data->selected_stripes; + // auto const rows_to_read = _file_itm_data.rows_to_read; + // auto const& selected_stripes = _file_itm_data.selected_stripes; - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; - auto& lvl_stripe_stream_chunks = _file_itm_data->lvl_stripe_stream_chunks; + auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; + auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_chunks; // TODO: This is subpass // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. @@ -506,11 +505,11 @@ void reader::impl::subpass_preprocess() // TODO: fix this, loop only current chunk auto const stripe_chunk = - _file_itm_data->load_stripe_chunks[_file_itm_data->curr_load_stripe_chunk++]; + _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk++]; // Parse the decompressed sizes for each stripe. for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - auto& stream_info = _file_itm_data->lvl_stream_info[level]; + auto& stream_info = _file_itm_data.lvl_stream_info[level]; auto const num_columns = _selected_columns.levels[level].size(); // Tracker for eventually deallocating compressed and uncompressed data @@ -564,7 +563,7 @@ void reader::impl::subpass_preprocess() _stream); compinfo.device_to_host_sync(_stream); - auto& compinfo_map = _file_itm_data->compinfo_map; + auto& compinfo_map = _file_itm_data.compinfo_map; for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) { compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, stream_compinfo->num_uncompressed_blocks, @@ -599,7 +598,7 @@ void reader::impl::subpass_preprocess() } // end loop level // lvl_stripe_data.clear(); - // _file_itm_data->compinfo_ready = true; + // _file_itm_data.compinfo_ready = true; } } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index f3e582266d7..e8d071aae57 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -100,6 +100,11 @@ struct chunk { int64_t count; }; +struct range { + int64_t begin; + int64_t end; +}; + /** * @brief Struct to store file-level data that remains constant for all chunks being read. */ @@ -117,21 +122,6 @@ struct file_intermediate_data { std::vector> lvl_data_chunks; std::vector> lvl_stream_info; - // Chunks of stripes that can be load such that total of their data size is within a limit. - std::vector load_stripe_chunks; - std::size_t curr_load_stripe_chunk{0}; - bool more_stripe_to_load() { return curr_load_stripe_chunk < load_stripe_chunks.size(); } - - // Chunks of stripes such that total of their decompression size is within a limit. - std::vector decode_stripe_chunks; - std::size_t curr_decode_stripe_chunk{0}; - bool more_stripe_to_decode() { return curr_decode_stripe_chunk < decode_stripe_chunks.size(); } - - // Chunk of rows in the internal decoded table to output for each `read_chunk()`. - std::vector output_table_chunks; - std::size_t curr_output_table_chunk{0}; - bool more_table_chunk_to_output() { return curr_output_table_chunk < output_table_chunks.size(); } - // Each read correspond to one or more consecutive stream combined. 
struct stream_read_info { stream_read_info(uint64_t offset_, @@ -166,10 +156,47 @@ struct file_intermediate_data { size_type rows_to_read; std::vector selected_stripes; - // TODO: Change this - std::size_t read_size_limit{0}; - bool global_preprocessed{false}; }; +/** + * @brief Struct to store all data necessary for chunked reading. + */ +struct chunk_read_data { + explicit chunk_read_data(std::size_t output_size_limit_ = 0, std::size_t read_size_limit_ = 0) + : output_size_limit{output_size_limit_}, read_size_limit(read_size_limit_) + { + } + + std::size_t output_size_limit; // Maximum size (in bytes) of an output chunk, or 0 for no limit + std::size_t read_size_limit; // Maximum size (in bytes) of an output chunk, or 0 for no limit + + // Chunks of stripes that can be load such that total of their data size is within a limit. + std::vector load_stripe_chunks; + std::size_t curr_load_stripe_chunk{0}; + bool more_stripe_to_load() const { return curr_load_stripe_chunk < load_stripe_chunks.size(); } + + // Chunks of stripes such that total of their decompression size is within a limit. + std::vector decode_stripe_chunks; + std::size_t curr_decode_stripe_chunk{0}; + bool more_stripe_to_decode() const + { + return curr_decode_stripe_chunk < decode_stripe_chunks.size(); + } + + // Chunk of rows in the internal decoded table to output for each `read_chunk()`. + std::vector output_table_chunks; + std::size_t curr_output_table_chunk{0}; + bool more_table_chunk_to_output() const + { + return curr_output_table_chunk < output_table_chunks.size(); + } + + // Only has more chunk to output if: + bool has_next() const + { + return more_stripe_to_load() || more_stripe_to_decode() || more_table_chunk_to_output(); + } +}; + } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 1e619dea571..c1f8b7a12df 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -757,19 +757,19 @@ void reader::impl::prepare_data(uint64_t skip_rows, global_preprocess(skip_rows, num_rows_opt, stripes); // TODO: fix this, should be called once - while (_file_itm_data->more_stripe_to_load()) { + while (_chunk_read_data.more_stripe_to_load()) { pass_preprocess(); } // Fix this, subpass should be call once - _file_itm_data->curr_load_stripe_chunk = 0; - while (_file_itm_data->more_stripe_to_load()) { + _chunk_read_data.curr_load_stripe_chunk = 0; + while (_chunk_read_data.more_stripe_to_load()) { subpass_preprocess(); } - auto const rows_to_skip = _file_itm_data->rows_to_skip; - auto const rows_to_read = _file_itm_data->rows_to_read; - auto const& selected_stripes = _file_itm_data->selected_stripes; + auto const rows_to_skip = _file_itm_data.rows_to_skip; + auto const rows_to_read = _file_itm_data.rows_to_read; + auto const& selected_stripes = _file_itm_data.selected_stripes; // If no rows or stripes to read, return empty columns if (rows_to_read == 0 || selected_stripes.empty()) { return; } @@ -788,9 +788,9 @@ void reader::impl::prepare_data(uint64_t skip_rows, : std::make_unique(); }(); - auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; - auto& null_count_prefix_sums = _file_itm_data->null_count_prefix_sums; - auto& lvl_chunks = _file_itm_data->lvl_data_chunks; + auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; + auto& null_count_prefix_sums = _file_itm_data.null_count_prefix_sums; + auto& lvl_chunks = _file_itm_data.lvl_data_chunks; 
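// [Editor's note -- illustrative aside, not part of the patches] The progress flags that
// `chunk_read_data` introduces in this patch (more_stripe_to_load / more_stripe_to_decode /
// more_table_chunk_to_output / has_next) are meant to drive a chunked-read loop. A minimal
// sketch, assuming the `decompress_and_decode()` and `make_output_chunk()` stubs added a few
// patches later eventually do the real work (the function name `read_chunk_sketch` is
// hypothetical):
//
//   table_with_metadata reader::impl::read_chunk_sketch()
//   {
//     if (_chunk_read_data.more_stripe_to_load() && !_chunk_read_data.more_stripe_to_decode()) {
//       pass_preprocess();     // load the next chunk of stripes from the data sources
//       subpass_preprocess();  // parse compression info and plan the decode chunks
//     }
//     if (_chunk_read_data.more_stripe_to_decode() &&
//         !_chunk_read_data.more_table_chunk_to_output()) {
//       decompress_and_decode();  // materialize an internal decoded table
//     }
//     return make_output_chunk();  // slice the next output chunk out of that table
//   }
//
// A caller would keep invoking this while `_chunk_read_data.has_next()` returns true.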
lvl_stripe_data.resize(_selected_columns.num_levels()); lvl_chunks.resize(_selected_columns.num_levels()); @@ -851,7 +851,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, (rows_to_skip == 0); // Logically view streams as columns - auto const& stream_info = _file_itm_data->lvl_stream_info[level]; + auto const& stream_info = _file_itm_data.lvl_stream_info[level]; null_count_prefix_sums.emplace_back(); null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); @@ -978,7 +978,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, } // Setup row group descriptors if using indexes if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto decomp_data = decompress_stripe_data(_file_itm_data->compinfo_map, + auto decomp_data = decompress_stripe_data(_file_itm_data.compinfo_map, *_metadata.per_file_metadata[0].decompressor, stripe_data, stream_info, From 2dd88a0750c12aa8efd11163eef33e23b7b005ab Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 28 Jan 2024 22:24:29 -0800 Subject: [PATCH 052/321] Host read with sync just once Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 767b21dd959..166b4ea8264 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -424,7 +424,9 @@ void reader::impl::pass_preprocess() auto const& stripe_stream_read_chunks = _file_itm_data.stripe_stream_read_chunks; + std::vector> host_read_buffers; std::vector, std::size_t>> read_tasks; + // Should not read all, but read stripe by stripe. // read_info should be limited by stripe. // Read level-by-level. @@ -449,7 +451,7 @@ void reader::impl::pass_preprocess() read.length)); } else { - auto const buffer = + auto buffer = _metadata.per_file_metadata[read.source_idx].source->host_read(read.offset, read.length); CUDF_EXPECTS(buffer->size() == read.length, "Unexpected discrepancy in bytes read."); CUDF_CUDA_TRY(cudaMemcpyAsync(dst_base + read.dst_pos, @@ -457,7 +459,8 @@ void reader::impl::pass_preprocess() read.length, cudaMemcpyDefault, _stream.value())); - _stream.synchronize(); + // _stream.synchronize(); + host_read_buffers.emplace_back(std::move(buffer)); #if 0 // This in theory should be faster, but in practice it's slower. Why? @@ -482,7 +485,7 @@ void reader::impl::pass_preprocess() } } } - + if (host_read_buffers.size() > 0) { _stream.synchronize(); } for (auto& task : read_tasks) { CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); } From d2bb9111615955377f7f03b83a043e6ae6573f46 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 29 Jan 2024 08:23:55 -0800 Subject: [PATCH 053/321] Simplify loops Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 187 ++++++++++++------------- 1 file changed, 88 insertions(+), 99 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 166b4ea8264..86fe621c702 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -427,40 +427,31 @@ void reader::impl::pass_preprocess() std::vector> host_read_buffers; std::vector, std::size_t>> read_tasks; - // Should not read all, but read stripe by stripe. - // read_info should be limited by stripe. - // Read level-by-level. - // TODO: Test with read and parse/decode column by column. - // This is future work. 
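// [Editor's note -- illustrative aside, not part of the patches] A worked example of why the
// flattened loop introduced below is equivalent to the per-stripe loop it replaces:
// `stripe_stream_read_chunks[s]` holds a contiguous chunk{start_idx, count} range into
// `read_info` for stripe `s`, and consecutive stripes occupy consecutive ranges. With the
// hypothetical values
//   stripe 2 -> chunk{3, 2}   // read_info entries [3, 5)
//   stripe 3 -> chunk{5, 4}   // read_info entries [5, 9)
// reading stripes [2, 4) collapses into the single contiguous range [3, 9), which is exactly
// what combining the first chunk's begin with the last chunk's end computes.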
- for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { - auto const read_chunk = stripe_stream_read_chunks[stripe_idx]; - auto const read_begin = read_chunk.start_idx; - auto const read_end = read_chunk.start_idx + read_chunk.count; - - // TODO: instead of loop stripe => loop read, we can directly loop read of first + last stripe - for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) { - auto const& read = read_info[read_idx]; - auto& stripe_data = lvl_stripe_data[read.level]; - auto dst_base = static_cast(stripe_data[read.stripe_idx].data()); - - if (_metadata.per_file_metadata[read.source_idx].source->is_device_read_preferred( - read.length)) { - read_tasks.push_back( - std::pair(_metadata.per_file_metadata[read.source_idx].source->device_read_async( - read.offset, read.length, dst_base + read.dst_pos, _stream), - read.length)); - - } else { - auto buffer = - _metadata.per_file_metadata[read.source_idx].source->host_read(read.offset, read.length); - CUDF_EXPECTS(buffer->size() == read.length, "Unexpected discrepancy in bytes read."); - CUDF_CUDA_TRY(cudaMemcpyAsync(dst_base + read.dst_pos, - buffer->data(), - read.length, - cudaMemcpyDefault, - _stream.value())); - // _stream.synchronize(); - host_read_buffers.emplace_back(std::move(buffer)); + auto const stripe_first_chunk = stripe_stream_read_chunks[stripe_start]; + auto const stripe_last_chunk = stripe_stream_read_chunks[stripe_end - 1]; + auto const read_begin = stripe_first_chunk.start_idx; + auto const read_end = stripe_last_chunk.start_idx + stripe_last_chunk.count; + + for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) { + auto const& read = read_info[read_idx]; + auto& stripe_data = lvl_stripe_data[read.level]; + auto dst_base = static_cast(stripe_data[read.stripe_idx].data()); + + if (_metadata.per_file_metadata[read.source_idx].source->is_device_read_preferred( + read.length)) { + read_tasks.push_back( + std::pair(_metadata.per_file_metadata[read.source_idx].source->device_read_async( + read.offset, read.length, dst_base + read.dst_pos, _stream), + read.length)); + + } else { + auto buffer = + _metadata.per_file_metadata[read.source_idx].source->host_read(read.offset, read.length); + CUDF_EXPECTS(buffer->size() == read.length, "Unexpected discrepancy in bytes read."); + CUDF_CUDA_TRY(cudaMemcpyAsync( + dst_base + read.dst_pos, buffer->data(), read.length, cudaMemcpyDefault, _stream.value())); + // _stream.synchronize(); + host_read_buffers.emplace_back(std::move(buffer)); #if 0 // This in theory should be faster, but in practice it's slower. Why? 
@@ -482,9 +473,9 @@ void reader::impl::pass_preprocess() }), read.length)); #endif - } } } + if (host_read_buffers.size() > 0) { _stream.synchronize(); } for (auto& task : read_tasks) { CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); @@ -520,80 +511,78 @@ void reader::impl::subpass_preprocess() if (stripe_data.empty()) { continue; } auto const& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; - - auto const stripe_start = stripe_chunk.start_idx; - auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; - for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { - auto const stream_chunk = stripe_stream_chunks[stripe_idx]; - auto const stream_start = stream_chunk.start_idx; - auto const stream_end = stream_chunk.start_idx + stream_chunk.count; - - // Setup row group descriptors if using indexes - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; - cudf::detail::hostdevice_vector compinfo( - 0, /*stream_info.size()*/ stream_chunk.count, _stream); - - // TODO: Instead of all stream info, loop using read_chunk info to process - // only stream info of the curr_load_stripe_chunk. - - for (auto stream_idx = stream_start; stream_idx < stream_end; ++stream_idx) { - auto const& info = stream_info[stream_idx]; - compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, - info.length)); - stream_compinfo_map[stream_id_info{ - info.stripe_idx, info.level, info.orc_col_idx, info.kind}] = - &compinfo[compinfo.size() - 1]; + auto const stripe_start = stripe_chunk.start_idx; + auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; + auto const stripe_first_chunk = stripe_stream_chunks[stripe_start]; + auto const stripe_last_chunk = stripe_stream_chunks[stripe_end - 1]; + auto const stream_begin = stripe_first_chunk.start_idx; + auto const stream_end = stripe_last_chunk.start_idx + stripe_last_chunk.count; + auto const num_streams = stream_end - stream_begin; + + // Setup row group descriptors if using indexes + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; + cudf::detail::hostdevice_vector compinfo(0, num_streams, _stream); + + // TODO: Instead of all stream info, loop using read_chunk info to process + // only stream info of the curr_load_stripe_chunk. 
+ + for (auto stream_idx = stream_begin; stream_idx < stream_end; ++stream_idx) { + auto const& info = stream_info[stream_idx]; + compinfo.push_back(gpu::CompressedStreamInfo( + static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, + info.length)); + stream_compinfo_map[stream_id_info{ + info.stripe_idx, info.level, info.orc_col_idx, info.kind}] = + &compinfo[compinfo.size() - 1]; #ifdef PRINT_DEBUG - printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n", - (int)info.stripe_idx, - (int)info.level, - (int)info.orc_col_idx, - (int)info.kind, - info.dst_pos, - info.length); - fflush(stdout); + printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n", + (int)info.stripe_idx, + (int)info.level, + (int)info.orc_col_idx, + (int)info.kind, + info.dst_pos, + info.length); + fflush(stdout); #endif - } + } - compinfo.host_to_device_async(_stream); + compinfo.host_to_device_async(_stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - _stream); - compinfo.device_to_host_sync(_stream); + gpu::ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + _stream); + compinfo.device_to_host_sync(_stream); - auto& compinfo_map = _file_itm_data.compinfo_map; - for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) { - compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, - stream_compinfo->num_uncompressed_blocks, - stream_compinfo->max_uncompressed_size}; + auto& compinfo_map = _file_itm_data.compinfo_map; + for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) { + compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, + stream_compinfo->num_uncompressed_blocks, + stream_compinfo->max_uncompressed_size}; #ifdef PRINT_DEBUG - printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n", - (int)stream_id.stripe_idx, - (int)stream_id.level, - (int)stream_id.orc_col_idx, - (int)stream_id.kind, - (size_t)stream_compinfo->num_compressed_blocks, - (size_t)stream_compinfo->num_uncompressed_blocks, - stream_compinfo->max_uncompressed_size); - fflush(stdout); + printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n", + (int)stream_id.stripe_idx, + (int)stream_id.level, + (int)stream_id.orc_col_idx, + (int)stream_id.kind, + (size_t)stream_compinfo->num_compressed_blocks, + (size_t)stream_compinfo->num_uncompressed_blocks, + stream_compinfo->max_uncompressed_size); + fflush(stdout); #endif - } + } - // Must clear so we will not overwrite the old compression info stream_id. - stream_compinfo_map.clear(); + // Must clear so we will not overwrite the old compression info stream_id. + stream_compinfo_map.clear(); - } else { - // printf("no compression \n"); - // fflush(stdout); + } else { + // printf("no compression \n"); + // fflush(stdout); - // Set decompressed data size equal to the input size. - // TODO - } + // Set decompressed data size equal to the input size. 
+ // TODO } // printf(" end level %d\n\n", (int)level); From a3dff8f70cf8e99f8239cd7b66bb88ded89bdc25 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 29 Jan 2024 08:56:11 -0800 Subject: [PATCH 054/321] Add comments and change variable name Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 4 +++ cpp/src/io/orc/reader_impl_chunking.hpp | 7 ++++- cpp/src/io/orc/reader_impl_preprocess.cu | 37 ++++++++++++------------ 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 86fe621c702..b987cb3d8cf 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -522,6 +522,9 @@ void reader::impl::subpass_preprocess() // Setup row group descriptors if using indexes if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; + + // Cannot be cached, since this is for streams in a loaded stripe chunk, while + // the latter decoding step will use a different stripe chunk. cudf::detail::hostdevice_vector compinfo(0, num_streams, _stream); // TODO: Instead of all stream info, loop using read_chunk info to process @@ -558,6 +561,7 @@ void reader::impl::subpass_preprocess() auto& compinfo_map = _file_itm_data.compinfo_map; for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) { + // Cache these parsed numbers so they can be reused in the decoding step. compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, stream_compinfo->num_uncompressed_blocks, stream_compinfo->max_uncompressed_size}; diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index e8d071aae57..ea2c1d2cd81 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -116,8 +116,13 @@ struct file_intermediate_data { compinfo_map; // bool compinfo_ready{false}; - std::vector> lvl_stripe_sizes; + // The buffers are initialized for each reading stripe chunks. + // After decoding, such buffers need to be released. + // This can only be implemented after chunked output is ready. 
std::vector> lvl_stripe_data; + + std::vector> lvl_stripe_sizes; + std::vector>> null_count_prefix_sums; std::vector> lvl_data_chunks; std::vector> lvl_stream_info; diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index c1f8b7a12df..3333af1ac97 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -174,6 +174,7 @@ rmm::device_buffer decompress_stripe_data( cudf::detail::hostdevice_vector compinfo( 0, stream_info.size(), stream); + for (auto const& info : stream_info) { #ifdef PRINT_DEBUG printf("collec stream again [%d, %d, %d, %d]: dst = %lu, length = %lu\n", @@ -756,6 +757,8 @@ void reader::impl::prepare_data(uint64_t skip_rows, global_preprocess(skip_rows, num_rows_opt, stripes); + if (_file_itm_data.has_no_data()) { return; } + // TODO: fix this, should be called once while (_chunk_read_data.more_stripe_to_load()) { pass_preprocess(); @@ -771,9 +774,6 @@ void reader::impl::prepare_data(uint64_t skip_rows, auto const rows_to_read = _file_itm_data.rows_to_read; auto const& selected_stripes = _file_itm_data.selected_stripes; - // If no rows or stripes to read, return empty columns - if (rows_to_read == 0 || selected_stripes.empty()) { return; } - // Set up table for converting timestamp columns from local to UTC time auto const tz_table = [&, &selected_stripes = selected_stripes] { auto const has_timestamp_column = std::any_of( @@ -791,11 +791,13 @@ void reader::impl::prepare_data(uint64_t skip_rows, auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; auto& null_count_prefix_sums = _file_itm_data.null_count_prefix_sums; auto& lvl_chunks = _file_itm_data.lvl_data_chunks; - lvl_stripe_data.resize(_selected_columns.num_levels()); - lvl_chunks.resize(_selected_columns.num_levels()); + // TODO: move this to global step + lvl_chunks.resize(_selected_columns.num_levels()); _out_buffers.resize(_selected_columns.num_levels()); + std::size_t num_stripes = selected_stripes.size(); + // Iterates through levels of nested columns, child column will be one level down // compared to parent column. auto& col_meta = *_col_meta; @@ -830,12 +832,9 @@ void reader::impl::prepare_data(uint64_t skip_rows, } } - // Get the total number of stripes across all input files. 
- std::size_t total_num_stripes = selected_stripes.size(); - auto const num_columns = columns_level.size(); - auto& chunks = lvl_chunks[level]; - chunks = - cudf::detail::hostdevice_2dvector(total_num_stripes, num_columns, _stream); + auto const num_columns = columns_level.size(); + auto& chunks = lvl_chunks[level]; + chunks = cudf::detail::hostdevice_2dvector(num_stripes, num_columns, _stream); memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); const bool use_index = @@ -845,7 +844,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, // Only use if we don't have much work with complete columns & stripes // TODO: Consider nrows, gpu, and tune the threshold (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && - _metadata.get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) && + _metadata.get_row_index_stride() > 0 && num_columns * num_stripes < 8 * 128) && // Only use if first row is aligned to a stripe boundary // TODO: Fix logic to handle unaligned rows (rows_to_skip == 0); @@ -859,7 +858,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, _selected_columns.levels[level].size(), [&]() { return cudf::detail::make_zeroed_device_uvector_async( - total_num_stripes, _stream, rmm::mr::get_current_device_resource()); + num_stripes, _stream, rmm::mr::get_current_device_resource()); }); // Tracker for eventually deallocating compressed and uncompressed data @@ -868,8 +867,10 @@ void reader::impl::prepare_data(uint64_t skip_rows, std::size_t stripe_start_row = 0; std::size_t num_dict_entries = 0; std::size_t num_rowgroups = 0; - std::size_t stripe_idx = 0; - std::size_t stream_idx = 0; + + // TODO: Stripe and stream idx must be by chunk. + std::size_t stripe_idx = 0; + std::size_t stream_idx = 0; // std::vector, std::size_t>> read_tasks; for (auto const& stripe : selected_stripes) { @@ -984,7 +985,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, stream_info, chunks, row_groups, - total_num_stripes, + num_stripes, _metadata.get_row_index_stride(), level == 0, _stream); @@ -999,7 +1000,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, nullptr, chunks.base_device_ptr(), num_columns, - total_num_stripes, + num_stripes, num_rowgroups, _metadata.get_row_index_stride(), level == 0, @@ -1009,7 +1010,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, for (std::size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; - for (std::size_t j = 0; j < total_num_stripes; ++j) { + for (std::size_t j = 0; j < num_stripes; ++j) { if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { is_nullable = true; break; From 294ad503f10374296fc41a242d2235a56c8aca57 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 29 Jan 2024 09:02:00 -0800 Subject: [PATCH 055/321] Cleanup Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 4 ++++ cpp/src/io/orc/reader_impl.hpp | 4 ++++ cpp/src/io/orc/reader_impl_chunking.cu | 21 --------------------- 3 files changed, 8 insertions(+), 21 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index e5470df05a2..4ee25fdab70 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -115,6 +115,10 @@ table_with_metadata reader::impl::read_chunk_internal() return {std::make_unique(std::move(out_columns)), std::move(out_metadata)}; } +void reader::impl::decompress_and_decode() {} + +table_with_metadata reader::impl::make_output_chunk() { return table_with_metadata{}; } + // Forward to implementation 
reader::reader(std::vector>&& sources, orc_reader_options const& options, diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index b2e22a16b85..d7653e3e180 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -86,6 +86,10 @@ class reader::impl { void subpass_preprocess(); + void decompress_and_decode(); + + table_with_metadata make_output_chunk(); + /** * @brief Create the output table metadata from file metadata. * diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index b987cb3d8cf..75485cdd711 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -452,27 +452,6 @@ void reader::impl::pass_preprocess() dst_base + read.dst_pos, buffer->data(), read.length, cudaMemcpyDefault, _stream.value())); // _stream.synchronize(); host_read_buffers.emplace_back(std::move(buffer)); - -#if 0 - // This in theory should be faster, but in practice it's slower. Why? - read_tasks.push_back( - std::pair(std::async(std::launch::async, - [&, read = read, dst_base = dst_base] { - auto const buffer = - _metadata.per_file_metadata[read.source_idx].source->host_read( - read.offset, read.length); - CUDF_EXPECTS(buffer->size() == read.length, - "Unexpected discrepancy in bytes read."); - CUDF_CUDA_TRY(cudaMemcpyAsync(dst_base + read.dst_pos, - buffer->data(), - read.length, - cudaMemcpyDefault, - _stream.value())); - _stream.synchronize(); - return read.length; - }), - read.length)); -#endif } } From 5b00dac5c5a0f3fe8175f2d7caf8b58a2631c91e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 29 Jan 2024 19:20:50 -0800 Subject: [PATCH 056/321] Extract common code Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 37 ++++++++++++++++---------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 75485cdd711..9407a7c3d24 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -214,6 +214,24 @@ void verify_splits(host_span splits, } #endif +std::pair get_range(std::vector const& input_chunks, + chunk const& selected_chunks) +{ + // Range indices to input_chunks + auto const chunk_begin = selected_chunks.start_idx; + auto const chunk_end = selected_chunks.start_idx + selected_chunks.count; + + // The first and last chunk, according to selected_chunk + auto const& first_chunk = input_chunks[chunk_begin]; + auto const& last_chunk = input_chunks[chunk_end - 1]; + + // The range of data covered from the first to the last chunk. 
+ auto const begin = first_chunk.start_idx; + auto const end = last_chunk.start_idx + last_chunk.count; + + return {begin, end}; +} + } // namespace void reader::impl::global_preprocess(uint64_t skip_rows, @@ -422,15 +440,11 @@ void reader::impl::pass_preprocess() } } - auto const& stripe_stream_read_chunks = _file_itm_data.stripe_stream_read_chunks; - std::vector> host_read_buffers; std::vector, std::size_t>> read_tasks; - auto const stripe_first_chunk = stripe_stream_read_chunks[stripe_start]; - auto const stripe_last_chunk = stripe_stream_read_chunks[stripe_end - 1]; - auto const read_begin = stripe_first_chunk.start_idx; - auto const read_end = stripe_last_chunk.start_idx + stripe_last_chunk.count; + auto const& stripe_stream_read_chunks = _file_itm_data.stripe_stream_read_chunks; + auto const [read_begin, read_end] = get_range(stripe_stream_read_chunks, stripe_chunk); for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) { auto const& read = read_info[read_idx]; @@ -489,14 +503,9 @@ void reader::impl::subpass_preprocess() auto& stripe_data = lvl_stripe_data[level]; if (stripe_data.empty()) { continue; } - auto const& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; - auto const stripe_start = stripe_chunk.start_idx; - auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; - auto const stripe_first_chunk = stripe_stream_chunks[stripe_start]; - auto const stripe_last_chunk = stripe_stream_chunks[stripe_end - 1]; - auto const stream_begin = stripe_first_chunk.start_idx; - auto const stream_end = stripe_last_chunk.start_idx + stripe_last_chunk.count; - auto const num_streams = stream_end - stream_begin; + auto const& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; + auto const [stream_begin, stream_end] = get_range(stripe_stream_chunks, stripe_chunk); + auto const num_streams = stream_end - stream_begin; // Setup row group descriptors if using indexes if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { From 4e2aec15a9c8ce4d016087d41194f7ee45d769d9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 29 Jan 2024 20:18:07 -0800 Subject: [PATCH 057/321] Split chunks by decompression size Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 80 +++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 9407a7c3d24..4efaf26e402 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -108,8 +108,8 @@ std::size_t gather_stream_info(std::size_t stripe_index, } struct cumulative_size { - int64_t count; - std::size_t size_bytes; + int64_t count{0}; + std::size_t size_bytes{0}; }; struct cumulative_size_sum { @@ -214,6 +214,13 @@ void verify_splits(host_span splits, } #endif +/** + * @brief + * + * @param input_chunks + * @param selected_chunks + * @return + */ std::pair get_range(std::vector const& input_chunks, chunk const& selected_chunks) { @@ -391,8 +398,8 @@ void reader::impl::global_preprocess(uint64_t skip_rows, _chunk_read_data.load_stripe_chunks = find_splits(total_stripe_sizes, num_stripes, _chunk_read_data.read_size_limit); -#ifdef PRINT_DEBUG - auto& splits = _file_itm_data.load_stripe_chunks; +#ifndef PRINT_DEBUG + auto& splits = _chunk_read_data.load_stripe_chunks; printf("------------\nSplits (/%d): \n", (int)num_stripes); for (size_t idx = 0; idx < splits.size(); idx++) { printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count); @@ -408,7 
+415,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // 3. sum(sizes of stripes in a chunk) < size_limit if chunk has more than 1 stripe // 4. sum(number of stripes in all chunks) == total_num_stripes. // TODO: enable only in debug. - verify_splits(splits, total_stripe_sizes, num_stripes, _file_itm_data.read_size_limit); +// verify_splits(splits, total_stripe_sizes, num_stripes, _chunk_read_data.read_size_limit); #endif } @@ -480,7 +487,6 @@ void reader::impl::subpass_preprocess() if (_file_itm_data.has_no_data()) { return; } // auto const rows_to_read = _file_itm_data.rows_to_read; - // auto const& selected_stripes = _file_itm_data.selected_stripes; auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_chunks; @@ -494,6 +500,9 @@ void reader::impl::subpass_preprocess() auto const stripe_chunk = _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk++]; + cudf::detail::hostdevice_vector stripe_decompression_sizes(stripe_chunk.count, + _stream); + // Parse the decompressed sizes for each stripe. for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto& stream_info = _file_itm_data.lvl_stream_info[level]; @@ -553,6 +562,10 @@ void reader::impl::subpass_preprocess() compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, stream_compinfo->num_uncompressed_blocks, stream_compinfo->max_uncompressed_size}; + stripe_decompression_sizes[stream_id.stripe_idx - stripe_chunk.start_idx] = { + 1, + stripe_decompression_sizes[stream_id.stripe_idx - stripe_chunk.start_idx].size_bytes + + stream_compinfo->max_uncompressed_size}; #ifdef PRINT_DEBUG printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n", (int)stream_id.stripe_idx, @@ -573,14 +586,65 @@ void reader::impl::subpass_preprocess() // printf("no compression \n"); // fflush(stdout); - // Set decompressed data size equal to the input size. - // TODO + // Set decompression size equal to the input size. + for (auto stream_idx = stream_begin; stream_idx < stream_end; ++stream_idx) { + auto const& info = stream_info[stream_idx]; + stripe_decompression_sizes[info.stripe_idx - stripe_chunk.start_idx] = { + 1, + stripe_decompression_sizes[info.stripe_idx - stripe_chunk.start_idx].size_bytes + + info.length}; + } } // printf(" end level %d\n\n", (int)level); } // end loop level + // Compute the prefix sum of stripe data sizes. 
+ stripe_decompression_sizes.host_to_device_async(_stream); + thrust::inclusive_scan(rmm::exec_policy(_stream), + stripe_decompression_sizes.d_begin(), + stripe_decompression_sizes.d_end(), + stripe_decompression_sizes.d_begin(), + cumulative_size_sum{}); + + stripe_decompression_sizes.device_to_host_sync(_stream); + + // DEBUG only + _chunk_read_data.read_size_limit = + stripe_decompression_sizes[stripe_decompression_sizes.size() - 1].size_bytes / 3; + + _chunk_read_data.decode_stripe_chunks = + find_splits(stripe_decompression_sizes, stripe_chunk.count, _chunk_read_data.read_size_limit); + for (auto& chunk : _chunk_read_data.decode_stripe_chunks) { + chunk.start_idx += stripe_chunk.start_idx; + } + + for (auto& size : stripe_decompression_sizes) { + printf("size: %ld, %zu\n", size.count, size.size_bytes); + } + +#ifndef PRINT_DEBUG + auto& splits = _chunk_read_data.decode_stripe_chunks; + printf("------------\nSplits second level (/%d): \n", (int)stripe_chunk.count); + for (size_t idx = 0; idx < splits.size(); idx++) { + printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count); + } + fflush(stdout); + + // std::cout << " total rows: " << _file_itm_data.rows_to_read << std::endl; + // print_cumulative_row_info(stripe_size_bytes, " ", _chunk_read_info.chunks); + + // We need to verify that: + // 1. All chunk must have count > 0 + // 2. Chunks are continuous. + // 3. sum(sizes of stripes in a chunk) < size_limit if chunk has more than 1 stripe + // 4. sum(number of stripes in all chunks) == total_num_stripes. + // TODO: enable only in debug. +// verify_splits(splits, stripe_decompression_sizes, stripe_chunk.count, +// _file_itm_data.read_size_limit); +#endif + // lvl_stripe_data.clear(); // _file_itm_data.compinfo_ready = true; } From 33d304a39501cfee6edf9f6900903b1e39f3c071 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 29 Jan 2024 20:27:50 -0800 Subject: [PATCH 058/321] Fix bug Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 4efaf26e402..d01039b7a62 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -371,14 +371,11 @@ void reader::impl::global_preprocess(uint64_t skip_rows, chunk{last_read_size, static_cast(read_info.size() - last_read_size)}; } - // DEBUG only - _chunk_read_data.read_size_limit = - total_stripe_sizes[total_stripe_sizes.size() - 1].size_bytes / 3; - // Load all chunks if there is no read limit. if (_chunk_read_data.read_size_limit == 0) { _chunk_read_data.load_stripe_chunks = {chunk{0, static_cast(num_stripes)}}; - return; + // TODO: DEBUG only + // return; } // Compute the prefix sum of stripe data sizes. 
@@ -391,9 +388,13 @@ void reader::impl::global_preprocess(uint64_t skip_rows, total_stripe_sizes.device_to_host_sync(_stream); - // for (auto& size : total_stripe_sizes) { - // printf("size: %ld, %zu\n", size.count, size.size_bytes); - // } + for (auto& size : total_stripe_sizes) { + printf("size: %ld, %zu\n", size.count, size.size_bytes); + } + + // DEBUG only + _chunk_read_data.read_size_limit = + total_stripe_sizes[total_stripe_sizes.size() - 1].size_bytes / 3; _chunk_read_data.load_stripe_chunks = find_splits(total_stripe_sizes, num_stripes, _chunk_read_data.read_size_limit); From b4b286a87a01fe41cae5294af49b239d09d17388 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 29 Jan 2024 20:40:17 -0800 Subject: [PATCH 059/321] Implement chunk merging Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index d01039b7a62..ad5bfdd398b 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -164,6 +164,17 @@ std::vector find_splits(host_span sizes, cur_cumulative_size = sizes[split_pos].size_bytes; } + // If the last chunk's element count is smaller than `merge_threshold` (a fraction) of the + // second last one's, merge it into the second last one. + if (splits.size() > 1) { + auto constexpr merge_threshold = 0.15; + if (auto const last = splits.back(), second_last = splits[splits.size() - 2]; + last.count <= static_cast(merge_threshold * second_last.count)) { + splits.pop_back(); + splits.back().count += last.count; + } + } + return splits; } #endif @@ -601,6 +612,13 @@ void reader::impl::subpass_preprocess() } // end loop level + // Decode all chunks if there is no read limit. + if (_chunk_read_data.read_size_limit == 0) { + _chunk_read_data.decode_stripe_chunks = {stripe_chunk}; + // TODO: DEBUG only + // return; + } + // Compute the prefix sum of stripe data sizes.
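// [Editor's note -- illustrative aside, not part of the patches] A worked example of the merge
// step added to find_splits() above: given splits chunk{0, 20} and chunk{20, 2}, the trailing
// split satisfies 2 <= 0.15 * 20 = 3, so it is popped and absorbed, leaving the single split
// chunk{0, 22}. This keeps the reader from scheduling a tiny trailing chunk.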
stripe_decompression_sizes.host_to_device_async(_stream); thrust::inclusive_scan(rmm::exec_policy(_stream), @@ -612,8 +630,8 @@ void reader::impl::subpass_preprocess() stripe_decompression_sizes.device_to_host_sync(_stream); // DEBUG only - _chunk_read_data.read_size_limit = - stripe_decompression_sizes[stripe_decompression_sizes.size() - 1].size_bytes / 3; + // _chunk_read_data.read_size_limit = + // stripe_decompression_sizes[stripe_decompression_sizes.size() - 1].size_bytes / 3; _chunk_read_data.decode_stripe_chunks = find_splits(stripe_decompression_sizes, stripe_chunk.count, _chunk_read_data.read_size_limit); @@ -622,7 +640,7 @@ void reader::impl::subpass_preprocess() } for (auto& size : stripe_decompression_sizes) { - printf("size: %ld, %zu\n", size.count, size.size_bytes); + printf("decomp size: %ld, %zu\n", size.count, size.size_bytes); } #ifndef PRINT_DEBUG From 2e81db13cd6aa05412209642429e1dcf5c9d0b41 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 29 Jan 2024 20:57:49 -0800 Subject: [PATCH 060/321] Rename variable Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 29 +++++++++++--------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index ad5bfdd398b..ff9dc6cd3ef 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -512,8 +512,8 @@ void reader::impl::subpass_preprocess() auto const stripe_chunk = _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk++]; - cudf::detail::hostdevice_vector stripe_decompression_sizes(stripe_chunk.count, - _stream); + cudf::detail::hostdevice_vector stripe_decomp_sizes(stripe_chunk.count, _stream); + std::fill(stripe_decomp_sizes.begin(), stripe_decomp_sizes.end(), cumulative_size{1, 0}); // Parse the decompressed sizes for each stripe. for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { @@ -574,10 +574,8 @@ void reader::impl::subpass_preprocess() compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, stream_compinfo->num_uncompressed_blocks, stream_compinfo->max_uncompressed_size}; - stripe_decompression_sizes[stream_id.stripe_idx - stripe_chunk.start_idx] = { - 1, - stripe_decompression_sizes[stream_id.stripe_idx - stripe_chunk.start_idx].size_bytes + - stream_compinfo->max_uncompressed_size}; + stripe_decomp_sizes[stream_id.stripe_idx - stripe_chunk.start_idx].size_bytes += + stream_compinfo->max_uncompressed_size; #ifdef PRINT_DEBUG printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n", (int)stream_id.stripe_idx, @@ -601,10 +599,7 @@ void reader::impl::subpass_preprocess() // Set decompression size equal to the input size. for (auto stream_idx = stream_begin; stream_idx < stream_end; ++stream_idx) { auto const& info = stream_info[stream_idx]; - stripe_decompression_sizes[info.stripe_idx - stripe_chunk.start_idx] = { - 1, - stripe_decompression_sizes[info.stripe_idx - stripe_chunk.start_idx].size_bytes + - info.length}; + stripe_decomp_sizes[info.stripe_idx - stripe_chunk.start_idx].size_bytes += info.length; } } @@ -620,26 +615,26 @@ void reader::impl::subpass_preprocess() } // Compute the prefix sum of stripe data sizes. 
- stripe_decompression_sizes.host_to_device_async(_stream); + stripe_decomp_sizes.host_to_device_async(_stream); thrust::inclusive_scan(rmm::exec_policy(_stream), - stripe_decompression_sizes.d_begin(), - stripe_decompression_sizes.d_end(), - stripe_decompression_sizes.d_begin(), + stripe_decomp_sizes.d_begin(), + stripe_decomp_sizes.d_end(), + stripe_decomp_sizes.d_begin(), cumulative_size_sum{}); - stripe_decompression_sizes.device_to_host_sync(_stream); + stripe_decomp_sizes.device_to_host_sync(_stream); // DEBUG only // _chunk_read_data.read_size_limit = // stripe_decompression_sizes[stripe_decompression_sizes.size() - 1].size_bytes / 3; _chunk_read_data.decode_stripe_chunks = - find_splits(stripe_decompression_sizes, stripe_chunk.count, _chunk_read_data.read_size_limit); + find_splits(stripe_decomp_sizes, stripe_chunk.count, _chunk_read_data.read_size_limit); for (auto& chunk : _chunk_read_data.decode_stripe_chunks) { chunk.start_idx += stripe_chunk.start_idx; } - for (auto& size : stripe_decompression_sizes) { + for (auto& size : stripe_decomp_sizes) { printf("decomp size: %ld, %zu\n", size.count, size.size_bytes); } From 3260ecc5a1d4e27ab5ff0d0990000896ec6a3d38 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 29 Jan 2024 21:35:41 -0800 Subject: [PATCH 061/321] Find a test that can benefit from subpass Signed-off-by: Nghia Truong --- cpp/tests/io/orc_test.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 5ff27bd9e10..a0a0cecb038 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -225,16 +225,17 @@ struct SkipRowTest { TYPED_TEST(OrcWriterNumericTypeTest, SingleColumn) { - auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 100; }); - constexpr auto num_rows = 100; + constexpr auto num_rows = 10000000; column_wrapper col(sequence, sequence + num_rows); table_view expected({col}); auto filepath = temp_env->get_temp_filepath("OrcSingleColumn.orc"); cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .compression(cudf::io::compression_type::SNAPPY); cudf::io::write_orc(out_opts); cudf::io::orc_reader_options in_opts = From 2dca1eb9d2d7049a04f1b5549594e0c482ce3efa Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 2 Feb 2024 13:28:05 +0700 Subject: [PATCH 062/321] Reorganize, rename, and add docs for variables --- cpp/src/io/orc/reader_impl_chunking.cu | 14 +- cpp/src/io/orc/reader_impl_chunking.hpp | 174 ++++++++++++++--------- cpp/src/io/orc/reader_impl_preprocess.cu | 4 +- 3 files changed, 119 insertions(+), 73 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index ff9dc6cd3ef..2463196dc31 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -276,8 +276,8 @@ void reader::impl::global_preprocess(uint64_t skip_rows, lvl_stripe_data.resize(_selected_columns.num_levels()); lvl_stripe_sizes.resize(_selected_columns.num_levels()); - auto& read_info = _file_itm_data.stream_read_info; - auto& stripe_stream_read_chunks = _file_itm_data.stripe_stream_read_chunks; + auto& read_info = _file_itm_data.data_read_info; + auto& stripe_data_read_chunks = _file_itm_data.stripe_data_read_chunks; auto& 
lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_chunks; // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. @@ -290,7 +290,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // Get the total number of stripes across all input files. std::size_t num_stripes = selected_stripes.size(); - stripe_stream_read_chunks.resize(num_stripes); + stripe_data_read_chunks.resize(num_stripes); lvl_stripe_stream_chunks.resize(_selected_columns.num_levels()); // TODO: Check if these data depends on pass and subpass, instead of global pass. @@ -378,7 +378,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, } } total_stripe_sizes[stripe_idx] = {1, total_stripe_size}; - stripe_stream_read_chunks[stripe_idx] = + stripe_data_read_chunks[stripe_idx] = chunk{last_read_size, static_cast(read_info.size() - last_read_size)}; } @@ -441,7 +441,7 @@ void reader::impl::pass_preprocess() auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; auto& lvl_stripe_sizes = _file_itm_data.lvl_stripe_sizes; - auto& read_info = _file_itm_data.stream_read_info; + auto& read_info = _file_itm_data.data_read_info; // std::size_t num_stripes = selected_stripes.size(); auto const stripe_chunk = @@ -462,8 +462,8 @@ void reader::impl::pass_preprocess() std::vector> host_read_buffers; std::vector, std::size_t>> read_tasks; - auto const& stripe_stream_read_chunks = _file_itm_data.stripe_stream_read_chunks; - auto const [read_begin, read_end] = get_range(stripe_stream_read_chunks, stripe_chunk); + auto const& stripe_data_read_chunks = _file_itm_data.stripe_data_read_chunks; + auto const [read_begin, read_end] = get_range(stripe_data_read_chunks, stripe_chunk); diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index ea2c1d2cd81..1e5d0a3d988 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -31,9 +31,45 @@ namespace cudf::io::orc::detail { /** - * @brief Struct that maps ORC streams to columns + * @brief Struct that stores the identification of an ORC stream + */ +struct stream_id_info { + uint32_t stripe_idx; // TODO: check if this is correct stripe processing index, not stripe index in source + // TODO: change type below + std::size_t level; // level of the nested column + uint32_t orc_col_idx; // orc column id + StreamKind kind; // stream kind + + struct hash { + std::size_t operator()(stream_id_info const& index) const + { + auto const hasher = std::hash{}; + return hasher(index.stripe_idx) ^ hasher(index.level) ^ + hasher(static_cast(index.orc_col_idx)) ^ + hasher(static_cast(index.kind)); + } + }; + struct equal_to { + bool operator()(stream_id_info const& lhs, stream_id_info const& rhs) const + { + return lhs.stripe_idx == rhs.stripe_idx && lhs.level == rhs.level && + lhs.orc_col_idx == rhs.orc_col_idx && lhs.kind == rhs.kind; + } + }; +}; + +/** + * @brief Map to look up a value from a stream id. +*/ +template +using stream_id_map = + std::unordered_map; + +/** + * @brief Struct that stores information about an ORC stream.
*/ struct orc_stream_info { + // TODO: remove constructor explicit orc_stream_info(uint64_t offset_, std::size_t dst_pos_, uint32_t length_, @@ -57,79 +93,72 @@ struct orc_stream_info { (int)kind); #endif } - uint64_t offset; // offset in file - std::size_t dst_pos; // offset in memory relative to start of compressed stripe data - std::size_t length; // length in file - uint32_t stripe_idx; // stripe processing index, not stripe index in source - std::size_t level; // TODO - uint32_t orc_col_idx; - StreamKind kind; -}; + // Data info: + uint64_t offset; // offset in data source + std::size_t dst_pos; // offset to store data in memory relative to start of raw stripe data + std::size_t length; // stream length to read -// unify this with orc_stream_info -struct stream_id_info { - std::size_t stripe_idx; - std::size_t level; - uint32_t orc_col_idx; - StreamKind kind; + // Store location of the stream in the stripe, so we can look up where this stream comes from. + stream_id_info id; }; + +/** + * @brief Struct that stores compression information for a stripe at a specific nested level. + */ struct stripe_level_comp_info { std::size_t num_compressed_blocks{0}; std::size_t num_uncompressed_blocks{0}; std::size_t total_decomp_size{0}; }; -struct stream_id_equal { - bool operator()(stream_id_info const& lhs, stream_id_info const& rhs) const - { - return lhs.stripe_idx == rhs.stripe_idx && lhs.level == rhs.level && - lhs.orc_col_idx == rhs.orc_col_idx && lhs.kind == rhs.kind; - } -}; -struct stream_id_hash { - std::size_t operator()(stream_id_info const& index) const - { - auto const hasher = std::hash{}; - return hasher(index.stripe_idx) ^ hasher(index.level) ^ - hasher(static_cast(index.orc_col_idx)) ^ - hasher(static_cast(index.kind)); - } -}; +// TODO: remove this and use range instead /** + * @brief Struct that stores information about a chunk of data. + */ struct chunk { int64_t start_idx; int64_t count; }; +/** + * @brief Struct that stores information about a range of data. + */ struct range { int64_t begin; int64_t end; }; /** - * @brief Struct to store file-level data that remains constant for all chunks being read. + * @brief Struct to store file-level data that remains constant for all chunks being output. */ struct file_intermediate_data { - // If no rows or stripes to read, return empty columns + int64_t rows_to_skip; + size_type rows_to_read; + std::vector selected_stripes; + + + // Return true if no rows or stripes to read. bool has_no_data() const { return rows_to_read == 0 || selected_stripes.empty(); } - std::unordered_map - compinfo_map; - // bool compinfo_ready{false}; + // Store the compression information for each data stream. + stream_id_map compinfo_map; - // The buffers are initialized for each reading stripe chunks. - // After decoding, such buffers need to be released. - // This can only be implemented after chunked output is ready. + // The buffers to store raw data read from disk, initialized for each stripe chunk being read. + // After decoding, such buffers can be released. + // This can only be implemented after chunked output is ready. std::vector> lvl_stripe_data; + // Store the size of each stripe at each nested level. + // This is used to initialize the stripe_data buffers. std::vector> lvl_stripe_sizes; - std::vector>> null_count_prefix_sums; - std::vector> lvl_data_chunks; - std::vector> lvl_stream_info; - // Each read correspond to one or more consecutive stream combined. - struct stream_read_info { - stream_read_info(uint64_t offset_, + // Store information to identify where to read a chunk of data from source.
+ // Each read corresponds to one or more consecutive streams combined. + struct data_read_info { + // TODO: remove constructor + data_read_info(uint64_t offset_, std::size_t length_, std::size_t dst_pos_, std::size_t source_idx_, @@ -143,23 +172,39 @@ struct file_intermediate_data { level(level_) { } - uint64_t offset; - std::size_t length; - std::size_t dst_pos; - std::size_t source_idx; - std::size_t stripe_idx; - std::size_t level; + uint64_t offset; // offset in data source + std::size_t dst_pos; // offset to store data in memory relative to start of raw stripe data + std::size_t length; // data length to read + std::size_t source_idx; // the data source id + std::size_t stripe_idx; // stream id TODO: processing or source stripe id? + std::size_t level; // nested level }; - std::vector stream_read_info; - std::vector stripe_stream_read_chunks; // chunk identify the reading streams (multiple - // streams can be read once) for each stripe - std::vector> - lvl_stripe_stream_chunks; // chunk identify all processing streams for each stripe, need to be - // level-based - - int64_t rows_to_skip; - size_type rows_to_read; - std::vector selected_stripes; + + // Identify what data to read from source. + std::vector data_read_info; + + // For each stripe, we perform a number of read for its streams. + // Those reads are identified by a chunk of consecutive read info, stored in data_read_info. + std::vector stripe_data_read_chunks; + + + // Store info for each ORC stream at each nested level. + std::vector> lvl_stream_info; + + +// At each nested level, the streams for each stripe are stored consecutively in lvl_stream_info. +// This is used to identify the range of streams for each stripe from that vector. + std::vector> lvl_stripe_stream_chunks; + + +// TODO + std::vector>> null_count_prefix_sums; + + // For data processing, decompression, and decoding. + // Each 'chunk' of data here corresponds to an orc column, in a stripe, at a nested level. + std::vector> lvl_data_chunks; + + bool global_preprocessed{false}; }; @@ -173,15 +218,16 @@ struct chunk_read_data { { } - std::size_t output_size_limit; // Maximum size (in bytes) of an output chunk, or 0 for no limit - std::size_t read_size_limit; // Maximum size (in bytes) of an output chunk, or 0 for no limit + std::size_t output_size_limit; // maximum size (in bytes) of an output chunk, or 0 for no limit + std::size_t read_size_limit; // approximate maximum size (in bytes) used for store + // intermediate data, or 0 for no limit - // Chunks of stripes that can be load such that total of their data size is within a limit. + // Chunks of stripes that can be load into memory such that their data size is within a size limit. std::vector load_stripe_chunks; std::size_t curr_load_stripe_chunk{0}; bool more_stripe_to_load() const { return curr_load_stripe_chunk < load_stripe_chunks.size(); } - // Chunks of stripes such that total of their decompression size is within a limit. + // Chunks of stripes such that their decompression size is within a size limit. 
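+  // Unlike `load_stripe_chunks` above, these chunks are formed only after the loaded stripes
+  // have been probed for decompression, so the size estimate reflects the decompressed
+  // footprint rather than the size on disk.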
std::vector decode_stripe_chunks; std::size_t curr_decode_stripe_chunk{0}; bool more_stripe_to_decode() const diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 3333af1ac97..a64c22fa332 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -140,6 +140,7 @@ std::size_t gather_stream_info_and_update_chunks( return dst_offset; } +// TODO: update /** * @brief Decompresses the stripe data, at stream granularity. * @@ -155,8 +156,7 @@ std::size_t gather_stream_info_and_update_chunks( * @return Device buffer to decompressed page data */ rmm::device_buffer decompress_stripe_data( - std::unordered_map const& - compinfo_map, + stream_id_map const& compinfo_map, OrcDecompressor const& decompressor, host_span stripe_data, host_span stream_info, From 48c96a28bed15fc1d0e1336580800069037551ef Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 2 Feb 2024 13:49:28 +0700 Subject: [PATCH 063/321] Fix bugs --- cpp/src/io/orc/reader_impl_chunking.cu | 22 +++++++++++----------- cpp/src/io/orc/reader_impl_chunking.hpp | 20 +++++++------------- cpp/src/io/orc/reader_impl_preprocess.cu | 22 +++++++++++----------- 3 files changed, 29 insertions(+), 35 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 2463196dc31..350be1ac712 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -95,10 +95,10 @@ std::size_t gather_stream_info(std::size_t stripe_index, stream_info.emplace_back(stripeinfo->offset + src_offset, dst_offset, stream.length, - stripe_index, + stream_id_info{stripe_index, level, column_id, - stream.kind); + stream.kind}); dst_offset += stream.length; } src_offset += stream.length; @@ -542,17 +542,17 @@ void reader::impl::subpass_preprocess() for (auto stream_idx = stream_begin; stream_idx < stream_end; ++stream_idx) { auto const& info = stream_info[stream_idx]; compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, + static_cast(stripe_data[info.id.stripe_idx].data()) + info.dst_pos, info.length)); stream_compinfo_map[stream_id_info{ - info.stripe_idx, info.level, info.orc_col_idx, info.kind}] = + info.id.stripe_idx, info.id.level, info.id.orc_cold_idx, info.id.kind}] = &compinfo[compinfo.size() - 1]; #ifdef PRINT_DEBUG printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n", - (int)info.stripe_idx, - (int)info.level, - (int)info.orc_col_idx, - (int)info.kind, + (int)info.id.stripe_idx, + (int)info.id.level, + (int)info.id.orc_cold_idx, + (int)info.id.kind, info.dst_pos, info.length); fflush(stdout); @@ -574,11 +574,11 @@ void reader::impl::subpass_preprocess() compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, stream_compinfo->num_uncompressed_blocks, stream_compinfo->max_uncompressed_size}; - stripe_decomp_sizes[stream_id.stripe_idx - stripe_chunk.start_idx].size_bytes += + stripe_decomp_sizes[stream_id.id.stripe_idx - stripe_chunk.start_idx].size_bytes += stream_compinfo->max_uncompressed_size; #ifdef PRINT_DEBUG printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n", - (int)stream_id.stripe_idx, + (int)stream_id.id.stripe_idx, (int)stream_id.level, (int)stream_id.orc_col_idx, (int)stream_id.kind, @@ -599,7 +599,7 @@ void reader::impl::subpass_preprocess() // Set decompression size equal to the input size. 
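      // (When the stripes are not compressed there is nothing to decompress, so a stripe's
      //  memory footprint is simply the sum of its stream lengths.)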
for (auto stream_idx = stream_begin; stream_idx < stream_end; ++stream_idx) { auto const& info = stream_info[stream_idx]; - stripe_decomp_sizes[info.stripe_idx - stripe_chunk.start_idx].size_bytes += info.length; + stripe_decomp_sizes[info.id.stripe_idx - stripe_chunk.start_idx].size_bytes += info.length; } } diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 1e5d0a3d988..d3d6c04d1fb 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -34,19 +34,19 @@ namespace cudf::io::orc::detail { * @brief Struct that store identification of an ORC streams */ struct stream_id_info { - uint32_t stripe_idx; // TODO: check if this is correct stripe processing index, not stripe index in source + uint32_t stripe_idx; // global stripe id throughout the data source // TODO: change type below std::size_t level; // level of the nested column uint32_t orc_col_idx; // orc column id StreamKind kind; // stream kind struct hash { - std::size_t operator()(stream_id_info const& index) const + std::size_t operator()(stream_id_info const& id) const { auto const hasher = std::hash{}; - return hasher(index.stripe_idx) ^ hasher(index.level) ^ - hasher(static_cast(index.orc_col_idx)) ^ - hasher(static_cast(index.kind)); + return hasher(id.stripe_idx) ^ hasher(id.level) ^ + hasher(static_cast(id.orc_col_idx)) ^ + hasher(static_cast(id.kind)); } }; struct equal_to { @@ -73,17 +73,11 @@ struct orc_stream_info { explicit orc_stream_info(uint64_t offset_, std::size_t dst_pos_, uint32_t length_, - uint32_t stripe_idx_, - std::size_t level_, - uint32_t orc_col_idx_, - StreamKind kind_) + stream_id_info const& id_) : offset(offset_), dst_pos(dst_pos_), length(length_), - stripe_idx(stripe_idx_), - level(level_), - orc_col_idx(orc_col_idx_), - kind(kind_) + id(id_) { #ifdef PRINT_DEBUG printf(" construct stripe id [%d, %d, %d, %d]\n", diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index a64c22fa332..ac6f4d4336b 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -178,27 +178,27 @@ rmm::device_buffer decompress_stripe_data( for (auto const& info : stream_info) { #ifdef PRINT_DEBUG printf("collec stream again [%d, %d, %d, %d]: dst = %lu, length = %lu\n", - (int)info.stripe_idx, - (int)info.level, - (int)info.orc_col_idx, - (int)info.kind, + (int)info.id.stripe_idx, + (int)info.id.level, + (int)info.id.orc_cold_idx, + (int)info.id.kind, info.dst_pos, info.length); fflush(stdout); #endif compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, + static_cast(stripe_data[info.id.stripe_idx].data()) + info.dst_pos, info.length)); // printf("line %d\n", __LINE__); // fflush(stdout); auto const& cached_comp_info = - compinfo_map.at(stream_id_info{info.stripe_idx, info.level, info.orc_col_idx, info.kind}); + compinfo_map.at(stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_cold_idx, info.id.kind}); // printf("line %d\n", __LINE__); // fflush(stdout); // auto const& cached_comp_info = - // compinfo_map[stream_id_info{info.stripe_idx, info.level, info.orc_col_idx, info.kind}]; + // compinfo_map[stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_cold_idx, info.id.kind}]; auto& stream_comp_info = compinfo[compinfo.size() - 1]; stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; stream_comp_info.num_uncompressed_blocks = 
cached_comp_info.num_uncompressed_blocks; @@ -228,10 +228,10 @@ rmm::device_buffer decompress_stripe_data( auto const& info = stream_info[i]; printf("compute info [%d, %d, %d, %d]: %lu | %lu | %lu\n", - (int)info.stripe_idx, - (int)info.level, - (int)info.orc_col_idx, - (int)info.kind, + (int)info.id.stripe_idx, + (int)info.id.level, + (int)info.id.orc_cold_idx, + (int)info.id.kind, (size_t)compinfo[i].num_compressed_blocks, (size_t)compinfo[i].num_uncompressed_blocks, compinfo[i].max_uncompressed_size); From b64c7f2d5c4cd1ac4808d59df50ad485afaa21d4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 2 Feb 2024 14:18:40 +0700 Subject: [PATCH 064/321] Rename variables --- cpp/src/io/orc/reader_impl.cu | 20 ++++++++++++++++++++ cpp/src/io/orc/reader_impl.hpp | 21 +++++++++++++++++++++ cpp/src/io/orc/reader_impl_chunking.cu | 16 ++++++++-------- cpp/src/io/orc/reader_impl_chunking.hpp | 6 +++--- 4 files changed, 52 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 4ee25fdab70..8e9e4365a1e 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -37,6 +37,26 @@ reader::impl::impl(std::vector>&& sources, { } +reader::impl::impl(std::size_t output_size_limit, + std::size_t data_read_limit, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _stream(stream), + _mr(mr), + _timestamp_type{options.get_timestamp_type()}, + _use_index{options.is_enabled_use_index()}, + _use_np_dtypes{options.is_enabled_use_np_dtypes()}, + _decimal128_columns{options.get_decimal128_columns()}, + _col_meta{std::make_unique()}, + _sources(std::move(sources)), + _metadata{_sources, stream}, + _selected_columns{_metadata.select_columns(options.get_columns())}, + _chunk_read_data{output_size_limit, data_read_limit} +{ +} + table_with_metadata reader::impl::read(uint64_t skip_rows, std::optional const& num_rows_opt, std::vector> const& stripes) diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index d7653e3e180..5324f8dbc8a 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -53,6 +53,25 @@ class reader::impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + /** + * @brief Constructor from a dataset source with reader options. + * + * @param output_size_limit Limit on total number of bytes to be returned per read, + * or `0` if there is no limit + * @param data_read_limit Limit on memory usage for the purposes of decompression and processing + * of input, or `0` if there is no limit + * @param sources Dataset sources + * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ + explicit impl(std::size_t output_size_limit, + std::size_t data_read_limit, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** * @brief Read an entire set or a subset of data and returns a set of columns * @@ -65,6 +84,8 @@ class reader::impl { std::optional const& num_rows_opt, std::vector> const& stripes); + + private: /** * @brief Perform all the necessary data preprocessing before creating an output table. 
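The overload added above is what lets the ORC reader be constructed with the two chunking
limits. As a rough sketch of the call pattern this series appears to be building toward --
note that the `chunked_orc_reader` wrapper and its `has_next()`/`read_chunk()` interface are
assumptions modeled on cuDF's chunked Parquet reader, not API introduced by this patch:

#include <cudf/io/orc.hpp>
#include <cudf/table/table.hpp>

#include <memory>
#include <string>
#include <vector>

// Hypothetical driver loop; the reader type and its members are assumed, not
// defined by this series yet.
std::vector<std::unique_ptr<cudf::table>> read_orc_in_chunks(std::string const& path)
{
  auto const options =
    cudf::io::orc_reader_options::builder(cudf::io::source_info{path}).build();

  // First limit: cap (in bytes) on each returned table chunk.
  // Second limit: cap (in bytes) on intermediate load/decompress memory.
  cudf::io::chunked_orc_reader reader(
    512UL * 1024 * 1024, 2UL * 1024 * 1024 * 1024, options);

  std::vector<std::unique_ptr<cudf::table>> result;
  while (reader.has_next()) {
    result.emplace_back(reader.read_chunk().tbl);
  }
  return result;
}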
diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 350be1ac712..e0f2db311d6 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -383,7 +383,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, } // Load all chunks if there is no read limit. - if (_chunk_read_data.read_size_limit == 0) { + if (_chunk_read_data.data_read_limit == 0) { _chunk_read_data.load_stripe_chunks = {chunk{0, static_cast(num_stripes)}}; // TODO: DEBUG only // return; @@ -404,11 +404,11 @@ void reader::impl::global_preprocess(uint64_t skip_rows, } // DEBUG only - _chunk_read_data.read_size_limit = + _chunk_read_data.data_read_limit = total_stripe_sizes[total_stripe_sizes.size() - 1].size_bytes / 3; _chunk_read_data.load_stripe_chunks = - find_splits(total_stripe_sizes, num_stripes, _chunk_read_data.read_size_limit); + find_splits(total_stripe_sizes, num_stripes, _chunk_read_data.data_read_limit); #ifndef PRINT_DEBUG auto& splits = _chunk_read_data.load_stripe_chunks; @@ -427,7 +427,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // 3. sum(sizes of stripes in a chunk) < size_limit if chunk has more than 1 stripe // 4. sum(number of stripes in all chunks) == total_num_stripes. // TODO: enable only in debug. -// verify_splits(splits, total_stripe_sizes, num_stripes, _chunk_read_data.read_size_limit); +// verify_splits(splits, total_stripe_sizes, num_stripes, _chunk_read_data.data_read_limit); #endif } @@ -608,7 +608,7 @@ void reader::impl::subpass_preprocess() } // end loop level // Decode all chunks if there is no read limit. - if (_chunk_read_data.read_size_limit == 0) { + if (_chunk_read_data.data_read_limit == 0) { _chunk_read_data.decode_stripe_chunks = {stripe_chunk}; // TODO: DEBUG only // return; @@ -625,11 +625,11 @@ void reader::impl::subpass_preprocess() stripe_decomp_sizes.device_to_host_sync(_stream); // DEBUG only - // _chunk_read_data.read_size_limit = + // _chunk_read_data.data_read_limit = // stripe_decompression_sizes[stripe_decompression_sizes.size() - 1].size_bytes / 3; _chunk_read_data.decode_stripe_chunks = - find_splits(stripe_decomp_sizes, stripe_chunk.count, _chunk_read_data.read_size_limit); + find_splits(stripe_decomp_sizes, stripe_chunk.count, _chunk_read_data.data_read_limit); for (auto& chunk : _chunk_read_data.decode_stripe_chunks) { chunk.start_idx += stripe_chunk.start_idx; } @@ -656,7 +656,7 @@ void reader::impl::subpass_preprocess() // 4. sum(number of stripes in all chunks) == total_num_stripes. // TODO: enable only in debug. // verify_splits(splits, stripe_decompression_sizes, stripe_chunk.count, -// _file_itm_data.read_size_limit); +// _file_itm_data.data_read_limit); #endif // lvl_stripe_data.clear(); diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index d3d6c04d1fb..2d07cae3214 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -207,13 +207,13 @@ struct file_intermediate_data { * @brief Struct to store all data necessary for chunked reading. 
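 *
 * The reader works against two budgets: `data_read_limit` bounds how much stripe data is
 * loaded and decompressed at one time, while `output_size_limit` bounds the size of each
 * table returned to the caller.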
*/ struct chunk_read_data { - explicit chunk_read_data(std::size_t output_size_limit_ = 0, std::size_t read_size_limit_ = 0) - : output_size_limit{output_size_limit_}, read_size_limit(read_size_limit_) + explicit chunk_read_data(std::size_t output_size_limit_ = 0, std::size_t data_read_limit_ = 0) + : output_size_limit{output_size_limit_}, data_read_limit(data_read_limit_) { } std::size_t output_size_limit; // maximum size (in bytes) of an output chunk, or 0 for no limit - std::size_t read_size_limit; // approximate maximum size (in bytes) used for store + std::size_t data_read_limit; // approximate maximum size (in bytes) used for store // intermediate data, or 0 for no limit // Chunks of stripes that can be load into memory such that their data size is within a size limit. From 6ede82e53864913fa4a6f9626d79a7e01e682a88 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 2 Feb 2024 21:47:00 +0700 Subject: [PATCH 065/321] Add more docs --- cpp/src/io/orc/reader_impl.cu | 1 + cpp/src/io/orc/reader_impl.hpp | 51 ++++++++++++++++++++---- cpp/src/io/orc/reader_impl_chunking.cu | 3 +- cpp/src/io/orc/reader_impl_preprocess.cu | 2 +- 4 files changed, 48 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 8e9e4365a1e..71996f026f9 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -135,6 +135,7 @@ table_with_metadata reader::impl::read_chunk_internal() return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)};
 }

+// TODO: move code here
 void reader::impl::decompress_and_decode() {}

 table_with_metadata reader::impl::make_output_chunk() { return table_with_metadata{}; }

diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index 5324f8dbc8a..0b7c00d621d 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -89,6 +89,9 @@ class reader::impl {
  private:
  /**
   * @brief Perform all the necessary data preprocessing before creating an output table.
+   *
+   * This is the proxy to call all other data preprocessing functions, which are prerequisites
+   * for generating an output table.
   *
   * @param skip_rows Number of rows to skip from the start
   * @param num_rows_opt Optional number of rows to read, or `std::nullopt` to read all rows
@@ -98,17 +101,55 @@ class reader::impl {
                     std::optional<size_type> const& num_rows_opt,
                     std::vector<std::vector<size_type>> const& stripes);

-  // Do once for the entire file.
+  /**
+   * @brief Perform a global preprocessing step that executes exactly once for the entire duration
+   * of the reader.
+   *
+   * In this step, the metadata of all stripes in the data source is parsed, and information about
+   * data streams for all selected columns in all stripes is generated. If the reader has a data
+   * read limit, the data sizes of all stripes are used to determine the chunks of consecutive
+   * stripes for reading each time using the `read_data()` step. This is to ensure that loading
+   * these stripes will not exceed a fixed portion of the data read limit.
+   */
  void global_preprocess(uint64_t skip_rows,
                         std::optional<size_type> const& num_rows_opt,
                         std::vector<std::vector<size_type>> const& stripes);

-  void pass_preprocess();
+  /**
+   * @brief Read stripes from the input source and store the data in the internal buffers.
+   *
+   * If there is a data read limit, only a chunk of stripes is read at a time such that
+   * their total data size does not exceed a fixed portion of the limit. Then, the data is
+   * probed to determine the uncompressed sizes for these loaded stripes, which are in turn
+   * used to determine a subset of stripes to decompress and decode in the next step
+   * `decompress_and_decode()`.
+   * This is to ensure that loading data together with decompression and decoding will not exceed
+   * the data read limit.
+   */
+  void read_data();

+  /**
+   * TODO: merge with read data.
+   */
  void subpass_preprocess();

+  /**
+   * @brief Decompress and decode the data in the internal buffers, and store the result into
+   * an internal table.
+   *
+   * If there is a data read limit, only a chunk of stripes is decompressed and decoded at a time.
+   * Then, the result is stored in an internal table, and sizes of its rows are computed
+   * to determine slices of rows to return as the output table in the final step
+   * `make_output_chunk()`.
+   */
  void decompress_and_decode();

+  /**
+   * @brief Create the output table from the internal buffers and return it along with metadata.
+   *
+   * This function is called internally and expects all preprocessing steps have already been done.
+   *
+   * @return The output table along with columns' metadata
+   */
  table_with_metadata make_output_chunk();

  /**
@@ -119,11 +160,7 @@ class reader::impl {
  table_metadata make_output_metadata();

  /**
-   * @brief Read a chunk of data from the input source and return an output table with metadata.
-   *
-   * This function is called internally and expects all preprocessing steps have already been done.
- * - * @return The output table along with columns' metadata + * TODO: move code to make_output_chunk */ table_with_metadata read_chunk_internal(); diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index e0f2db311d6..ccb6d3dc4e8 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -432,7 +432,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, } // Load each chunk from `load_stripe_chunks`. -void reader::impl::pass_preprocess() +void reader::impl::read_data() { if (_file_itm_data.has_no_data()) { return; } @@ -494,6 +494,7 @@ void reader::impl::pass_preprocess() } } +// TODO: merge with read_data() void reader::impl::subpass_preprocess() { if (_file_itm_data.has_no_data()) { return; } diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index ac6f4d4336b..ed6219d584c 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -761,7 +761,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, // TODO: fix this, should be called once while (_chunk_read_data.more_stripe_to_load()) { - pass_preprocess(); + read_data(); } // Fix this, subpass should be call once From ceaf1ff57a4dc5295931b9036b6dbe3210df6822 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 2 Feb 2024 22:06:33 +0700 Subject: [PATCH 066/321] Rename variable --- cpp/src/io/orc/reader_impl.cu | 6 +++--- cpp/src/io/orc/reader_impl.hpp | 2 +- cpp/src/io/parquet/reader_impl.cpp | 16 ++++++++-------- cpp/src/io/parquet/reader_impl.hpp | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 71996f026f9..48d110c61bd 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -67,7 +67,7 @@ table_with_metadata reader::impl::read(uint64_t skip_rows, table_metadata reader::impl::make_output_metadata() { - if (_output_metadata) { return table_metadata{*_output_metadata}; } + if (_out_metadata) { return table_metadata{*_out_metadata}; } // Copy user data to the output metadata. table_metadata out_metadata; @@ -88,8 +88,8 @@ table_metadata reader::impl::make_output_metadata() out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), out_metadata.per_file_user_data[0].end()}; - // Save the output table metadata into `_output_metadata` for reuse next time. - _output_metadata = std::make_unique(out_metadata); + // Save the output table metadata into `_out_metadata` for reuse next time. 
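+  // (The table metadata is identical for every output chunk, so it is built once here and
+  //  only copied on subsequent calls.)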
+ _out_metadata = std::make_unique(out_metadata); return out_metadata; } diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 0b7c00d621d..d131d907fa1 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -180,7 +180,7 @@ class reader::impl { column_hierarchy const _selected_columns; // Construct from `_metadata` thus declare after it file_intermediate_data _file_itm_data; chunk_read_data _chunk_read_data; - std::unique_ptr _output_metadata; + std::unique_ptr _out_metadata; std::vector> _out_buffers; }; diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 24d46d91dbb..bba7ad6b337 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -407,8 +407,8 @@ void reader::impl::populate_metadata(table_metadata& out_metadata) table_with_metadata reader::impl::read_chunk_internal( bool uses_custom_row_bounds, std::optional> filter) { - // If `_output_metadata` has been constructed, just copy it over. - auto out_metadata = _output_metadata ? table_metadata{*_output_metadata} : table_metadata{}; + // If `_out_metadata` has been constructed, just copy it over. + auto out_metadata = _out_metadata ? table_metadata{*_out_metadata} : table_metadata{}; out_metadata.schema_info.resize(_output_buffers.size()); // output cudf columns as determined by the top level schema @@ -439,8 +439,8 @@ table_with_metadata reader::impl::read_chunk_internal( metadata = std::make_optional(); metadata->set_convert_binary_to_strings(false); } - // Only construct `out_metadata` if `_output_metadata` has not been cached. - if (!_output_metadata) { + // Only construct `out_metadata` if `_out_metadata` has not been cached. + if (!_out_metadata) { column_name_info& col_name = out_metadata.schema_info[i]; out_columns.emplace_back(make_column(_output_buffers[i], &col_name, metadata, _stream)); } else { @@ -459,7 +459,7 @@ table_with_metadata reader::impl::finalize_output( { // Create empty columns as needed (this can happen if we've ended up with no actual data to read) for (size_t i = out_columns.size(); i < _output_buffers.size(); ++i) { - if (!_output_metadata) { + if (!_out_metadata) { column_name_info& col_name = out_metadata.schema_info[i]; out_columns.emplace_back(io::detail::empty_like(_output_buffers[i], &col_name, _stream, _mr)); } else { @@ -467,10 +467,10 @@ table_with_metadata reader::impl::finalize_output( } } - if (!_output_metadata) { + if (!_out_metadata) { populate_metadata(out_metadata); - // Finally, save the output table metadata into `_output_metadata` for reuse next time. - _output_metadata = std::make_unique(out_metadata); + // Finally, save the output table metadata into `_out_metadata` for reuse next time. 
+ _out_metadata = std::make_unique(out_metadata); } // advance output chunk/subpass/pass info diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 67c56c9c2d7..09a1069e6c7 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -362,7 +362,7 @@ class reader::impl { std::vector _output_column_schemas; // _output_buffers associated metadata - std::unique_ptr _output_metadata; + std::unique_ptr _out_metadata; bool _strings_to_categorical = false; std::optional> _reader_column_schema; From a3052ddcec286fd14ff28bf661772e63b55ee673 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 3 Feb 2024 08:02:09 +0700 Subject: [PATCH 067/321] Add comments and unify functions --- cpp/src/io/orc/reader_impl.cu | 12 +++++------- cpp/src/io/orc/reader_impl.hpp | 4 ---- cpp/src/io/orc/reader_impl_chunking.cu | 20 ++------------------ cpp/src/io/orc/reader_impl_chunking.hpp | 2 +- cpp/src/io/orc/reader_impl_preprocess.cu | 5 +++++ 5 files changed, 13 insertions(+), 30 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 48d110c61bd..8a3f2e4fd4a 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -62,7 +62,7 @@ table_with_metadata reader::impl::read(uint64_t skip_rows, std::vector> const& stripes) { prepare_data(skip_rows, num_rows_opt, stripes); - return read_chunk_internal(); + return make_output_chunk(); } table_metadata reader::impl::make_output_metadata() @@ -94,7 +94,10 @@ table_metadata reader::impl::make_output_metadata() return out_metadata; } -table_with_metadata reader::impl::read_chunk_internal() +// TODO: move code here +void reader::impl::decompress_and_decode() {} + +table_with_metadata reader::impl::make_output_chunk() { // There is no columns in the table. if (_selected_columns.num_levels() == 0) { return {std::make_unique
(), table_metadata{}}; }

@@ -135,11 +138,6 @@ table_with_metadata reader::impl::read_chunk_internal()
   return {std::make_unique<table>
(std::move(out_columns)), std::move(out_metadata)}; } -// TODO: move code here -void reader::impl::decompress_and_decode() {} - -table_with_metadata reader::impl::make_output_chunk() { return table_with_metadata{}; } - // Forward to implementation reader::reader(std::vector>&& sources, orc_reader_options const& options, diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index d131d907fa1..4b6ab4494c6 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -128,10 +128,6 @@ class reader::impl { */ void read_data(); - /** - * TODO: merge with read data. - */ - void subpass_preprocess(); /** * @brief Decompress and decode the data in the internal buffers, and store the result into diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index ccb6d3dc4e8..408a01f6d41 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -492,26 +492,10 @@ void reader::impl::read_data() for (auto& task : read_tasks) { CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); } -} - -// TODO: merge with read_data() -void reader::impl::subpass_preprocess() -{ - if (_file_itm_data.has_no_data()) { return; } - - // auto const rows_to_read = _file_itm_data.rows_to_read; - auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_chunks; + stream_id_map stream_compinfo_map; - // TODO: This is subpass - // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. - std::unordered_map - stream_compinfo_map; - - // TODO: fix this, loop only current chunk - auto const stripe_chunk = - _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk++]; cudf::detail::hostdevice_vector stripe_decomp_sizes(stripe_chunk.count, _stream); std::fill(stripe_decomp_sizes.begin(), stripe_decomp_sizes.end(), cumulative_size{1, 0}); @@ -590,7 +574,7 @@ void reader::impl::subpass_preprocess() #endif } - // Must clear so we will not overwrite the old compression info stream_id. + // Must clear map since the next level will have similar keys. stream_compinfo_map.clear(); } else { diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 2d07cae3214..85dea4194d4 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -191,7 +191,7 @@ struct file_intermediate_data { std::vector> lvl_stripe_stream_chunks; -// TODO +// TODO rename std::vector>> null_count_prefix_sums; // For data processing, decompression, and decoding. 
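Both global_preprocess() and the merged read_data() above reduce to the same primitive: cut a
prefix-summed array of `cumulative_size` into `chunk`s whose total stays within a limit. A
simplified, host-only sketch of that splitting logic is below; it assumes the sizes have
already been inclusively prefix-summed (the real find_splits() works on device data), and it
preserves the invariants noted earlier: every chunk holds at least one stripe, and a chunk
only exceeds the limit when a single stripe does.

#include <cstddef>
#include <cstdint>
#include <vector>

struct cumulative_size {
  int64_t count{0};
  std::size_t size_bytes{0};
};

struct chunk {
  int64_t start_idx;
  int64_t count;
};

// Host-only sketch of find_splits(): `sizes` holds inclusive prefix sums of
// per-stripe sizes; cut [0, total) into chunks that stay within `limit`.
std::vector<chunk> find_splits_sketch(std::vector<cumulative_size> const& sizes,
                                      int64_t total,
                                      std::size_t limit)
{
  std::vector<chunk> splits;
  int64_t begin        = 0;
  std::size_t consumed = 0;  // prefix sum already assigned to earlier chunks
  while (begin < total) {
    int64_t end = begin;
    // Grow the chunk while the accumulated size fits; always take at least
    // one stripe so that progress is guaranteed even for oversized stripes.
    while (end < total && (end == begin || sizes[end].size_bytes - consumed <= limit)) {
      ++end;
    }
    splits.push_back(chunk{begin, end - begin});
    consumed = sizes[end - 1].size_bytes;
    begin    = end;
  }
  return splits;
}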
diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index ed6219d584c..eae77e4d71f 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -796,6 +796,11 @@ void reader::impl::prepare_data(uint64_t skip_rows, lvl_chunks.resize(_selected_columns.num_levels()); _out_buffers.resize(_selected_columns.num_levels()); + +// +// +// +// TODO: move this to reader_impl.cu, decomp and decode step std::size_t num_stripes = selected_stripes.size(); // Iterates through levels of nested columns, child column will be one level down From 2f62b8c55ddb5d09b6ffa527fd5bf50f793b9c91 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 3 Feb 2024 09:16:04 +0700 Subject: [PATCH 068/321] Reorganize variables --- cpp/src/io/orc/reader_impl.cu | 22 +++++++++++----------- cpp/src/io/orc/reader_impl.hpp | 17 +++++++---------- cpp/src/io/orc/reader_impl_preprocess.cu | 10 +++++----- 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 8a3f2e4fd4a..c55fe69d463 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -26,10 +26,10 @@ reader::impl::impl(std::vector>&& sources, rmm::mr::device_memory_resource* mr) : _stream(stream), _mr(mr), - _timestamp_type{options.get_timestamp_type()}, - _use_index{options.is_enabled_use_index()}, - _use_np_dtypes{options.is_enabled_use_np_dtypes()}, - _decimal128_columns{options.get_decimal128_columns()}, + _config{options.get_timestamp_type(), + options.is_enabled_use_index(), + options.is_enabled_use_np_dtypes(), + options.get_decimal128_columns()}, _col_meta{std::make_unique()}, _sources(std::move(sources)), _metadata{_sources, stream}, @@ -45,10 +45,10 @@ reader::impl::impl(std::size_t output_size_limit, rmm::mr::device_memory_resource* mr) : _stream(stream), _mr(mr), - _timestamp_type{options.get_timestamp_type()}, - _use_index{options.is_enabled_use_index()}, - _use_np_dtypes{options.is_enabled_use_np_dtypes()}, - _decimal128_columns{options.get_decimal128_columns()}, + _config.timestamp_type{options.get_timestamp_type()}, + _config.use_index{options.is_enabled_use_index()}, + _config.use_np_dtypes{options.is_enabled_use_np_dtypes()}, + _config.decimal128_columns{options.get_decimal128_columns()}, _col_meta{std::make_unique()}, _sources(std::move(sources)), _metadata{_sources, stream}, @@ -114,9 +114,9 @@ table_with_metadata reader::impl::make_output_chunk() out_metadata.schema_info.emplace_back(""); return create_empty_column(col_meta.id, _metadata, - _decimal128_columns, - _use_np_dtypes, - _timestamp_type, + _config.decimal128_columns, + _config.use_np_dtypes, + _config.timestamp_type, out_metadata.schema_info.back(), _stream); }); diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 4b6ab4494c6..083701e62ff 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -155,22 +155,19 @@ class reader::impl { */ table_metadata make_output_metadata(); - /** - * TODO: move code to make_output_chunk - */ - table_with_metadata read_chunk_internal(); - rmm::cuda_stream_view const _stream; rmm::mr::device_memory_resource* const _mr; // Reader configs - data_type const _timestamp_type; // Override output timestamp resolution - bool const _use_index; // Enable or disable attempt to use row index for parsing - bool const _use_np_dtypes; // Enable or disable the conversion to numpy-compatible dtypes - std::vector const _decimal128_columns; 
// Control decimals conversion - std::unique_ptr const _col_meta; // Track of orc mapping and child details + struct { + data_type timestamp_type; // Override output timestamp resolution + bool use_index; // Enable or disable attempt to use row index for parsing + bool use_np_dtypes; // Enable or disable the conversion to numpy-compatible dtypes + std::vector decimal128_columns; // Control decimals conversion + } const _config; // Intermediate data for internal processing. + std::unique_ptr const _col_meta; // Track of orc mapping and child details std::vector> const _sources; // Unused but owns data for `_metadata` aggregate_orc_metadata _metadata; column_hierarchy const _selected_columns; // Construct from `_metadata` thus declare after it diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index eae77e4d71f..88a423ec506 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -815,9 +815,9 @@ void reader::impl::prepare_data(uint64_t skip_rows, std::vector column_types; for (auto& col : columns_level) { auto col_type = to_cudf_type(_metadata.get_col_type(col.id).kind, - _use_np_dtypes, - _timestamp_type.id(), - to_cudf_decimal_type(_decimal128_columns, _metadata, col.id)); + _config.use_np_dtypes, + _config.timestamp_type.id(), + to_cudf_decimal_type(_config.decimal128_columns, _metadata, col.id)); CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or col_type == type_id::DECIMAL128) { @@ -843,7 +843,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); const bool use_index = - _use_index && + _config.use_index && // Do stripes have row group index _metadata.is_row_grp_idx_present() && // Only use if we don't have much work with complete columns & stripes @@ -944,7 +944,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, ? 
sizeof(size_type) : cudf::size_of(column_types[col_idx]); chunk.num_rowgroups = stripe_num_rowgroups; - if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _timestamp_type.id(); } + if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _config.timestamp_type.id(); } if (not is_stripe_data_empty) { for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; From c99d33838e34e08a85b064239eea23ba3d43542f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 3 Feb 2024 09:19:28 +0700 Subject: [PATCH 069/321] Rewrite constructor --- cpp/src/io/orc/reader_impl.cu | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index c55fe69d463..bc0d3ea1c35 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -43,16 +43,11 @@ reader::impl::impl(std::size_t output_size_limit, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : _stream(stream), - _mr(mr), - _config.timestamp_type{options.get_timestamp_type()}, - _config.use_index{options.is_enabled_use_index()}, - _config.use_np_dtypes{options.is_enabled_use_np_dtypes()}, - _config.decimal128_columns{options.get_decimal128_columns()}, - _col_meta{std::make_unique()}, - _sources(std::move(sources)), - _metadata{_sources, stream}, - _selected_columns{_metadata.select_columns(options.get_columns())}, + : + reader::impl::impl(std::move(sources), + options, + stream, + mr ), _chunk_read_data{output_size_limit, data_read_limit} { } From 504208bed1d50610e96dc4e8cc6eead656c44841 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 3 Feb 2024 10:15:05 +0700 Subject: [PATCH 070/321] Reorganize code --- cpp/src/io/orc/reader_impl.cu | 83 +++++++++++++++--------- cpp/src/io/orc/reader_impl_chunking.hpp | 1 + cpp/src/io/orc/reader_impl_preprocess.cu | 1 + 3 files changed, 53 insertions(+), 32 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index bc0d3ea1c35..b5edaafc49b 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -60,35 +60,6 @@ table_with_metadata reader::impl::read(uint64_t skip_rows, return make_output_chunk(); } -table_metadata reader::impl::make_output_metadata() -{ - if (_out_metadata) { return table_metadata{*_out_metadata}; } - - // Copy user data to the output metadata. - table_metadata out_metadata; - out_metadata.per_file_user_data.reserve(_metadata.per_file_metadata.size()); - std::transform(_metadata.per_file_metadata.cbegin(), - _metadata.per_file_metadata.cend(), - std::back_inserter(out_metadata.per_file_user_data), - [](auto const& meta) { - std::unordered_map kv_map; - std::transform(meta.ff.metadata.cbegin(), - meta.ff.metadata.cend(), - std::inserter(kv_map, kv_map.end()), - [](auto const& kv) { - return std::pair{kv.name, kv.value}; - }); - return kv_map; - }); - out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), - out_metadata.per_file_user_data[0].end()}; - - // Save the output table metadata into `_out_metadata` for reuse next time. 
- _out_metadata = std::make_unique(out_metadata); - - return out_metadata; -} - // TODO: move code here void reader::impl::decompress_and_decode() {} @@ -101,11 +72,11 @@ table_with_metadata reader::impl::make_output_chunk() auto out_metadata = make_output_metadata(); // If no rows or stripes to read, return empty columns - if (_file_itm_data.has_no_data()) { + if (_file_itm_data.has_no_data() || !_chunk_read_data.has_next()) { std::transform(_selected_columns.levels[0].begin(), _selected_columns.levels[0].end(), std::back_inserter(out_columns), - [&](auto const col_meta) { + [&](auto const & col_meta) { out_metadata.schema_info.emplace_back(""); return create_empty_column(col_meta.id, _metadata, @@ -118,6 +89,7 @@ table_with_metadata reader::impl::make_output_chunk() return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; } +// TODO: move this into decompress_and_decode // Create columns from buffer with respective schema information. std::transform( _selected_columns.levels[0].begin(), @@ -130,9 +102,56 @@ table_with_metadata reader::impl::make_output_chunk() return make_column(col_buffer, &out_metadata.schema_info.back(), std::nullopt, _stream); }); - return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)};
+// TODO: remove this
+  // auto decoded_table = std::make_unique<table>(std::move(out_columns));
+
+  auto out_table = [&] {
+    if (_chunk_read_data.output_table_chunks.size() == 1) {
+      return std::move(_chunk_read_data.decoded_table);
+    }
+
+    auto const out_chunk =
+      _chunk_read_data.output_table_chunks[_chunk_read_data.curr_output_table_chunk++];
+    auto const out_tview =
+      cudf::slice(_chunk_read_data.decoded_table->view(),
+                  {static_cast<size_type>(out_chunk.start_idx),
+                   static_cast<size_type>(out_chunk.start_idx + out_chunk.count)},
+                  _stream)[0];
+    return std::make_unique<table>
(out_tview); + }(); + + + return {std::move(out_table), std::move(out_metadata)}; +} + + +table_metadata reader::impl::make_output_metadata() +{ + if (_out_metadata) { return table_metadata{*_out_metadata}; } + + // Copy user data to the output metadata. + table_metadata out_metadata; + out_metadata.per_file_user_data.reserve(_metadata.per_file_metadata.size()); + std::transform(_metadata.per_file_metadata.cbegin(), + _metadata.per_file_metadata.cend(), + std::back_inserter(out_metadata.per_file_user_data), + [](auto const& meta) { + std::unordered_map kv_map; + std::transform(meta.ff.metadata.cbegin(), + meta.ff.metadata.cend(), + std::inserter(kv_map, kv_map.end()), + [](auto const& kv) { + return std::pair{kv.name, kv.value}; + }); + return kv_map; + }); + out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), + out_metadata.per_file_user_data[0].end()}; + + // Save the output table metadata into `_out_metadata` for reuse next time. + _out_metadata = std::make_unique(out_metadata); + + return out_metadata; } + // Forward to implementation reader::reader(std::vector>&& sources, orc_reader_options const& options, diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 85dea4194d4..279503c175b 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -232,6 +232,7 @@ struct chunk_read_data { // Chunk of rows in the internal decoded table to output for each `read_chunk()`. std::vector output_table_chunks; std::size_t curr_output_table_chunk{0}; + std::unique_ptr decoded_table; bool more_table_chunk_to_output() const { return curr_output_table_chunk < output_table_chunks.size(); diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 88a423ec506..2ec78ac84f8 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -52,6 +52,7 @@ namespace cudf::io::orc::detail { namespace { +// TODO: merge this with gather stream info /** * @brief Function that populates column descriptors stream/chunk */ From df95f64eb6fbc77a65da7e18429d4aea8a1641ad Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 3 Feb 2024 21:33:09 +0700 Subject: [PATCH 071/321] Rewrite `gather_stream_info` --- cpp/src/io/orc/reader_impl_chunking.cu | 64 ++------------ cpp/src/io/orc/reader_impl_helpers.cpp | 104 +++++++++++++++++++++++ cpp/src/io/orc/reader_impl_helpers.hpp | 18 ++++ cpp/src/io/orc/reader_impl_preprocess.cu | 104 ++--------------------- 4 files changed, 138 insertions(+), 152 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 408a01f6d41..2cd40f5db0d 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -52,61 +52,6 @@ namespace cudf::io::orc::detail { namespace { -/** - * @brief Function that populates column descriptors stream/chunk - */ -std::size_t gather_stream_info(std::size_t stripe_index, - std::size_t level, - orc::StripeInformation const* stripeinfo, - orc::StripeFooter const* stripefooter, - host_span orc2gdf, - host_span types, - bool apply_struct_map, - std::vector& stream_info) -{ - uint64_t src_offset = 0; - uint64_t dst_offset = 0; - - for (auto const& stream : stripefooter->streams) { - if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { - dst_offset += stream.length; - continue; - } - - auto const column_id = *stream.column_id; - auto col = orc2gdf[column_id]; - - if (col == -1 and 
apply_struct_map) { - // A struct-type column has no data itself, but rather child columns - // for each of its fields. There is only a PRESENT stream, which - // needs to be included for the reader. - auto const schema_type = types[column_id]; - if (not schema_type.subtypes.empty()) { - if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { - for (auto const& idx : schema_type.subtypes) { - auto child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; - if (child_idx >= 0) { col = child_idx; } - } - } - } - } - - if (col != -1) { - stream_info.emplace_back(stripeinfo->offset + src_offset, - dst_offset, - stream.length, - stream_id_info{stripe_index, - level, - column_id, - stream.kind}); - dst_offset += stream.length; - } - src_offset += stream.length; - } - - return dst_offset; -} - struct cumulative_size { int64_t count{0}; std::size_t size_bytes{0}; @@ -341,14 +286,19 @@ void reader::impl::global_preprocess(uint64_t skip_rows, auto& stripe_sizes = lvl_stripe_sizes[level]; auto stream_count = stream_info.size(); - auto const stripe_size = gather_stream_info(stripe_idx, + auto const stripe_size = gather_stream_info_and_column_desc(stripe_idx, level, stripe_info, stripe_footer, col_meta.orc_col_map[level], _metadata.get_types(), + false, // use_index, level == 0, - stream_info); + nullptr, // num_dictionary_entries + nullptr, // stream_idx + &stream_info, + std::nullopt // chunks + ); auto const is_stripe_data_empty = stripe_size == 0; CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, diff --git a/cpp/src/io/orc/reader_impl_helpers.cpp b/cpp/src/io/orc/reader_impl_helpers.cpp index ea4e5dcfaab..812e3474bba 100644 --- a/cpp/src/io/orc/reader_impl_helpers.cpp +++ b/cpp/src/io/orc/reader_impl_helpers.cpp @@ -18,6 +18,110 @@ namespace cudf::io::orc::detail { + +std::size_t gather_stream_info_and_column_desc( + std::size_t stripe_index, + std::size_t level, + orc::StripeInformation const* stripeinfo, + orc::StripeFooter const* stripefooter, + host_span orc2gdf, + host_span types, + bool use_index, + bool apply_struct_map, + std::size_t* num_dictionary_entries, + std::size_t* stream_idx, + std::optional*> const& stream_info, + std::optional*> const& chunks) +{ + CUDF_EXPECTS(stream_info.has_value() ^ chunks.has_value(), + "Either stream_info or chunks must be provided, but not both."); + + uint64_t src_offset = 0; + uint64_t dst_offset = 0; + + auto const get_stream_index_type = [](orc::StreamKind kind) { + switch (kind) { + case orc::DATA: return gpu::CI_DATA; + case orc::LENGTH: + case orc::SECONDARY: return gpu::CI_DATA2; + case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY; + case orc::PRESENT: return gpu::CI_PRESENT; + case orc::ROW_INDEX: return gpu::CI_INDEX; + default: + // Skip this stream as it's not strictly required + return gpu::CI_NUM_STREAMS; + } + }; + + for (auto const& stream : stripefooter->streams) { + if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { + // TODO: fix dst to src + src_offset += stream.length; + continue; + } + + auto const column_id = *stream.column_id; + auto col = orc2gdf[column_id]; + + if (col == -1 and apply_struct_map) { + // A struct-type column has no data itself, but rather child columns + // for each of its fields. There is only a PRESENT stream, which + // needs to be included for the reader. + auto const schema_type = types[column_id]; + if (! 
schema_type.subtypes.empty() && schema_type.kind == orc::STRUCT && + stream.kind == orc::PRESENT) { + + for (auto const& idx : schema_type.subtypes) { + auto const child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; + if (child_idx >= 0) { + col = child_idx; + if(chunks.has_value()) { + auto& chunk = (*chunks.value())[stripe_index][col]; + chunk.strm_id[gpu::CI_PRESENT] = *stream_idx; + chunk.strm_len[gpu::CI_PRESENT] = stream.length; + } + } + } + } + } + if (col != -1) { + if (chunks.has_value()) { + if (src_offset >= stripeinfo->indexLength || use_index) { + auto const index_type = get_stream_index_type(stream.kind); + if (index_type < gpu::CI_NUM_STREAMS) { + auto& chunk = (*chunks.value())[stripe_index][col]; + chunk.strm_id[index_type] = *stream_idx; + chunk.strm_len[index_type] = stream.length; + // NOTE: skip_count field is temporarily used to track the presence of index streams + chunk.skip_count |= 1 << index_type; + + if (index_type == gpu::CI_DICTIONARY) { + chunk.dictionary_start = *num_dictionary_entries; + chunk.dict_len = stripefooter->columns[column_id].dictionarySize; + *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize; + } + } + } + (*stream_idx)++; + } else { // not chunks.has_value() + stream_info.value().emplace_back(stripeinfo->offset + src_offset, + dst_offset, + stream.length, + stream_id_info{stripe_index, + level, + column_id, + stream.kind}); + } + + + dst_offset += stream.length; + } + src_offset += stream.length; + } + + return dst_offset; +} + std::unique_ptr create_empty_column(size_type orc_col_id, aggregate_orc_metadata const& metadata, host_span decimal128_columns, diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp index f0d91c75fc3..811cd05cdce 100644 --- a/cpp/src/io/orc/reader_impl_helpers.hpp +++ b/cpp/src/io/orc/reader_impl_helpers.hpp @@ -127,6 +127,24 @@ inline std::string get_map_child_col_name(std::size_t const idx) return (idx == 0) ? "key" : "value"; } + +/** + * @brief Function that populates descriptors for either individual streams or chunks of column data, but not both. + */ +std::size_t gather_stream_info_and_column_desc( + std::size_t stripe_index, + std::size_t level, + orc::StripeInformation const* stripeinfo, + orc::StripeFooter const* stripefooter, + host_span orc2gdf, + host_span types, + bool use_index, + bool apply_struct_map, + std::size_t* num_dictionary_entries, + std::size_t* stream_idx, + std::optional*> const& stream_info, + std::optional*> const& chunks); + /** * @brief Create empty columns and respective schema information from the buffer. 
*/ diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 2ec78ac84f8..90404c7b9ca 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -52,95 +52,6 @@ namespace cudf::io::orc::detail { namespace { -// TODO: merge this with gather stream info -/** - * @brief Function that populates column descriptors stream/chunk - */ -std::size_t gather_stream_info_and_update_chunks( - std::size_t stripe_index, - std::size_t level, - orc::StripeInformation const* stripeinfo, - orc::StripeFooter const* stripefooter, - host_span orc2gdf, - host_span types, - bool use_index, - bool apply_struct_map, - std::size_t* num_dictionary_entries, - std::size_t* stream_idx, - cudf::detail::hostdevice_2dvector& chunks) -{ - uint64_t src_offset = 0; - uint64_t dst_offset = 0; - - auto const get_stream_index_type = [](orc::StreamKind kind) { - switch (kind) { - case orc::DATA: return gpu::CI_DATA; - case orc::LENGTH: - case orc::SECONDARY: return gpu::CI_DATA2; - case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY; - case orc::PRESENT: return gpu::CI_PRESENT; - case orc::ROW_INDEX: return gpu::CI_INDEX; - default: - // Skip this stream as it's not strictly required - return gpu::CI_NUM_STREAMS; - } - }; - - for (auto const& stream : stripefooter->streams) { - if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { - dst_offset += stream.length; - continue; - } - - auto const column_id = *stream.column_id; - auto col = orc2gdf[column_id]; - - if (col == -1 and apply_struct_map) { - // A struct-type column has no data itself, but rather child columns - // for each of its fields. There is only a PRESENT stream, which - // needs to be included for the reader. - auto const schema_type = types[column_id]; - if (not schema_type.subtypes.empty()) { - if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { - for (auto const& idx : schema_type.subtypes) { - auto child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; - if (child_idx >= 0) { - col = child_idx; - auto& chunk = chunks[stripe_index][col]; - chunk.strm_id[gpu::CI_PRESENT] = *stream_idx; - chunk.strm_len[gpu::CI_PRESENT] = stream.length; - } - } - } - } - } - if (col != -1) { - if (src_offset >= stripeinfo->indexLength || use_index) { - auto& chunk = chunks[stripe_index][col]; - auto const index_type = get_stream_index_type(stream.kind); - if (index_type < gpu::CI_NUM_STREAMS) { - chunk.strm_id[index_type] = *stream_idx; - chunk.strm_len[index_type] = stream.length; - // NOTE: skip_count field is temporarily used to track the presence of index streams - chunk.skip_count |= 1 << index_type; - - if (index_type == gpu::CI_DICTIONARY) { - chunk.dictionary_start = *num_dictionary_entries; - chunk.dict_len = stripefooter->columns[column_id].dictionarySize; - *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize; - } - } - } - - (*stream_idx)++; - dst_offset += stream.length; - } - src_offset += stream.length; - } - - return dst_offset; -} - // TODO: update /** * @brief Decompresses the stripe data, at stream granularity. @@ -353,6 +264,8 @@ rmm::device_buffer decompress_stripe_data( default: CUDF_FAIL("Unexpected decompression dispatch"); break; } + // TODO: proclam return type + // Check if any block has been failed to decompress. // Not using `thrust::any` or `thrust::count_if` to defer stream sync. 
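  // (A result flag written on the device inside the loop can be checked later, whereas
  //  `thrust::any` or `thrust::count_if` would synchronize immediately to return a host value.)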
thrust::for_each( @@ -627,6 +540,8 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& stream.synchronize(); } + +// TODO: this is called for each chunk of stripes. /** * @brief Aggregate child metadata from parent column chunks. */ @@ -809,6 +724,8 @@ void reader::impl::prepare_data(uint64_t skip_rows, auto& col_meta = *_col_meta; for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto& columns_level = _selected_columns.levels[level]; + + // TODO: do it in global step // Association between each ORC column and its cudf::column std::vector nested_cols; @@ -883,7 +800,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, auto const stripe_info = stripe.stripe_info; auto const stripe_footer = stripe.stripe_footer; - auto const total_data_size = gather_stream_info_and_update_chunks(stripe_idx, + auto const total_data_size = gather_stream_info_and_column_desc(stripe_idx, level, stripe_info, stripe_footer, @@ -893,7 +810,8 @@ void reader::impl::prepare_data(uint64_t skip_rows, level == 0, &num_dict_entries, &stream_idx, - chunks); + std::nullopt, // stream_info + &chunks); auto const is_stripe_data_empty = total_data_size == 0; CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, @@ -958,10 +876,6 @@ void reader::impl::prepare_data(uint64_t skip_rows, stripe_idx++; } - // for (auto& task : read_tasks) { - // CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); - // } - if (stripe_data.empty()) { continue; } // Process dataset chunk pages into output columns From e6ebcc0cba17383b46acb1c76b9307fd7210beb3 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 9 Feb 2024 14:44:32 +0700 Subject: [PATCH 072/321] Remove `_preprocess.cu` file --- cpp/CMakeLists.txt | 1 - cpp/src/io/orc/reader_impl.cu | 959 ++++++++++++++++++++++ cpp/src/io/orc/reader_impl_preprocess.cu | 980 ----------------------- 3 files changed, 959 insertions(+), 981 deletions(-) delete mode 100644 cpp/src/io/orc/reader_impl_preprocess.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 49c19596d23..7719d702eec 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -389,7 +389,6 @@ add_library( src/io/orc/reader_impl.cu src/io/orc/reader_impl_chunking.cu src/io/orc/reader_impl_helpers.cpp - src/io/orc/reader_impl_preprocess.cu src/io/orc/stats_enc.cu src/io/orc/stripe_data.cu src/io/orc/stripe_enc.cu diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index b5edaafc49b..c8f8635c9f2 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -14,12 +14,971 @@ * limitations under the License. */ + +// #define PRINT_DEBUG + #include "reader_impl.hpp" #include "reader_impl_chunking.hpp" #include "reader_impl_helpers.hpp" +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + namespace cudf::io::orc::detail { +namespace { + +// TODO: update +/** + * @brief Decompresses the stripe data, at stream granularity. 
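+ *
+ * The number of compressed/uncompressed blocks and the decompressed sizes are taken from
+ * `compinfo_map`, which was filled in when the streams were first probed, so the streams do
+ * not need to be parsed a second time here.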
+ * + * @param decompressor Block decompressor + * @param stripe_data List of source stripe column data + * @param stream_info List of stream to column mappings + * @param chunks Vector of list of column chunk descriptors + * @param row_groups Vector of list of row index descriptors + * @param num_stripes Number of stripes making up column chunks + * @param row_index_stride Distance between each row index + * @param use_base_stride Whether to use base stride obtained from meta or use the computed value + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Device buffer to decompressed page data + */ +rmm::device_buffer decompress_stripe_data( + stream_id_map const& compinfo_map, + OrcDecompressor const& decompressor, + host_span stripe_data, + host_span stream_info, + cudf::detail::hostdevice_2dvector& chunks, + cudf::detail::hostdevice_2dvector& row_groups, + std::size_t num_stripes, + std::size_t row_index_stride, + bool use_base_stride, + rmm::cuda_stream_view stream) +{ + // Count the exact number of compressed blocks + std::size_t num_compressed_blocks = 0; + std::size_t num_uncompressed_blocks = 0; + std::size_t total_decomp_size = 0; + + cudf::detail::hostdevice_vector compinfo( + 0, stream_info.size(), stream); + + for (auto const& info : stream_info) { +#ifdef PRINT_DEBUG + printf("collec stream again [%d, %d, %d, %d]: dst = %lu, length = %lu\n", + (int)info.id.stripe_idx, + (int)info.id.level, + (int)info.id.orc_cold_idx, + (int)info.id.kind, + info.dst_pos, + info.length); + fflush(stdout); +#endif + + compinfo.push_back(gpu::CompressedStreamInfo( + static_cast(stripe_data[info.id.stripe_idx].data()) + info.dst_pos, + info.length)); + + // printf("line %d\n", __LINE__); + // fflush(stdout); + auto const& cached_comp_info = + compinfo_map.at(stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_cold_idx, info.id.kind}); + // printf("line %d\n", __LINE__); + // fflush(stdout); + // auto const& cached_comp_info = + // compinfo_map[stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_cold_idx, info.id.kind}]; + auto& stream_comp_info = compinfo[compinfo.size() - 1]; + stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; + stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks; + stream_comp_info.max_uncompressed_size = cached_comp_info.total_decomp_size; + + num_compressed_blocks += cached_comp_info.num_compressed_blocks; + num_uncompressed_blocks += cached_comp_info.num_uncompressed_blocks; + total_decomp_size += cached_comp_info.total_decomp_size; + } + + CUDF_EXPECTS( + not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)), + "Inconsistent info on compression blocks"); + +#ifdef XXX + std::size_t old_num_compressed_blocks = num_compressed_blocks; + std::size_t old_num_uncompressed_blocks = num_uncompressed_blocks; + std::size_t old_total_decomp_size = total_decomp_size; + + num_compressed_blocks = 0; + num_uncompressed_blocks = 0; + total_decomp_size = 0; + for (std::size_t i = 0; i < compinfo.size(); ++i) { + num_compressed_blocks += compinfo[i].num_compressed_blocks; + num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks; + total_decomp_size += compinfo[i].max_uncompressed_size; + + auto const& info = stream_info[i]; + printf("compute info [%d, %d, %d, %d]: %lu | %lu | %lu\n", + (int)info.id.stripe_idx, + (int)info.id.level, + (int)info.id.orc_cold_idx, + (int)info.id.kind, + (size_t)compinfo[i].num_compressed_blocks, + 
(size_t)compinfo[i].num_uncompressed_blocks, + compinfo[i].max_uncompressed_size); + fflush(stdout); + } + + if (old_num_compressed_blocks != num_compressed_blocks || + old_num_uncompressed_blocks != num_uncompressed_blocks || + old_total_decomp_size != total_decomp_size) { + printf("invalid: %d - %d, %d - %d, %d - %d\n", + (int)old_num_compressed_blocks, + (int)num_compressed_blocks, + (int)old_num_uncompressed_blocks, + (int)num_uncompressed_blocks, + (int)old_total_decomp_size, + (int)total_decomp_size + + ); + } +#endif + + // Buffer needs to be padded. + // Required by `gpuDecodeOrcColumnData`. + rmm::device_buffer decomp_data( + cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream); + if (decomp_data.is_empty()) { return decomp_data; } + + rmm::device_uvector> inflate_in( + num_compressed_blocks + num_uncompressed_blocks, stream); + rmm::device_uvector> inflate_out( + num_compressed_blocks + num_uncompressed_blocks, stream); + rmm::device_uvector inflate_res(num_compressed_blocks, stream); + thrust::fill(rmm::exec_policy(stream), + inflate_res.begin(), + inflate_res.end(), + compression_result{0, compression_status::FAILURE}); + + // Parse again to populate the decompression input/output buffers + std::size_t decomp_offset = 0; + uint32_t max_uncomp_block_size = 0; + uint32_t start_pos = 0; + auto start_pos_uncomp = (uint32_t)num_compressed_blocks; + for (std::size_t i = 0; i < compinfo.size(); ++i) { + auto dst_base = static_cast(decomp_data.data()); + compinfo[i].uncompressed_data = dst_base + decomp_offset; + compinfo[i].dec_in_ctl = inflate_in.data() + start_pos; + compinfo[i].dec_out_ctl = inflate_out.data() + start_pos; + compinfo[i].dec_res = {inflate_res.data() + start_pos, compinfo[i].num_compressed_blocks}; + compinfo[i].copy_in_ctl = inflate_in.data() + start_pos_uncomp; + compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp; + + // stream_info[i].dst_pos = decomp_offset; + decomp_offset += compinfo[i].max_uncompressed_size; + start_pos += compinfo[i].num_compressed_blocks; + start_pos_uncomp += compinfo[i].num_uncompressed_blocks; + max_uncomp_block_size = + std::max(max_uncomp_block_size, compinfo[i].max_uncompressed_block_size); + } + compinfo.host_to_device_async(stream); + gpu::ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + stream); + + // Value for checking whether we decompress successfully. + // It doesn't need to be atomic as there is no race condition: we only write `true` if needed. 
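+  // A minimal sketch of the benign-race pattern used here (an illustrative
+  // kernel, not the one launched below): every thread that observes a failure
+  // stores the same value `true`, so the result is well defined without atomics:
+  //
+  //   __global__ void flag_any_failure(compression_result const* results,
+  //                                    std::size_t n, bool* flag)
+  //   {
+  //     auto const i = blockIdx.x * blockDim.x + threadIdx.x;
+  //     if (i < n && results[i].status != compression_status::SUCCESS) { *flag = true; }
+  //   }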
+  cudf::detail::hostdevice_vector<bool> any_block_failure(1, stream);
+  any_block_failure[0] = false;
+  any_block_failure.host_to_device_async(stream);
+
+  // Dispatch batches of blocks to decompress
+  if (num_compressed_blocks > 0) {
+    device_span<device_span<uint8_t const>> inflate_in_view{inflate_in.data(),
+                                                            num_compressed_blocks};
+    device_span<device_span<uint8_t>> inflate_out_view{inflate_out.data(), num_compressed_blocks};
+    switch (decompressor.compression()) {
+      case compression_type::ZLIB:
+        if (nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE)) {
+          gpuinflate(
+            inflate_in_view, inflate_out_view, inflate_res, gzip_header_included::NO, stream);
+        } else {
+          nvcomp::batched_decompress(nvcomp::compression_type::DEFLATE,
+                                     inflate_in_view,
+                                     inflate_out_view,
+                                     inflate_res,
+                                     max_uncomp_block_size,
+                                     total_decomp_size,
+                                     stream);
+        }
+        break;
+      case compression_type::SNAPPY:
+        if (nvcomp::is_decompression_disabled(nvcomp::compression_type::SNAPPY)) {
+          gpu_unsnap(inflate_in_view, inflate_out_view, inflate_res, stream);
+        } else {
+          nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY,
+                                     inflate_in_view,
+                                     inflate_out_view,
+                                     inflate_res,
+                                     max_uncomp_block_size,
+                                     total_decomp_size,
+                                     stream);
+        }
+        break;
+      case compression_type::ZSTD:
+        if (auto const reason = nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD);
+            reason) {
+          CUDF_FAIL("Decompression error: " + reason.value());
+        }
+        nvcomp::batched_decompress(nvcomp::compression_type::ZSTD,
+                                   inflate_in_view,
+                                   inflate_out_view,
+                                   inflate_res,
+                                   max_uncomp_block_size,
+                                   total_decomp_size,
+                                   stream);
+        break;
+      default: CUDF_FAIL("Unexpected decompression dispatch"); break;
+    }
+
+    // TODO: proclaim the return type.
+
+    // Check if any block has failed to decompress.
+    // Not using `thrust::any` or `thrust::count_if` to defer the stream sync.
+    thrust::for_each(
+      rmm::exec_policy(stream),
+      thrust::make_counting_iterator(std::size_t{0}),
+      thrust::make_counting_iterator(inflate_res.size()),
+      [results           = inflate_res.begin(),
+       any_block_failure = any_block_failure.device_ptr()] __device__(auto const idx) {
+        if (results[idx].status != compression_status::SUCCESS) { *any_block_failure = true; }
+      });
+  }
+
+  if (num_uncompressed_blocks > 0) {
+    device_span<device_span<uint8_t const>> copy_in_view{inflate_in.data() + num_compressed_blocks,
+                                                         num_uncompressed_blocks};
+    device_span<device_span<uint8_t>> copy_out_view{inflate_out.data() + num_compressed_blocks,
+                                                    num_uncompressed_blocks};
+    gpu_copy_uncompressed_blocks(copy_in_view, copy_out_view, stream);
+  }
+
+  // Copied without a stream sync, so we must wait for the stream sync below before host access.
+  any_block_failure.device_to_host_async(stream);
+
+  gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream);
+  compinfo.device_to_host_sync(stream);  // This also syncs the stream for `any_block_failure`.
+
+  // We can check on the host after the stream synchronization above.
+  CUDF_EXPECTS(not any_block_failure[0], "Error during decompression");
+
+  auto const num_columns = chunks.size().second;
+
+  // Update the stream information with the updated uncompressed info
+  // TBD: We could update the value from the information we already
+  // have in stream_info[], but using the gpu results also updates
+  // max_uncompressed_size to the actual uncompressed size, or zero if
+  // decompression failed.
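+  // A sketch of that alternative, for reference only (assumes every block
+  // decompressed successfully, so the sizes cached in compinfo_map are still
+  // authoritative):
+  //
+  //   for (std::size_t i = 0; i < compinfo.size(); ++i) {
+  //     auto const& id = stream_info[i].id;
+  //     compinfo[i].max_uncompressed_size =
+  //       compinfo_map.at(stream_id_info{id.stripe_idx, id.level, id.orc_cold_idx, id.kind})
+  //         .total_decomp_size;
+  //   }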
+ for (std::size_t i = 0; i < num_stripes; ++i) { + for (std::size_t j = 0; j < num_columns; ++j) { + auto& chunk = chunks[i][j]; + for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) { + if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) { + chunk.streams[k] = compinfo[chunk.strm_id[k]].uncompressed_data; + chunk.strm_len[k] = compinfo[chunk.strm_id[k]].max_uncompressed_size; + } + } + } + } + + if (row_groups.size().first) { + chunks.host_to_device_async(stream); + row_groups.host_to_device_async(stream); + gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), + compinfo.device_ptr(), + chunks.base_device_ptr(), + num_columns, + num_stripes, + row_groups.size().first, + row_index_stride, + use_base_stride, + stream); + } + + return decomp_data; +} + +/** + * @brief Updates null mask of columns whose parent is a struct column. + * + * If struct column has null element, that row would be skipped while writing child column in ORC, + * so we need to insert the missing null elements in child column. There is another behavior from + * pyspark, where if the child column doesn't have any null elements, it will not have present + * stream, so in that case parent null mask need to be copied to child column. + * + * @param chunks Vector of list of column chunk descriptors + * @param out_buffers Output columns' device buffers + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource to use for device memory allocation + */ +void update_null_mask(cudf::detail::hostdevice_2dvector& chunks, + host_span out_buffers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_stripes = chunks.size().first; + auto const num_columns = chunks.size().second; + bool is_mask_updated = false; + + for (std::size_t col_idx = 0; col_idx < num_columns; ++col_idx) { + if (chunks[0][col_idx].parent_validity_info.valid_map_base != nullptr) { + if (not is_mask_updated) { + chunks.device_to_host_sync(stream); + is_mask_updated = true; + } + + auto parent_valid_map_base = chunks[0][col_idx].parent_validity_info.valid_map_base; + auto child_valid_map_base = out_buffers[col_idx].null_mask(); + auto child_mask_len = + chunks[0][col_idx].column_num_rows - chunks[0][col_idx].parent_validity_info.null_count; + auto parent_mask_len = chunks[0][col_idx].column_num_rows; + + if (child_valid_map_base != nullptr) { + rmm::device_uvector dst_idx(child_mask_len, stream); + // Copy indexes at which the parent has valid value. 
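+        // A small worked example of the merge below (values illustrative only):
+        //   parent mask : 1 0 1 1  -> valid parent rows dst_idx = {0, 2, 3}
+        //   child mask  : 1 0 1    (one bit per valid parent row)
+        //   merged mask : 1 0 0 1  (child bit j is scattered to dst_idx[j])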
+ thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + parent_mask_len, + dst_idx.begin(), + [parent_valid_map_base] __device__(auto idx) { + return bit_is_set(parent_valid_map_base, idx); + }); + + auto merged_null_mask = cudf::detail::create_null_mask( + parent_mask_len, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); + auto merged_mask = static_cast(merged_null_mask.data()); + uint32_t* dst_idx_ptr = dst_idx.data(); + // Copy child valid bits from child column to valid indexes, this will merge both child + // and parent null masks + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + dst_idx.size(), + [child_valid_map_base, dst_idx_ptr, merged_mask] __device__(auto idx) { + if (bit_is_set(child_valid_map_base, idx)) { + cudf::set_bit(merged_mask, dst_idx_ptr[idx]); + }; + }); + + out_buffers[col_idx].set_null_mask(std::move(merged_null_mask)); + + } else { + // Since child column doesn't have a mask, copy parent null mask + auto mask_size = bitmask_allocation_size_bytes(parent_mask_len); + out_buffers[col_idx].set_null_mask( + rmm::device_buffer(static_cast(parent_valid_map_base), mask_size, stream, mr)); + } + } + } + + if (is_mask_updated) { + // Update chunks with pointers to column data which might have been changed. + for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { + for (std::size_t col_idx = 0; col_idx < num_columns; ++col_idx) { + auto& chunk = chunks[stripe_idx][col_idx]; + chunk.valid_map_base = out_buffers[col_idx].null_mask(); + } + } + chunks.host_to_device_sync(stream); + } +} + +/** + * @brief Converts the stripe column data and outputs to columns. + * + * @param num_dicts Number of dictionary entries required + * @param skip_rows Number of rows to offset from start + * @param row_index_stride Distance between each row index + * @param level Current nesting level being processed + * @param tz_table Local time to UTC conversion table + * @param chunks Vector of list of column chunk descriptors + * @param row_groups Vector of list of row index descriptors + * @param out_buffers Output columns' device buffers + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ +void decode_stream_data(std::size_t num_dicts, + std::size_t skip_rows, + std::size_t row_index_stride, + std::size_t level, + table_view const& tz_table, + cudf::detail::hostdevice_2dvector& chunks, + cudf::detail::device_2dspan row_groups, + std::vector& out_buffers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_stripes = chunks.size().first; + auto const num_columns = chunks.size().second; + thrust::counting_iterator col_idx_it(0); + thrust::counting_iterator stripe_idx_it(0); + + // Update chunks with pointers to column data + std::for_each(stripe_idx_it, stripe_idx_it + num_stripes, [&](auto stripe_idx) { + std::for_each(col_idx_it, col_idx_it + num_columns, [&](auto col_idx) { + auto& chunk = chunks[stripe_idx][col_idx]; + chunk.column_data_base = out_buffers[col_idx].data(); + chunk.valid_map_base = out_buffers[col_idx].null_mask(); + }); + }); + + // Allocate global dictionary for deserializing + rmm::device_uvector global_dict(num_dicts, stream); + + chunks.host_to_device_sync(stream); + gpu::DecodeNullsAndStringDictionaries( + chunks.base_device_ptr(), global_dict.data(), num_columns, 
num_stripes, skip_rows, stream); + + if (level > 0) { + // Update nullmasks for children if parent was a struct and had null mask + update_null_mask(chunks, out_buffers, stream, mr); + } + + auto const tz_table_dptr = table_device_view::create(tz_table, stream); + rmm::device_scalar error_count(0, stream); + // Update the null map for child columns + gpu::DecodeOrcColumnData(chunks.base_device_ptr(), + global_dict.data(), + row_groups, + num_columns, + num_stripes, + skip_rows, + *tz_table_dptr, + row_groups.size().first, + row_index_stride, + level, + error_count.data(), + stream); + chunks.device_to_host_async(stream); + // `value` synchronizes + auto const num_errors = error_count.value(stream); + CUDF_EXPECTS(num_errors == 0, "ORC data decode failed"); + + std::for_each(col_idx_it + 0, col_idx_it + num_columns, [&](auto col_idx) { + out_buffers[col_idx].null_count() = + std::accumulate(stripe_idx_it + 0, + stripe_idx_it + num_stripes, + 0, + [&](auto null_count, auto const stripe_idx) { + return null_count + chunks[stripe_idx][col_idx].null_count; + }); + }); +} + +/** + * @brief Compute the per-stripe prefix sum of null count, for each struct column in the current + * layer. + */ +void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, + cudf::host_span> prefix_sums, + rmm::cuda_stream_view stream) +{ + auto const num_stripes = chunks.size().first; + if (num_stripes == 0) return; + + auto const num_columns = chunks.size().second; + std::vector>> prefix_sums_to_update; + for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { + // Null counts sums are only needed for children of struct columns + if (chunks[0][col_idx].type_kind == STRUCT) { + prefix_sums_to_update.emplace_back(col_idx, prefix_sums[col_idx]); + } + } + auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( + prefix_sums_to_update, stream, rmm::mr::get_current_device_resource()); + + thrust::for_each(rmm::exec_policy(stream), + d_prefix_sums_to_update.begin(), + d_prefix_sums_to_update.end(), + [chunks = cudf::detail::device_2dspan{chunks}] __device__( + auto const& idx_psums) { + auto const col_idx = idx_psums.first; + auto const psums = idx_psums.second; + + thrust::transform( + thrust::seq, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + psums.size(), + psums.begin(), + [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); + + thrust::inclusive_scan(thrust::seq, psums.begin(), psums.end(), psums.begin()); + }); + // `prefix_sums_to_update` goes out of scope, copy has to be done before we return + stream.synchronize(); +} + + +// TODO: this is called for each chunk of stripes. +/** + * @brief Aggregate child metadata from parent column chunks. + */ +void aggregate_child_meta(std::size_t level, + cudf::io::orc::detail::column_hierarchy const& selected_columns, + cudf::detail::host_2dspan chunks, + cudf::detail::host_2dspan row_groups, + host_span nested_cols, + host_span out_buffers, + reader_column_meta& col_meta) +{ + auto const num_of_stripes = chunks.size().first; + auto const num_of_rowgroups = row_groups.size().first; + auto const num_child_cols = selected_columns.levels[level + 1].size(); + auto const number_of_child_chunks = num_child_cols * num_of_stripes; + auto& num_child_rows = col_meta.num_child_rows; + auto& parent_column_data = col_meta.parent_column_data; + + // Reset the meta to store child column details. 
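+  // Layout note: the per-(stripe, child) vectors below are stored flat in
+  // stripe-major order; with S stripes and C child columns, the element for
+  // (stripe_id, child_col_idx) lives at
+  //
+  //   stripe_id * C + child_col_idx   // 0 <= index < S * C
+  //
+  // matching the host_2dspan views created below.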
+ num_child_rows.resize(selected_columns.levels[level + 1].size()); + std::fill(num_child_rows.begin(), num_child_rows.end(), 0); + parent_column_data.resize(number_of_child_chunks); + col_meta.parent_column_index.resize(number_of_child_chunks); + col_meta.child_start_row.resize(number_of_child_chunks); + col_meta.num_child_rows_per_stripe.resize(number_of_child_chunks); + col_meta.rwgrp_meta.resize(num_of_rowgroups * num_child_cols); + + auto child_start_row = cudf::detail::host_2dspan( + col_meta.child_start_row.data(), num_of_stripes, num_child_cols); + auto num_child_rows_per_stripe = cudf::detail::host_2dspan( + col_meta.num_child_rows_per_stripe.data(), num_of_stripes, num_child_cols); + auto rwgrp_meta = cudf::detail::host_2dspan( + col_meta.rwgrp_meta.data(), num_of_rowgroups, num_child_cols); + + int index = 0; // number of child column processed + + // For each parent column, update its child column meta for each stripe. + std::for_each(nested_cols.begin(), nested_cols.end(), [&](auto const p_col) { + auto const parent_col_idx = col_meta.orc_col_map[level][p_col.id]; + auto start_row = 0; + auto processed_row_groups = 0; + + for (std::size_t stripe_id = 0; stripe_id < num_of_stripes; stripe_id++) { + // Aggregate num_rows and start_row from processed parent columns per row groups + if (num_of_rowgroups) { + auto stripe_num_row_groups = chunks[stripe_id][parent_col_idx].num_rowgroups; + auto processed_child_rows = 0; + + for (std::size_t rowgroup_id = 0; rowgroup_id < stripe_num_row_groups; + rowgroup_id++, processed_row_groups++) { + auto const child_rows = row_groups[processed_row_groups][parent_col_idx].num_child_rows; + for (size_type id = 0; id < p_col.num_children; id++) { + auto const child_col_idx = index + id; + rwgrp_meta[processed_row_groups][child_col_idx].start_row = processed_child_rows; + rwgrp_meta[processed_row_groups][child_col_idx].num_rows = child_rows; + } + processed_child_rows += child_rows; + } + } + + // Aggregate start row, number of rows per chunk and total number of rows in a column + auto const child_rows = chunks[stripe_id][parent_col_idx].num_child_rows; + for (size_type id = 0; id < p_col.num_children; id++) { + auto const child_col_idx = index + id; + + // TODO: Check for overflow here. + num_child_rows[child_col_idx] += child_rows; + num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows; + // start row could be different for each column when there is nesting at each stripe level + child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ? 0 : start_row; + } + start_row += child_rows; + } + + // Parent column null mask and null count would be required for child column + // to adjust its nullmask. 
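+    // For example (values illustrative only): with 10 parent rows, a STRUCT
+    // child also gets num_child_rows == 10 and shares the parent's null mask in
+    // the loop below, while a LIST child's total row count is the sum of
+    // chunk.num_child_rows accumulated over stripes in the loop above.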
+    auto type              = out_buffers[parent_col_idx].type.id();
+    auto parent_null_count = static_cast<uint32_t>(out_buffers[parent_col_idx].null_count());
+    auto parent_valid_map  = out_buffers[parent_col_idx].null_mask();
+    auto num_rows          = out_buffers[parent_col_idx].size;
+
+    for (size_type id = 0; id < p_col.num_children; id++) {
+      auto const child_col_idx                    = index + id;
+      col_meta.parent_column_index[child_col_idx] = parent_col_idx;
+      if (type == type_id::STRUCT) {
+        parent_column_data[child_col_idx] = {parent_valid_map, parent_null_count};
+        // The number of rows in the child will remain the same as in the parent
+        // in case of a struct column.
+        num_child_rows[child_col_idx] = num_rows;
+      } else {
+        parent_column_data[child_col_idx] = {nullptr, 0};
+      }
+    }
+    index += p_col.num_children;
+  });
+}
+
+/**
+ * @brief Struct to store buffer data and the size of a list buffer.
+ */
+struct list_buffer_data {
+  size_type* data;
+  size_type size;
+};
+
+// Generates offsets for a list buffer from the number of elements in each row.
+void generate_offsets_for_list(host_span<list_buffer_data> buff_data, rmm::cuda_stream_view stream)
+{
+  for (auto& list_data : buff_data) {
+    thrust::exclusive_scan(rmm::exec_policy_nosync(stream),
+                           list_data.data,
+                           list_data.data + list_data.size,
+                           list_data.data);
+  }
+}
+
+}  // namespace
+
+void reader::impl::prepare_data(uint64_t skip_rows,
+                                std::optional<size_type> const& num_rows_opt,
+                                std::vector<std::vector<size_type>> const& stripes)
+{
+  // Selected columns at different levels of nesting are stored in different elements
+  // of `selected_columns`; thus, size == 1 means no nested columns.
+  CUDF_EXPECTS(skip_rows == 0 or _selected_columns.num_levels() == 1,
+               "skip_rows is not supported by nested columns");
+
+  // There are no columns in the table
+  if (_selected_columns.num_levels() == 0) { return; }
+
+  global_preprocess(skip_rows, num_rows_opt, stripes);
+
+  if (_file_itm_data.has_no_data()) { return; }
+
+  // TODO: fix this; read_data should be called only once.
+  while (_chunk_read_data.more_stripe_to_load()) {
+    read_data();
+  }
+
+  // TODO: fix this; subpass_preprocess should be called only once.
+  _chunk_read_data.curr_load_stripe_chunk = 0;
+  while (_chunk_read_data.more_stripe_to_load()) {
+    subpass_preprocess();
+  }
+
+  auto const rows_to_skip      = _file_itm_data.rows_to_skip;
+  auto const rows_to_read      = _file_itm_data.rows_to_read;
+  auto const& selected_stripes = _file_itm_data.selected_stripes;
+
+  // Set up table for converting timestamp columns from local to UTC time
+  auto const tz_table = [&, &selected_stripes = selected_stripes] {
+    auto const has_timestamp_column = std::any_of(
+      _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) {
+        return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) {
+          return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP;
+        });
+      });
+
+    return has_timestamp_column ? cudf::detail::make_timezone_transition_table(
+                                    {}, selected_stripes[0].stripe_footer->writerTimezone, _stream)
+                                : std::make_unique<cudf::table>();
+  }();
+
+  auto& lvl_stripe_data        = _file_itm_data.lvl_stripe_data;
+  auto& null_count_prefix_sums = _file_itm_data.null_count_prefix_sums;
+  auto& lvl_chunks             = _file_itm_data.lvl_data_chunks;
+
+  // TODO: move this to the global preprocessing step.
+  lvl_chunks.resize(_selected_columns.num_levels());
+  _out_buffers.resize(_selected_columns.num_levels());
+
+  // TODO: move everything below into the decompression and decode steps.
+  std::size_t num_stripes = selected_stripes.size();
+
+  // Iterate through levels of nested columns; a child column will be one level down
+  // from its parent column.
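+  // For example, a selected column of type LIST<STRUCT<INT, STRING>> spans
+  // three levels: level 0 holds the LIST, level 1 the STRUCT, and level 2 the
+  // INT and STRING leaves; each iteration consumes the child metadata
+  // aggregated while decoding the previous level.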
+ auto& col_meta = *_col_meta; + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + auto& columns_level = _selected_columns.levels[level]; + + // TODO: do it in global step + // Association between each ORC column and its cudf::column + std::vector nested_cols; + + // Get a list of column data types + std::vector column_types; + for (auto& col : columns_level) { + auto col_type = to_cudf_type(_metadata.get_col_type(col.id).kind, + _config.use_np_dtypes, + _config.timestamp_type.id(), + to_cudf_decimal_type(_config.decimal128_columns, _metadata, col.id)); + CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); + if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or + col_type == type_id::DECIMAL128) { + // sign of the scale is changed since cuDF follows c++ libraries like CNL + // which uses negative scaling, but liborc and other libraries + // follow positive scaling. + auto const scale = + -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); + column_types.emplace_back(col_type, scale); + } else { + column_types.emplace_back(col_type); + } + + // Map each ORC column to its column + if (col_type == type_id::LIST or col_type == type_id::STRUCT) { + nested_cols.emplace_back(col); + } + } + + auto const num_columns = columns_level.size(); + auto& chunks = lvl_chunks[level]; + chunks = cudf::detail::hostdevice_2dvector(num_stripes, num_columns, _stream); + memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); + + const bool use_index = + _config.use_index && + // Do stripes have row group index + _metadata.is_row_grp_idx_present() && + // Only use if we don't have much work with complete columns & stripes + // TODO: Consider nrows, gpu, and tune the threshold + (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && + _metadata.get_row_index_stride() > 0 && num_columns * num_stripes < 8 * 128) && + // Only use if first row is aligned to a stripe boundary + // TODO: Fix logic to handle unaligned rows + (rows_to_skip == 0); + + // Logically view streams as columns + auto const& stream_info = _file_itm_data.lvl_stream_info[level]; + + null_count_prefix_sums.emplace_back(); + null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); + std::generate_n(std::back_inserter(null_count_prefix_sums.back()), + _selected_columns.levels[level].size(), + [&]() { + return cudf::detail::make_zeroed_device_uvector_async( + num_stripes, _stream, rmm::mr::get_current_device_resource()); + }); + + // Tracker for eventually deallocating compressed and uncompressed data + auto& stripe_data = lvl_stripe_data[level]; + + std::size_t stripe_start_row = 0; + std::size_t num_dict_entries = 0; + std::size_t num_rowgroups = 0; + + // TODO: Stripe and stream idx must be by chunk. 
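+    // A sketch of the chunked form this TODO points at (names hypothetical):
+    // each pass would walk only the stripes of the current chunk, e.g.
+    //
+    //   auto const [stripe_begin, stripe_end] = current_stripe_chunk;  // hypothetical
+    //   for (std::size_t stripe_idx = stripe_begin; stripe_idx < stripe_end; ++stripe_idx) {
+    //     ...
+    //   }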
+ std::size_t stripe_idx = 0; + std::size_t stream_idx = 0; + + // std::vector, std::size_t>> read_tasks; + for (auto const& stripe : selected_stripes) { + auto const stripe_info = stripe.stripe_info; + auto const stripe_footer = stripe.stripe_footer; + + auto const total_data_size = gather_stream_info_and_column_desc(stripe_idx, + level, + stripe_info, + stripe_footer, + col_meta.orc_col_map[level], + _metadata.get_types(), + use_index, + level == 0, + &num_dict_entries, + &stream_idx, + std::nullopt, // stream_info + &chunks); + + auto const is_stripe_data_empty = total_data_size == 0; + CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, + "Invalid index rowgroup stream data"); + + auto dst_base = static_cast(stripe_data[stripe_idx].data()); + + auto const num_rows_per_stripe = stripe_info->numberOfRows; + auto const rowgroup_id = num_rowgroups; + auto stripe_num_rowgroups = 0; + if (use_index) { + stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / + _metadata.get_row_index_stride(); + } + // Update chunks to reference streams pointers + for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { + auto& chunk = chunks[stripe_idx][col_idx]; + // start row, number of rows in a each stripe and total number of rows + // may change in lower levels of nesting + chunk.start_row = (level == 0) + ? stripe_start_row + : col_meta.child_start_row[stripe_idx * num_columns + col_idx]; + chunk.num_rows = (level == 0) + ? stripe_info->numberOfRows + : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; + chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx]; + chunk.parent_validity_info = + (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx]; + chunk.parent_null_count_prefix_sums = + (level == 0) + ? nullptr + : null_count_prefix_sums[level - 1][col_meta.parent_column_index[col_idx]].data(); + chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind; + chunk.type_kind = + _metadata.per_file_metadata[stripe.source_idx].ff.types[columns_level[col_idx].id].kind; + // num_child_rows for a struct column will be same, for other nested types it will be + // calculated. + chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; + chunk.dtype_id = column_types[col_idx].id(); + chunk.decimal_scale = _metadata.per_file_metadata[stripe.source_idx] + .ff.types[columns_level[col_idx].id] + .scale.value_or(0); + + chunk.rowgroup_id = rowgroup_id; + chunk.dtype_len = (column_types[col_idx].id() == type_id::STRING) + ? sizeof(string_index_pair) + : ((column_types[col_idx].id() == type_id::LIST) or + (column_types[col_idx].id() == type_id::STRUCT)) + ? 
                               sizeof(size_type)
+                             : cudf::size_of(column_types[col_idx]);
+        chunk.num_rowgroups = stripe_num_rowgroups;
+        if (chunk.type_kind == orc::TIMESTAMP) {
+          chunk.timestamp_type_id = _config.timestamp_type.id();
+        }
+        if (not is_stripe_data_empty) {
+          for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {
+            chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos;
+          }
+        }
+      }
+      stripe_start_row += num_rows_per_stripe;
+      num_rowgroups += stripe_num_rowgroups;
+
+      stripe_idx++;
+    }
+
+    if (stripe_data.empty()) { continue; }
+
+    // Process dataset chunk pages into output columns
+    auto row_groups =
+      cudf::detail::hostdevice_2dvector<gpu::RowGroup>(num_rowgroups, num_columns, _stream);
+    if (level > 0 and row_groups.size().first) {
+      cudf::host_span<gpu::RowGroup> row_groups_span(row_groups.base_host_ptr(),
+                                                     num_rowgroups * num_columns);
+      auto& rw_grp_meta = col_meta.rwgrp_meta;
+
+      // Update start row and num rows per row group
+      std::transform(rw_grp_meta.begin(),
+                     rw_grp_meta.end(),
+                     row_groups_span.begin(),
+                     rw_grp_meta.begin(),
+                     [&](auto meta, auto& row_grp) {
+                       row_grp.num_rows  = meta.num_rows;
+                       row_grp.start_row = meta.start_row;
+                       return meta;
+                     });
+    }
+    // Setup row group descriptors if using indexes
+    if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
+      auto decomp_data = decompress_stripe_data(_file_itm_data.compinfo_map,
+                                                *_metadata.per_file_metadata[0].decompressor,
+                                                stripe_data,
+                                                stream_info,
+                                                chunks,
+                                                row_groups,
+                                                num_stripes,
+                                                _metadata.get_row_index_stride(),
+                                                level == 0,
+                                                _stream);
+      stripe_data.clear();
+      stripe_data.push_back(std::move(decomp_data));
+    } else {
+      if (row_groups.size().first) {
+        chunks.host_to_device_async(_stream);
+        row_groups.host_to_device_async(_stream);
+        gpu::ParseRowGroupIndex(row_groups.base_device_ptr(),
+                                nullptr,
+                                chunks.base_device_ptr(),
+                                num_columns,
+                                num_stripes,
+                                num_rowgroups,
+                                _metadata.get_row_index_stride(),
+                                level == 0,
+                                _stream);
+      }
+    }
+
+    for (std::size_t i = 0; i < column_types.size(); ++i) {
+      bool is_nullable = false;
+      for (std::size_t j = 0; j < num_stripes; ++j) {
+        if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) {
+          is_nullable = true;
+          break;
+        }
+      }
+      auto is_list_type = (column_types[i].id() == type_id::LIST);
+      auto n_rows       = (level == 0) ?
rows_to_read : col_meta.num_child_rows[i]; + // For list column, offset column will be always size + 1 + if (is_list_type) n_rows++; + _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr); + } + + decode_stream_data(num_dict_entries, + rows_to_skip, + _metadata.get_row_index_stride(), + level, + tz_table->view(), + chunks, + row_groups, + _out_buffers[level], + _stream, + _mr); + + if (nested_cols.size()) { + // Extract information to process nested child columns + scan_null_counts(chunks, null_count_prefix_sums[level], _stream); + + row_groups.device_to_host_sync(_stream); + aggregate_child_meta( + level, _selected_columns, chunks, row_groups, nested_cols, _out_buffers[level], col_meta); + + // ORC stores number of elements at each row, so we need to generate offsets from that + std::vector buff_data; + std::for_each( + _out_buffers[level].begin(), _out_buffers[level].end(), [&buff_data](auto& out_buffer) { + if (out_buffer.type.id() == type_id::LIST) { + auto data = static_cast(out_buffer.data()); + buff_data.emplace_back(list_buffer_data{data, out_buffer.size}); + } + }); + + if (not buff_data.empty()) { generate_offsets_for_list(buff_data, _stream); } + } + } // end loop level +} + + reader::impl::impl(std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu deleted file mode 100644 index 90404c7b9ca..00000000000 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ /dev/null @@ -1,980 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// #define PRINT_DEBUG - -#include "reader_impl.hpp" -#include "reader_impl_chunking.hpp" -#include "reader_impl_helpers.hpp" - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace cudf::io::orc::detail { - -namespace { - -// TODO: update -/** - * @brief Decompresses the stripe data, at stream granularity. 
- * - * @param decompressor Block decompressor - * @param stripe_data List of source stripe column data - * @param stream_info List of stream to column mappings - * @param chunks Vector of list of column chunk descriptors - * @param row_groups Vector of list of row index descriptors - * @param num_stripes Number of stripes making up column chunks - * @param row_index_stride Distance between each row index - * @param use_base_stride Whether to use base stride obtained from meta or use the computed value - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Device buffer to decompressed page data - */ -rmm::device_buffer decompress_stripe_data( - stream_id_map const& compinfo_map, - OrcDecompressor const& decompressor, - host_span stripe_data, - host_span stream_info, - cudf::detail::hostdevice_2dvector& chunks, - cudf::detail::hostdevice_2dvector& row_groups, - std::size_t num_stripes, - std::size_t row_index_stride, - bool use_base_stride, - rmm::cuda_stream_view stream) -{ - // Count the exact number of compressed blocks - std::size_t num_compressed_blocks = 0; - std::size_t num_uncompressed_blocks = 0; - std::size_t total_decomp_size = 0; - - cudf::detail::hostdevice_vector compinfo( - 0, stream_info.size(), stream); - - for (auto const& info : stream_info) { -#ifdef PRINT_DEBUG - printf("collec stream again [%d, %d, %d, %d]: dst = %lu, length = %lu\n", - (int)info.id.stripe_idx, - (int)info.id.level, - (int)info.id.orc_cold_idx, - (int)info.id.kind, - info.dst_pos, - info.length); - fflush(stdout); -#endif - - compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.id.stripe_idx].data()) + info.dst_pos, - info.length)); - - // printf("line %d\n", __LINE__); - // fflush(stdout); - auto const& cached_comp_info = - compinfo_map.at(stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_cold_idx, info.id.kind}); - // printf("line %d\n", __LINE__); - // fflush(stdout); - // auto const& cached_comp_info = - // compinfo_map[stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_cold_idx, info.id.kind}]; - auto& stream_comp_info = compinfo[compinfo.size() - 1]; - stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; - stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks; - stream_comp_info.max_uncompressed_size = cached_comp_info.total_decomp_size; - - num_compressed_blocks += cached_comp_info.num_compressed_blocks; - num_uncompressed_blocks += cached_comp_info.num_uncompressed_blocks; - total_decomp_size += cached_comp_info.total_decomp_size; - } - - CUDF_EXPECTS( - not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)), - "Inconsistent info on compression blocks"); - -#ifdef XXX - std::size_t old_num_compressed_blocks = num_compressed_blocks; - std::size_t old_num_uncompressed_blocks = num_uncompressed_blocks; - std::size_t old_total_decomp_size = total_decomp_size; - - num_compressed_blocks = 0; - num_uncompressed_blocks = 0; - total_decomp_size = 0; - for (std::size_t i = 0; i < compinfo.size(); ++i) { - num_compressed_blocks += compinfo[i].num_compressed_blocks; - num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks; - total_decomp_size += compinfo[i].max_uncompressed_size; - - auto const& info = stream_info[i]; - printf("compute info [%d, %d, %d, %d]: %lu | %lu | %lu\n", - (int)info.id.stripe_idx, - (int)info.id.level, - (int)info.id.orc_cold_idx, - (int)info.id.kind, - (size_t)compinfo[i].num_compressed_blocks, - 
(size_t)compinfo[i].num_uncompressed_blocks, - compinfo[i].max_uncompressed_size); - fflush(stdout); - } - - if (old_num_compressed_blocks != num_compressed_blocks || - old_num_uncompressed_blocks != num_uncompressed_blocks || - old_total_decomp_size != total_decomp_size) { - printf("invalid: %d - %d, %d - %d, %d - %d\n", - (int)old_num_compressed_blocks, - (int)num_compressed_blocks, - (int)old_num_uncompressed_blocks, - (int)num_uncompressed_blocks, - (int)old_total_decomp_size, - (int)total_decomp_size - - ); - } -#endif - - // Buffer needs to be padded. - // Required by `gpuDecodeOrcColumnData`. - rmm::device_buffer decomp_data( - cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream); - if (decomp_data.is_empty()) { return decomp_data; } - - rmm::device_uvector> inflate_in( - num_compressed_blocks + num_uncompressed_blocks, stream); - rmm::device_uvector> inflate_out( - num_compressed_blocks + num_uncompressed_blocks, stream); - rmm::device_uvector inflate_res(num_compressed_blocks, stream); - thrust::fill(rmm::exec_policy(stream), - inflate_res.begin(), - inflate_res.end(), - compression_result{0, compression_status::FAILURE}); - - // Parse again to populate the decompression input/output buffers - std::size_t decomp_offset = 0; - uint32_t max_uncomp_block_size = 0; - uint32_t start_pos = 0; - auto start_pos_uncomp = (uint32_t)num_compressed_blocks; - for (std::size_t i = 0; i < compinfo.size(); ++i) { - auto dst_base = static_cast(decomp_data.data()); - compinfo[i].uncompressed_data = dst_base + decomp_offset; - compinfo[i].dec_in_ctl = inflate_in.data() + start_pos; - compinfo[i].dec_out_ctl = inflate_out.data() + start_pos; - compinfo[i].dec_res = {inflate_res.data() + start_pos, compinfo[i].num_compressed_blocks}; - compinfo[i].copy_in_ctl = inflate_in.data() + start_pos_uncomp; - compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp; - - // stream_info[i].dst_pos = decomp_offset; - decomp_offset += compinfo[i].max_uncompressed_size; - start_pos += compinfo[i].num_compressed_blocks; - start_pos_uncomp += compinfo[i].num_uncompressed_blocks; - max_uncomp_block_size = - std::max(max_uncomp_block_size, compinfo[i].max_uncompressed_block_size); - } - compinfo.host_to_device_async(stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - stream); - - // Value for checking whether we decompress successfully. - // It doesn't need to be atomic as there is no race condition: we only write `true` if needed. 
- cudf::detail::hostdevice_vector any_block_failure(1, stream); - any_block_failure[0] = false; - any_block_failure.host_to_device_async(stream); - - // Dispatch batches of blocks to decompress - if (num_compressed_blocks > 0) { - device_span> inflate_in_view{inflate_in.data(), - num_compressed_blocks}; - device_span> inflate_out_view{inflate_out.data(), num_compressed_blocks}; - switch (decompressor.compression()) { - case compression_type::ZLIB: - if (nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE)) { - gpuinflate( - inflate_in_view, inflate_out_view, inflate_res, gzip_header_included::NO, stream); - } else { - nvcomp::batched_decompress(nvcomp::compression_type::DEFLATE, - inflate_in_view, - inflate_out_view, - inflate_res, - max_uncomp_block_size, - total_decomp_size, - stream); - } - break; - case compression_type::SNAPPY: - if (nvcomp::is_decompression_disabled(nvcomp::compression_type::SNAPPY)) { - gpu_unsnap(inflate_in_view, inflate_out_view, inflate_res, stream); - } else { - nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, - inflate_in_view, - inflate_out_view, - inflate_res, - max_uncomp_block_size, - total_decomp_size, - stream); - } - break; - case compression_type::ZSTD: - if (auto const reason = nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD); - reason) { - CUDF_FAIL("Decompression error: " + reason.value()); - } - nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, - inflate_in_view, - inflate_out_view, - inflate_res, - max_uncomp_block_size, - total_decomp_size, - stream); - break; - default: CUDF_FAIL("Unexpected decompression dispatch"); break; - } - - // TODO: proclam return type - - // Check if any block has been failed to decompress. - // Not using `thrust::any` or `thrust::count_if` to defer stream sync. - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(std::size_t{0}), - thrust::make_counting_iterator(inflate_res.size()), - [results = inflate_res.begin(), - any_block_failure = any_block_failure.device_ptr()] __device__(auto const idx) { - if (results[idx].status != compression_status::SUCCESS) { *any_block_failure = true; } - }); - } - - if (num_uncompressed_blocks > 0) { - device_span> copy_in_view{inflate_in.data() + num_compressed_blocks, - num_uncompressed_blocks}; - device_span> copy_out_view{inflate_out.data() + num_compressed_blocks, - num_uncompressed_blocks}; - gpu_copy_uncompressed_blocks(copy_in_view, copy_out_view, stream); - } - - // Copy without stream sync, thus need to wait for stream sync below to access. - any_block_failure.device_to_host_async(stream); - - gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); - compinfo.device_to_host_sync(stream); // This also sync stream for `any_block_failure`. - - // We can check on host after stream synchronize - CUDF_EXPECTS(not any_block_failure[0], "Error during decompression"); - - auto const num_columns = chunks.size().second; - - // Update the stream information with the updated uncompressed info - // TBD: We could update the value from the information we already - // have in stream_info[], but using the gpu results also updates - // max_uncompressed_size to the actual uncompressed size, or zero if - // decompression failed. 
- for (std::size_t i = 0; i < num_stripes; ++i) { - for (std::size_t j = 0; j < num_columns; ++j) { - auto& chunk = chunks[i][j]; - for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) { - if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) { - chunk.streams[k] = compinfo[chunk.strm_id[k]].uncompressed_data; - chunk.strm_len[k] = compinfo[chunk.strm_id[k]].max_uncompressed_size; - } - } - } - } - - if (row_groups.size().first) { - chunks.host_to_device_async(stream); - row_groups.host_to_device_async(stream); - gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), - compinfo.device_ptr(), - chunks.base_device_ptr(), - num_columns, - num_stripes, - row_groups.size().first, - row_index_stride, - use_base_stride, - stream); - } - - return decomp_data; -} - -/** - * @brief Updates null mask of columns whose parent is a struct column. - * - * If struct column has null element, that row would be skipped while writing child column in ORC, - * so we need to insert the missing null elements in child column. There is another behavior from - * pyspark, where if the child column doesn't have any null elements, it will not have present - * stream, so in that case parent null mask need to be copied to child column. - * - * @param chunks Vector of list of column chunk descriptors - * @param out_buffers Output columns' device buffers - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource to use for device memory allocation - */ -void update_null_mask(cudf::detail::hostdevice_2dvector& chunks, - host_span out_buffers, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_stripes = chunks.size().first; - auto const num_columns = chunks.size().second; - bool is_mask_updated = false; - - for (std::size_t col_idx = 0; col_idx < num_columns; ++col_idx) { - if (chunks[0][col_idx].parent_validity_info.valid_map_base != nullptr) { - if (not is_mask_updated) { - chunks.device_to_host_sync(stream); - is_mask_updated = true; - } - - auto parent_valid_map_base = chunks[0][col_idx].parent_validity_info.valid_map_base; - auto child_valid_map_base = out_buffers[col_idx].null_mask(); - auto child_mask_len = - chunks[0][col_idx].column_num_rows - chunks[0][col_idx].parent_validity_info.null_count; - auto parent_mask_len = chunks[0][col_idx].column_num_rows; - - if (child_valid_map_base != nullptr) { - rmm::device_uvector dst_idx(child_mask_len, stream); - // Copy indexes at which the parent has valid value. 
- thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + parent_mask_len, - dst_idx.begin(), - [parent_valid_map_base] __device__(auto idx) { - return bit_is_set(parent_valid_map_base, idx); - }); - - auto merged_null_mask = cudf::detail::create_null_mask( - parent_mask_len, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); - auto merged_mask = static_cast(merged_null_mask.data()); - uint32_t* dst_idx_ptr = dst_idx.data(); - // Copy child valid bits from child column to valid indexes, this will merge both child - // and parent null masks - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + dst_idx.size(), - [child_valid_map_base, dst_idx_ptr, merged_mask] __device__(auto idx) { - if (bit_is_set(child_valid_map_base, idx)) { - cudf::set_bit(merged_mask, dst_idx_ptr[idx]); - }; - }); - - out_buffers[col_idx].set_null_mask(std::move(merged_null_mask)); - - } else { - // Since child column doesn't have a mask, copy parent null mask - auto mask_size = bitmask_allocation_size_bytes(parent_mask_len); - out_buffers[col_idx].set_null_mask( - rmm::device_buffer(static_cast(parent_valid_map_base), mask_size, stream, mr)); - } - } - } - - if (is_mask_updated) { - // Update chunks with pointers to column data which might have been changed. - for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { - for (std::size_t col_idx = 0; col_idx < num_columns; ++col_idx) { - auto& chunk = chunks[stripe_idx][col_idx]; - chunk.valid_map_base = out_buffers[col_idx].null_mask(); - } - } - chunks.host_to_device_sync(stream); - } -} - -/** - * @brief Converts the stripe column data and outputs to columns. - * - * @param num_dicts Number of dictionary entries required - * @param skip_rows Number of rows to offset from start - * @param row_index_stride Distance between each row index - * @param level Current nesting level being processed - * @param tz_table Local time to UTC conversion table - * @param chunks Vector of list of column chunk descriptors - * @param row_groups Vector of list of row index descriptors - * @param out_buffers Output columns' device buffers - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ -void decode_stream_data(std::size_t num_dicts, - std::size_t skip_rows, - std::size_t row_index_stride, - std::size_t level, - table_view const& tz_table, - cudf::detail::hostdevice_2dvector& chunks, - cudf::detail::device_2dspan row_groups, - std::vector& out_buffers, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_stripes = chunks.size().first; - auto const num_columns = chunks.size().second; - thrust::counting_iterator col_idx_it(0); - thrust::counting_iterator stripe_idx_it(0); - - // Update chunks with pointers to column data - std::for_each(stripe_idx_it, stripe_idx_it + num_stripes, [&](auto stripe_idx) { - std::for_each(col_idx_it, col_idx_it + num_columns, [&](auto col_idx) { - auto& chunk = chunks[stripe_idx][col_idx]; - chunk.column_data_base = out_buffers[col_idx].data(); - chunk.valid_map_base = out_buffers[col_idx].null_mask(); - }); - }); - - // Allocate global dictionary for deserializing - rmm::device_uvector global_dict(num_dicts, stream); - - chunks.host_to_device_sync(stream); - gpu::DecodeNullsAndStringDictionaries( - chunks.base_device_ptr(), global_dict.data(), num_columns, 
num_stripes, skip_rows, stream); - - if (level > 0) { - // Update nullmasks for children if parent was a struct and had null mask - update_null_mask(chunks, out_buffers, stream, mr); - } - - auto const tz_table_dptr = table_device_view::create(tz_table, stream); - rmm::device_scalar error_count(0, stream); - // Update the null map for child columns - gpu::DecodeOrcColumnData(chunks.base_device_ptr(), - global_dict.data(), - row_groups, - num_columns, - num_stripes, - skip_rows, - *tz_table_dptr, - row_groups.size().first, - row_index_stride, - level, - error_count.data(), - stream); - chunks.device_to_host_async(stream); - // `value` synchronizes - auto const num_errors = error_count.value(stream); - CUDF_EXPECTS(num_errors == 0, "ORC data decode failed"); - - std::for_each(col_idx_it + 0, col_idx_it + num_columns, [&](auto col_idx) { - out_buffers[col_idx].null_count() = - std::accumulate(stripe_idx_it + 0, - stripe_idx_it + num_stripes, - 0, - [&](auto null_count, auto const stripe_idx) { - return null_count + chunks[stripe_idx][col_idx].null_count; - }); - }); -} - -/** - * @brief Compute the per-stripe prefix sum of null count, for each struct column in the current - * layer. - */ -void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, - cudf::host_span> prefix_sums, - rmm::cuda_stream_view stream) -{ - auto const num_stripes = chunks.size().first; - if (num_stripes == 0) return; - - auto const num_columns = chunks.size().second; - std::vector>> prefix_sums_to_update; - for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { - // Null counts sums are only needed for children of struct columns - if (chunks[0][col_idx].type_kind == STRUCT) { - prefix_sums_to_update.emplace_back(col_idx, prefix_sums[col_idx]); - } - } - auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( - prefix_sums_to_update, stream, rmm::mr::get_current_device_resource()); - - thrust::for_each(rmm::exec_policy(stream), - d_prefix_sums_to_update.begin(), - d_prefix_sums_to_update.end(), - [chunks = cudf::detail::device_2dspan{chunks}] __device__( - auto const& idx_psums) { - auto const col_idx = idx_psums.first; - auto const psums = idx_psums.second; - - thrust::transform( - thrust::seq, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + psums.size(), - psums.begin(), - [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); - - thrust::inclusive_scan(thrust::seq, psums.begin(), psums.end(), psums.begin()); - }); - // `prefix_sums_to_update` goes out of scope, copy has to be done before we return - stream.synchronize(); -} - - -// TODO: this is called for each chunk of stripes. -/** - * @brief Aggregate child metadata from parent column chunks. - */ -void aggregate_child_meta(std::size_t level, - cudf::io::orc::detail::column_hierarchy const& selected_columns, - cudf::detail::host_2dspan chunks, - cudf::detail::host_2dspan row_groups, - host_span nested_cols, - host_span out_buffers, - reader_column_meta& col_meta) -{ - auto const num_of_stripes = chunks.size().first; - auto const num_of_rowgroups = row_groups.size().first; - auto const num_child_cols = selected_columns.levels[level + 1].size(); - auto const number_of_child_chunks = num_child_cols * num_of_stripes; - auto& num_child_rows = col_meta.num_child_rows; - auto& parent_column_data = col_meta.parent_column_data; - - // Reset the meta to store child column details. 
- num_child_rows.resize(selected_columns.levels[level + 1].size()); - std::fill(num_child_rows.begin(), num_child_rows.end(), 0); - parent_column_data.resize(number_of_child_chunks); - col_meta.parent_column_index.resize(number_of_child_chunks); - col_meta.child_start_row.resize(number_of_child_chunks); - col_meta.num_child_rows_per_stripe.resize(number_of_child_chunks); - col_meta.rwgrp_meta.resize(num_of_rowgroups * num_child_cols); - - auto child_start_row = cudf::detail::host_2dspan( - col_meta.child_start_row.data(), num_of_stripes, num_child_cols); - auto num_child_rows_per_stripe = cudf::detail::host_2dspan( - col_meta.num_child_rows_per_stripe.data(), num_of_stripes, num_child_cols); - auto rwgrp_meta = cudf::detail::host_2dspan( - col_meta.rwgrp_meta.data(), num_of_rowgroups, num_child_cols); - - int index = 0; // number of child column processed - - // For each parent column, update its child column meta for each stripe. - std::for_each(nested_cols.begin(), nested_cols.end(), [&](auto const p_col) { - auto const parent_col_idx = col_meta.orc_col_map[level][p_col.id]; - auto start_row = 0; - auto processed_row_groups = 0; - - for (std::size_t stripe_id = 0; stripe_id < num_of_stripes; stripe_id++) { - // Aggregate num_rows and start_row from processed parent columns per row groups - if (num_of_rowgroups) { - auto stripe_num_row_groups = chunks[stripe_id][parent_col_idx].num_rowgroups; - auto processed_child_rows = 0; - - for (std::size_t rowgroup_id = 0; rowgroup_id < stripe_num_row_groups; - rowgroup_id++, processed_row_groups++) { - auto const child_rows = row_groups[processed_row_groups][parent_col_idx].num_child_rows; - for (size_type id = 0; id < p_col.num_children; id++) { - auto const child_col_idx = index + id; - rwgrp_meta[processed_row_groups][child_col_idx].start_row = processed_child_rows; - rwgrp_meta[processed_row_groups][child_col_idx].num_rows = child_rows; - } - processed_child_rows += child_rows; - } - } - - // Aggregate start row, number of rows per chunk and total number of rows in a column - auto const child_rows = chunks[stripe_id][parent_col_idx].num_child_rows; - for (size_type id = 0; id < p_col.num_children; id++) { - auto const child_col_idx = index + id; - - // TODO: Check for overflow here. - num_child_rows[child_col_idx] += child_rows; - num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows; - // start row could be different for each column when there is nesting at each stripe level - child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ? 0 : start_row; - } - start_row += child_rows; - } - - // Parent column null mask and null count would be required for child column - // to adjust its nullmask. 
- auto type = out_buffers[parent_col_idx].type.id(); - auto parent_null_count = static_cast(out_buffers[parent_col_idx].null_count()); - auto parent_valid_map = out_buffers[parent_col_idx].null_mask(); - auto num_rows = out_buffers[parent_col_idx].size; - - for (size_type id = 0; id < p_col.num_children; id++) { - auto const child_col_idx = index + id; - col_meta.parent_column_index[child_col_idx] = parent_col_idx; - if (type == type_id::STRUCT) { - parent_column_data[child_col_idx] = {parent_valid_map, parent_null_count}; - // Number of rows in child will remain same as parent in case of struct column - num_child_rows[child_col_idx] = num_rows; - } else { - parent_column_data[child_col_idx] = {nullptr, 0}; - } - } - index += p_col.num_children; - }); -} - -/** - * @brief struct to store buffer data and size of list buffer - */ -struct list_buffer_data { - size_type* data; - size_type size; -}; - -// Generates offsets for list buffer from number of elements in a row. -void generate_offsets_for_list(host_span buff_data, rmm::cuda_stream_view stream) -{ - for (auto& list_data : buff_data) { - thrust::exclusive_scan(rmm::exec_policy_nosync(stream), - list_data.data, - list_data.data + list_data.size, - list_data.data); - } -} - -} // namespace - -void reader::impl::prepare_data(uint64_t skip_rows, - std::optional const& num_rows_opt, - std::vector> const& stripes) -{ - // Selected columns at different levels of nesting are stored in different elements - // of `selected_columns`; thus, size == 1 means no nested columns - CUDF_EXPECTS(skip_rows == 0 or _selected_columns.num_levels() == 1, - "skip_rows is not supported by nested columns"); - - // There are no columns in the table - if (_selected_columns.num_levels() == 0) { return; } - - global_preprocess(skip_rows, num_rows_opt, stripes); - - if (_file_itm_data.has_no_data()) { return; } - - // TODO: fix this, should be called once - while (_chunk_read_data.more_stripe_to_load()) { - read_data(); - } - - // Fix this, subpass should be call once - _chunk_read_data.curr_load_stripe_chunk = 0; - while (_chunk_read_data.more_stripe_to_load()) { - subpass_preprocess(); - } - - auto const rows_to_skip = _file_itm_data.rows_to_skip; - auto const rows_to_read = _file_itm_data.rows_to_read; - auto const& selected_stripes = _file_itm_data.selected_stripes; - - // Set up table for converting timestamp columns from local to UTC time - auto const tz_table = [&, &selected_stripes = selected_stripes] { - auto const has_timestamp_column = std::any_of( - _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) { - return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) { - return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP; - }); - }); - - return has_timestamp_column ? cudf::detail::make_timezone_transition_table( - {}, selected_stripes[0].stripe_footer->writerTimezone, _stream) - : std::make_unique(); - }(); - - auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; - auto& null_count_prefix_sums = _file_itm_data.null_count_prefix_sums; - auto& lvl_chunks = _file_itm_data.lvl_data_chunks; - - // TODO: move this to global step - lvl_chunks.resize(_selected_columns.num_levels()); - _out_buffers.resize(_selected_columns.num_levels()); - - -// -// -// -// TODO: move this to reader_impl.cu, decomp and decode step - std::size_t num_stripes = selected_stripes.size(); - - // Iterates through levels of nested columns, child column will be one level down - // compared to parent column. 
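// [Editor's note] Editor's toy model of the comment above (not from the
// patch): nested types place children one level below their parents, e.g. a
// LIST<STRUCT<INT>> column spans levels 0 (LIST), 1 (STRUCT), and 2 (INT),
// and each level consumes metadata produced while decoding the level above it.
#include <cstddef>
#include <vector>

// One entry per nesting level; entry i + 1 describes the children of entry i.
void process_levels(std::vector<std::size_t> const& cols_per_level)
{
  for (std::size_t level = 0; level < cols_per_level.size(); ++level) {
    // decode all columns at this level; their child row counts feed level + 1
  }
}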
- auto& col_meta = *_col_meta; - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - auto& columns_level = _selected_columns.levels[level]; - - // TODO: do it in global step - // Association between each ORC column and its cudf::column - std::vector nested_cols; - - // Get a list of column data types - std::vector column_types; - for (auto& col : columns_level) { - auto col_type = to_cudf_type(_metadata.get_col_type(col.id).kind, - _config.use_np_dtypes, - _config.timestamp_type.id(), - to_cudf_decimal_type(_config.decimal128_columns, _metadata, col.id)); - CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); - if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or - col_type == type_id::DECIMAL128) { - // sign of the scale is changed since cuDF follows c++ libraries like CNL - // which uses negative scaling, but liborc and other libraries - // follow positive scaling. - auto const scale = - -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); - column_types.emplace_back(col_type, scale); - } else { - column_types.emplace_back(col_type); - } - - // Map each ORC column to its column - if (col_type == type_id::LIST or col_type == type_id::STRUCT) { - nested_cols.emplace_back(col); - } - } - - auto const num_columns = columns_level.size(); - auto& chunks = lvl_chunks[level]; - chunks = cudf::detail::hostdevice_2dvector(num_stripes, num_columns, _stream); - memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); - - const bool use_index = - _config.use_index && - // Do stripes have row group index - _metadata.is_row_grp_idx_present() && - // Only use if we don't have much work with complete columns & stripes - // TODO: Consider nrows, gpu, and tune the threshold - (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && - _metadata.get_row_index_stride() > 0 && num_columns * num_stripes < 8 * 128) && - // Only use if first row is aligned to a stripe boundary - // TODO: Fix logic to handle unaligned rows - (rows_to_skip == 0); - - // Logically view streams as columns - auto const& stream_info = _file_itm_data.lvl_stream_info[level]; - - null_count_prefix_sums.emplace_back(); - null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); - std::generate_n(std::back_inserter(null_count_prefix_sums.back()), - _selected_columns.levels[level].size(), - [&]() { - return cudf::detail::make_zeroed_device_uvector_async( - num_stripes, _stream, rmm::mr::get_current_device_resource()); - }); - - // Tracker for eventually deallocating compressed and uncompressed data - auto& stripe_data = lvl_stripe_data[level]; - - std::size_t stripe_start_row = 0; - std::size_t num_dict_entries = 0; - std::size_t num_rowgroups = 0; - - // TODO: Stripe and stream idx must be by chunk. 
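// [Editor's note] Hedged sketch of what the TODO above suggests: keep
// stripe/stream indices relative to the current chunk and translate back when
// touching global arrays. The {start_idx, count} shape mirrors the patch;
// to_global is a hypothetical helper.
#include <cstddef>

struct stripe_range {
  std::size_t start_idx;  // first stripe of the chunk, in file order
  std::size_t count;      // number of stripes in the chunk
};

// Translate a chunk-local stripe index into a global stripe index.
constexpr std::size_t to_global(stripe_range r, std::size_t local_idx)
{
  return r.start_idx + local_idx;  // caller guarantees local_idx < r.count
}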
- std::size_t stripe_idx = 0; - std::size_t stream_idx = 0; - - // std::vector, std::size_t>> read_tasks; - for (auto const& stripe : selected_stripes) { - auto const stripe_info = stripe.stripe_info; - auto const stripe_footer = stripe.stripe_footer; - - auto const total_data_size = gather_stream_info_and_column_desc(stripe_idx, - level, - stripe_info, - stripe_footer, - col_meta.orc_col_map[level], - _metadata.get_types(), - use_index, - level == 0, - &num_dict_entries, - &stream_idx, - std::nullopt, // stream_info - &chunks); - - auto const is_stripe_data_empty = total_data_size == 0; - CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, - "Invalid index rowgroup stream data"); - - auto dst_base = static_cast(stripe_data[stripe_idx].data()); - - auto const num_rows_per_stripe = stripe_info->numberOfRows; - auto const rowgroup_id = num_rowgroups; - auto stripe_num_rowgroups = 0; - if (use_index) { - stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / - _metadata.get_row_index_stride(); - } - // Update chunks to reference streams pointers - for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { - auto& chunk = chunks[stripe_idx][col_idx]; - // start row, number of rows in a each stripe and total number of rows - // may change in lower levels of nesting - chunk.start_row = (level == 0) - ? stripe_start_row - : col_meta.child_start_row[stripe_idx * num_columns + col_idx]; - chunk.num_rows = (level == 0) - ? stripe_info->numberOfRows - : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; - chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx]; - chunk.parent_validity_info = - (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx]; - chunk.parent_null_count_prefix_sums = - (level == 0) - ? nullptr - : null_count_prefix_sums[level - 1][col_meta.parent_column_index[col_idx]].data(); - chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind; - chunk.type_kind = - _metadata.per_file_metadata[stripe.source_idx].ff.types[columns_level[col_idx].id].kind; - // num_child_rows for a struct column will be same, for other nested types it will be - // calculated. - chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; - chunk.dtype_id = column_types[col_idx].id(); - chunk.decimal_scale = _metadata.per_file_metadata[stripe.source_idx] - .ff.types[columns_level[col_idx].id] - .scale.value_or(0); - - chunk.rowgroup_id = rowgroup_id; - chunk.dtype_len = (column_types[col_idx].id() == type_id::STRING) - ? sizeof(string_index_pair) - : ((column_types[col_idx].id() == type_id::LIST) or - (column_types[col_idx].id() == type_id::STRUCT)) - ? 
sizeof(size_type) - : cudf::size_of(column_types[col_idx]); - chunk.num_rowgroups = stripe_num_rowgroups; - if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _config.timestamp_type.id(); } - if (not is_stripe_data_empty) { - for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { - chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; - } - } - } - stripe_start_row += num_rows_per_stripe; - num_rowgroups += stripe_num_rowgroups; - - stripe_idx++; - } - - if (stripe_data.empty()) { continue; } - - // Process dataset chunk pages into output columns - auto row_groups = - cudf::detail::hostdevice_2dvector(num_rowgroups, num_columns, _stream); - if (level > 0 and row_groups.size().first) { - cudf::host_span row_groups_span(row_groups.base_host_ptr(), - num_rowgroups * num_columns); - auto& rw_grp_meta = col_meta.rwgrp_meta; - - // Update start row and num rows per row group - std::transform(rw_grp_meta.begin(), - rw_grp_meta.end(), - row_groups_span.begin(), - rw_grp_meta.begin(), - [&](auto meta, auto& row_grp) { - row_grp.num_rows = meta.num_rows; - row_grp.start_row = meta.start_row; - return meta; - }); - } - // Setup row group descriptors if using indexes - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto decomp_data = decompress_stripe_data(_file_itm_data.compinfo_map, - *_metadata.per_file_metadata[0].decompressor, - stripe_data, - stream_info, - chunks, - row_groups, - num_stripes, - _metadata.get_row_index_stride(), - level == 0, - _stream); - stripe_data.clear(); - stripe_data.push_back(std::move(decomp_data)); - } else { - if (row_groups.size().first) { - chunks.host_to_device_async(_stream); - row_groups.host_to_device_async(_stream); - row_groups.host_to_device_async(_stream); - gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), - nullptr, - chunks.base_device_ptr(), - num_columns, - num_stripes, - num_rowgroups, - _metadata.get_row_index_stride(), - level == 0, - _stream); - } - } - - for (std::size_t i = 0; i < column_types.size(); ++i) { - bool is_nullable = false; - for (std::size_t j = 0; j < num_stripes; ++j) { - if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { - is_nullable = true; - break; - } - } - auto is_list_type = (column_types[i].id() == type_id::LIST); - auto n_rows = (level == 0) ? 
rows_to_read : col_meta.num_child_rows[i]; - // For list column, offset column will be always size + 1 - if (is_list_type) n_rows++; - _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr); - } - - decode_stream_data(num_dict_entries, - rows_to_skip, - _metadata.get_row_index_stride(), - level, - tz_table->view(), - chunks, - row_groups, - _out_buffers[level], - _stream, - _mr); - - if (nested_cols.size()) { - // Extract information to process nested child columns - scan_null_counts(chunks, null_count_prefix_sums[level], _stream); - - row_groups.device_to_host_sync(_stream); - aggregate_child_meta( - level, _selected_columns, chunks, row_groups, nested_cols, _out_buffers[level], col_meta); - - // ORC stores number of elements at each row, so we need to generate offsets from that - std::vector buff_data; - std::for_each( - _out_buffers[level].begin(), _out_buffers[level].end(), [&buff_data](auto& out_buffer) { - if (out_buffer.type.id() == type_id::LIST) { - auto data = static_cast(out_buffer.data()); - buff_data.emplace_back(list_buffer_data{data, out_buffer.size}); - } - }); - - if (not buff_data.empty()) { generate_offsets_for_list(buff_data, _stream); } - } - } // end loop level -} - -} // namespace cudf::io::orc::detail From de6b717c9e4d04d1e93917d72957ba237dc46328 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 20 Feb 2024 11:13:31 -0800 Subject: [PATCH 073/321] Fix compilation Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 30 +++++------- cpp/src/io/orc/reader_impl_chunking.hpp | 62 ++++++++++-------------- cpp/src/io/orc/reader_impl_preprocess.cu | 7 +-- 3 files changed, 43 insertions(+), 56 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index ccb6d3dc4e8..ff875e2050c 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -92,13 +92,11 @@ std::size_t gather_stream_info(std::size_t stripe_index, } if (col != -1) { - stream_info.emplace_back(stripeinfo->offset + src_offset, - dst_offset, - stream.length, - stream_id_info{stripe_index, - level, - column_id, - stream.kind}); + stream_info.emplace_back( + stripeinfo->offset + src_offset, + dst_offset, + stream.length, + stream_id_info{static_cast(stripe_index), level, column_id, stream.kind}); dst_offset += stream.length; } src_offset += stream.length; @@ -276,13 +274,12 @@ void reader::impl::global_preprocess(uint64_t skip_rows, lvl_stripe_data.resize(_selected_columns.num_levels()); lvl_stripe_sizes.resize(_selected_columns.num_levels()); - auto& read_info = _file_itm_data.data_read_info; - auto& stripe_data_read_chunks = _file_itm_data.stripe_data_read_chunks; - auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_chunks; + auto& read_info = _file_itm_data.data_read_info; + auto& stripe_data_read_chunks = _file_itm_data.stripe_data_read_chunks; + auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_chunks; // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. 
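// [Editor's note] What the `stream_id_map` alias used below expands to, as a
// stand-alone sketch: an unordered_map keyed by a compound stream id through
// user-provided hash/equality functors. Fields are abbreviated from the patch
// and the XOR hash is illustrative, not authoritative.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <unordered_map>

struct stream_id {
  uint32_t stripe_idx;
  std::size_t level;
  uint32_t orc_col_idx;
  int kind;

  struct hash {
    std::size_t operator()(stream_id const& id) const
    {
      return std::hash<uint32_t>{}(id.stripe_idx) ^ std::hash<std::size_t>{}(id.level) ^
             std::hash<uint32_t>{}(id.orc_col_idx) ^ std::hash<int>{}(id.kind);
    }
  };
  struct equal_to {
    bool operator()(stream_id const& a, stream_id const& b) const
    {
      return a.stripe_idx == b.stripe_idx && a.level == b.level &&
             a.orc_col_idx == b.orc_col_idx && a.kind == b.kind;
    }
  };
};

template <typename T>
using stream_id_map = std::unordered_map<stream_id, T, stream_id::hash, stream_id::equal_to>;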
- std::unordered_map - stream_compinfo_map; + stream_id_map stream_compinfo_map; // Logically view streams as columns _file_itm_data.lvl_stream_info.resize(_selected_columns.num_levels()); @@ -463,7 +460,7 @@ void reader::impl::read_data() std::vector, std::size_t>> read_tasks; auto const& stripe_data_read_chunks = _file_itm_data.stripe_data_read_chunks; - auto const [read_begin, read_end] = get_range(stripe_data_read_chunks, stripe_chunk); + auto const [read_begin, read_end] = get_range(stripe_data_read_chunks, stripe_chunk); for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) { auto const& read = read_info[read_idx]; @@ -506,8 +503,7 @@ void reader::impl::subpass_preprocess() // TODO: This is subpass // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. - std::unordered_map - stream_compinfo_map; + stream_id_map stream_compinfo_map; // TODO: fix this, loop only current chunk auto const stripe_chunk = @@ -546,7 +542,7 @@ void reader::impl::subpass_preprocess() static_cast(stripe_data[info.id.stripe_idx].data()) + info.dst_pos, info.length)); stream_compinfo_map[stream_id_info{ - info.id.stripe_idx, info.id.level, info.id.orc_cold_idx, info.id.kind}] = + info.id.stripe_idx, info.id.level, info.id.orc_col_idx, info.id.kind}] = &compinfo[compinfo.size() - 1]; #ifdef PRINT_DEBUG printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n", @@ -575,7 +571,7 @@ void reader::impl::subpass_preprocess() compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, stream_compinfo->num_uncompressed_blocks, stream_compinfo->max_uncompressed_size}; - stripe_decomp_sizes[stream_id.id.stripe_idx - stripe_chunk.start_idx].size_bytes += + stripe_decomp_sizes[stream_id.stripe_idx - stripe_chunk.start_idx].size_bytes += stream_compinfo->max_uncompressed_size; #ifdef PRINT_DEBUG printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n", diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 2d07cae3214..0dbfde47363 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -36,9 +36,9 @@ namespace cudf::io::orc::detail { struct stream_id_info { uint32_t stripe_idx; // global stripe id throughout the data source // TODO: change type below - std::size_t level; // level of the nested column - uint32_t orc_col_idx; // orc column id - StreamKind kind; // stream kind + std::size_t level; // level of the nested column + uint32_t orc_col_idx; // orc column id + StreamKind kind; // stream kind struct hash { std::size_t operator()(stream_id_info const& id) const @@ -60,13 +60,13 @@ struct stream_id_info { /** * @brief Map to lookup a value from stream id. -*/ -template + */ +template using stream_id_map = std::unordered_map; /** - * @brief Struct that store identification of an ORC streams. + * @brief Struct that store identification of an ORC stream. */ struct orc_stream_info { // TODO: remove constructor @@ -74,10 +74,7 @@ struct orc_stream_info { std::size_t dst_pos_, uint32_t length_, stream_id_info const& id_) - : offset(offset_), - dst_pos(dst_pos_), - length(length_), - id(id_) + : offset(offset_), dst_pos(dst_pos_), length(length_), id(id_) { #ifdef PRINT_DEBUG printf(" construct stripe id [%d, %d, %d, %d]\n", @@ -126,11 +123,10 @@ struct range { * @brief Struct to store file-level data that remains constant for all chunks being output. 
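 *
 * (Editor's sketch, not part of the patch, assuming the members declared just
 * below:)
 * @code
 * file_intermediate_data itm = make_file_itm_data();  // hypothetical factory
 * if (itm.has_no_data()) { return; }  // no rows or no stripes were selected
 * @endcode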
*/ struct file_intermediate_data { - int64_t rows_to_skip; + int64_t rows_to_skip; size_type rows_to_read; std::vector selected_stripes; - // Return true if no rows or stripes to read. bool has_no_data() const { return rows_to_read == 0 || selected_stripes.empty(); } @@ -146,18 +142,16 @@ struct file_intermediate_data { // This is used to initialize the stripe_data buffers. std::vector> lvl_stripe_sizes; - - // Store information to identify where to read a chunk of data from source. // Each read corresponds to one or more consecutive streams combined. struct data_read_info { // TODO: remove constructor data_read_info(uint64_t offset_, - std::size_t length_, - std::size_t dst_pos_, - std::size_t source_idx_, - std::size_t stripe_idx_, - std::size_t level_) + std::size_t length_, + std::size_t dst_pos_, + std::size_t source_idx_, + std::size_t stripe_idx_, + std::size_t level_) : offset(offset_), length(length_), dst_pos(dst_pos_), @@ -166,40 +160,35 @@ struct file_intermediate_data { level(level_) { } - uint64_t offset; // offset in data source - std::size_t dst_pos; // offset to store data in memory relative to start of raw stripe data - std::size_t length; // data length to read - std::size_t source_idx; // the data source id - std::size_t stripe_idx; // stream id TODO: processing or source stripe id? - std::size_t level; // nested level + uint64_t offset; // offset in data source + std::size_t dst_pos; // offset to store data in memory relative to start of raw stripe data + std::size_t length; // data length to read + std::size_t source_idx; // the data source id + std::size_t stripe_idx; // stream id TODO: processing or source stripe id? + std::size_t level; // nested level }; - // Identify what data to read from source. + // Identify what data to read from source. std::vector data_read_info; // For each stripe, we perform a number of read for its streams. // Those reads are identified by a chunk of consecutive read info, stored in data_read_info. std::vector stripe_data_read_chunks; - // Store info for each ORC stream at each nested level. std::vector> lvl_stream_info; + // At each nested level, the streams for each stripe are stored consecutively in lvl_stream_info. + // This is used to identify the range of streams for each stripe from that vector. + std::vector> lvl_stripe_stream_chunks; -// At each nested level, the streams for each stripe are stored consecutively in lvl_stream_info. -// This is used to identify the range of streams for each stripe from that vector. - std::vector> lvl_stripe_stream_chunks; - - -// TODO + // TODO std::vector>> null_count_prefix_sums; // For data processing, decompression, and decoding. // Each 'chunk' of data here corresponds to an orc column, in a stripe, at a nested level. std::vector> lvl_data_chunks; - - bool global_preprocessed{false}; }; @@ -216,7 +205,8 @@ struct chunk_read_data { std::size_t data_read_limit; // approximate maximum size (in bytes) used for store // intermediate data, or 0 for no limit - // Chunks of stripes that can be load into memory such that their data size is within a size limit. + // Chunks of stripes that can be load into memory such that their data size is within a size + // limit. 
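// [Editor's note] A minimal, self-contained sketch of how such chunks can be
// formed for the `load_stripe_chunks` member just below: greedily accumulate
// per-stripe sizes and cut a new chunk whenever the limit would be exceeded.
// The patch's `find_splits` computes the same thing over prefix sums on the
// device; this host version is illustrative only.
#include <cstddef>
#include <vector>

struct size_chunk {
  std::size_t start_idx;
  std::size_t count;
};

std::vector<size_chunk> split_by_size_limit(std::vector<std::size_t> const& stripe_sizes,
                                            std::size_t size_limit)
{
  std::vector<size_chunk> chunks;
  std::size_t begin = 0, accumulated = 0;
  for (std::size_t i = 0; i < stripe_sizes.size(); ++i) {
    // Close the current chunk if this stripe would break the limit, but never
    // emit an empty chunk: an oversized stripe becomes a chunk of its own.
    if (accumulated + stripe_sizes[i] > size_limit && i > begin) {
      chunks.push_back({begin, i - begin});
      begin       = i;
      accumulated = 0;
    }
    accumulated += stripe_sizes[i];
  }
  if (begin < stripe_sizes.size()) { chunks.push_back({begin, stripe_sizes.size() - begin}); }
  return chunks;
}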
std::vector load_stripe_chunks; std::size_t curr_load_stripe_chunk{0}; bool more_stripe_to_load() const { return curr_load_stripe_chunk < load_stripe_chunks.size(); } diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu index 2d0d01e056b..e22cfc9a6d2 100644 --- a/cpp/src/io/orc/reader_impl_preprocess.cu +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -193,12 +193,13 @@ rmm::device_buffer decompress_stripe_data( // printf("line %d\n", __LINE__); // fflush(stdout); - auto const& cached_comp_info = - compinfo_map.at(stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_cold_idx, info.id.kind}); + auto const& cached_comp_info = compinfo_map.at( + stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_col_idx, info.id.kind}); // printf("line %d\n", __LINE__); // fflush(stdout); // auto const& cached_comp_info = - // compinfo_map[stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_cold_idx, info.id.kind}]; + // compinfo_map[stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_cold_idx, + // info.id.kind}]; auto& stream_comp_info = compinfo[compinfo.size() - 1]; stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks; From c75daeb0dddb3301c7acc34f9d31f5b3762b65f9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 20 Feb 2024 11:25:59 -0800 Subject: [PATCH 074/321] Reformat Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 4 ++-- cpp/src/io/orc/reader_impl.hpp | 29 ++++++++++++++--------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 48d110c61bd..0e3987f2cd5 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -38,8 +38,8 @@ reader::impl::impl(std::vector>&& sources, } reader::impl::impl(std::size_t output_size_limit, - std::size_t data_read_limit, - std::vector>&& sources, + std::size_t data_read_limit, + std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index d131d907fa1..2447346f2a0 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -84,12 +84,10 @@ class reader::impl { std::optional const& num_rows_opt, std::vector> const& stripes); - - private: /** * @brief Perform all the necessary data preprocessing before creating an output table. - * + * * This is the proxy to call all other data preprocessing functions, which are prerequisite * for generating an output table. * @@ -104,42 +102,43 @@ class reader::impl { /** * @brief Perform a global preprocessing step that executes exactly once for the entire duration * of the reader. - * - * In this step, the metadata of all stripes in the data source is parsed, and information about - * data streams for all selected columns in alls tripes are generated. If the reader has a data + * + * In this step, the metadata of all stripes in the data source is parsed, and information about + * data streams for all selected columns in alls tripes are generated. If the reader has a data * read limit, data size of all stripes are used to determine the chunks of consecutive - * stripes for reading each time using the `read_data()` step. This is to ensure that loading + * stripes for reading each time using the `read_data()` step. 
This is to ensure that loading * these stripes will not exceed a fixed portion the data read limit. - */ + */ void global_preprocess(uint64_t skip_rows, std::optional const& num_rows_opt, std::vector> const& stripes); /** * @brief Read stripes from the input source and store the data in the internal buffers. - * + * * If there is a data read limit, only a chunk of stripes are read at a time such that * their total data size does not exceed a fixed portion of the limit. Then, the data is * probed to determine the uncompressed sizes for these loaded stripes, which are in turn - * used to determine a subset of stripes to decompress and decode in the next step + * used to determine a subset of stripes to decompress and decode in the next step * `decompress_and_decode()`. - * This is to ensure that loading data together with decompression and decoding will not exceed + * This is to ensure that loading data together with decompression and decoding will not exceed * the data read limit. */ void read_data(); /** * TODO: merge with read data. - */ + */ void subpass_preprocess(); /** * @brief Decompress and decode the data in the internal buffers, and store the result into * an internal table. - * + * * If there is a data read limit, only a chunk of stripes are decompressed and decoded at a time. - * Then, the result is stored in an internal table, and sizes of its rows are computed - * to determine slices of rows to return as the output table in the final step `make_output_chunk`. + * Then, the result is stored in an internal table, and sizes of its rows are computed + * to determine slices of rows to return as the output table in the final step + * `make_output_chunk`. */ void decompress_and_decode(); From 3ae55fad33b8a7aba4a7cf3c8bf518cf8bee7756 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 20 Feb 2024 14:44:52 -0800 Subject: [PATCH 075/321] Fix style Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index a97ec081570..4e083d714ee 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -158,10 +158,10 @@ class reader::impl { // Reader configs struct { - data_type timestamp_type; // Override output timestamp resolution - bool use_index; // Enable or disable attempt to use row index for parsing - bool use_np_dtypes; // Enable or disable the conversion to numpy-compatible dtypes - std::vector decimal128_columns; // Control decimals conversion + data_type timestamp_type; // Override output timestamp resolution + bool use_index; // Enable or disable attempt to use row index for parsing + bool use_np_dtypes; // Enable or disable the conversion to numpy-compatible dtypes + std::vector decimal128_columns; // Control decimals conversion } const _config; // Intermediate data for internal processing. 
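[Editor's note between patches: the commits above and below keep reshaping
reader::impl::prepare_data(). The control flow they converge on, written as a
hedged stand-alone sketch (reader_pass and its members are hypothetical
stand-ins for reader::impl; only the call order mirrors the patches):]

struct reader_pass {
  int stripe_chunks_left = 2;      // planned once by global_preprocess()

  void global_preprocess() {}      // parse metadata, plan chunks of stripes
  bool more_stripe_to_load() { return stripe_chunks_left-- > 0; }
  void load_data() {}              // read one chunk of stripes from the source
  void decompress_and_decode() {}  // currently once; later once per decode chunk
};

void run_pass(reader_pass& r)
{
  r.global_preprocess();
  while (r.more_stripe_to_load()) {
    r.load_data();
  }
  r.decompress_and_decode();
}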
From 0188cc760221224a13d1735bf32aa306a2925847 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 20 Feb 2024 15:17:45 -0800 Subject: [PATCH 076/321] Move code around Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 82 ++++++++++++++------------ cpp/src/io/orc/reader_impl_chunking.cu | 5 ++ 2 files changed, 48 insertions(+), 39 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index a20456458bf..8384a9937bc 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -695,6 +695,13 @@ void reader::impl::prepare_data(uint64_t skip_rows, read_data(); } + decompress_and_decode(); +} + +// TODO: this should be called per chunk of stripes. +void reader::impl::decompress_and_decode() +{ + if (_file_itm_data.has_no_data()) { return; } auto const rows_to_skip = _file_itm_data.rows_to_skip; auto const rows_to_read = _file_itm_data.rows_to_read; auto const& selected_stripes = _file_itm_data.selected_stripes; @@ -988,45 +995,6 @@ void reader::impl::prepare_data(uint64_t skip_rows, } // end loop level } -reader::impl::impl(std::vector>&& sources, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : reader::impl::impl(0UL, 0UL, std::move(sources), options, stream, mr) -{ -} - -reader::impl::impl(std::size_t output_size_limit, - std::size_t data_read_limit, - std::vector>&& sources, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _stream(stream), - _mr(mr), - _config{options.get_timestamp_type(), - options.is_enabled_use_index(), - options.is_enabled_use_np_dtypes(), - options.get_decimal128_columns()}, - _col_meta{std::make_unique()}, - _sources(std::move(sources)), - _metadata{_sources, stream}, - _selected_columns{_metadata.select_columns(options.get_columns())}, - _chunk_read_data{output_size_limit, data_read_limit} -{ -} - -table_with_metadata reader::impl::read(uint64_t skip_rows, - std::optional const& num_rows_opt, - std::vector> const& stripes) -{ - prepare_data(skip_rows, num_rows_opt, stripes); - return make_output_chunk(); -} - -// TODO: move code here -void reader::impl::decompress_and_decode() {} - table_with_metadata reader::impl::make_output_chunk() { // There is no columns in the table. 
@@ -1118,6 +1086,42 @@ table_metadata reader::impl::make_output_metadata() return out_metadata; } +reader::impl::impl(std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : reader::impl::impl(0UL, 0UL, std::move(sources), options, stream, mr) +{ +} + +reader::impl::impl(std::size_t output_size_limit, + std::size_t data_read_limit, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _stream(stream), + _mr(mr), + _config{options.get_timestamp_type(), + options.is_enabled_use_index(), + options.is_enabled_use_np_dtypes(), + options.get_decimal128_columns()}, + _col_meta{std::make_unique()}, + _sources(std::move(sources)), + _metadata{_sources, stream}, + _selected_columns{_metadata.select_columns(options.get_columns())}, + _chunk_read_data{output_size_limit, data_read_limit} +{ +} + +table_with_metadata reader::impl::read(uint64_t skip_rows, + std::optional const& num_rows_opt, + std::vector> const& stripes) +{ + prepare_data(skip_rows, num_rows_opt, stripes); + return make_output_chunk(); +} + // Forward to implementation reader::reader(std::vector>&& sources, orc_reader_options const& options, diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index ef17adc6e64..2bfedea7506 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -337,6 +337,11 @@ void reader::impl::global_preprocess(uint64_t skip_rows, stripe_data_read_chunks.resize(num_stripes); lvl_stripe_stream_chunks.resize(_selected_columns.num_levels()); + // TODO: move this + auto& lvl_chunks = _file_itm_data.lvl_data_chunks; + lvl_chunks.resize(_selected_columns.num_levels()); + _out_buffers.resize(_selected_columns.num_levels()); + // TODO: Check if these data depends on pass and subpass, instead of global pass. // Prepare data. // Iterates through levels of nested columns, child column will be one level down From 25d810a3e0a1c897d74635983b46f7b5275a64f7 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 9 Feb 2024 16:11:03 +0700 Subject: [PATCH 077/321] Rename function Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 2 +- cpp/src/io/orc/reader_impl.hpp | 6 +++--- cpp/src/io/orc/reader_impl_chunking.cu | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 8384a9937bc..718029da652 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -692,7 +692,7 @@ void reader::impl::prepare_data(uint64_t skip_rows, // TODO: fix this, should be called once _chunk_read_data.curr_load_stripe_chunk = 0; while (_chunk_read_data.more_stripe_to_load()) { - read_data(); + load_data(); } decompress_and_decode(); diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 4e083d714ee..67454eb0378 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -106,7 +106,7 @@ class reader::impl { * In this step, the metadata of all stripes in the data source is parsed, and information about * data streams for all selected columns in alls tripes are generated. If the reader has a data * read limit, data size of all stripes are used to determine the chunks of consecutive - * stripes for reading each time using the `read_data()` step. This is to ensure that loading + * stripes for reading each time using the `load_data()` step. 
This is to ensure that loading * these stripes will not exceed a fixed portion the data read limit. */ void global_preprocess(uint64_t skip_rows, @@ -114,7 +114,7 @@ class reader::impl { std::vector> const& stripes); /** - * @brief Read stripes from the input source and store the data in the internal buffers. + * @brief Load stripes from the input source and store the data in the internal buffers. * * If there is a data read limit, only a chunk of stripes are read at a time such that * their total data size does not exceed a fixed portion of the limit. Then, the data is @@ -124,7 +124,7 @@ class reader::impl { * This is to ensure that loading data together with decompression and decoding will not exceed * the data read limit. */ - void read_data(); + void load_data(); /** * @brief Decompress and decode the data in the internal buffers, and store the result into diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 2bfedea7506..d3a04b7cec5 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -487,7 +487,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, } // Load each chunk from `load_stripe_chunks`. -void reader::impl::read_data() +void reader::impl::load_data() { if (_file_itm_data.has_no_data()) { return; } From 21b2a9a29a85cddeb736ed94762cd9a52a6171d3 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 20 Feb 2024 15:27:10 -0800 Subject: [PATCH 078/321] Remove redundant code # Conflicts: # cpp/src/io/orc/reader_impl.cu --- cpp/src/io/orc/reader_impl.cu | 1 - cpp/src/io/orc/reader_impl_chunking.cu | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 718029da652..ebb6cefb709 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -687,7 +687,6 @@ void reader::impl::prepare_data(uint64_t skip_rows, global_preprocess(skip_rows, num_rows_opt, stripes); - if (_file_itm_data.has_no_data()) { return; } // TODO: fix this, should be called once _chunk_read_data.curr_load_stripe_chunk = 0; diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index d3a04b7cec5..5a467f45768 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -459,9 +459,11 @@ void reader::impl::global_preprocess(uint64_t skip_rows, } // DEBUG only + // TODO: use 0.3 constant _chunk_read_data.data_read_limit = total_stripe_sizes[total_stripe_sizes.size() - 1].size_bytes / 3; + _chunk_read_data.load_stripe_chunks = find_splits(total_stripe_sizes, num_stripes, _chunk_read_data.data_read_limit); From a5622c6ac36aff9b2eabce880449f3e7cb3a71c9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 10 Feb 2024 15:14:45 +0700 Subject: [PATCH 079/321] Add comments --- cpp/src/io/orc/reader_impl.cu | 1 - cpp/src/io/orc/reader_impl_chunking.cu | 30 ++++++++++++++++++++------ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index ebb6cefb709..84eec8526af 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -687,7 +687,6 @@ void reader::impl::prepare_data(uint64_t skip_rows, global_preprocess(skip_rows, num_rows_opt, stripes); - // TODO: fix this, should be called once _chunk_read_data.curr_load_stripe_chunk = 0; while (_chunk_read_data.more_stripe_to_load()) { diff --git a/cpp/src/io/orc/reader_impl_chunking.cu 
b/cpp/src/io/orc/reader_impl_chunking.cu index 5a467f45768..1f56cb0b4cd 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -152,11 +152,17 @@ std::size_t gather_stream_info_and_column_desc( namespace { +/** + * @brief Struct to accummulate sizes of chunks of some data such as stripe or rows. + */ struct cumulative_size { int64_t count{0}; std::size_t size_bytes{0}; }; +/** + * @brief Functor to sum up cummulative sizes. + */ struct cumulative_size_sum { __device__ cumulative_size operator()(cumulative_size const& a, cumulative_size const& b) const { @@ -165,6 +171,10 @@ struct cumulative_size_sum { }; #if 1 +/** + * @brief Find the splits of the input data such that each split has cummulative size less than a + * given `size_limit`. + */ std::vector find_splits(host_span sizes, int64_t total_count, size_t size_limit) @@ -225,6 +235,15 @@ std::vector find_splits(host_span sizes, #endif #ifdef PRINT_DEBUG +/** + * @brief Verify the splits, checking if they are correct. + * + * We need to verify that: + * 1. All chunk must have count > 0 + * 2. Chunks are continuous. + * 3. sum(all sizes in a chunk) < size_limit + * 4. sum(all counts in all chunks) == total_count. + */ void verify_splits(host_span splits, host_span sizes, size_type total_count, @@ -271,11 +290,11 @@ void verify_splits(host_span splits, #endif /** - * @brief + * @brief Find range of the data span by a given chunk of chunks. * - * @param input_chunks - * @param selected_chunks - * @return + * @param input_chunks The list of all data chunks + * @param selected_chunks A chunk of chunks in the input_chunks + * @return The range of data span by the selected chunk of given chunks */ std::pair get_range(std::vector const& input_chunks, chunk const& selected_chunks) @@ -284,7 +303,7 @@ std::pair get_range(std::vector const& input_chunks, auto const chunk_begin = selected_chunks.start_idx; auto const chunk_end = selected_chunks.start_idx + selected_chunks.count; - // The first and last chunk, according to selected_chunk + // The first and last chunk, according to selected_chunk. auto const& first_chunk = input_chunks[chunk_begin]; auto const& last_chunk = input_chunks[chunk_end - 1]; @@ -463,7 +482,6 @@ void reader::impl::global_preprocess(uint64_t skip_rows, _chunk_read_data.data_read_limit = total_stripe_sizes[total_stripe_sizes.size() - 1].size_bytes / 3; - _chunk_read_data.load_stripe_chunks = find_splits(total_stripe_sizes, num_stripes, _chunk_read_data.data_read_limit); From b3be9650c5abfa49e6219869cbedff3711b22d68 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 20 Feb 2024 18:46:03 -0800 Subject: [PATCH 080/321] Fix spell Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 1f56cb0b4cd..4acfe998c57 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -161,7 +161,7 @@ struct cumulative_size { }; /** - * @brief Functor to sum up cummulative sizes. + * @brief Functor to sum up cumulative sizes. 
*/ struct cumulative_size_sum { __device__ cumulative_size operator()(cumulative_size const& a, cumulative_size const& b) const From 782b99a123ab482e2b79bbf367094cc2db09fe1e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 20 Feb 2024 19:14:55 -0800 Subject: [PATCH 081/321] Fix spell Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 4acfe998c57..d8182102a2b 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -172,7 +172,7 @@ struct cumulative_size_sum { #if 1 /** - * @brief Find the splits of the input data such that each split has cummulative size less than a + * @brief Find the splits of the input data such that each split has cumulative size less than a * given `size_limit`. */ std::vector find_splits(host_span sizes, From 97717f9aa7ed0a1992fad4c36f68183ff32211a0 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 20 Feb 2024 19:50:33 -0800 Subject: [PATCH 082/321] Move code around Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 49 +++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 84eec8526af..95a586e49c0 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -673,29 +673,6 @@ void generate_offsets_for_list(host_span buff_data, rmm::cuda_ } // namespace -void reader::impl::prepare_data(uint64_t skip_rows, - std::optional const& num_rows_opt, - std::vector> const& stripes) -{ - // Selected columns at different levels of nesting are stored in different elements - // of `selected_columns`; thus, size == 1 means no nested columns - CUDF_EXPECTS(skip_rows == 0 or _selected_columns.num_levels() == 1, - "skip_rows is not supported by nested columns"); - - // There are no columns in the table - if (_selected_columns.num_levels() == 0) { return; } - - global_preprocess(skip_rows, num_rows_opt, stripes); - - // TODO: fix this, should be called once - _chunk_read_data.curr_load_stripe_chunk = 0; - while (_chunk_read_data.more_stripe_to_load()) { - load_data(); - } - - decompress_and_decode(); -} - // TODO: this should be called per chunk of stripes. void reader::impl::decompress_and_decode() { @@ -993,6 +970,32 @@ void reader::impl::decompress_and_decode() } // end loop level } +void reader::impl::prepare_data(uint64_t skip_rows, + std::optional const& num_rows_opt, + std::vector> const& stripes) +{ + // Selected columns at different levels of nesting are stored in different elements + // of `selected_columns`; thus, size == 1 means no nested columns + CUDF_EXPECTS(skip_rows == 0 or _selected_columns.num_levels() == 1, + "skip_rows is not supported by nested columns"); + + // There are no columns in the table. + if (_selected_columns.num_levels() == 0) { return; } + + // Perform a global preprocessing step for the entire input sources. + global_preprocess(skip_rows, num_rows_opt, stripes); + + // TODO: fix this, should be called once + // TODO: only load data if needed. + _chunk_read_data.curr_load_stripe_chunk = 0; + while (_chunk_read_data.more_stripe_to_load()) { + load_data(); + } + + // TODO: only do if needed. + decompress_and_decode(); +} + table_with_metadata reader::impl::make_output_chunk() { // There is no columns in the table. 
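[Editor's note: the `cumulative_size_sum` functor above pairs with an inclusive
scan that turns per-stripe sizes into running totals for find_splits to search.
Hedged host-side sketch of that pattern; the patch uses a thrust scan on the
device, and `cum_size`/`running_totals` here are illustrative stand-ins:]

#include <cstddef>
#include <cstdint>
#include <numeric>
#include <vector>

struct cum_size {
  int64_t count{0};
  std::size_t size_bytes{0};
};

std::vector<cum_size> running_totals(std::vector<cum_size> sizes)
{
  std::inclusive_scan(sizes.begin(), sizes.end(), sizes.begin(),
                      [](cum_size const& a, cum_size const& b) {
                        return cum_size{a.count + b.count, a.size_bytes + b.size_bytes};
                      });
  return sizes;  // sizes[i] now totals stripes [0, i]
}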
From 6fbb424707b76388e244d8ba6779604b773e1001 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 20 Feb 2024 20:58:39 -0800 Subject: [PATCH 083/321] Remove unused var Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index d8182102a2b..d8569ccb2c0 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -344,9 +344,6 @@ void reader::impl::global_preprocess(uint64_t skip_rows, auto& stripe_data_read_chunks = _file_itm_data.stripe_data_read_chunks; auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_chunks; - // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. - stream_id_map stream_compinfo_map; - // Logically view streams as columns _file_itm_data.lvl_stream_info.resize(_selected_columns.num_levels()); From 99f0374539dc191a13d67b6dfb763c5127e9a833 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 21 Feb 2024 12:50:34 -0800 Subject: [PATCH 084/321] Misc Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 1 - cpp/src/io/orc/reader_impl.hpp | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 95a586e49c0..be79c4901e2 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -987,7 +987,6 @@ void reader::impl::prepare_data(uint64_t skip_rows, // TODO: fix this, should be called once // TODO: only load data if needed. - _chunk_read_data.curr_load_stripe_chunk = 0; while (_chunk_read_data.more_stripe_to_load()) { load_data(); } diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 67454eb0378..5ad0e49a889 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -103,6 +103,8 @@ class reader::impl { * @brief Perform a global preprocessing step that executes exactly once for the entire duration * of the reader. * + * TODO: rewrite, not use "ensure". + * * In this step, the metadata of all stripes in the data source is parsed, and information about * data streams for all selected columns in alls tripes are generated. 
If the reader has a data * read limit, data size of all stripes are used to determine the chunks of consecutive From f10467c8aa8bb334489f01220b240b90a6de0137 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 21 Feb 2024 14:20:25 -0800 Subject: [PATCH 085/321] Implement chunking interface Signed-off-by: Nghia Truong --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/io/detail/orc.hpp | 71 +++++++++++++++++++++++++++- cpp/include/cudf/io/orc.hpp | 74 ++++++++++++++++++++++++++++++ cpp/src/io/functions.cpp | 42 +++++++++++++++++ cpp/src/io/orc/reader.cu | 56 ++++++++++++++++++++++ cpp/src/io/orc/reader_impl.cu | 28 ++++++----- cpp/src/io/orc/reader_impl.hpp | 21 +++++---- 7 files changed, 269 insertions(+), 24 deletions(-) create mode 100644 cpp/src/io/orc/reader.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bc836eb393a..f8d794e3334 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -386,6 +386,7 @@ add_library( src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu src/io/orc/orc.cpp + src/io/orc/reader.cu src/io/orc/reader_impl.cu src/io/orc/reader_impl_chunking.cu src/io/orc/reader_impl_helpers.cpp diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index a0bf8b24b80..012b06d338a 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -41,10 +41,15 @@ namespace orc::detail { * @brief Class to read ORC dataset data into columns. */ class reader { - private: + protected: class impl; std::unique_ptr _impl; + /** + * @brief Default constructor, needed for subclassing. + */ + reader(); + public: /** * @brief Constructor from an array of datasources @@ -62,7 +67,7 @@ class reader { /** * @brief Destructor explicitly declared to avoid inlining in header */ - ~reader(); + virtual ~reader(); /** * @brief Reads the entire dataset. @@ -73,6 +78,67 @@ class reader { table_with_metadata read(orc_reader_options const& options); }; +/** + * @brief The reader class that supports iterative reading of a given file. + * + * This class intentionally subclasses the `reader` class with private inheritance to hide the + * `reader::read()` API. As such, only chunked reading APIs are supported. + */ +class chunked_reader : private reader { + public: + /** + * @brief Constructor from size limits and an array of data sources with reader options. + * + * The typical usage should be similar to this: + * ``` + * do { + * auto const chunk = reader.read_chunk(); + * // Process chunk + * } while (reader.has_next()); + * + * ``` + * + * If `output_size_limit == 0` (i.e., no reading limit), a call to `read_chunk()` will read the + * whole file and return a table containing all rows. 
+ * + * TODO: data read limit + * + * @param output_size_limit Limit on total number of bytes to be returned per read, + * or `0` if there is no limit + * @param data_read_limit Limit on memory usage for the purposes of decompression and processing + * of input, or `0` if there is no limit + * @param sources Input `datasource` objects to read the dataset from + * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ + explicit chunked_reader(std::size_t output_size_limit, + std::size_t data_read_limit, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + + /** + * @brief Destructor explicitly-declared to avoid inlined in header. + * + * Since the declaration of the internal `_impl` object does not exist in this header, this + * destructor needs to be defined in a separate source file which can access to that object's + * declaration. + */ + ~chunked_reader(); + + /** + * @copydoc cudf::io::chunked_orc_reader::has_next + */ + [[nodiscard]] bool has_next() const; + + /** + * @copydoc cudf::io::chunked_orc_reader::read_chunk + */ + [[nodiscard]] table_with_metadata read_chunk() const; +}; + /** * @brief Class to write ORC dataset data into columns. */ @@ -133,5 +199,6 @@ class writer { */ void skip_close(); }; + } // namespace orc::detail } // namespace cudf::io diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index a3f76817f8a..d512f4a6cc4 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -404,6 +404,80 @@ table_with_metadata read_orc( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief The chunked orc reader class to read ORC file iteratively in to a series of + * tables, chunk by chunk. + * + * This class is designed to address the reading issue when reading very large ORC files such + * that sizes of their columns exceed the limit that can be stored in cudf columns. By reading the + * file content by chunks using this class, each chunk is guaranteed to have its size stay within + * the given limit. + */ +class chunked_orc_reader { + public: + /** + * @brief Default constructor, this should never be used. + * + * This is added just to satisfy cython. + */ + chunked_orc_reader() = default; + + /** + * @brief Constructor for chunked reader. + * + * This constructor requires the same `orc_reader_option` parameter as in + * `cudf::read_orc()`, and additional parameters to specify the size byte limits of the + * output table for each reading. 
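 *
 * (Editor's illustration of the intended reading loop, not text from the
 * patch; it assumes the interface declared below:)
 * ```
 * chunked_orc_reader reader(output_size_limit, data_read_limit, options);
 * while (reader.has_next()) {
 *   auto chunk = reader.read_chunk();
 *   // consume chunk.tbl and chunk.metadata
 * }
 * ```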
+ * + * TODO: data read limit + * + * @param output_size_limit Limit on total number of bytes to be returned per read, + * or `0` if there is no limit + * @param data_read_limit Limit on memory usage for the purposes of decompression and processing + * of input, or `0` if there is no limit + * @param options The options used to read Parquet file + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ + chunked_orc_reader(std::size_t output_size_limit, + std::size_t data_read_limit, + orc_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + + /** + * @brief Destructor, destroying the internal reader instance. + * + * Since the declaration of the internal `reader` object does not exist in this header, this + * destructor needs to be defined in a separate source file which can access to that object's + * declaration. + */ + ~chunked_orc_reader(); + + /** + * @brief Check if there is any data in the given file has not yet read. + * + * @return A boolean value indicating if there is any data left to read + */ + [[nodiscard]] bool has_next() const; + + /** + * @brief Read a chunk of rows in the given ORC file. + * + * The sequence of returned tables, if concatenated by their order, guarantees to form a complete + * dataset as reading the entire given file at once. + * + * An empty table will be returned if the given file is empty, or all the data in the file has + * been read and returned by the previous calls. + * + * @return An output `cudf::table` along with its metadata + */ + [[nodiscard]] table_with_metadata read_chunk() const; + + private: + std::unique_ptr reader; +}; + /** @} */ // end of group /** * @addtogroup io_writers diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 42f2fd02d52..fc29201ce19 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -448,6 +448,48 @@ void write_orc(orc_writer_options const& options, rmm::cuda_stream_view stream) } } +/** + * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader + */ +chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, + std::size_t data_read_limit, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : reader{std::make_unique(output_size_limit, + data_read_limit, + make_datasources(options.get_source()), + options, + stream, + mr)} +{ +} + +/** + * @copydoc cudf::io::chunked_orc_reader::~chunked_orc_reader + */ +chunked_orc_reader::~chunked_orc_reader() = default; + +/** + * @copydoc cudf::io::chunked_orc_reader::has_next + */ +bool chunked_orc_reader::has_next() const +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(reader != nullptr, "Reader has not been constructed properly."); + return reader->has_next(); +} + +/** + * @copydoc cudf::io::chunked_orc_reader::read_chunk + */ +table_with_metadata chunked_orc_reader::read_chunk() const +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(reader != nullptr, "Reader has not been constructed properly."); + return reader->read_chunk(); +} + /** * @copydoc cudf::io::orc_chunked_writer::orc_chunked_writer */ diff --git a/cpp/src/io/orc/reader.cu b/cpp/src/io/orc/reader.cu new file mode 100644 index 00000000000..325986d7aef --- /dev/null +++ b/cpp/src/io/orc/reader.cu @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "reader_impl.hpp" +#include "reader_impl_helpers.hpp" + +namespace cudf::io::orc::detail { + +// Constructor and destructor are defined within this translation unit. +reader::reader() = default; +reader::~reader() = default; + +reader::reader(std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _impl{std::make_unique(std::move(sources), options, stream, mr)} +{ +} + +table_with_metadata reader::read(orc_reader_options const& options) +{ + return _impl->read(options.get_skip_rows(), options.get_num_rows(), options.get_stripes()); +} + +chunked_reader::chunked_reader(std::size_t output_size_limit, + std::size_t data_read_limit, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + _impl = std::make_unique( + output_size_limit, data_read_limit, std::move(sources), options, stream, mr); +} + +chunked_reader::~chunked_reader() = default; + +bool chunked_reader::has_next() const { return _impl->has_next(); } + +table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); } + +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index be79c4901e2..c7421c5e41f 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -1122,22 +1122,26 @@ table_with_metadata reader::impl::read(uint64_t skip_rows, return make_output_chunk(); } -// Forward to implementation -reader::reader(std::vector>&& sources, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _impl{std::make_unique(std::move(sources), options, stream, mr)} +bool reader::impl::has_next() { + prepare_data(0 /*skip_rows*/, std::nullopt /*num_rows, `std::nullopt` means unlimited*/, {}); + // return _chunk_read_info.current_chunk_idx < _chunk_read_info.chunks.size(); + return true; } -// Destructor within this translation unit -reader::~reader() = default; - -// Forward to implementation -table_with_metadata reader::read(orc_reader_options const& options) +table_with_metadata reader::impl::read_chunk() { - return _impl->read(options.get_skip_rows(), options.get_num_rows(), options.get_stripes()); + // Reset the output buffers to their original states (right after reader construction). + // Don't need to do it if we read the file all at once. 
+ // if (_chunk_read_info.chunk_size_limit > 0) { + // _output_buffers.resize(0); + // for (auto const& buff : _output_buffers_template) { + // _output_buffers.emplace_back(column_buffer::empty_like(buff)); + // } + // } + + prepare_data(0 /*skip_rows*/, std::nullopt /*num_rows, `std::nullopt` means unlimited*/, {}); + return make_output_chunk(); } } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 5ad0e49a889..d4e38304ddd 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -54,16 +54,7 @@ class reader::impl { rmm::mr::device_memory_resource* mr); /** - * @brief Constructor from a dataset source with reader options. - * - * @param output_size_limit Limit on total number of bytes to be returned per read, - * or `0` if there is no limit - * @param data_read_limit Limit on memory usage for the purposes of decompression and processing - * of input, or `0` if there is no limit - * @param sources Dataset sources - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation + * @copydoc cudf::io::orc::detail::chunked_reader */ explicit impl(std::size_t output_size_limit, std::size_t data_read_limit, @@ -84,6 +75,16 @@ class reader::impl { std::optional const& num_rows_opt, std::vector> const& stripes); + /** + * @copydoc cudf::io::chunked_orc_reader::has_next + */ + bool has_next(); + + /** + * @copydoc cudf::io::chunked_orc_reader::read_chunk + */ + table_with_metadata read_chunk(); + private: /** * @brief Perform all the necessary data preprocessing before creating an output table. From 28e328c838c1dd56f3411c3684f6845899c06757 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 21 Feb 2024 18:25:03 -0800 Subject: [PATCH 086/321] Cleanup Signed-off-by: Nghia Truong --- cpp/src/io/orc/aggregate_orc_metadata.cpp | 2 +- cpp/src/io/orc/reader.cu | 4 +-- cpp/src/io/orc/reader_impl.cu | 32 +++++------------------ cpp/src/io/orc/reader_impl.hpp | 10 +++---- cpp/src/io/orc/reader_impl_chunking.cu | 6 ++--- cpp/src/io/orc/reader_impl_helpers.hpp | 4 +-- 6 files changed, 20 insertions(+), 38 deletions(-) diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 8412a767d3f..620294a1e47 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "aggregate_orc_metadata.hpp" +#include "io/orc/aggregate_orc_metadata.hpp" #include "io/utilities/row_selection.hpp" diff --git a/cpp/src/io/orc/reader.cu b/cpp/src/io/orc/reader.cu index 325986d7aef..855a96c9ae3 100644 --- a/cpp/src/io/orc/reader.cu +++ b/cpp/src/io/orc/reader.cu @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#include "reader_impl.hpp" -#include "reader_impl_helpers.hpp" +#include "io/orc/reader_impl.hpp" +#include "io/orc/reader_impl_helpers.hpp" namespace cudf::io::orc::detail { diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 59ab3f52eaa..ec517f93e23 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -18,10 +18,10 @@ #include "io/comp/gpuinflate.hpp" #include "io/comp/nvcomp_adapter.hpp" +#include "io/orc/reader_impl.hpp" +#include "io/orc/reader_impl_chunking.hpp" +#include "io/orc/reader_impl_helpers.hpp" #include "io/utilities/config_utils.hpp" -#include "reader_impl.hpp" -#include "reader_impl_chunking.hpp" -#include "reader_impl_helpers.hpp" #include #include @@ -981,16 +981,8 @@ void reader::impl::prepare_data(uint64_t skip_rows, // There are no columns in the table. if (_selected_columns.num_levels() == 0) { return; } - // Perform a global preprocessing step for the entire input sources. global_preprocess(skip_rows, num_rows_opt, stripes); - - // TODO: fix this, should be called once - // TODO: only load data if needed. - while (_chunk_read_data.more_stripe_to_load()) { - load_data(); - } - - // TODO: only do if needed. + load_data(); decompress_and_decode(); } @@ -1123,23 +1115,13 @@ table_with_metadata reader::impl::read(uint64_t skip_rows, bool reader::impl::has_next() { - prepare_data(0 /*skip_rows*/, std::nullopt /*num_rows, `std::nullopt` means unlimited*/, {}); - // return _chunk_read_info.current_chunk_idx < _chunk_read_info.chunks.size(); - return true; + prepare_data(); + return _chunk_read_data.has_next(); } table_with_metadata reader::impl::read_chunk() { - // Reset the output buffers to their original states (right after reader construction). - // Don't need to do it if we read the file all at once. 
- // if (_chunk_read_info.chunk_size_limit > 0) { - // _output_buffers.resize(0); - // for (auto const& buff : _output_buffers_template) { - // _output_buffers.emplace_back(column_buffer::empty_like(buff)); - // } - // } - - prepare_data(0 /*skip_rows*/, std::nullopt /*num_rows, `std::nullopt` means unlimited*/, {}); + prepare_data(); return make_output_chunk(); } diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 6996e366c55..4da73f3a08f 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -16,8 +16,8 @@ #pragma once -#include "aggregate_orc_metadata.hpp" -#include "reader_impl_chunking.hpp" +#include "io/orc/aggregate_orc_metadata.hpp" +#include "io/orc/reader_impl_chunking.hpp" #include #include @@ -96,9 +96,9 @@ class reader::impl { * @param num_rows_opt Optional number of rows to read, or `std::nullopt` to read all rows * @param stripes Indices of individual stripes to load if non-empty */ - void prepare_data(uint64_t skip_rows, - std::optional const& num_rows_opt, - std::vector> const& stripes); + void prepare_data(uint64_t skip_rows = 0, + std::optional const& num_rows_opt = std::nullopt, + std::vector> const& stripes = {}); /** * @brief Perform a global preprocessing step that executes exactly once for the entire duration diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 64cf668c508..4330c62c751 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -18,10 +18,10 @@ #include "io/comp/gpuinflate.hpp" #include "io/comp/nvcomp_adapter.hpp" +#include "io/orc/reader_impl.hpp" +#include "io/orc/reader_impl_chunking.hpp" +#include "io/orc/reader_impl_helpers.hpp" #include "io/utilities/config_utils.hpp" -#include "reader_impl.hpp" -#include "reader_impl_chunking.hpp" -#include "reader_impl_helpers.hpp" #include #include diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp index 48742b5fc8c..6f83b567710 100644 --- a/cpp/src/io/orc/reader_impl_helpers.hpp +++ b/cpp/src/io/orc/reader_impl_helpers.hpp @@ -16,9 +16,9 @@ #pragma once -#include "aggregate_orc_metadata.hpp" +#include "io/orc/aggregate_orc_metadata.hpp" +#include "io/orc/orc.hpp" #include "io/utilities/column_buffer.hpp" -#include "orc.hpp" #include From 0effdb3c0e0321c94dd20ff757e74b2c8bf9e5ae Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 21 Feb 2024 21:07:44 -0800 Subject: [PATCH 087/321] Add `front` and `back` Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 7 +++++-- cpp/src/io/orc/reader_impl_chunking.cu | 14 ++++++-------- cpp/src/io/utilities/hostdevice_vector.hpp | 13 +++++++++++++ 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index ec517f93e23..1c5191624d0 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -112,7 +112,7 @@ rmm::device_buffer decompress_stripe_data( // auto const& cached_comp_info = // compinfo_map[stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_cold_idx, // info.id.kind}]; - auto& stream_comp_info = compinfo[compinfo.size() - 1]; + auto& stream_comp_info = compinfo.back(); stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks; stream_comp_info.max_uncompressed_size = cached_comp_info.total_decomp_size; @@ -982,7 +982,10 @@ void 
reader::impl::prepare_data(uint64_t skip_rows,
   if (_selected_columns.num_levels() == 0) { return; }
 
   global_preprocess(skip_rows, num_rows_opt, stripes);
-  load_data();
+  // load_data();
+  while (_chunk_read_data.more_stripe_to_load()) {
+    load_data();
+  }
   decompress_and_decode();
 }
 
diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 4330c62c751..cdd060aca4e 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -282,7 +282,7 @@ void verify_splits(host_span splits,
     }
     cur_cumulative_size = sizes[split.start_idx + split.count - 1].size_bytes;
   }
-  CUDF_EXPECTS(last_split.start_idx + last_split.count == sizes[sizes.size() - 1].count,
+  CUDF_EXPECTS(last_split.start_idx + last_split.count == sizes.back().count,
                "Invalid split start_idx.");
   CUDF_EXPECTS(count == total_count, "Invalid total count.");
 }
@@ -324,7 +324,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows,
   // TODO: move this to end of func.
   _file_itm_data.global_preprocessed = true;
 
-  // Select only stripes required (aka row groups)
+  // Load stripes' metadata.
   std::tie(
     _file_itm_data.rows_to_skip, _file_itm_data.rows_to_read, _file_itm_data.selected_stripes) =
     _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream);
@@ -347,7 +347,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows,
   _file_itm_data.lvl_stream_info.resize(_selected_columns.num_levels());
 
   // Get the total number of stripes across all input files.
-  std::size_t num_stripes = selected_stripes.size();
+  auto const num_stripes = selected_stripes.size();
 
   stripe_data_read_chunks.resize(num_stripes);
   lvl_stripe_stream_chunks.resize(_selected_columns.num_levels());
@@ -475,8 +475,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows,
 
   // DEBUG only
   // TODO: use 0.3 constant
-  _chunk_read_data.data_read_limit =
-    total_stripe_sizes[total_stripe_sizes.size() - 1].size_bytes / 3;
+  _chunk_read_data.data_read_limit = total_stripe_sizes.back().size_bytes / 3;
 
   _chunk_read_data.load_stripe_chunks =
     find_splits(total_stripe_sizes, num_stripes, _chunk_read_data.data_read_limit);
@@ -603,8 +602,7 @@ void reader::impl::load_data()
       static_cast(stripe_data[info.id.stripe_idx].data()) + info.dst_pos,
       info.length));
     stream_compinfo_map[stream_id_info{
-      info.id.stripe_idx, info.id.level, info.id.orc_col_idx, info.id.kind}] =
-      &compinfo[compinfo.size() - 1];
+      info.id.stripe_idx, info.id.level, info.id.orc_col_idx, info.id.kind}] = &compinfo.back();
 #ifdef PRINT_DEBUG
     printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n",
            (int)info.id.stripe_idx,
@@ -684,7 +682,7 @@ void reader::impl::load_data()
 
   // DEBUG only
   // _chunk_read_data.data_read_limit =
-  //   stripe_decompression_sizes[stripe_decompression_sizes.size() - 1].size_bytes / 3;
+  //   stripe_decompression_sizes.back().size_bytes / 3;
   _chunk_read_data.decode_stripe_chunks =
     find_splits(stripe_decomp_sizes, stripe_chunk.count, _chunk_read_data.data_read_limit);
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 3cd70801cdf..39059c665dc 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -96,6 +96,19 @@ class hostdevice_vector {
   [[nodiscard]] size_t size_bytes() const noexcept { return sizeof(T) * size(); }
   [[nodiscard]] bool empty() const noexcept { return size() == 0; }
 
+  [[nodiscard]] T& front()
+  {
+    CUDF_EXPECTS(size() > 0, "Cannot access `front()` element of an empty vector.");
+    return
host_data[0];
+  }
+  [[nodiscard]] T& back()
+  {
+    CUDF_EXPECTS(size() > 0, "Cannot access `back()` element of an empty vector.");
+    return host_data[size() - 1];
+  }
+  [[nodiscard]] T const& front() const { return const_cast<hostdevice_vector&>(*this).front(); }
+  [[nodiscard]] T const& back() const { return const_cast<hostdevice_vector&>(*this).back(); }
+
   [[nodiscard]] T& operator[](size_t i) { return host_data[i]; }
   [[nodiscard]] T const& operator[](size_t i) const { return host_data[i]; }
 
From bbb6b47ec0c488b55e0428b7ee2637d569f073df Mon Sep 17 00:00:00 2001
From: Nghia Truong 
Date: Wed, 21 Feb 2024 21:15:15 -0800
Subject: [PATCH 088/321] Rename variable

Signed-off-by: Nghia Truong 
---
 cpp/src/io/orc/reader_impl_chunking.hpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index 2b172011259..4ab5b6bd427 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -143,14 +143,14 @@ struct file_intermediate_data {
 
   // Store information to identify where to read a chunk of data from source.
   // Each read corresponds to one or more consecutive streams combined.
-  struct data_read_info {
+  struct stream_data_read_info {
     // TODO: remove constructor
-    data_read_info(uint64_t offset_,
-                   std::size_t length_,
-                   std::size_t dst_pos_,
-                   std::size_t source_idx_,
-                   std::size_t stripe_idx_,
-                   std::size_t level_)
+    stream_data_read_info(uint64_t offset_,
+                          std::size_t length_,
+                          std::size_t dst_pos_,
+                          std::size_t source_idx_,
+                          std::size_t stripe_idx_,
+                          std::size_t level_)
       : offset(offset_),
         length(length_),
         dst_pos(dst_pos_),
@@ -168,7 +168,7 @@ struct file_intermediate_data {
   };
 
   // Identify what data to read from source.
-  std::vector data_read_info;
+  std::vector data_read_info;
 
   // For each stripe, we perform a number of reads for its streams.
   // Those reads are identified by a chunk of consecutive read info, stored in data_read_info.
From cce2d34f5a79f2862d764b71467eb51d5f58203d Mon Sep 17 00:00:00 2001
From: Nghia Truong 
Date: Wed, 21 Feb 2024 22:20:52 -0800
Subject: [PATCH 089/321] Misc

Signed-off-by: Nghia Truong 
---
 cpp/src/io/orc/reader_impl.cu           | 2 +-
 cpp/src/io/orc/reader_impl_chunking.cu  | 4 +---
 cpp/src/io/orc/reader_impl_chunking.hpp | 3 +++
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 1c5191624d0..bbec21c4910 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -866,7 +866,7 @@ void reader::impl::decompress_and_decode()
       num_rowgroups += stripe_num_rowgroups;
 
       stripe_idx++;
-    }
+    }  // for (stripe : selected_stripes)
 
     if (stripe_data.empty()) { continue; }
 
diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index cdd060aca4e..3c354bd0f7a 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -320,8 +320,6 @@ void reader::impl::global_preprocess(uint64_t skip_rows,
                                      std::vector> const& stripes)
 {
   if (_file_itm_data.global_preprocessed) { return; }
-
-  // TODO: move this to end of func.
   _file_itm_data.global_preprocessed = true;
 
   // Load stripes' metadata.
@@ -455,7 +453,6 @@ void reader::impl::global_preprocess(uint64_t skip_rows,
 
   // Load all chunks if there is no read limit.
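+  // When a nonzero limit is set instead, a sketch of the policy applied by the
+  // `find_splits` call further below (assuming `total_stripe_sizes` holds
+  // cumulative per-stripe sizes): start a new chunk whenever the bytes
+  // accumulated since the previous cut would exceed `data_read_limit`. For
+  // example, cumulative sizes {100, 250, 400} with a 300-byte limit produce
+  // the chunks {start_idx = 0, count = 2} and {start_idx = 2, count = 1}.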
if (_chunk_read_data.data_read_limit == 0) { _chunk_read_data.load_stripe_chunks = {chunk{0, static_cast(num_stripes)}}; - // TODO: DEBUG only // return; } @@ -520,6 +517,7 @@ void reader::impl::load_data() auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; // Prepare the buffer to read raw data onto. + // TODO: clear all old buffer. for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto& stripe_data = lvl_stripe_data[level]; auto& stripe_sizes = lvl_stripe_sizes[level]; diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 2b172011259..4ab5b6bd427 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -129,6 +129,9 @@ struct file_intermediate_data { // Return true if no rows or stripes to read. bool has_no_data() const { return rows_to_read == 0 || selected_stripes.empty(); } + // TODO: remove + std::size_t num_stripes() const { return selected_stripes.size(); } + // Store the compression information for each data stream. stream_id_map compinfo_map; From e9430e8adf2b60eee1cf985fd2a5d96038527276 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 22 Feb 2024 14:13:25 -0800 Subject: [PATCH 090/321] Testing multiple decoding step Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 77 +- cpp/src/io/orc/reader_impl_chunking.cu | 13 +- cpp/src/io/orc/reader_impl_chunking.hpp | 3 + cpp/tests/io/orc_test.cpp | 2033 +---------------------- 4 files changed, 109 insertions(+), 2017 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index bbec21c4910..4bb86091fb0 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -16,6 +16,12 @@ // #define PRINT_DEBUG +// TODO: remove +#include +// +// +// + #include "io/comp/gpuinflate.hpp" #include "io/comp/nvcomp_adapter.hpp" #include "io/orc/reader_impl.hpp" @@ -676,6 +682,12 @@ void generate_offsets_for_list(host_span buff_data, rmm::cuda_ void reader::impl::decompress_and_decode() { if (_file_itm_data.has_no_data()) { return; } + + // auto const stripe_chunk = + // _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_decode_stripe_chunk++]; + // auto const stripe_start = stripe_chunk.start_idx; + // auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; + auto const rows_to_skip = _file_itm_data.rows_to_skip; auto const rows_to_read = _file_itm_data.rows_to_read; auto const& selected_stripes = _file_itm_data.selected_stripes; @@ -982,11 +994,19 @@ void reader::impl::prepare_data(uint64_t skip_rows, if (_selected_columns.num_levels() == 0) { return; } global_preprocess(skip_rows, num_rows_opt, stripes); + + // TODO: only load data if there is no loaded stripe ready to decode. // load_data(); while (_chunk_read_data.more_stripe_to_load()) { load_data(); + printf("done load data\n\n"); } + + // decompress_and_decode(); + // while (_chunk_read_data.more_stripe_to_decode()) { decompress_and_decode(); + _file_itm_data.out_buffers.push_back(std::move(_out_buffers)); + // } } table_with_metadata reader::impl::make_output_chunk() @@ -1017,19 +1037,54 @@ table_with_metadata reader::impl::make_output_chunk() // TODO: move this into decompress_and_decode // Create columns from buffer with respective schema information. 
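+  // In outline, the change below assembles one table per decoded chunk and then
+  // stitches them together (a hedged sketch, not the exact code; `chunk_tables`
+  // is a hypothetical stand-in for the `tabs` vector used here):
+  //
+  //   std::vector<cudf::table_view> views;
+  //   for (auto& t : chunk_tables) { views.push_back(t->view()); }
+  //   auto result = views.size() > 1 ? cudf::concatenate(views)
+  //                                  : std::move(chunk_tables.front());
+  //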
- std::transform( - _selected_columns.levels[0].begin(), - _selected_columns.levels[0].end(), - std::back_inserter(out_columns), - [&](auto const& orc_col_meta) { - out_metadata.schema_info.emplace_back(""); - auto col_buffer = assemble_buffer( - orc_col_meta.id, 0, *_col_meta, _metadata, _selected_columns, _out_buffers, _stream, _mr); - return make_column(col_buffer, &out_metadata.schema_info.back(), std::nullopt, _stream); - }); + + // TODO: remove + std::vector> tabs; + std::vector tv; + + for (auto& buffers : _file_itm_data.out_buffers) { + // + out_columns.clear(); // TODO: remove + + std::transform(_selected_columns.levels[0].begin(), + _selected_columns.levels[0].end(), + std::back_inserter(out_columns), + [&](auto const& orc_col_meta) { + out_metadata.schema_info.emplace_back(""); + auto col_buffer = assemble_buffer(orc_col_meta.id, + 0, + *_col_meta, + _metadata, + _selected_columns, + buffers, /*_out_buffers*/ + _stream, + _mr); + return make_column( + col_buffer, &out_metadata.schema_info.back(), std::nullopt, _stream); + }); + + auto tbl = std::make_unique
(std::move(out_columns)); + tabs.push_back(std::move(tbl)); + tv.push_back(tabs.back()->view()); + + // + printf(" ----- decode one chunk\n"); + fflush(stdout); + // + // + // + // + } + printf(" ----- decode total %d chunks\n", (int)tv.size()); + fflush(stdout); // todo: remove this - auto out_table = std::make_unique
(std::move(out_columns)); + // auto out_table = std::make_unique
(std::move(out_columns)); + auto out_table = [&] { + if (tv.size() > 1) { return cudf::concatenate(tv); } + return std::move(tabs.front()); + }(); + // auto out_table = std::move(tabs.front()); #if 0 auto out_table = [&] { diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 3c354bd0f7a..3acd13964e7 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -466,6 +466,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, total_stripe_sizes.device_to_host_sync(_stream); + printf("total stripe sizes:\n"); for (auto& size : total_stripe_sizes) { printf("size: %ld, %zu\n", size.count, size.size_bytes); } @@ -479,7 +480,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, #ifndef PRINT_DEBUG auto& splits = _chunk_read_data.load_stripe_chunks; - printf("------------\nSplits (/%d): \n", (int)num_stripes); + printf("------------\nSplits (/total num stripe = %d): \n", (int)num_stripes); for (size_t idx = 0; idx < splits.size(); idx++) { printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count); } @@ -516,6 +517,8 @@ void reader::impl::load_data() auto const stripe_start = stripe_chunk.start_idx; auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; + printf("loading data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); + // Prepare the buffer to read raw data onto. // TODO: clear all old buffer. for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { @@ -679,8 +682,7 @@ void reader::impl::load_data() stripe_decomp_sizes.device_to_host_sync(_stream); // DEBUG only - // _chunk_read_data.data_read_limit = - // stripe_decompression_sizes.back().size_bytes / 3; + _chunk_read_data.data_read_limit = stripe_decomp_sizes.back().size_bytes / 3; _chunk_read_data.decode_stripe_chunks = find_splits(stripe_decomp_sizes, stripe_chunk.count, _chunk_read_data.data_read_limit); @@ -694,7 +696,7 @@ void reader::impl::load_data() #ifndef PRINT_DEBUG auto& splits = _chunk_read_data.decode_stripe_chunks; - printf("------------\nSplits second level (/%d): \n", (int)stripe_chunk.count); + printf("------------\nSplits decode_stripe_chunks (/%d): \n", (int)stripe_chunk.count); for (size_t idx = 0; idx < splits.size(); idx++) { printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count); } @@ -715,6 +717,9 @@ void reader::impl::load_data() // lvl_stripe_data.clear(); // _file_itm_data.compinfo_ready = true; + + // Decoding is reset to start from the first chunk in `decode_stripe_chunks`. + _chunk_read_data.curr_decode_stripe_chunk = 0; } } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 4ab5b6bd427..becb9a1d0d5 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -122,6 +122,9 @@ struct range { * @brief Struct to store file-level data that remains constant for all chunks being output. 
*/ struct file_intermediate_data { + // TODO: remove + std::vector>> out_buffers; + int64_t rows_to_skip; size_type rows_to_read; std::vector selected_stripes; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index b972311d988..36ef05ecc36 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -95,1305 +95,32 @@ std::unique_ptr create_random_fixed_table(cudf::size_type num_colum return std::make_unique(std::move(columns)); } -// Base test fixture for tests -struct OrcWriterTest : public cudf::test::BaseFixture {}; - -// Typed test fixture for numeric type tests -template -struct OrcWriterNumericTypeTest : public OrcWriterTest { - auto type() { return cudf::data_type{cudf::type_to_id()}; } -}; - -// Typed test fixture for timestamp type tests -template -struct OrcWriterTimestampTypeTest : public OrcWriterTest { - auto type() { return cudf::data_type{cudf::type_to_id()}; } -}; - -// Declare typed test cases -// TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5351 -using SupportedTypes = cudf::test::Types; -TYPED_TEST_SUITE(OrcWriterNumericTypeTest, SupportedTypes); -using SupportedTimestampTypes = - cudf::test::RemoveIf>, - cudf::test::TimestampTypes>; -TYPED_TEST_SUITE(OrcWriterTimestampTypeTest, SupportedTimestampTypes); - -// Base test fixture for chunked writer tests -struct OrcChunkedWriterTest : public cudf::test::BaseFixture {}; - -// Typed test fixture for numeric type tests -template -struct OrcChunkedWriterNumericTypeTest : public OrcChunkedWriterTest { - auto type() { return cudf::data_type{cudf::type_to_id()}; } -}; - -// Declare typed test cases -TYPED_TEST_SUITE(OrcChunkedWriterNumericTypeTest, SupportedTypes); - -// Test fixture for reader tests -struct OrcReaderTest : public cudf::test::BaseFixture {}; - -// Test fixture for statistics tests -struct OrcStatisticsTest : public cudf::test::BaseFixture {}; - -// Test fixture for metadata tests -struct OrcMetadataReaderTest : public cudf::test::BaseFixture {}; - -struct OrcCompressionTest : public cudf::test::BaseFixture, - public ::testing::WithParamInterface {}; - -namespace { -// Generates a vector of uniform random values of type T -template -inline auto random_values(size_t size) -{ - std::vector values(size); - - using T1 = T; - using uniform_distribution = - typename std::conditional_t, - std::bernoulli_distribution, - std::conditional_t, - std::uniform_real_distribution, - std::uniform_int_distribution>>; - - static constexpr auto seed = 0xf00d; - static std::mt19937 engine{seed}; - static uniform_distribution dist{}; - std::generate_n(values.begin(), size, [&]() { return T{dist(engine)}; }); - - return values; -} - -struct SkipRowTest { - int test_calls{0}; - SkipRowTest() {} - - std::unique_ptr
get_expected_result(std::string const& filepath, - int skip_rows, - int file_num_rows, - int read_num_rows) - { - auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); - column_wrapper input_col( - sequence, sequence + file_num_rows); - table_view input_table({input_col}); - - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, input_table); - cudf::io::write_orc(out_opts); - - auto begin_sequence = sequence, end_sequence = sequence; - if (skip_rows < file_num_rows) { - begin_sequence += skip_rows; - end_sequence += std::min(skip_rows + read_num_rows, file_num_rows); - } - column_wrapper output_col(begin_sequence, - end_sequence); - std::vector> output_cols; - output_cols.push_back(output_col.release()); - return std::make_unique
(std::move(output_cols)); - } - - void test(int skip_rows, int file_num_rows, int read_num_rows) - { - auto filepath = - temp_env->get_temp_filepath("SkipRowTest" + std::to_string(test_calls++) + ".orc"); - auto expected_result = get_expected_result(filepath, skip_rows, file_num_rows, read_num_rows); - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) - .use_index(false) - .skip_rows(skip_rows) - .num_rows(read_num_rows); - auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_result->view(), result.tbl->view()); - } - - void test(int skip_rows, int file_num_rows) - { - auto filepath = - temp_env->get_temp_filepath("SkipRowTest" + std::to_string(test_calls++) + ".orc"); - auto expected_result = - get_expected_result(filepath, skip_rows, file_num_rows, file_num_rows - skip_rows); - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) - .use_index(false) - .skip_rows(skip_rows); - auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_result->view(), result.tbl->view()); - } -}; - -} // namespace - -TYPED_TEST(OrcWriterNumericTypeTest, SingleColumn) -{ - auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 100; }); - - constexpr auto num_rows = 10000000; - column_wrapper col(sequence, - sequence + num_rows); - table_view expected({col}); - - auto filepath = temp_env->get_temp_filepath("OrcSingleColumn.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .compression(cudf::io::compression_type::SNAPPY); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TYPED_TEST(OrcWriterNumericTypeTest, SingleColumnWithNulls) -{ - auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); }); - - constexpr auto num_rows = 100; - column_wrapper col( - sequence, sequence + num_rows, validity); - table_view expected({col}); - - auto filepath = temp_env->get_temp_filepath("OrcSingleColumnWithNulls.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TYPED_TEST(OrcWriterTimestampTypeTest, Timestamps) -{ - auto sequence = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (std::rand() / 10); }); - - constexpr auto num_rows = 100; - column_wrapper col(sequence, - sequence + num_rows); - table_view expected({col}); - - auto filepath = temp_env->get_temp_filepath("OrcTimestamps.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) - .use_index(false) - 
.timestamp_type(this->type()); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TYPED_TEST(OrcWriterTimestampTypeTest, TimestampsWithNulls) -{ - auto sequence = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (std::rand() / 10); }); - auto validity = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 30) && (i < 60); }); - - constexpr auto num_rows = 100; - column_wrapper col( - sequence, sequence + num_rows, validity); - table_view expected({col}); - - auto filepath = temp_env->get_temp_filepath("OrcTimestampsWithNulls.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) - .use_index(false) - .timestamp_type(this->type()); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TYPED_TEST(OrcWriterTimestampTypeTest, TimestampOverflow) -{ - constexpr int64_t max = std::numeric_limits::max(); - auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return max - i; }); - - constexpr auto num_rows = 100; - column_wrapper col(sequence, - sequence + num_rows); - table_view expected({col}); - - auto filepath = temp_env->get_temp_filepath("OrcTimestampOverflow.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) - .use_index(false) - .timestamp_type(this->type()); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TEST_F(OrcWriterTest, MultiColumn) -{ - constexpr auto num_rows = 10; - - auto col0_data = random_values(num_rows); - auto col1_data = random_values(num_rows); - auto col2_data = random_values(num_rows); - auto col3_data = random_values(num_rows); - auto col4_data = random_values(num_rows); - auto col5_data = random_values(num_rows); - auto col6_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{col6_vals[i], numeric::scale_type{12}}; - }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{col6_vals[i], numeric::scale_type{-12}}; - }); - - bool_col col0(col0_data.begin(), col0_data.end()); - int8_col col1(col1_data.begin(), col1_data.end()); - int16_col col2(col2_data.begin(), col2_data.end()); - int32_col col3(col3_data.begin(), col3_data.end()); - float32_col col4(col4_data.begin(), col4_data.end()); - float64_col col5(col5_data.begin(), col5_data.end()); - dec128_col col6(col6_data, col6_data + num_rows); - dec128_col col7(col7_data, col7_data + num_rows); - - list_col col8{ - {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}}; - - int32_col child_col{48, 27, 25, 31, 351, 351, 29, 15, -1, -99}; - struct_col col9{child_col}; - - table_view expected({col0, col1, col2, col3, col4, col5, col6, col7, col8, col9}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("bools"); - 
expected_metadata.column_metadata[1].set_name("int8s"); - expected_metadata.column_metadata[2].set_name("int16s"); - expected_metadata.column_metadata[3].set_name("int32s"); - expected_metadata.column_metadata[4].set_name("floats"); - expected_metadata.column_metadata[5].set_name("doubles"); - expected_metadata.column_metadata[6].set_name("decimal_pos_scale"); - expected_metadata.column_metadata[7].set_name("decimal_neg_scale"); - expected_metadata.column_metadata[8].set_name("lists"); - expected_metadata.column_metadata[9].set_name("structs"); - - auto filepath = temp_env->get_temp_filepath("OrcMultiColumn.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(expected_metadata); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_F(OrcWriterTest, MultiColumnWithNulls) -{ - constexpr auto num_rows = 10; - - auto col0_data = random_values(num_rows); - auto col1_data = random_values(num_rows); - auto col2_data = random_values(num_rows); - auto col3_data = random_values(num_rows); - auto col4_data = random_values(num_rows); - auto col5_data = random_values(num_rows); - auto col6_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal64{col6_vals[i], numeric::scale_type{2}}; - }); - auto col0_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); }); - auto col1_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 2); }); - auto col3_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i == (num_rows - 1)); }); - auto col4_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 4 && i <= 6); }); - auto col5_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 8); }); - auto col6_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 3); }); - - bool_col col0{col0_data.begin(), col0_data.end(), col0_mask}; - int8_col col1{col1_data.begin(), col1_data.end(), col1_mask}; - int16_col col2(col2_data.begin(), col2_data.end()); - int32_col col3{col3_data.begin(), col3_data.end(), col3_mask}; - float32_col col4{col4_data.begin(), col4_data.end(), col4_mask}; - float64_col col5{col5_data.begin(), col5_data.end(), col5_mask}; - dec64_col col6{col6_data, col6_data + num_rows, col6_mask}; - list_col col7{ - {{9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}}, - col0_mask}; - auto ages_col = cudf::test::fixed_width_column_wrapper{ - {48, 27, 25, 31, 351, 351, 29, 15, -1, -99}, {1, 0, 1, 1, 0, 1, 1, 1, 0, 1}}; - struct_col col8{{ages_col}, {0, 1, 1, 0, 1, 1, 0, 1, 1, 0}}; - table_view expected({col0, col1, col2, col3, col4, col5, col6, col7, col8}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("bools"); - expected_metadata.column_metadata[1].set_name("int8s"); - expected_metadata.column_metadata[2].set_name("int16s"); - expected_metadata.column_metadata[3].set_name("int32s"); - expected_metadata.column_metadata[4].set_name("floats"); - 
expected_metadata.column_metadata[5].set_name("doubles"); - expected_metadata.column_metadata[6].set_name("decimal"); - expected_metadata.column_metadata[7].set_name("lists"); - expected_metadata.column_metadata[8].set_name("structs"); - - auto filepath = temp_env->get_temp_filepath("OrcMultiColumnWithNulls.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(expected_metadata); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_F(OrcWriterTest, ReadZeroRows) -{ - auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); - - constexpr auto num_rows = 10; - column_wrapper col(sequence, - sequence + num_rows); - table_view expected({col}); - - auto filepath = temp_env->get_temp_filepath("OrcSingleColumn.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) - .use_index(false) - .num_rows(0); - auto result = cudf::io::read_orc(in_opts); - - EXPECT_EQ(0, result.tbl->num_rows()); - EXPECT_EQ(1, result.tbl->num_columns()); -} - -TEST_F(OrcWriterTest, Strings) -{ - std::vector strings{ - "Monday", "Monday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; - auto const num_rows = strings.size(); - - auto seq_col0 = random_values(num_rows); - auto seq_col2 = random_values(num_rows); - - int32_col col0(seq_col0.begin(), seq_col0.end()); - str_col col1(strings.begin(), strings.end()); - float32_col col2(seq_col2.begin(), seq_col2.end()); - - table_view expected({col0, col1, col2}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("col_other"); - expected_metadata.column_metadata[1].set_name("col_string"); - expected_metadata.column_metadata[2].set_name("col_another"); - - auto filepath = temp_env->get_temp_filepath("OrcStrings.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(expected_metadata); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_F(OrcWriterTest, SlicedTable) -{ - // This test checks for writing zero copy, offsetted views into existing cudf tables - - std::vector strings{ - "Monday", "Monday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; - auto const num_rows = strings.size(); - - auto seq_col0 = random_values(num_rows); - auto seq_col2 = random_values(num_rows); - auto vals_col3 = random_values(num_rows); - auto seq_col3 = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal64{vals_col3[i], numeric::scale_type{2}}; - }); - - int32_col col0(seq_col0.begin(), seq_col0.end()); - str_col col1(strings.begin(), 
strings.end()); - float32_col col2(seq_col2.begin(), seq_col2.end()); - float32_col col3(seq_col3, seq_col3 + num_rows); - - list_col col4{ - {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}}; - - int16_col ages_col{{48, 27, 25, 31, 351, 351, 29, 15}, cudf::test::iterators::null_at(5)}; - struct_col col5{{ages_col}, cudf::test::iterators::null_at(4)}; - - table_view expected({col0, col1, col2, col3, col4, col5}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("col_other"); - expected_metadata.column_metadata[1].set_name("col_string"); - expected_metadata.column_metadata[2].set_name("col_another"); - expected_metadata.column_metadata[3].set_name("col_decimal"); - expected_metadata.column_metadata[4].set_name("lists"); - expected_metadata.column_metadata[5].set_name("structs"); - - auto expected_slice = cudf::slice(expected, {2, static_cast(num_rows)}); - - auto filepath = temp_env->get_temp_filepath("SlicedTable.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) - .metadata(expected_metadata); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_F(OrcWriterTest, HostBuffer) -{ - constexpr auto num_rows = 100 << 10; - auto const seq_col = random_values(num_rows); - int32_col col(seq_col.begin(), seq_col.end()); - - table_view expected{{col}}; - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("col_other"); - - std::vector out_buffer; - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info(&out_buffer), expected) - .metadata(expected_metadata); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder( - cudf::io::source_info(out_buffer.data(), out_buffer.size())) - .use_index(false); - auto const result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_F(OrcWriterTest, negTimestampsNano) -{ - // This is a separate test because ORC format has a bug where writing a timestamp between -1 and 0 - // seconds from UNIX epoch is read as that timestamp + 1 second. We mimic that behavior and so - // this test has to hardcode test values which are < -1 second. 
- // Details: https://github.com/rapidsai/cudf/pull/5529#issuecomment-648768925 - auto timestamps_ns = - cudf::test::fixed_width_column_wrapper{ - -131968727238000000, - -1530705634500000000, - -1674638741932929000, - }; - cudf::table_view expected({timestamps_ns}); - - auto filepath = temp_env->get_temp_filepath("OrcNegTimestamp.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); - - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - expected.column(0), result.tbl->view().column(0), cudf::test::debug_output_level::ALL_ERRORS); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TEST_F(OrcWriterTest, Slice) -{ - int32_col col{{1, 2, 3, 4, 5}, cudf::test::iterators::null_at(3)}; - std::vector indices{2, 5}; - std::vector result = cudf::slice(col, indices); - cudf::table_view tbl{result}; - - auto filepath = temp_env->get_temp_filepath("Slice.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, tbl); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); - auto read_table = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(read_table.tbl->view(), tbl); -} - -TEST_F(OrcChunkedWriterTest, SingleTable) -{ - srand(31337); - auto table1 = create_random_fixed_table(5, 5, true); - - auto filepath = temp_env->get_temp_filepath("ChunkedSingle.orc"); - cudf::io::chunked_orc_writer_options opts = - cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::orc_chunked_writer(opts).write(*table1); - - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_orc(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *table1); -} - -TEST_F(OrcChunkedWriterTest, SimpleTable) -{ - srand(31337); - auto table1 = create_random_fixed_table(5, 5, true); - auto table2 = create_random_fixed_table(5, 5, true); - - auto full_table = cudf::concatenate(std::vector({*table1, *table2})); - - auto filepath = temp_env->get_temp_filepath("ChunkedSimple.orc"); - cudf::io::chunked_orc_writer_options opts = - cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::orc_chunked_writer(opts).write(*table1).write(*table2); - - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_orc(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); -} - -TEST_F(OrcChunkedWriterTest, LargeTables) -{ - srand(31337); - auto table1 = create_random_fixed_table(512, 4096, true); - auto table2 = create_random_fixed_table(512, 8192, true); - - auto full_table = cudf::concatenate(std::vector({*table1, *table2})); - - auto filepath = temp_env->get_temp_filepath("ChunkedLarge.orc"); - cudf::io::chunked_orc_writer_options opts = - cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::orc_chunked_writer(opts).write(*table1).write(*table2); - - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); - auto result = 
cudf::io::read_orc(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); -} - -TEST_F(OrcChunkedWriterTest, ManyTables) -{ - srand(31337); - std::vector> tables; - std::vector table_views; - constexpr int num_tables = 96; - for (int idx = 0; idx < num_tables; idx++) { - auto tbl = create_random_fixed_table(16, 64, true); - table_views.push_back(*tbl); - tables.push_back(std::move(tbl)); - } - - auto expected = cudf::concatenate(table_views); - - auto filepath = temp_env->get_temp_filepath("ChunkedManyTables.orc"); - cudf::io::chunked_orc_writer_options opts = - cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::orc_chunked_writer writer(opts); - std::for_each(table_views.begin(), table_views.end(), [&writer](table_view const& tbl) { - writer.write(tbl); - }); - writer.close(); - - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_orc(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); -} - -TEST_F(OrcChunkedWriterTest, Metadata) -{ - std::vector strings{ - "Monday", "Tuesday", "THURSDAY", "Wednesday", "Friday", "Sunday", "Saturday"}; - auto const num_rows = strings.size(); - - auto seq_col0 = random_values(num_rows); - auto seq_col2 = random_values(num_rows); - - int32_col col0(seq_col0.begin(), seq_col0.end()); - str_col col1{strings.begin(), strings.end()}; - float32_col col2(seq_col2.begin(), seq_col2.end()); - - table_view expected({col0, col1, col2}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("col_other"); - expected_metadata.column_metadata[1].set_name("col_string"); - expected_metadata.column_metadata[2].set_name("col_another"); - - auto filepath = temp_env->get_temp_filepath("ChunkedMetadata.orc"); - cudf::io::chunked_orc_writer_options opts = - cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}) - .metadata(expected_metadata); - cudf::io::orc_chunked_writer(opts).write(expected).write(expected); - - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_orc(read_opts); - - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_F(OrcChunkedWriterTest, Strings) -{ - bool mask1[] = {true, true, false, true, true, true, true}; - std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; - str_col strings1(h_strings1.begin(), h_strings1.end(), mask1); - table_view tbl1({strings1}); - - bool mask2[] = {false, true, true, true, true, true, true}; - std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; - str_col strings2(h_strings2.begin(), h_strings2.end(), mask2); - table_view tbl2({strings2}); - - auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); - - auto filepath = temp_env->get_temp_filepath("ChunkedStrings.orc"); - cudf::io::chunked_orc_writer_options opts = - cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::orc_chunked_writer(opts).write(tbl1).write(tbl2); - - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_orc(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); -} - -TEST_F(OrcChunkedWriterTest, MismatchedTypes) -{ - srand(31337); - auto table1 = create_random_fixed_table(4, 4, 
true); - auto table2 = create_random_fixed_table(4, 4, true); - - auto filepath = temp_env->get_temp_filepath("ChunkedMismatchedTypes.orc"); - cudf::io::chunked_orc_writer_options opts = - cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::orc_chunked_writer writer(opts); - writer.write(*table1); - EXPECT_THROW(writer.write(*table2), cudf::logic_error); -} - -TEST_F(OrcChunkedWriterTest, ChunkedWritingAfterClosing) -{ - srand(31337); - auto table1 = create_random_fixed_table(4, 4, true); - - auto filepath = temp_env->get_temp_filepath("ChunkedWritingAfterClosing.orc"); - cudf::io::chunked_orc_writer_options opts = - cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::orc_chunked_writer writer(opts); - writer.write(*table1); - writer.close(); - EXPECT_THROW(writer.write(*table1), cudf::logic_error); -} - -TEST_F(OrcChunkedWriterTest, MismatchedStructure) -{ - srand(31337); - auto table1 = create_random_fixed_table(4, 4, true); - auto table2 = create_random_fixed_table(3, 4, true); - - auto filepath = temp_env->get_temp_filepath("ChunkedMismatchedStructure.orc"); - cudf::io::chunked_orc_writer_options opts = - cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::orc_chunked_writer writer(opts); - writer.write(*table1); - EXPECT_THROW(writer.write(*table2), cudf::logic_error); -} - -TEST_F(OrcChunkedWriterTest, ReadStripes) -{ - srand(31337); - auto table1 = create_random_fixed_table(5, 5, true); - auto table2 = create_random_fixed_table(5, 5, true); - - auto full_table = cudf::concatenate(std::vector({*table2, *table1, *table2})); - - auto filepath = temp_env->get_temp_filepath("ChunkedStripes.orc"); - cudf::io::chunked_orc_writer_options opts = - cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::orc_chunked_writer(opts).write(*table1).write(*table2); - - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).stripes({{1, 0, 1}}); - auto result = cudf::io::read_orc(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); -} - -TEST_F(OrcChunkedWriterTest, ReadStripesError) -{ - srand(31337); - auto table1 = create_random_fixed_table(5, 5, true); - - auto filepath = temp_env->get_temp_filepath("ChunkedStripesError.orc"); - cudf::io::chunked_orc_writer_options opts = - cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::orc_chunked_writer(opts).write(*table1); - - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).stripes({{0, 1}}); - EXPECT_THROW(cudf::io::read_orc(read_opts), cudf::logic_error); - read_opts.set_stripes({{-1}}); - EXPECT_THROW(cudf::io::read_orc(read_opts), cudf::logic_error); -} - -TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) -{ - // write out two 31 row tables and make sure they get - // read back with all their validity bits in the right place - - using T = TypeParam; - - int num_els = 31; - - bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true}; - - T c1a[num_els]; - std::fill(c1a, c1a + num_els, static_cast(5)); - T c1b[num_els]; - std::fill(c1b, c1b + num_els, static_cast(6)); - column_wrapper c1a_w(c1a, c1a + num_els, mask); - column_wrapper c1b_w(c1b, c1b + num_els, mask); - 
table_view tbl1({c1a_w, c1b_w}); - - T c2a[num_els]; - std::fill(c2a, c2a + num_els, static_cast(8)); - T c2b[num_els]; - std::fill(c2b, c2b + num_els, static_cast(9)); - column_wrapper c2a_w(c2a, c2a + num_els, mask); - column_wrapper c2b_w(c2b, c2b + num_els, mask); - table_view tbl2({c2a_w, c2b_w}); - - auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); - - auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize.orc"); - cudf::io::chunked_orc_writer_options opts = - cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::orc_chunked_writer(opts).write(tbl1).write(tbl2); - - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_orc(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); -} - -TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) -{ - // write out two 33 row tables and make sure they get - // read back with all their validity bits in the right place - - using T = TypeParam; - - int num_els = 33; - - bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true}; - - T c1a[num_els]; - std::fill(c1a, c1a + num_els, static_cast(5)); - T c1b[num_els]; - std::fill(c1b, c1b + num_els, static_cast(6)); - column_wrapper c1a_w(c1a, c1a + num_els, mask); - column_wrapper c1b_w(c1b, c1b + num_els, mask); - table_view tbl1({c1a_w, c1b_w}); - - T c2a[num_els]; - std::fill(c2a, c2a + num_els, static_cast(8)); - T c2b[num_els]; - std::fill(c2b, c2b + num_els, static_cast(9)); - column_wrapper c2a_w(c2a, c2a + num_els, mask); - column_wrapper c2b_w(c2b, c2b + num_els, mask); - table_view tbl2({c2a_w, c2b_w}); - - auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); - - auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize2.orc"); - cudf::io::chunked_orc_writer_options opts = - cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::orc_chunked_writer(opts).write(tbl1).write(tbl2); - - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_orc(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); -} - -TEST_F(OrcReaderTest, CombinedSkipRowTest) -{ - SkipRowTest skip_row; - skip_row.test(50, 75); - skip_row.test(2, 100); - skip_row.test(2, 100, 50); - skip_row.test(2, 100, 98); - skip_row.test(2, 100, 99); - skip_row.test(2, 100, 100); - skip_row.test(2, 100, 110); -} - -TEST_F(OrcStatisticsTest, Basic) -{ - auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); - auto ts_sequence = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i - 4) * 1000002; }); - auto dec_sequence = - cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i * 1001; }); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); - - std::vector strings{ - "Monday", "Monday", "Friday", "Monday", "Friday", "Friday", "Friday", "Wednesday", "Tuesday"}; - int num_rows = strings.size(); - - column_wrapper col1( - sequence, sequence + num_rows, validity); - column_wrapper col2( - sequence, sequence + num_rows, validity); - str_col col3{strings.begin(), strings.end()}; - column_wrapper col4( - ts_sequence, ts_sequence + num_rows, 
validity); - column_wrapper col5( - ts_sequence, ts_sequence + num_rows, validity); - bool_col col6({true, true, true, true, true, false, false, false, false}, validity); - - cudf::test::fixed_point_column_wrapper col7( - dec_sequence, dec_sequence + num_rows, numeric::scale_type{-1}); - - table_view expected({col1, col2, col3, col4, col5, col6, col7}); - - auto filepath = temp_env->get_temp_filepath("OrcStatsMerge.orc"); - - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_orc(out_opts); - - auto const stats = cudf::io::read_parsed_orc_statistics(cudf::io::source_info{filepath}); - - auto expected_column_names = std::vector{""}; - std::generate_n( - std::back_inserter(expected_column_names), - expected.num_columns(), - [starting_index = 0]() mutable { return "_col" + std::to_string(starting_index++); }); - EXPECT_EQ(stats.column_names, expected_column_names); - - auto validate_statistics = [&](std::vector const& stats) { - ASSERT_EQ(stats.size(), expected.num_columns() + 1); - auto& s0 = stats[0]; - EXPECT_EQ(*s0.number_of_values, 9ul); - EXPECT_TRUE(s0.has_null.has_value()); - EXPECT_FALSE(*s0.has_null); - - auto& s1 = stats[1]; - EXPECT_EQ(*s1.number_of_values, 4ul); - EXPECT_TRUE(*s1.has_null); - auto& ts1 = std::get(s1.type_specific_stats); - EXPECT_EQ(*ts1.minimum, 1); - EXPECT_EQ(*ts1.maximum, 7); - EXPECT_EQ(*ts1.sum, 16); - - auto& s2 = stats[2]; - EXPECT_EQ(*s2.number_of_values, 4ul); - EXPECT_TRUE(*s2.has_null); - auto& ts2 = std::get(s2.type_specific_stats); - EXPECT_EQ(*ts2.minimum, 1.); - EXPECT_EQ(*ts2.maximum, 7.); - EXPECT_EQ(*ts2.sum, 16.); - - auto& s3 = stats[3]; - EXPECT_EQ(*s3.number_of_values, 9ul); - EXPECT_FALSE(*s3.has_null); - auto& ts3 = std::get(s3.type_specific_stats); - EXPECT_EQ(*ts3.minimum, "Friday"); - EXPECT_EQ(*ts3.maximum, "Wednesday"); - EXPECT_EQ(*ts3.sum, 58ul); - - auto& s4 = stats[4]; - EXPECT_EQ(*s4.number_of_values, 4ul); - EXPECT_TRUE(*s4.has_null); - auto& ts4 = std::get(s4.type_specific_stats); - EXPECT_EQ(*ts4.minimum, -4); - EXPECT_EQ(*ts4.maximum, 3); - EXPECT_EQ(*ts4.minimum_utc, -4); - EXPECT_EQ(*ts4.maximum_utc, 3); - EXPECT_EQ(*ts4.minimum_nanos, 999994); - EXPECT_EQ(*ts4.maximum_nanos, 6); - - auto& s5 = stats[5]; - EXPECT_EQ(*s5.number_of_values, 4ul); - EXPECT_TRUE(*s5.has_null); - auto& ts5 = std::get(s5.type_specific_stats); - EXPECT_EQ(*ts5.minimum, -3001); - EXPECT_EQ(*ts5.maximum, 3000); - EXPECT_EQ(*ts5.minimum_utc, -3001); - EXPECT_EQ(*ts5.maximum_utc, 3000); - EXPECT_EQ(*ts5.minimum_nanos, 994000); - EXPECT_EQ(*ts5.maximum_nanos, 6000); - - auto& s6 = stats[6]; - EXPECT_EQ(*s6.number_of_values, 4ul); - EXPECT_TRUE(*s6.has_null); - auto& ts6 = std::get(s6.type_specific_stats); - EXPECT_EQ(ts6.count[0], 2); - - auto& s7 = stats[7]; - EXPECT_EQ(*s7.number_of_values, 9ul); - EXPECT_FALSE(*s7.has_null); - auto& ts7 = std::get(s7.type_specific_stats); - EXPECT_EQ(*ts7.minimum, "0.0"); - EXPECT_EQ(*ts7.maximum, "800.8"); - EXPECT_EQ(*ts7.sum, "3603.6"); - }; - - validate_statistics(stats.file_stats); - // There's only one stripe, so column stats are the same as stripe stats - validate_statistics(stats.stripes_stats[0]); -} - -TEST_F(OrcWriterTest, SlicedValidMask) -{ - std::vector strings; - // Need more than 32 elements to reproduce the issue - for (int i = 0; i < 34; ++i) - strings.emplace_back("a long string to make sure overflow affects the output"); - // An element is null only to enforce the output column to be nullable - str_col 
col{strings.begin(), strings.end(), cudf::test::iterators::null_at(32)}; - - // Bug tested here is easiest to reproduce when column_offset % 32 is 31 - std::vector indices{31, 34}; - auto sliced_col = cudf::slice(static_cast(col), indices); - cudf::table_view tbl{sliced_col}; - - cudf::io::table_input_metadata expected_metadata(tbl); - expected_metadata.column_metadata[0].set_name("col_string"); - - auto filepath = temp_env->get_temp_filepath("OrcStrings.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, tbl) - .metadata(expected_metadata); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(tbl, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -#if 0 -TEST_F(OrcReaderTest, Test1) -{ - std::string filepath1 = - "/home/nghiat/Devel/cudf/1/python/cudf/cudf/tests/data/orc/" - "TestOrcFile.boolean_corruption_PR_6636.orc"; - - std::string filepath2 = - "/home/nghiat/Devel/cudf/1/python/cudf/cudf/tests/data/orc/" - "TestOrcFile.boolean_corruption_PR_6702.orc"; - - { - printf("test1\n"); - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{{filepath1}}); - auto result = cudf::io::read_orc(read_opts); - for (int i = 0; i < result.tbl->num_columns(); i++) { - auto& col = result.tbl->get_column(i); - cudf::test::print(col); - printf("\n"); - } - } - - { - printf("test2\n"); - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{{filepath2}}); - auto result = cudf::io::read_orc(read_opts); - for (int i = 0; i < result.tbl->num_columns(); i++) { - auto& col = result.tbl->get_column(i); - cudf::test::print(col); - printf("\n"); - } - } -} - -#endif -TEST_F(OrcReaderTest, SingleInputs) -{ - srand(31533); - auto table1 = create_random_fixed_table(5, 5, true); - - auto filepath1 = temp_env->get_temp_filepath("SimpleTable1.orc"); - cudf::io::orc_writer_options write_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath1}, table1->view()); - cudf::io::write_orc(write_opts); - - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{{filepath1}}); - auto result = cudf::io::read_orc(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *table1); -} - -TEST_F(OrcReaderTest, zstdCompressionRegression) -{ - if (cudf::io::nvcomp::is_decompression_disabled(cudf::io::nvcomp::compression_type::ZSTD)) { - GTEST_SKIP() << "Newer nvCOMP version is required"; - } - - // Test with zstd compressed orc file with high compression ratio. 
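  // The embedded ORC file below is fewer than 200 bytes, yet decodes to 1920800 rows.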
- constexpr uint8_t input_buffer[] = { - 0x4f, 0x52, 0x43, 0x5a, 0x00, 0x00, 0x28, 0xb5, 0x2f, 0xfd, 0xa4, 0x34, 0xc7, 0x03, 0x00, 0x74, - 0x00, 0x00, 0x18, 0x41, 0xff, 0xaa, 0x02, 0x00, 0xbb, 0xff, 0x45, 0xc8, 0x01, 0x25, 0x30, 0x04, - 0x65, 0x00, 0x00, 0x10, 0xaa, 0x1f, 0x02, 0x00, 0x01, 0x29, 0x0b, 0xc7, 0x39, 0xb8, 0x02, 0xcb, - 0xaf, 0x38, 0xc0, 0x07, 0x00, 0x00, 0x40, 0x01, 0xc0, 0x05, 0x00, 0x00, 0x46, 0x4d, 0x45, 0x00, - 0x00, 0x0a, 0x06, 0x08, 0x01, 0x10, 0x01, 0x18, 0x30, 0x0a, 0x06, 0x08, 0x02, 0x10, 0x01, 0x18, - 0x06, 0x0a, 0x06, 0x08, 0x03, 0x10, 0x01, 0x18, 0x05, 0x12, 0x02, 0x08, 0x00, 0x12, 0x04, 0x08, - 0x03, 0x10, 0x02, 0x59, 0x00, 0x00, 0x08, 0x03, 0x10, 0x63, 0x1a, 0x0c, 0x08, 0x03, 0x10, 0x00, - 0x18, 0x3b, 0x20, 0x25, 0x28, 0xa0, 0x9e, 0x75, 0x22, 0x10, 0x08, 0x0c, 0x12, 0x01, 0x01, 0x1a, - 0x09, 0x63, 0x64, 0x5f, 0x67, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x22, 0x02, 0x08, 0x07, 0x30, 0xa0, - 0x9e, 0x75, 0x08, 0x2f, 0x10, 0x05, 0x18, 0x80, 0x80, 0x10, 0x22, 0x02, 0x00, 0x0c, 0x28, 0x00, - 0x30, 0x09, 0x82, 0xf4, 0x03, 0x03, 0x4f, 0x52, 0x43, 0x17}; - - auto source = - cudf::io::source_info(reinterpret_cast(input_buffer), sizeof(input_buffer)); - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(source).use_index(false); - - cudf::io::table_with_metadata result; - CUDF_EXPECT_NO_THROW(result = cudf::io::read_orc(in_opts)); - EXPECT_EQ(1920800, result.tbl->num_rows()); -} - -TEST_F(OrcReaderTest, MultipleInputs) -{ - srand(31537); - auto table1 = create_random_fixed_table(5, 5, true); - auto table2 = create_random_fixed_table(5, 5, true); - - auto full_table = cudf::concatenate(std::vector({*table1, *table2})); - - auto const filepath1 = temp_env->get_temp_filepath("SimpleTable1.orc"); - { - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath1}, table1->view()); - cudf::io::write_orc(out_opts); - } - - auto const filepath2 = temp_env->get_temp_filepath("SimpleTable2.orc"); - { - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath2}, table2->view()); - cudf::io::write_orc(out_opts); - } - - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{{filepath1, filepath2}}); - auto result = cudf::io::read_orc(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); -} - -struct OrcWriterTestDecimal : public OrcWriterTest, - public ::testing::WithParamInterface> {}; - -TEST_P(OrcWriterTestDecimal, Decimal64) -{ - auto const [num_rows, scale] = GetParam(); - - // Using int16_t because scale causes values to overflow if they already require 32 bits - auto const vals = random_values(num_rows); - auto data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal64{vals[i], numeric::scale_type{scale}}; - }); - auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 7 == 0; }); - dec64_col col{data, data + num_rows, mask}; - cudf::table_view tbl({static_cast(col)}); - - auto filepath = temp_env->get_temp_filepath("Decimal64.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, tbl); - - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(tbl.column(0), result.tbl->view().column(0)); 
-} - -INSTANTIATE_TEST_CASE_P(OrcWriterTest, - OrcWriterTestDecimal, - ::testing::Combine(::testing::Values(1, 10000, 10001, 34567), - ::testing::Values(-2, 0, 2))); - -TEST_F(OrcWriterTest, Decimal32) -{ - constexpr auto num_rows = 12000; - - // Using int16_t because scale causes values to overflow if they already require 32 bits - auto const vals = random_values(num_rows); - auto data = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) { - return numeric::decimal32{vals[i], numeric::scale_type{2}}; - }); - auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13; }); - dec32_col col{data, data + num_rows, mask}; - cudf::table_view expected({col}); - - auto filepath = temp_env->get_temp_filepath("Decimal32.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); - - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(col, result.tbl->view().column(0)); -} - -TEST_F(OrcStatisticsTest, Overflow) +namespace { +// Generates a vector of uniform random values of type T +template +inline auto random_values(size_t size) { - int num_rows = 10; - auto too_large_seq = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i * (std::numeric_limits::max() / 20); }); - auto too_small_seq = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i * (std::numeric_limits::min() / 20); }); - auto not_too_large_seq = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i * (std::numeric_limits::max() / 200); }); - auto not_too_small_seq = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i * (std::numeric_limits::min() / 200); }); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); - - column_wrapper col1( - too_large_seq, too_large_seq + num_rows, validity); - column_wrapper col2( - too_small_seq, too_small_seq + num_rows, validity); - column_wrapper col3( - not_too_large_seq, not_too_large_seq + num_rows, validity); - column_wrapper col4( - not_too_small_seq, not_too_small_seq + num_rows, validity); - table_view tbl({col1, col2, col3, col4}); - - auto filepath = temp_env->get_temp_filepath("OrcStatsOverflow.orc"); + std::vector values(size); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, tbl); - cudf::io::write_orc(out_opts); + using T1 = T; + using uniform_distribution = + typename std::conditional_t, + std::bernoulli_distribution, + std::conditional_t, + std::uniform_real_distribution, + std::uniform_int_distribution>>; - auto const stats = cudf::io::read_parsed_orc_statistics(cudf::io::source_info{filepath}); + static constexpr auto seed = 0xf00d; + static std::mt19937 engine{seed}; + static uniform_distribution dist{}; + std::generate_n(values.begin(), size, [&]() { return T{dist(engine)}; }); - auto check_sum_exist = [&](int idx, bool expected) { - auto const& s = stats.file_stats[idx]; - auto const& ts = std::get(s.type_specific_stats); - EXPECT_EQ(ts.sum.has_value(), expected); - }; - check_sum_exist(1, false); - check_sum_exist(2, false); - check_sum_exist(3, true); - check_sum_exist(4, true); + return values; } +} // namespace -TEST_F(OrcStatisticsTest, HasNull) -{ - // This test can now be implemented with libcudf; 
keeping the pandas version to keep the test - // inputs diversified - // Method to create file: - // >>> import pandas as pd - // >>> df = pd.DataFrame({'a':pd.Series([1, 2, None], dtype="Int64"), 'b':[3, 4, 5]}) - // >>> df.to_orc("temp.orc") - // - // Contents of file: - // >>> import pyarrow.orc as po - // >>> po.ORCFile('temp.orc').read() - // pyarrow.Table - // a: int64 - // b: int64 - // ---- - // a: [[1,2,null]] - // b: [[3,4,5]] - auto nulls_orc = std::array{ - 0x4F, 0x52, 0x43, 0x1D, 0x00, 0x00, 0x0A, 0x0C, 0x0A, 0x04, 0x00, 0x00, 0x00, 0x00, 0x12, 0x04, - 0x08, 0x03, 0x50, 0x00, 0x2C, 0x00, 0x00, 0xE3, 0x12, 0xE7, 0x62, 0x67, 0x80, 0x00, 0x21, 0x1E, - 0x0E, 0x26, 0x21, 0x36, 0x0E, 0x26, 0x01, 0x16, 0x09, 0xB6, 0x00, 0x46, 0x00, 0x2C, 0x00, 0x00, - 0xE3, 0x12, 0xE7, 0x62, 0x67, 0x80, 0x00, 0x21, 0x1E, 0x0E, 0x66, 0x21, 0x36, 0x0E, 0x36, 0x01, - 0x2E, 0x09, 0x89, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0xFF, 0xE0, 0x05, 0x00, 0x00, 0xFF, 0xC0, - 0x07, 0x00, 0x00, 0x46, 0x01, 0x24, 0x05, 0x00, 0x00, 0xFF, 0xE0, 0x09, 0x00, 0x00, 0x46, 0x02, - 0x68, 0xA0, 0x68, 0x00, 0x00, 0xE3, 0x62, 0xE3, 0x60, 0x13, 0x60, 0x90, 0x10, 0xE4, 0x02, 0xD1, - 0x8C, 0x12, 0x92, 0x60, 0x9A, 0x09, 0x4C, 0x33, 0x00, 0xC5, 0x59, 0xC1, 0x34, 0x23, 0x98, 0x66, - 0x04, 0xD2, 0x6C, 0x60, 0x3E, 0x13, 0x94, 0xCF, 0x24, 0xC1, 0x2E, 0xC4, 0x02, 0x52, 0x07, 0x24, - 0x99, 0x60, 0xA4, 0x14, 0x73, 0x68, 0x88, 0x33, 0x00, 0x46, 0x00, 0x00, 0xE3, 0x52, 0xE2, 0x62, - 0xE1, 0x60, 0x0E, 0x60, 0xE0, 0xE2, 0xE1, 0x60, 0x12, 0x62, 0xE3, 0x60, 0x12, 0x60, 0x91, 0x60, - 0x0B, 0x60, 0x04, 0xF2, 0x98, 0x81, 0x3C, 0x36, 0x01, 0x2E, 0x09, 0x89, 0x00, 0x06, 0x00, 0xB4, - 0x00, 0x00, 0xE3, 0x60, 0x16, 0x98, 0xC6, 0x28, 0xC5, 0xC5, 0xC1, 0x2C, 0xE0, 0x2C, 0x21, 0xA3, - 0x60, 0xAE, 0xC1, 0xAC, 0x24, 0xC4, 0xC1, 0x23, 0xC4, 0xC4, 0xC8, 0x24, 0xC5, 0x98, 0x28, 0xC5, - 0x98, 0xA4, 0xC0, 0xA0, 0xC1, 0x60, 0xC0, 0xA0, 0xC4, 0xC1, 0xC1, 0x82, 0xCE, 0x32, 0x60, 0xB6, - 0x62, 0xE1, 0x60, 0x0E, 0x60, 0xB0, 0xE2, 0xE1, 0x60, 0x12, 0x62, 0xE3, 0x60, 0x12, 0x60, 0x91, - 0x60, 0x0B, 0x60, 0x04, 0xF2, 0x98, 0x81, 0x3C, 0x36, 0x01, 0x2E, 0x09, 0x89, 0x00, 0x06, 0x87, - 0x09, 0x7E, 0x1E, 0x8C, 0x49, 0xAC, 0x86, 0x7A, 0xE6, 0x7A, 0xA6, 0x00, 0x08, 0x5D, 0x10, 0x01, - 0x18, 0x80, 0x80, 0x04, 0x22, 0x02, 0x00, 0x0C, 0x28, 0x26, 0x30, 0x06, 0x82, 0xF4, 0x03, 0x03, - 0x4F, 0x52, 0x43, 0x17, - }; - - auto const stats = cudf::io::read_parsed_orc_statistics( - cudf::io::source_info{reinterpret_cast(nulls_orc.data()), nulls_orc.size()}); - - EXPECT_EQ(stats.file_stats[1].has_null, true); - EXPECT_EQ(stats.file_stats[2].has_null, false); - - EXPECT_EQ(stats.stripes_stats[0][1].has_null, true); - EXPECT_EQ(stats.stripes_stats[0][2].has_null, false); -} +// Base test fixture for tests +struct OrcWriterTest : public cudf::test::BaseFixture {}; struct OrcWriterTestStripes : public OrcWriterTest, @@ -1438,716 +165,18 @@ TEST_P(OrcWriterTestStripes, StripeSize) cudf::io::orc_chunked_writer(opts).write(expected->view()); validate(out_buffer_chunked); } - { - std::vector out_buffer; - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info(&out_buffer), expected->view()) - .stripe_size_rows(size_rows) - .stripe_size_bytes(size_bytes); - cudf::io::write_orc(out_opts); - validate(out_buffer); - } } INSTANTIATE_TEST_CASE_P(OrcWriterTest, OrcWriterTestStripes, - ::testing::Values(std::make_tuple(800000ul, 1000000), - std::make_tuple(2000000ul, 1000000), - std::make_tuple(4000000ul, 1000000), - std::make_tuple(8000000ul, 1000000), - 
std::make_tuple(8000000ul, 500000), - std::make_tuple(8000000ul, 250000), - std::make_tuple(8000000ul, 100000))); - -TEST_F(OrcWriterTest, StripeSizeInvalid) -{ - auto const unused_table = std::make_unique
(); - std::vector out_buffer; - - EXPECT_THROW( - cudf::io::orc_writer_options::builder(cudf::io::sink_info(&out_buffer), unused_table->view()) - .stripe_size_rows(511), - cudf::logic_error); - EXPECT_THROW( - cudf::io::orc_writer_options::builder(cudf::io::sink_info(&out_buffer), unused_table->view()) - .stripe_size_bytes(63 << 10), - cudf::logic_error); - EXPECT_THROW( - cudf::io::orc_writer_options::builder(cudf::io::sink_info(&out_buffer), unused_table->view()) - .row_index_stride(511), - cudf::logic_error); -} - -TEST_F(OrcWriterTest, TestMap) -{ - auto const num_rows = 1200000; - auto const lists_per_row = 4; - auto const num_child_rows = (num_rows * lists_per_row) / 2; // half due to validity - - auto keys = random_values(num_child_rows); - auto vals = random_values(num_child_rows); - auto vals_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; }); - int32_col keys_col(keys.begin(), keys.end()); - float32_col vals_col{vals.begin(), vals.end(), vals_mask}; - auto s_col = struct_col({keys_col, vals_col}).release(); - - auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); - - std::vector row_offsets(num_rows + 1); - int offset = 0; - for (int idx = 0; idx < (num_rows) + 1; ++idx) { - row_offsets[idx] = offset; - if (valids[idx]) { offset += lists_per_row; } - } - int32_col offsets(row_offsets.begin(), row_offsets.end()); - - auto num_list_rows = static_cast(offsets).size() - 1; - auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_list_rows); - auto list_col = cudf::make_lists_column( - num_list_rows, offsets.release(), std::move(s_col), null_count, std::move(null_mask)); - - table_view expected({*list_col}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_list_column_as_map(); - - auto filepath = temp_env->get_temp_filepath("MapColumn.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(expected_metadata); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_F(OrcReaderTest, NestedColumnSelection) -{ - auto const num_rows = 1000; - auto child_col1_data = random_values(num_rows); - auto child_col2_data = random_values(num_rows); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; }); - int32_col child_col1{child_col1_data.begin(), child_col1_data.end(), validity}; - int64_col child_col2{child_col2_data.begin(), child_col2_data.end(), validity}; - struct_col s_col{child_col1, child_col2}; - table_view expected({s_col}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("struct_s"); - expected_metadata.column_metadata[0].child(0).set_name("field_a"); - expected_metadata.column_metadata[0].child(1).set_name("field_b"); - - auto filepath = temp_env->get_temp_filepath("OrcNestedSelection.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(std::move(expected_metadata)); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = 
- cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) - .use_index(false) - .columns({"struct_s.field_b"}); - auto result = cudf::io::read_orc(in_opts); - - // Verify that only one child column is included in the output table - ASSERT_EQ(1, result.tbl->view().column(0).num_children()); - // Verify that the first child column is `field_b` - int64_col expected_col{child_col2_data.begin(), child_col2_data.end(), validity}; - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_col, result.tbl->view().column(0).child(0)); - ASSERT_EQ("field_b", result.metadata.schema_info[0].children[0].name); -} - -TEST_F(OrcReaderTest, DecimalOptions) -{ - constexpr auto num_rows = 10; - auto col_vals = random_values(num_rows); - auto col_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{col_vals[i], numeric::scale_type{2}}; - }); - auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 == 0; }); - - dec128_col col{col_data, col_data + num_rows, mask}; - table_view expected({col}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("dec"); - - auto filepath = temp_env->get_temp_filepath("OrcDecimalOptions.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(std::move(expected_metadata)); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options valid_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) - .decimal128_columns({"dec", "fake_name"}); - // Should not throw, even with "fake name" - EXPECT_NO_THROW(cudf::io::read_orc(valid_opts)); -} - -TEST_F(OrcWriterTest, DecimalOptionsNested) -{ - auto const num_rows = 100; - - auto dec_vals = random_values(num_rows); - auto dec1_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal64{dec_vals[i], numeric::scale_type{2}}; - }); - auto dec2_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{dec_vals[i], numeric::scale_type{2}}; - }); - dec64_col dec1_col(dec1_data, dec1_data + num_rows); - dec128_col dec2_col(dec2_data, dec2_data + num_rows); - auto child_struct_col = cudf::test::structs_column_wrapper{dec1_col, dec2_col}; - - auto int_vals = random_values(num_rows); - int32_col int_col(int_vals.begin(), int_vals.end()); - auto map_struct_col = struct_col({child_struct_col, int_col}).release(); - - std::vector row_offsets(num_rows + 1); - std::iota(row_offsets.begin(), row_offsets.end(), 0); - int32_col offsets(row_offsets.begin(), row_offsets.end()); - - auto map_list_col = cudf::make_lists_column( - num_rows, offsets.release(), std::move(map_struct_col), 0, rmm::device_buffer{}); - - table_view expected({*map_list_col}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("maps"); - expected_metadata.column_metadata[0].set_list_column_as_map(); - expected_metadata.column_metadata[0].child(1).child(0).child(0).set_name("dec64"); - expected_metadata.column_metadata[0].child(1).child(0).child(1).set_name("dec128"); - - auto filepath = temp_env->get_temp_filepath("OrcMultiColumn.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(std::move(expected_metadata)); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - 
cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) - .use_index(false) - // One less level of nesting because children of map columns are the child struct's children - .decimal128_columns({"maps.0.dec64"}); - auto result = cudf::io::read_orc(in_opts); - - // Both columns should be read as decimal128 - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result.tbl->view().column(0).child(1).child(0).child(0), - result.tbl->view().column(0).child(1).child(0).child(1)); -} - -TEST_F(OrcReaderTest, EmptyColumnsParam) -{ - srand(31337); - auto const expected = create_random_fixed_table(2, 4, false); - - std::vector out_buffer; - cudf::io::orc_writer_options args = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, *expected); - cudf::io::write_orc(args); - - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder( - cudf::io::source_info{out_buffer.data(), out_buffer.size()}) - .columns({}); - auto const result = cudf::io::read_orc(read_opts); - - EXPECT_EQ(result.tbl->num_columns(), 0); - EXPECT_EQ(result.tbl->num_rows(), 0); -} - -TEST_F(OrcMetadataReaderTest, TestBasic) -{ - auto const num_rows = 1'200'000; - - auto ints = random_values(num_rows); - auto floats = random_values(num_rows); - int32_col int_col(ints.begin(), ints.end()); - float32_col float_col(floats.begin(), floats.end()); - - table_view expected({int_col, float_col}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("int_col"); - expected_metadata.column_metadata[1].set_name("float_col"); - - auto filepath = temp_env->get_temp_filepath("MetadataTest.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(std::move(expected_metadata)); - cudf::io::write_orc(out_opts); - - auto meta = read_orc_metadata(cudf::io::source_info{filepath}); - EXPECT_EQ(meta.num_rows(), num_rows); - - EXPECT_EQ(meta.schema().root().name(), ""); - EXPECT_EQ(meta.schema().root().type_kind(), cudf::io::orc::STRUCT); - ASSERT_EQ(meta.schema().root().num_children(), 2); - - EXPECT_EQ(meta.schema().root().child(0).name(), "int_col"); - EXPECT_EQ(meta.schema().root().child(1).name(), "float_col"); -} - -TEST_F(OrcMetadataReaderTest, TestNested) -{ - auto const num_rows = 1'200'000; - auto const lists_per_row = 4; - auto const num_child_rows = num_rows * lists_per_row; - - auto keys = random_values(num_child_rows); - auto vals = random_values(num_child_rows); - int32_col keys_col(keys.begin(), keys.end()); - float32_col vals_col(vals.begin(), vals.end()); - auto s_col = struct_col({keys_col, vals_col}).release(); - - std::vector row_offsets(num_rows + 1); - for (int idx = 0; idx < num_rows + 1; ++idx) { - row_offsets[idx] = idx * lists_per_row; - } - int32_col offsets(row_offsets.begin(), row_offsets.end()); - - auto list_col = - cudf::make_lists_column(num_rows, offsets.release(), std::move(s_col), 0, rmm::device_buffer{}); - - table_view expected({*list_col, *list_col}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("maps"); - expected_metadata.column_metadata[0].set_list_column_as_map(); - expected_metadata.column_metadata[1].set_name("lists"); - expected_metadata.column_metadata[1].child(1).child(0).set_name("int_field"); - expected_metadata.column_metadata[1].child(1).child(1).set_name("float_field"); - - auto filepath = temp_env->get_temp_filepath("MetadataTest.orc"); - 
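  // Write the nested columns, then verify the MAP/LIST/STRUCT schema via read_orc_metadata below.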
cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(std::move(expected_metadata)); - cudf::io::write_orc(out_opts); - - auto meta = read_orc_metadata(cudf::io::source_info{filepath}); - EXPECT_EQ(meta.num_rows(), num_rows); - - EXPECT_EQ(meta.schema().root().name(), ""); - EXPECT_EQ(meta.schema().root().type_kind(), cudf::io::orc::STRUCT); - ASSERT_EQ(meta.schema().root().num_children(), 2); - - auto const& out_map_col = meta.schema().root().child(0); - EXPECT_EQ(out_map_col.name(), "maps"); - EXPECT_EQ(out_map_col.type_kind(), cudf::io::orc::MAP); - ASSERT_EQ(out_map_col.num_children(), 2); - EXPECT_EQ(out_map_col.child(0).name(), ""); // keys (no name in ORC) - EXPECT_EQ(out_map_col.child(1).name(), ""); // values (no name in ORC) - - auto const& out_list_col = meta.schema().root().child(1); - EXPECT_EQ(out_list_col.name(), "lists"); - EXPECT_EQ(out_list_col.type_kind(), cudf::io::orc::LIST); - ASSERT_EQ(out_list_col.num_children(), 1); - - auto const& out_list_struct_col = out_list_col.child(0); - EXPECT_EQ(out_list_struct_col.name(), ""); // elements (no name in ORC) - EXPECT_EQ(out_list_struct_col.type_kind(), cudf::io::orc::STRUCT); - ASSERT_EQ(out_list_struct_col.num_children(), 2); - - auto const& out_int_col = out_list_struct_col.child(0); - EXPECT_EQ(out_int_col.name(), "int_field"); - EXPECT_EQ(out_int_col.type_kind(), cudf::io::orc::INT); - - auto const& out_float_col = out_list_struct_col.child(1); - EXPECT_EQ(out_float_col.name(), "float_field"); - EXPECT_EQ(out_float_col.type_kind(), cudf::io::orc::FLOAT); -} - -TEST_F(OrcReaderTest, ZstdMaxCompressionRate) -{ - if (cudf::io::nvcomp::is_decompression_disabled(cudf::io::nvcomp::compression_type::ZSTD) or - cudf::io::nvcomp::is_compression_disabled(cudf::io::nvcomp::compression_type::ZSTD)) { - GTEST_SKIP() << "Newer nvCOMP version is required"; - } - - // Encodes as 64KB of zeros, which compresses to 18 bytes with ZSTD - std::vector const h_data(8 * 1024); - float32_col col(h_data.begin(), h_data.end()); - table_view expected({col}); - - auto filepath = temp_env->get_temp_filepath("OrcHugeCompRatio.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .compression(cudf::io::compression_type::ZSTD); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TEST_F(OrcWriterTest, CompStats) -{ - auto table = create_random_fixed_table(1, 100000, true); - - auto const stats = std::make_shared(); - - std::vector unused_buffer; - cudf::io::orc_writer_options opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{&unused_buffer}, table->view()) - .compression_statistics(stats); - cudf::io::write_orc(opts); - - EXPECT_NE(stats->num_compressed_bytes(), 0); - EXPECT_EQ(stats->num_failed_bytes(), 0); - EXPECT_EQ(stats->num_skipped_bytes(), 0); - EXPECT_FALSE(std::isnan(stats->compression_ratio())); -} - -TEST_F(OrcChunkedWriterTest, CompStats) -{ - auto table = create_random_fixed_table(1, 100000, true); - - auto const stats = std::make_shared(); - - std::vector unused_buffer; - cudf::io::chunked_orc_writer_options opts = - cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&unused_buffer}) - .compression_statistics(stats); - 
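  // Writer stats accumulate across chunked writes; the second write further below is
  // expected to exactly double num_compressed_bytes.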
cudf::io::orc_chunked_writer(opts).write(*table); - - EXPECT_NE(stats->num_compressed_bytes(), 0); - EXPECT_EQ(stats->num_failed_bytes(), 0); - EXPECT_EQ(stats->num_skipped_bytes(), 0); - EXPECT_FALSE(std::isnan(stats->compression_ratio())); - - auto const single_table_comp_stats = *stats; - cudf::io::orc_chunked_writer(opts).write(*table); - - EXPECT_EQ(stats->compression_ratio(), single_table_comp_stats.compression_ratio()); - EXPECT_EQ(stats->num_compressed_bytes(), 2 * single_table_comp_stats.num_compressed_bytes()); - - EXPECT_EQ(stats->num_failed_bytes(), 0); - EXPECT_EQ(stats->num_skipped_bytes(), 0); -} - -void expect_compression_stats_empty(std::shared_ptr stats) -{ - EXPECT_EQ(stats->num_compressed_bytes(), 0); - EXPECT_EQ(stats->num_failed_bytes(), 0); - EXPECT_EQ(stats->num_skipped_bytes(), 0); - EXPECT_TRUE(std::isnan(stats->compression_ratio())); -} - -TEST_F(OrcWriterTest, CompStatsEmptyTable) -{ - auto table_no_rows = create_random_fixed_table(20, 0, false); - - auto const stats = std::make_shared(); - - std::vector unused_buffer; - cudf::io::orc_writer_options opts = cudf::io::orc_writer_options::builder( - cudf::io::sink_info{&unused_buffer}, table_no_rows->view()) - .compression_statistics(stats); - cudf::io::write_orc(opts); - - expect_compression_stats_empty(stats); -} - -TEST_F(OrcChunkedWriterTest, CompStatsEmptyTable) -{ - auto table_no_rows = create_random_fixed_table(20, 0, false); - - auto const stats = std::make_shared(); - - std::vector unused_buffer; - cudf::io::chunked_orc_writer_options opts = - cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&unused_buffer}) - .compression_statistics(stats); - cudf::io::orc_chunked_writer(opts).write(*table_no_rows); - - expect_compression_stats_empty(stats); -} - -TEST_F(OrcWriterTest, EmptyRowGroup) -{ - std::vector ints(10000 + 5, -1); - auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i >= 10000; }); - int32_col col{ints.begin(), ints.end(), mask}; - table_view expected({col}); - - auto filepath = temp_env->get_temp_filepath("OrcEmptyRowGroup.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TEST_F(OrcWriterTest, NoNullsAsNonNullable) -{ - auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - column_wrapper col{{1, 2, 3}, valids}; - table_view expected({col}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_nullability(false); - - auto filepath = temp_env->get_temp_filepath("NonNullable.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(std::move(expected_metadata)); - // Writer should be able to write a column without nulls as non-nullable - EXPECT_NO_THROW(cudf::io::write_orc(out_opts)); -} - -TEST_F(OrcWriterTest, SlicedStringColumn) -{ - std::vector strings{"a", "bc", "def", "longer", "strings", "at the end"}; - str_col col(strings.begin(), strings.end()); - table_view expected({col}); - - // Slice the table to include the longer strings - auto expected_slice = cudf::slice(expected, {2, 6}); - - auto filepath = 
temp_env->get_temp_filepath("SlicedTable.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); -} - -TEST_F(OrcWriterTest, EmptyChildStringColumn) -{ - list_col col{{}, {}}; - table_view expected({col}); - - auto filepath = temp_env->get_temp_filepath("OrcEmptyChildStringColumn.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_orc(out_opts); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -template -void check_all_null_stats(cudf::io::column_statistics const& stats) -{ - EXPECT_EQ(stats.number_of_values, 0); - EXPECT_TRUE(stats.has_null); - - auto const ts = std::get(stats.type_specific_stats); - EXPECT_FALSE(ts.minimum.has_value()); - EXPECT_FALSE(ts.maximum.has_value()); - EXPECT_TRUE(ts.sum.has_value()); - EXPECT_EQ(*ts.sum, 0); -} - -TEST_F(OrcStatisticsTest, AllNulls) -{ - float64_col double_col({0., 0., 0.}, cudf::test::iterators::all_nulls()); - int32_col int_col({0, 0, 0}, cudf::test::iterators::all_nulls()); - str_col string_col({"", "", ""}, cudf::test::iterators::all_nulls()); - - cudf::table_view expected({int_col, double_col, string_col}); - - std::vector out_buffer; - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected); - cudf::io::write_orc(out_opts); - - auto const stats = cudf::io::read_parsed_orc_statistics( - cudf::io::source_info{out_buffer.data(), out_buffer.size()}); - - check_all_null_stats(stats.file_stats[1]); - check_all_null_stats(stats.file_stats[2]); - check_all_null_stats(stats.file_stats[3]); -} - -TEST_F(OrcWriterTest, UnorderedDictionary) -{ - std::vector strings{ - "BBBB", "BBBB", "CCCC", "BBBB", "CCCC", "EEEE", "CCCC", "AAAA", "DDDD", "EEEE"}; - str_col col(strings.begin(), strings.end()); - - table_view expected({col}); - - std::vector out_buffer_sorted; - cudf::io::orc_writer_options out_opts_sorted = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer_sorted}, expected); - cudf::io::write_orc(out_opts_sorted); - - cudf::io::orc_reader_options in_opts_sorted = cudf::io::orc_reader_options::builder( - cudf::io::source_info{out_buffer_sorted.data(), out_buffer_sorted.size()}); - auto const from_sorted = cudf::io::read_orc(in_opts_sorted).tbl; - - std::vector out_buffer_unsorted; - cudf::io::orc_writer_options out_opts_unsorted = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer_unsorted}, expected) - .enable_dictionary_sort(false); - cudf::io::write_orc(out_opts_unsorted); - - cudf::io::orc_reader_options in_opts_unsorted = cudf::io::orc_reader_options::builder( - cudf::io::source_info{out_buffer_unsorted.data(), out_buffer_unsorted.size()}); - auto const from_unsorted = cudf::io::read_orc(in_opts_unsorted).tbl; - - CUDF_TEST_EXPECT_TABLES_EQUAL(*from_sorted, *from_unsorted); -} - -TEST_F(OrcStatisticsTest, Empty) -{ - int32_col col0{}; - float64_col col1{}; - str_col col2{}; - dec64_col col3{}; - 
column_wrapper col4; - bool_col col5{}; - table_view expected({col0, col1, col2, col3, col4, col5}); - - std::vector out_buffer; - - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected); - cudf::io::write_orc(out_opts); - - auto const stats = cudf::io::read_parsed_orc_statistics( - cudf::io::source_info{out_buffer.data(), out_buffer.size()}); - - auto expected_column_names = std::vector{""}; - std::generate_n( - std::back_inserter(expected_column_names), - expected.num_columns(), - [starting_index = 0]() mutable { return "_col" + std::to_string(starting_index++); }); - EXPECT_EQ(stats.column_names, expected_column_names); - - EXPECT_EQ(stats.column_names.size(), 7); - EXPECT_EQ(stats.stripes_stats.size(), 0); - - auto const& fstats = stats.file_stats; - ASSERT_EQ(fstats.size(), 7); - auto& s0 = fstats[0]; - EXPECT_TRUE(s0.number_of_values.has_value()); - EXPECT_EQ(*s0.number_of_values, 0ul); - EXPECT_TRUE(s0.has_null.has_value()); - EXPECT_FALSE(*s0.has_null); - - auto& s1 = fstats[1]; - EXPECT_EQ(*s1.number_of_values, 0ul); - EXPECT_FALSE(*s1.has_null); - auto& ts1 = std::get(s1.type_specific_stats); - EXPECT_FALSE(ts1.minimum.has_value()); - EXPECT_FALSE(ts1.maximum.has_value()); - EXPECT_TRUE(ts1.sum.has_value()); - EXPECT_EQ(*ts1.sum, 0); - - auto& s2 = fstats[2]; - EXPECT_EQ(*s2.number_of_values, 0ul); - EXPECT_FALSE(*s2.has_null); - auto& ts2 = std::get(s2.type_specific_stats); - EXPECT_FALSE(ts2.minimum.has_value()); - EXPECT_FALSE(ts2.maximum.has_value()); - EXPECT_TRUE(ts2.sum.has_value()); - EXPECT_EQ(*ts2.sum, 0); - - auto& s3 = fstats[3]; - EXPECT_EQ(*s3.number_of_values, 0ul); - EXPECT_FALSE(*s3.has_null); - auto& ts3 = std::get(s3.type_specific_stats); - EXPECT_FALSE(ts3.minimum.has_value()); - EXPECT_FALSE(ts3.maximum.has_value()); - EXPECT_TRUE(ts3.sum.has_value()); - EXPECT_EQ(*ts3.sum, 0); - - auto& s4 = fstats[4]; - EXPECT_EQ(*s4.number_of_values, 0ul); - EXPECT_FALSE(*s4.has_null); - auto& ts4 = std::get(s4.type_specific_stats); - EXPECT_FALSE(ts4.minimum.has_value()); - EXPECT_FALSE(ts4.maximum.has_value()); - EXPECT_TRUE(ts4.sum.has_value()); - EXPECT_EQ(*ts4.sum, "0"); - - auto& s5 = fstats[5]; - EXPECT_EQ(*s5.number_of_values, 0ul); - EXPECT_FALSE(*s5.has_null); - auto& ts5 = std::get(s5.type_specific_stats); - EXPECT_FALSE(ts5.minimum.has_value()); - EXPECT_FALSE(ts5.maximum.has_value()); - EXPECT_FALSE(ts5.minimum_utc.has_value()); - EXPECT_FALSE(ts5.maximum_utc.has_value()); - EXPECT_FALSE(ts5.minimum_nanos.has_value()); - EXPECT_FALSE(ts5.maximum_nanos.has_value()); - - auto& s6 = fstats[6]; - EXPECT_EQ(*s6.number_of_values, 0ul); - EXPECT_FALSE(*s6.has_null); - auto& ts6 = std::get(s6.type_specific_stats); - EXPECT_EQ(ts6.count[0], 0); -} - -TEST_P(OrcCompressionTest, Basic) -{ - constexpr auto num_rows = 12000; - auto const compression_type = GetParam(); - - // Generate compressible data - auto int_sequence = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 100; }); - auto float_sequence = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i / 32; }); - - int32_col int_col(int_sequence, int_sequence + num_rows); - float32_col float_col(float_sequence, float_sequence + num_rows); - - table_view expected({int_col, float_col}); - - std::vector out_buffer; - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected) - .compression(compression_type); - cudf::io::write_orc(out_opts); - 
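  // Read back and verify a lossless round trip for the parameterized compression type.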
- cudf::io::orc_reader_options in_opts = cudf::io::orc_reader_options::builder( - cudf::io::source_info{out_buffer.data(), out_buffer.size()}); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -INSTANTIATE_TEST_CASE_P(OrcCompressionTest, - OrcCompressionTest, - ::testing::Values(cudf::io::compression_type::NONE, - cudf::io::compression_type::SNAPPY, - cudf::io::compression_type::LZ4, - cudf::io::compression_type::ZSTD)); - -TEST_F(OrcWriterTest, BounceBufferBug) -{ - auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 100; }); - - constexpr auto num_rows = 150000; - column_wrapper col(sequence, - sequence + num_rows); - table_view expected({col}); - - auto filepath = temp_env->get_temp_filepath("BounceBufferBug.orc"); - cudf::io::orc_writer_options out_opts = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .compression(cudf::io::compression_type::ZSTD); - cudf::io::write_orc(out_opts); -} - -CUDF_TEST_PROGRAM_MAIN() + ::testing::Values(std::make_tuple(800000ul, 1000000))); + +// INSTANTIATE_TEST_CASE_P(OrcWriterTest, +// OrcWriterTestStripes, +// ::testing::Values(std::make_tuple(800000ul, 1000000), +// std::make_tuple(2000000ul, 1000000), +// std::make_tuple(4000000ul, 1000000), +// std::make_tuple(8000000ul, 1000000), +// std::make_tuple(8000000ul, 500000), +// std::make_tuple(8000000ul, 250000), +// std::make_tuple(8000000ul, 100000))); From e7a15210df0e55100d9418778acd46e732a5cd49 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 22 Feb 2024 17:23:57 -0800 Subject: [PATCH 091/321] Implementing decode by chunks Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 62 +++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 4bb86091fb0..10cdaf31b48 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -59,6 +59,7 @@ namespace cudf::io::orc::detail { namespace { // TODO: update +// TODO: compute num stripes from chunks /** * @brief Decompresses the stripe data, at stream granularity. 
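 *
 * A sketch (based on the prepare_data() changes later in this series) of where
 * decompression sits in the chunked-read loop:
 * @code
 * while (_chunk_read_data.more_stripe_to_load()) {
 *   load_data();
 *   // each pass decodes one stripe chunk produced by the load step
 *   while (_chunk_read_data.more_stripe_to_decode()) { decompress_and_decode(); }
 * }
 * @endcode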
* @@ -683,10 +684,12 @@ void reader::impl::decompress_and_decode() { if (_file_itm_data.has_no_data()) { return; } - // auto const stripe_chunk = - // _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_decode_stripe_chunk++]; - // auto const stripe_start = stripe_chunk.start_idx; - // auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; + auto const stripe_chunk = + _chunk_read_data.decode_stripe_chunks[_chunk_read_data.curr_decode_stripe_chunk++]; + auto const stripe_start = stripe_chunk.start_idx; + auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; + + printf("decoding data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); auto const rows_to_skip = _file_itm_data.rows_to_skip; auto const rows_to_read = _file_itm_data.rows_to_read; @@ -710,6 +713,8 @@ void reader::impl::decompress_and_decode() auto& null_count_prefix_sums = _file_itm_data.null_count_prefix_sums; auto& lvl_chunks = _file_itm_data.lvl_data_chunks; + null_count_prefix_sums.clear(); + // TODO: move this to global step lvl_chunks.resize(_selected_columns.num_levels()); _out_buffers.resize(_selected_columns.num_levels()); @@ -718,7 +723,8 @@ void reader::impl::decompress_and_decode() // // // TODO: move this to reader_impl.cu, decomp and decode step - std::size_t num_stripes = selected_stripes.size(); + // std::size_t num_stripes = selected_stripes.size(); + std::size_t num_stripes = stripe_chunk.count; // Iterates through levels of nested columns, child column will be one level down // compared to parent column. @@ -794,15 +800,16 @@ void reader::impl::decompress_and_decode() std::size_t num_rowgroups = 0; // TODO: Stripe and stream idx must be by chunk. - std::size_t stripe_idx = 0; + // std::size_t stripe_idx = 0; std::size_t stream_idx = 0; - // std::vector, std::size_t>> read_tasks; - for (auto const& stripe : selected_stripes) { + for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { + // for (auto const& stripe : selected_stripes) { + auto const& stripe = selected_stripes[stripe_idx]; auto const stripe_info = stripe.stripe_info; auto const stripe_footer = stripe.stripe_footer; - auto const total_data_size = gather_stream_info_and_column_desc(stripe_idx, + auto const total_data_size = gather_stream_info_and_column_desc(stripe_idx - stripe_start, level, stripe_info, stripe_footer, @@ -830,7 +837,7 @@ void reader::impl::decompress_and_decode() } // Update chunks to reference streams pointers for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { - auto& chunk = chunks[stripe_idx][col_idx]; + auto& chunk = chunks[stripe_idx - stripe_start][col_idx]; // start row, number of rows in a each stripe and total number of rows // may change in lower levels of nesting chunk.start_row = (level == 0) @@ -877,7 +884,7 @@ void reader::impl::decompress_and_decode() stripe_start_row += num_rows_per_stripe; num_rowgroups += stripe_num_rowgroups; - stripe_idx++; + // stripe_idx++; } // for (stripe : selected_stripes) if (stripe_data.empty()) { continue; } @@ -903,17 +910,19 @@ void reader::impl::decompress_and_decode() } // Setup row group descriptors if using indexes if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto decomp_data = decompress_stripe_data(_file_itm_data.compinfo_map, - *_metadata.per_file_metadata[0].decompressor, - stripe_data, - stream_info, - chunks, - row_groups, - num_stripes, - _metadata.get_row_index_stride(), - level == 0, - _stream); - stripe_data.clear(); + auto decomp_data = decompress_stripe_data( + 
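        // Intended to restrict decompression to the streams of the stripes in the current chunk: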
_file_itm_data.compinfo_map,
+        *_metadata.per_file_metadata[0].decompressor,
+        stripe_data,
+        host_span(stream_info.data() + stripe_start, stripe_chunk.count),
+        chunks,
+        row_groups,
+        num_stripes,
+        _metadata.get_row_index_stride(),
+        level == 0,
+        _stream);
+      // TODO: fix this
+      // stripe_data.clear();
       stripe_data.push_back(std::move(decomp_data));
     } else {
       if (row_groups.size().first) {
@@ -1000,12 +1009,17 @@ void reader::impl::prepare_data(uint64_t skip_rows,
   while (_chunk_read_data.more_stripe_to_load()) {
     load_data();
     printf("done load data\n\n");
+
+    while (_chunk_read_data.more_stripe_to_decode()) {
+      decompress_and_decode();
+      _file_itm_data.out_buffers.push_back(std::move(_out_buffers));
+    }
   }

   // decompress_and_decode();
   // while (_chunk_read_data.more_stripe_to_decode()) {
-  decompress_and_decode();
-  _file_itm_data.out_buffers.push_back(std::move(_out_buffers));
   // }
 }

From fdad84e7b88df6af0ada5472175443612c358078 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Fri, 23 Feb 2024 10:48:04 -0800
Subject: [PATCH 092/321] Only to test

Signed-off-by: Nghia Truong
---
 cpp/include/cudf/io/orc.hpp            |  4 +-
 cpp/src/io/orc/reader_impl.cu          | 57 +++++++++++++++++++-------
 cpp/src/io/orc/reader_impl_chunking.cu |  6 ++-
 cpp/tests/io/orc_test.cpp              | 25 ++++++-----
 4 files changed, 64 insertions(+), 28 deletions(-)

diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index d512f4a6cc4..61f4681a3f4 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -1111,7 +1111,7 @@ class chunked_orc_writer_options {
    */
   void set_stripe_size_bytes(size_t size_bytes)
   {
-    CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size");
+    // CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size");
     _stripe_size_bytes = size_bytes;
   }

@@ -1127,7 +1127,7 @@ class chunked_orc_writer_options {
    */
   void set_stripe_size_rows(size_type size_rows)
   {
-    CUDF_EXPECTS(size_rows >= 512, "maximum stripe size cannot be smaller than 512");
+    // CUDF_EXPECTS(size_rows >= 512, "maximum stripe size cannot be smaller than 512");
     _stripe_size_rows = size_rows;
   }

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 10cdaf31b48..ab57dce9680 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -17,6 +17,8 @@
 // #define PRINT_DEBUG
 // TODO: remove
+#include 
+
 #include 
 //
 //
@@ -463,6 +465,8 @@ void decode_stream_data(std::size_t num_dicts,
 {
   auto const num_stripes = chunks.size().first;
   auto const num_columns = chunks.size().second;
+  printf("decode %d stripes\n", (int)num_stripes);
+
   thrust::counting_iterator col_idx_it(0);
   thrust::counting_iterator stripe_idx_it(0);

@@ -483,6 +487,7 @@ void decode_stream_data(std::size_t num_dicts,
     chunks.base_device_ptr(), global_dict.data(), num_columns, num_stripes, skip_rows, stream);

   if (level > 0) {
+    printf("update_null_mask\n");
     // Update nullmasks for children if parent was a struct and had null mask
     update_null_mask(chunks, out_buffers, stream, mr);
   }
@@ -508,13 +513,15 @@ void decode_stream_data(std::size_t num_dicts,
   CUDF_EXPECTS(num_errors == 0, "ORC data decode failed");

   std::for_each(col_idx_it + 0, col_idx_it + num_columns, [&](auto col_idx) {
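    // Sum this column's null count across all stripes decoded in this pass.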
out_buffers[col_idx].null_count() = std::accumulate( + stripe_idx_it + 0, + stripe_idx_it + num_stripes, + 0, + [&](auto null_count, auto const stripe_idx) { + printf( + "null count: %d => %d\n", (int)stripe_idx, (int)chunks[stripe_idx][col_idx].null_count); + return null_count + chunks[stripe_idx][col_idx].null_count; + }); }); } @@ -689,12 +696,18 @@ void reader::impl::decompress_and_decode() auto const stripe_start = stripe_chunk.start_idx; auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; - printf("decoding data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); + printf("\ndecoding data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); - auto const rows_to_skip = _file_itm_data.rows_to_skip; - auto const rows_to_read = _file_itm_data.rows_to_read; + // auto const rows_to_skip = _file_itm_data.rows_to_skip; + // auto const rows_to_read = _file_itm_data.rows_to_read; auto const& selected_stripes = _file_itm_data.selected_stripes; + auto const rows_to_skip = 0; + auto rows_to_read = 0; + for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { + rows_to_read += _metadata.per_file_metadata[0].ff.stripes[stripe_idx].numberOfRows; + } + // Set up table for converting timestamp columns from local to UTC time auto const tz_table = [&, &selected_stripes = selected_stripes] { auto const has_timestamp_column = std::any_of( @@ -780,6 +793,8 @@ void reader::impl::decompress_and_decode() // TODO: Fix logic to handle unaligned rows (rows_to_skip == 0); + printf(" use_index: %d\n", (int)use_index); + // Logically view streams as columns auto const& stream_info = _file_itm_data.lvl_stream_info[level]; @@ -805,6 +820,8 @@ void reader::impl::decompress_and_decode() for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { // for (auto const& stripe : selected_stripes) { + + printf("processing stripe_idx = %d\n", (int)stripe_idx); auto const& stripe = selected_stripes[stripe_idx]; auto const stripe_info = stripe.stripe_info; auto const stripe_footer = stripe.stripe_footer; @@ -823,14 +840,18 @@ void reader::impl::decompress_and_decode() &chunks); auto const is_stripe_data_empty = total_data_size == 0; + printf("is_stripe_data_empty: %d\n", (int)is_stripe_data_empty); + CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); auto dst_base = static_cast(stripe_data[stripe_idx].data()); auto const num_rows_per_stripe = stripe_info->numberOfRows; - auto const rowgroup_id = num_rowgroups; - auto stripe_num_rowgroups = 0; + printf(" num_rows_per_stripe : %d\n", (int)num_rows_per_stripe); + + auto const rowgroup_id = num_rowgroups; + auto stripe_num_rowgroups = 0; if (use_index) { stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / _metadata.get_row_index_stride(); @@ -877,7 +898,7 @@ void reader::impl::decompress_and_decode() } if (not is_stripe_data_empty) { for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { - chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; + chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k] + stripe_start].dst_pos; } } } @@ -968,6 +989,8 @@ void reader::impl::decompress_and_decode() _mr); if (nested_cols.size()) { + printf("have nested col\n"); + // Extract information to process nested child columns scan_null_counts(chunks, null_count_prefix_sums[level], _stream); @@ -1031,6 +1054,7 @@ table_with_metadata reader::impl::make_output_chunk() std::vector> out_columns; auto out_metadata = 
make_output_metadata(); +#if 0 // If no rows or stripes to read, return empty columns if (_file_itm_data.has_no_data() || !_chunk_read_data.has_next()) { std::transform(_selected_columns.levels[0].begin(), @@ -1048,6 +1072,7 @@ table_with_metadata reader::impl::make_output_chunk() }); return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; } +#endif // TODO: move this into decompress_and_decode // Create columns from buffer with respective schema information. @@ -1059,6 +1084,7 @@ table_with_metadata reader::impl::make_output_chunk() for (auto& buffers : _file_itm_data.out_buffers) { // out_columns.clear(); // TODO: remove + out_metadata = make_output_metadata(); std::transform(_selected_columns.levels[0].begin(), _selected_columns.levels[0].end(), @@ -1077,12 +1103,15 @@ table_with_metadata reader::impl::make_output_chunk() col_buffer, &out_metadata.schema_info.back(), std::nullopt, _stream); }); + printf("output col: \n"); + cudf::test::print(out_columns.front()->view()); + auto tbl = std::make_unique
(std::move(out_columns)); tabs.push_back(std::move(tbl)); tv.push_back(tabs.back()->view()); // - printf(" ----- decode one chunk\n"); + printf(" ----- decode one chunk, size = %d\n", tv.back().num_rows()); fflush(stdout); // // diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 3acd13964e7..aed03245f57 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -87,7 +87,10 @@ std::size_t gather_stream_info_and_column_desc( for (auto const& stream : stripefooter->streams) { if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { // Ignore reading this stream from source. - cudf::logger().warn("Unexpected stream in the input ORC source. The stream will be ignored."); + // cudf::logger().warn("Unexpected stream in the input ORC source. The stream will be + // ignored."); + printf("Unexpected stream in the input ORC source. The stream will be ignored\n"); + fflush(stdout); src_offset += stream.length; continue; } @@ -102,6 +105,7 @@ std::size_t gather_stream_info_and_column_desc( auto const schema_type = types[column_id]; if (!schema_type.subtypes.empty() && schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { + printf("present stream\n"); for (auto const& idx : schema_type.subtypes) { auto const child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; if (child_idx >= 0) { diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 36ef05ecc36..5bddacf635e 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -126,10 +126,10 @@ struct OrcWriterTestStripes : public OrcWriterTest, public ::testing::WithParamInterface> {}; -TEST_P(OrcWriterTestStripes, StripeSize) +TEST_F(OrcWriterTestStripes, StripeSize) { - constexpr auto num_rows = 1000000; - auto const [size_bytes, size_rows] = GetParam(); + constexpr auto num_rows = 50; + // auto const [size_bytes, size_rows] = GetParam(); auto const seq_col = random_values(num_rows); auto const validity = @@ -138,12 +138,15 @@ TEST_P(OrcWriterTestStripes, StripeSize) std::vector> cols; cols.push_back(col.release()); + + printf("input col: \n"); + cudf::test::print(cols.front()->view()); + auto const expected = std::make_unique
(std::move(cols)); auto validate = [&](std::vector const& orc_buffer) { - auto const expected_stripe_num = - std::max(num_rows / size_rows, (num_rows * sizeof(int64_t)) / size_bytes); - auto const stats = cudf::io::read_parsed_orc_statistics( + auto const expected_stripe_num = 1; + auto const stats = cudf::io::read_parsed_orc_statistics( cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); @@ -160,16 +163,16 @@ TEST_P(OrcWriterTestStripes, StripeSize) std::vector out_buffer_chunked; cudf::io::chunked_orc_writer_options opts = cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info(&out_buffer_chunked)) - .stripe_size_rows(size_rows) - .stripe_size_bytes(size_bytes); + .stripe_size_rows(1000); cudf::io::orc_chunked_writer(opts).write(expected->view()); + validate(out_buffer_chunked); } } -INSTANTIATE_TEST_CASE_P(OrcWriterTest, - OrcWriterTestStripes, - ::testing::Values(std::make_tuple(800000ul, 1000000))); +// INSTANTIATE_TEST_CASE_P(OrcWriterTest, +// OrcWriterTestStripes, +// ::testing::Values(std::make_tuple(800000ul, 1000000))); // INSTANTIATE_TEST_CASE_P(OrcWriterTest, // OrcWriterTestStripes, From 92844ec76a0ce7122e680629a4f34844b099cc4a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 23 Feb 2024 17:38:08 -0800 Subject: [PATCH 093/321] Fix bug Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 51 ++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index ab57dce9680..b02471c0880 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -77,6 +77,7 @@ namespace { * @return Device buffer to decompressed page data */ rmm::device_buffer decompress_stripe_data( + chunk const& stripe_chunk, stream_id_map const& compinfo_map, OrcDecompressor const& decompressor, host_span stripe_data, @@ -93,10 +94,26 @@ rmm::device_buffer decompress_stripe_data( std::size_t num_uncompressed_blocks = 0; std::size_t total_decomp_size = 0; - cudf::detail::hostdevice_vector compinfo( - 0, stream_info.size(), stream); + // printf("decompress #stripe: %d, ") + // TODO: use lvl_stripe_stream_chunks + std::size_t count{0}; for (auto const& info : stream_info) { + if (info.id.stripe_idx < stripe_chunk.start_idx || + info.id.stripe_idx >= stripe_chunk.start_idx + stripe_chunk.count) { + continue; + } + count++; + } + + cudf::detail::hostdevice_vector compinfo(0, count, stream); + + for (auto const& info : stream_info) { + if (info.id.stripe_idx < stripe_chunk.start_idx || + info.id.stripe_idx >= stripe_chunk.start_idx + stripe_chunk.count) { + continue; + } + #ifdef PRINT_DEBUG printf("collec stream again [%d, %d, %d, %d]: dst = %lu, length = %lu\n", (int)info.id.stripe_idx, @@ -931,19 +948,18 @@ void reader::impl::decompress_and_decode() } // Setup row group descriptors if using indexes if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto decomp_data = decompress_stripe_data( - _file_itm_data.compinfo_map, - *_metadata.per_file_metadata[0].decompressor, - stripe_data, - host_span(stream_info.data() + stripe_start, stripe_chunk.count), - chunks, - row_groups, - num_stripes, - _metadata.get_row_index_stride(), - level == 0, - _stream); - // TODO: fix this - // stripe_data.clear(); + auto decomp_data = decompress_stripe_data(stripe_chunk, + _file_itm_data.compinfo_map, + *_metadata.per_file_metadata[0].decompressor, + stripe_data, + stream_info, + chunks, + row_groups, + 
num_stripes,
+                                              _metadata.get_row_index_stride(),
+                                              level == 0,
+                                              _stream);
+    stripe_data.clear();
     stripe_data.push_back(std::move(decomp_data));
   } else {
     if (row_groups.size().first) {
       chunks.host_to_device_async(_stream);
@@ -1054,9 +1070,9 @@ table_with_metadata reader::impl::make_output_chunk()
   std::vector<std::unique_ptr<column>> out_columns;
   auto out_metadata = make_output_metadata();
 
-#if 0
   // If no rows or stripes to read, return empty columns
-  if (_file_itm_data.has_no_data() || !_chunk_read_data.has_next()) {
+  if (_file_itm_data.has_no_data() /*|| !_chunk_read_data.has_next()*/) {
+    printf("has no next\n");
     std::transform(_selected_columns.levels[0].begin(),
                    _selected_columns.levels[0].end(),
                    std::back_inserter(out_columns),
@@ -1072,7 +1088,6 @@ table_with_metadata reader::impl::make_output_chunk()
                    });
     return {std::make_unique<table>
(std::move(out_columns)), std::move(out_metadata)}; } -#endif // TODO: move this into decompress_and_decode // Create columns from buffer with respective schema information. From 370c00f48451a34926a6a0456e130070797cb8aa Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 23 Feb 2024 20:26:31 -0800 Subject: [PATCH 094/321] Fix bug Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 42 ++++++++++++++++++++++++++++++++--- cpp/tests/io/orc_test.cpp | 18 +++++++-------- 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index b02471c0880..78f622bf2fe 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -864,6 +864,9 @@ void reader::impl::decompress_and_decode() auto dst_base = static_cast(stripe_data[stripe_idx].data()); + printf("line %d\n", __LINE__); + fflush(stdout); + auto const num_rows_per_stripe = stripe_info->numberOfRows; printf(" num_rows_per_stripe : %d\n", (int)num_rows_per_stripe); @@ -873,6 +876,10 @@ void reader::impl::decompress_and_decode() stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / _metadata.get_row_index_stride(); } + + printf("line %d\n", __LINE__); + fflush(stdout); + // Update chunks to reference streams pointers for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { auto& chunk = chunks[stripe_idx - stripe_start][col_idx]; @@ -919,12 +926,19 @@ void reader::impl::decompress_and_decode() } } } + + printf("line %d\n", __LINE__); + fflush(stdout); + stripe_start_row += num_rows_per_stripe; num_rowgroups += stripe_num_rowgroups; // stripe_idx++; } // for (stripe : selected_stripes) + printf("line %d\n", __LINE__); + fflush(stdout); + if (stripe_data.empty()) { continue; } // Process dataset chunk pages into output columns @@ -946,8 +960,14 @@ void reader::impl::decompress_and_decode() return meta; }); } + + printf("line %d\n", __LINE__); + fflush(stdout); + // Setup row group descriptors if using indexes if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + printf("line %d\n", __LINE__); + fflush(stdout); auto decomp_data = decompress_stripe_data(stripe_chunk, _file_itm_data.compinfo_map, *_metadata.per_file_metadata[0].decompressor, @@ -959,8 +979,12 @@ void reader::impl::decompress_and_decode() _metadata.get_row_index_stride(), level == 0, _stream); - stripe_data.clear(); + // stripe_data.clear(); stripe_data.push_back(std::move(decomp_data)); + + printf("line %d\n", __LINE__); + fflush(stdout); + } else { if (row_groups.size().first) { chunks.host_to_device_async(_stream); @@ -978,6 +1002,9 @@ void reader::impl::decompress_and_decode() } } + printf("line %d\n", __LINE__); + fflush(stdout); + for (std::size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; for (std::size_t j = 0; j < num_stripes; ++j) { @@ -993,6 +1020,9 @@ void reader::impl::decompress_and_decode() _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr); } + printf("line %d\n", __LINE__); + fflush(stdout); + decode_stream_data(num_dict_entries, rows_to_skip, _metadata.get_row_index_stride(), @@ -1004,6 +1034,9 @@ void reader::impl::decompress_and_decode() _stream, _mr); + printf("line %d\n", __LINE__); + fflush(stdout); + if (nested_cols.size()) { printf("have nested col\n"); @@ -1026,6 +1059,9 @@ void reader::impl::decompress_and_decode() if (not buff_data.empty()) { generate_offsets_for_list(buff_data, _stream); } } + + printf("line %d\n", __LINE__); + fflush(stdout); } // 
end loop level } @@ -1118,8 +1154,8 @@ table_with_metadata reader::impl::make_output_chunk() col_buffer, &out_metadata.schema_info.back(), std::nullopt, _stream); }); - printf("output col: \n"); - cudf::test::print(out_columns.front()->view()); + // printf("output col: \n"); + // cudf::test::print(out_columns.front()->view()); auto tbl = std::make_unique
(std::move(out_columns)); tabs.push_back(std::move(tbl)); diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 5bddacf635e..2231125f5d8 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -128,7 +128,7 @@ struct OrcWriterTestStripes TEST_F(OrcWriterTestStripes, StripeSize) { - constexpr auto num_rows = 50; + constexpr auto num_rows = 1000000; // auto const [size_bytes, size_rows] = GetParam(); auto const seq_col = random_values(num_rows); @@ -139,16 +139,16 @@ TEST_F(OrcWriterTestStripes, StripeSize) std::vector> cols; cols.push_back(col.release()); - printf("input col: \n"); - cudf::test::print(cols.front()->view()); + // printf("input col: \n"); + // cudf::test::print(cols.front()->view()); auto const expected = std::make_unique
(std::move(cols)); auto validate = [&](std::vector const& orc_buffer) { - auto const expected_stripe_num = 1; - auto const stats = cudf::io::read_parsed_orc_statistics( - cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); - EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); + // auto const expected_stripe_num = 6; + // auto const stats = cudf::io::read_parsed_orc_statistics( + // cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); + // EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); cudf::io::orc_reader_options in_opts = cudf::io::orc_reader_options::builder( @@ -156,14 +156,14 @@ TEST_F(OrcWriterTestStripes, StripeSize) .use_index(false); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected->view(), result.tbl->view()); }; { std::vector out_buffer_chunked; cudf::io::chunked_orc_writer_options opts = cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info(&out_buffer_chunked)) - .stripe_size_rows(1000); + .stripe_size_rows(10000); cudf::io::orc_chunked_writer(opts).write(expected->view()); validate(out_buffer_chunked); From 00aa10485420d5ea32db6805b50e54b345d06c82 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 24 Feb 2024 08:10:57 -0800 Subject: [PATCH 095/321] Debugging Signed-off-by: Nghia Truong --- cpp/src/io/orc/aggregate_orc_metadata.cpp | 4 + cpp/src/io/orc/reader_impl.cu | 50 +- cpp/src/io/orc/reader_impl_chunking.cu | 11 +- cpp/tests/io/orc_test.cpp | 2018 ++++++++++++++++++++- 4 files changed, 2015 insertions(+), 68 deletions(-) diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 620294a1e47..18afdddd82a 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -200,6 +200,10 @@ aggregate_orc_metadata::select_stripes( // TODO: check for overflow here. 
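// [Aside] The TODO above concerns this accumulation: numberOfRows is an
// unsigned 64-bit count per stripe, so the running total can overflow. A
// minimal sketch of a checked add, assuming the total is kept in a
// non-negative int64_t (hypothetical helper, not part of this patch):
#include <cstdint>
#include <limits>
#include <stdexcept>

int64_t add_rows_checked(int64_t total, uint64_t stripe_rows)
{
  // Headroom left before the signed 64-bit maximum is reached.
  auto const headroom = static_cast<uint64_t>(std::numeric_limits<int64_t>::max() - total);
  if (stripe_rows > headroom) {
    throw std::overflow_error("selected stripes hold more rows than int64_t can represent");
  }
  return total + static_cast<int64_t>(stripe_rows);
}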
rows_to_read += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; + printf(" rows_to_read : %d / %d\n", + (int)per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows, + (int)rows_to_read); + printf(" stripe to read: %d-%d\n", (int)src_file_idx, (int)stripe_idx); } selected_stripes_mapping.emplace_back(static_cast(src_file_idx), std::move(stripe_infos)); diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 78f622bf2fe..f88b931bd2b 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -864,8 +864,8 @@ void reader::impl::decompress_and_decode() auto dst_base = static_cast(stripe_data[stripe_idx].data()); - printf("line %d\n", __LINE__); - fflush(stdout); + // printf("line %d\n", __LINE__); + // fflush(stdout); auto const num_rows_per_stripe = stripe_info->numberOfRows; printf(" num_rows_per_stripe : %d\n", (int)num_rows_per_stripe); @@ -877,8 +877,8 @@ void reader::impl::decompress_and_decode() _metadata.get_row_index_stride(); } - printf("line %d\n", __LINE__); - fflush(stdout); + // printf("line %d\n", __LINE__); + // fflush(stdout); // Update chunks to reference streams pointers for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { @@ -927,8 +927,8 @@ void reader::impl::decompress_and_decode() } } - printf("line %d\n", __LINE__); - fflush(stdout); + // printf("line %d\n", __LINE__); + // fflush(stdout); stripe_start_row += num_rows_per_stripe; num_rowgroups += stripe_num_rowgroups; @@ -936,8 +936,8 @@ void reader::impl::decompress_and_decode() // stripe_idx++; } // for (stripe : selected_stripes) - printf("line %d\n", __LINE__); - fflush(stdout); + // printf("line %d\n", __LINE__); + // fflush(stdout); if (stripe_data.empty()) { continue; } @@ -961,13 +961,13 @@ void reader::impl::decompress_and_decode() }); } - printf("line %d\n", __LINE__); - fflush(stdout); + // printf("line %d\n", __LINE__); + // fflush(stdout); // Setup row group descriptors if using indexes if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - printf("line %d\n", __LINE__); - fflush(stdout); + // printf("line %d\n", __LINE__); + // fflush(stdout); auto decomp_data = decompress_stripe_data(stripe_chunk, _file_itm_data.compinfo_map, *_metadata.per_file_metadata[0].decompressor, @@ -982,8 +982,8 @@ void reader::impl::decompress_and_decode() // stripe_data.clear(); stripe_data.push_back(std::move(decomp_data)); - printf("line %d\n", __LINE__); - fflush(stdout); + // printf("line %d\n", __LINE__); + // fflush(stdout); } else { if (row_groups.size().first) { @@ -1002,8 +1002,8 @@ void reader::impl::decompress_and_decode() } } - printf("line %d\n", __LINE__); - fflush(stdout); + // printf("line %d\n", __LINE__); + // fflush(stdout); for (std::size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; @@ -1020,8 +1020,8 @@ void reader::impl::decompress_and_decode() _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr); } - printf("line %d\n", __LINE__); - fflush(stdout); + // printf("line %d\n", __LINE__); + // fflush(stdout); decode_stream_data(num_dict_entries, rows_to_skip, @@ -1034,8 +1034,8 @@ void reader::impl::decompress_and_decode() _stream, _mr); - printf("line %d\n", __LINE__); - fflush(stdout); + // printf("line %d\n", __LINE__); + // fflush(stdout); if (nested_cols.size()) { printf("have nested col\n"); @@ -1060,8 +1060,8 @@ void reader::impl::decompress_and_decode() if (not buff_data.empty()) { generate_offsets_for_list(buff_data, _stream); } } - 
printf("line %d\n", __LINE__); - fflush(stdout); + // printf("line %d\n", __LINE__); + // fflush(stdout); } // end loop level } @@ -1083,13 +1083,13 @@ void reader::impl::prepare_data(uint64_t skip_rows, // load_data(); while (_chunk_read_data.more_stripe_to_load()) { load_data(); - printf("done load data\n\n"); while (_chunk_read_data.more_stripe_to_decode()) { decompress_and_decode(); _file_itm_data.out_buffers.push_back(std::move(_out_buffers)); } } + printf("done load and decode data\n\n"); // decompress_and_decode(); // while (_chunk_read_data.more_stripe_to_decode()) { @@ -1154,8 +1154,8 @@ table_with_metadata reader::impl::make_output_chunk() col_buffer, &out_metadata.schema_info.back(), std::nullopt, _stream); }); - // printf("output col: \n"); - // cudf::test::print(out_columns.front()->view()); + printf("output col: \n"); + cudf::test::print(out_columns.front()->view()); auto tbl = std::make_unique
(std::move(out_columns)); tabs.push_back(std::move(tbl)); diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index aed03245f57..6b72ea28a96 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -351,6 +351,8 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // Get the total number of stripes across all input files. auto const num_stripes = selected_stripes.size(); + printf("num load stripe: %d\n", (int)num_stripes); + stripe_data_read_chunks.resize(num_stripes); lvl_stripe_stream_chunks.resize(_selected_columns.num_levels()); @@ -460,6 +462,11 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // return; } + printf("total stripe sizes:\n"); + for (auto& size : total_stripe_sizes) { + printf("size: %ld, %zu\n", size.count, size.size_bytes); + } + // Compute the prefix sum of stripe data sizes. total_stripe_sizes.host_to_device_async(_stream); thrust::inclusive_scan(rmm::exec_policy(_stream), @@ -470,7 +477,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, total_stripe_sizes.device_to_host_sync(_stream); - printf("total stripe sizes:\n"); + printf("prefix sum total stripe sizes:\n"); for (auto& size : total_stripe_sizes) { printf("size: %ld, %zu\n", size.count, size.size_bytes); } @@ -521,7 +528,7 @@ void reader::impl::load_data() auto const stripe_start = stripe_chunk.start_idx; auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; - printf("loading data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); + printf("\n\nloading data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); // Prepare the buffer to read raw data onto. // TODO: clear all old buffer. diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 2231125f5d8..bb132e477dd 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -95,41 +95,1282 @@ std::unique_ptr create_random_fixed_table(cudf::size_type num_colum return std::make_unique(std::move(columns)); } +// Base test fixture for tests +struct OrcWriterTest : public cudf::test::BaseFixture {}; + +// Typed test fixture for numeric type tests +template +struct OrcWriterNumericTypeTest : public OrcWriterTest { + auto type() { return cudf::data_type{cudf::type_to_id()}; } +}; + +// Typed test fixture for timestamp type tests +template +struct OrcWriterTimestampTypeTest : public OrcWriterTest { + auto type() { return cudf::data_type{cudf::type_to_id()}; } +}; + +// Declare typed test cases +// TODO: Replace with `NumericTypes` when unsigned support is added. 
Issue #5351 +using SupportedTypes = cudf::test::Types; +TYPED_TEST_SUITE(OrcWriterNumericTypeTest, SupportedTypes); +using SupportedTimestampTypes = + cudf::test::RemoveIf>, + cudf::test::TimestampTypes>; +TYPED_TEST_SUITE(OrcWriterTimestampTypeTest, SupportedTimestampTypes); + +// Base test fixture for chunked writer tests +struct OrcChunkedWriterTest : public cudf::test::BaseFixture {}; + +// Typed test fixture for numeric type tests +template +struct OrcChunkedWriterNumericTypeTest : public OrcChunkedWriterTest { + auto type() { return cudf::data_type{cudf::type_to_id()}; } +}; + +// Declare typed test cases +TYPED_TEST_SUITE(OrcChunkedWriterNumericTypeTest, SupportedTypes); + +// Test fixture for reader tests +struct OrcReaderTest : public cudf::test::BaseFixture {}; + +// Test fixture for statistics tests +struct OrcStatisticsTest : public cudf::test::BaseFixture {}; + +// Test fixture for metadata tests +struct OrcMetadataReaderTest : public cudf::test::BaseFixture {}; + +struct OrcCompressionTest : public cudf::test::BaseFixture, + public ::testing::WithParamInterface {}; + namespace { // Generates a vector of uniform random values of type T template inline auto random_values(size_t size) { - std::vector values(size); + std::vector values(size); + + using T1 = T; + using uniform_distribution = + typename std::conditional_t, + std::bernoulli_distribution, + std::conditional_t, + std::uniform_real_distribution, + std::uniform_int_distribution>>; + + static constexpr auto seed = 0xf00d; + static std::mt19937 engine{seed}; + static uniform_distribution dist{}; + std::generate_n(values.begin(), size, [&]() { return T{dist(engine)}; }); + + return values; +} + +struct SkipRowTest { + int test_calls{0}; + SkipRowTest() {} + + std::unique_ptr
get_expected_result(std::string const& filepath, + int skip_rows, + int file_num_rows, + int read_num_rows) + { + auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + column_wrapper input_col( + sequence, sequence + file_num_rows); + table_view input_table({input_col}); + + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, input_table); + cudf::io::write_orc(out_opts); + + auto begin_sequence = sequence, end_sequence = sequence; + if (skip_rows < file_num_rows) { + begin_sequence += skip_rows; + end_sequence += std::min(skip_rows + read_num_rows, file_num_rows); + } + column_wrapper output_col(begin_sequence, + end_sequence); + std::vector> output_cols; + output_cols.push_back(output_col.release()); + return std::make_unique
(std::move(output_cols)); + } + + void test(int skip_rows, int file_num_rows, int read_num_rows) + { + auto filepath = + temp_env->get_temp_filepath("SkipRowTest" + std::to_string(test_calls++) + ".orc"); + auto expected_result = get_expected_result(filepath, skip_rows, file_num_rows, read_num_rows); + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) + .use_index(false) + .skip_rows(skip_rows) + .num_rows(read_num_rows); + auto result = cudf::io::read_orc(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_result->view(), result.tbl->view()); + } + + void test(int skip_rows, int file_num_rows) + { + auto filepath = + temp_env->get_temp_filepath("SkipRowTest" + std::to_string(test_calls++) + ".orc"); + auto expected_result = + get_expected_result(filepath, skip_rows, file_num_rows, file_num_rows - skip_rows); + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) + .use_index(false) + .skip_rows(skip_rows); + auto result = cudf::io::read_orc(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_result->view(), result.tbl->view()); + } +}; + +} // namespace + +TYPED_TEST(OrcWriterNumericTypeTest, SingleColumn) +{ + auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + + constexpr auto num_rows = 100; + column_wrapper col(sequence, + sequence + num_rows); + table_view expected({col}); + + auto filepath = temp_env->get_temp_filepath("OrcSingleColumn.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); +} + +TYPED_TEST(OrcWriterNumericTypeTest, SingleColumnWithNulls) +{ + auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); }); + + constexpr auto num_rows = 100; + column_wrapper col( + sequence, sequence + num_rows, validity); + table_view expected({col}); + + auto filepath = temp_env->get_temp_filepath("OrcSingleColumnWithNulls.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); +} + +TYPED_TEST(OrcWriterTimestampTypeTest, Timestamps) +{ + auto sequence = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (std::rand() / 10); }); + + constexpr auto num_rows = 100; + column_wrapper col(sequence, + sequence + num_rows); + table_view expected({col}); + + auto filepath = temp_env->get_temp_filepath("OrcTimestamps.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) + .use_index(false) + .timestamp_type(this->type()); + auto result = 
cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); +} + +TYPED_TEST(OrcWriterTimestampTypeTest, TimestampsWithNulls) +{ + auto sequence = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (std::rand() / 10); }); + auto validity = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 30) && (i < 60); }); + + constexpr auto num_rows = 100; + column_wrapper col( + sequence, sequence + num_rows, validity); + table_view expected({col}); + + auto filepath = temp_env->get_temp_filepath("OrcTimestampsWithNulls.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) + .use_index(false) + .timestamp_type(this->type()); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); +} + +TYPED_TEST(OrcWriterTimestampTypeTest, TimestampOverflow) +{ + constexpr int64_t max = std::numeric_limits::max(); + auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return max - i; }); + + constexpr auto num_rows = 100; + column_wrapper col(sequence, + sequence + num_rows); + table_view expected({col}); + + auto filepath = temp_env->get_temp_filepath("OrcTimestampOverflow.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) + .use_index(false) + .timestamp_type(this->type()); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); +} + +TEST_F(OrcWriterTest, MultiColumn) +{ + constexpr auto num_rows = 10; + + auto col0_data = random_values(num_rows); + auto col1_data = random_values(num_rows); + auto col2_data = random_values(num_rows); + auto col3_data = random_values(num_rows); + auto col4_data = random_values(num_rows); + auto col5_data = random_values(num_rows); + auto col6_vals = random_values(num_rows); + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{col6_vals[i], numeric::scale_type{12}}; + }); + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{col6_vals[i], numeric::scale_type{-12}}; + }); + + bool_col col0(col0_data.begin(), col0_data.end()); + int8_col col1(col1_data.begin(), col1_data.end()); + int16_col col2(col2_data.begin(), col2_data.end()); + int32_col col3(col3_data.begin(), col3_data.end()); + float32_col col4(col4_data.begin(), col4_data.end()); + float64_col col5(col5_data.begin(), col5_data.end()); + dec128_col col6(col6_data, col6_data + num_rows); + dec128_col col7(col7_data, col7_data + num_rows); + + list_col col8{ + {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}}; + + int32_col child_col{48, 27, 25, 31, 351, 351, 29, 15, -1, -99}; + struct_col col9{child_col}; + + table_view expected({col0, col1, col2, col3, col4, col5, col6, col7, col8, col9}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("bools"); + expected_metadata.column_metadata[1].set_name("int8s"); + 
expected_metadata.column_metadata[2].set_name("int16s"); + expected_metadata.column_metadata[3].set_name("int32s"); + expected_metadata.column_metadata[4].set_name("floats"); + expected_metadata.column_metadata[5].set_name("doubles"); + expected_metadata.column_metadata[6].set_name("decimal_pos_scale"); + expected_metadata.column_metadata[7].set_name("decimal_neg_scale"); + expected_metadata.column_metadata[8].set_name("lists"); + expected_metadata.column_metadata[9].set_name("structs"); + + auto filepath = temp_env->get_temp_filepath("OrcMultiColumn.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(expected_metadata); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_F(OrcWriterTest, MultiColumnWithNulls) +{ + constexpr auto num_rows = 10; + + auto col0_data = random_values(num_rows); + auto col1_data = random_values(num_rows); + auto col2_data = random_values(num_rows); + auto col3_data = random_values(num_rows); + auto col4_data = random_values(num_rows); + auto col5_data = random_values(num_rows); + auto col6_vals = random_values(num_rows); + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal64{col6_vals[i], numeric::scale_type{2}}; + }); + auto col0_mask = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); }); + auto col1_mask = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 2); }); + auto col3_mask = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i == (num_rows - 1)); }); + auto col4_mask = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 4 && i <= 6); }); + auto col5_mask = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 8); }); + auto col6_mask = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 3); }); + + bool_col col0{col0_data.begin(), col0_data.end(), col0_mask}; + int8_col col1{col1_data.begin(), col1_data.end(), col1_mask}; + int16_col col2(col2_data.begin(), col2_data.end()); + int32_col col3{col3_data.begin(), col3_data.end(), col3_mask}; + float32_col col4{col4_data.begin(), col4_data.end(), col4_mask}; + float64_col col5{col5_data.begin(), col5_data.end(), col5_mask}; + dec64_col col6{col6_data, col6_data + num_rows, col6_mask}; + list_col col7{ + {{9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}}, + col0_mask}; + auto ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351, 29, 15, -1, -99}, {1, 0, 1, 1, 0, 1, 1, 1, 0, 1}}; + struct_col col8{{ages_col}, {0, 1, 1, 0, 1, 1, 0, 1, 1, 0}}; + table_view expected({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("bools"); + expected_metadata.column_metadata[1].set_name("int8s"); + expected_metadata.column_metadata[2].set_name("int16s"); + expected_metadata.column_metadata[3].set_name("int32s"); + expected_metadata.column_metadata[4].set_name("floats"); + expected_metadata.column_metadata[5].set_name("doubles"); + 
expected_metadata.column_metadata[6].set_name("decimal"); + expected_metadata.column_metadata[7].set_name("lists"); + expected_metadata.column_metadata[8].set_name("structs"); + + auto filepath = temp_env->get_temp_filepath("OrcMultiColumnWithNulls.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(expected_metadata); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_F(OrcWriterTest, ReadZeroRows) +{ + auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + + constexpr auto num_rows = 10; + column_wrapper col(sequence, + sequence + num_rows); + table_view expected({col}); + + auto filepath = temp_env->get_temp_filepath("OrcSingleColumn.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) + .use_index(false) + .num_rows(0); + auto result = cudf::io::read_orc(in_opts); + + EXPECT_EQ(0, result.tbl->num_rows()); + EXPECT_EQ(1, result.tbl->num_columns()); +} + +TEST_F(OrcWriterTest, Strings) +{ + std::vector strings{ + "Monday", "Monday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; + auto const num_rows = strings.size(); + + auto seq_col0 = random_values(num_rows); + auto seq_col2 = random_values(num_rows); + + int32_col col0(seq_col0.begin(), seq_col0.end()); + str_col col1(strings.begin(), strings.end()); + float32_col col2(seq_col2.begin(), seq_col2.end()); + + table_view expected({col0, col1, col2}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); + expected_metadata.column_metadata[1].set_name("col_string"); + expected_metadata.column_metadata[2].set_name("col_another"); + + auto filepath = temp_env->get_temp_filepath("OrcStrings.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(expected_metadata); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_F(OrcWriterTest, SlicedTable) +{ + // This test checks for writing zero copy, offsetted views into existing cudf tables + + std::vector strings{ + "Monday", "Monday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; + auto const num_rows = strings.size(); + + auto seq_col0 = random_values(num_rows); + auto seq_col2 = random_values(num_rows); + auto vals_col3 = random_values(num_rows); + auto seq_col3 = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal64{vals_col3[i], numeric::scale_type{2}}; + }); + + int32_col col0(seq_col0.begin(), seq_col0.end()); + str_col col1(strings.begin(), strings.end()); + float32_col col2(seq_col2.begin(), 
seq_col2.end()); + float32_col col3(seq_col3, seq_col3 + num_rows); + + list_col col4{ + {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}}; + + int16_col ages_col{{48, 27, 25, 31, 351, 351, 29, 15}, cudf::test::iterators::null_at(5)}; + struct_col col5{{ages_col}, cudf::test::iterators::null_at(4)}; + + table_view expected({col0, col1, col2, col3, col4, col5}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); + expected_metadata.column_metadata[1].set_name("col_string"); + expected_metadata.column_metadata[2].set_name("col_another"); + expected_metadata.column_metadata[3].set_name("col_decimal"); + expected_metadata.column_metadata[4].set_name("lists"); + expected_metadata.column_metadata[5].set_name("structs"); + + auto expected_slice = cudf::slice(expected, {2, static_cast(num_rows)}); + + auto filepath = temp_env->get_temp_filepath("SlicedTable.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .metadata(expected_metadata); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_slice, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_F(OrcWriterTest, HostBuffer) +{ + constexpr auto num_rows = 100 << 10; + auto const seq_col = random_values(num_rows); + int32_col col(seq_col.begin(), seq_col.end()); + + table_view expected{{col}}; + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); + + std::vector out_buffer; + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info(&out_buffer), expected) + .metadata(expected_metadata); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder( + cudf::io::source_info(out_buffer.data(), out_buffer.size())) + .use_index(false); + auto const result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_F(OrcWriterTest, negTimestampsNano) +{ + // This is a separate test because ORC format has a bug where writing a timestamp between -1 and 0 + // seconds from UNIX epoch is read as that timestamp + 1 second. We mimic that behavior and so + // this test has to hardcode test values which are < -1 second. 
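// [Aside] The quirk described above follows from ORC storing a timestamp as a
// seconds value plus a non-negative nanosecond remainder, so every instant in
// (-1, 0) seconds lands in the seconds == -1 bucket; readers that then nudge
// such values by one second produce the off-by-one readback this test mimics.
// A sketch of the decomposition (standalone illustration, not ORC/cudf code):
#include <cstdint>
#include <utility>

std::pair<int64_t, int64_t> to_seconds_and_nanos(int64_t epoch_ns)
{
  constexpr int64_t ns_per_s = 1'000'000'000;
  int64_t secs  = epoch_ns / ns_per_s;  // C++ division truncates toward zero
  int64_t nanos = epoch_ns % ns_per_s;
  if (nanos < 0) {  // shift the remainder into [0, ns_per_s)
    nanos += ns_per_s;
    --secs;
  }
  return {secs, nanos};  // e.g. -500'000'000 ns -> {-1, 500'000'000}
}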
+ // Details: https://github.com/rapidsai/cudf/pull/5529#issuecomment-648768925 + auto timestamps_ns = + cudf::test::fixed_width_column_wrapper{ + -131968727238000000, + -1530705634500000000, + -1674638741932929000, + }; + cudf::table_view expected({timestamps_ns}); + + auto filepath = temp_env->get_temp_filepath("OrcNegTimestamp.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); + + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + expected.column(0), result.tbl->view().column(0), cudf::test::debug_output_level::ALL_ERRORS); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); +} + +TEST_F(OrcWriterTest, Slice) +{ + int32_col col{{1, 2, 3, 4, 5}, cudf::test::iterators::null_at(3)}; + std::vector indices{2, 5}; + std::vector result = cudf::slice(col, indices); + cudf::table_view tbl{result}; + + auto filepath = temp_env->get_temp_filepath("Slice.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, tbl); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); + auto read_table = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(read_table.tbl->view(), tbl); +} + +TEST_F(OrcChunkedWriterTest, SingleTable) +{ + srand(31337); + auto table1 = create_random_fixed_table(5, 5, true); + + auto filepath = temp_env->get_temp_filepath("ChunkedSingle.orc"); + cudf::io::chunked_orc_writer_options opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::orc_chunked_writer(opts).write(*table1); + + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_orc(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *table1); +} + +TEST_F(OrcChunkedWriterTest, SimpleTable) +{ + srand(31337); + auto table1 = create_random_fixed_table(5, 5, true); + auto table2 = create_random_fixed_table(5, 5, true); + + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); + + auto filepath = temp_env->get_temp_filepath("ChunkedSimple.orc"); + cudf::io::chunked_orc_writer_options opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::orc_chunked_writer(opts).write(*table1).write(*table2); + + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_orc(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); +} + +TEST_F(OrcChunkedWriterTest, LargeTables) +{ + srand(31337); + auto table1 = create_random_fixed_table(512, 4096, true); + auto table2 = create_random_fixed_table(512, 8192, true); + + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); + + auto filepath = temp_env->get_temp_filepath("ChunkedLarge.orc"); + cudf::io::chunked_orc_writer_options opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::orc_chunked_writer(opts).write(*table1).write(*table2); + + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); + auto 
result = cudf::io::read_orc(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); +} + +TEST_F(OrcChunkedWriterTest, ManyTables) +{ + srand(31337); + std::vector> tables; + std::vector table_views; + constexpr int num_tables = 96; + for (int idx = 0; idx < num_tables; idx++) { + auto tbl = create_random_fixed_table(16, 64, true); + table_views.push_back(*tbl); + tables.push_back(std::move(tbl)); + } + + auto expected = cudf::concatenate(table_views); + + auto filepath = temp_env->get_temp_filepath("ChunkedManyTables.orc"); + cudf::io::chunked_orc_writer_options opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::orc_chunked_writer writer(opts); + std::for_each(table_views.begin(), table_views.end(), [&writer](table_view const& tbl) { + writer.write(tbl); + }); + writer.close(); + + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_orc(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *expected); +} + +TEST_F(OrcChunkedWriterTest, Metadata) +{ + std::vector strings{ + "Monday", "Tuesday", "THURSDAY", "Wednesday", "Friday", "Sunday", "Saturday"}; + auto const num_rows = strings.size(); + + auto seq_col0 = random_values(num_rows); + auto seq_col2 = random_values(num_rows); + + int32_col col0(seq_col0.begin(), seq_col0.end()); + str_col col1{strings.begin(), strings.end()}; + float32_col col2(seq_col2.begin(), seq_col2.end()); + + table_view expected({col0, col1, col2}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); + expected_metadata.column_metadata[1].set_name("col_string"); + expected_metadata.column_metadata[2].set_name("col_another"); + + auto filepath = temp_env->get_temp_filepath("ChunkedMetadata.orc"); + cudf::io::chunked_orc_writer_options opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}) + .metadata(expected_metadata); + cudf::io::orc_chunked_writer(opts).write(expected).write(expected); + + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_orc(read_opts); + + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_F(OrcChunkedWriterTest, Strings) +{ + bool mask1[] = {true, true, false, true, true, true, true}; + std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; + str_col strings1(h_strings1.begin(), h_strings1.end(), mask1); + table_view tbl1({strings1}); + + bool mask2[] = {false, true, true, true, true, true, true}; + std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; + str_col strings2(h_strings2.begin(), h_strings2.end(), mask2); + table_view tbl2({strings2}); + + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); + + auto filepath = temp_env->get_temp_filepath("ChunkedStrings.orc"); + cudf::io::chunked_orc_writer_options opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::orc_chunked_writer(opts).write(tbl1).write(tbl2); + + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_orc(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *expected); +} + +TEST_F(OrcChunkedWriterTest, MismatchedTypes) +{ + srand(31337); + auto table1 = 
create_random_fixed_table(4, 4, true); + auto table2 = create_random_fixed_table(4, 4, true); + + auto filepath = temp_env->get_temp_filepath("ChunkedMismatchedTypes.orc"); + cudf::io::chunked_orc_writer_options opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::orc_chunked_writer writer(opts); + writer.write(*table1); + EXPECT_THROW(writer.write(*table2), cudf::logic_error); +} + +TEST_F(OrcChunkedWriterTest, ChunkedWritingAfterClosing) +{ + srand(31337); + auto table1 = create_random_fixed_table(4, 4, true); + + auto filepath = temp_env->get_temp_filepath("ChunkedWritingAfterClosing.orc"); + cudf::io::chunked_orc_writer_options opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::orc_chunked_writer writer(opts); + writer.write(*table1); + writer.close(); + EXPECT_THROW(writer.write(*table1), cudf::logic_error); +} + +TEST_F(OrcChunkedWriterTest, MismatchedStructure) +{ + srand(31337); + auto table1 = create_random_fixed_table(4, 4, true); + auto table2 = create_random_fixed_table(3, 4, true); + + auto filepath = temp_env->get_temp_filepath("ChunkedMismatchedStructure.orc"); + cudf::io::chunked_orc_writer_options opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::orc_chunked_writer writer(opts); + writer.write(*table1); + EXPECT_THROW(writer.write(*table2), cudf::logic_error); +} + +TEST_F(OrcChunkedWriterTest, ReadStripes) +{ + srand(31337); + auto table1 = create_random_fixed_table(1, 5, true); + auto table2 = create_random_fixed_table(1, 6, true); + + auto full_table = cudf::concatenate(std::vector({*table2, *table1, *table2})); + + auto filepath = temp_env->get_temp_filepath("ChunkedStripes.orc"); + cudf::io::chunked_orc_writer_options opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::orc_chunked_writer(opts).write(*table1).write(*table2); + + printf("tab 1: \n"); + cudf::test::print(table1->get_column(0).view()); + + printf("tab 2: \n"); + cudf::test::print(table2->get_column(0).view()); + + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).stripes({{1, 0, 1}}); + auto result = cudf::io::read_orc(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); +} + +TEST_F(OrcChunkedWriterTest, ReadStripesError) +{ + srand(31337); + auto table1 = create_random_fixed_table(5, 5, true); + + auto filepath = temp_env->get_temp_filepath("ChunkedStripesError.orc"); + cudf::io::chunked_orc_writer_options opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::orc_chunked_writer(opts).write(*table1); + + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).stripes({{0, 1}}); + EXPECT_THROW(cudf::io::read_orc(read_opts), cudf::logic_error); + read_opts.set_stripes({{-1}}); + EXPECT_THROW(cudf::io::read_orc(read_opts), cudf::logic_error); +} + +TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) +{ + // write out two 31 row tables and make sure they get + // read back with all their validity bits in the right place + + using T = TypeParam; + + int num_els = 31; + + bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true}; + + T c1a[num_els]; + std::fill(c1a, c1a + num_els, 
static_cast(5)); + T c1b[num_els]; + std::fill(c1b, c1b + num_els, static_cast(6)); + column_wrapper c1a_w(c1a, c1a + num_els, mask); + column_wrapper c1b_w(c1b, c1b + num_els, mask); + table_view tbl1({c1a_w, c1b_w}); + + T c2a[num_els]; + std::fill(c2a, c2a + num_els, static_cast(8)); + T c2b[num_els]; + std::fill(c2b, c2b + num_els, static_cast(9)); + column_wrapper c2a_w(c2a, c2a + num_els, mask); + column_wrapper c2b_w(c2b, c2b + num_els, mask); + table_view tbl2({c2a_w, c2b_w}); + + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); + + auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize.orc"); + cudf::io::chunked_orc_writer_options opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::orc_chunked_writer(opts).write(tbl1).write(tbl2); + + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_orc(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *expected); +} + +TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) +{ + // write out two 33 row tables and make sure they get + // read back with all their validity bits in the right place + + using T = TypeParam; + + int num_els = 33; + + bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true}; + + T c1a[num_els]; + std::fill(c1a, c1a + num_els, static_cast(5)); + T c1b[num_els]; + std::fill(c1b, c1b + num_els, static_cast(6)); + column_wrapper c1a_w(c1a, c1a + num_els, mask); + column_wrapper c1b_w(c1b, c1b + num_els, mask); + table_view tbl1({c1a_w, c1b_w}); + + T c2a[num_els]; + std::fill(c2a, c2a + num_els, static_cast(8)); + T c2b[num_els]; + std::fill(c2b, c2b + num_els, static_cast(9)); + column_wrapper c2a_w(c2a, c2a + num_els, mask); + column_wrapper c2b_w(c2b, c2b + num_els, mask); + table_view tbl2({c2a_w, c2b_w}); + + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); + + auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize2.orc"); + cudf::io::chunked_orc_writer_options opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::orc_chunked_writer(opts).write(tbl1).write(tbl2); + + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_orc(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *expected); +} + +TEST_F(OrcReaderTest, CombinedSkipRowTest) +{ + SkipRowTest skip_row; + skip_row.test(50, 75); + skip_row.test(2, 100); + skip_row.test(2, 100, 50); + skip_row.test(2, 100, 98); + skip_row.test(2, 100, 99); + skip_row.test(2, 100, 100); + skip_row.test(2, 100, 110); +} + +TEST_F(OrcStatisticsTest, Basic) +{ + auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + auto ts_sequence = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i - 4) * 1000002; }); + auto dec_sequence = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i * 1001; }); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + + std::vector strings{ + "Monday", "Monday", "Friday", "Monday", "Friday", "Friday", "Friday", "Wednesday", "Tuesday"}; + int num_rows = strings.size(); + + column_wrapper col1( + sequence, sequence + 
num_rows, validity); + column_wrapper col2( + sequence, sequence + num_rows, validity); + str_col col3{strings.begin(), strings.end()}; + column_wrapper col4( + ts_sequence, ts_sequence + num_rows, validity); + column_wrapper col5( + ts_sequence, ts_sequence + num_rows, validity); + bool_col col6({true, true, true, true, true, false, false, false, false}, validity); + + cudf::test::fixed_point_column_wrapper col7( + dec_sequence, dec_sequence + num_rows, numeric::scale_type{-1}); + + table_view expected({col1, col2, col3, col4, col5, col6, col7}); + + auto filepath = temp_env->get_temp_filepath("OrcStatsMerge.orc"); + + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_orc(out_opts); + + auto const stats = cudf::io::read_parsed_orc_statistics(cudf::io::source_info{filepath}); + + auto expected_column_names = std::vector{""}; + std::generate_n( + std::back_inserter(expected_column_names), + expected.num_columns(), + [starting_index = 0]() mutable { return "_col" + std::to_string(starting_index++); }); + EXPECT_EQ(stats.column_names, expected_column_names); + + auto validate_statistics = [&](std::vector const& stats) { + ASSERT_EQ(stats.size(), expected.num_columns() + 1); + auto& s0 = stats[0]; + EXPECT_EQ(*s0.number_of_values, 9ul); + EXPECT_TRUE(s0.has_null.has_value()); + EXPECT_FALSE(*s0.has_null); + + auto& s1 = stats[1]; + EXPECT_EQ(*s1.number_of_values, 4ul); + EXPECT_TRUE(*s1.has_null); + auto& ts1 = std::get(s1.type_specific_stats); + EXPECT_EQ(*ts1.minimum, 1); + EXPECT_EQ(*ts1.maximum, 7); + EXPECT_EQ(*ts1.sum, 16); + + auto& s2 = stats[2]; + EXPECT_EQ(*s2.number_of_values, 4ul); + EXPECT_TRUE(*s2.has_null); + auto& ts2 = std::get(s2.type_specific_stats); + EXPECT_EQ(*ts2.minimum, 1.); + EXPECT_EQ(*ts2.maximum, 7.); + EXPECT_EQ(*ts2.sum, 16.); + + auto& s3 = stats[3]; + EXPECT_EQ(*s3.number_of_values, 9ul); + EXPECT_FALSE(*s3.has_null); + auto& ts3 = std::get(s3.type_specific_stats); + EXPECT_EQ(*ts3.minimum, "Friday"); + EXPECT_EQ(*ts3.maximum, "Wednesday"); + EXPECT_EQ(*ts3.sum, 58ul); + + auto& s4 = stats[4]; + EXPECT_EQ(*s4.number_of_values, 4ul); + EXPECT_TRUE(*s4.has_null); + auto& ts4 = std::get(s4.type_specific_stats); + EXPECT_EQ(*ts4.minimum, -4); + EXPECT_EQ(*ts4.maximum, 3); + EXPECT_EQ(*ts4.minimum_utc, -4); + EXPECT_EQ(*ts4.maximum_utc, 3); + EXPECT_EQ(*ts4.minimum_nanos, 999994); + EXPECT_EQ(*ts4.maximum_nanos, 6); + + auto& s5 = stats[5]; + EXPECT_EQ(*s5.number_of_values, 4ul); + EXPECT_TRUE(*s5.has_null); + auto& ts5 = std::get(s5.type_specific_stats); + EXPECT_EQ(*ts5.minimum, -3001); + EXPECT_EQ(*ts5.maximum, 3000); + EXPECT_EQ(*ts5.minimum_utc, -3001); + EXPECT_EQ(*ts5.maximum_utc, 3000); + EXPECT_EQ(*ts5.minimum_nanos, 994000); + EXPECT_EQ(*ts5.maximum_nanos, 6000); + + auto& s6 = stats[6]; + EXPECT_EQ(*s6.number_of_values, 4ul); + EXPECT_TRUE(*s6.has_null); + auto& ts6 = std::get(s6.type_specific_stats); + EXPECT_EQ(ts6.count[0], 2); + + auto& s7 = stats[7]; + EXPECT_EQ(*s7.number_of_values, 9ul); + EXPECT_FALSE(*s7.has_null); + auto& ts7 = std::get(s7.type_specific_stats); + EXPECT_EQ(*ts7.minimum, "0.0"); + EXPECT_EQ(*ts7.maximum, "800.8"); + EXPECT_EQ(*ts7.sum, "3603.6"); + }; + + validate_statistics(stats.file_stats); + // There's only one stripe, so column stats are the same as stripe stats + validate_statistics(stats.stripes_stats[0]); +} + +TEST_F(OrcWriterTest, SlicedValidMask) +{ + std::vector strings; + // Need more than 32 elements to reproduce the issue + 
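// [Aside] The 32 above is the width of a cudf validity-bitmask word: null bits
// are packed into 32-bit words, so a slice offset of 31 places the first row
// at the last bit of a word, the alignment this test targets. The index
// arithmetic, as a standalone sketch (illustrative, not cudf internals):
#include <cstdint>

constexpr int32_t bits_per_word = 32;
constexpr int32_t word_index(int32_t row, int32_t offset) { return (row + offset) / bits_per_word; }
constexpr int32_t bit_in_word(int32_t row, int32_t offset) { return (row + offset) % bits_per_word; }
// With offset 31: row 0 -> word 0, bit 31; row 1 already starts word 1.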
for (int i = 0; i < 34; ++i) + strings.emplace_back("a long string to make sure overflow affects the output"); + // An element is null only to enforce the output column to be nullable + str_col col{strings.begin(), strings.end(), cudf::test::iterators::null_at(32)}; + + // Bug tested here is easiest to reproduce when column_offset % 32 is 31 + std::vector indices{31, 34}; + auto sliced_col = cudf::slice(static_cast(col), indices); + cudf::table_view tbl{sliced_col}; + + cudf::io::table_input_metadata expected_metadata(tbl); + expected_metadata.column_metadata[0].set_name("col_string"); + + auto filepath = temp_env->get_temp_filepath("OrcStrings.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, tbl) + .metadata(expected_metadata); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(tbl, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_F(OrcReaderTest, SingleInputs) +{ + srand(31533); + auto table1 = create_random_fixed_table(5, 5, true); + + auto filepath1 = temp_env->get_temp_filepath("SimpleTable1.orc"); + cudf::io::orc_writer_options write_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath1}, table1->view()); + cudf::io::write_orc(write_opts); + + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{{filepath1}}); + auto result = cudf::io::read_orc(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *table1); +} + +TEST_F(OrcReaderTest, zstdCompressionRegression) +{ + if (cudf::io::nvcomp::is_decompression_disabled(cudf::io::nvcomp::compression_type::ZSTD)) { + GTEST_SKIP() << "Newer nvCOMP version is required"; + } + + // Test with zstd compressed orc file with high compression ratio. 
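// [Aside] A fixture like input_buffer below can be regenerated by writing a
// highly repetitive column as ZSTD-compressed ORC into a host buffer and
// dumping the bytes as hex. A sketch using the writer API seen elsewhere in
// this file (names are illustrative; this is not the code that produced the
// buffer):
std::vector<char> fixture_buf;
auto const fixture_vals = std::vector<int32_t>(1'920'800, 0);  // compresses extremely well
cudf::test::fixed_width_column_wrapper<int32_t> fixture_col(fixture_vals.begin(),
                                                            fixture_vals.end());
cudf::io::orc_writer_options fixture_opts =
  cudf::io::orc_writer_options::builder(cudf::io::sink_info(&fixture_buf),
                                        cudf::table_view({fixture_col}))
    .compression(cudf::io::compression_type::ZSTD);
cudf::io::write_orc(fixture_opts);
for (unsigned char c : fixture_buf) printf("0x%02x, ", c);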
+ constexpr uint8_t input_buffer[] = { + 0x4f, 0x52, 0x43, 0x5a, 0x00, 0x00, 0x28, 0xb5, 0x2f, 0xfd, 0xa4, 0x34, 0xc7, 0x03, 0x00, 0x74, + 0x00, 0x00, 0x18, 0x41, 0xff, 0xaa, 0x02, 0x00, 0xbb, 0xff, 0x45, 0xc8, 0x01, 0x25, 0x30, 0x04, + 0x65, 0x00, 0x00, 0x10, 0xaa, 0x1f, 0x02, 0x00, 0x01, 0x29, 0x0b, 0xc7, 0x39, 0xb8, 0x02, 0xcb, + 0xaf, 0x38, 0xc0, 0x07, 0x00, 0x00, 0x40, 0x01, 0xc0, 0x05, 0x00, 0x00, 0x46, 0x4d, 0x45, 0x00, + 0x00, 0x0a, 0x06, 0x08, 0x01, 0x10, 0x01, 0x18, 0x30, 0x0a, 0x06, 0x08, 0x02, 0x10, 0x01, 0x18, + 0x06, 0x0a, 0x06, 0x08, 0x03, 0x10, 0x01, 0x18, 0x05, 0x12, 0x02, 0x08, 0x00, 0x12, 0x04, 0x08, + 0x03, 0x10, 0x02, 0x59, 0x00, 0x00, 0x08, 0x03, 0x10, 0x63, 0x1a, 0x0c, 0x08, 0x03, 0x10, 0x00, + 0x18, 0x3b, 0x20, 0x25, 0x28, 0xa0, 0x9e, 0x75, 0x22, 0x10, 0x08, 0x0c, 0x12, 0x01, 0x01, 0x1a, + 0x09, 0x63, 0x64, 0x5f, 0x67, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x22, 0x02, 0x08, 0x07, 0x30, 0xa0, + 0x9e, 0x75, 0x08, 0x2f, 0x10, 0x05, 0x18, 0x80, 0x80, 0x10, 0x22, 0x02, 0x00, 0x0c, 0x28, 0x00, + 0x30, 0x09, 0x82, 0xf4, 0x03, 0x03, 0x4f, 0x52, 0x43, 0x17}; + + auto source = + cudf::io::source_info(reinterpret_cast(input_buffer), sizeof(input_buffer)); + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(source).use_index(false); + + cudf::io::table_with_metadata result; + CUDF_EXPECT_NO_THROW(result = cudf::io::read_orc(in_opts)); + EXPECT_EQ(1920800, result.tbl->num_rows()); +} + +TEST_F(OrcReaderTest, MultipleInputs) +{ + srand(31537); + auto table1 = create_random_fixed_table(5, 5, true); + auto table2 = create_random_fixed_table(5, 5, true); - using T1 = T; - using uniform_distribution = - typename std::conditional_t, - std::bernoulli_distribution, - std::conditional_t, - std::uniform_real_distribution, - std::uniform_int_distribution>>; + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); - static constexpr auto seed = 0xf00d; - static std::mt19937 engine{seed}; - static uniform_distribution dist{}; - std::generate_n(values.begin(), size, [&]() { return T{dist(engine)}; }); + auto const filepath1 = temp_env->get_temp_filepath("SimpleTable1.orc"); + { + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath1}, table1->view()); + cudf::io::write_orc(out_opts); + } - return values; + auto const filepath2 = temp_env->get_temp_filepath("SimpleTable2.orc"); + { + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath2}, table2->view()); + cudf::io::write_orc(out_opts); + } + + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{{filepath1, filepath2}}); + auto result = cudf::io::read_orc(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); } -} // namespace -// Base test fixture for tests -struct OrcWriterTest : public cudf::test::BaseFixture {}; +struct OrcWriterTestDecimal : public OrcWriterTest, + public ::testing::WithParamInterface> {}; + +TEST_P(OrcWriterTestDecimal, Decimal64) +{ + auto const [num_rows, scale] = GetParam(); + + // Using int16_t because scale causes values to overflow if they already require 32 bits + auto const vals = random_values(num_rows); + auto data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal64{vals[i], numeric::scale_type{scale}}; + }); + auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 7 == 0; }); + dec64_col col{data, data + num_rows, 
mask}; + cudf::table_view tbl({static_cast(col)}); + + auto filepath = temp_env->get_temp_filepath("Decimal64.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, tbl); + + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(tbl.column(0), result.tbl->view().column(0)); +} + +INSTANTIATE_TEST_CASE_P(OrcWriterTest, + OrcWriterTestDecimal, + ::testing::Combine(::testing::Values(1, 10000, 10001, 34567), + ::testing::Values(-2, 0, 2))); + +TEST_F(OrcWriterTest, Decimal32) +{ + constexpr auto num_rows = 12000; + + // Using int16_t because scale causes values to overflow if they already require 32 bits + auto const vals = random_values(num_rows); + auto data = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) { + return numeric::decimal32{vals[i], numeric::scale_type{2}}; + }); + auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13; }); + dec32_col col{data, data + num_rows, mask}; + cudf::table_view expected({col}); + + auto filepath = temp_env->get_temp_filepath("Decimal32.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); + + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col, result.tbl->view().column(0)); +} + +TEST_F(OrcStatisticsTest, Overflow) +{ + int num_rows = 10; + auto too_large_seq = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i * (std::numeric_limits::max() / 20); }); + auto too_small_seq = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i * (std::numeric_limits::min() / 20); }); + auto not_too_large_seq = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i * (std::numeric_limits::max() / 200); }); + auto not_too_small_seq = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i * (std::numeric_limits::min() / 200); }); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + + column_wrapper col1( + too_large_seq, too_large_seq + num_rows, validity); + column_wrapper col2( + too_small_seq, too_small_seq + num_rows, validity); + column_wrapper col3( + not_too_large_seq, not_too_large_seq + num_rows, validity); + column_wrapper col4( + not_too_small_seq, not_too_small_seq + num_rows, validity); + table_view tbl({col1, col2, col3, col4}); + + auto filepath = temp_env->get_temp_filepath("OrcStatsOverflow.orc"); + + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, tbl); + cudf::io::write_orc(out_opts); + + auto const stats = cudf::io::read_parsed_orc_statistics(cudf::io::source_info{filepath}); + + auto check_sum_exist = [&](int idx, bool expected) { + auto const& s = stats.file_stats[idx]; + auto const& ts = std::get(s.type_specific_stats); + EXPECT_EQ(ts.sum.has_value(), expected); + }; + check_sum_exist(1, false); + check_sum_exist(2, false); + check_sum_exist(3, true); + check_sum_exist(4, true); +} + +TEST_F(OrcStatisticsTest, HasNull) +{ + // This test can now be implemented with libcudf; keeping the pandas version to keep the test + 
// inputs diversified + // Method to create file: + // >>> import pandas as pd + // >>> df = pd.DataFrame({'a':pd.Series([1, 2, None], dtype="Int64"), 'b':[3, 4, 5]}) + // >>> df.to_orc("temp.orc") + // + // Contents of file: + // >>> import pyarrow.orc as po + // >>> po.ORCFile('temp.orc').read() + // pyarrow.Table + // a: int64 + // b: int64 + // ---- + // a: [[1,2,null]] + // b: [[3,4,5]] + auto nulls_orc = std::array{ + 0x4F, 0x52, 0x43, 0x1D, 0x00, 0x00, 0x0A, 0x0C, 0x0A, 0x04, 0x00, 0x00, 0x00, 0x00, 0x12, 0x04, + 0x08, 0x03, 0x50, 0x00, 0x2C, 0x00, 0x00, 0xE3, 0x12, 0xE7, 0x62, 0x67, 0x80, 0x00, 0x21, 0x1E, + 0x0E, 0x26, 0x21, 0x36, 0x0E, 0x26, 0x01, 0x16, 0x09, 0xB6, 0x00, 0x46, 0x00, 0x2C, 0x00, 0x00, + 0xE3, 0x12, 0xE7, 0x62, 0x67, 0x80, 0x00, 0x21, 0x1E, 0x0E, 0x66, 0x21, 0x36, 0x0E, 0x36, 0x01, + 0x2E, 0x09, 0x89, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0xFF, 0xE0, 0x05, 0x00, 0x00, 0xFF, 0xC0, + 0x07, 0x00, 0x00, 0x46, 0x01, 0x24, 0x05, 0x00, 0x00, 0xFF, 0xE0, 0x09, 0x00, 0x00, 0x46, 0x02, + 0x68, 0xA0, 0x68, 0x00, 0x00, 0xE3, 0x62, 0xE3, 0x60, 0x13, 0x60, 0x90, 0x10, 0xE4, 0x02, 0xD1, + 0x8C, 0x12, 0x92, 0x60, 0x9A, 0x09, 0x4C, 0x33, 0x00, 0xC5, 0x59, 0xC1, 0x34, 0x23, 0x98, 0x66, + 0x04, 0xD2, 0x6C, 0x60, 0x3E, 0x13, 0x94, 0xCF, 0x24, 0xC1, 0x2E, 0xC4, 0x02, 0x52, 0x07, 0x24, + 0x99, 0x60, 0xA4, 0x14, 0x73, 0x68, 0x88, 0x33, 0x00, 0x46, 0x00, 0x00, 0xE3, 0x52, 0xE2, 0x62, + 0xE1, 0x60, 0x0E, 0x60, 0xE0, 0xE2, 0xE1, 0x60, 0x12, 0x62, 0xE3, 0x60, 0x12, 0x60, 0x91, 0x60, + 0x0B, 0x60, 0x04, 0xF2, 0x98, 0x81, 0x3C, 0x36, 0x01, 0x2E, 0x09, 0x89, 0x00, 0x06, 0x00, 0xB4, + 0x00, 0x00, 0xE3, 0x60, 0x16, 0x98, 0xC6, 0x28, 0xC5, 0xC5, 0xC1, 0x2C, 0xE0, 0x2C, 0x21, 0xA3, + 0x60, 0xAE, 0xC1, 0xAC, 0x24, 0xC4, 0xC1, 0x23, 0xC4, 0xC4, 0xC8, 0x24, 0xC5, 0x98, 0x28, 0xC5, + 0x98, 0xA4, 0xC0, 0xA0, 0xC1, 0x60, 0xC0, 0xA0, 0xC4, 0xC1, 0xC1, 0x82, 0xCE, 0x32, 0x60, 0xB6, + 0x62, 0xE1, 0x60, 0x0E, 0x60, 0xB0, 0xE2, 0xE1, 0x60, 0x12, 0x62, 0xE3, 0x60, 0x12, 0x60, 0x91, + 0x60, 0x0B, 0x60, 0x04, 0xF2, 0x98, 0x81, 0x3C, 0x36, 0x01, 0x2E, 0x09, 0x89, 0x00, 0x06, 0x87, + 0x09, 0x7E, 0x1E, 0x8C, 0x49, 0xAC, 0x86, 0x7A, 0xE6, 0x7A, 0xA6, 0x00, 0x08, 0x5D, 0x10, 0x01, + 0x18, 0x80, 0x80, 0x04, 0x22, 0x02, 0x00, 0x0C, 0x28, 0x26, 0x30, 0x06, 0x82, 0xF4, 0x03, 0x03, + 0x4F, 0x52, 0x43, 0x17, + }; + + auto const stats = cudf::io::read_parsed_orc_statistics( + cudf::io::source_info{reinterpret_cast(nulls_orc.data()), nulls_orc.size()}); + + EXPECT_EQ(stats.file_stats[1].has_null, true); + EXPECT_EQ(stats.file_stats[2].has_null, false); + + EXPECT_EQ(stats.stripes_stats[0][1].has_null, true); + EXPECT_EQ(stats.stripes_stats[0][2].has_null, false); +} struct OrcWriterTestStripes : public OrcWriterTest, public ::testing::WithParamInterface> {}; -TEST_F(OrcWriterTestStripes, StripeSize) +TEST_P(OrcWriterTestStripes, StripeSize) { - constexpr auto num_rows = 1000000; - // auto const [size_bytes, size_rows] = GetParam(); + constexpr auto num_rows = 1000000; + auto const [size_bytes, size_rows] = GetParam(); auto const seq_col = random_values(num_rows); auto const validity = @@ -138,17 +1379,14 @@ TEST_F(OrcWriterTestStripes, StripeSize) std::vector> cols; cols.push_back(col.release()); - - // printf("input col: \n"); - // cudf::test::print(cols.front()->view()); - auto const expected = std::make_unique
(std::move(cols)); auto validate = [&](std::vector const& orc_buffer) { - // auto const expected_stripe_num = 6; - // auto const stats = cudf::io::read_parsed_orc_statistics( - // cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); - // EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); + auto const expected_stripe_num = + std::max(num_rows / size_rows, (num_rows * sizeof(int64_t)) / size_bytes); + auto const stats = cudf::io::read_parsed_orc_statistics( + cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); + EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); cudf::io::orc_reader_options in_opts = cudf::io::orc_reader_options::builder( @@ -163,23 +1401,721 @@ TEST_F(OrcWriterTestStripes, StripeSize) std::vector out_buffer_chunked; cudf::io::chunked_orc_writer_options opts = cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info(&out_buffer_chunked)) - .stripe_size_rows(10000); + .stripe_size_rows(size_rows) + .stripe_size_bytes(size_bytes); cudf::io::orc_chunked_writer(opts).write(expected->view()); - validate(out_buffer_chunked); } + { + std::vector out_buffer; + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info(&out_buffer), expected->view()) + .stripe_size_rows(size_rows) + .stripe_size_bytes(size_bytes); + cudf::io::write_orc(out_opts); + validate(out_buffer); + } +} + +INSTANTIATE_TEST_CASE_P(OrcWriterTest, + OrcWriterTestStripes, + ::testing::Values(std::make_tuple(800000ul, 1000000), + std::make_tuple(2000000ul, 1000000), + std::make_tuple(4000000ul, 1000000), + std::make_tuple(8000000ul, 1000000), + std::make_tuple(8000000ul, 500000), + std::make_tuple(8000000ul, 250000), + std::make_tuple(8000000ul, 100000))); + +TEST_F(OrcWriterTest, StripeSizeInvalid) +{ + auto const unused_table = std::make_unique
(); + std::vector out_buffer; + + EXPECT_THROW( + cudf::io::orc_writer_options::builder(cudf::io::sink_info(&out_buffer), unused_table->view()) + .stripe_size_rows(511), + cudf::logic_error); + EXPECT_THROW( + cudf::io::orc_writer_options::builder(cudf::io::sink_info(&out_buffer), unused_table->view()) + .stripe_size_bytes(63 << 10), + cudf::logic_error); + EXPECT_THROW( + cudf::io::orc_writer_options::builder(cudf::io::sink_info(&out_buffer), unused_table->view()) + .row_index_stride(511), + cudf::logic_error); +} + +TEST_F(OrcWriterTest, TestMap) +{ + auto const num_rows = 1200000; + auto const lists_per_row = 4; + auto const num_child_rows = (num_rows * lists_per_row) / 2; // half due to validity + + auto keys = random_values(num_child_rows); + auto vals = random_values(num_child_rows); + auto vals_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; }); + int32_col keys_col(keys.begin(), keys.end()); + float32_col vals_col{vals.begin(), vals.end(), vals_mask}; + auto s_col = struct_col({keys_col, vals_col}).release(); + + auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + + std::vector row_offsets(num_rows + 1); + int offset = 0; + for (int idx = 0; idx < (num_rows) + 1; ++idx) { + row_offsets[idx] = offset; + if (valids[idx]) { offset += lists_per_row; } + } + int32_col offsets(row_offsets.begin(), row_offsets.end()); + + auto num_list_rows = static_cast(offsets).size() - 1; + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_list_rows); + auto list_col = cudf::make_lists_column( + num_list_rows, offsets.release(), std::move(s_col), null_count, std::move(null_mask)); + + table_view expected({*list_col}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_list_column_as_map(); + + auto filepath = temp_env->get_temp_filepath("MapColumn.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(expected_metadata); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_F(OrcReaderTest, NestedColumnSelection) +{ + auto const num_rows = 1000; + auto child_col1_data = random_values(num_rows); + auto child_col2_data = random_values(num_rows); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; }); + int32_col child_col1{child_col1_data.begin(), child_col1_data.end(), validity}; + int64_col child_col2{child_col2_data.begin(), child_col2_data.end(), validity}; + struct_col s_col{child_col1, child_col2}; + table_view expected({s_col}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("struct_s"); + expected_metadata.column_metadata[0].child(0).set_name("field_a"); + expected_metadata.column_metadata[0].child(1).set_name("field_b"); + + auto filepath = temp_env->get_temp_filepath("OrcNestedSelection.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(std::move(expected_metadata)); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options 
in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) + .use_index(false) + .columns({"struct_s.field_b"}); + auto result = cudf::io::read_orc(in_opts); + + // Verify that only one child column is included in the output table + ASSERT_EQ(1, result.tbl->view().column(0).num_children()); + // Verify that the first child column is `field_b` + int64_col expected_col{child_col2_data.begin(), child_col2_data.end(), validity}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_col, result.tbl->view().column(0).child(0)); + ASSERT_EQ("field_b", result.metadata.schema_info[0].children[0].name); +} + +TEST_F(OrcReaderTest, DecimalOptions) +{ + constexpr auto num_rows = 10; + auto col_vals = random_values(num_rows); + auto col_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{col_vals[i], numeric::scale_type{2}}; + }); + auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 == 0; }); + + dec128_col col{col_data, col_data + num_rows, mask}; + table_view expected({col}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("dec"); + + auto filepath = temp_env->get_temp_filepath("OrcDecimalOptions.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(std::move(expected_metadata)); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options valid_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) + .decimal128_columns({"dec", "fake_name"}); + // Should not throw, even with "fake name" + EXPECT_NO_THROW(cudf::io::read_orc(valid_opts)); +} + +TEST_F(OrcWriterTest, DecimalOptionsNested) +{ + auto const num_rows = 100; + + auto dec_vals = random_values(num_rows); + auto dec1_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal64{dec_vals[i], numeric::scale_type{2}}; + }); + auto dec2_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{dec_vals[i], numeric::scale_type{2}}; + }); + dec64_col dec1_col(dec1_data, dec1_data + num_rows); + dec128_col dec2_col(dec2_data, dec2_data + num_rows); + auto child_struct_col = cudf::test::structs_column_wrapper{dec1_col, dec2_col}; + + auto int_vals = random_values(num_rows); + int32_col int_col(int_vals.begin(), int_vals.end()); + auto map_struct_col = struct_col({child_struct_col, int_col}).release(); + + std::vector row_offsets(num_rows + 1); + std::iota(row_offsets.begin(), row_offsets.end(), 0); + int32_col offsets(row_offsets.begin(), row_offsets.end()); + + auto map_list_col = cudf::make_lists_column( + num_rows, offsets.release(), std::move(map_struct_col), 0, rmm::device_buffer{}); + + table_view expected({*map_list_col}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("maps"); + expected_metadata.column_metadata[0].set_list_column_as_map(); + expected_metadata.column_metadata[0].child(1).child(0).child(0).set_name("dec64"); + expected_metadata.column_metadata[0].child(1).child(0).child(1).set_name("dec128"); + + auto filepath = temp_env->get_temp_filepath("OrcMultiColumn.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(std::move(expected_metadata)); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + 
cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) + .use_index(false) + // One less level of nesting because children of map columns are the child struct's children + .decimal128_columns({"maps.0.dec64"}); + auto result = cudf::io::read_orc(in_opts); + + // Both columns should be read as decimal128 + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result.tbl->view().column(0).child(1).child(0).child(0), + result.tbl->view().column(0).child(1).child(0).child(1)); +} + +TEST_F(OrcReaderTest, EmptyColumnsParam) +{ + srand(31337); + auto const expected = create_random_fixed_table(2, 4, false); + + std::vector out_buffer; + cudf::io::orc_writer_options args = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, *expected); + cudf::io::write_orc(args); + + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder( + cudf::io::source_info{out_buffer.data(), out_buffer.size()}) + .columns({}); + auto const result = cudf::io::read_orc(read_opts); + + EXPECT_EQ(result.tbl->num_columns(), 0); + EXPECT_EQ(result.tbl->num_rows(), 0); +} + +TEST_F(OrcMetadataReaderTest, TestBasic) +{ + auto const num_rows = 1'200'000; + + auto ints = random_values(num_rows); + auto floats = random_values(num_rows); + int32_col int_col(ints.begin(), ints.end()); + float32_col float_col(floats.begin(), floats.end()); + + table_view expected({int_col, float_col}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("int_col"); + expected_metadata.column_metadata[1].set_name("float_col"); + + auto filepath = temp_env->get_temp_filepath("MetadataTest.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(std::move(expected_metadata)); + cudf::io::write_orc(out_opts); + + auto meta = read_orc_metadata(cudf::io::source_info{filepath}); + EXPECT_EQ(meta.num_rows(), num_rows); + + EXPECT_EQ(meta.schema().root().name(), ""); + EXPECT_EQ(meta.schema().root().type_kind(), cudf::io::orc::STRUCT); + ASSERT_EQ(meta.schema().root().num_children(), 2); + + EXPECT_EQ(meta.schema().root().child(0).name(), "int_col"); + EXPECT_EQ(meta.schema().root().child(1).name(), "float_col"); +} + +TEST_F(OrcMetadataReaderTest, TestNested) +{ + auto const num_rows = 1'200'000; + auto const lists_per_row = 4; + auto const num_child_rows = num_rows * lists_per_row; + + auto keys = random_values(num_child_rows); + auto vals = random_values(num_child_rows); + int32_col keys_col(keys.begin(), keys.end()); + float32_col vals_col(vals.begin(), vals.end()); + auto s_col = struct_col({keys_col, vals_col}).release(); + + std::vector row_offsets(num_rows + 1); + for (int idx = 0; idx < num_rows + 1; ++idx) { + row_offsets[idx] = idx * lists_per_row; + } + int32_col offsets(row_offsets.begin(), row_offsets.end()); + + auto list_col = + cudf::make_lists_column(num_rows, offsets.release(), std::move(s_col), 0, rmm::device_buffer{}); + + table_view expected({*list_col, *list_col}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("maps"); + expected_metadata.column_metadata[0].set_list_column_as_map(); + expected_metadata.column_metadata[1].set_name("lists"); + expected_metadata.column_metadata[1].child(1).child(0).set_name("int_field"); + expected_metadata.column_metadata[1].child(1).child(1).set_name("float_field"); + + auto filepath = temp_env->get_temp_filepath("MetadataTest.orc"); + 
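+ // A list-of-struct flagged via set_list_column_as_map should come back as an ORC + // MAP with unnamed key/value children, while the plain list stays a LIST; both + // shapes are asserted on the metadata read back below.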
cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(std::move(expected_metadata)); + cudf::io::write_orc(out_opts); + + auto meta = read_orc_metadata(cudf::io::source_info{filepath}); + EXPECT_EQ(meta.num_rows(), num_rows); + + EXPECT_EQ(meta.schema().root().name(), ""); + EXPECT_EQ(meta.schema().root().type_kind(), cudf::io::orc::STRUCT); + ASSERT_EQ(meta.schema().root().num_children(), 2); + + auto const& out_map_col = meta.schema().root().child(0); + EXPECT_EQ(out_map_col.name(), "maps"); + EXPECT_EQ(out_map_col.type_kind(), cudf::io::orc::MAP); + ASSERT_EQ(out_map_col.num_children(), 2); + EXPECT_EQ(out_map_col.child(0).name(), ""); // keys (no name in ORC) + EXPECT_EQ(out_map_col.child(1).name(), ""); // values (no name in ORC) + + auto const& out_list_col = meta.schema().root().child(1); + EXPECT_EQ(out_list_col.name(), "lists"); + EXPECT_EQ(out_list_col.type_kind(), cudf::io::orc::LIST); + ASSERT_EQ(out_list_col.num_children(), 1); + + auto const& out_list_struct_col = out_list_col.child(0); + EXPECT_EQ(out_list_struct_col.name(), ""); // elements (no name in ORC) + EXPECT_EQ(out_list_struct_col.type_kind(), cudf::io::orc::STRUCT); + ASSERT_EQ(out_list_struct_col.num_children(), 2); + + auto const& out_int_col = out_list_struct_col.child(0); + EXPECT_EQ(out_int_col.name(), "int_field"); + EXPECT_EQ(out_int_col.type_kind(), cudf::io::orc::INT); + + auto const& out_float_col = out_list_struct_col.child(1); + EXPECT_EQ(out_float_col.name(), "float_field"); + EXPECT_EQ(out_float_col.type_kind(), cudf::io::orc::FLOAT); +} + +TEST_F(OrcReaderTest, ZstdMaxCompressionRate) +{ + if (cudf::io::nvcomp::is_decompression_disabled(cudf::io::nvcomp::compression_type::ZSTD) or + cudf::io::nvcomp::is_compression_disabled(cudf::io::nvcomp::compression_type::ZSTD)) { + GTEST_SKIP() << "Newer nvCOMP version is required"; + } + + // Encodes as 64KB of zeros, which compresses to 18 bytes with ZSTD + std::vector const h_data(8 * 1024); + float32_col col(h_data.begin(), h_data.end()); + table_view expected({col}); + + auto filepath = temp_env->get_temp_filepath("OrcHugeCompRatio.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .compression(cudf::io::compression_type::ZSTD); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); +} + +TEST_F(OrcWriterTest, CompStats) +{ + auto table = create_random_fixed_table(1, 100000, true); + + auto const stats = std::make_shared(); + + std::vector unused_buffer; + cudf::io::orc_writer_options opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&unused_buffer}, table->view()) + .compression_statistics(stats); + cudf::io::write_orc(opts); + + EXPECT_NE(stats->num_compressed_bytes(), 0); + EXPECT_EQ(stats->num_failed_bytes(), 0); + EXPECT_EQ(stats->num_skipped_bytes(), 0); + EXPECT_FALSE(std::isnan(stats->compression_ratio())); +} + +TEST_F(OrcChunkedWriterTest, CompStats) +{ + auto table = create_random_fixed_table(1, 100000, true); + + auto const stats = std::make_shared(); + + std::vector unused_buffer; + cudf::io::chunked_orc_writer_options opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&unused_buffer}) + .compression_statistics(stats); + 
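+ // The stats object is shared with the writer and accumulates across write() calls: + // the second write below must exactly double num_compressed_bytes while leaving + // the compression ratio unchanged.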
cudf::io::orc_chunked_writer(opts).write(*table); + + EXPECT_NE(stats->num_compressed_bytes(), 0); + EXPECT_EQ(stats->num_failed_bytes(), 0); + EXPECT_EQ(stats->num_skipped_bytes(), 0); + EXPECT_FALSE(std::isnan(stats->compression_ratio())); + + auto const single_table_comp_stats = *stats; + cudf::io::orc_chunked_writer(opts).write(*table); + + EXPECT_EQ(stats->compression_ratio(), single_table_comp_stats.compression_ratio()); + EXPECT_EQ(stats->num_compressed_bytes(), 2 * single_table_comp_stats.num_compressed_bytes()); + + EXPECT_EQ(stats->num_failed_bytes(), 0); + EXPECT_EQ(stats->num_skipped_bytes(), 0); +} + +void expect_compression_stats_empty(std::shared_ptr stats) +{ + EXPECT_EQ(stats->num_compressed_bytes(), 0); + EXPECT_EQ(stats->num_failed_bytes(), 0); + EXPECT_EQ(stats->num_skipped_bytes(), 0); + EXPECT_TRUE(std::isnan(stats->compression_ratio())); +} + +TEST_F(OrcWriterTest, CompStatsEmptyTable) +{ + auto table_no_rows = create_random_fixed_table(20, 0, false); + + auto const stats = std::make_shared(); + + std::vector unused_buffer; + cudf::io::orc_writer_options opts = cudf::io::orc_writer_options::builder( + cudf::io::sink_info{&unused_buffer}, table_no_rows->view()) + .compression_statistics(stats); + cudf::io::write_orc(opts); + + expect_compression_stats_empty(stats); +} + +TEST_F(OrcChunkedWriterTest, CompStatsEmptyTable) +{ + auto table_no_rows = create_random_fixed_table(20, 0, false); + + auto const stats = std::make_shared(); + + std::vector unused_buffer; + cudf::io::chunked_orc_writer_options opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&unused_buffer}) + .compression_statistics(stats); + cudf::io::orc_chunked_writer(opts).write(*table_no_rows); + + expect_compression_stats_empty(stats); +} + +TEST_F(OrcWriterTest, EmptyRowGroup) +{ + std::vector ints(10000 + 5, -1); + auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i >= 10000; }); + int32_col col{ints.begin(), ints.end(), mask}; + table_view expected({col}); + + auto filepath = temp_env->get_temp_filepath("OrcEmptyRowGroup.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_orc(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); +} + +TEST_F(OrcWriterTest, NoNullsAsNonNullable) +{ + auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + column_wrapper col{{1, 2, 3}, valids}; + table_view expected({col}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_nullability(false); + + auto filepath = temp_env->get_temp_filepath("NonNullable.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(std::move(expected_metadata)); + // Writer should be able to write a column without nulls as non-nullable + EXPECT_NO_THROW(cudf::io::write_orc(out_opts)); +} + +TEST_F(OrcWriterTest, SlicedStringColumn) +{ + std::vector strings{"a", "bc", "def", "longer", "strings", "at the end"}; + str_col col(strings.begin(), strings.end()); + table_view expected({col}); + + // Slice the table to include the longer strings + auto expected_slice = cudf::slice(expected, {2, 6}); + + auto filepath = 
temp_env->get_temp_filepath("SlicedTable.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_slice, result.tbl->view()); +} + +TEST_F(OrcWriterTest, EmptyChildStringColumn) +{ + list_col col{{}, {}}; + table_view expected({col}); + + auto filepath = temp_env->get_temp_filepath("OrcEmptyChildStringColumn.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); +} + +template +void check_all_null_stats(cudf::io::column_statistics const& stats) +{ + EXPECT_EQ(stats.number_of_values, 0); + EXPECT_TRUE(stats.has_null); + + auto const ts = std::get(stats.type_specific_stats); + EXPECT_FALSE(ts.minimum.has_value()); + EXPECT_FALSE(ts.maximum.has_value()); + EXPECT_TRUE(ts.sum.has_value()); + EXPECT_EQ(*ts.sum, 0); +} + +TEST_F(OrcStatisticsTest, AllNulls) +{ + float64_col double_col({0., 0., 0.}, cudf::test::iterators::all_nulls()); + int32_col int_col({0, 0, 0}, cudf::test::iterators::all_nulls()); + str_col string_col({"", "", ""}, cudf::test::iterators::all_nulls()); + + cudf::table_view expected({int_col, double_col, string_col}); + + std::vector out_buffer; + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected); + cudf::io::write_orc(out_opts); + + auto const stats = cudf::io::read_parsed_orc_statistics( + cudf::io::source_info{out_buffer.data(), out_buffer.size()}); + + check_all_null_stats(stats.file_stats[1]); + check_all_null_stats(stats.file_stats[2]); + check_all_null_stats(stats.file_stats[3]); +} + +TEST_F(OrcWriterTest, UnorderedDictionary) +{ + std::vector strings{ + "BBBB", "BBBB", "CCCC", "BBBB", "CCCC", "EEEE", "CCCC", "AAAA", "DDDD", "EEEE"}; + str_col col(strings.begin(), strings.end()); + + table_view expected({col}); + + std::vector out_buffer_sorted; + cudf::io::orc_writer_options out_opts_sorted = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer_sorted}, expected); + cudf::io::write_orc(out_opts_sorted); + + cudf::io::orc_reader_options in_opts_sorted = cudf::io::orc_reader_options::builder( + cudf::io::source_info{out_buffer_sorted.data(), out_buffer_sorted.size()}); + auto const from_sorted = cudf::io::read_orc(in_opts_sorted).tbl; + + std::vector out_buffer_unsorted; + cudf::io::orc_writer_options out_opts_unsorted = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer_unsorted}, expected) + .enable_dictionary_sort(false); + cudf::io::write_orc(out_opts_unsorted); + + cudf::io::orc_reader_options in_opts_unsorted = cudf::io::orc_reader_options::builder( + cudf::io::source_info{out_buffer_unsorted.data(), out_buffer_unsorted.size()}); + auto const from_unsorted = cudf::io::read_orc(in_opts_unsorted).tbl; + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*from_sorted, *from_unsorted); +} + +TEST_F(OrcStatisticsTest, Empty) +{ + int32_col col0{}; + float64_col col1{}; + str_col col2{}; + dec64_col 
col3{}; + column_wrapper col4; + bool_col col5{}; + table_view expected({col0, col1, col2, col3, col4, col5}); + + std::vector out_buffer; + + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected); + cudf::io::write_orc(out_opts); + + auto const stats = cudf::io::read_parsed_orc_statistics( + cudf::io::source_info{out_buffer.data(), out_buffer.size()}); + + auto expected_column_names = std::vector{""}; + std::generate_n( + std::back_inserter(expected_column_names), + expected.num_columns(), + [starting_index = 0]() mutable { return "_col" + std::to_string(starting_index++); }); + EXPECT_EQ(stats.column_names, expected_column_names); + + EXPECT_EQ(stats.column_names.size(), 7); + EXPECT_EQ(stats.stripes_stats.size(), 0); + + auto const& fstats = stats.file_stats; + ASSERT_EQ(fstats.size(), 7); + auto& s0 = fstats[0]; + EXPECT_TRUE(s0.number_of_values.has_value()); + EXPECT_EQ(*s0.number_of_values, 0ul); + EXPECT_TRUE(s0.has_null.has_value()); + EXPECT_FALSE(*s0.has_null); + + auto& s1 = fstats[1]; + EXPECT_EQ(*s1.number_of_values, 0ul); + EXPECT_FALSE(*s1.has_null); + auto& ts1 = std::get(s1.type_specific_stats); + EXPECT_FALSE(ts1.minimum.has_value()); + EXPECT_FALSE(ts1.maximum.has_value()); + EXPECT_TRUE(ts1.sum.has_value()); + EXPECT_EQ(*ts1.sum, 0); + + auto& s2 = fstats[2]; + EXPECT_EQ(*s2.number_of_values, 0ul); + EXPECT_FALSE(*s2.has_null); + auto& ts2 = std::get(s2.type_specific_stats); + EXPECT_FALSE(ts2.minimum.has_value()); + EXPECT_FALSE(ts2.maximum.has_value()); + EXPECT_TRUE(ts2.sum.has_value()); + EXPECT_EQ(*ts2.sum, 0); + + auto& s3 = fstats[3]; + EXPECT_EQ(*s3.number_of_values, 0ul); + EXPECT_FALSE(*s3.has_null); + auto& ts3 = std::get(s3.type_specific_stats); + EXPECT_FALSE(ts3.minimum.has_value()); + EXPECT_FALSE(ts3.maximum.has_value()); + EXPECT_TRUE(ts3.sum.has_value()); + EXPECT_EQ(*ts3.sum, 0); + + auto& s4 = fstats[4]; + EXPECT_EQ(*s4.number_of_values, 0ul); + EXPECT_FALSE(*s4.has_null); + auto& ts4 = std::get(s4.type_specific_stats); + EXPECT_FALSE(ts4.minimum.has_value()); + EXPECT_FALSE(ts4.maximum.has_value()); + EXPECT_TRUE(ts4.sum.has_value()); + EXPECT_EQ(*ts4.sum, "0"); + + auto& s5 = fstats[5]; + EXPECT_EQ(*s5.number_of_values, 0ul); + EXPECT_FALSE(*s5.has_null); + auto& ts5 = std::get(s5.type_specific_stats); + EXPECT_FALSE(ts5.minimum.has_value()); + EXPECT_FALSE(ts5.maximum.has_value()); + EXPECT_FALSE(ts5.minimum_utc.has_value()); + EXPECT_FALSE(ts5.maximum_utc.has_value()); + EXPECT_FALSE(ts5.minimum_nanos.has_value()); + EXPECT_FALSE(ts5.maximum_nanos.has_value()); + + auto& s6 = fstats[6]; + EXPECT_EQ(*s6.number_of_values, 0ul); + EXPECT_FALSE(*s6.has_null); + auto& ts6 = std::get(s6.type_specific_stats); + EXPECT_EQ(ts6.count[0], 0); +} + +TEST_P(OrcCompressionTest, Basic) +{ + constexpr auto num_rows = 12000; + auto const compression_type = GetParam(); + + // Generate compressible data + auto int_sequence = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 100; }); + auto float_sequence = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i / 32; }); + + int32_col int_col(int_sequence, int_sequence + num_rows); + float32_col float_col(float_sequence, float_sequence + num_rows); + + table_view expected({int_col, float_col}); + + std::vector out_buffer; + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected) + .compression(compression_type); + 
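+ // The same write/read round-trip must hold for every parameterized codec + // (NONE, SNAPPY, LZ4, ZSTD; see the INSTANTIATE list below).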
cudf::io::write_orc(out_opts); + + cudf::io::orc_reader_options in_opts = cudf::io::orc_reader_options::builder( + cudf::io::source_info{out_buffer.data(), out_buffer.size()}); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); } -// INSTANTIATE_TEST_CASE_P(OrcWriterTest, -// OrcWriterTestStripes, -// ::testing::Values(std::make_tuple(800000ul, 1000000))); +INSTANTIATE_TEST_CASE_P(OrcCompressionTest, + OrcCompressionTest, + ::testing::Values(cudf::io::compression_type::NONE, + cudf::io::compression_type::SNAPPY, + cudf::io::compression_type::LZ4, + cudf::io::compression_type::ZSTD)); + +TEST_F(OrcWriterTest, BounceBufferBug) +{ + auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 100; }); + + constexpr auto num_rows = 150000; + column_wrapper col(sequence, + sequence + num_rows); + table_view expected({col}); + + auto filepath = temp_env->get_temp_filepath("BounceBufferBug.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .compression(cudf::io::compression_type::ZSTD); + cudf::io::write_orc(out_opts); +} -// INSTANTIATE_TEST_CASE_P(OrcWriterTest, -// OrcWriterTestStripes, -// ::testing::Values(std::make_tuple(800000ul, 1000000), -// std::make_tuple(2000000ul, 1000000), -// std::make_tuple(4000000ul, 1000000), -// std::make_tuple(8000000ul, 1000000), -// std::make_tuple(8000000ul, 500000), -// std::make_tuple(8000000ul, 250000), -// std::make_tuple(8000000ul, 100000))); +CUDF_TEST_PROGRAM_MAIN() From a38b11510c3477a0fc0a891c48ef3b416234a594 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 24 Feb 2024 08:31:59 -0800 Subject: [PATCH 096/321] Fix stripe lookup bug Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index f88b931bd2b..86e863a70e9 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -722,7 +722,9 @@ void reader::impl::decompress_and_decode() auto const rows_to_skip = 0; auto rows_to_read = 0; for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { - rows_to_read += _metadata.per_file_metadata[0].ff.stripes[stripe_idx].numberOfRows; + auto const& stripe = selected_stripes[stripe_idx]; + auto const stripe_info = stripe.stripe_info; + rows_to_read += stripe_info->numberOfRows; } // Set up table for converting timestamp columns from local to UTC time From 75cec9b70ebe11cef3395e57fcf4f547c5cd685d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 24 Feb 2024 13:18:03 -0800 Subject: [PATCH 097/321] Fix a bug Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 16 +++++++++++----- cpp/src/io/orc/reader_impl_chunking.cu | 5 +++++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 86e863a70e9..90e2f1b63c2 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -715,17 +715,23 @@ void reader::impl::decompress_and_decode() printf("\ndecoding data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); - // auto const rows_to_skip = _file_itm_data.rows_to_skip; + auto const rows_to_skip = _file_itm_data.rows_to_skip; // auto const rows_to_read = _file_itm_data.rows_to_read; auto const& selected_stripes = _file_itm_data.selected_stripes; - auto const rows_to_skip = 0; - auto rows_to_read = 
0; + // auto const rows_to_skip = 0; + auto rows_to_read = 0; for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { auto const& stripe = selected_stripes[stripe_idx]; auto const stripe_info = stripe.stripe_info; rows_to_read += stripe_info->numberOfRows; + + if (_file_itm_data.rows_to_skip > 0) { + CUDF_EXPECTS(_file_itm_data.rows_to_skip < stripe_info->numberOfRows, "TODO"); + } } + rows_to_read -= rows_to_skip; + _file_itm_data.rows_to_skip = 0; // Set up table for converting timestamp columns from local to UTC time auto const tz_table = [&, &selected_stripes = selected_stripes] { @@ -1156,8 +1162,8 @@ table_with_metadata reader::impl::make_output_chunk() col_buffer, &out_metadata.schema_info.back(), std::nullopt, _stream); }); - printf("output col: \n"); - cudf::test::print(out_columns.front()->view()); + // printf("output col: \n"); + // cudf::test::print(out_columns.front()->view()); auto tbl = std::make_unique
(std::move(out_columns)); tabs.push_back(std::move(tbl)); diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 6b72ea28a96..d8e722f75e0 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -332,6 +332,11 @@ void reader::impl::global_preprocess(uint64_t skip_rows, _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream); if (_file_itm_data.has_no_data()) { return; } + printf("input skip rows: %d, num rows: %d\n", (int)skip_rows, (int)num_rows_opt.value_or(-1)); + printf("actual skip rows: %d, num rows: %d\n", + (int)_file_itm_data.rows_to_skip, + (int)_file_itm_data.rows_to_read); + // auto const rows_to_skip = _file_itm_data.rows_to_skip; // auto const rows_to_read = _file_itm_data.rows_to_read; auto const& selected_stripes = _file_itm_data.selected_stripes; From a7bd47a06b369b1fd16b2f8bceca6782cd16248a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 24 Feb 2024 13:35:00 -0800 Subject: [PATCH 098/321] Fix another bug Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 90e2f1b63c2..fe653d74aa8 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -730,7 +730,7 @@ void reader::impl::decompress_and_decode() CUDF_EXPECTS(_file_itm_data.rows_to_skip < stripe_info->numberOfRows, "TODO"); } } - rows_to_read -= rows_to_skip; + rows_to_read = std::min(rows_to_read - rows_to_skip, _file_itm_data.rows_to_read); _file_itm_data.rows_to_skip = 0; // Set up table for converting timestamp columns from local to UTC time From db768fbee5b1190281c79b685d5c69ad851a5225 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 24 Feb 2024 19:36:57 -0800 Subject: [PATCH 099/321] Debugging Signed-off-by: Nghia Truong --- cpp/include/cudf/io/orc.hpp | 2 +- cpp/src/io/orc/reader_impl.cu | 41 +++++++++--- cpp/src/io/orc/reader_impl_chunking.cu | 7 ++ cpp/tests/io/orc_test.cpp | 91 ++++++++++++++++++-------- 4 files changed, 106 insertions(+), 35 deletions(-) diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 61f4681a3f4..d24ee6f9225 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -703,7 +703,7 @@ class orc_writer_options { */ void set_stripe_size_rows(size_type size_rows) { - CUDF_EXPECTS(size_rows >= 512, "Maximum stripe size cannot be smaller than 512"); + // CUDF_EXPECTS(size_rows >= 512, "Maximum stripe size cannot be smaller than 512"); _stripe_size_rows = size_rows; } diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index fe653d74aa8..2147bd066a7 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -537,6 +537,8 @@ void decode_stream_data(std::size_t num_dicts, [&](auto null_count, auto const stripe_idx) { printf( "null count: %d => %d\n", (int)stripe_idx, (int)chunks[stripe_idx][col_idx].null_count); + printf("num child rows: %d \n", (int)chunks[stripe_idx][col_idx].num_child_rows); + return null_count + chunks[stripe_idx][col_idx].null_count; }); }); @@ -768,6 +770,8 @@ void reader::impl::decompress_and_decode() // compared to parent column. 
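+ // (e.g. for a list-of-struct input, level 0 decodes the list offsets and level 1 + // decodes the struct/int children, whose row counts are derived from the parent's + // chunks)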
auto& col_meta = *_col_meta; for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + printf("processing level = %d\n", (int)level); + auto& columns_level = _selected_columns.levels[level]; // TODO: do it in global step @@ -893,12 +897,17 @@ void reader::impl::decompress_and_decode() auto& chunk = chunks[stripe_idx - stripe_start][col_idx]; // start row, number of rows in a each stripe and total number of rows // may change in lower levels of nesting - chunk.start_row = (level == 0) - ? stripe_start_row - : col_meta.child_start_row[stripe_idx * num_columns + col_idx]; - chunk.num_rows = (level == 0) - ? stripe_info->numberOfRows - : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; + chunk.start_row = (level == 0) + ? stripe_start_row + : col_meta.child_start_row[stripe_idx * num_columns + col_idx]; + chunk.num_rows = (level == 0) + ? stripe_info->numberOfRows + : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; + printf("col idx: %d, start_row: %d, num rows: %d\n", + (int)col_idx, + (int)chunk.start_row, + (int)chunk.num_rows); + chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx]; chunk.parent_validity_info = (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx]; @@ -909,6 +918,9 @@ void reader::impl::decompress_and_decode() chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind; chunk.type_kind = _metadata.per_file_metadata[stripe.source_idx].ff.types[columns_level[col_idx].id].kind; + + printf("type: %d\n", (int)chunk.type_kind); + // num_child_rows for a struct column will be same, for other nested types it will be // calculated. chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; @@ -931,6 +943,16 @@ void reader::impl::decompress_and_decode() if (not is_stripe_data_empty) { for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k] + stripe_start].dst_pos; + if (chunk.strm_len[k]) { + auto& info = stream_info[chunk.strm_id[k] + stripe_start]; + printf("stream id: stripe: %d, level: %d, col idx: %d, kind: %d\n", + (int)info.id.stripe_idx, + (int)info.id.level, + (int)info.id.orc_col_idx, + (int)info.id.kind); + + printf("stream %d: %p\n", (int)k, chunk.streams[k]); + } } } } @@ -1017,6 +1039,7 @@ void reader::impl::decompress_and_decode() bool is_nullable = false; for (std::size_t j = 0; j < num_stripes; ++j) { if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { + printf(" is nullable\n"); is_nullable = true; break; } @@ -1162,8 +1185,10 @@ table_with_metadata reader::impl::make_output_chunk() col_buffer, &out_metadata.schema_info.back(), std::nullopt, _stream); }); - // printf("output col: \n"); - // cudf::test::print(out_columns.front()->view()); + printf("output col0: \n"); + cudf::test::print(out_columns.front()->view()); + printf("output col1: \n"); + cudf::test::print(out_columns.back()->view()); auto tbl = std::make_unique
(std::move(out_columns)); tabs.push_back(std::move(tbl)); diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index d8e722f75e0..92e4b859388 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -138,6 +138,13 @@ std::size_t gather_stream_info_and_column_desc( } (*stream_idx)++; } else { // not chunks.has_value() + printf("collect stream id: stripe: %d, level: %d, col idx: %d, kind: %d\n", + (int)stripe_index, + (int)level, + (int)column_id, + (int)stream.kind); + ; + stream_info.value()->emplace_back( stripeinfo->offset + src_offset, dst_offset, diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index bb132e477dd..d10e2c54fae 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -150,18 +150,9 @@ inline auto random_values(size_t size) { std::vector values(size); - using T1 = T; - using uniform_distribution = - typename std::conditional_t, - std::bernoulli_distribution, - std::conditional_t, - std::uniform_real_distribution, - std::uniform_int_distribution>>; - - static constexpr auto seed = 0xf00d; - static std::mt19937 engine{seed}; - static uniform_distribution dist{}; - std::generate_n(values.begin(), size, [&]() { return T{dist(engine)}; }); + for (size_t i = 0; i < size; ++i) { + values[i] = i; + } return values; } @@ -1448,49 +1439,97 @@ TEST_F(OrcWriterTest, StripeSizeInvalid) TEST_F(OrcWriterTest, TestMap) { - auto const num_rows = 1200000; - auto const lists_per_row = 4; + auto const num_rows = 15; + auto const lists_per_row = 2; auto const num_child_rows = (num_rows * lists_per_row) / 2; // half due to validity - auto keys = random_values(num_child_rows); - auto vals = random_values(num_child_rows); - auto vals_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; }); + auto keys = random_values(num_child_rows); + // auto vals = random_values(num_child_rows); + // auto vals_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; + // }); int32_col keys_col(keys.begin(), keys.end()); - float32_col vals_col{vals.begin(), vals.end(), vals_mask}; - auto s_col = struct_col({keys_col, vals_col}).release(); + int32_col keys_col2(keys.begin(), keys.end()); + // float32_col vals_col(vals.begin(), vals.end()); + auto s_col = struct_col({{keys_col}}).release(); - auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + // auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); std::vector row_offsets(num_rows + 1); int offset = 0; for (int idx = 0; idx < (num_rows) + 1; ++idx) { row_offsets[idx] = offset; - if (valids[idx]) { offset += lists_per_row; } + // if (valids[idx]) { + offset += lists_per_row; + // } } int32_col offsets(row_offsets.begin(), row_offsets.end()); - auto num_list_rows = static_cast(offsets).size() - 1; - auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_list_rows); - auto list_col = cudf::make_lists_column( - num_list_rows, offsets.release(), std::move(s_col), null_count, std::move(null_mask)); + printf("line %d\n", __LINE__); + fflush(stdout); + +#if 0 + auto num_list_rows = static_cast(offsets).size() - 1; + // auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + + // num_list_rows); + auto list_col = + cudf::make_lists_column(num_list_rows, offsets.release(), std::move(s_col), 0, {}); + + printf("line %d\n", __LINE__); + fflush(stdout); + ; 
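+ // (this #if 0 block parks the original list-of-struct input while the simplified + // struct/int columns below are used to debug the map path)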
table_view expected({*list_col}); + printf("input:\n"); + cudf::test::print(*list_col); +#endif + table_view expected({*s_col, keys_col2}); + + printf("input0:\n"); + cudf::test::print(*s_col); + printf("input1:\n"); + cudf::test::print(keys_col2); + + printf("line %d\n", __LINE__); + fflush(stdout); + cudf::io::table_input_metadata expected_metadata(expected); expected_metadata.column_metadata[0].set_list_column_as_map(); + printf("line %d\n", __LINE__); + fflush(stdout); + ; + auto filepath = temp_env->get_temp_filepath("MapColumn.orc"); cudf::io::orc_writer_options out_opts = cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(expected_metadata); + .metadata(expected_metadata) + .stripe_size_rows(10); cudf::io::write_orc(out_opts); + printf("line %d\n", __LINE__); + fflush(stdout); + cudf::io::orc_reader_options in_opts = cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); auto result = cudf::io::read_orc(in_opts); + printf("line %d\n", __LINE__); + fflush(stdout); + + printf("output:\n"); + cudf::test::print(result.tbl->get_column(0)); + ; + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + + printf("line %d\n", __LINE__); + fflush(stdout); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); + + printf("line %d\n", __LINE__); + fflush(stdout); } TEST_F(OrcReaderTest, NestedColumnSelection) From f8652d7915186a62629ad6c3129f5c19c38cd2fe Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 25 Feb 2024 13:22:10 -0800 Subject: [PATCH 100/321] All tests pass Signed-off-by: Nghia Truong --- cpp/include/cudf/io/orc.hpp | 2 +- cpp/src/io/orc/reader_impl.cu | 99 +++++++++++++++++++++++++++++++---- cpp/tests/io/orc_test.cpp | 76 +++++---------------------- 3 files changed, 105 insertions(+), 72 deletions(-) diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index d24ee6f9225..61f4681a3f4 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -703,7 +703,7 @@ class orc_writer_options { */ void set_stripe_size_rows(size_type size_rows) { - // CUDF_EXPECTS(size_rows >= 512, "Maximum stripe size cannot be smaller than 512"); + CUDF_EXPECTS(size_rows >= 512, "Maximum stripe size cannot be smaller than 512"); _stripe_size_rows = size_rows; } diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 2147bd066a7..d657d95a4ef 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -591,7 +591,8 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& /** * @brief Aggregate child metadata from parent column chunks. */ -void aggregate_child_meta(std::size_t level, +void aggregate_child_meta(std::size_t stripe_start, + std::size_t level, cudf::io::orc::detail::column_hierarchy const& selected_columns, cudf::detail::host_2dspan chunks, cudf::detail::host_2dspan row_groups, @@ -624,15 +625,22 @@ void aggregate_child_meta(std::size_t level, int index = 0; // number of child column processed + printf("\n\n"); // For each parent column, update its child column meta for each stripe. 
std::for_each(nested_cols.begin(), nested_cols.end(), [&](auto const p_col) { + printf("p_col.id: %d\n", (int)p_col.id); + auto const parent_col_idx = col_meta.orc_col_map[level][p_col.id]; + printf(" level: %d, parent_col_idx: %d\n", (int)level, (int)parent_col_idx); + auto start_row = 0; auto processed_row_groups = 0; for (std::size_t stripe_id = 0; stripe_id < num_of_stripes; stripe_id++) { // Aggregate num_rows and start_row from processed parent columns per row groups if (num_of_rowgroups) { + printf(" num_of_rowgroups: %d\n", (int)num_of_rowgroups); + auto stripe_num_row_groups = chunks[stripe_id][parent_col_idx].num_rowgroups; auto processed_child_rows = 0; @@ -650,16 +658,24 @@ void aggregate_child_meta(std::size_t level, // Aggregate start row, number of rows per chunk and total number of rows in a column auto const child_rows = chunks[stripe_id][parent_col_idx].num_child_rows; + printf(" stripe_id: %d: child_rows: %d\n", (int)stripe_id, (int)child_rows); + printf(" p_col.num_children: %d\n", (int)p_col.num_children); + for (size_type id = 0; id < p_col.num_children; id++) { auto const child_col_idx = index + id; // TODO: Check for overflow here. num_child_rows[child_col_idx] += child_rows; - num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows; + num_child_rows_per_stripe[stripe_id + stripe_start][child_col_idx] = child_rows; // start row could be different for each column when there is nesting at each stripe level - child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ? 0 : start_row; + child_start_row[stripe_id + stripe_start][child_col_idx] = (stripe_id == 0) ? 0 : start_row; + printf("update child_start_row (%d, %d): %d\n", + (int)stripe_id, + (int)child_col_idx, + (int)start_row); } start_row += child_rows; + printf(" start_row: %d\n", (int)start_row); } // Parent column null mask and null count would be required for child column @@ -769,6 +785,62 @@ void reader::impl::decompress_and_decode() // Iterates through levels of nested columns, child column will be one level down // compared to parent column. 
auto& col_meta = *_col_meta; + + printf("num_child_rows: (size %d)\n", (int)_col_meta->num_child_rows.size()); + if (_col_meta->num_child_rows.size()) { + for (auto x : _col_meta->num_child_rows) { + printf("%d, ", (int)x); + } + printf("\n"); + + _col_meta->num_child_rows.clear(); + } + + printf("parent_column_data null count: (size %d)\n", (int)_col_meta->parent_column_data.size()); + if (_col_meta->parent_column_data.size()) { + for (auto x : _col_meta->parent_column_data) { + printf("%d, ", (int)x.null_count); + } + printf("\n"); + _col_meta->parent_column_data.clear(); + } + + printf("parent_column_index: (size %d)\n", (int)_col_meta->parent_column_index.size()); + if (_col_meta->parent_column_index.size()) { + for (auto x : _col_meta->parent_column_index) { + printf("%d, ", (int)x); + } + printf("\n"); + _col_meta->parent_column_index.clear(); + } + + printf("child_start_row: (size %d)\n", (int)_col_meta->child_start_row.size()); + if (_col_meta->child_start_row.size()) { + for (auto x : _col_meta->child_start_row) { + printf("%d, ", (int)x); + } + printf("\n"); + _col_meta->child_start_row.clear(); + } + + printf("num_child_rows_per_stripe: (size %d)\n", + (int)_col_meta->num_child_rows_per_stripe.size()); + if (_col_meta->num_child_rows_per_stripe.size()) { + for (auto x : _col_meta->num_child_rows_per_stripe) { + printf("%d, ", (int)x); + } + printf("\n"); + _col_meta->num_child_rows_per_stripe.clear(); + } + + printf("rwgrp_meta: (size %d)\n", (int)_col_meta->rwgrp_meta.size()); + if (_col_meta->rwgrp_meta.size()) { + for (auto x : _col_meta->rwgrp_meta) { + printf("(%d | %d), ", (int)x.start_row, (int)x.num_rows); + } + printf("\n"); + } + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { printf("processing level = %d\n", (int)level); @@ -1046,6 +1118,9 @@ void reader::impl::decompress_and_decode() } auto is_list_type = (column_types[i].id() == type_id::LIST); auto n_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[i]; + + printf(" create child col, num rows: %d\n", (int)n_rows); + // For list column, offset column will be always size + 1 if (is_list_type) n_rows++; _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr); @@ -1075,8 +1150,14 @@ void reader::impl::decompress_and_decode() scan_null_counts(chunks, null_count_prefix_sums[level], _stream); row_groups.device_to_host_sync(_stream); - aggregate_child_meta( - level, _selected_columns, chunks, row_groups, nested_cols, _out_buffers[level], col_meta); + aggregate_child_meta(stripe_start, + level, + _selected_columns, + chunks, + row_groups, + nested_cols, + _out_buffers[level], + col_meta); // ORC stores number of elements at each row, so we need to generate offsets from that std::vector buff_data; @@ -1185,10 +1266,10 @@ table_with_metadata reader::impl::make_output_chunk() col_buffer, &out_metadata.schema_info.back(), std::nullopt, _stream); }); - printf("output col0: \n"); - cudf::test::print(out_columns.front()->view()); - printf("output col1: \n"); - cudf::test::print(out_columns.back()->view()); + // printf("output col0: \n"); + // cudf::test::print(out_columns.front()->view()); + // printf("output col1: \n"); + // cudf::test::print(out_columns.back()->view()); auto tbl = std::make_unique
(std::move(out_columns)); tabs.push_back(std::move(tbl)); diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index d10e2c54fae..80dc3ab6fdb 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1439,97 +1439,49 @@ TEST_F(OrcWriterTest, StripeSizeInvalid) TEST_F(OrcWriterTest, TestMap) { - auto const num_rows = 15; - auto const lists_per_row = 2; + auto const num_rows = 1200000; + auto const lists_per_row = 4; auto const num_child_rows = (num_rows * lists_per_row) / 2; // half due to validity - auto keys = random_values(num_child_rows); - // auto vals = random_values(num_child_rows); - // auto vals_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; - // }); + auto keys = random_values(num_child_rows); + auto vals = random_values(num_child_rows); + auto vals_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; }); int32_col keys_col(keys.begin(), keys.end()); - int32_col keys_col2(keys.begin(), keys.end()); - // float32_col vals_col(vals.begin(), vals.end()); - auto s_col = struct_col({{keys_col}}).release(); + float32_col vals_col{vals.begin(), vals.end(), vals_mask}; + auto s_col = struct_col({keys_col, vals_col}).release(); - // auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); std::vector row_offsets(num_rows + 1); int offset = 0; for (int idx = 0; idx < (num_rows) + 1; ++idx) { row_offsets[idx] = offset; - // if (valids[idx]) { - offset += lists_per_row; - // } + if (valids[idx]) { offset += lists_per_row; } } int32_col offsets(row_offsets.begin(), row_offsets.end()); - printf("line %d\n", __LINE__); - fflush(stdout); - -#if 0 - auto num_list_rows = static_cast(offsets).size() - 1; - // auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + - // num_list_rows); - auto list_col = - cudf::make_lists_column(num_list_rows, offsets.release(), std::move(s_col), 0, {}); - - printf("line %d\n", __LINE__); - fflush(stdout); - ; + auto num_list_rows = static_cast(offsets).size() - 1; + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_list_rows); + auto list_col = cudf::make_lists_column( + num_list_rows, offsets.release(), std::move(s_col), null_count, std::move(null_mask)); table_view expected({*list_col}); - printf("input:\n"); - cudf::test::print(*list_col); -#endif - table_view expected({*s_col, keys_col2}); - - printf("input0:\n"); - cudf::test::print(*s_col); - printf("input1:\n"); - cudf::test::print(keys_col2); - - printf("line %d\n", __LINE__); - fflush(stdout); - cudf::io::table_input_metadata expected_metadata(expected); expected_metadata.column_metadata[0].set_list_column_as_map(); - printf("line %d\n", __LINE__); - fflush(stdout); - ; - auto filepath = temp_env->get_temp_filepath("MapColumn.orc"); cudf::io::orc_writer_options out_opts = cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(expected_metadata) - .stripe_size_rows(10); + .metadata(expected_metadata); cudf::io::write_orc(out_opts); - printf("line %d\n", __LINE__); - fflush(stdout); - cudf::io::orc_reader_options in_opts = cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); auto result = cudf::io::read_orc(in_opts); - printf("line %d\n", __LINE__); - fflush(stdout); - - printf("output:\n"); - cudf::test::print(result.tbl->get_column(0)); - ; - 
CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); - - printf("line %d\n", __LINE__); - fflush(stdout); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); - - printf("line %d\n", __LINE__); - fflush(stdout); } TEST_F(OrcReaderTest, NestedColumnSelection) From 537ea0cd841b3fbcf2c215d4f37fa9c24168206c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 25 Feb 2024 14:00:46 -0800 Subject: [PATCH 101/321] Reverse tests Signed-off-by: Nghia Truong --- cpp/tests/io/orc_test.cpp | 90 ++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 80dc3ab6fdb..0b34b39f739 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -150,9 +149,18 @@ inline auto random_values(size_t size) { std::vector values(size); - for (size_t i = 0; i < size; ++i) { - values[i] = i; - } + using T1 = T; + using uniform_distribution = + typename std::conditional_t, + std::bernoulli_distribution, + std::conditional_t, + std::uniform_real_distribution, + std::uniform_int_distribution>>; + + static constexpr auto seed = 0xf00d; + static std::mt19937 engine{seed}; + static uniform_distribution dist{}; + std::generate_n(values.begin(), size, [&]() { return T{dist(engine)}; }); return values; } @@ -198,7 +206,7 @@ struct SkipRowTest { .skip_rows(skip_rows) .num_rows(read_num_rows); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_result->view(), result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_result->view(), result.tbl->view()); } void test(int skip_rows, int file_num_rows) @@ -212,7 +220,7 @@ struct SkipRowTest { .use_index(false) .skip_rows(skip_rows); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_result->view(), result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_result->view(), result.tbl->view()); } }; @@ -236,7 +244,7 @@ TYPED_TEST(OrcWriterNumericTypeTest, SingleColumn) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TYPED_TEST(OrcWriterNumericTypeTest, SingleColumnWithNulls) @@ -258,7 +266,7 @@ TYPED_TEST(OrcWriterNumericTypeTest, SingleColumnWithNulls) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TYPED_TEST(OrcWriterTimestampTypeTest, Timestamps) @@ -282,7 +290,7 @@ TYPED_TEST(OrcWriterTimestampTypeTest, Timestamps) .timestamp_type(this->type()); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TYPED_TEST(OrcWriterTimestampTypeTest, TimestampsWithNulls) @@ -308,7 +316,7 @@ TYPED_TEST(OrcWriterTimestampTypeTest, TimestampsWithNulls) .timestamp_type(this->type()); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TYPED_TEST(OrcWriterTimestampTypeTest, TimestampOverflow) @@ -332,7 +340,7 @@ 
TYPED_TEST(OrcWriterTimestampTypeTest, TimestampOverflow) .timestamp_type(this->type()); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TEST_F(OrcWriterTest, MultiColumn) @@ -392,7 +400,7 @@ TEST_F(OrcWriterTest, MultiColumn) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } @@ -459,7 +467,7 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } @@ -517,7 +525,7 @@ TEST_F(OrcWriterTest, Strings) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } @@ -569,7 +577,7 @@ TEST_F(OrcWriterTest, SlicedTable) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_slice, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } @@ -596,7 +604,7 @@ TEST_F(OrcWriterTest, HostBuffer) .use_index(false); auto const result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } @@ -626,7 +634,7 @@ TEST_F(OrcWriterTest, negTimestampsNano) CUDF_TEST_EXPECT_COLUMNS_EQUAL( expected.column(0), result.tbl->view().column(0), cudf::test::debug_output_level::ALL_ERRORS); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TEST_F(OrcWriterTest, Slice) @@ -662,7 +670,7 @@ TEST_F(OrcChunkedWriterTest, SingleTable) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); auto result = cudf::io::read_orc(read_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *table1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *table1); } TEST_F(OrcChunkedWriterTest, SimpleTable) @@ -682,7 +690,7 @@ TEST_F(OrcChunkedWriterTest, SimpleTable) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); auto result = cudf::io::read_orc(read_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); } TEST_F(OrcChunkedWriterTest, LargeTables) @@ -702,7 +710,7 @@ TEST_F(OrcChunkedWriterTest, LargeTables) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); auto result = cudf::io::read_orc(read_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); } 
TEST_F(OrcChunkedWriterTest, ManyTables) @@ -732,7 +740,7 @@ TEST_F(OrcChunkedWriterTest, ManyTables) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); auto result = cudf::io::read_orc(read_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *expected); + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); } TEST_F(OrcChunkedWriterTest, Metadata) @@ -791,7 +799,7 @@ TEST_F(OrcChunkedWriterTest, Strings) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); auto result = cudf::io::read_orc(read_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *expected); + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); } TEST_F(OrcChunkedWriterTest, MismatchedTypes) @@ -839,8 +847,8 @@ TEST_F(OrcChunkedWriterTest, MismatchedStructure) TEST_F(OrcChunkedWriterTest, ReadStripes) { srand(31337); - auto table1 = create_random_fixed_table(1, 5, true); - auto table2 = create_random_fixed_table(1, 6, true); + auto table1 = create_random_fixed_table(5, 5, true); + auto table2 = create_random_fixed_table(5, 5, true); auto full_table = cudf::concatenate(std::vector({*table2, *table1, *table2})); @@ -849,17 +857,11 @@ TEST_F(OrcChunkedWriterTest, ReadStripes) cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{filepath}); cudf::io::orc_chunked_writer(opts).write(*table1).write(*table2); - printf("tab 1: \n"); - cudf::test::print(table1->get_column(0).view()); - - printf("tab 2: \n"); - cudf::test::print(table2->get_column(0).view()); - cudf::io::orc_reader_options read_opts = cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).stripes({{1, 0, 1}}); auto result = cudf::io::read_orc(read_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); } TEST_F(OrcChunkedWriterTest, ReadStripesError) @@ -919,7 +921,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); auto result = cudf::io::read_orc(read_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *expected); + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); } TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) @@ -962,7 +964,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); auto result = cudf::io::read_orc(read_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *expected); + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); } TEST_F(OrcReaderTest, CombinedSkipRowTest) @@ -1121,7 +1123,7 @@ TEST_F(OrcWriterTest, SlicedValidMask) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(tbl, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(tbl, result.tbl->view()); cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } @@ -1139,7 +1141,7 @@ TEST_F(OrcReaderTest, SingleInputs) cudf::io::orc_reader_options::builder(cudf::io::source_info{{filepath1}}); auto result = cudf::io::read_orc(read_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *table1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *table1); } TEST_F(OrcReaderTest, zstdCompressionRegression) @@ -1198,7 +1200,7 @@ TEST_F(OrcReaderTest, MultipleInputs) cudf::io::orc_reader_options::builder(cudf::io::source_info{{filepath1, filepath2}}); auto result = cudf::io::read_orc(read_opts); - 
CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); } struct OrcWriterTestDecimal : public OrcWriterTest, @@ -1385,7 +1387,7 @@ TEST_P(OrcWriterTestStripes, StripeSize) .use_index(false); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); }; { @@ -1480,7 +1482,7 @@ TEST_F(OrcWriterTest, TestMap) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } @@ -1743,7 +1745,7 @@ TEST_F(OrcReaderTest, ZstdMaxCompressionRate) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TEST_F(OrcWriterTest, CompStats) @@ -1844,7 +1846,7 @@ TEST_F(OrcWriterTest, EmptyRowGroup) cudf::io::orc_reader_options in_opts = cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TEST_F(OrcWriterTest, NoNullsAsNonNullable) @@ -1882,7 +1884,7 @@ TEST_F(OrcWriterTest, SlicedStringColumn) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_slice, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); } TEST_F(OrcWriterTest, EmptyChildStringColumn) @@ -1899,7 +1901,7 @@ TEST_F(OrcWriterTest, EmptyChildStringColumn) cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).use_index(false); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } template @@ -1963,7 +1965,7 @@ TEST_F(OrcWriterTest, UnorderedDictionary) cudf::io::source_info{out_buffer_unsorted.data(), out_buffer_unsorted.size()}); auto const from_unsorted = cudf::io::read_orc(in_opts_unsorted).tbl; - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*from_sorted, *from_unsorted); + CUDF_TEST_EXPECT_TABLES_EQUAL(*from_sorted, *from_unsorted); } TEST_F(OrcStatisticsTest, Empty) @@ -2083,7 +2085,7 @@ TEST_P(OrcCompressionTest, Basic) cudf::io::source_info{out_buffer.data(), out_buffer.size()}); auto result = cudf::io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } INSTANTIATE_TEST_CASE_P(OrcCompressionTest, From 24e15523d76ced09709bb1ad0a484e8756d4e390 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 25 Feb 2024 14:06:48 -0800 Subject: [PATCH 102/321] Fix for temp concatenation Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index d657d95a4ef..74939b4e628 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -1289,7 +1289,26 @@ 
table_with_metadata reader::impl::make_output_chunk()
   // todo: remove this
   // auto out_table = std::make_unique<table>
(std::move(out_columns)); auto out_table = [&] { - if (tv.size() > 1) { return cudf::concatenate(tv); } + if (tv.size() > 1) { + auto tmp = cudf::concatenate(tv); + std::vector has_mask(tmp->num_columns(), false); + std::vector has_nulls(tmp->num_columns(), false); + + for (int i = 0; i < tmp->num_columns(); ++i) { + for (int j = 0; j < (int)tv.size(); ++j) { + if (tv[j].column(i).nullable()) { has_mask[i] = true; } + if (tv[j].column(i).null_count()) { has_nulls[i] = true; } + } + } + for (int i = 0; i < tmp->num_columns(); ++i) { + if (has_mask[i] && !has_nulls[i]) { + tmp->get_column(i).set_null_mask( + cudf::create_null_mask(tmp->get_column(i).size(), cudf::mask_state::ALL_VALID), 0); + } + } + + return tmp; + } return std::move(tabs.front()); }(); // auto out_table = std::move(tabs.front()); From df8d9b3a7ccc67907070cea22d31312fee844a95 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 25 Feb 2024 14:32:43 -0800 Subject: [PATCH 103/321] Turn off debug printing Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 37 +++++++++++--------------- cpp/src/io/orc/reader_impl_chunking.cu | 13 +++++---- 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 74939b4e628..304f35bd388 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -530,17 +530,19 @@ void decode_stream_data(std::size_t num_dicts, CUDF_EXPECTS(num_errors == 0, "ORC data decode failed"); std::for_each(col_idx_it + 0, col_idx_it + num_columns, [&](auto col_idx) { - out_buffers[col_idx].null_count() = std::accumulate( - stripe_idx_it + 0, - stripe_idx_it + num_stripes, - 0, - [&](auto null_count, auto const stripe_idx) { - printf( - "null count: %d => %d\n", (int)stripe_idx, (int)chunks[stripe_idx][col_idx].null_count); - printf("num child rows: %d \n", (int)chunks[stripe_idx][col_idx].num_child_rows); - - return null_count + chunks[stripe_idx][col_idx].null_count; - }); + out_buffers[col_idx].null_count() = + std::accumulate(stripe_idx_it + 0, + stripe_idx_it + num_stripes, + 0, + [&](auto null_count, auto const stripe_idx) { + // printf( + // "null count: %d => %d\n", (int)stripe_idx, + // (int)chunks[stripe_idx][col_idx].null_count); + // printf("num child rows: %d \n", + // (int)chunks[stripe_idx][col_idx].num_child_rows); + + return null_count + chunks[stripe_idx][col_idx].null_count; + }); }); } @@ -786,6 +788,7 @@ void reader::impl::decompress_and_decode() // compared to parent column. 
auto& col_meta = *_col_meta; +#if 0 printf("num_child_rows: (size %d)\n", (int)_col_meta->num_child_rows.size()); if (_col_meta->num_child_rows.size()) { for (auto x : _col_meta->num_child_rows) { @@ -841,6 +844,8 @@ void reader::impl::decompress_and_decode() printf("\n"); } +#endif + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { printf("processing level = %d\n", (int)level); @@ -1015,16 +1020,6 @@ void reader::impl::decompress_and_decode() if (not is_stripe_data_empty) { for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k] + stripe_start].dst_pos; - if (chunk.strm_len[k]) { - auto& info = stream_info[chunk.strm_id[k] + stripe_start]; - printf("stream id: stripe: %d, level: %d, col idx: %d, kind: %d\n", - (int)info.id.stripe_idx, - (int)info.id.level, - (int)info.id.orc_col_idx, - (int)info.id.kind); - - printf("stream %d: %p\n", (int)k, chunk.streams[k]); - } } } } diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 92e4b859388..e9b4a337006 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -105,7 +105,7 @@ std::size_t gather_stream_info_and_column_desc( auto const schema_type = types[column_id]; if (!schema_type.subtypes.empty() && schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { - printf("present stream\n"); + // printf("present stream\n"); for (auto const& idx : schema_type.subtypes) { auto const child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; if (child_idx >= 0) { @@ -138,12 +138,11 @@ std::size_t gather_stream_info_and_column_desc( } (*stream_idx)++; } else { // not chunks.has_value() - printf("collect stream id: stripe: %d, level: %d, col idx: %d, kind: %d\n", - (int)stripe_index, - (int)level, - (int)column_id, - (int)stream.kind); - ; + // printf("collect stream id: stripe: %d, level: %d, col idx: %d, kind: %d\n", + // (int)stripe_index, + // (int)level, + // (int)column_id, + // (int)stream.kind); stream_info.value()->emplace_back( stripeinfo->offset + src_offset, From 12dff3b790440ace7137ce116e191d39bd72761c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 26 Feb 2024 11:45:43 -0800 Subject: [PATCH 104/321] Some fixes --- cpp/src/io/orc/aggregate_orc_metadata.cpp | 8 +++++--- cpp/src/io/orc/orc.hpp | 2 +- cpp/src/io/orc/reader_impl.cu | 13 +++++++++---- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 257c356d6b8..1e9cb50d532 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -198,8 +198,9 @@ aggregate_orc_metadata::select_stripes( nullptr, static_cast(src_file_idx)}); - // TODO: check for overflow here. 
- rows_to_read += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; + // TODO: change return type to int64_t + rows_to_read += static_cast( + per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows); printf(" rows_to_read : %d / %d\n", (int)per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows, (int)rows_to_read); @@ -220,7 +221,8 @@ aggregate_orc_metadata::select_stripes( for (size_t stripe_idx = 0; stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size() && count < rows_to_skip + rows_to_read; ++stripe_idx) { - count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; + count += + static_cast(per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows); if (count > rows_to_skip || count == 0) { stripe_infos.push_back({&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr, diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 9759c3a0bf1..4a35aaf5107 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -85,7 +85,7 @@ struct StripeInformation { uint64_t indexLength = 0; // the length of the indexes in bytes uint64_t dataLength = 0; // the length of the data in bytes uint64_t footerLength = 0; // the length of the footer in bytes - int64_t numberOfRows = 0; // the number of rows in the stripe + uint64_t numberOfRows = 0; // the number of rows in the stripe }; struct SchemaType { diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index cbdaaed113f..c3f68de21c7 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -743,10 +743,15 @@ void reader::impl::decompress_and_decode() for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { auto const& stripe = selected_stripes[stripe_idx]; auto const stripe_info = stripe.stripe_info; - rows_to_read += stripe_info->numberOfRows; + // TODO: check overflow + // CUDF_EXPECTS(per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows < + // static_cast(std::numeric_limits::max()), + // "TODO"); + rows_to_read += static_cast(stripe_info->numberOfRows); if (_file_itm_data.rows_to_skip > 0) { - CUDF_EXPECTS(_file_itm_data.rows_to_skip < stripe_info->numberOfRows, "TODO"); + CUDF_EXPECTS(_file_itm_data.rows_to_skip < static_cast(stripe_info->numberOfRows), + "TODO"); } } rows_to_read = std::min(rows_to_read - rows_to_skip, _file_itm_data.rows_to_read); @@ -955,7 +960,7 @@ void reader::impl::decompress_and_decode() // printf("line %d\n", __LINE__); // fflush(stdout); - auto const num_rows_per_stripe = stripe_info->numberOfRows; + auto const num_rows_per_stripe = static_cast(stripe_info->numberOfRows); printf(" num_rows_per_stripe : %d\n", (int)num_rows_per_stripe); auto const rowgroup_id = num_rowgroups; @@ -977,7 +982,7 @@ void reader::impl::decompress_and_decode() ? stripe_start_row : col_meta.child_start_row[stripe_idx * num_columns + col_idx]; chunk.num_rows = (level == 0) - ? stripe_info->numberOfRows + ? 
static_cast(stripe_info->numberOfRows) : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; printf("col idx: %d, start_row: %d, num rows: %d\n", (int)col_idx, From 54018268d732546b87fef950dc9cff561169f05b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 26 Feb 2024 14:19:48 -0800 Subject: [PATCH 105/321] Fix host memory issue --- cpp/src/io/orc/reader_impl.cu | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index c3f68de21c7..d8c0018dce8 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -667,9 +667,9 @@ void aggregate_child_meta(std::size_t stripe_start, // TODO: Check for overflow here. num_child_rows[child_col_idx] += child_rows; - num_child_rows_per_stripe[stripe_id + stripe_start][child_col_idx] = child_rows; + num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows; // start row could be different for each column when there is nesting at each stripe level - child_start_row[stripe_id + stripe_start][child_col_idx] = (stripe_id == 0) ? 0 : start_row; + child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ? 0 : start_row; printf("update child_start_row (%d, %d): %d\n", (int)stripe_id, (int)child_col_idx, @@ -978,12 +978,15 @@ void reader::impl::decompress_and_decode() auto& chunk = chunks[stripe_idx - stripe_start][col_idx]; // start row, number of rows in a each stripe and total number of rows // may change in lower levels of nesting - chunk.start_row = (level == 0) - ? stripe_start_row - : col_meta.child_start_row[stripe_idx * num_columns + col_idx]; - chunk.num_rows = (level == 0) - ? static_cast(stripe_info->numberOfRows) - : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; + chunk.start_row = + (level == 0) + ? stripe_start_row + : col_meta.child_start_row[(stripe_idx - stripe_start) * num_columns + col_idx]; + chunk.num_rows = + (level == 0) + ? static_cast(stripe_info->numberOfRows) + : col_meta + .num_child_rows_per_stripe[(stripe_idx - stripe_start) * num_columns + col_idx]; printf("col idx: %d, start_row: %d, num rows: %d\n", (int)col_idx, (int)chunk.start_row, From e53cf564c544702b4cd577055e4d9eda30aa3fe1 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 26 Feb 2024 14:39:39 -0800 Subject: [PATCH 106/321] Some cleanup Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index d8c0018dce8..4c550820ad8 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -629,10 +629,10 @@ void aggregate_child_meta(std::size_t stripe_start, printf("\n\n"); // For each parent column, update its child column meta for each stripe. 
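  // Worked example with hypothetical sizes: if two selected stripes report
  // chunks[s][parent].num_child_rows = {8, 6}, the loop below produces
  //   child_start_row[s][child]           = {0, 8}
  //   num_child_rows_per_stripe[s][child] = {8, 6}
  //   num_child_rows[child]               = 14
  // i.e. the start rows are a running prefix sum of the per-stripe child row counts.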
std::for_each(nested_cols.begin(), nested_cols.end(), [&](auto const p_col) { - printf("p_col.id: %d\n", (int)p_col.id); + // printf("p_col.id: %d\n", (int)p_col.id); auto const parent_col_idx = col_meta.orc_col_map[level][p_col.id]; - printf(" level: %d, parent_col_idx: %d\n", (int)level, (int)parent_col_idx); + // printf(" level: %d, parent_col_idx: %d\n", (int)level, (int)parent_col_idx); int64_t start_row = 0; auto processed_row_groups = 0; @@ -640,7 +640,7 @@ void aggregate_child_meta(std::size_t stripe_start, for (std::size_t stripe_id = 0; stripe_id < num_of_stripes; stripe_id++) { // Aggregate num_rows and start_row from processed parent columns per row groups if (num_of_rowgroups) { - printf(" num_of_rowgroups: %d\n", (int)num_of_rowgroups); + // printf(" num_of_rowgroups: %d\n", (int)num_of_rowgroups); auto stripe_num_row_groups = chunks[stripe_id][parent_col_idx].num_rowgroups; auto processed_child_rows = 0; @@ -659,8 +659,8 @@ void aggregate_child_meta(std::size_t stripe_start, // Aggregate start row, number of rows per chunk and total number of rows in a column auto const child_rows = chunks[stripe_id][parent_col_idx].num_child_rows; - printf(" stripe_id: %d: child_rows: %d\n", (int)stripe_id, (int)child_rows); - printf(" p_col.num_children: %d\n", (int)p_col.num_children); + // printf(" stripe_id: %d: child_rows: %d\n", (int)stripe_id, (int)child_rows); + // printf(" p_col.num_children: %d\n", (int)p_col.num_children); for (size_type id = 0; id < p_col.num_children; id++) { auto const child_col_idx = index + id; @@ -670,13 +670,13 @@ void aggregate_child_meta(std::size_t stripe_start, num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows; // start row could be different for each column when there is nesting at each stripe level child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ? 0 : start_row; - printf("update child_start_row (%d, %d): %d\n", - (int)stripe_id, - (int)child_col_idx, - (int)start_row); + // printf("update child_start_row (%d, %d): %d\n", + // (int)stripe_id, + // (int)child_col_idx, + // (int)start_row); } start_row += child_rows; - printf(" start_row: %d\n", (int)start_row); + // printf(" start_row: %d\n", (int)start_row); } // Parent column null mask and null count would be required for child column @@ -1120,7 +1120,7 @@ void reader::impl::decompress_and_decode() auto is_list_type = (column_types[i].id() == type_id::LIST); auto n_rows = (level == 0) ? 
rows_to_read : col_meta.num_child_rows[i]; - printf(" create child col, num rows: %d\n", (int)n_rows); + // printf(" create child col, num rows: %d\n", (int)n_rows); // For list column, offset column will be always size + 1 if (is_list_type) n_rows++; From f2ec94ccfce30357fb00a6f8746052da4ca682fe Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 27 Feb 2024 11:37:39 -0800 Subject: [PATCH 107/321] Compute table row size Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 279 +++++++++++++++++++++++++ 1 file changed, 279 insertions(+) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 2b208738b1e..08e3dbe24f0 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -23,11 +23,18 @@ #include "io/orc/reader_impl_helpers.hpp" #include "io/utilities/config_utils.hpp" +#include +#include +#include #include #include #include #include +#include +#include +#include #include +#include #include #include @@ -37,17 +44,153 @@ #include #include +#include #include #include #include #include #include +#include #include #include #include #include +// +// +// +#include + +#include +// +// +// +namespace cudf::experimental { + +enum class decompose_lists_column : bool { YES, NO }; + +auto decompose_structs(table_view table, + decompose_lists_column decompose_lists, + host_span column_order = {}, + host_span null_precedence = {}) +{ + auto linked_columns = detail::table_to_linked_columns(table); + + std::vector verticalized_columns; + std::vector new_column_order; + std::vector new_null_precedence; + std::vector verticalized_col_depths; + for (size_t col_idx = 0; col_idx < linked_columns.size(); ++col_idx) { + detail::linked_column_view const* col = linked_columns[col_idx].get(); + if (is_nested(col->type())) { + // convert and insert + std::vector> flattened; + std::function*, int)> + recursive_child = [&](detail::linked_column_view const* c, + std::vector* branch, + int depth) { + branch->push_back(c); + if (decompose_lists == decompose_lists_column::YES && c->type().id() == type_id::LIST) { + recursive_child( + c->children[lists_column_view::child_column_index].get(), branch, depth + 1); + } else if (c->type().id() == type_id::STRUCT) { + for (size_t child_idx = 0; child_idx < c->children.size(); ++child_idx) { + // When child_idx == 0, we also cut off the current branch if its first child is a + // lists column. + // In such cases, the last column of the current branch will be `Struct` and + // it will be modified to empty struct type `Struct<>` later on. + if (child_idx > 0 || c->children[0]->type().id() == type_id::LIST) { + verticalized_col_depths.push_back(depth + 1); + branch = &flattened.emplace_back(); + } + recursive_child(c->children[child_idx].get(), branch, depth + 1); + } + } + }; + auto& branch = flattened.emplace_back(); + verticalized_col_depths.push_back(0); + recursive_child(col, &branch, 0); + + for (auto const& branch : flattened) { + column_view temp_col = *branch.back(); + + // Change `Struct` into empty struct type `Struct<>`. 
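        // A sketch of the case handled here: STRUCT<LIST<INT32>> decomposes into two
        // vertical branches. The first ends at the struct itself, which keeps its
        // null mask and offset but must drop the LIST child, hence the rewrite to
        // the childless type STRUCT<> below; the second branch carries the
        // LIST<INT32> path and is later re-parented so the enclosing struct's
        // offset still applies.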
+ if (temp_col.type().id() == type_id::STRUCT && + (temp_col.num_children() > 0 && temp_col.child(0).type().id() == type_id::LIST)) { + temp_col = column_view(temp_col.type(), + temp_col.size(), + temp_col.head(), + temp_col.null_mask(), + temp_col.null_count(), + temp_col.offset(), + {}); + } + + for (auto it = branch.crbegin() + 1; it < branch.crend(); ++it) { + auto const& prev_col = *(*it); + auto children = + (prev_col.type().id() == type_id::LIST) + ? std::vector{*prev_col + .children[lists_column_view::offsets_column_index], + temp_col} + : std::vector{temp_col}; + temp_col = column_view(prev_col.type(), + prev_col.size(), + nullptr, + prev_col.null_mask(), + prev_col.null_count(), + prev_col.offset(), + std::move(children)); + } + // Traverse upward and include any list columns in the ancestors + for (detail::linked_column_view* parent = branch.front()->parent; parent; + parent = parent->parent) { + if (parent->type().id() == type_id::LIST) { + // Include this parent + temp_col = column_view( + parent->type(), + parent->size(), + nullptr, // list has no data of its own + nullptr, // If we're going through this then nullmask is already in another branch + 0, + parent->offset(), + {*parent->children[lists_column_view::offsets_column_index], temp_col}); + } else if (parent->type().id() == type_id::STRUCT) { + // Replace offset with parent's offset + temp_col = column_view(temp_col.type(), + parent->size(), + temp_col.head(), + temp_col.null_mask(), + temp_col.null_count(), + parent->offset(), + {temp_col.child_begin(), temp_col.child_end()}); + } + } + verticalized_columns.push_back(temp_col); + } + if (not column_order.empty()) { + new_column_order.insert(new_column_order.end(), flattened.size(), column_order[col_idx]); + } + if (not null_precedence.empty()) { + new_null_precedence.insert( + new_null_precedence.end(), flattened.size(), null_precedence[col_idx]); + } + } else { + verticalized_columns.push_back(*col); + verticalized_col_depths.push_back(0); + if (not column_order.empty()) { new_column_order.push_back(column_order[col_idx]); } + if (not null_precedence.empty()) { new_null_precedence.push_back(null_precedence[col_idx]); } + } + } + return std::make_tuple(table_view(verticalized_columns), + std::move(new_column_order), + std::move(new_null_precedence), + std::move(verticalized_col_depths)); +} +} // namespace cudf::experimental + namespace cudf::io::orc::detail { std::size_t gather_stream_info_and_column_desc( @@ -744,4 +887,140 @@ void reader::impl::load_data() _chunk_read_data.curr_decode_stripe_chunk = 0; } +namespace { + +// Default 10k rows. +size_type constexpr SEGMENT_SIZE = 10'000; + +/** + * @brief Functor which computes the total data size for a given type of a cudf column. + * + * In the case of strings, the return size does not include the chars themselves. That + * information is tracked separately (see PageInfo::str_bytes). + * + * TODO + */ +struct column_segment_size_functor { + column_device_view d_col; + size_type size; + + __device__ std::size_t num_rows(size_type start_row) const + { + return cuda::std::min(size, d_col.size() - start_row); + } + + __device__ std::size_t validity_size(size_type start_row) const + { + return d_col.nullable() + ? 
cudf::util::div_rounding_up_safe(num_rows(start_row), std::size_t{32}) * 4ul + : 0ul; + } + + template () && !cudf::is_nested() && + !std::is_same_v)> + __device__ std::size_t operator()(size_type) const + { + CUDF_UNREACHABLE("Attempted to find size of unsupported types."); + } + + template ())> + __device__ std::size_t operator()(size_type start_row) const + { + auto constexpr element_size = sizeof(device_storage_type_t); + return element_size * num_rows(start_row) + validity_size(start_row); + } + + template )> + __device__ std::size_t operator()(size_type start_row) const + { + auto const offsets = d_col.child(strings_column_view::offsets_column_index); + auto const offsetalator = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); + auto const char_begin = offsetalator[start_row]; + auto const char_end = offsetalator[start_row + num_rows(start_row)]; + auto const chars_size = char_end - char_begin; + + // NOTE: Adding the + 1 offset, similar to the case of lists column. + auto const offset_size = + offsets.type().id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t); + return offset_size * (num_rows(start_row) + 1) + validity_size(start_row) + chars_size; + } + + template ())> + __device__ std::size_t operator()(size_type start_row) const + { + auto constexpr element_size = sizeof(device_storage_type_t); + + auto col = d_col; + auto col_size = element_size + validity_size(start_row); + auto child_start_row = start_row; + auto child_size = size; + + while (col.type().id() == type_id::STRUCT || col.type().id() == type_id::LIST) { + if (col.type().id() == type_id::STRUCT) { + // Empty struct. + if (col.num_child_columns() == 0) { return col_size; } + col = col.child(0); + } else { + auto const offsets = col.child(lists_column_view::offsets_column_index); + col = col.child(lists_column_view::child_column_index); + + auto const child_end_row = offsets.element(start_row + num_rows(start_row)); + child_start_row = offsets.element(start_row); + child_size = child_end_row - child_start_row; + + // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra + // offset for the entire column so we will get a small over-estimate of the real size. 
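          // Concretely (hypothetical numbers): a lists column scanned in three
          // segments books three "+ 1" offsets where only one exists, so with
          // 4-byte offsets the total is over-estimated by 8 bytes, which is
          // negligible against the data itself.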
+ auto constexpr offset_size = sizeof(size_type); + col_size += offset_size * (num_rows(start_row) + 1); + } + } + + return col_size + type_dispatcher( + col.type(), column_segment_size_functor{col, child_size}, child_start_row); + } +}; + +struct table_segment_size_functor { + table_device_view d_table; + size_type size; + + __device__ std::size_t operator()(size_type start_row) const + { + auto const col_size = [=](column_device_view col) { + return cudf::type_dispatcher(col.type(), column_segment_size_functor{col, size}, start_row); + }; + + return thrust::transform_reduce( + thrust::seq, d_table.begin(), d_table.end(), col_size, 0ul, thrust::plus<>{}); + } +}; + +} // namespace + +void test(table_view const& input, rmm::cuda_stream_view stream) +{ + auto verticalized_t = std::get<0>( + cudf::experimental::decompose_structs(input, cudf::experimental::decompose_lists_column::YES)); + auto d_t = table_device_view::create(verticalized_t, stream); + + auto const num_segments = input.num_rows() / SEGMENT_SIZE; + auto output = make_fixed_width_column( + data_type{type_id::UINT64}, num_segments, mask_state::UNALLOCATED, stream); + + auto s = thrust::transform( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_segments), + output->mutable_view().begin(), + cuda::proclaim_return_type( + [SEGMENT_SIZE = SEGMENT_SIZE, d_table = *d_t] __device__(auto const segment_idx) { + auto const start_row = segment_idx * SEGMENT_SIZE; + return table_segment_size_functor{d_table, SEGMENT_SIZE}(start_row); + })); + + printf("segment size: \n"); + cudf::test::print(output->view()); + fflush(stdout); +} + } // namespace cudf::io::orc::detail From fd325b6224a858117d7ca4e1fe9697cfe05d56e3 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 27 Feb 2024 15:30:48 -0800 Subject: [PATCH 108/321] Compute column row size Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 28 +++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 08e3dbe24f0..6b8733827c1 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -892,6 +892,8 @@ namespace { // Default 10k rows. size_type constexpr SEGMENT_SIZE = 10'000; +// size_type constexpr SEGMENT_SIZE = 1; + /** * @brief Functor which computes the total data size for a given type of a cudf column. * @@ -943,20 +945,22 @@ struct column_segment_size_functor { // NOTE: Adding the + 1 offset, similar to the case of lists column. auto const offset_size = offsets.type().id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t); + // printf(" offset sizes: %d, char size: %d\n", (int)offset_size, (int)chars_size); + return offset_size * (num_rows(start_row) + 1) + validity_size(start_row) + chars_size; } template ())> __device__ std::size_t operator()(size_type start_row) const { - auto constexpr element_size = sizeof(device_storage_type_t); - auto col = d_col; - auto col_size = element_size + validity_size(start_row); + auto col_size = std::size_t{0}; auto child_start_row = start_row; auto child_size = size; while (col.type().id() == type_id::STRUCT || col.type().id() == type_id::LIST) { + col_size += validity_size(start_row); + if (col.type().id() == type_id::STRUCT) { // Empty struct. 
if (col.num_child_columns() == 0) { return col_size; } @@ -987,10 +991,19 @@ struct table_segment_size_functor { __device__ std::size_t operator()(size_type start_row) const { + // printf("line %d, start row %d\n", __LINE__, start_row); + auto const col_size = [=](column_device_view col) { return cudf::type_dispatcher(col.type(), column_segment_size_functor{col, size}, start_row); }; + // for (auto col : d_table) { + // auto t = cudf::type_dispatcher(col.type(), column_segment_size_functor{col, size}, + // start_row); printf("start: %d, col size: %d\n", start_row, (int)t); + // } + + // printf("line %d\n", __LINE__); + return thrust::transform_reduce( thrust::seq, d_table.begin(), d_table.end(), col_size, 0ul, thrust::plus<>{}); } @@ -1002,13 +1015,18 @@ void test(table_view const& input, rmm::cuda_stream_view stream) { auto verticalized_t = std::get<0>( cudf::experimental::decompose_structs(input, cudf::experimental::decompose_lists_column::YES)); + auto d_t = table_device_view::create(verticalized_t, stream); - auto const num_segments = input.num_rows() / SEGMENT_SIZE; - auto output = make_fixed_width_column( + auto const num_segments = std::max(input.num_rows() / SEGMENT_SIZE, 1); + printf("num rows: %d, num seeg: %d\n", input.num_rows(), num_segments); + fflush(stdout); + + auto output = make_fixed_width_column( data_type{type_id::UINT64}, num_segments, mask_state::UNALLOCATED, stream); auto s = thrust::transform( + rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_segments), output->mutable_view().begin(), From 416d810a357da3ed8da0259f84eab2903ebd6a4f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 27 Feb 2024 15:30:55 -0800 Subject: [PATCH 109/321] Test column size Signed-off-by: Nghia Truong --- cpp/tests/io/orc_test.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 24e2e2cfea0..d4e497d7ecd 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -39,6 +39,10 @@ #include +namespace cudf::io::orc::detail { +void test(table_view const& input, rmm::cuda_stream_view stream); +} + template using column_wrapper = typename std::conditional, @@ -774,6 +778,8 @@ TEST_F(OrcChunkedWriterTest, Metadata) auto result = cudf::io::read_orc(read_opts); cudf::test::expect_metadata_equal(expected_metadata, result.metadata); + + cudf::io::orc::detail::test(expected, cudf::get_default_stream()); } TEST_F(OrcChunkedWriterTest, Strings) @@ -1388,6 +1394,8 @@ TEST_P(OrcWriterTestStripes, StripeSize) auto result = cudf::io::read_orc(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + + cudf::io::orc::detail::test(expected->view(), cudf::get_default_stream()); }; { @@ -1484,6 +1492,8 @@ TEST_F(OrcWriterTest, TestMap) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); cudf::test::expect_metadata_equal(expected_metadata, result.metadata); + + cudf::io::orc::detail::test(cudf::table_view{{*list_col}}, cudf::get_default_stream()); } TEST_F(OrcReaderTest, NestedColumnSelection) From b745787f841c208555467303667ee15c48e1248a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 28 Feb 2024 11:43:08 -0800 Subject: [PATCH 110/321] Test column sizes using `segmented_bit_count` Signed-off-by: Nghia Truong --- cpp/include/cudf/detail/transform.hpp | 12 +++- cpp/src/io/orc/reader_impl_chunking.cu | 55 +++++++++++++++-- cpp/src/transform/row_bit_count.cu | 82 ++++++++++++++++++-------- 3 files changed, 118 insertions(+), 31 deletions(-) 
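A minimal usage sketch of the API this patch introduces (signatures as declared in
the transform.hpp hunk below; `tbl`, `stream`, and `mr` are stand-ins):

  #include <cudf/detail/transform.hpp>

  // Total size, in bits, of each consecutive run of 10'000 rows; the last
  // segment is shorter when the row count is not a multiple of the length.
  auto seg_sizes = cudf::detail::segmented_bit_count(tbl->view(), 10'000, stream, mr);

  // row_bit_count() becomes the segment_length == 1 special case.
  auto row_sizes = cudf::detail::row_bit_count(tbl->view(), stream, mr);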
diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index 215ad50aed6..0ce7037b9e8 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -100,5 +100,15 @@ std::unique_ptr row_bit_count(table_view const& t, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @copydoc cudf::segmented_bit_count + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr segmented_bit_count(table_view const& t, + size_type segment_length, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace cudf diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 6b8733827c1..5fdf834f6bc 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -25,8 +25,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -930,6 +932,13 @@ struct column_segment_size_functor { __device__ std::size_t operator()(size_type start_row) const { auto constexpr element_size = sizeof(device_storage_type_t); + + if (start_row == 0) { + printf(" col size: %d (valid: %d)\n", + (int)(element_size * num_rows(start_row) + validity_size(start_row)), + (int)validity_size(start_row)); + } + return element_size * num_rows(start_row) + validity_size(start_row); } @@ -961,10 +970,14 @@ struct column_segment_size_functor { while (col.type().id() == type_id::STRUCT || col.type().id() == type_id::LIST) { col_size += validity_size(start_row); + if (start_row == 0) { printf(" add valid size: %d\n", (int)validity_size(start_row)); } + if (col.type().id() == type_id::STRUCT) { // Empty struct. if (col.num_child_columns() == 0) { return col_size; } col = col.child(0); + + if (start_row == 0) { printf(" struct, move down\n"); } } else { auto const offsets = col.child(lists_column_view::offsets_column_index); col = col.child(lists_column_view::child_column_index); @@ -977,6 +990,10 @@ struct column_segment_size_functor { // offset for the entire column so we will get a small over-estimate of the real size. 
auto constexpr offset_size = sizeof(size_type); col_size += offset_size * (num_rows(start_row) + 1); + + if (start_row == 0) { + printf(" list, add offst size: %d\n", (int)(offset_size * (num_rows(start_row) + 1))); + } } } @@ -994,16 +1011,18 @@ struct table_segment_size_functor { // printf("line %d, start row %d\n", __LINE__, start_row); auto const col_size = [=](column_device_view col) { + if (start_row == 0) { printf("compute new col %d\n", __LINE__); } return cudf::type_dispatcher(col.type(), column_segment_size_functor{col, size}, start_row); }; - // for (auto col : d_table) { - // auto t = cudf::type_dispatcher(col.type(), column_segment_size_functor{col, size}, - // start_row); printf("start: %d, col size: %d\n", start_row, (int)t); + // if (start_row == 0) { + // for (auto col : d_table) { + // auto t = + // cudf::type_dispatcher(col.type(), column_segment_size_functor{col, size}, start_row); + // printf("start: %d, col size: %d\n", start_row, (int)t); + // } // } - // printf("line %d\n", __LINE__); - return thrust::transform_reduce( thrust::seq, d_table.begin(), d_table.end(), col_size, 0ul, thrust::plus<>{}); } @@ -1016,6 +1035,20 @@ void test(table_view const& input, rmm::cuda_stream_view stream) auto verticalized_t = std::get<0>( cudf::experimental::decompose_structs(input, cudf::experimental::decompose_lists_column::YES)); + auto sliced_in = std::move(cudf::slice(input, {0, 5})[0]); + for (auto col : sliced_in) { + printf("=====sliced in col: \n"); + cudf::test::print(col); + } + fflush(stdout); + + auto sliced_in_v = std::move(cudf::slice(verticalized_t, {0, 5})[0]); + for (auto col : sliced_in_v) { + printf("=====sliced_in_v: \n"); + cudf::test::print(col); + } + fflush(stdout); + auto d_t = table_device_view::create(verticalized_t, stream); auto const num_segments = std::max(input.num_rows() / SEGMENT_SIZE, 1); @@ -1039,6 +1072,18 @@ void test(table_view const& input, rmm::cuda_stream_view stream) printf("segment size: \n"); cudf::test::print(output->view()); fflush(stdout); + + auto out = cudf::detail::segmented_bit_count( + input, SEGMENT_SIZE, stream, rmm::mr::get_current_device_resource()); + thrust::transform(rmm::exec_policy(stream), + out->view().begin(), + out->view().end(), + out->mutable_view().begin(), + cuda::proclaim_return_type([] __device__(auto const x) { return x / 8; })); + + printf("segment size again: \n"); + cudf::test::print(out->view()); + fflush(stdout); } } // namespace cudf::io::orc::detail diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index eda8ec7a463..8c0a805b00f 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -31,8 +32,10 @@ #include #include -#include +#include +#include #include +#include namespace cudf { namespace detail { @@ -398,26 +401,29 @@ __device__ size_type row_size_functor::operator()(column_device_vie * @param cols An span of column_device_views representing a column hierarchy * @param info An span of column_info structs corresponding the elements in `cols` * @param output Output span of size (# rows) where per-row bit sizes are stored + * @param segment_length The number of rows in each segment for which the total size is computed * @param max_branch_depth Maximum depth of the span stack needed per-thread */ CUDF_KERNEL void compute_row_sizes(device_span cols, device_span info, device_span output, + size_type segment_length, size_type max_branch_depth) { extern 
__shared__ row_span thread_branch_stacks[];
   int const tid = threadIdx.x + blockIdx.x * blockDim.x;
-  auto const num_rows = output.size();
-  if (tid >= num_rows) { return; }
+  auto const num_segments = static_cast(output.size());
+  if (tid >= num_segments) { return; }
 
   // my_branch_stack points to the last span prior to branching. a branch occurs only
   // when we are inside of a list contained within a struct column.
   row_span* my_branch_stack = thread_branch_stacks + (threadIdx.x * max_branch_depth);
   size_type branch_depth{0};
 
-  // current row span - always starts at 1 row.
-  row_span cur_span{tid, tid + 1};
+  // current row span - always starts at spanning over `segment_length` rows.
+  auto const num_rows = cols[0].size();
+  row_span cur_span{tid * segment_length, cuda::std::min((tid + 1) * segment_length, num_rows)};
 
   // output size
   size_type& size = output[tid];
 
@@ -444,7 +450,8 @@ CUDF_KERNEL void compute_row_sizes(device_span cols,
     if (info[idx].depth == 0) {
       branch_depth      = 0;
       last_branch_depth = 0;
-      cur_span          = row_span{tid, tid + 1};
+      cur_span          =
+        row_span{tid * segment_length, cuda::std::min((tid + 1) * segment_length, num_rows)};
     }
 
     // add the contributing size of this row
@@ -465,14 +472,13 @@ CUDF_KERNEL void compute_row_sizes(device_span cols,
 
 }  // anonymous namespace
 
-/**
- * @copydoc cudf::detail::row_bit_count
- *
- */
-std::unique_ptr row_bit_count(table_view const& t,
-                              rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+std::unique_ptr segmented_bit_count(table_view const& t,
+                                    size_type segment_length,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::mr::device_memory_resource* mr)
 {
+  CUDF_EXPECTS(segment_length >= 1, "Invalid segment length.", std::invalid_argument);
+
   // no rows
   if (t.num_rows() <= 0) { return cudf::make_empty_column(type_id::INT32); }
 
@@ -484,17 +490,31 @@ std::unique_ptr row_bit_count(table_view const& t,
   CUDF_EXPECTS(info.size() == cols.size(), "Size/info mismatch");
 
   // create output buffer and view
-  auto output = cudf::make_fixed_width_column(
-    data_type{type_id::INT32}, t.num_rows(), mask_state::UNALLOCATED, stream, mr);
+  auto const num_segments = cudf::util::div_rounding_up_safe(t.num_rows(), segment_length);
+  auto output = cudf::make_fixed_width_column(
+    data_type{type_id::INT32}, num_segments, mask_state::UNALLOCATED, stream, mr);
   mutable_column_view mcv = output->mutable_view();
 
   // simple case. if we have no complex types (lists, strings, etc), the per-row size is already
   // trivially computed
   if (h_info.complex_type_count <= 0) {
-    thrust::fill(rmm::exec_policy(stream),
-                 mcv.begin(),
-                 mcv.end(),
-                 h_info.simple_per_row_size);
+    thrust::transform(
+      rmm::exec_policy(stream),
+      thrust::make_counting_iterator(0),
+      thrust::make_counting_iterator(num_segments),
+      mcv.begin(),
+      cuda::proclaim_return_type(
+        [segment_length,
+         num_segments,
+         num_rows     = t.num_rows(),
+         per_row_size = h_info.simple_per_row_size] __device__(size_type const segment_idx) {
+          // Since the number of rows may not be divisible by segment_length,
+          // the last segment may be shorter than the others.
+          auto const current_length = segment_idx + 1 < num_segments
                                        ? segment_length
                                        : num_rows - segment_length * segment_idx;
+          return per_row_size * current_length;
+        }));
     return output;
   }
 
@@ -523,22 +543,34 @@ std::unique_ptr row_bit_count(table_view const& t,
   // should we be aborting if we reach some extremely small block size, or just if we hit 0?
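  // Worked example of the launch geometry (hypothetical sizes): a 25'000-row table
  // with segment_length = 10'000 yields num_segments = 3, and the kernel clamps the
  // last thread's span to rows [20'000, 25'000). Per-thread shared memory stays at
  // max_branch_depth * sizeof(row_span), unchanged from the per-row version, so the
  // block-size check below is unaffected by the segment length.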
CUDF_EXPECTS(block_size > 0, "Encountered a column hierarchy too complex for row_bit_count"); - cudf::detail::grid_1d grid{t.num_rows(), block_size, 1}; + cudf::detail::grid_1d grid{num_segments, block_size, 1}; compute_row_sizes<<>>( {std::get<1>(d_cols), cols.size()}, {d_info.data(), info.size()}, - {mcv.data(), static_cast(t.num_rows())}, + {mcv.data(), static_cast(mcv.size())}, + segment_length, h_info.max_branch_depth); return output; } +std::unique_ptr row_bit_count(table_view const& t, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return segmented_bit_count(t, 1, stream, mr); +} + } // namespace detail -/** - * @copydoc cudf::row_bit_count - * - */ +std::unique_ptr segmented_bit_count(table_view const& t, + size_type segment_length, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::segmented_bit_count(t, segment_length, cudf::get_default_stream(), mr); +} + std::unique_ptr row_bit_count(table_view const& t, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); From d0ed05a62a1951fdabf748cabeb9732fce516ebc Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 28 Feb 2024 14:57:17 -0800 Subject: [PATCH 111/321] Compute table sizes using `segmented_bit_count` Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 83 ++++++++- cpp/src/io/orc/reader_impl.hpp | 12 +- cpp/src/io/orc/reader_impl_chunking.cu | 221 +----------------------- cpp/src/io/orc/reader_impl_chunking.hpp | 26 +++ cpp/tests/io/orc_test.cpp | 10 -- 5 files changed, 111 insertions(+), 241 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 4c550820ad8..05f881fab71 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -23,7 +23,6 @@ // // // - #include "io/comp/gpuinflate.hpp" #include "io/comp/nvcomp_adapter.hpp" #include "io/orc/reader_impl.hpp" @@ -33,6 +32,7 @@ #include #include +#include #include #include #include @@ -45,6 +45,7 @@ #include #include +#include #include #include #include @@ -720,6 +721,46 @@ void generate_offsets_for_list(host_span buff_data, rmm::cuda_ } } +/** + * @brief TODO + * @param input + * @param size_limit + * @param stream + * @return + */ +std::vector find_table_splits(table_view const& input, + std::size_t size_limit, + rmm::cuda_stream_view stream) +{ + // Default 10k rows. + size_type constexpr SEGMENT_SIZE = 10'000; + auto const d_segmented_sizes = cudf::detail::segmented_bit_count( + input, SEGMENT_SIZE, stream, rmm::mr::get_current_device_resource()); + auto const d_size_begin = d_segmented_sizes->view().begin(); + + auto segmented_sizes = + cudf::detail::hostdevice_vector(d_segmented_sizes->size(), stream); + + // TODO: exec_policy_nosync + thrust::transform(rmm::exec_policy(stream), + d_size_begin, + d_size_begin + d_segmented_sizes->size(), + segmented_sizes.d_begin(), + [SEGMENT_SIZE] __device__(auto const size) { + return cumulative_size{SEGMENT_SIZE, static_cast(size)}; + }); + // TODO: exec_policy_nosync + thrust::inclusive_scan(rmm::exec_policy(stream), + segmented_sizes.d_begin(), + segmented_sizes.d_end(), + segmented_sizes.d_begin(), + cumulative_size_sum{}); + segmented_sizes.device_to_host_sync(stream); + + // Since the segment sizes are in bits, we need to multiply CHAR_BIT with the output limit. + return find_splits(segmented_sizes, input.num_rows(), size_limit * CHAR_BIT); +} + } // namespace // TODO: this should be called per chunk of stripes. 
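To make the splitting math above concrete (hypothetical sizes): a 30'000-row decoded
table whose three 10'000-row segments measure 8, 8 and 4 MB scans to cumulative sizes
{8, 16, 20} MB, and a 10 MB output limit yields the row ranges [0, 10'000),
[10'000, 20'000) and [20'000, 30'000). Note that the limit is scaled by CHAR_BIT to
match the bit-granular segment sizes, rather than the sizes being divided by eight.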
@@ -1176,6 +1217,34 @@ void reader::impl::decompress_and_decode()
     // printf("line %d\n", __LINE__);
     // fflush(stdout);
   }  // end loop level
+
+  std::vector<std::unique_ptr<column>> out_columns;
+  _out_metadata = get_meta_with_user_data();
+  std::transform(
+    _selected_columns.levels[0].begin(),
+    _selected_columns.levels[0].end(),
+    std::back_inserter(out_columns),
+    [&](auto const& orc_col_meta) {
+      _out_metadata.schema_info.emplace_back("");
+      auto col_buffer = assemble_buffer(
+        orc_col_meta.id, 0, *_col_meta, _metadata, _selected_columns, _out_buffers, _stream, _mr);
+      return make_column(col_buffer, &_out_metadata.schema_info.back(), std::nullopt, _stream);
+    });
+  _decoded_table = std::make_unique<table>(std::move(out_columns));
+
+  // DEBUG only
+  _chunk_read_data.output_size_limit = _chunk_read_data.data_read_limit / 3;
+
+  _chunk_read_data.output_table_chunks =
+    find_table_splits(_decoded_table->view(), _chunk_read_data.output_size_limit, _stream);
+  _chunk_read_data.curr_output_table_chunk = 0;
+
+  auto& splits = _chunk_read_data.output_table_chunks;
+  printf("------------\nSplits (/total num rows = %d): \n", (int)_decoded_table->num_rows());
+  for (size_t idx = 0; idx < splits.size(); idx++) {
+    printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count);
+  }
+  fflush(stdout);
 }
 
 void reader::impl::prepare_data(int64_t skip_rows,
@@ -1217,7 +1286,7 @@ table_with_metadata reader::impl::make_output_chunk()
   if (_selected_columns.num_levels() == 0) { return {std::make_unique<table>(), table_metadata{}}; }
 
   std::vector<std::unique_ptr<column>> out_columns;
-  auto out_metadata = make_output_metadata();
+  auto out_metadata = get_meta_with_user_data();
 
   // If no rows or stripes to read, return empty columns
   if (_file_itm_data.has_no_data() /*|| !_chunk_read_data.has_next()*/) {
@@ -1248,7 +1317,7 @@ table_with_metadata reader::impl::make_output_chunk()
 
   for (auto& buffers : _file_itm_data.out_buffers) {
     //
     out_columns.clear();  // TODO: remove
-    out_metadata = make_output_metadata();
+    out_metadata = get_meta_with_user_data();
 
     std::transform(_selected_columns.levels[0].begin(),
                    _selected_columns.levels[0].end(),
@@ -1334,9 +1403,9 @@ table_with_metadata reader::impl::make_output_chunk()
   return {std::move(out_table), std::move(out_metadata)};
 }
 
-table_metadata reader::impl::make_output_metadata()
+table_metadata reader::impl::get_meta_with_user_data()
 {
-  if (_out_metadata) { return table_metadata{*_out_metadata}; }
+  if (_meta_with_user_data) { return table_metadata{*_meta_with_user_data}; }
 
   // Copy user data to the output metadata.
   table_metadata out_metadata;
@@ -1357,8 +1426,8 @@ table_metadata reader::impl::make_output_metadata()
   out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(),
                             out_metadata.per_file_user_data[0].end()};
 
-  // Save the output table metadata into `_out_metadata` for reuse next time.
-  _out_metadata = std::make_unique<table_metadata>(out_metadata);
+  // Save the output table metadata into `_meta_with_user_data` for reuse next time.
+  _meta_with_user_data = std::make_unique<table_metadata>(out_metadata);
 
   return out_metadata;
 }
 
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index b95e9e244a0..b94f639c05d 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -141,7 +141,7 @@ class reader::impl {
   void decompress_and_decode();
 
   /**
-   * @brief Create the output table from the internal buffers and return it along with metadata.
+   * @brief Create the output table from the intermediate table and return it along with metadata.
    *
    * This function is called internally and expects all preprocessing steps have already been done.
    *
   * @return The output table along with columns' metadata
   */
  table_with_metadata make_output_chunk();

  /**
-   * @brief Create the output table metadata from file metadata.
+   * @brief Create the output table metadata storing user data in source metadata.
* - * @return Columns' metadata to output with the table read from file + * @return Columns' user data to output with the table read from file */ - table_metadata make_output_metadata(); + table_metadata get_meta_with_user_data(); rmm::cuda_stream_view const _stream; rmm::mr::device_memory_resource* const _mr; @@ -174,8 +174,10 @@ class reader::impl { column_hierarchy const _selected_columns; // Construct from `_metadata` thus declare after it file_intermediate_data _file_itm_data; chunk_read_data _chunk_read_data; - std::unique_ptr _out_metadata; + std::unique_ptr _meta_with_user_data; + table_metadata _out_metadata; std::vector> _out_buffers; + std::unique_ptr _decoded_table; }; } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 5fdf834f6bc..af959b78af8 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -28,13 +28,11 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include #include @@ -304,26 +302,6 @@ std::size_t gather_stream_info_and_column_desc( return dst_offset; } -namespace { - -/** - * @brief Struct to accummulate sizes of chunks of some data such as stripe or rows. - */ -struct cumulative_size { - int64_t count{0}; - std::size_t size_bytes{0}; -}; - -/** - * @brief Functor to sum up cumulative sizes. - */ -struct cumulative_size_sum { - __device__ cumulative_size operator()(cumulative_size const& a, cumulative_size const& b) const - { - return cumulative_size{a.count + b.count, a.size_bytes + b.size_bytes}; - } -}; - #if 1 /** * @brief Find the splits of the input data such that each split has cumulative size less than a @@ -388,6 +366,8 @@ std::vector find_splits(host_span sizes, } #endif +namespace { + #ifdef PRINT_DEBUG /** * @brief Verify the splits, checking if they are correct. @@ -889,201 +869,4 @@ void reader::impl::load_data() _chunk_read_data.curr_decode_stripe_chunk = 0; } -namespace { - -// Default 10k rows. -size_type constexpr SEGMENT_SIZE = 10'000; - -// size_type constexpr SEGMENT_SIZE = 1; - -/** - * @brief Functor which computes the total data size for a given type of a cudf column. - * - * In the case of strings, the return size does not include the chars themselves. That - * information is tracked separately (see PageInfo::str_bytes). - * - * TODO - */ -struct column_segment_size_functor { - column_device_view d_col; - size_type size; - - __device__ std::size_t num_rows(size_type start_row) const - { - return cuda::std::min(size, d_col.size() - start_row); - } - - __device__ std::size_t validity_size(size_type start_row) const - { - return d_col.nullable() - ? 
cudf::util::div_rounding_up_safe(num_rows(start_row), std::size_t{32}) * 4ul - : 0ul; - } - - template () && !cudf::is_nested() && - !std::is_same_v)> - __device__ std::size_t operator()(size_type) const - { - CUDF_UNREACHABLE("Attempted to find size of unsupported types."); - } - - template ())> - __device__ std::size_t operator()(size_type start_row) const - { - auto constexpr element_size = sizeof(device_storage_type_t); - - if (start_row == 0) { - printf(" col size: %d (valid: %d)\n", - (int)(element_size * num_rows(start_row) + validity_size(start_row)), - (int)validity_size(start_row)); - } - - return element_size * num_rows(start_row) + validity_size(start_row); - } - - template )> - __device__ std::size_t operator()(size_type start_row) const - { - auto const offsets = d_col.child(strings_column_view::offsets_column_index); - auto const offsetalator = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); - auto const char_begin = offsetalator[start_row]; - auto const char_end = offsetalator[start_row + num_rows(start_row)]; - auto const chars_size = char_end - char_begin; - - // NOTE: Adding the + 1 offset, similar to the case of lists column. - auto const offset_size = - offsets.type().id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t); - // printf(" offset sizes: %d, char size: %d\n", (int)offset_size, (int)chars_size); - - return offset_size * (num_rows(start_row) + 1) + validity_size(start_row) + chars_size; - } - - template ())> - __device__ std::size_t operator()(size_type start_row) const - { - auto col = d_col; - auto col_size = std::size_t{0}; - auto child_start_row = start_row; - auto child_size = size; - - while (col.type().id() == type_id::STRUCT || col.type().id() == type_id::LIST) { - col_size += validity_size(start_row); - - if (start_row == 0) { printf(" add valid size: %d\n", (int)validity_size(start_row)); } - - if (col.type().id() == type_id::STRUCT) { - // Empty struct. - if (col.num_child_columns() == 0) { return col_size; } - col = col.child(0); - - if (start_row == 0) { printf(" struct, move down\n"); } - } else { - auto const offsets = col.child(lists_column_view::offsets_column_index); - col = col.child(lists_column_view::child_column_index); - - auto const child_end_row = offsets.element(start_row + num_rows(start_row)); - child_start_row = offsets.element(start_row); - child_size = child_end_row - child_start_row; - - // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra - // offset for the entire column so we will get a small over-estimate of the real size. 
- auto constexpr offset_size = sizeof(size_type); - col_size += offset_size * (num_rows(start_row) + 1); - - if (start_row == 0) { - printf(" list, add offst size: %d\n", (int)(offset_size * (num_rows(start_row) + 1))); - } - } - } - - return col_size + type_dispatcher( - col.type(), column_segment_size_functor{col, child_size}, child_start_row); - } -}; - -struct table_segment_size_functor { - table_device_view d_table; - size_type size; - - __device__ std::size_t operator()(size_type start_row) const - { - // printf("line %d, start row %d\n", __LINE__, start_row); - - auto const col_size = [=](column_device_view col) { - if (start_row == 0) { printf("compute new col %d\n", __LINE__); } - return cudf::type_dispatcher(col.type(), column_segment_size_functor{col, size}, start_row); - }; - - // if (start_row == 0) { - // for (auto col : d_table) { - // auto t = - // cudf::type_dispatcher(col.type(), column_segment_size_functor{col, size}, start_row); - // printf("start: %d, col size: %d\n", start_row, (int)t); - // } - // } - - return thrust::transform_reduce( - thrust::seq, d_table.begin(), d_table.end(), col_size, 0ul, thrust::plus<>{}); - } -}; - -} // namespace - -void test(table_view const& input, rmm::cuda_stream_view stream) -{ - auto verticalized_t = std::get<0>( - cudf::experimental::decompose_structs(input, cudf::experimental::decompose_lists_column::YES)); - - auto sliced_in = std::move(cudf::slice(input, {0, 5})[0]); - for (auto col : sliced_in) { - printf("=====sliced in col: \n"); - cudf::test::print(col); - } - fflush(stdout); - - auto sliced_in_v = std::move(cudf::slice(verticalized_t, {0, 5})[0]); - for (auto col : sliced_in_v) { - printf("=====sliced_in_v: \n"); - cudf::test::print(col); - } - fflush(stdout); - - auto d_t = table_device_view::create(verticalized_t, stream); - - auto const num_segments = std::max(input.num_rows() / SEGMENT_SIZE, 1); - printf("num rows: %d, num seeg: %d\n", input.num_rows(), num_segments); - fflush(stdout); - - auto output = make_fixed_width_column( - data_type{type_id::UINT64}, num_segments, mask_state::UNALLOCATED, stream); - - auto s = thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_segments), - output->mutable_view().begin(), - cuda::proclaim_return_type( - [SEGMENT_SIZE = SEGMENT_SIZE, d_table = *d_t] __device__(auto const segment_idx) { - auto const start_row = segment_idx * SEGMENT_SIZE; - return table_segment_size_functor{d_table, SEGMENT_SIZE}(start_row); - })); - - printf("segment size: \n"); - cudf::test::print(output->view()); - fflush(stdout); - - auto out = cudf::detail::segmented_bit_count( - input, SEGMENT_SIZE, stream, rmm::mr::get_current_device_resource()); - thrust::transform(rmm::exec_policy(stream), - out->view().begin(), - out->view().end(), - out->mutable_view().begin(), - cuda::proclaim_return_type([] __device__(auto const x) { return x / 8; })); - - printf("segment size again: \n"); - cudf::test::print(out->view()); - fflush(stdout); -} - } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 1542182ed7f..ba1de2e7525 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -240,6 +240,32 @@ struct chunk_read_data { } }; +/** + * @brief Struct to accumulate sizes of chunks of some data such as stripe or rows. 
+ */ +struct cumulative_size { + int64_t count{0}; + std::size_t size_bytes{0}; +}; + +/** + * @brief Functor to sum up cumulative sizes. + */ +struct cumulative_size_sum { + __device__ cumulative_size operator()(cumulative_size const& a, cumulative_size const& b) const + { + return cumulative_size{a.count + b.count, a.size_bytes + b.size_bytes}; + } +}; + +/** + * @brief Find the splits of the input data such that each split has cumulative size less than a + * given `size_limit`. + */ +std::vector find_splits(host_span sizes, + int64_t total_count, + size_t size_limit); + /** * @brief Function that populates descriptors for either individual streams or chunks of column * data, but not both. diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index d4e497d7ecd..24e2e2cfea0 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -39,10 +39,6 @@ #include -namespace cudf::io::orc::detail { -void test(table_view const& input, rmm::cuda_stream_view stream); -} - template using column_wrapper = typename std::conditional, @@ -778,8 +774,6 @@ TEST_F(OrcChunkedWriterTest, Metadata) auto result = cudf::io::read_orc(read_opts); cudf::test::expect_metadata_equal(expected_metadata, result.metadata); - - cudf::io::orc::detail::test(expected, cudf::get_default_stream()); } TEST_F(OrcChunkedWriterTest, Strings) @@ -1394,8 +1388,6 @@ TEST_P(OrcWriterTestStripes, StripeSize) auto result = cudf::io::read_orc(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - - cudf::io::orc::detail::test(expected->view(), cudf::get_default_stream()); }; { @@ -1492,8 +1484,6 @@ TEST_F(OrcWriterTest, TestMap) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); cudf::test::expect_metadata_equal(expected_metadata, result.metadata); - - cudf::io::orc::detail::test(cudf::table_view{{*list_col}}, cudf::get_default_stream()); } TEST_F(OrcReaderTest, NestedColumnSelection) From ae06017094b36b8a4d6bc23da34ea830c3970190 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 28 Feb 2024 16:05:52 -0800 Subject: [PATCH 112/321] Temporary store multiple decoded tables Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 49 ++++--------------------- cpp/src/io/orc/reader_impl_chunking.hpp | 2 +- 2 files changed, 9 insertions(+), 42 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 05f881fab71..ae46b3c6d48 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -820,6 +820,7 @@ void reader::impl::decompress_and_decode() // TODO: move this to global step lvl_chunks.resize(_selected_columns.num_levels()); + _out_buffers.clear(); _out_buffers.resize(_selected_columns.num_levels()); // @@ -1268,7 +1269,7 @@ void reader::impl::prepare_data(int64_t skip_rows, while (_chunk_read_data.more_stripe_to_decode()) { decompress_and_decode(); - _file_itm_data.out_buffers.push_back(std::move(_out_buffers)); + _file_itm_data.out_tables.push_back(std::move(_decoded_table)); } } printf("done load and decode data\n\n"); @@ -1285,12 +1286,11 @@ table_with_metadata reader::impl::make_output_chunk() // There is no columns in the table. if (_selected_columns.num_levels() == 0) { return {std::make_unique
(), table_metadata{}}; } - std::vector> out_columns; - auto out_metadata = get_meta_with_user_data(); - // If no rows or stripes to read, return empty columns if (_file_itm_data.has_no_data() /*|| !_chunk_read_data.has_next()*/) { printf("has no next\n"); + std::vector> out_columns; + auto out_metadata = get_meta_with_user_data(); std::transform(_selected_columns.levels[0].begin(), _selected_columns.levels[0].end(), std::back_inserter(out_columns), @@ -1307,43 +1307,10 @@ table_with_metadata reader::impl::make_output_chunk() return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; } - // TODO: move this into decompress_and_decode - // Create columns from buffer with respective schema information. - - // TODO: remove - std::vector> tabs; std::vector tv; - for (auto& buffers : _file_itm_data.out_buffers) { - // - out_columns.clear(); // TODO: remove - out_metadata = get_meta_with_user_data(); - - std::transform(_selected_columns.levels[0].begin(), - _selected_columns.levels[0].end(), - std::back_inserter(out_columns), - [&](auto const& orc_col_meta) { - out_metadata.schema_info.emplace_back(""); - auto col_buffer = assemble_buffer(orc_col_meta.id, - 0, - *_col_meta, - _metadata, - _selected_columns, - buffers, /*_out_buffers*/ - _stream, - _mr); - return make_column( - col_buffer, &out_metadata.schema_info.back(), std::nullopt, _stream); - }); - - // printf("output col0: \n"); - // cudf::test::print(out_columns.front()->view()); - // printf("output col1: \n"); - // cudf::test::print(out_columns.back()->view()); - - auto tbl = std::make_unique
(std::move(out_columns)); - tabs.push_back(std::move(tbl)); - tv.push_back(tabs.back()->view()); + for (auto& table : _file_itm_data.out_tables) { + tv.push_back(table->view()); // printf(" ----- decode one chunk, size = %d\n", tv.back().num_rows()); @@ -1379,7 +1346,7 @@ table_with_metadata reader::impl::make_output_chunk() return tmp; } - return std::move(tabs.front()); + return std::move(_file_itm_data.out_tables.front()); }(); // auto out_table = std::move(tabs.front()); @@ -1400,7 +1367,7 @@ table_with_metadata reader::impl::make_output_chunk() }(); #endif - return {std::move(out_table), std::move(out_metadata)}; + return {std::move(out_table), _out_metadata}; } table_metadata reader::impl::get_meta_with_user_data() diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index ba1de2e7525..d0996bcdde3 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -123,7 +123,7 @@ struct range { */ struct file_intermediate_data { // TODO: remove - std::vector>> out_buffers; + std::vector> out_tables; int64_t rows_to_skip; size_type rows_to_read; From 6cccca3f7e9a876876e53c211688e8741552cdcd Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 28 Feb 2024 19:25:47 -0800 Subject: [PATCH 113/321] Add test file Signed-off-by: Nghia Truong --- cpp/tests/CMakeLists.txt | 2 +- cpp/tests/io/orc_chunked_reader_test.cpp | 1013 ++++++++++++++++++++++ 2 files changed, 1014 insertions(+), 1 deletion(-) create mode 100644 cpp/tests/io/orc_chunked_reader_test.cpp diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3e377b07eee..1bf11603bc0 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -286,7 +286,7 @@ ConfigureTest( PERCENT 30 ) ConfigureTest( - ORC_TEST io/orc_test.cpp + ORC_TEST io/orc_chunked_reader_test.cpp io/orc_test.cpp GPUS 1 PERCENT 30 ) diff --git a/cpp/tests/io/orc_chunked_reader_test.cpp b/cpp/tests/io/orc_chunked_reader_test.cpp new file mode 100644 index 00000000000..eecadcc1e05 --- /dev/null +++ b/cpp/tests/io/orc_chunked_reader_test.cpp @@ -0,0 +1,1013 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace { +// Global environment for temporary files +auto const temp_env = reinterpret_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + +using int32s_col = cudf::test::fixed_width_column_wrapper; +using int64s_col = cudf::test::fixed_width_column_wrapper; +using strings_col = cudf::test::strings_column_wrapper; +using structs_col = cudf::test::structs_column_wrapper; +using int32s_lists_col = cudf::test::lists_column_wrapper; + +auto write_file(std::vector>& input_columns, + std::string const& filename, + bool nullable, + std::size_t stripe_size_bytes = cudf::io::default_stripe_size_bytes, + cudf::size_type stripe_size_rows = cudf::io::default_stripe_size_rows) +{ + // Just shift nulls of the next column by one position to avoid having all nulls in the same + // table rows. + if (nullable) { + // Generate deterministic bitmask instead of random bitmask for easy computation of data size. + auto const valid_iter = cudf::detail::make_counting_transform_iterator( + 0, [](cudf::size_type i) { return i % 4 != 3; }); + + cudf::size_type offset{0}; + for (auto& col : input_columns) { + auto const [null_mask, null_count] = + cudf::test::detail::make_null_mask(valid_iter + offset, valid_iter + col->size() + offset); + col = cudf::structs::detail::superimpose_nulls( + static_cast(null_mask.data()), + null_count, + std::move(col), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + } + } + + auto input_table = std::make_unique(std::move(input_columns)); + auto filepath = + temp_env->get_temp_filepath(nullable ? filename + "_nullable.orc" : filename + ".orc"); + + auto const write_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, *input_table) + .stripe_size_bytes(stripe_size_bytes) + .stripe_size_rows(stripe_size_rows) + .build(); + cudf::io::write_orc(write_opts); + + return std::pair{std::move(input_table), std::move(filepath)}; +} + +auto chunked_read(std::string const& filepath, std::size_t output_limit) +{ + auto const read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build(); + auto reader = cudf::io::chunked_orc_reader(output_limit, read_opts); + + auto num_chunks = 0; + auto out_tables = std::vector>{}; + + do { + auto chunk = reader.read_chunk(); + // If the input file is empty, the first call to `read_chunk` will return an empty table. + // Thus, we only check for non-empty output table from the second call. 
+ if (num_chunks > 0) { + CUDF_EXPECTS(chunk.tbl->num_rows() != 0, "Number of rows in the new chunk is zero."); + } + ++num_chunks; + out_tables.emplace_back(std::move(chunk.tbl)); + } while (reader.has_next()); + + auto out_tviews = std::vector{}; + for (auto const& tbl : out_tables) { + out_tviews.emplace_back(tbl->view()); + } + + return std::pair(cudf::concatenate(out_tviews), num_chunks); +} + +} // namespace + +struct OrcChunkedReaderTest : public cudf::test::BaseFixture {}; + +TEST_F(OrcChunkedReaderTest, TestChunkedReadNoData) +{ + std::vector> input_columns; + input_columns.emplace_back(int32s_col{}.release()); + input_columns.emplace_back(int64s_col{}.release()); + + auto const [expected, filepath] = write_file(input_columns, "chunked_read_empty", false); + auto const [result, num_chunks] = chunked_read(filepath, 1'000); + EXPECT_EQ(num_chunks, 1); + EXPECT_EQ(result->num_rows(), 0); + EXPECT_EQ(result->num_columns(), 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); +} + +#if 0 +TEST_F(OrcChunkedReaderTest, TestChunkedReadSimpleData) +{ + auto constexpr num_rows = 40'000; + + auto const generate_input = [num_rows](bool nullable) { + std::vector> input_columns; + auto const value_iter = thrust::make_counting_iterator(0); + input_columns.emplace_back(int32s_col(value_iter, value_iter + num_rows).release()); + input_columns.emplace_back(int64s_col(value_iter, value_iter + num_rows).release()); + + return write_file(input_columns, "chunked_read_simple", nullable); + }; + + { + auto const [expected, filepath] = generate_input(false); + auto const [result, num_chunks] = chunked_read(filepath, 240'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + { + auto const [expected, filepath] = generate_input(true); + auto const [result, num_chunks] = chunked_read(filepath, 240'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } +} + + +TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases) +{ + // Tests some specific boundary conditions in the split calculations. 
+ + auto constexpr num_rows = 40'000; + + auto const [expected, filepath] = [num_rows]() { + std::vector> input_columns; + auto const value_iter = thrust::make_counting_iterator(0); + input_columns.emplace_back(int32s_col(value_iter, value_iter + num_rows).release()); + return write_file(input_columns, "chunked_read_simple_boundary", false /*nullable*/); + }(); + + // Test with zero limit: everything will be read in one chunk + { + auto const [result, num_chunks] = chunked_read(filepath, 0); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a very small limit: 1 byte + { + auto const [result, num_chunks] = chunked_read(filepath, 1); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a very large limit + { + auto const [result, num_chunks] = chunked_read(filepath, 2L << 40); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a limit slightly less than one page of data + { + auto const [result, num_chunks] = chunked_read(filepath, 79'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a limit exactly the size one page of data + { + auto const [result, num_chunks] = chunked_read(filepath, 80'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a limit slightly more the size one page of data + { + auto const [result, num_chunks] = chunked_read(filepath, 81'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a limit slightly less than two pages of data + { + auto const [result, num_chunks] = chunked_read(filepath, 159'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a limit exactly the size of two pages of data minus one byte + { + auto const [result, num_chunks] = chunked_read(filepath, 159'999); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a limit exactly the size of two pages of data + { + auto const [result, num_chunks] = chunked_read(filepath, 160'000); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a limit slightly more the size two pages of data + { + auto const [result, num_chunks] = chunked_read(filepath, 161'000); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadWithString) +{ + auto constexpr num_rows = 60'000; + + auto const generate_input = [num_rows](bool nullable) { + std::vector> input_columns; + auto const value_iter = thrust::make_counting_iterator(0); + + // ints Page total bytes cumulative bytes + // 20000 rows of 4 bytes each = A0 80000 80000 + // 20000 rows of 4 bytes each = A1 80000 160000 + // 20000 rows of 4 bytes each = A2 80000 240000 + input_columns.emplace_back(int32s_col(value_iter, value_iter + num_rows).release()); + + // strings Page total bytes cumulative bytes + // 20000 rows of 1 char each (20000 + 80004) = B0 100004 100004 + // 20000 rows of 4 chars each (80000 + 80004) = B1 160004 260008 + // 20000 rows of 16 chars each (320000 + 80004) = B2 400004 660012 + auto const strings = std::vector{"a", "bbbb", "cccccccccccccccc"}; + auto const str_iter = cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { + if (i < 20000) { return strings[0]; } + if (i < 40000) { return strings[1]; } 
+ return strings[2]; + }); + input_columns.emplace_back(strings_col(str_iter, str_iter + num_rows).release()); + + // Cumulative sizes: + // A0 + B0 : 180004 + // A1 + B1 : 420008 + // A2 + B2 : 900012 + // skip_rows / num_rows + // byte_limit==500000 should give 2 chunks: {0, 40000}, {40000, 20000} + // byte_limit==1000000 should give 1 chunks: {0, 60000}, + return write_file(input_columns, + "chunked_read_with_strings", + nullable, + 512 * 1024, // 512KB per page + 20000 // 20k rows per page + ); + }; + + auto const [expected_no_null, filepath_no_null] = generate_input(false); + auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true); + + // Test with zero limit: everything will be read in one chunk + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 0); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 0); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Test with a very small limit: 1 byte + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1); + EXPECT_EQ(num_chunks, 3); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1); + EXPECT_EQ(num_chunks, 3); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Test with a very large limit + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 2L << 40); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 2L << 40); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Other tests: + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 500'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 500'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'000'000); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'000'000); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStringPrecise) +{ + auto constexpr num_rows = 60'000; + + auto const generate_input = [num_rows](bool nullable) { + std::vector> input_columns; + + // strings Page total bytes cumulative + // 20000 rows alternating 1-4 chars each (50000 + 80004) A0 130004 130004 + // 20000 rows alternating 1-4 chars each (50000 + 80004) A1 130004 260008 + // ... 
+ auto const strings = std::vector{"a", "bbbb"}; + auto const str_iter = + cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { return strings[i % 2]; }); + input_columns.emplace_back(strings_col(str_iter, str_iter + num_rows).release()); + + // Cumulative sizes: + // A0 : 130004 + // A1 : 260008 + // A2 : 390012 + return write_file(input_columns, + "chunked_read_with_strings_precise", + nullable, + 512 * 1024, // 512KB per page + 20000 // 20k rows per page + ); + }; + + auto const [expected_no_null, filepath_no_null] = generate_input(false); + + // a chunk limit of 1 byte less than 2 pages should force it to produce 3 chunks: + // each 1 page in size + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 260'007); + EXPECT_EQ(num_chunks, 3); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + // a chunk limit of exactly equal to 2 pages should force it to produce 2 chunks + // pages 0-1 and page 2 + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 260'008); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructs) +{ + auto constexpr num_rows = 100'000; + + auto const generate_input = [num_rows](bool nullable) { + std::vector> input_columns; + auto const int_iter = thrust::make_counting_iterator(0); + input_columns.emplace_back(int32s_col(int_iter, int_iter + num_rows).release()); + input_columns.emplace_back([=] { + auto child1 = int32s_col(int_iter, int_iter + num_rows); + auto child2 = int32s_col(int_iter + num_rows, int_iter + num_rows * 2); + + auto const str_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](int32_t i) { return std::to_string(i); }); + auto child3 = strings_col{str_iter, str_iter + num_rows}; + + return structs_col{{child1, child2, child3}}.release(); + }()); + + return write_file(input_columns, + "chunked_read_with_structs", + nullable, + 512 * 1024, // 512KB per page + 20000 // 20k rows per page + ); + }; + + auto const [expected_no_null, filepath_no_null] = generate_input(false); + auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true); + + // Test with zero limit: everything will be read in one chunk + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 0); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 0); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Test with a very small limit: 1 byte + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Test with a very large limit + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 2L << 40); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 2L << 40); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Other tests: + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 500'000); + EXPECT_EQ(num_chunks, 5); + 
CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 500'000); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsNoNulls) +{ + auto constexpr num_rows = 100'000; + + auto const [expected, filepath] = [num_rows]() { + std::vector> input_columns; + // 20000 rows in 1 page consist of: + // + // 20001 offsets : 80004 bytes + // 30000 ints : 120000 bytes + // total : 200004 bytes + auto const template_lists = int32s_lists_col{ + int32s_lists_col{}, int32s_lists_col{0}, int32s_lists_col{1, 2}, int32s_lists_col{3, 4, 5}}; + + auto const gather_iter = + cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { return i % 4; }); + auto const gather_map = int32s_col(gather_iter, gather_iter + num_rows); + input_columns.emplace_back( + std::move(cudf::gather(cudf::table_view{{template_lists}}, gather_map)->release().front())); + + return write_file(input_columns, + "chunked_read_with_lists_no_null", + false /*nullable*/, + 512 * 1024, // 512KB per page + 20000 // 20k rows per page + ); + }(); + + // Test with zero limit: everything will be read in one chunk + { + auto const [result, num_chunks] = chunked_read(filepath, 0); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a very small limit: 1 byte + { + auto const [result, num_chunks] = chunked_read(filepath, 1); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a very large limit + { + auto const [result, num_chunks] = chunked_read(filepath, 2L << 40); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // chunk size slightly less than 1 page (forcing it to be at least 1 page per read) + { + auto const [result, num_chunks] = chunked_read(filepath, 200'000); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // chunk size exactly 1 page + { + auto const [result, num_chunks] = chunked_read(filepath, 200'004); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // chunk size 2 pages. 
3 chunks (2 pages + 2 pages + 1 page) + { + auto const [result, num_chunks] = chunked_read(filepath, 400'008); + EXPECT_EQ(num_chunks, 3); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // chunk size 2 pages minus one byte: each chunk will be just one page + { + auto const [result, num_chunks] = chunked_read(filepath, 400'007); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsHavingNulls) +{ + auto constexpr num_rows = 100'000; + + auto const [expected, filepath] = [num_rows]() { + std::vector> input_columns; + // 20000 rows in 1 page consist of: + // + // 625 validity words : 2500 bytes (a null every 4 rows: null at indices [3, 7, 11, ...]) + // 20001 offsets : 80004 bytes + // 15000 ints : 60000 bytes + // total : 142504 bytes + auto const template_lists = + int32s_lists_col{// these will all be null + int32s_lists_col{}, + int32s_lists_col{0}, + int32s_lists_col{1, 2}, + int32s_lists_col{3, 4, 5, 6, 7, 8, 9} /* this list will be nullified out */}; + auto const gather_iter = + cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { return i % 4; }); + auto const gather_map = int32s_col(gather_iter, gather_iter + num_rows); + input_columns.emplace_back( + std::move(cudf::gather(cudf::table_view{{template_lists}}, gather_map)->release().front())); + + return write_file(input_columns, + "chunked_read_with_lists_nulls", + true /*nullable*/, + 512 * 1024, // 512KB per page + 20000 // 20k rows per page + ); + }(); + + // Test with zero limit: everything will be read in one chunk + { + auto const [result, num_chunks] = chunked_read(filepath, 0); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a very small limit: 1 byte + { + auto const [result, num_chunks] = chunked_read(filepath, 1); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a very large limit + { + auto const [result, num_chunks] = chunked_read(filepath, 2L << 40); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // chunk size slightly less than 1 page (forcing it to be at least 1 page per read) + { + auto const [result, num_chunks] = chunked_read(filepath, 142'500); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // chunk size exactly 1 page + { + auto const [result, num_chunks] = chunked_read(filepath, 142'504); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // chunk size 2 pages. 
3 chunks (2 pages + 2 pages + 1 page) + { + auto const [result, num_chunks] = chunked_read(filepath, 285'008); + EXPECT_EQ(num_chunks, 3); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // chunk size 2 pages minus 1 byte: each chunk will be just one page + { + auto const [result, num_chunks] = chunked_read(filepath, 285'007); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructsOfLists) +{ + auto constexpr num_rows = 100'000; + + auto const generate_input = [num_rows](bool nullable) { + std::vector> input_columns; + auto const int_iter = thrust::make_counting_iterator(0); + input_columns.emplace_back(int32s_col(int_iter, int_iter + num_rows).release()); + input_columns.emplace_back([=] { + std::vector> child_columns; + child_columns.emplace_back(int32s_col(int_iter, int_iter + num_rows).release()); + child_columns.emplace_back( + int32s_col(int_iter + num_rows, int_iter + num_rows * 2).release()); + + auto const str_iter = cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { + return std::to_string(i) + "++++++++++++++++++++" + std::to_string(i); + }); + child_columns.emplace_back(strings_col{str_iter, str_iter + num_rows}.release()); + + auto const template_lists = int32s_lists_col{ + int32s_lists_col{}, int32s_lists_col{0}, int32s_lists_col{0, 1}, int32s_lists_col{0, 1, 2}}; + auto const gather_iter = + cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { return i % 4; }); + auto const gather_map = int32s_col(gather_iter, gather_iter + num_rows); + child_columns.emplace_back( + std::move(cudf::gather(cudf::table_view{{template_lists}}, gather_map)->release().front())); + + return structs_col(std::move(child_columns)).release(); + }()); + + return write_file(input_columns, + "chunked_read_with_structs_of_lists", + nullable, + 512 * 1024, // 512KB per page + 20000 // 20k rows per page + ); + }; + + auto const [expected_no_null, filepath_no_null] = generate_input(false); + auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true); + + // Test with zero limit: everything will be read in one chunk + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 0); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 0); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Test with a very small limit: 1 byte + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1); + EXPECT_EQ(num_chunks, 10); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Test with a very large limit + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 2L << 40); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 2L << 40); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Other tests: + + // for these tests, different columns get written to different numbers of pages so it's a + // little tricky to describe the expected results by page counts. 
To get an idea of how + // these values are chosen, see the debug output from the call to print_cumulative_row_info() in + // reader_impl_preprocess.cu -> find_splits() + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'000'000); + EXPECT_EQ(num_chunks, 7); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'500'000); + EXPECT_EQ(num_chunks, 4); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 2'000'000); + EXPECT_EQ(num_chunks, 4); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 5'000'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'000'000); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'500'000); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 2'000'000); + EXPECT_EQ(num_chunks, 3); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 5'000'000); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsOfStructs) +{ + auto constexpr num_rows = 100'000; + + auto const generate_input = [num_rows](bool nullable) { + std::vector> input_columns; + auto const int_iter = thrust::make_counting_iterator(0); + input_columns.emplace_back(int32s_col(int_iter, int_iter + num_rows).release()); + + auto offsets = std::vector{}; + offsets.reserve(num_rows * 2); + cudf::size_type num_structs = 0; + for (int i = 0; i < num_rows; ++i) { + offsets.push_back(num_structs); + auto const new_list_size = i % 4; + num_structs += new_list_size; + } + offsets.push_back(num_structs); + + auto const make_structs_col = [=] { + auto child1 = int32s_col(int_iter, int_iter + num_structs); + auto child2 = int32s_col(int_iter + num_structs, int_iter + num_structs * 2); + + auto const str_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](int32_t i) { return std::to_string(i) + std::to_string(i) + std::to_string(i); }); + auto child3 = strings_col{str_iter, str_iter + num_structs}; + + return structs_col{{child1, child2, child3}}.release(); + }; + + input_columns.emplace_back( + cudf::make_lists_column(static_cast(offsets.size() - 1), + int32s_col(offsets.begin(), offsets.end()).release(), + make_structs_col(), + 0, + rmm::device_buffer{})); + + return write_file(input_columns, + "chunked_read_with_lists_of_structs", + nullable, + 512 * 1024, // 512KB per page + 20000 // 20k rows per page + ); + }; + + auto const [expected_no_null, filepath_no_null] = generate_input(false); + auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true); + + // Test with zero limit: everything will be read in one chunk + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 0); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 0); + 
EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Test with a very small limit: 1 byte + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1); + EXPECT_EQ(num_chunks, 10); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // Test with a very large limit + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 2L << 40); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 2L << 40); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + // for these tests, different columns get written to different numbers of pages so it's a + // little tricky to describe the expected results by page counts. To get an idea of how + // these values are chosen, see the debug output from the call to print_cumulative_row_info() in + // reader_impl_preprocess.cu -> find_splits() + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'000'000); + EXPECT_EQ(num_chunks, 7); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'500'000); + EXPECT_EQ(num_chunks, 4); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 2'000'000); + EXPECT_EQ(num_chunks, 4); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_no_null, 5'000'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'000'000); + EXPECT_EQ(num_chunks, 5); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'500'000); + EXPECT_EQ(num_chunks, 4); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 2'000'000); + EXPECT_EQ(num_chunks, 3); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 5'000'000); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); + } +} + +TEST_F(OrcChunkedReaderTest, TestChunkedReadNullCount) +{ + auto constexpr num_rows = 100'000; + + auto const sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); + auto const validity = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 4 != 3; }); + cudf::test::fixed_width_column_wrapper col{sequence, sequence + num_rows, validity}; + std::vector> cols; + cols.push_back(col.release()); + auto const expected = std::make_unique(std::move(cols)); + + auto const filepath = temp_env->get_temp_filepath("chunked_reader_null_count.parquet"); + auto const page_limit_rows = num_rows / 5; + auto const write_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected) + .max_page_size_rows(page_limit_rows) // 20k rows per page + .build(); + cudf::io::write_parquet(write_opts); + + auto const byte_limit = 
page_limit_rows * sizeof(int); + auto const read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).build(); + auto reader = cudf::io::chunked_parquet_reader(byte_limit, read_opts); + + do { + // Every fourth row is null + EXPECT_EQ(reader.read_chunk().tbl->get_column(0).null_count(), page_limit_rows / 4); + } while (reader.has_next()); +} + +TEST_F(OrcChunkedReaderTest, InputLimitSimple) +{ + auto const filepath = temp_env->get_temp_filepath("input_limit_10_rowgroups.parquet"); + + // This results in 10 grow groups, at 4001150 bytes per row group + constexpr int num_rows = 25'000'000; + auto value_iter = cudf::detail::make_counting_transform_iterator(0, [](int i) { return i; }); + cudf::test::fixed_width_column_wrapper expected(value_iter, value_iter + num_rows); + cudf::io::parquet_writer_options opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, + cudf::table_view{{expected}}) + // note: it is unnecessary to force compression to NONE here because the size we are using in + // the row group is the uncompressed data size. But forcing the dictionary policy to + // dictionary_policy::NEVER is necessary to prevent changes in the + // decompressed-but-not-yet-decoded data. + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + + cudf::io::write_parquet(opts); + + { + // no chunking + auto const [result, num_chunks] = chunked_read(filepath, 0, 0); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } + + { + // 25 chunks of 100k rows each + auto const [result, num_chunks] = chunked_read(filepath, 0, 1); + EXPECT_EQ(num_chunks, 25); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } + + { + // 25 chunks of 100k rows each + auto const [result, num_chunks] = chunked_read(filepath, 0, 4000000); + EXPECT_EQ(num_chunks, 25); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } + + { + // 25 chunks of 100k rows each + auto const [result, num_chunks] = chunked_read(filepath, 0, 4100000); + EXPECT_EQ(num_chunks, 25); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } + + { + // 12 chunks of 200k rows each, plus 1 final chunk of 100k rows. + auto const [result, num_chunks] = chunked_read(filepath, 0, 8002301); + EXPECT_EQ(num_chunks, 13); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } + + { + // 1 big chunk + auto const [result, num_chunks] = chunked_read(filepath, 0, size_t{1} * 1024 * 1024 * 1024); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } +} +#endif From 2488cb2479b043f9500f46de607b409f93fa33da Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 28 Feb 2024 19:28:09 -0800 Subject: [PATCH 114/321] Add comment Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index af959b78af8..8db36998311 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -831,6 +831,7 @@ void reader::impl::load_data() // DEBUG only _chunk_read_data.data_read_limit = stripe_decomp_sizes.back().size_bytes / 3; + // TODO: only decode stripes enough for output. 
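  // For illustration of the splitting below (hypothetical numbers, not taken from this
  // patch): three loaded stripes with decompressed sizes {100 MB, 80 MB, 120 MB} have
  // cumulative sizes {100, 180, 300} MB; a data_read_limit of 200 MB then makes
  // find_splits() cut the decoding work into two chunks, stripes [0, 2) and [2, 3).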
_chunk_read_data.decode_stripe_chunks = find_splits(stripe_decomp_sizes, stripe_chunk.count, _chunk_read_data.data_read_limit); for (auto& chunk : _chunk_read_data.decode_stripe_chunks) { From e3db4dcea9093151c9865512f3d090ff038d3df9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 28 Feb 2024 20:38:32 -0800 Subject: [PATCH 115/321] Add `output_row_granularity` parameter Signed-off-by: Nghia Truong --- cpp/include/cudf/io/detail/orc.hpp | 9 +++++++ cpp/include/cudf/io/orc.hpp | 14 ++++++++++ cpp/src/io/functions.cpp | 24 +++++++++++++++++ cpp/src/io/orc/reader.cu | 19 ++++++++++++++ cpp/src/io/orc/reader_impl.cu | 35 +++++++++++++++++++------ cpp/src/io/orc/reader_impl.hpp | 10 +++++++ cpp/src/io/orc/reader_impl_chunking.hpp | 10 +++++-- 7 files changed, 111 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index ff748e63506..ac024caf1f3 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -102,6 +102,7 @@ class chunked_reader : private reader { * whole file and return a table containing all rows. * * TODO: data read limit + * TODO: granularity * * @param output_size_limit Limit on total number of bytes to be returned per read, * or `0` if there is no limit @@ -119,6 +120,14 @@ class chunked_reader : private reader { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + explicit chunked_reader(std::size_t output_size_limit, + std::size_t data_read_limit, + size_type output_row_granularity, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** * @brief Destructor explicitly-declared to avoid inlined in header. * diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 9af86cee6d7..cfab642f25d 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -423,6 +423,12 @@ class chunked_orc_reader { */ chunked_orc_reader() = default; + // TODO + chunked_orc_reader(std::size_t output_size_limit, + orc_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Constructor for chunked reader. * @@ -446,6 +452,14 @@ class chunked_orc_reader { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + // TODO + chunked_orc_reader(std::size_t output_size_limit, + std::size_t data_read_limit, + size_type output_row_granularity, + orc_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Destructor, destroying the internal reader instance. 
* diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 2f3f57cc2d1..04799fabeef 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -451,13 +451,37 @@ void write_orc(orc_writer_options const& options, rmm::cuda_stream_view stream) /** * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader */ +chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : chunked_orc_reader(output_size_limit, 0UL, options, stream, mr) +{ +} + +chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, + std::size_t data_read_limit, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : reader{std::make_unique(output_size_limit, + data_read_limit, + make_datasources(options.get_source()), + options, + stream, + mr)} +{ +} + chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, std::size_t data_read_limit, + size_type output_row_granularity, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) : reader{std::make_unique(output_size_limit, data_read_limit, + output_row_granularity, make_datasources(options.get_source()), options, stream, diff --git a/cpp/src/io/orc/reader.cu b/cpp/src/io/orc/reader.cu index 855a96c9ae3..4d285e6788d 100644 --- a/cpp/src/io/orc/reader.cu +++ b/cpp/src/io/orc/reader.cu @@ -42,11 +42,30 @@ chunked_reader::chunked_reader(std::size_t output_size_limit, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + : reader() // TODO { _impl = std::make_unique( output_size_limit, data_read_limit, std::move(sources), options, stream, mr); } +chunked_reader::chunked_reader(std::size_t output_size_limit, + std::size_t data_read_limit, + size_type output_row_granularity, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : reader() // TODO +{ + _impl = std::make_unique(output_size_limit, + data_read_limit, + output_row_granularity, + std::move(sources), + options, + stream, + mr); +} + chunked_reader::~chunked_reader() = default; bool chunked_reader::has_next() const { return _impl->has_next(); } diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index ae46b3c6d48..75e743df9a4 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -729,13 +729,13 @@ void generate_offsets_for_list(host_span buff_data, rmm::cuda_ * @return */ std::vector find_table_splits(table_view const& input, + size_type segment_length, std::size_t size_limit, rmm::cuda_stream_view stream) { // Default 10k rows. 
- size_type constexpr SEGMENT_SIZE = 10'000; - auto const d_segmented_sizes = cudf::detail::segmented_bit_count( - input, SEGMENT_SIZE, stream, rmm::mr::get_current_device_resource()); + auto const d_segmented_sizes = cudf::detail::segmented_bit_count( + input, segment_length, stream, rmm::mr::get_current_device_resource()); auto const d_size_begin = d_segmented_sizes->view().begin(); auto segmented_sizes = @@ -746,8 +746,8 @@ std::vector find_table_splits(table_view const& input, d_size_begin, d_size_begin + d_segmented_sizes->size(), segmented_sizes.d_begin(), - [SEGMENT_SIZE] __device__(auto const size) { - return cumulative_size{SEGMENT_SIZE, static_cast(size)}; + [segment_length] __device__(auto const size) { + return cumulative_size{segment_length, static_cast(size)}; }); // TODO: exec_policy_nosync thrust::inclusive_scan(rmm::exec_policy(stream), @@ -1236,8 +1236,10 @@ void reader::impl::decompress_and_decode() // DEBUG only _chunk_read_data.output_size_limit = _chunk_read_data.data_read_limit / 3; - _chunk_read_data.output_table_chunks = - find_table_splits(_decoded_table->view(), _chunk_read_data.output_size_limit, _stream); + _chunk_read_data.output_table_chunks = find_table_splits(_decoded_table->view(), + _chunk_read_data.output_row_granularity, + _chunk_read_data.output_size_limit, + _stream); _chunk_read_data.curr_output_table_chunk = 0; auto& splits = _chunk_read_data.output_table_chunks; @@ -1413,6 +1415,23 @@ reader::impl::impl(std::size_t output_size_limit, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + : reader::impl::impl(output_size_limit, + data_read_limit, + DEFAULT_OUTPUT_ROW_GRANULARITY, + std::move(sources), + options, + stream, + mr) +{ +} + +reader::impl::impl(std::size_t output_size_limit, + std::size_t data_read_limit, + size_type output_row_granularity, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) : _stream(stream), _mr(mr), _config{options.get_timestamp_type(), @@ -1423,7 +1442,7 @@ reader::impl::impl(std::size_t output_size_limit, _sources(std::move(sources)), _metadata{_sources, stream}, _selected_columns{_metadata.select_columns(options.get_columns())}, - _chunk_read_data{output_size_limit, data_read_limit} + _chunk_read_data{output_size_limit, data_read_limit, output_row_granularity} { } diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index b94f639c05d..e6764d6d688 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -63,6 +63,14 @@ class reader::impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + explicit impl(std::size_t output_size_limit, + std::size_t data_read_limit, + size_type output_row_granularity, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** * @brief Read an entire set or a subset of data and returns a set of columns * @@ -178,6 +186,8 @@ class reader::impl { table_metadata _out_metadata; std::vector> _out_buffers; std::unique_ptr _decoded_table; + + static constexpr size_type DEFAULT_OUTPUT_ROW_GRANULARITY = 10'000; }; } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index d0996bcdde3..47b6ae7a02e 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -201,14 +201,20 @@ struct 
file_intermediate_data { * @brief Struct to store all data necessary for chunked reading. */ struct chunk_read_data { - explicit chunk_read_data(std::size_t output_size_limit_ = 0, std::size_t data_read_limit_ = 0) - : output_size_limit{output_size_limit_}, data_read_limit(data_read_limit_) + explicit chunk_read_data(std::size_t output_size_limit_, + std::size_t data_read_limit_, + size_type output_row_granularity_) + : output_size_limit{output_size_limit_}, + data_read_limit{data_read_limit_}, + output_row_granularity{output_row_granularity_} { } + // TODO: const for 3 below? std::size_t output_size_limit; // maximum size (in bytes) of an output chunk, or 0 for no limit std::size_t data_read_limit; // approximate maximum size (in bytes) used for store // intermediate data, or 0 for no limit + size_type output_row_granularity; // TODO // Chunks of stripes that can be load into memory such that their data size is within a size // limit. From 94d66ad8129d1ab14e75a44d8696668ab9f3443c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 28 Feb 2024 22:04:19 -0800 Subject: [PATCH 116/321] Fix segment length Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 4 ++++ cpp/tests/io/orc_chunked_reader_test.cpp | 17 +++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 75e743df9a4..19d433c04f6 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -733,6 +733,10 @@ std::vector find_table_splits(table_view const& input, std::size_t size_limit, rmm::cuda_stream_view stream) { + // If segment_length is zero: we don't have any limit on granularity. + // As such, set segment length to the number of rows. + if (segment_length == 0) { segment_length = input.num_rows(); } + // Default 10k rows. auto const d_segmented_sizes = cudf::detail::segmented_bit_count( input, segment_length, stream, rmm::mr::get_current_device_resource()); diff --git a/cpp/tests/io/orc_chunked_reader_test.cpp b/cpp/tests/io/orc_chunked_reader_test.cpp index eecadcc1e05..e6a7d3fcb36 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cpp +++ b/cpp/tests/io/orc_chunked_reader_test.cpp @@ -38,10 +38,10 @@ #include #include -#include - #include +#include + #include #include @@ -96,11 +96,15 @@ auto write_file(std::vector>& input_columns, return std::pair{std::move(input_table), std::move(filepath)}; } -auto chunked_read(std::string const& filepath, std::size_t output_limit) +auto chunked_read(std::string const& filepath, + std::size_t output_limit, + std::size_t input_limit = 0, + cudf::size_type output_row_granularity = 0) { auto const read_opts = cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build(); - auto reader = cudf::io::chunked_orc_reader(output_limit, read_opts); + auto reader = + cudf::io::chunked_orc_reader(output_limit, input_limit, output_row_granularity, read_opts); auto num_chunks = 0; auto out_tables = std::vector>{}; @@ -142,7 +146,6 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadNoData) CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } -#if 0 TEST_F(OrcChunkedReaderTest, TestChunkedReadSimpleData) { auto constexpr num_rows = 40'000; @@ -169,9 +172,11 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadSimpleData) EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } -} + exit(0); +} +#if 0 TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases) { // Tests some specific boundary conditions in the split calculations. 
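For reference, the grouping rule that `find_splits` and `find_table_splits` apply
throughout these patches can be sketched in plain host code. This is an
illustrative simplification only: the real implementation computes per-segment
sizes on the device (via `segmented_row_bit_count`), prefix-sums them with
`thrust::inclusive_scan`, and searches the cumulative sizes, but the grouping
behavior is the same. The `split` struct and `split_by_size` function below are
hypothetical names, not part of the patch.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct split {
      int64_t start_row;
      int64_t num_rows;
    };

    // Group fixed-length row segments into chunks whose total byte size stays
    // at or under `limit`. Each chunk keeps at least one segment, so a tiny
    // limit (e.g. 1 byte) degenerates to one segment per chunk.
    std::vector<split> split_by_size(std::vector<std::size_t> const& segment_sizes,
                                     int64_t segment_length,
                                     int64_t num_rows,
                                     std::size_t limit)
    {
      if (limit == 0 || segment_sizes.empty()) { return {split{0, num_rows}}; }

      std::vector<split> splits;
      int64_t start_row       = 0;
      std::size_t chunk_bytes = 0;
      for (std::size_t i = 0; i < segment_sizes.size(); ++i) {
        auto const seg_start = static_cast<int64_t>(i) * segment_length;
        // Close the current chunk before it would exceed the limit.
        if (chunk_bytes > 0 && chunk_bytes + segment_sizes[i] > limit) {
          splits.push_back(split{start_row, seg_start - start_row});
          start_row   = seg_start;
          chunk_bytes = 0;
        }
        chunk_bytes += segment_sizes[i];
      }
      splits.push_back(split{start_row, num_rows - start_row});
      return splits;
    }

For example, with four 10'000-row segments of 40'000 bytes each, a limit of
80'000 yields two chunks of 20'000 rows, while 79'999 yields one chunk per
segment -- the behavior the boundary-case tests later in this series assert.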
From e270aa38c1b7e1602e5dad6272bde75901d00219 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 29 Feb 2024 13:13:32 -0800 Subject: [PATCH 117/321] Use chunking for chunked reader Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 89 ++++++++----------------- cpp/src/io/orc/reader_impl.hpp | 1 - cpp/src/io/orc/reader_impl_chunking.cu | 22 +++--- cpp/src/io/orc/reader_impl_chunking.hpp | 3 - 4 files changed, 40 insertions(+), 75 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 19d433c04f6..4e0935d908d 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -1235,19 +1235,23 @@ void reader::impl::decompress_and_decode() orc_col_meta.id, 0, *_col_meta, _metadata, _selected_columns, _out_buffers, _stream, _mr); return make_column(col_buffer, &_out_metadata.schema_info.back(), std::nullopt, _stream); }); - _decoded_table = std::make_unique
<table>(std::move(out_columns));
+  _chunk_read_data.decoded_table = std::make_unique<table>
(std::move(out_columns)); // DEBUG only - _chunk_read_data.output_size_limit = _chunk_read_data.data_read_limit / 3; + // _chunk_read_data.output_size_limit = _chunk_read_data.data_read_limit / 3; - _chunk_read_data.output_table_chunks = find_table_splits(_decoded_table->view(), - _chunk_read_data.output_row_granularity, - _chunk_read_data.output_size_limit, - _stream); _chunk_read_data.curr_output_table_chunk = 0; + _chunk_read_data.output_table_chunks = + _chunk_read_data.output_size_limit == 0 + ? std::vector{chunk{0, _chunk_read_data.decoded_table->num_rows()}} + : find_table_splits(_chunk_read_data.decoded_table->view(), + _chunk_read_data.output_row_granularity, + _chunk_read_data.output_size_limit, + _stream); auto& splits = _chunk_read_data.output_table_chunks; - printf("------------\nSplits (/total num rows = %d): \n", (int)_decoded_table->num_rows()); + printf("------------\nSplits decoded table (/total num rows = %d): \n", + (int)_chunk_read_data.decoded_table->num_rows()); for (size_t idx = 0; idx < splits.size(); idx++) { printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count); } @@ -1268,16 +1272,18 @@ void reader::impl::prepare_data(int64_t skip_rows, global_preprocess(skip_rows, num_rows_opt, stripes); - // TODO: only load data if there is no loaded stripe ready to decode. - // load_data(); - while (_chunk_read_data.more_stripe_to_load()) { - load_data(); + if (!_chunk_read_data.more_table_chunk_to_output()) { + if (!_chunk_read_data.more_stripe_to_decode() && _chunk_read_data.more_stripe_to_load()) { + printf("load more data\n\n"); + load_data(); + } - while (_chunk_read_data.more_stripe_to_decode()) { + if (_chunk_read_data.more_stripe_to_decode()) { + printf("decode more data\n\n"); decompress_and_decode(); - _file_itm_data.out_tables.push_back(std::move(_decoded_table)); } } + printf("done load and decode data\n\n"); // decompress_and_decode(); @@ -1293,7 +1299,7 @@ table_with_metadata reader::impl::make_output_chunk() if (_selected_columns.num_levels() == 0) { return {std::make_unique
<table>(), table_metadata{}}; }

   // If no rows or stripes to read, return empty columns
-  if (_file_itm_data.has_no_data() /*|| !_chunk_read_data.has_next()*/) {
+  if (_file_itm_data.has_no_data() || !_chunk_read_data.more_table_chunk_to_output()) {
     printf("has no next\n");
     std::vector<std::unique_ptr<column>> out_columns;
     auto out_metadata = get_meta_with_user_data();
@@ -1313,50 +1319,7 @@ table_with_metadata reader::impl::make_output_chunk()
     return {std::make_unique<table>
(std::move(out_columns)), std::move(out_metadata)}; }

-  std::vector<table_view> tv;
-
-  for (auto& table : _file_itm_data.out_tables) {
-    tv.push_back(table->view());
-
-    //
-    printf(" ----- decode one chunk, size = %d\n", tv.back().num_rows());
-    fflush(stdout);
-    //
-    //
-    //
-    //
-  }
-  printf(" ----- decode total %d chunks\n", (int)tv.size());
-  fflush(stdout);
-
-  // todo: remove this
-  // auto out_table = std::make_unique<table>
(std::move(out_columns));
-  auto out_table = [&] {
-    if (tv.size() > 1) {
-      auto tmp = cudf::concatenate(tv);
-      std::vector<bool> has_mask(tmp->num_columns(), false);
-      std::vector<bool> has_nulls(tmp->num_columns(), false);
-
-      for (int i = 0; i < tmp->num_columns(); ++i) {
-        for (int j = 0; j < (int)tv.size(); ++j) {
-          if (tv[j].column(i).nullable()) { has_mask[i] = true; }
-          if (tv[j].column(i).null_count()) { has_nulls[i] = true; }
-        }
-      }
-      for (int i = 0; i < tmp->num_columns(); ++i) {
-        if (has_mask[i] && !has_nulls[i]) {
-          tmp->get_column(i).set_null_mask(
-            cudf::create_null_mask(tmp->get_column(i).size(), cudf::mask_state::ALL_VALID), 0);
-        }
-      }
-
-      return tmp;
-    }
-    return std::move(_file_itm_data.out_tables.front());
-  }();
-  // auto out_table = std::move(tabs.front());
-
-#if 0
+#if 1
   auto out_table = [&] {
     if (_chunk_read_data.output_table_chunks.size() == 1) {
       return std::move(_chunk_read_data.decoded_table);
     }

     auto const out_chunk =
       _chunk_read_data.output_table_chunks[_chunk_read_data.curr_output_table_chunk++];
     auto const out_tview =
-      cudf::slice(_chunk_read_data.decoded_table->view(),
-                  {static_cast<size_type>(out_chunk.start_idx),
-                   static_cast<size_type>(out_chunk.start_idx + out_chunk.count)},
-                  _stream)[0];
-    return std::make_unique<table>
(out_tview);
+      cudf::detail::slice(_chunk_read_data.decoded_table->view(),
+                          {static_cast<size_type>(out_chunk.start_idx),
+                           static_cast<size_type>(out_chunk.start_idx + out_chunk.count)},
+                          _stream)[0];
+    return std::make_unique<table>
(out_tview, _stream, _mr); }(); #endif diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index e6764d6d688..9ca003672a4 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -185,7 +185,6 @@ class reader::impl { std::unique_ptr _meta_with_user_data; table_metadata _out_metadata; std::vector> _out_buffers; - std::unique_ptr _decoded_table; static constexpr size_type DEFAULT_OUTPUT_ROW_GRANULARITY = 10'000; }; diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 8db36998311..2fe8f6753f1 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -311,7 +311,10 @@ std::vector find_splits(host_span sizes, int64_t total_count, size_t size_limit) { - // if (size_limit == 0) { return {chunk{0, total_count}}; } + // if (size_limit == 0) { + // printf("0 limit: output chunk = 0, %d\n", (int)total_count); + // return {chunk{0, total_count}}; + // } CUDF_EXPECTS(size_limit > 0, "Invalid size limit"); std::vector splits; @@ -592,10 +595,13 @@ void reader::impl::global_preprocess(uint64_t skip_rows, chunk{last_read_size, static_cast(read_info.size() - last_read_size)}; } + _chunk_read_data.curr_load_stripe_chunk = 0; + // Load all chunks if there is no read limit. if (_chunk_read_data.data_read_limit == 0) { + printf("0 limit: output load stripe chunk = 0, %d\n", (int)num_stripes); _chunk_read_data.load_stripe_chunks = {chunk{0, static_cast(num_stripes)}}; - // return; + return; } printf("total stripe sizes:\n"); @@ -620,7 +626,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // DEBUG only // TODO: use 0.3 constant - _chunk_read_data.data_read_limit = total_stripe_sizes.back().size_bytes / 3; + // _chunk_read_data.data_read_limit = total_stripe_sizes.back().size_bytes / 3; _chunk_read_data.load_stripe_chunks = find_splits(total_stripe_sizes, num_stripes, _chunk_read_data.data_read_limit); @@ -811,11 +817,14 @@ void reader::impl::load_data() } // end loop level + // Decoding is reset to start from the first chunk in `decode_stripe_chunks`. + _chunk_read_data.curr_decode_stripe_chunk = 0; + // Decode all chunks if there is no read limit. if (_chunk_read_data.data_read_limit == 0) { _chunk_read_data.decode_stripe_chunks = {stripe_chunk}; // TODO: DEBUG only - // return; + return; } // Compute the prefix sum of stripe data sizes. @@ -829,7 +838,7 @@ void reader::impl::load_data() stripe_decomp_sizes.device_to_host_sync(_stream); // DEBUG only - _chunk_read_data.data_read_limit = stripe_decomp_sizes.back().size_bytes / 3; + // _chunk_read_data.data_read_limit = stripe_decomp_sizes.back().size_bytes / 3; // TODO: only decode stripes enough for output. _chunk_read_data.decode_stripe_chunks = @@ -865,9 +874,6 @@ void reader::impl::load_data() // lvl_stripe_data.clear(); // _file_itm_data.compinfo_ready = true; - - // Decoding is reset to start from the first chunk in `decode_stripe_chunks`. - _chunk_read_data.curr_decode_stripe_chunk = 0; } } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 47b6ae7a02e..cc37ac585a3 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -122,9 +122,6 @@ struct range { * @brief Struct to store file-level data that remains constant for all chunks being output. 
*/ struct file_intermediate_data { - // TODO: remove - std::vector> out_tables; - int64_t rows_to_skip; size_type rows_to_read; std::vector selected_stripes; From b307b802d0535c84f061afa66629eba67bb21f0a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 29 Feb 2024 13:27:28 -0800 Subject: [PATCH 118/321] Fix bug in chunking Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 4e0935d908d..d7b7bc47e13 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -1322,6 +1322,7 @@ table_with_metadata reader::impl::make_output_chunk() #if 1 auto out_table = [&] { if (_chunk_read_data.output_table_chunks.size() == 1) { + _chunk_read_data.curr_output_table_chunk++; return std::move(_chunk_read_data.decoded_table); } From fcdc9c1d89e1cacb67a165f611b072a8ecb9b599 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 29 Feb 2024 13:30:48 -0800 Subject: [PATCH 119/321] Add debug info Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index d7b7bc47e13..68b85941709 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -733,6 +733,8 @@ std::vector find_table_splits(table_view const& input, std::size_t size_limit, rmm::cuda_stream_view stream) { + printf("find table split, seg length = %d, limit = %d \n", segment_length, (int)size_limit); + // If segment_length is zero: we don't have any limit on granularity. // As such, set segment length to the number of rows. if (segment_length == 0) { segment_length = input.num_rows(); } @@ -753,6 +755,14 @@ std::vector find_table_splits(table_view const& input, [segment_length] __device__(auto const size) { return cumulative_size{segment_length, static_cast(size)}; }); + + // TODO: remove: + segmented_sizes.device_to_host_sync(stream); + printf("total row sizes by segment = %d:\n", (int)segment_length); + for (auto& size : segmented_sizes) { + printf("size: %ld, %zu\n", size.count, size.size_bytes); + } + // TODO: exec_policy_nosync thrust::inclusive_scan(rmm::exec_policy(stream), segmented_sizes.d_begin(), @@ -1412,6 +1422,13 @@ reader::impl::impl(std::size_t output_size_limit, _selected_columns{_metadata.select_columns(options.get_columns())}, _chunk_read_data{output_size_limit, data_read_limit, output_row_granularity} { + printf("construct reader , limit = %d, %d, gradunarity %d \n", + + (int)output_size_limit, + (int)data_read_limit, + (int)output_row_granularity + + ); } table_with_metadata reader::impl::read(int64_t skip_rows, From 915a3fcd55548d9c09c67024d6bbe79423f9840f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 29 Feb 2024 13:37:21 -0800 Subject: [PATCH 120/321] Fix a bug in setting row granularity Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 68b85941709..1ca175b56b5 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -1420,7 +1420,10 @@ reader::impl::impl(std::size_t output_size_limit, _sources(std::move(sources)), _metadata{_sources, stream}, _selected_columns{_metadata.select_columns(options.get_columns())}, - _chunk_read_data{output_size_limit, data_read_limit, output_row_granularity} + _chunk_read_data{ + output_size_limit, + 
data_read_limit, + output_row_granularity > 0 ? output_row_granularity : DEFAULT_OUTPUT_ROW_GRANULARITY} { printf("construct reader , limit = %d, %d, gradunarity %d \n", From de4a365274634fba9dd278b0695e0969e7d637ce Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 29 Feb 2024 13:41:42 -0800 Subject: [PATCH 121/321] Fix test Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 2 +- cpp/tests/io/orc_chunked_reader_test.cpp | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 1ca175b56b5..b88dc361dc3 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -760,7 +760,7 @@ std::vector find_table_splits(table_view const& input, segmented_sizes.device_to_host_sync(stream); printf("total row sizes by segment = %d:\n", (int)segment_length); for (auto& size : segmented_sizes) { - printf("size: %ld, %zu\n", size.count, size.size_bytes); + printf("size: %ld, %zu\n", size.count, size.size_bytes / CHAR_BIT); } // TODO: exec_policy_nosync diff --git a/cpp/tests/io/orc_chunked_reader_test.cpp b/cpp/tests/io/orc_chunked_reader_test.cpp index e6a7d3fcb36..05fe45c631d 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cpp +++ b/cpp/tests/io/orc_chunked_reader_test.cpp @@ -161,19 +161,17 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadSimpleData) { auto const [expected, filepath] = generate_input(false); - auto const [result, num_chunks] = chunked_read(filepath, 240'000); + auto const [result, num_chunks] = chunked_read(filepath, 245'000); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } { auto const [expected, filepath] = generate_input(true); - auto const [result, num_chunks] = chunked_read(filepath, 240'000); + auto const [result, num_chunks] = chunked_read(filepath, 245'000); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - - exit(0); } #if 0 From 119002ed82d2856110246625c4c7396373bb90c4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 29 Feb 2024 13:48:11 -0800 Subject: [PATCH 122/321] Improve tests Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 4 ++++ cpp/tests/io/orc_chunked_reader_test.cpp | 26 +++++++++++++++++++----- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index b88dc361dc3..0b8a7a61226 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -1444,12 +1444,16 @@ table_with_metadata reader::impl::read(int64_t skip_rows, bool reader::impl::has_next() { + printf("==================query has next \n"); prepare_data(); + + printf("has next: %d\n", (int)_chunk_read_data.has_next()); return _chunk_read_data.has_next(); } table_with_metadata reader::impl::read_chunk() { + printf("==================call read chunk\n"); prepare_data(); return make_output_chunk(); } diff --git a/cpp/tests/io/orc_chunked_reader_test.cpp b/cpp/tests/io/orc_chunked_reader_test.cpp index 05fe45c631d..40b0313ac14 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cpp +++ b/cpp/tests/io/orc_chunked_reader_test.cpp @@ -150,31 +150,46 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadSimpleData) { auto constexpr num_rows = 40'000; - auto const generate_input = [num_rows](bool nullable) { + auto const generate_input = [num_rows](bool nullable, std::size_t stripe_rows) { std::vector> input_columns; auto const value_iter = thrust::make_counting_iterator(0); input_columns.emplace_back(int32s_col(value_iter, value_iter + 
num_rows).release()); input_columns.emplace_back(int64s_col(value_iter, value_iter + num_rows).release()); - return write_file(input_columns, "chunked_read_simple", nullable); + return write_file(input_columns, + "chunked_read_simple", + nullable, + cudf::io::default_stripe_size_bytes, + stripe_rows); }; { - auto const [expected, filepath] = generate_input(false); + auto const [expected, filepath] = generate_input(false, 1'000); + auto const [result, num_chunks] = chunked_read(filepath, 245'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + { + auto const [expected, filepath] = generate_input(false, cudf::io::default_stripe_size_rows); auto const [result, num_chunks] = chunked_read(filepath, 245'000); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } { - auto const [expected, filepath] = generate_input(true); + auto const [expected, filepath] = generate_input(true, 1'000); + auto const [result, num_chunks] = chunked_read(filepath, 245'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + { + auto const [expected, filepath] = generate_input(true, cudf::io::default_stripe_size_rows); auto const [result, num_chunks] = chunked_read(filepath, 245'000); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } } -#if 0 TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases) { // Tests some specific boundary conditions in the split calculations. @@ -259,6 +274,7 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases) } } +#if 0 TEST_F(OrcChunkedReaderTest, TestChunkedReadWithString) { auto constexpr num_rows = 60'000; From 818cfb7337ea5dc146107b604522e4a5a31c8990 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 29 Feb 2024 14:12:58 -0800 Subject: [PATCH 123/321] Implement adaptive size limit for decoding Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 2fe8f6753f1..859671c184c 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -628,6 +628,8 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // TODO: use 0.3 constant // _chunk_read_data.data_read_limit = total_stripe_sizes.back().size_bytes / 3; + // TODO: handle case for extremely large files. + _chunk_read_data.load_stripe_chunks = find_splits(total_stripe_sizes, num_stripes, _chunk_read_data.data_read_limit); @@ -820,8 +822,8 @@ void reader::impl::load_data() // Decoding is reset to start from the first chunk in `decode_stripe_chunks`. _chunk_read_data.curr_decode_stripe_chunk = 0; - // Decode all chunks if there is no read limit. - if (_chunk_read_data.data_read_limit == 0) { + // Decode all chunks if there is no read and no output limit. + if (_chunk_read_data.data_read_limit == 0 && _chunk_read_data.output_size_limit == 0) { _chunk_read_data.decode_stripe_chunks = {stripe_chunk}; // TODO: DEBUG only return; @@ -840,9 +842,19 @@ void reader::impl::load_data() // DEBUG only // _chunk_read_data.data_read_limit = stripe_decomp_sizes.back().size_bytes / 3; - // TODO: only decode stripes enough for output. + // TODO: Check and turn this 1.0. + // If there is no read limit, we still do not decode all stripes. 
+ // Typically, the limit below will result in a very large number of stripes + // since their data is compressed to be much smaller than the actual data. + // However, it is still better than decoding all stripes, which may be a huge number. + auto const decode_size_limit = _chunk_read_data.data_read_limit > 0 + ? _chunk_read_data.data_read_limit + : _chunk_read_data.output_size_limit; + + printf("decode size limit: %d\n", (int)decode_size_limit); + _chunk_read_data.decode_stripe_chunks = - find_splits(stripe_decomp_sizes, stripe_chunk.count, _chunk_read_data.data_read_limit); + find_splits(stripe_decomp_sizes, stripe_chunk.count, decode_size_limit); for (auto& chunk : _chunk_read_data.decode_stripe_chunks) { chunk.start_idx += stripe_chunk.start_idx; } From bce6e8db3b02ac3d22107a7d4e174d359707f3fa Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 29 Feb 2024 16:27:49 -0800 Subject: [PATCH 124/321] Update `row_bit_count.cu` Signed-off-by: Nghia Truong --- cpp/include/cudf/detail/transform.hpp | 10 ++--- cpp/include/cudf/transform.hpp | 25 ++++++++++- cpp/src/io/orc/reader_impl.cu | 2 +- cpp/src/transform/row_bit_count.cu | 62 ++++++++++++++------------- 4 files changed, 62 insertions(+), 37 deletions(-) diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index 0ce7037b9e8..965fea84860 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -101,14 +101,14 @@ std::unique_ptr row_bit_count(table_view const& t, rmm::mr::device_memory_resource* mr); /** - * @copydoc cudf::segmented_bit_count + * @copydoc cudf::segmented_row_bit_count * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr segmented_bit_count(table_view const& t, - size_type segment_length, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr segmented_row_bit_count(table_view const& t, + size_type segment_length, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index 412fe17ef26..49ec3d7c0d5 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -224,5 +224,28 @@ std::unique_ptr row_bit_count( table_view const& t, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns an approximate cumulative size in bits of all columns in the `table_view` for + * each segment of rows. + * + * This is similar to counting bit size per row for the input table in `cudf::row_bit_count`, + * except that row sizes are accumulated by segments. + * + * Currently, only fixed-length segments are supported. In case the input table has number of rows + * not divisible by `segment_length`, its last segment is considered as shorter than the others. + * + * @throw std::invalid_argument if the input `segment_length` is non-positive or larger than the + * number of rows in the input table. 
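+ *
+ * A brief usage sketch (illustrative; assumes a table `t` with 100 rows):
+ * @code
+ * auto const sizes = cudf::segmented_row_bit_count(t, 32);
+ * // `sizes` holds 4 entries: three full 32-row segments plus a final 4-row
+ * // segment; each entry is that segment's total size in bits.
+ * @endcode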
+ * + * @param t The table view to perform the computation on + * @param segment_length The number of rows in each segment for which the total size is computed + * @param mr Device memory resource used to allocate the returned columns' device memory + * @return A 32-bit integer column containing the bit counts for each segment of rows + */ +std::unique_ptr segmented_row_bit_count( + table_view const& t, + size_type segment_length, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 0b8a7a61226..974043b78db 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -740,7 +740,7 @@ std::vector find_table_splits(table_view const& input, if (segment_length == 0) { segment_length = input.num_rows(); } // Default 10k rows. - auto const d_segmented_sizes = cudf::detail::segmented_bit_count( + auto const d_segmented_sizes = cudf::detail::segmented_row_bit_count( input, segment_length, stream, rmm::mr::get_current_device_resource()); auto const d_size_begin = d_segmented_sizes->view().begin(); diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 8c0a805b00f..10260df8fb1 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -35,7 +35,7 @@ #include #include #include -#include +#include namespace cudf { namespace detail { @@ -404,11 +404,11 @@ __device__ size_type row_size_functor::operator()(column_device_vie * @param segment_length The number of rows in each segment for which the total size is computed * @param max_branch_depth Maximum depth of the span stack needed per-thread */ -CUDF_KERNEL void compute_row_sizes(device_span cols, - device_span info, - device_span output, - size_type segment_length, - size_type max_branch_depth) +CUDF_KERNEL void compute_segment_sizes(device_span cols, + device_span info, + device_span output, + size_type segment_length, + size_type max_branch_depth) { extern __shared__ row_span thread_branch_stacks[]; int const tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -422,8 +422,11 @@ CUDF_KERNEL void compute_row_sizes(device_span cols, size_type branch_depth{0}; // current row span - always starts at spanning over `segment_length` rows. 
- auto const num_rows = cols[0].size(); - row_span cur_span{tid * segment_length, cuda::std::min((tid + 1) * segment_length, num_rows)}; + auto const num_rows = cols[0].size(); + auto const get_default_row_span = [=] { + return row_span{tid * segment_length, cuda::std::min((tid + 1) * segment_length, num_rows)}; + }; + auto cur_span = get_default_row_span(); // output size size_type& size = output[tid]; @@ -450,8 +453,7 @@ CUDF_KERNEL void compute_row_sizes(device_span cols, if (info[idx].depth == 0) { branch_depth = 0; last_branch_depth = 0; - cur_span = - row_span{tid * segment_length, cuda::std::min((tid + 1) * segment_length, num_rows)}; + cur_span = get_default_row_span(); } // add the contributing size of this row @@ -472,16 +474,18 @@ CUDF_KERNEL void compute_row_sizes(device_span cols, } // anonymous namespace -std::unique_ptr segmented_bit_count(table_view const& t, - size_type segment_length, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr segmented_row_bit_count(table_view const& t, + size_type segment_length, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(segment_length >= 1, "Invalid segment length.", std::invalid_argument); - - // no rows + // If there is no rows, segment_length will not be checked. if (t.num_rows() <= 0) { return cudf::make_empty_column(type_id::INT32); } + CUDF_EXPECTS(segment_length >= 1 && segment_length <= t.num_rows(), + "Invalid segment length.", + std::invalid_argument); + // flatten the hierarchy and determine some information about it. std::vector cols; std::vector info; @@ -498,11 +502,10 @@ std::unique_ptr segmented_bit_count(table_view const& t, // simple case. if we have no complex types (lists, strings, etc), the per-row size is already // trivially computed if (h_info.complex_type_count <= 0) { - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_segments), + thrust::tabulate( + rmm::exec_policy_nosync(stream), mcv.begin(), + mcv.end(), cuda::proclaim_return_type( [segment_length, num_segments, @@ -510,9 +513,8 @@ std::unique_ptr segmented_bit_count(table_view const& t, per_row_size = h_info.simple_per_row_size] __device__(size_type const segment_idx) { // Since the number of rows may not divisible by segment_length, // the last segment may be shorter than the others. - auto const current_length = segment_idx + 1 < num_segments - ? 
segment_length - : num_rows - segment_length * segment_idx; + auto const current_length = + cuda::std::min(segment_length, num_rows - segment_length * segment_idx); return per_row_size * current_length; })); return output; @@ -544,7 +546,7 @@ std::unique_ptr segmented_bit_count(table_view const& t, CUDF_EXPECTS(block_size > 0, "Encountered a column hierarchy too complex for row_bit_count"); cudf::detail::grid_1d grid{num_segments, block_size, 1}; - compute_row_sizes<<>>( + compute_segment_sizes<<>>( {std::get<1>(d_cols), cols.size()}, {d_info.data(), info.size()}, {mcv.data(), static_cast(mcv.size())}, @@ -558,17 +560,17 @@ std::unique_ptr row_bit_count(table_view const& t, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return segmented_bit_count(t, 1, stream, mr); + return segmented_row_bit_count(t, 1, stream, mr); } } // namespace detail -std::unique_ptr segmented_bit_count(table_view const& t, - size_type segment_length, - rmm::mr::device_memory_resource* mr) +std::unique_ptr segmented_row_bit_count(table_view const& t, + size_type segment_length, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::segmented_bit_count(t, segment_length, cudf::get_default_stream(), mr); + return detail::segmented_row_bit_count(t, segment_length, cudf::get_default_stream(), mr); } std::unique_ptr row_bit_count(table_view const& t, rmm::mr::device_memory_resource* mr) From f6fc6f06067c38972aeb4c9f716d8593ef1fb387 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 29 Feb 2024 16:31:51 -0800 Subject: [PATCH 125/321] Fix caller to `segmented_row_bit_count` Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 974043b78db..ca7e1605bfc 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -739,6 +739,10 @@ std::vector find_table_splits(table_view const& input, // As such, set segment length to the number of rows. if (segment_length == 0) { segment_length = input.num_rows(); } + // If we have small number of rows, need to adjust segment_length before calling to + // `segmented_row_bit_count`. + segment_length = std::min(segment_length, input.num_rows()); + // Default 10k rows. auto const d_segmented_sizes = cudf::detail::segmented_row_bit_count( input, segment_length, stream, rmm::mr::get_current_device_resource()); From bd198dc71a4a4c444215f96bea4a060b6d43c0c4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 29 Feb 2024 17:01:29 -0800 Subject: [PATCH 126/321] Remove adaptive size for decoding Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 24 +++++++++--------------- cpp/tests/io/orc_chunked_reader_test.cpp | 7 ++++--- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 859671c184c..19aa6eac48a 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -763,7 +763,7 @@ void reader::impl::load_data() printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n", (int)info.id.stripe_idx, (int)info.id.level, - (int)info.id.orc_cold_idx, + (int)info.id.orc_col_idx, (int)info.id.kind, info.dst_pos, info.length); @@ -823,9 +823,14 @@ void reader::impl::load_data() _chunk_read_data.curr_decode_stripe_chunk = 0; // Decode all chunks if there is no read and no output limit. 
- if (_chunk_read_data.data_read_limit == 0 && _chunk_read_data.output_size_limit == 0) { + // In theory, we should just decode enough stripes for output one table chunk. + // However, we do not know the output size of each stripe after decompressing and decoding, + // thus we have to process all loaded chunks. + // That is because the estimated `max_uncompressed_size` of stream data from + // `ParseCompressedStripeData` is just the approximate of the maximum possible size, not the + // actual size, which can be much smaller in practice. + if (_chunk_read_data.data_read_limit == 0) { _chunk_read_data.decode_stripe_chunks = {stripe_chunk}; - // TODO: DEBUG only return; } @@ -842,19 +847,8 @@ void reader::impl::load_data() // DEBUG only // _chunk_read_data.data_read_limit = stripe_decomp_sizes.back().size_bytes / 3; - // TODO: Check and turn this 1.0. - // If there is no read limit, we still do not decode all stripes. - // Typically, the limit below will result in a very large number of stripes - // since their data is compressed to be much smaller than the actual data. - // However, it is still better than decoding all stripes, which may be a huge number. - auto const decode_size_limit = _chunk_read_data.data_read_limit > 0 - ? _chunk_read_data.data_read_limit - : _chunk_read_data.output_size_limit; - - printf("decode size limit: %d\n", (int)decode_size_limit); - _chunk_read_data.decode_stripe_chunks = - find_splits(stripe_decomp_sizes, stripe_chunk.count, decode_size_limit); + find_splits(stripe_decomp_sizes, stripe_chunk.count, _chunk_read_data.data_read_limit); for (auto& chunk : _chunk_read_data.decode_stripe_chunks) { chunk.start_idx += stripe_chunk.start_idx; } diff --git a/cpp/tests/io/orc_chunked_reader_test.cpp b/cpp/tests/io/orc_chunked_reader_test.cpp index 40b0313ac14..12ffaa30a8f 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cpp +++ b/cpp/tests/io/orc_chunked_reader_test.cpp @@ -62,13 +62,10 @@ auto write_file(std::vector>& input_columns, std::size_t stripe_size_bytes = cudf::io::default_stripe_size_bytes, cudf::size_type stripe_size_rows = cudf::io::default_stripe_size_rows) { - // Just shift nulls of the next column by one position to avoid having all nulls in the same - // table rows. if (nullable) { // Generate deterministic bitmask instead of random bitmask for easy computation of data size. auto const valid_iter = cudf::detail::make_counting_transform_iterator( 0, [](cudf::size_type i) { return i % 4 != 3; }); - cudf::size_type offset{0}; for (auto& col : input_columns) { auto const [null_mask, null_count] = @@ -79,6 +76,10 @@ auto write_file(std::vector>& input_columns, std::move(col), cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + + // Shift nulls of the next column by one position, to avoid having all nulls + // in the same table rows. 
+ ++offset; } } From d23591d2f63ea7504d53e0dd8f5975fe4d70cbd6 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 29 Feb 2024 17:10:03 -0800 Subject: [PATCH 127/321] Update test Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cpp b/cpp/tests/io/orc_chunked_reader_test.cpp index 12ffaa30a8f..6068540438c 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cpp +++ b/cpp/tests/io/orc_chunked_reader_test.cpp @@ -100,7 +100,7 @@ auto write_file(std::vector>& input_columns, auto chunked_read(std::string const& filepath, std::size_t output_limit, std::size_t input_limit = 0, - cudf::size_type output_row_granularity = 0) + cudf::size_type output_row_granularity = 10'000) { auto const read_opts = cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build(); @@ -214,7 +214,8 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases) // Test with a very small limit: 1 byte { auto const [result, num_chunks] = chunked_read(filepath, 1); - EXPECT_EQ(num_chunks, 2); + // Number of chunks is 4 because of using default output_row_granularity=10k. + EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } From a581e96eb5dfb92d0934a3cd1fdfc4d593462ada Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 29 Feb 2024 20:39:55 -0800 Subject: [PATCH 128/321] Fix segment size processing Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index ca7e1605bfc..a7c9d1ab635 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -746,19 +746,26 @@ std::vector find_table_splits(table_view const& input, // Default 10k rows. auto const d_segmented_sizes = cudf::detail::segmented_row_bit_count( input, segment_length, stream, rmm::mr::get_current_device_resource()); - auto const d_size_begin = d_segmented_sizes->view().begin(); auto segmented_sizes = cudf::detail::hostdevice_vector(d_segmented_sizes->size(), stream); // TODO: exec_policy_nosync - thrust::transform(rmm::exec_policy(stream), - d_size_begin, - d_size_begin + d_segmented_sizes->size(), - segmented_sizes.d_begin(), - [segment_length] __device__(auto const size) { - return cumulative_size{segment_length, static_cast(size)}; - }); + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(d_segmented_sizes->size()), + segmented_sizes.d_begin(), + [segment_length, + num_rows = input.num_rows(), + d_sizes = d_segmented_sizes->view().begin()] __device__(auto const segment_idx) { + // Since the number of rows may not divisible by segment_length, + // the last segment may be shorter than the others. 
+ auto const current_length = + cuda::std::min(segment_length, num_rows - segment_length * segment_idx); + auto const size = d_sizes[segment_idx]; + return cumulative_size{current_length, static_cast(size)}; + }); // TODO: remove: segmented_sizes.device_to_host_sync(stream); From 6072ffa0cc0353f14748e477aabdfd7d37034f90 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 29 Feb 2024 20:40:01 -0800 Subject: [PATCH 129/321] Add more test Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cpp | 78 +++++++++++++++++------- 1 file changed, 55 insertions(+), 23 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cpp b/cpp/tests/io/orc_chunked_reader_test.cpp index 6068540438c..237ffb43b8b 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cpp +++ b/cpp/tests/io/orc_chunked_reader_test.cpp @@ -97,6 +97,9 @@ auto write_file(std::vector>& input_columns, return std::pair{std::move(input_table), std::move(filepath)}; } +// NOTE: By default, output_row_granularity=10'000 rows. +// This means if the input file has more than 10k rows then the output chunk will never +// have less than 10k rows. auto chunked_read(std::string const& filepath, std::size_t output_limit, std::size_t input_limit = 0, @@ -204,73 +207,102 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases) return write_file(input_columns, "chunked_read_simple_boundary", false /*nullable*/); }(); - // Test with zero limit: everything will be read in one chunk + // Test with zero limit: everything will be read in one chunk. { - auto const [result, num_chunks] = chunked_read(filepath, 0); + auto const [result, num_chunks] = chunked_read(filepath, 0UL); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // Test with a very small limit: 1 byte + // Test with a very small limit: 1 byte. { - auto const [result, num_chunks] = chunked_read(filepath, 1); - // Number of chunks is 4 because of using default output_row_granularity=10k. + auto const [result, num_chunks] = chunked_read(filepath, 1UL); + // Number of chunks is 4 because of using default `output_row_granularity = 10k`. EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } + // Test with a very small limit: 1 byte, and small value of `output_row_granularity`. + { + auto const [result, num_chunks] = chunked_read(filepath, 1UL, 0UL, 1'000); + EXPECT_EQ(num_chunks, 40); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a very small limit: 1 byte, and large value of `output_row_granularity`. + { + auto const [result, num_chunks] = chunked_read(filepath, 1UL, 0UL, 30'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } // Test with a very large limit { auto const [result, num_chunks] = chunked_read(filepath, 2L << 40); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } + // Test with a limit slightly less than one granularity segment of data + // (output_row_granularity = 10k rows = 40'000 bytes). + { + auto const [result, num_chunks] = chunked_read(filepath, 39'000UL); + EXPECT_EQ(num_chunks, 4); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } - // Test with a limit slightly less than one page of data + // Test with a limit exactly the size one granularity segment of data + // (output_row_granularity = 10k rows = 40'000 bytes). 
{ - auto const [result, num_chunks] = chunked_read(filepath, 79'000); - EXPECT_EQ(num_chunks, 2); + auto const [result, num_chunks] = chunked_read(filepath, 40'000UL); + EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // Test with a limit exactly the size one page of data + // Test with a limit slightly more than one granularity segment of data + // (output_row_granularity = 10k rows = 40'000 bytes). { - auto const [result, num_chunks] = chunked_read(filepath, 80'000); - EXPECT_EQ(num_chunks, 2); + auto const [result, num_chunks] = chunked_read(filepath, 41'000UL); + EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // Test with a limit slightly more the size one page of data + // Test with a limit slightly less than two granularity segments of data { - auto const [result, num_chunks] = chunked_read(filepath, 81'000); - EXPECT_EQ(num_chunks, 2); + auto const [result, num_chunks] = chunked_read(filepath, 79'000UL); + EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // Test with a limit slightly less than two pages of data + // Test with a limit exactly the size of two granularity segments of data minus 1 byte. { - auto const [result, num_chunks] = chunked_read(filepath, 159'000); + auto const [result, num_chunks] = chunked_read(filepath, 79'999UL); + EXPECT_EQ(num_chunks, 4); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + // Test with a limit exactly the size of two granularity segments of data. + { + auto const [result, num_chunks] = chunked_read(filepath, 80'000UL); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // Test with a limit exactly the size of two pages of data minus one byte + // Test with a limit slightly more the size two granularity segments of data. { - auto const [result, num_chunks] = chunked_read(filepath, 159'999); + auto const [result, num_chunks] = chunked_read(filepath, 81'000); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // Test with a limit exactly the size of two pages of data + // Test with a limit exactly the size of the input minus 1 byte. { - auto const [result, num_chunks] = chunked_read(filepath, 160'000); - EXPECT_EQ(num_chunks, 1); + auto const [result, num_chunks] = chunked_read(filepath, 159'999UL); + EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // Test with a limit slightly more the size two pages of data + // Test with a limit exactly the size of the input. 
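  // (The test table is 40'000 int32 rows, i.e. 160'000 bytes in total.)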
{ - auto const [result, num_chunks] = chunked_read(filepath, 161'000); + auto const [result, num_chunks] = chunked_read(filepath, 160'000UL); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } From afb4ffaef7fc2a384c8479291aa9601018b3261c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 29 Feb 2024 22:00:19 -0800 Subject: [PATCH 130/321] Add test with strings Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cpp | 61 ++++++++++++++---------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cpp b/cpp/tests/io/orc_chunked_reader_test.cpp index 237ffb43b8b..b4549e1433b 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cpp +++ b/cpp/tests/io/orc_chunked_reader_test.cpp @@ -132,6 +132,13 @@ auto chunked_read(std::string const& filepath, return std::pair(cudf::concatenate(out_tviews), num_chunks); } +auto chunked_read(std::string const& filepath, + std::size_t output_limit, + cudf::size_type output_row_granularity) +{ + return chunked_read(filepath, output_limit, 0UL, output_row_granularity); +} + } // namespace struct OrcChunkedReaderTest : public cudf::test::BaseFixture {}; @@ -224,14 +231,14 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases) // Test with a very small limit: 1 byte, and small value of `output_row_granularity`. { - auto const [result, num_chunks] = chunked_read(filepath, 1UL, 0UL, 1'000); + auto const [result, num_chunks] = chunked_read(filepath, 1UL, 1'000); EXPECT_EQ(num_chunks, 40); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Test with a very small limit: 1 byte, and large value of `output_row_granularity`. { - auto const [result, num_chunks] = chunked_read(filepath, 1UL, 0UL, 30'000); + auto const [result, num_chunks] = chunked_read(filepath, 1UL, 30'000); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } @@ -308,22 +315,22 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases) } } -#if 0 TEST_F(OrcChunkedReaderTest, TestChunkedReadWithString) { - auto constexpr num_rows = 60'000; + auto constexpr num_rows = 60'000; + auto constexpr output_row_granularity = 20'000; auto const generate_input = [num_rows](bool nullable) { std::vector> input_columns; auto const value_iter = thrust::make_counting_iterator(0); - // ints Page total bytes cumulative bytes - // 20000 rows of 4 bytes each = A0 80000 80000 - // 20000 rows of 4 bytes each = A1 80000 160000 - // 20000 rows of 4 bytes each = A2 80000 240000 + // ints Granularity Segment total bytes cumulative bytes + // 20000 rows of 4 bytes each = A0 80000 80000 + // 20000 rows of 4 bytes each = A1 80000 160000 + // 20000 rows of 4 bytes each = A2 80000 240000 input_columns.emplace_back(int32s_col(value_iter, value_iter + num_rows).release()); - // strings Page total bytes cumulative bytes + // strings Granularity Segment total bytes cumulative bytes // 20000 rows of 1 char each (20000 + 80004) = B0 100004 100004 // 20000 rows of 4 chars each (80000 + 80004) = B1 160004 260008 // 20000 rows of 16 chars each (320000 + 80004) = B2 400004 660012 @@ -342,42 +349,38 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithString) // skip_rows / num_rows // byte_limit==500000 should give 2 chunks: {0, 40000}, {40000, 20000} // byte_limit==1000000 should give 1 chunks: {0, 60000}, - return write_file(input_columns, - "chunked_read_with_strings", - nullable, - 512 * 1024, // 512KB per page - 20000 // 20k rows per page - ); + return write_file(input_columns, "chunked_read_with_strings", 
nullable); }; auto const [expected_no_null, filepath_no_null] = generate_input(false); auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true); - // Test with zero limit: everything will be read in one chunk + // Test with zero limit: everything will be read in one chunk. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 0); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 0UL); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 0); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 0UL); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } - // Test with a very small limit: 1 byte + // Test with a very small limit: 1 byte. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1UL, output_row_granularity); EXPECT_EQ(num_chunks, 3); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1); + auto const [result, num_chunks] = + chunked_read(filepath_with_nulls, 1UL, output_row_granularity); EXPECT_EQ(num_chunks, 3); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } - // Test with a very large limit + // Test with a very large limit. { auto const [result, num_chunks] = chunked_read(filepath_no_null, 2L << 40); EXPECT_EQ(num_chunks, 1); @@ -392,28 +395,34 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithString) // Other tests: { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 500'000); + auto const [result, num_chunks] = + chunked_read(filepath_no_null, 500'000UL, output_row_granularity); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 500'000); + auto const [result, num_chunks] = + chunked_read(filepath_with_nulls, 500'000UL, output_row_granularity); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'000'000); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'000'000UL); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'000'000); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'000'000UL); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } + +#if 0 +#endif } +#if 0 TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStringPrecise) { auto constexpr num_rows = 60'000; From 4b1665e16f1931deaa5e56b6e7dc4787b8a0f066 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 29 Feb 2024 22:51:46 -0800 Subject: [PATCH 131/321] Add more tests Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cpp | 341 +++++++---------------- 1 file changed, 108 insertions(+), 233 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cpp b/cpp/tests/io/orc_chunked_reader_test.cpp index b4549e1433b..49571c23f0b 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cpp +++ b/cpp/tests/io/orc_chunked_reader_test.cpp @@ -417,62 +417,12 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithString) EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } - 
-#if 0 -#endif -} - -#if 0 -TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStringPrecise) -{ - auto constexpr num_rows = 60'000; - - auto const generate_input = [num_rows](bool nullable) { - std::vector> input_columns; - - // strings Page total bytes cumulative - // 20000 rows alternating 1-4 chars each (50000 + 80004) A0 130004 130004 - // 20000 rows alternating 1-4 chars each (50000 + 80004) A1 130004 260008 - // ... - auto const strings = std::vector{"a", "bbbb"}; - auto const str_iter = - cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { return strings[i % 2]; }); - input_columns.emplace_back(strings_col(str_iter, str_iter + num_rows).release()); - - // Cumulative sizes: - // A0 : 130004 - // A1 : 260008 - // A2 : 390012 - return write_file(input_columns, - "chunked_read_with_strings_precise", - nullable, - 512 * 1024, // 512KB per page - 20000 // 20k rows per page - ); - }; - - auto const [expected_no_null, filepath_no_null] = generate_input(false); - - // a chunk limit of 1 byte less than 2 pages should force it to produce 3 chunks: - // each 1 page in size - { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 260'007); - EXPECT_EQ(num_chunks, 3); - CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); - } - - // a chunk limit of exactly equal to 2 pages should force it to produce 2 chunks - // pages 0-1 and page 2 - { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 260'008); - EXPECT_EQ(num_chunks, 2); - CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); - } } TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructs) { - auto constexpr num_rows = 100'000; + auto constexpr num_rows = 100'000; + auto constexpr output_row_granularity = 20'000; auto const generate_input = [num_rows](bool nullable) { std::vector> input_columns; @@ -489,49 +439,47 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructs) return structs_col{{child1, child2, child3}}.release(); }()); - return write_file(input_columns, - "chunked_read_with_structs", - nullable, - 512 * 1024, // 512KB per page - 20000 // 20k rows per page - ); + return write_file(input_columns, "chunked_read_with_structs", nullable); }; auto const [expected_no_null, filepath_no_null] = generate_input(false); auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true); - // Test with zero limit: everything will be read in one chunk + // Test with zero limit: everything will be read in one chunk. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 0); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 0UL); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 0); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 0UL); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } - // Test with a very small limit: 1 byte + // Test with a very small limit: 1 byte. 
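  // (100'000 rows at output_row_granularity = 20'000 gives 5 segments, so a
  // 1-byte limit degenerates to one segment per chunk.)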
{ - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1UL, output_row_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1); + auto const [result, num_chunks] = + chunked_read(filepath_with_nulls, 1UL, output_row_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } - // Test with a very large limit + // Test with a very large limit. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 2L << 40); + auto const [result, num_chunks] = + chunked_read(filepath_no_null, 2L << 40, output_row_granularity); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 2L << 40); + auto const [result, num_chunks] = + chunked_read(filepath_with_nulls, 2L << 40, output_row_granularity); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } @@ -539,12 +487,14 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructs) // Other tests: { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 500'000); + auto const [result, num_chunks] = + chunked_read(filepath_no_null, 500'000UL, output_row_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 500'000); + auto const [result, num_chunks] = + chunked_read(filepath_with_nulls, 500'000UL, output_row_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } @@ -552,15 +502,19 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructs) TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsNoNulls) { - auto constexpr num_rows = 100'000; + auto constexpr num_rows = 100'000; + auto constexpr output_row_granularity = 20'000; auto const [expected, filepath] = [num_rows]() { std::vector> input_columns; - // 20000 rows in 1 page consist of: + // 20000 rows in 1 segment consist of: // // 20001 offsets : 80004 bytes // 30000 ints : 120000 bytes // total : 200004 bytes + // + // However, `segmented_row_bit_count` used in chunked reader returns 200000, + // thus we consider as having only 200000 bytes in total. auto const template_lists = int32s_lists_col{ int32s_lists_col{}, int32s_lists_col{0}, int32s_lists_col{1, 2}, int32s_lists_col{3, 4, 5}}; @@ -570,59 +524,54 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsNoNulls) input_columns.emplace_back( std::move(cudf::gather(cudf::table_view{{template_lists}}, gather_map)->release().front())); - return write_file(input_columns, - "chunked_read_with_lists_no_null", - false /*nullable*/, - 512 * 1024, // 512KB per page - 20000 // 20k rows per page - ); + return write_file(input_columns, "chunked_read_with_lists_no_null", false /*nullable*/); }(); - // Test with zero limit: everything will be read in one chunk + // Test with zero limit: everything will be read in one chunk. { - auto const [result, num_chunks] = chunked_read(filepath, 0); + auto const [result, num_chunks] = chunked_read(filepath, 0UL); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // Test with a very small limit: 1 byte + // Test with a very small limit: 1 byte. 
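// A small worked check of the byte accounting stated above, assuming 4-byte
// offsets and values: each 20'000-row segment carries 20'001 * 4 = 80'004
// offset bytes plus 30'000 * 4 = 120'000 value bytes, and the 1-byte output
// limit below again yields one 20'000-row segment per chunk over 5 segments.
static_assert(20'001 * 4 == 80'004 && 30'000 * 4 == 120'000);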
{ - auto const [result, num_chunks] = chunked_read(filepath, 1); + auto const [result, num_chunks] = chunked_read(filepath, 1UL, output_row_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // Test with a very large limit + // Test with a very large limit. { - auto const [result, num_chunks] = chunked_read(filepath, 2L << 40); + auto const [result, num_chunks] = chunked_read(filepath, 2L << 40UL, output_row_granularity); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // chunk size slightly less than 1 page (forcing it to be at least 1 page per read) + // Chunk size slightly less than 1 row segment (forcing it to be at least 1 segment per read). { - auto const [result, num_chunks] = chunked_read(filepath, 200'000); + auto const [result, num_chunks] = chunked_read(filepath, 199'999UL, output_row_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // chunk size exactly 1 page + // Chunk size exactly 1 row segment. { - auto const [result, num_chunks] = chunked_read(filepath, 200'004); + auto const [result, num_chunks] = chunked_read(filepath, 200'000UL, output_row_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // chunk size 2 pages. 3 chunks (2 pages + 2 pages + 1 page) + // Chunk size == size of 2 segments. Totally have 3 chunks. { - auto const [result, num_chunks] = chunked_read(filepath, 400'008); + auto const [result, num_chunks] = chunked_read(filepath, 400'000UL, output_row_granularity); EXPECT_EQ(num_chunks, 3); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // chunk size 2 pages minus one byte: each chunk will be just one page + // Chunk size == size of 2 segment minus one byte: each chunk will be just one segment. { - auto const [result, num_chunks] = chunked_read(filepath, 400'007); + auto const [result, num_chunks] = chunked_read(filepath, 399'999UL, output_row_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } @@ -630,7 +579,8 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsNoNulls) TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsHavingNulls) { - auto constexpr num_rows = 100'000; + auto constexpr num_rows = 100'000; + auto constexpr output_row_granularity = 20'000; auto const [expected, filepath] = [num_rows]() { std::vector> input_columns; @@ -640,6 +590,9 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsHavingNulls) // 20001 offsets : 80004 bytes // 15000 ints : 60000 bytes // total : 142504 bytes + // + // However, `segmented_row_bit_count` used in chunked reader returns 142500, + // thus we consider as having only 142500 bytes in total. auto const template_lists = int32s_lists_col{// these will all be null int32s_lists_col{}, @@ -652,59 +605,54 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsHavingNulls) input_columns.emplace_back( std::move(cudf::gather(cudf::table_view{{template_lists}}, gather_map)->release().front())); - return write_file(input_columns, - "chunked_read_with_lists_nulls", - true /*nullable*/, - 512 * 1024, // 512KB per page - 20000 // 20k rows per page - ); + return write_file(input_columns, "chunked_read_with_lists_nulls", true /*nullable*/); }(); - // Test with zero limit: everything will be read in one chunk + // Test with zero limit: everything will be read in one chunk. 
{ - auto const [result, num_chunks] = chunked_read(filepath, 0); + auto const [result, num_chunks] = chunked_read(filepath, 0UL); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // Test with a very small limit: 1 byte + // Test with a very small limit: 1 byte. { - auto const [result, num_chunks] = chunked_read(filepath, 1); + auto const [result, num_chunks] = chunked_read(filepath, 1UL, output_row_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // Test with a very large limit + // Test with a very large limit. { - auto const [result, num_chunks] = chunked_read(filepath, 2L << 40); + auto const [result, num_chunks] = chunked_read(filepath, 2L << 40, output_row_granularity); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // chunk size slightly less than 1 page (forcing it to be at least 1 page per read) + // Chunk size slightly less than 1 row segment (forcing it to be at least 1 segment per read). { - auto const [result, num_chunks] = chunked_read(filepath, 142'500); + auto const [result, num_chunks] = chunked_read(filepath, 142'499UL, output_row_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // chunk size exactly 1 page + // Chunk size exactly 1 row segment. { - auto const [result, num_chunks] = chunked_read(filepath, 142'504); + auto const [result, num_chunks] = chunked_read(filepath, 142'500UL, output_row_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // chunk size 2 pages. 3 chunks (2 pages + 2 pages + 1 page) + // Chunk size == size of 2 segments. Totally have 3 chunks. { - auto const [result, num_chunks] = chunked_read(filepath, 285'008); + auto const [result, num_chunks] = chunked_read(filepath, 285'000UL, output_row_granularity); EXPECT_EQ(num_chunks, 3); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } - // chunk size 2 pages minus 1 byte: each chunk will be just one page + // Chunk size == size of 2 segment minus one byte: each chunk will be just one segment. { - auto const [result, num_chunks] = chunked_read(filepath, 285'007); + auto const [result, num_chunks] = chunked_read(filepath, 284'999UL, output_row_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } @@ -714,6 +662,8 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructsOfLists) { auto constexpr num_rows = 100'000; + // Size of each segment (10k row by default) is from 537k to 560k bytes (no nulls) + // and from 456k to 473k (with nulls). auto const generate_input = [num_rows](bool nullable) { std::vector> input_columns; auto const int_iter = thrust::make_counting_iterator(0); @@ -740,42 +690,37 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructsOfLists) return structs_col(std::move(child_columns)).release(); }()); - return write_file(input_columns, - "chunked_read_with_structs_of_lists", - nullable, - 512 * 1024, // 512KB per page - 20000 // 20k rows per page - ); + return write_file(input_columns, "chunked_read_with_structs_of_lists", nullable); }; auto const [expected_no_null, filepath_no_null] = generate_input(false); auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true); - // Test with zero limit: everything will be read in one chunk + // Test with zero limit: everything will be read in one chunk. 
{ - auto const [result, num_chunks] = chunked_read(filepath_no_null, 0); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 0UL); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 0); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 0UL); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } - // Test with a very small limit: 1 byte + // Test with a very small limit: 1 byte. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1UL); EXPECT_EQ(num_chunks, 10); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1); - EXPECT_EQ(num_chunks, 5); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1UL); + EXPECT_EQ(num_chunks, 10); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } - // Test with a very large limit + // Test with a very large limit. { auto const [result, num_chunks] = chunked_read(filepath_no_null, 2L << 40); EXPECT_EQ(num_chunks, 1); @@ -789,55 +734,50 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructsOfLists) // Other tests: - // for these tests, different columns get written to different numbers of pages so it's a - // little tricky to describe the expected results by page counts. To get an idea of how - // these values are chosen, see the debug output from the call to print_cumulative_row_info() in - // reader_impl_preprocess.cu -> find_splits() - { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'000'000); - EXPECT_EQ(num_chunks, 7); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'000'000UL); + EXPECT_EQ(num_chunks, 10); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'500'000); - EXPECT_EQ(num_chunks, 4); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'500'000UL); + EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 2'000'000); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 2'000'000UL); EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 5'000'000); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 5'000'000UL); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'000'000); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'000'000UL); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'500'000); - EXPECT_EQ(num_chunks, 5); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'500'000UL); + EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 2'000'000); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 2'000'000UL); EXPECT_EQ(num_chunks, 3); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } { - auto const 
[result, num_chunks] = chunked_read(filepath_with_nulls, 5'000'000); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 5'000'000UL); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } @@ -847,6 +787,8 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsOfStructs) { auto constexpr num_rows = 100'000; + // Size of each segment (10k row by default) is from 450k to 530k bytes (no nulls) + // and from 330k to 380k (with nulls). auto const generate_input = [num_rows](bool nullable) { std::vector> input_columns; auto const int_iter = thrust::make_counting_iterator(0); @@ -880,42 +822,37 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsOfStructs) 0, rmm::device_buffer{})); - return write_file(input_columns, - "chunked_read_with_lists_of_structs", - nullable, - 512 * 1024, // 512KB per page - 20000 // 20k rows per page - ); + return write_file(input_columns, "chunked_read_with_lists_of_structs", nullable); }; auto const [expected_no_null, filepath_no_null] = generate_input(false); auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true); - // Test with zero limit: everything will be read in one chunk + // Test with zero limit: everything will be read in one chunk. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 0); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 0UL); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 0); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 0UL); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } - // Test with a very small limit: 1 byte + // Test with a very small limit: 1 byte. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1UL); EXPECT_EQ(num_chunks, 10); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1); - EXPECT_EQ(num_chunks, 5); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1UL); + EXPECT_EQ(num_chunks, 10); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } - // Test with a very large limit + // Test with a very large limit. { auto const [result, num_chunks] = chunked_read(filepath_no_null, 2L << 40); EXPECT_EQ(num_chunks, 1); @@ -927,59 +864,58 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsOfStructs) CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } - // for these tests, different columns get written to different numbers of pages so it's a - // little tricky to describe the expected results by page counts. To get an idea of how - // these values are chosen, see the debug output from the call to print_cumulative_row_info() in - // reader_impl_preprocess.cu -> find_splits() + // Other tests. 
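// The counts below follow from the per-segment sizes noted above (roughly
// 450k to 530k bytes per 10k-row segment without nulls): a 1'000'000-byte
// output limit mostly fits two adjacent segments per chunk, giving 7 chunks
// for the ten segments, while a 5'000'000-byte limit takes all of them in a
// single chunk.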
+ { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'000'000); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'000'000UL); EXPECT_EQ(num_chunks, 7); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'500'000); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'500'000UL); EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 2'000'000); - EXPECT_EQ(num_chunks, 4); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 2'000'000UL); + EXPECT_EQ(num_chunks, 3); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 5'000'000); - EXPECT_EQ(num_chunks, 2); + auto const [result, num_chunks] = chunked_read(filepath_no_null, 5'000'000UL); + EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'000'000); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'000'000UL); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'500'000); - EXPECT_EQ(num_chunks, 4); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'500'000UL); + EXPECT_EQ(num_chunks, 3); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 2'000'000); - EXPECT_EQ(num_chunks, 3); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 2'000'000UL); + EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 5'000'000); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 5'000'000UL); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } } +#if 0 TEST_F(OrcChunkedReaderTest, TestChunkedReadNullCount) { auto constexpr num_rows = 100'000; @@ -1011,65 +947,4 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadNullCount) } while (reader.has_next()); } -TEST_F(OrcChunkedReaderTest, InputLimitSimple) -{ - auto const filepath = temp_env->get_temp_filepath("input_limit_10_rowgroups.parquet"); - - // This results in 10 grow groups, at 4001150 bytes per row group - constexpr int num_rows = 25'000'000; - auto value_iter = cudf::detail::make_counting_transform_iterator(0, [](int i) { return i; }); - cudf::test::fixed_width_column_wrapper expected(value_iter, value_iter + num_rows); - cudf::io::parquet_writer_options opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, - cudf::table_view{{expected}}) - // note: it is unnecessary to force compression to NONE here because the size we are using in - // the row group is the uncompressed data size. But forcing the dictionary policy to - // dictionary_policy::NEVER is necessary to prevent changes in the - // decompressed-but-not-yet-decoded data. 
- .dictionary_policy(cudf::io::dictionary_policy::NEVER); - - cudf::io::write_parquet(opts); - - { - // no chunking - auto const [result, num_chunks] = chunked_read(filepath, 0, 0); - EXPECT_EQ(num_chunks, 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); - } - - { - // 25 chunks of 100k rows each - auto const [result, num_chunks] = chunked_read(filepath, 0, 1); - EXPECT_EQ(num_chunks, 25); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); - } - - { - // 25 chunks of 100k rows each - auto const [result, num_chunks] = chunked_read(filepath, 0, 4000000); - EXPECT_EQ(num_chunks, 25); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); - } - - { - // 25 chunks of 100k rows each - auto const [result, num_chunks] = chunked_read(filepath, 0, 4100000); - EXPECT_EQ(num_chunks, 25); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); - } - - { - // 12 chunks of 200k rows each, plus 1 final chunk of 100k rows. - auto const [result, num_chunks] = chunked_read(filepath, 0, 8002301); - EXPECT_EQ(num_chunks, 13); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); - } - - { - // 1 big chunk - auto const [result, num_chunks] = chunked_read(filepath, 0, size_t{1} * 1024 * 1024 * 1024); - EXPECT_EQ(num_chunks, 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); - } -} #endif From e08984f6dd11b01441480203a6358484a5251c00 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 1 Mar 2024 10:56:33 -0800 Subject: [PATCH 132/321] Add more test --- cpp/tests/io/orc_chunked_reader_test.cpp | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cpp b/cpp/tests/io/orc_chunked_reader_test.cpp index 49571c23f0b..a3884745da4 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cpp +++ b/cpp/tests/io/orc_chunked_reader_test.cpp @@ -915,7 +915,6 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsOfStructs) } } -#if 0 TEST_F(OrcChunkedReaderTest, TestChunkedReadNullCount) { auto constexpr num_rows = 100'000; @@ -928,23 +927,22 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadNullCount) cols.push_back(col.release()); auto const expected = std::make_unique(std::move(cols)); - auto const filepath = temp_env->get_temp_filepath("chunked_reader_null_count.parquet"); - auto const page_limit_rows = num_rows / 5; + auto const filepath = temp_env->get_temp_filepath("chunked_reader_null_count.orc"); + auto const stripe_limit_rows = num_rows / 5; auto const write_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected) - .max_page_size_rows(page_limit_rows) // 20k rows per page + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, *expected) + .stripe_size_rows(stripe_limit_rows) .build(); - cudf::io::write_parquet(write_opts); + cudf::io::write_orc(write_opts); - auto const byte_limit = page_limit_rows * sizeof(int); + auto const byte_limit = stripe_limit_rows * sizeof(int); auto const read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).build(); - auto reader = cudf::io::chunked_parquet_reader(byte_limit, read_opts); + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build(); + auto reader = + cudf::io::chunked_orc_reader(byte_limit, 0UL /*read_limit*/, stripe_limit_rows, read_opts); do { - // Every fourth row is null - EXPECT_EQ(reader.read_chunk().tbl->get_column(0).null_count(), page_limit_rows / 4); + // Every fourth row is null. 
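// With num_rows / 5 = 20'000 rows per stripe-sized chunk, one null in every
// four rows means 20'000 / 4 = 5'000 nulls per chunk returned below.
static_assert(100'000 / 5 / 4 == 5'000);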
+ EXPECT_EQ(reader.read_chunk().tbl->get_column(0).null_count(), stripe_limit_rows / 4UL); } while (reader.has_next()); } - -#endif From d555b5466ac525a9ebad3e488593465e0a7af67f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 1 Mar 2024 13:03:26 -0800 Subject: [PATCH 133/321] Implement test limit function Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cpp | 56 ++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/cpp/tests/io/orc_chunked_reader_test.cpp b/cpp/tests/io/orc_chunked_reader_test.cpp index a3884745da4..3f52668aee2 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cpp +++ b/cpp/tests/io/orc_chunked_reader_test.cpp @@ -946,3 +946,59 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadNullCount) EXPECT_EQ(reader.read_chunk().tbl->get_column(0).null_count(), stripe_limit_rows / 4UL); } while (reader.has_next()); } + +namespace { + +constexpr size_t input_limit_expected_file_count = 3; + +std::vector input_limit_get_test_names(std::string const& base_filename) +{ + return {base_filename + "_a.orc", base_filename + "_b.orc", base_filename + "_c.orc"}; +} + +void input_limit_test_write_one(std::string const& filepath, + cudf::table_view const& input, + cudf::io::compression_type compression) +{ + auto const out_opts = cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, input) + .compression(compression) + .build(); + cudf::io::write_orc(out_opts); +} + +void input_limit_test_write(std::vector const& test_filenames, + cudf::table_view const& input) +{ + CUDF_EXPECTS(test_filenames.size() == input_limit_expected_file_count, + "Unexpected count of test filenames."); + + // No compression + input_limit_test_write_one(test_filenames[0], input, cudf::io::compression_type::NONE); + + // Compression with a codec that uses a lot of scratch space at decode time (2.5x the total + // decompressed buffer size). + input_limit_test_write_one(test_filenames[1], input, cudf::io::compression_type::ZSTD); + + // Compression with a codec that uses no scratch space at decode time. 
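// (Snappy decompresses block-by-block straight into the destination buffer,
// so, unlike the ZSTD case above, the reader presumably only has to budget
// for the compressed input plus the decompressed output, with no extra
// decoder scratch.)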
+ input_limit_test_write_one(test_filenames[2], input, cudf::io::compression_type::SNAPPY); +} + +void input_limit_test_read(std::vector const& test_filenames, + cudf::table_view const& input, + size_t output_limit, + size_t input_limit, + int const* expected_chunk_counts) +{ + CUDF_EXPECTS(test_filenames.size() == input_limit_expected_file_count, + "Unexpected count of test filenames."); + + for (size_t idx = 0; idx < test_filenames.size(); idx++) { + auto const result = chunked_read(test_filenames[idx], output_limit, input_limit); + EXPECT_EQ(expected_chunk_counts[idx], result.second) + << "Unexpected number of chunks produced in chunk read."; + // TODO: equal + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, input); + } +} + +} // namespace From cfb8345d70a475708555b5a3c1808d81f93c2ba3 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 1 Mar 2024 14:30:01 -0800 Subject: [PATCH 134/321] Implement `load_limit_ratio` Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 24 +++++++++++++----------- cpp/src/io/orc/reader_impl_chunking.hpp | 3 ++- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 19aa6eac48a..5aa499fd9e9 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -487,6 +487,7 @@ void reader::impl::global_preprocess(uint64_t skip_rows, // Logically view streams as columns _file_itm_data.lvl_stream_info.resize(_selected_columns.num_levels()); + // TODO: handle large number of stripes. // Get the total number of stripes across all input files. auto const num_stripes = selected_stripes.size(); @@ -624,14 +625,13 @@ void reader::impl::global_preprocess(uint64_t skip_rows, printf("size: %ld, %zu\n", size.count, size.size_bytes); } - // DEBUG only - // TODO: use 0.3 constant - // _chunk_read_data.data_read_limit = total_stripe_sizes.back().size_bytes / 3; - // TODO: handle case for extremely large files. - - _chunk_read_data.load_stripe_chunks = - find_splits(total_stripe_sizes, num_stripes, _chunk_read_data.data_read_limit); + auto const load_limit = [&] { + auto const tmp = static_cast(_chunk_read_data.data_read_limit * + chunk_read_data::load_limit_ratio); + return tmp > 0UL ? tmp : 1UL; + }(); + _chunk_read_data.load_stripe_chunks = find_splits(total_stripe_sizes, num_stripes, load_limit); #ifndef PRINT_DEBUG auto& splits = _chunk_read_data.load_stripe_chunks; @@ -844,11 +844,13 @@ void reader::impl::load_data() stripe_decomp_sizes.device_to_host_sync(_stream); - // DEBUG only - // _chunk_read_data.data_read_limit = stripe_decomp_sizes.back().size_bytes / 3; - + auto const decode_limit = [&] { + auto const tmp = static_cast(_chunk_read_data.data_read_limit * + (1.0 - chunk_read_data::load_limit_ratio)); + return tmp > 0UL ? 
tmp : 1UL; + }(); _chunk_read_data.decode_stripe_chunks = - find_splits(stripe_decomp_sizes, stripe_chunk.count, _chunk_read_data.data_read_limit); + find_splits(stripe_decomp_sizes, stripe_chunk.count, decode_limit); for (auto& chunk : _chunk_read_data.decode_stripe_chunks) { chunk.start_idx += stripe_chunk.start_idx; } diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index cc37ac585a3..a721226b78b 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -211,7 +211,8 @@ struct chunk_read_data { std::size_t output_size_limit; // maximum size (in bytes) of an output chunk, or 0 for no limit std::size_t data_read_limit; // approximate maximum size (in bytes) used for store // intermediate data, or 0 for no limit - size_type output_row_granularity; // TODO + size_type output_row_granularity; // TODO + static double constexpr load_limit_ratio{0.3}; // TODO // Chunks of stripes that can be load into memory such that their data size is within a size // limit. From e0721246c987efef211839c9c82924ab825acfee Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 1 Mar 2024 14:50:17 -0800 Subject: [PATCH 135/321] Add new test Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cpp | 60 +++++++++++++++++++----- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cpp b/cpp/tests/io/orc_chunked_reader_test.cpp index 3f52668aee2..3174496f313 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cpp +++ b/cpp/tests/io/orc_chunked_reader_test.cpp @@ -949,7 +949,7 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadNullCount) namespace { -constexpr size_t input_limit_expected_file_count = 3; +std::size_t constexpr input_limit_expected_file_count = 3; std::vector input_limit_get_test_names(std::string const& base_filename) { @@ -962,43 +962,77 @@ void input_limit_test_write_one(std::string const& filepath, { auto const out_opts = cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, input) .compression(compression) + .stripe_size_rows(10'000) // intentionally write small stripes .build(); cudf::io::write_orc(out_opts); } -void input_limit_test_write(std::vector const& test_filenames, +void input_limit_test_write(std::vector const& test_files, cudf::table_view const& input) { - CUDF_EXPECTS(test_filenames.size() == input_limit_expected_file_count, + CUDF_EXPECTS(test_files.size() == input_limit_expected_file_count, "Unexpected count of test filenames."); // No compression - input_limit_test_write_one(test_filenames[0], input, cudf::io::compression_type::NONE); + input_limit_test_write_one(test_files[0], input, cudf::io::compression_type::NONE); // Compression with a codec that uses a lot of scratch space at decode time (2.5x the total // decompressed buffer size). - input_limit_test_write_one(test_filenames[1], input, cudf::io::compression_type::ZSTD); + input_limit_test_write_one(test_files[1], input, cudf::io::compression_type::ZSTD); // Compression with a codec that uses no scratch space at decode time. 
- input_limit_test_write_one(test_filenames[2], input, cudf::io::compression_type::SNAPPY); + input_limit_test_write_one(test_files[2], input, cudf::io::compression_type::SNAPPY); } -void input_limit_test_read(std::vector const& test_filenames, +void input_limit_test_read(int test_location, + std::vector const& test_files, cudf::table_view const& input, size_t output_limit, size_t input_limit, int const* expected_chunk_counts) { - CUDF_EXPECTS(test_filenames.size() == input_limit_expected_file_count, + CUDF_EXPECTS(test_files.size() == input_limit_expected_file_count, "Unexpected count of test filenames."); - for (size_t idx = 0; idx < test_filenames.size(); idx++) { - auto const result = chunked_read(test_filenames[idx], output_limit, input_limit); - EXPECT_EQ(expected_chunk_counts[idx], result.second) - << "Unexpected number of chunks produced in chunk read."; + for (size_t idx = 0; idx < test_files.size(); idx++) { + SCOPED_TRACE("Original line of failure: " + std::to_string(test_location) + + ", file idx: " + std::to_string(idx)); + auto const [result, num_chunks] = chunked_read(test_files[idx], output_limit, input_limit); + EXPECT_EQ(expected_chunk_counts[idx], num_chunks); // TODO: equal - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, input); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, input); } } } // namespace + +struct OrcChunkedReaderInputLimitTest : public cudf::test::BaseFixture {}; + +TEST_F(OrcChunkedReaderInputLimitTest, SingleFixedWidthColumn) +{ + auto constexpr num_rows = 1'000'000; + auto const iter1 = thrust::make_constant_iterator(15); + auto const col1 = cudf::test::fixed_width_column_wrapper(iter1, iter1 + num_rows); + auto const input = cudf::table_view{{col1}}; + + auto const filename = std::string{"single_col_fixed_width"}; + auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename)); + input_limit_test_write(test_files, input); + + // Some small limit. + { + int constexpr expected[] = {100, 100, 100}; + input_limit_test_read(__LINE__, test_files, input, 0UL, 1UL, expected); + } + + if (0) { + int constexpr expected[] = {15, 20, 9}; + input_limit_test_read(__LINE__, test_files, input, 0UL, 2 * 1024 * 1024UL, expected); + } + + // Limit of 1 byte. + if (0) { + int constexpr expected[] = {1, 50, 50}; + input_limit_test_read(__LINE__, test_files, input, 0UL, 1UL, expected); + } +} From 37aaeebfcc8078ca2229a6c88593a24e846200c7 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 1 Mar 2024 15:54:49 -0800 Subject: [PATCH 136/321] Add strong type for limits Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cpp | 226 +++++++++++++---------- 1 file changed, 125 insertions(+), 101 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cpp b/cpp/tests/io/orc_chunked_reader_test.cpp index 3174496f313..8471d2e362f 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cpp +++ b/cpp/tests/io/orc_chunked_reader_test.cpp @@ -46,6 +46,10 @@ #include namespace { +enum class output_limit : std::size_t {}; +enum class input_limit : std::size_t {}; +enum class output_row_granularity : cudf::size_type {}; + // Global environment for temporary files auto const temp_env = reinterpret_cast( ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); @@ -101,14 +105,16 @@ auto write_file(std::vector>& input_columns, // This means if the input file has more than 10k rows then the output chunk will never // have less than 10k rows. 
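// A minimal usage sketch of the three knobs this helper forwards, assuming
// the reader interface exercised in the patches above (per patch 134, the
// input limit is further split by load_limit_ratio{0.3} into a stripe
// loading budget and a decompress/decode budget); the numeric limits here
// are illustrative only:
//
//   auto const opts =
//     cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build();
//   auto reader = cudf::io::chunked_orc_reader(240'000UL,    // output chunk size limit (bytes)
//                                              1'000'000UL,  // intermediate memory limit (bytes)
//                                              10'000,       // output row granularity (rows)
//                                              opts);
//   while (reader.has_next()) { auto const chunk = reader.read_chunk(); }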
auto chunked_read(std::string const& filepath, - std::size_t output_limit, - std::size_t input_limit = 0, - cudf::size_type output_row_granularity = 10'000) + output_limit output_limit_bytes, + input_limit input_limit_bytes = input_limit{0}, + output_row_granularity output_granularity = output_row_granularity{10'000}) { auto const read_opts = cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build(); - auto reader = - cudf::io::chunked_orc_reader(output_limit, input_limit, output_row_granularity, read_opts); + auto reader = cudf::io::chunked_orc_reader(static_cast(output_limit_bytes), + static_cast(input_limit_bytes), + static_cast(output_granularity), + read_opts); auto num_chunks = 0; auto out_tables = std::vector>{}; @@ -133,10 +139,10 @@ auto chunked_read(std::string const& filepath, } auto chunked_read(std::string const& filepath, - std::size_t output_limit, - cudf::size_type output_row_granularity) + output_limit output_limit_bytes, + output_row_granularity output_granularity) { - return chunked_read(filepath, output_limit, 0UL, output_row_granularity); + return chunked_read(filepath, output_limit_bytes, input_limit{0UL}, output_granularity); } } // namespace @@ -150,7 +156,7 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadNoData) input_columns.emplace_back(int64s_col{}.release()); auto const [expected, filepath] = write_file(input_columns, "chunked_read_empty", false); - auto const [result, num_chunks] = chunked_read(filepath, 1'000); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{1'000}); EXPECT_EQ(num_chunks, 1); EXPECT_EQ(result->num_rows(), 0); EXPECT_EQ(result->num_columns(), 2); @@ -176,26 +182,26 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadSimpleData) { auto const [expected, filepath] = generate_input(false, 1'000); - auto const [result, num_chunks] = chunked_read(filepath, 245'000); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{245'000}); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } { auto const [expected, filepath] = generate_input(false, cudf::io::default_stripe_size_rows); - auto const [result, num_chunks] = chunked_read(filepath, 245'000); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{245'000}); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } { auto const [expected, filepath] = generate_input(true, 1'000); - auto const [result, num_chunks] = chunked_read(filepath, 245'000); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{245'000}); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } { auto const [expected, filepath] = generate_input(true, cudf::io::default_stripe_size_rows); - auto const [result, num_chunks] = chunked_read(filepath, 245'000); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{245'000}); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } @@ -216,14 +222,14 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases) // Test with zero limit: everything will be read in one chunk. { - auto const [result, num_chunks] = chunked_read(filepath, 0UL); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{0UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Test with a very small limit: 1 byte. 
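// A quick sanity check on the granularity arithmetic driving the small-limit
// expectations below (each row is 4 bytes, per the segment-size comments
// here): 40'000 rows make 4 default segments of 10'000 rows, 40 segments of
// 1'000 rows, or 2 segments of 30'000 rows (the last one short), so a 1-byte
// output limit yields 4, 40 and 2 chunks respectively.
static_assert(40'000 / 10'000 == 4 && 40'000 / 1'000 == 40 &&
              (40'000 + 30'000 - 1) / 30'000 == 2);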
{ - auto const [result, num_chunks] = chunked_read(filepath, 1UL); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{1UL}); // Number of chunks is 4 because of using default `output_row_granularity = 10k`. EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); @@ -231,27 +237,29 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases) // Test with a very small limit: 1 byte, and small value of `output_row_granularity`. { - auto const [result, num_chunks] = chunked_read(filepath, 1UL, 1'000); + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{1UL}, output_row_granularity{1'000}); EXPECT_EQ(num_chunks, 40); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Test with a very small limit: 1 byte, and large value of `output_row_granularity`. { - auto const [result, num_chunks] = chunked_read(filepath, 1UL, 30'000); + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{1UL}, output_row_granularity{30'000}); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Test with a very large limit { - auto const [result, num_chunks] = chunked_read(filepath, 2L << 40); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{2L << 40}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Test with a limit slightly less than one granularity segment of data // (output_row_granularity = 10k rows = 40'000 bytes). { - auto const [result, num_chunks] = chunked_read(filepath, 39'000UL); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{39'000UL}); EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } @@ -259,7 +267,7 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases) // Test with a limit exactly the size one granularity segment of data // (output_row_granularity = 10k rows = 40'000 bytes). { - auto const [result, num_chunks] = chunked_read(filepath, 40'000UL); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{40'000UL}); EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } @@ -267,49 +275,49 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases) // Test with a limit slightly more than one granularity segment of data // (output_row_granularity = 10k rows = 40'000 bytes). { - auto const [result, num_chunks] = chunked_read(filepath, 41'000UL); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{41'000UL}); EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Test with a limit slightly less than two granularity segments of data { - auto const [result, num_chunks] = chunked_read(filepath, 79'000UL); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{79'000UL}); EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Test with a limit exactly the size of two granularity segments of data minus 1 byte. { - auto const [result, num_chunks] = chunked_read(filepath, 79'999UL); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{79'999UL}); EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Test with a limit exactly the size of two granularity segments of data. 
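// Two default segments are exactly 2 * 10'000 rows * 4 bytes = 80'000 bytes,
// so the 80'000-byte limit below packs two segments per chunk (2 chunks),
// while the 79'999-byte limit above forced one segment per chunk (4 chunks).
static_assert(2 * 10'000 * 4 == 80'000);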
{ - auto const [result, num_chunks] = chunked_read(filepath, 80'000UL); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{80'000UL}); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Test with a limit slightly more the size two granularity segments of data. { - auto const [result, num_chunks] = chunked_read(filepath, 81'000); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{81'000}); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Test with a limit exactly the size of the input minus 1 byte. { - auto const [result, num_chunks] = chunked_read(filepath, 159'999UL); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{159'999UL}); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Test with a limit exactly the size of the input. { - auto const [result, num_chunks] = chunked_read(filepath, 160'000UL); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{160'000UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } @@ -317,8 +325,8 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases) TEST_F(OrcChunkedReaderTest, TestChunkedReadWithString) { - auto constexpr num_rows = 60'000; - auto constexpr output_row_granularity = 20'000; + auto constexpr num_rows = 60'000; + auto constexpr output_granularity = output_row_granularity{20'000}; auto const generate_input = [num_rows](bool nullable) { std::vector> input_columns; @@ -357,37 +365,38 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithString) // Test with zero limit: everything will be read in one chunk. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 0UL); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{0UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 0UL); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{0UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } // Test with a very small limit: 1 byte. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1UL, output_row_granularity); + auto const [result, num_chunks] = + chunked_read(filepath_no_null, output_limit{1UL}, output_granularity); EXPECT_EQ(num_chunks, 3); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { auto const [result, num_chunks] = - chunked_read(filepath_with_nulls, 1UL, output_row_granularity); + chunked_read(filepath_with_nulls, output_limit{1UL}, output_granularity); EXPECT_EQ(num_chunks, 3); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } // Test with a very large limit. 
{ - auto const [result, num_chunks] = chunked_read(filepath_no_null, 2L << 40); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{2L << 40}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 2L << 40); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{2L << 40}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } @@ -396,24 +405,24 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithString) { auto const [result, num_chunks] = - chunked_read(filepath_no_null, 500'000UL, output_row_granularity); + chunked_read(filepath_no_null, output_limit{500'000UL}, output_granularity); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { auto const [result, num_chunks] = - chunked_read(filepath_with_nulls, 500'000UL, output_row_granularity); + chunked_read(filepath_with_nulls, output_limit{500'000UL}, output_granularity); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'000'000UL); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1'000'000UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'000'000UL); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1'000'000UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } @@ -421,8 +430,8 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithString) TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructs) { - auto constexpr num_rows = 100'000; - auto constexpr output_row_granularity = 20'000; + auto constexpr num_rows = 100'000; + auto constexpr output_granularity = output_row_granularity{20'000}; auto const generate_input = [num_rows](bool nullable) { std::vector> input_columns; @@ -447,25 +456,26 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructs) // Test with zero limit: everything will be read in one chunk. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 0UL); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{0UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 0UL); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{0UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } // Test with a very small limit: 1 byte. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1UL, output_row_granularity); + auto const [result, num_chunks] = + chunked_read(filepath_no_null, output_limit{1UL}, output_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { auto const [result, num_chunks] = - chunked_read(filepath_with_nulls, 1UL, output_row_granularity); + chunked_read(filepath_with_nulls, output_limit{1UL}, output_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } @@ -473,13 +483,13 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructs) // Test with a very large limit. 
{ auto const [result, num_chunks] = - chunked_read(filepath_no_null, 2L << 40, output_row_granularity); + chunked_read(filepath_no_null, output_limit{2L << 40}, output_granularity); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { auto const [result, num_chunks] = - chunked_read(filepath_with_nulls, 2L << 40, output_row_granularity); + chunked_read(filepath_with_nulls, output_limit{2L << 40}, output_granularity); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } @@ -488,13 +498,13 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructs) { auto const [result, num_chunks] = - chunked_read(filepath_no_null, 500'000UL, output_row_granularity); + chunked_read(filepath_no_null, output_limit{500'000UL}, output_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { auto const [result, num_chunks] = - chunked_read(filepath_with_nulls, 500'000UL, output_row_granularity); + chunked_read(filepath_with_nulls, output_limit{500'000UL}, output_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } @@ -502,8 +512,8 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructs) TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsNoNulls) { - auto constexpr num_rows = 100'000; - auto constexpr output_row_granularity = 20'000; + auto constexpr num_rows = 100'000; + auto constexpr output_granularity = output_row_granularity{20'000}; auto const [expected, filepath] = [num_rows]() { std::vector> input_columns; @@ -529,49 +539,54 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsNoNulls) // Test with zero limit: everything will be read in one chunk. { - auto const [result, num_chunks] = chunked_read(filepath, 0UL); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{0UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Test with a very small limit: 1 byte. { - auto const [result, num_chunks] = chunked_read(filepath, 1UL, output_row_granularity); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{1UL}, output_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Test with a very large limit. { - auto const [result, num_chunks] = chunked_read(filepath, 2L << 40UL, output_row_granularity); + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{2L << 40UL}, output_granularity); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Chunk size slightly less than 1 row segment (forcing it to be at least 1 segment per read). { - auto const [result, num_chunks] = chunked_read(filepath, 199'999UL, output_row_granularity); + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{199'999UL}, output_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Chunk size exactly 1 row segment. { - auto const [result, num_chunks] = chunked_read(filepath, 200'000UL, output_row_granularity); + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{200'000UL}, output_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Chunk size == size of 2 segments. Totally have 3 chunks. 
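// That is 2 * 200'000 = 400'000 bytes, which packs the five 20'000-row
// segments as 2 + 2 + 1 and hence yields the 3 chunks expected below.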
{ - auto const [result, num_chunks] = chunked_read(filepath, 400'000UL, output_row_granularity); + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{400'000UL}, output_granularity); EXPECT_EQ(num_chunks, 3); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Chunk size == size of 2 segment minus one byte: each chunk will be just one segment. { - auto const [result, num_chunks] = chunked_read(filepath, 399'999UL, output_row_granularity); + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{399'999UL}, output_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } @@ -579,8 +594,8 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsNoNulls) TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsHavingNulls) { - auto constexpr num_rows = 100'000; - auto constexpr output_row_granularity = 20'000; + auto constexpr num_rows = 100'000; + auto constexpr output_granularity = output_row_granularity{20'000}; auto const [expected, filepath] = [num_rows]() { std::vector> input_columns; @@ -610,49 +625,54 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsHavingNulls) // Test with zero limit: everything will be read in one chunk. { - auto const [result, num_chunks] = chunked_read(filepath, 0UL); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{0UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Test with a very small limit: 1 byte. { - auto const [result, num_chunks] = chunked_read(filepath, 1UL, output_row_granularity); + auto const [result, num_chunks] = chunked_read(filepath, output_limit{1UL}, output_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Test with a very large limit. { - auto const [result, num_chunks] = chunked_read(filepath, 2L << 40, output_row_granularity); + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{2L << 40}, output_granularity); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Chunk size slightly less than 1 row segment (forcing it to be at least 1 segment per read). { - auto const [result, num_chunks] = chunked_read(filepath, 142'499UL, output_row_granularity); + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{142'499UL}, output_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Chunk size exactly 1 row segment. { - auto const [result, num_chunks] = chunked_read(filepath, 142'500UL, output_row_granularity); + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{142'500UL}, output_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Chunk size == size of 2 segments. Totally have 3 chunks. { - auto const [result, num_chunks] = chunked_read(filepath, 285'000UL, output_row_granularity); + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{285'000UL}, output_granularity); EXPECT_EQ(num_chunks, 3); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } // Chunk size == size of 2 segment minus one byte: each chunk will be just one segment. 
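// That is 2 * 142'500 - 1 = 284'999 bytes: one byte short of two segments,
// so every chunk holds a single segment and 5 chunks come back.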
{ - auto const [result, num_chunks] = chunked_read(filepath, 284'999UL, output_row_granularity); + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{284'999UL}, output_granularity); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } @@ -698,36 +718,36 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructsOfLists) // Test with zero limit: everything will be read in one chunk. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 0UL); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{0UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 0UL); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{0UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } // Test with a very small limit: 1 byte. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1UL); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1UL}); EXPECT_EQ(num_chunks, 10); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1UL); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1UL}); EXPECT_EQ(num_chunks, 10); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } // Test with a very large limit. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 2L << 40); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{2L << 40}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 2L << 40); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{2L << 40}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } @@ -735,49 +755,49 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructsOfLists) // Other tests: { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'000'000UL); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1'000'000UL}); EXPECT_EQ(num_chunks, 10); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'500'000UL); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1'500'000UL}); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 2'000'000UL); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{2'000'000UL}); EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 5'000'000UL); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{5'000'000UL}); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'000'000UL); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1'000'000UL}); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } { - auto const [result, num_chunks] = 
chunked_read(filepath_with_nulls, 1'500'000UL); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1'500'000UL}); EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 2'000'000UL); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{2'000'000UL}); EXPECT_EQ(num_chunks, 3); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 5'000'000UL); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{5'000'000UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } @@ -830,36 +850,36 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsOfStructs) // Test with zero limit: everything will be read in one chunk. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 0UL); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{0UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 0UL); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{0UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } // Test with a very small limit: 1 byte. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1UL); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1UL}); EXPECT_EQ(num_chunks, 10); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1UL); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1UL}); EXPECT_EQ(num_chunks, 10); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } // Test with a very large limit. { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 2L << 40); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{2L << 40}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 2L << 40); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{2L << 40}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } @@ -867,49 +887,49 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsOfStructs) // Other tests. 
{ - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'000'000UL); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1'000'000UL}); EXPECT_EQ(num_chunks, 7); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'500'000UL); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1'500'000UL}); EXPECT_EQ(num_chunks, 4); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 2'000'000UL); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{2'000'000UL}); EXPECT_EQ(num_chunks, 3); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_no_null, 5'000'000UL); + auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{5'000'000UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'000'000UL); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1'000'000UL}); EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'500'000UL); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1'500'000UL}); EXPECT_EQ(num_chunks, 3); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 2'000'000UL); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{2'000'000UL}); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } { - auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 5'000'000UL); + auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{5'000'000UL}); EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } @@ -987,8 +1007,8 @@ void input_limit_test_write(std::vector const& test_files, void input_limit_test_read(int test_location, std::vector const& test_files, cudf::table_view const& input, - size_t output_limit, - size_t input_limit, + output_limit output_limit_bytes, + input_limit input_limit_bytes, int const* expected_chunk_counts) { CUDF_EXPECTS(test_files.size() == input_limit_expected_file_count, @@ -997,7 +1017,8 @@ void input_limit_test_read(int test_location, for (size_t idx = 0; idx < test_files.size(); idx++) { SCOPED_TRACE("Original line of failure: " + std::to_string(test_location) + ", file idx: " + std::to_string(idx)); - auto const [result, num_chunks] = chunked_read(test_files[idx], output_limit, input_limit); + auto const [result, num_chunks] = + chunked_read(test_files[idx], output_limit_bytes, input_limit_bytes); EXPECT_EQ(expected_chunk_counts[idx], num_chunks); // TODO: equal CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, input); @@ -1022,17 +1043,20 @@ TEST_F(OrcChunkedReaderInputLimitTest, SingleFixedWidthColumn) // Some small limit. 
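// (input_limit_test_write() above wrote the 1'000'000 input rows as 10'000-row
// stripes, i.e. 100 stripes; a 1-byte input limit degenerates to one stripe per
// loaded chunk, hence 100 chunks for each of the three compression codecs.)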
{ int constexpr expected[] = {100, 100, 100}; - input_limit_test_read(__LINE__, test_files, input, 0UL, 1UL, expected); + input_limit_test_read( + __LINE__, test_files, input, output_limit{0UL}, input_limit{1UL}, expected); } if (0) { int constexpr expected[] = {15, 20, 9}; - input_limit_test_read(__LINE__, test_files, input, 0UL, 2 * 1024 * 1024UL, expected); + input_limit_test_read( + __LINE__, test_files, input, output_limit{0UL}, input_limit{2 * 1024 * 1024UL}, expected); } // Limit of 1 byte. if (0) { int constexpr expected[] = {1, 50, 50}; - input_limit_test_read(__LINE__, test_files, input, 0UL, 1UL, expected); + input_limit_test_read( + __LINE__, test_files, input, output_limit{0UL}, input_limit{1UL}, expected); } } From 4531ab3c0aeda58338d1d982661c08c6dce62f49 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 1 Mar 2024 15:57:46 -0800 Subject: [PATCH 137/321] Fix test check Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/tests/io/orc_chunked_reader_test.cpp b/cpp/tests/io/orc_chunked_reader_test.cpp index 8471d2e362f..2405c0eea90 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cpp +++ b/cpp/tests/io/orc_chunked_reader_test.cpp @@ -130,6 +130,10 @@ auto chunked_read(std::string const& filepath, out_tables.emplace_back(std::move(chunk.tbl)); } while (reader.has_next()); + if (num_chunks > 1) { + CUDF_EXPECTS(out_tables.front()->num_rows() != 0, "Number of rows in the new chunk is zero."); + } + auto out_tviews = std::vector{}; for (auto const& tbl : out_tables) { out_tviews.emplace_back(tbl->view()); From 9a80faf4142c7c4c89254f873d0d4bd6ebda4653 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 1 Mar 2024 16:31:30 -0800 Subject: [PATCH 138/321] Cleanup Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 124 ------------------------- 1 file changed, 124 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 5aa499fd9e9..9b8dfce2f67 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -66,130 +66,6 @@ // // // -namespace cudf::experimental { - -enum class decompose_lists_column : bool { YES, NO }; - -auto decompose_structs(table_view table, - decompose_lists_column decompose_lists, - host_span column_order = {}, - host_span null_precedence = {}) -{ - auto linked_columns = detail::table_to_linked_columns(table); - - std::vector verticalized_columns; - std::vector new_column_order; - std::vector new_null_precedence; - std::vector verticalized_col_depths; - for (size_t col_idx = 0; col_idx < linked_columns.size(); ++col_idx) { - detail::linked_column_view const* col = linked_columns[col_idx].get(); - if (is_nested(col->type())) { - // convert and insert - std::vector> flattened; - std::function*, int)> - recursive_child = [&](detail::linked_column_view const* c, - std::vector* branch, - int depth) { - branch->push_back(c); - if (decompose_lists == decompose_lists_column::YES && c->type().id() == type_id::LIST) { - recursive_child( - c->children[lists_column_view::child_column_index].get(), branch, depth + 1); - } else if (c->type().id() == type_id::STRUCT) { - for (size_t child_idx = 0; child_idx < c->children.size(); ++child_idx) { - // When child_idx == 0, we also cut off the current branch if its first child is a - // lists column. 
- // In such cases, the last column of the current branch will be `Struct` and - // it will be modified to empty struct type `Struct<>` later on. - if (child_idx > 0 || c->children[0]->type().id() == type_id::LIST) { - verticalized_col_depths.push_back(depth + 1); - branch = &flattened.emplace_back(); - } - recursive_child(c->children[child_idx].get(), branch, depth + 1); - } - } - }; - auto& branch = flattened.emplace_back(); - verticalized_col_depths.push_back(0); - recursive_child(col, &branch, 0); - - for (auto const& branch : flattened) { - column_view temp_col = *branch.back(); - - // Change `Struct` into empty struct type `Struct<>`. - if (temp_col.type().id() == type_id::STRUCT && - (temp_col.num_children() > 0 && temp_col.child(0).type().id() == type_id::LIST)) { - temp_col = column_view(temp_col.type(), - temp_col.size(), - temp_col.head(), - temp_col.null_mask(), - temp_col.null_count(), - temp_col.offset(), - {}); - } - - for (auto it = branch.crbegin() + 1; it < branch.crend(); ++it) { - auto const& prev_col = *(*it); - auto children = - (prev_col.type().id() == type_id::LIST) - ? std::vector{*prev_col - .children[lists_column_view::offsets_column_index], - temp_col} - : std::vector{temp_col}; - temp_col = column_view(prev_col.type(), - prev_col.size(), - nullptr, - prev_col.null_mask(), - prev_col.null_count(), - prev_col.offset(), - std::move(children)); - } - // Traverse upward and include any list columns in the ancestors - for (detail::linked_column_view* parent = branch.front()->parent; parent; - parent = parent->parent) { - if (parent->type().id() == type_id::LIST) { - // Include this parent - temp_col = column_view( - parent->type(), - parent->size(), - nullptr, // list has no data of its own - nullptr, // If we're going through this then nullmask is already in another branch - 0, - parent->offset(), - {*parent->children[lists_column_view::offsets_column_index], temp_col}); - } else if (parent->type().id() == type_id::STRUCT) { - // Replace offset with parent's offset - temp_col = column_view(temp_col.type(), - parent->size(), - temp_col.head(), - temp_col.null_mask(), - temp_col.null_count(), - parent->offset(), - {temp_col.child_begin(), temp_col.child_end()}); - } - } - verticalized_columns.push_back(temp_col); - } - if (not column_order.empty()) { - new_column_order.insert(new_column_order.end(), flattened.size(), column_order[col_idx]); - } - if (not null_precedence.empty()) { - new_null_precedence.insert( - new_null_precedence.end(), flattened.size(), null_precedence[col_idx]); - } - } else { - verticalized_columns.push_back(*col); - verticalized_col_depths.push_back(0); - if (not column_order.empty()) { new_column_order.push_back(column_order[col_idx]); } - if (not null_precedence.empty()) { new_null_precedence.push_back(null_precedence[col_idx]); } - } - } - return std::make_tuple(table_view(verticalized_columns), - std::move(new_column_order), - std::move(new_null_precedence), - std::move(verticalized_col_depths)); -} -} // namespace cudf::experimental namespace cudf::io::orc::detail { From 6279ad65e681d0a51258a6ee12823254d210f4c6 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 1 Mar 2024 21:48:34 -0800 Subject: [PATCH 139/321] Fix bug in stream data access Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 44 +++++++++++++++++++++++-- cpp/src/io/orc/reader_impl_chunking.cu | 21 ++++++++---- cpp/src/io/orc/reader_impl_chunking.hpp | 4 +++ 3 files changed, 60 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu 
b/cpp/src/io/orc/reader_impl.cu index a7c9d1ab635..dc2f9fbdebe 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -512,6 +512,19 @@ void decode_stream_data(std::size_t num_dicts, auto const tz_table_dptr = table_device_view::create(tz_table, stream); rmm::device_scalar error_count(0, stream); // Update the null map for child columns + + // printf( + // "num col: %d, num stripe: %d, skip row: %d, row_groups size: %d, row index stride: %d, " + // "level: " + // "%d\n", + // (int)num_columns, + // (int)num_stripes, + // (int)skip_rows, + // (int)row_groups.size().first, + // (int)row_index_stride, + // (int)level + // ); + gpu::DecodeOrcColumnData(chunks.base_device_ptr(), global_dict.data(), row_groups, @@ -917,9 +930,14 @@ void reader::impl::decompress_and_decode() #endif + auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_chunks; + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { printf("processing level = %d\n", (int)level); + auto const& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; + auto const [stream_begin, stream_end] = get_range(stripe_stream_chunks, stripe_chunk); + auto& columns_level = _selected_columns.levels[level]; // TODO: do it in global step @@ -1003,6 +1021,10 @@ void reader::impl::decompress_and_decode() auto const stripe_info = stripe.stripe_info; auto const stripe_footer = stripe.stripe_footer; + // printf("stripeinfo->indexLength: %d, data: %d\n", + // (int)stripe_info->indexLength, + // (int)stripe_info->dataLength); + auto const total_data_size = gather_stream_info_and_column_desc(stripe_idx - stripe_start, level, stripe_info, @@ -1088,12 +1110,20 @@ void reader::impl::decompress_and_decode() ? sizeof(size_type) : cudf::size_of(column_types[col_idx]); chunk.num_rowgroups = stripe_num_rowgroups; + // printf("stripe_num_rowgroups: %d\n", (int)stripe_num_rowgroups); + if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _config.timestamp_type.id(); } if (not is_stripe_data_empty) { for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { - chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k] + stripe_start].dst_pos; + chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k] + stream_begin].dst_pos; + // printf("chunk.streams[%d] of chunk.strm_id[%d], stripe %d | %d, collect from %d\n", + // (int)k, + // (int)chunk.strm_id[k], + // (int)stripe_idx, + // (int)stripe_start, + // (int)(chunk.strm_id[k] + stream_begin)); } } } @@ -1137,8 +1167,9 @@ void reader::impl::decompress_and_decode() // Setup row group descriptors if using indexes if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + // printf("decompress----------------------\n"); // printf("line %d\n", __LINE__); - // fflush(stdout); + fflush(stdout); auto decomp_data = decompress_stripe_data(stripe_chunk, _file_itm_data.compinfo_map, *_metadata.per_file_metadata[0].decompressor, @@ -1157,7 +1188,11 @@ void reader::impl::decompress_and_decode() // fflush(stdout); } else { + // printf("no decompression----------------------\n"); + if (row_groups.size().first) { + // printf("line %d\n", __LINE__); + // fflush(stdout); chunks.host_to_device_async(_stream); row_groups.host_to_device_async(_stream); row_groups.host_to_device_async(_stream); @@ -1187,7 +1222,7 @@ void reader::impl::decompress_and_decode() auto is_list_type = (column_types[i].id() == type_id::LIST); auto n_rows = (level == 0) ? 
rows_to_read : col_meta.num_child_rows[i]; - // printf(" create child col, num rows: %d\n", (int)n_rows); + // printf(" create col, num rows: %d\n", (int)n_rows); // For list column, offset column will be always size + 1 if (is_list_type) n_rows++; @@ -1258,6 +1293,9 @@ void reader::impl::decompress_and_decode() }); _chunk_read_data.decoded_table = std::make_unique
(std::move(out_columns)); + // printf("col: \n"); + // cudf::test::print(_chunk_read_data.decoded_table->get_column(0).view()); + // DEBUG only // _chunk_read_data.output_size_limit = _chunk_read_data.data_read_limit / 3; diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 9b8dfce2f67..e176e32f561 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -142,7 +142,14 @@ std::size_t gather_stream_info_and_column_desc( if (src_offset >= stripeinfo->indexLength || use_index) { auto const index_type = get_stream_index_type(stream.kind); if (index_type < gpu::CI_NUM_STREAMS) { - auto& chunk = (*chunks.value())[stripe_index][col]; + auto& chunk = (*chunks.value())[stripe_index][col]; + // printf("use stream id: %d, stripe: %d, level: %d, col idx: %d, kind: %d\n", + // (int)(*stream_idx), + // (int)stripe_index, + // (int)level, + // (int)column_id, + // (int)stream.kind); + chunk.strm_id[index_type] = *stream_idx; chunk.strm_len[index_type] = stream.length; // NOTE: skip_count field is temporarily used to track the presence of index streams @@ -155,6 +162,7 @@ std::size_t gather_stream_info_and_column_desc( } } } + (*stream_idx)++; } else { // not chunks.has_value() // printf("collect stream id: stripe: %d, level: %d, col idx: %d, kind: %d\n", @@ -302,6 +310,8 @@ void verify_splits(host_span splits, } #endif +} // namespace + /** * @brief Find range of the data span by a given chunk of chunks. * @@ -327,8 +337,6 @@ std::pair get_range(std::vector const& input_chunks, return {begin, end}; } -} // namespace - void reader::impl::global_preprocess(uint64_t skip_rows, std::optional const& num_rows_opt, std::vector> const& stripes) @@ -556,6 +564,7 @@ void reader::impl::load_data() auto& stripe_data = lvl_stripe_data[level]; auto& stripe_sizes = lvl_stripe_sizes[level]; for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { + // TODO: only do this if it was not allocated before. stripe_data[stripe_idx] = rmm::device_buffer( cudf::util::round_up_safe(stripe_sizes[stripe_idx], BUFFER_PADDING_MULTIPLE), _stream); } @@ -666,7 +675,7 @@ void reader::impl::load_data() stream_compinfo->max_uncompressed_size; #ifdef PRINT_DEBUG printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n", - (int)stream_id.id.stripe_idx, + (int)stream_id.stripe_idx, (int)stream_id.level, (int)stream_id.orc_col_idx, (int)stream_id.kind, @@ -681,8 +690,8 @@ void reader::impl::load_data() stream_compinfo_map.clear(); } else { - // printf("no compression \n"); - // fflush(stdout); + printf("no compression \n"); + fflush(stdout); // Set decompression size equal to the input size. for (auto stream_idx = stream_begin; stream_idx < stream_end; ++stream_idx) { diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index a721226b78b..18fcbf25bdb 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -270,6 +270,10 @@ std::vector find_splits(host_span sizes, int64_t total_count, size_t size_limit); +// TODO +std::pair get_range(std::vector const& input_chunks, + chunk const& selected_chunks); + /** * @brief Function that populates descriptors for either individual streams or chunks of column * data, but not both. 
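The `get_range` declaration above, together with the per-level stream bookkeeping it is used with, maps a consecutive run of stripes to a half-open range of stream indices: each element of the input vector records where one stripe's streams start and how many streams it owns. A minimal sketch of the intended behavior, with hypothetical stream counts and assuming `chunk` aggregate-initializes from `{start_idx, count}` as declared in reader_impl_chunking.hpp:

#include <cassert>
#include <vector>

// Three stripes owning 4, 2 and 5 streams each, stored as
// {index_of_first_stream, number_of_streams}.
std::vector<chunk> stripe_stream_chunks{{0, 4}, {4, 2}, {6, 5}};

// Select stripes [1, 3), i.e. the second and the third stripe.
chunk const stripe_chunk{1, 2};

// begin = stripe_stream_chunks[1].start_idx             = 4
// end   = stripe_stream_chunks[2].start_idx + its count = 6 + 5 = 11
auto const [stream_begin, stream_end] = get_range(stripe_stream_chunks, stripe_chunk);
assert(stream_begin == 4 && stream_end == 11);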
From 3a89549ee161d1057e288d6bee366437d37e1774 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 1 Mar 2024 21:56:59 -0800 Subject: [PATCH 140/321] Add temp docs Signed-off-by: Nghia Truong --- cpp/include/cudf/io/detail/orc.hpp | 28 ++++++++++++++++++++++++ cpp/include/cudf/io/orc.hpp | 35 ++++++++++++++++++++++++++++-- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index ac024caf1f3..c6176021a79 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -120,6 +120,34 @@ class chunked_reader : private reader { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + /** + * @brief Constructor from size limits and an array of data sources with reader options. + * + * The typical usage should be similar to this: + * ``` + * do { + * auto const chunk = reader.read_chunk(); + * // Process chunk + * } while (reader.has_next()); + * + * ``` + * + * If `output_size_limit == 0` (i.e., no reading limit), a call to `read_chunk()` will read the + * whole file and return a table containing all rows. + * + * TODO: data read limit + * TODO: granularity + * + * @param output_size_limit Limit on total number of bytes to be returned per read, + * or `0` if there is no limit + * @param data_read_limit Limit on memory usage for the purposes of decompression and processing + * of input, or `0` if there is no limit + * @param output_row_granularity TODO + * @param sources Input `datasource` objects to read the dataset from + * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ explicit chunked_reader(std::size_t output_size_limit, std::size_t data_read_limit, size_type output_row_granularity, diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index cfab642f25d..19252e77b91 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -423,7 +423,21 @@ class chunked_orc_reader { */ chunked_orc_reader() = default; - // TODO + /** + * @brief Constructor for chunked reader. + * + * This constructor requires the same `orc_reader_option` parameter as in + * `cudf::read_orc()`, and additional parameters to specify the size byte limits of the + * output table for each reading. + * + * TODO: data read limit + * + * @param output_size_limit Limit on total number of bytes to be returned per read, + * or `0` if there is no limit + * @param options The options used to read Parquet file + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ chunked_orc_reader(std::size_t output_size_limit, orc_reader_options const& options, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -452,7 +466,24 @@ class chunked_orc_reader { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - // TODO + /** + * @brief Constructor for chunked reader. + * + * This constructor requires the same `orc_reader_option` parameter as in + * `cudf::read_orc()`, and additional parameters to specify the size byte limits of the + * output table for each reading. 
+ * + * TODO: data read limit + * + * @param output_size_limit Limit on total number of bytes to be returned per read, + * or `0` if there is no limit + * @param data_read_limit Limit on memory usage for the purposes of decompression and processing + * of input, or `0` if there is no limit + * @param output_row_granularity TODO + * @param options The options used to read Parquet file + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ chunked_orc_reader(std::size_t output_size_limit, std::size_t data_read_limit, size_type output_row_granularity, From d1cc44c8aa00319d7f8d728a1825fb658de6a1d2 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 2 Mar 2024 08:24:57 -0800 Subject: [PATCH 141/321] Add new tests Signed-off-by: Nghia Truong --- cpp/tests/CMakeLists.txt | 2 +- cpp/tests/io/orc_chunked_reader_test.cpp | 52 ++++++++++++++++++------ 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index c8f490df02b..511705855f5 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -287,7 +287,7 @@ ConfigureTest( PERCENT 30 ) ConfigureTest( - ORC_TEST io/orc_chunked_reader_test.cpp io/orc_test.cpp + ORC_TEST io/orc_chunked_reader_test.cpp GPUS 1 PERCENT 30 ) diff --git a/cpp/tests/io/orc_chunked_reader_test.cpp b/cpp/tests/io/orc_chunked_reader_test.cpp index 2405c0eea90..9298b95616e 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cpp +++ b/cpp/tests/io/orc_chunked_reader_test.cpp @@ -997,14 +997,14 @@ void input_limit_test_write(std::vector const& test_files, CUDF_EXPECTS(test_files.size() == input_limit_expected_file_count, "Unexpected count of test filenames."); - // No compression + // ZSTD yields a very small decompression size, can be much smaller than SNAPPY. + // However, ORC reader typically over-estimates the decompression size of data + // compressed by ZSTD to be very large, can be much larger than that of SNAPPY. + // That is because ZSTD may use a lot of scratch space at decode time + // (2.5x the total decompressed buffer size). + // As such, we may see smaller output chunks for the input data compressed by ZSTD. input_limit_test_write_one(test_files[0], input, cudf::io::compression_type::NONE); - - // Compression with a codec that uses a lot of scratch space at decode time (2.5x the total - // decompressed buffer size). input_limit_test_write_one(test_files[1], input, cudf::io::compression_type::ZSTD); - - // Compression with a codec that uses no scratch space at decode time. input_limit_test_write_one(test_files[2], input, cudf::io::compression_type::SNAPPY); } @@ -1018,7 +1018,7 @@ void input_limit_test_read(int test_location, CUDF_EXPECTS(test_files.size() == input_limit_expected_file_count, "Unexpected count of test filenames."); - for (size_t idx = 0; idx < test_files.size(); idx++) { + for (size_t idx = 0; idx < test_files.size(); ++idx) { SCOPED_TRACE("Original line of failure: " + std::to_string(test_location) + ", file idx: " + std::to_string(idx)); auto const [result, num_chunks] = @@ -1044,23 +1044,51 @@ TEST_F(OrcChunkedReaderInputLimitTest, SingleFixedWidthColumn) auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename)); input_limit_test_write(test_files, input); - // Some small limit. 
{ int constexpr expected[] = {100, 100, 100}; input_limit_test_read( __LINE__, test_files, input, output_limit{0UL}, input_limit{1UL}, expected); } - if (0) { + { int constexpr expected[] = {15, 20, 9}; input_limit_test_read( __LINE__, test_files, input, output_limit{0UL}, input_limit{2 * 1024 * 1024UL}, expected); } +} - // Limit of 1 byte. - if (0) { - int constexpr expected[] = {1, 50, 50}; +TEST_F(OrcChunkedReaderInputLimitTest, MixedColumns) +{ + auto constexpr num_rows = 1'000'000; + + auto const iter1 = thrust::make_counting_iterator(0); + auto const col1 = cudf::test::fixed_width_column_wrapper(iter1, iter1 + num_rows); + + auto const iter2 = thrust::make_counting_iterator(0); + auto const col2 = cudf::test::fixed_width_column_wrapper(iter2, iter2 + num_rows); + + auto const strings = std::vector{"abc", "de", "fghi"}; + auto const str_iter = cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { + if (i < 250000) { return strings[0]; } + if (i < 750000) { return strings[1]; } + return strings[2]; + }); + auto const col3 = strings_col(str_iter, str_iter + num_rows); + + auto const filename = std::string{"single_col_fixed_width"}; + auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename)); + auto const input = cudf::table_view{{col1, col2, col3}}; + input_limit_test_write(test_files, input); + + { + int constexpr expected[] = {100, 100, 100}; input_limit_test_read( __LINE__, test_files, input, output_limit{0UL}, input_limit{1UL}, expected); } + + { + int constexpr expected[] = {15, 100, 21}; + input_limit_test_read( + __LINE__, test_files, input, output_limit{0UL}, input_limit{2 * 1024 * 1024UL}, expected); + } } From ac97dc2ef17c9fb4a912549f3384123b058c486f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 2 Mar 2024 12:54:49 -0800 Subject: [PATCH 142/321] Add test Signed-off-by: Nghia Truong --- ...er_test.cpp => orc_chunked_reader_test.cu} | 70 ++++++++++++++++++- 1 file changed, 68 insertions(+), 2 deletions(-) rename cpp/tests/io/{orc_chunked_reader_test.cpp => orc_chunked_reader_test.cu} (95%) diff --git a/cpp/tests/io/orc_chunked_reader_test.cpp b/cpp/tests/io/orc_chunked_reader_test.cu similarity index 95% rename from cpp/tests/io/orc_chunked_reader_test.cpp rename to cpp/tests/io/orc_chunked_reader_test.cu index 9298b95616e..d5790b27327 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cpp +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -39,6 +39,7 @@ #include #include +#include #include @@ -1038,10 +1039,10 @@ TEST_F(OrcChunkedReaderInputLimitTest, SingleFixedWidthColumn) auto constexpr num_rows = 1'000'000; auto const iter1 = thrust::make_constant_iterator(15); auto const col1 = cudf::test::fixed_width_column_wrapper(iter1, iter1 + num_rows); - auto const input = cudf::table_view{{col1}}; auto const filename = std::string{"single_col_fixed_width"}; auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename)); + auto const input = cudf::table_view{{col1}}; input_limit_test_write(test_files, input); { @@ -1075,7 +1076,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumns) }); auto const col3 = strings_col(str_iter, str_iter + num_rows); - auto const filename = std::string{"single_col_fixed_width"}; + auto const filename = std::string{"mixed_columns"}; auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename)); auto const input = cudf::table_view{{col1, col2, col3}}; input_limit_test_write(test_files, input); @@ -1092,3 +1093,68 @@ 
TEST_F(OrcChunkedReaderInputLimitTest, MixedColumns) __LINE__, test_files, input, output_limit{0UL}, input_limit{2 * 1024 * 1024UL}, expected); } } + +namespace { + +struct offset_gen { + int const group_size; + __device__ int operator()(int i) const { return i * group_size; } +}; + +template +struct value_gen { + __device__ T operator()(int i) const { return i % 1024; } +}; + +#if 0 +struct char_values { + __device__ int8_t operator()(int i) const + { + int const index = (i / 2) % 3; + // Generate repeating 3-runs of 2 values each: "aabbccaabbcc...". + return index == 0 ? 'a' : (index == 1 ? 'b' : 'c'); + } +}; +#endif + +} // namespace + +TEST_F(OrcChunkedReaderInputLimitTest, ListType) +{ + int constexpr num_rows = 50'000'000; + int constexpr list_size = 4; + + auto const stream = cudf::get_default_stream(); + auto const iter = thrust::make_counting_iterator(0); + + auto offset_col = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_rows + 1, + offset_col->mutable_view().begin(), + offset_gen{list_size}); + + int constexpr num_ints = num_rows * list_size; + auto value_col = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::INT32}, num_ints, cudf::mask_state::UNALLOCATED); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_ints, + value_col->mutable_view().begin(), + value_gen{}); + + auto const lists_col = + cudf::make_lists_column(num_rows, std::move(offset_col), std::move(value_col), 0, {}, stream); + + auto const filename = std::string{"list_type"}; + auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename)); + auto const input = cudf::table_view{{*lists_col}}; + input_limit_test_write(test_files, input); + + { + int constexpr expected[] = {5000, 5000, 5000}; + input_limit_test_read( + __LINE__, test_files, input, output_limit{0UL}, input_limit{1UL}, expected); + } +} From 9b2bbaa3cf37859f5f425302db6c88ec1ed6a94f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 2 Mar 2024 12:57:07 -0800 Subject: [PATCH 143/321] Allow to control number of rows per stripe Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cu | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index d5790b27327..ba1f5c891bb 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -983,17 +983,20 @@ std::vector input_limit_get_test_names(std::string const& base_file void input_limit_test_write_one(std::string const& filepath, cudf::table_view const& input, + cudf::size_type stripe_size_rows, cudf::io::compression_type compression) { auto const out_opts = cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, input) .compression(compression) - .stripe_size_rows(10'000) // intentionally write small stripes + .stripe_size_rows(stripe_size_rows) .build(); cudf::io::write_orc(out_opts); } -void input_limit_test_write(std::vector const& test_files, - cudf::table_view const& input) +void input_limit_test_write( + std::vector const& test_files, + cudf::table_view const& input, + cudf::size_type stripe_size_rows = 10'000 /*write small stripes by default*/) { CUDF_EXPECTS(test_files.size() == input_limit_expected_file_count, "Unexpected count of test filenames."); @@ -1004,9 +1007,12 @@ void input_limit_test_write(std::vector const& 
test_files, // That is because ZSTD may use a lot of scratch space at decode time // (2.5x the total decompressed buffer size). // As such, we may see smaller output chunks for the input data compressed by ZSTD. - input_limit_test_write_one(test_files[0], input, cudf::io::compression_type::NONE); - input_limit_test_write_one(test_files[1], input, cudf::io::compression_type::ZSTD); - input_limit_test_write_one(test_files[2], input, cudf::io::compression_type::SNAPPY); + input_limit_test_write_one( + test_files[0], input, stripe_size_rows, cudf::io::compression_type::NONE); + input_limit_test_write_one( + test_files[1], input, stripe_size_rows, cudf::io::compression_type::ZSTD); + input_limit_test_write_one( + test_files[2], input, stripe_size_rows, cudf::io::compression_type::SNAPPY); } void input_limit_test_read(int test_location, From a959db2f49bed04f8da28c814e189c98e81dd6fa Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 2 Mar 2024 13:03:45 -0800 Subject: [PATCH 144/321] Write a bit larger stripes to test Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cu | 32 +++++++++++++++++++------ 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index ba1f5c891bb..21210426b24 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -996,7 +996,7 @@ void input_limit_test_write_one(std::string const& filepath, void input_limit_test_write( std::vector const& test_files, cudf::table_view const& input, - cudf::size_type stripe_size_rows = 10'000 /*write small stripes by default*/) + cudf::size_type stripe_size_rows = 20'000 /*write relatively small stripes by default*/) { CUDF_EXPECTS(test_files.size() == input_limit_expected_file_count, "Unexpected count of test filenames."); @@ -1052,13 +1052,13 @@ TEST_F(OrcChunkedReaderInputLimitTest, SingleFixedWidthColumn) input_limit_test_write(test_files, input); { - int constexpr expected[] = {100, 100, 100}; + int constexpr expected[] = {50, 50, 50}; input_limit_test_read( __LINE__, test_files, input, output_limit{0UL}, input_limit{1UL}, expected); } { - int constexpr expected[] = {15, 20, 9}; + int constexpr expected[] = {17, 10, 9}; input_limit_test_read( __LINE__, test_files, input, output_limit{0UL}, input_limit{2 * 1024 * 1024UL}, expected); } @@ -1088,13 +1088,13 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumns) input_limit_test_write(test_files, input); { - int constexpr expected[] = {100, 100, 100}; + int constexpr expected[] = {50, 50, 50}; input_limit_test_read( __LINE__, test_files, input, output_limit{0UL}, input_limit{1UL}, expected); } { - int constexpr expected[] = {15, 100, 21}; + int constexpr expected[] = {17, 50, 14}; input_limit_test_read( __LINE__, test_files, input, output_limit{0UL}, input_limit{2 * 1024 * 1024UL}, expected); } @@ -1156,11 +1156,29 @@ TEST_F(OrcChunkedReaderInputLimitTest, ListType) auto const filename = std::string{"list_type"}; auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename)); auto const input = cudf::table_view{{*lists_col}}; - input_limit_test_write(test_files, input); + input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows); { - int constexpr expected[] = {5000, 5000, 5000}; + // Although we set `stripe_size_rows` to be very large, the writer only write + // 250k rows per stripe. Thus, we have 200 stripes in total. 
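+ // (50'000'000 rows at 250'000 rows per stripe is exactly 200 stripes, and a
+ // 1-byte input limit again degenerates to one stripe per chunk.)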
+ int constexpr expected[] = {200, 200, 200}; input_limit_test_read( __LINE__, test_files, input, output_limit{0UL}, input_limit{1UL}, expected); } + + { + int constexpr expected[] = {2, 34, 2}; + input_limit_test_read( + __LINE__, test_files, input, output_limit{0UL}, input_limit{5 * 1024 * 1024UL}, expected); + } + + { + int constexpr expected[] = {8, 34, 8}; + input_limit_test_read(__LINE__, + test_files, + input, + output_limit{128 * 1024 * 1024UL}, + input_limit{5 * 1024 * 1024UL}, + expected); + } } From 81b78ea5459a30e4ef7bf0b8e5255a7cb52e0229 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 2 Mar 2024 13:45:38 -0800 Subject: [PATCH 145/321] Add the final test Signed-off-by: Nghia Truong --- cpp/tests/CMakeLists.txt | 4 +- cpp/tests/io/orc_chunked_reader_test.cu | 95 ++++++++++++++++++++++--- 2 files changed, 88 insertions(+), 11 deletions(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 511705855f5..058b8555378 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -287,7 +287,9 @@ ConfigureTest( PERCENT 30 ) ConfigureTest( - ORC_TEST io/orc_chunked_reader_test.cpp + ORC_TEST + io/orc_chunked_reader_test.cu + io/orc_test.cpp GPUS 1 PERCENT 30 ) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 21210426b24..2c7e43060f6 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1112,7 +1112,6 @@ struct value_gen { __device__ T operator()(int i) const { return i % 1024; } }; -#if 0 struct char_values { __device__ int8_t operator()(int i) const { @@ -1121,7 +1120,6 @@ struct char_values { return index == 0 ? 'a' : (index == 1 ? 'b' : 'c'); } }; -#endif } // namespace @@ -1156,15 +1154,10 @@ TEST_F(OrcChunkedReaderInputLimitTest, ListType) auto const filename = std::string{"list_type"}; auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename)); auto const input = cudf::table_view{{*lists_col}}; - input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows); - { - // Although we set `stripe_size_rows` to be very large, the writer only write - // 250k rows per stripe. Thus, we have 200 stripes in total. - int constexpr expected[] = {200, 200, 200}; - input_limit_test_read( - __LINE__, test_files, input, output_limit{0UL}, input_limit{1UL}, expected); - } + // Although we set `stripe_size_rows` to be very large, the writer only write + // 250k rows per stripe. Thus, we have 200 stripes in total. 
+ input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows); { int constexpr expected[] = {2, 34, 2}; @@ -1182,3 +1175,85 @@ TEST_F(OrcChunkedReaderInputLimitTest, ListType) expected); } } + +TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList) +{ + int constexpr num_rows = 50'000'000; + int constexpr list_size = 4; + int constexpr str_size = 3; + + auto const stream = cudf::get_default_stream(); + auto const iter = thrust::make_counting_iterator(0); + + // list + auto offset_col = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_rows + 1, + offset_col->mutable_view().begin(), + offset_gen{list_size}); + + int constexpr num_ints = num_rows * list_size; + auto value_col = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::INT32}, num_ints, cudf::mask_state::UNALLOCATED); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_ints, + value_col->mutable_view().begin(), + value_gen{}); + + auto const lists_col = + cudf::make_lists_column(num_rows, std::move(offset_col), std::move(value_col), 0, {}, stream); + + // strings + int constexpr num_chars = num_rows * str_size; + auto str_offset_col = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_rows + 1, + str_offset_col->mutable_view().begin(), + offset_gen{str_size}); + rmm::device_buffer str_chars(num_chars, stream); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_chars, + static_cast(str_chars.data()), + char_values{}); + auto const str_col = + cudf::make_strings_column(num_rows, std::move(str_offset_col), std::move(str_chars), 0, {}); + + // doubles + auto const double_col = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::FLOAT64}, num_rows, cudf::mask_state::UNALLOCATED); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_rows, + double_col->mutable_view().begin(), + value_gen{}); + + auto const filename = std::string{"mixed_cols_having_list"}; + auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename)); + auto const input = cudf::table_view{{*lists_col, *str_col, *double_col}}; + + // Although we set `stripe_size_rows` to be very large, the writer only write + // 250k rows per stripe. Thus, we have 200 stripes in total. 
+ input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows); + + { + int constexpr expected[] = {11, 7, 5}; + input_limit_test_read( + __LINE__, test_files, input, output_limit{0UL}, input_limit{128 * 1024 * 1024UL}, expected); + } + + { + int constexpr expected[] = {21, 13, 14}; + input_limit_test_read(__LINE__, + test_files, + input, + output_limit{128 * 1024 * 1024UL}, + input_limit{128 * 1024 * 1024UL}, + expected); + } +} From 5537033597aa0c0835b6da3475def7ff9726a464 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 2 Mar 2024 14:12:56 -0800 Subject: [PATCH 146/321] Change debug info Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index e176e32f561..867ef4b508f 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -490,8 +490,11 @@ void reader::impl::global_preprocess(uint64_t skip_rows, } printf("total stripe sizes:\n"); + int count{0}; for (auto& size : total_stripe_sizes) { + ++count; printf("size: %ld, %zu\n", size.count, size.size_bytes); + if (count > 5) break; } // Compute the prefix sum of stripe data sizes. @@ -504,9 +507,12 @@ void reader::impl::global_preprocess(uint64_t skip_rows, total_stripe_sizes.device_to_host_sync(_stream); + count = 0; printf("prefix sum total stripe sizes:\n"); for (auto& size : total_stripe_sizes) { + ++count; printf("size: %ld, %zu\n", size.count, size.size_bytes); + if (count > 5) break; } // TODO: handle case for extremely large files. @@ -740,8 +746,10 @@ void reader::impl::load_data() chunk.start_idx += stripe_chunk.start_idx; } + int count{0}; for (auto& size : stripe_decomp_sizes) { printf("decomp size: %ld, %zu\n", size.count, size.size_bytes); + if (count++ > 5) break; } #ifndef PRINT_DEBUG From 41b9f52c5457b9a95e28b5b4b305ecd112f652a9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 2 Mar 2024 20:10:16 -0800 Subject: [PATCH 147/321] Implement peak memory usage Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 8 +++++++- cpp/src/io/orc/reader_impl.hpp | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index dc2f9fbdebe..9fef8782dee 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -1396,6 +1396,11 @@ table_with_metadata reader::impl::make_output_chunk() }(); #endif + + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << "peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) + << " MB)" << std::endl; + return {std::move(out_table), _out_metadata}; } @@ -1472,7 +1477,8 @@ reader::impl::impl(std::size_t output_size_limit, _chunk_read_data{ output_size_limit, data_read_limit, - output_row_granularity > 0 ? output_row_granularity : DEFAULT_OUTPUT_ROW_GRANULARITY} + output_row_granularity > 0 ? 
output_row_granularity : DEFAULT_OUTPUT_ROW_GRANULARITY}, + mem_stats_logger(mr) { printf("construct reader , limit = %d, %d, gradunarity %d \n", diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 9ca003672a4..3f93541aed5 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -24,6 +24,7 @@ #include #include +#include // TODO: remove #include @@ -33,6 +34,26 @@ namespace cudf::io::orc::detail { +class memory_stats_logger { + public: + explicit memory_stats_logger(rmm::mr::device_memory_resource* mr) + : existing_mr(mr), statistics_mr(rmm::mr::make_statistics_adaptor(existing_mr)) + { + rmm::mr::set_current_device_resource(&statistics_mr); + } + + ~memory_stats_logger() { rmm::mr::set_current_device_resource(existing_mr); } + + [[nodiscard]] size_t peak_memory_usage() const noexcept + { + return statistics_mr.get_bytes_counter().peak; + } + + private: + rmm::mr::device_memory_resource* existing_mr; + rmm::mr::statistics_resource_adaptor statistics_mr; +}; + struct reader_column_meta; /** @@ -187,6 +208,8 @@ class reader::impl { std::vector> _out_buffers; static constexpr size_type DEFAULT_OUTPUT_ROW_GRANULARITY = 10'000; + + memory_stats_logger mem_stats_logger; }; } // namespace cudf::io::orc::detail From 65976990554f9121fb48bda29a3edc0a8dfa0841 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 2 Mar 2024 20:36:54 -0800 Subject: [PATCH 148/321] Optimize memory usage Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 35 +++++++++++++++----------- cpp/src/io/orc/reader_impl_chunking.cu | 15 ++++++----- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 9fef8782dee..f7dbcc46282 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -78,6 +78,7 @@ namespace { * @return Device buffer to decompressed page data */ rmm::device_buffer decompress_stripe_data( + chunk const& load_stripe_chunk, chunk const& stripe_chunk, stream_id_map const& compinfo_map, OrcDecompressor const& decompressor, @@ -127,7 +128,9 @@ rmm::device_buffer decompress_stripe_data( #endif compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.id.stripe_idx].data()) + info.dst_pos, + static_cast( + stripe_data[info.id.stripe_idx - load_stripe_chunk.start_idx].data()) + + info.dst_pos, info.length)); // printf("line %d\n", __LINE__); @@ -1044,7 +1047,7 @@ void reader::impl::decompress_and_decode() CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); - auto dst_base = static_cast(stripe_data[stripe_idx].data()); + auto dst_base = static_cast(stripe_data[stripe_idx - stripe_start].data()); // printf("line %d\n", __LINE__); // fflush(stdout); @@ -1169,18 +1172,22 @@ void reader::impl::decompress_and_decode() if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { // printf("decompress----------------------\n"); // printf("line %d\n", __LINE__); - fflush(stdout); - auto decomp_data = decompress_stripe_data(stripe_chunk, - _file_itm_data.compinfo_map, - *_metadata.per_file_metadata[0].decompressor, - stripe_data, - stream_info, - chunks, - row_groups, - num_stripes, - _metadata.get_row_index_stride(), - level == 0, - _stream); + // fflush(stdout); + CUDF_EXPECTS(_chunk_read_data.curr_load_stripe_chunk > 0, "ERRRRR"); + + auto decomp_data = decompress_stripe_data( + _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk - 1], + stripe_chunk, + 
_file_itm_data.compinfo_map, + *_metadata.per_file_metadata[0].decompressor, + stripe_data, + stream_info, + chunks, + row_groups, + num_stripes, + _metadata.get_row_index_stride(), + level == 0, + _stream); // stripe_data.clear(); stripe_data.push_back(std::move(decomp_data)); diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 867ef4b508f..41ffe3288c0 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -401,8 +401,8 @@ void reader::impl::global_preprocess(uint64_t skip_rows, col_meta.orc_col_map[level][col.id] = col_id++; } - auto& stripe_data = lvl_stripe_data[level]; - stripe_data.resize(num_stripes); + // auto& stripe_data = lvl_stripe_data[level]; + // stripe_data.resize(num_stripes); auto& stream_info = _file_itm_data.lvl_stream_info[level]; auto const num_columns = _selected_columns.levels[level].size(); @@ -567,11 +567,13 @@ void reader::impl::load_data() // Prepare the buffer to read raw data onto. // TODO: clear all old buffer. for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - auto& stripe_data = lvl_stripe_data[level]; + auto& stripe_data = lvl_stripe_data[level]; + stripe_data.resize(stripe_chunk.count); + auto& stripe_sizes = lvl_stripe_sizes[level]; for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { // TODO: only do this if it was not allocated before. - stripe_data[stripe_idx] = rmm::device_buffer( + stripe_data[stripe_idx - stripe_start] = rmm::device_buffer( cudf::util::round_up_safe(stripe_sizes[stripe_idx], BUFFER_PADDING_MULTIPLE), _stream); } } @@ -585,7 +587,7 @@ void reader::impl::load_data() for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) { auto const& read = read_info[read_idx]; auto& stripe_data = lvl_stripe_data[read.level]; - auto dst_base = static_cast(stripe_data[read.stripe_idx].data()); + auto dst_base = static_cast(stripe_data[read.stripe_idx - stripe_start].data()); if (_metadata.per_file_metadata[read.source_idx].source->is_device_read_preferred( read.length)) { @@ -646,7 +648,8 @@ void reader::impl::load_data() for (auto stream_idx = stream_begin; stream_idx < stream_end; ++stream_idx) { auto const& info = stream_info[stream_idx]; compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.id.stripe_idx].data()) + info.dst_pos, + static_cast(stripe_data[info.id.stripe_idx - stripe_start].data()) + + info.dst_pos, info.length)); stream_compinfo_map[stream_id_info{ info.id.stripe_idx, info.id.level, info.id.orc_col_idx, info.id.kind}] = &compinfo.back(); From 277758e073d1e7fde9e5527ba843099bf4d09d7c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 2 Mar 2024 20:58:03 -0800 Subject: [PATCH 149/321] Add debug info Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cu | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 2c7e43060f6..47bdb2427e8 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1028,6 +1028,8 @@ void input_limit_test_read(int test_location, for (size_t idx = 0; idx < test_files.size(); ++idx) { SCOPED_TRACE("Original line of failure: " + std::to_string(test_location) + ", file idx: " + std::to_string(idx)); + // TODO: remove + printf("file_idx %d\n", (int)idx); auto const [result, num_chunks] = chunked_read(test_files[idx], output_limit_bytes, input_limit_bytes); 
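// (chunked_read() drives read_chunk() until has_next() returns false,
// concatenates the resulting pieces, and reports how many chunks it took;
// that count is what is asserted against below.)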
EXPECT_EQ(expected_chunk_counts[idx], num_chunks); @@ -1256,4 +1258,11 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList) input_limit{128 * 1024 * 1024UL}, expected); } + + // TODO: remove + { + int constexpr expected[] = {1, 1, 1}; + input_limit_test_read( + __LINE__, test_files, input, output_limit{0UL}, input_limit{0UL}, expected); + } } From 83ba727ac49521e28e856761a75e6fee10893b56 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 3 Mar 2024 07:31:39 -0800 Subject: [PATCH 150/321] Fix a bug in memory write, and add debug info for memory usage Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 135 ++++++++++++++++++++++++- cpp/src/io/orc/reader_impl.hpp | 2 +- cpp/src/io/orc/reader_impl_chunking.cu | 4 + 3 files changed, 135 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index f7dbcc46282..e1086c2df30 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -814,6 +814,9 @@ void reader::impl::decompress_and_decode() auto const stripe_start = stripe_chunk.start_idx; auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; + auto const load_stripe_start = + _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk - 1].start_idx; + printf("\ndecoding data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); auto const rows_to_skip = _file_itm_data.rows_to_skip; @@ -938,6 +941,13 @@ void reader::impl::decompress_and_decode() for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { printf("processing level = %d\n", (int)level); + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + auto const& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; auto const [stream_begin, stream_end] = get_range(stripe_stream_chunks, stripe_chunk); @@ -979,6 +989,13 @@ void reader::impl::decompress_and_decode() chunks = cudf::detail::hostdevice_2dvector(num_stripes, num_columns, _stream); memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + const bool use_index = _config.use_index && // Do stripes have row group index @@ -1047,7 +1064,9 @@ void reader::impl::decompress_and_decode() CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); - auto dst_base = static_cast(stripe_data[stripe_idx - stripe_start].data()); + // TODO: Wrong? + // stripe load_stripe_start? 
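+ // The offset is relative to load_stripe_start because stripe_data was
+ // allocated for the whole load chunk in load_data(); the decode chunk
+ // handled here may start partway into that allocation.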
+ auto dst_base = static_cast(stripe_data[stripe_idx - load_stripe_start].data()); // printf("line %d\n", __LINE__); // fflush(stdout); @@ -1175,6 +1194,13 @@ void reader::impl::decompress_and_decode() // fflush(stdout); CUDF_EXPECTS(_chunk_read_data.curr_load_stripe_chunk > 0, "ERRRRR"); + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + auto decomp_data = decompress_stripe_data( _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk - 1], stripe_chunk, @@ -1189,7 +1215,18 @@ void reader::impl::decompress_and_decode() level == 0, _stream); // stripe_data.clear(); - stripe_data.push_back(std::move(decomp_data)); + // stripe_data.push_back(std::move(decomp_data)); + stripe_data[stripe_start - load_stripe_start] = std::move(decomp_data); + for (int64_t i = 1; i < stripe_chunk.count; ++i) { + stripe_data[i + stripe_start - load_stripe_start] = {}; + } + + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } // printf("line %d\n", __LINE__); // fflush(stdout); @@ -1217,6 +1254,13 @@ void reader::impl::decompress_and_decode() // printf("line %d\n", __LINE__); // fflush(stdout); + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + for (std::size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; for (std::size_t j = 0; j < num_stripes; ++j) { @@ -1231,14 +1275,35 @@ void reader::impl::decompress_and_decode() // printf(" create col, num rows: %d\n", (int)n_rows); + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + // For list column, offset column will be always size + 1 if (is_list_type) n_rows++; _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr); + + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } } // printf("line %d\n", __LINE__); // fflush(stdout); + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + decode_stream_data(num_dict_entries, rows_to_skip, _metadata.get_row_index_stride(), @@ -1250,6 +1315,13 @@ void reader::impl::decompress_and_decode() _stream, _mr); + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + // printf("line %d\n", __LINE__); // fflush(stdout); @@ -1286,6 +1358,13 @@ void reader::impl::decompress_and_decode() // fflush(stdout); } // end loop level + { + 
_stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + std::vector> out_columns; _out_metadata = get_meta_with_user_data(); std::transform( @@ -1300,6 +1379,13 @@ void reader::impl::decompress_and_decode() }); _chunk_read_data.decoded_table = std::make_unique
<table>(std::move(out_columns));

+  {
+    _stream.synchronize();
+    auto peak_mem = mem_stats_logger.peak_memory_usage();
+    std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "("
+              << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl;
+  }
+
   // printf("col: \n");
   // cudf::test::print(_chunk_read_data.decoded_table->get_column(0).view());

@@ -1322,6 +1408,13 @@
     printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count);
   }
   fflush(stdout);
+
+  {
+    _stream.synchronize();
+    auto peak_mem = mem_stats_logger.peak_memory_usage();
+    std::cout << "decomp and decode, peak_memory_usage: " << peak_mem << "("
+              << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl;
+  }
 }

 void reader::impl::prepare_data(int64_t skip_rows,
@@ -1361,6 +1454,13 @@ void reader::impl::prepare_data(int64_t skip_rows,

 table_with_metadata reader::impl::make_output_chunk()
 {
+  {
+    _stream.synchronize();
+    auto peak_mem = mem_stats_logger.peak_memory_usage();
+    std::cout << "start to make out, peak_memory_usage: " << peak_mem << "("
+              << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl;
+  }
+
   // There are no columns in the table.
   if (_selected_columns.num_levels() == 0) { return {std::make_unique<table>(), table_metadata{}}; }

@@ -1392,6 +1492,13 @@ table_with_metadata reader::impl::make_output_chunk()
       return std::move(_chunk_read_data.decoded_table);
     }

+    {
+      _stream.synchronize();
+      auto peak_mem = mem_stats_logger.peak_memory_usage();
+      std::cout << "prepare to make out, peak_memory_usage: " << peak_mem << "("
+                << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl;
+    }
+
     auto const out_chunk =
       _chunk_read_data.output_table_chunks[_chunk_read_data.curr_output_table_chunk++];
     auto const out_tview =
       cudf::detail::slice(_chunk_read_data.decoded_table->view(),
                           {static_cast<size_type>(out_chunk.start_idx),
                            static_cast<size_type>(out_chunk.start_idx + out_chunk.count)},
                           _stream)[0];
+    {
+      _stream.synchronize();
+      auto peak_mem = mem_stats_logger.peak_memory_usage();
+      std::cout << "done make out, peak_memory_usage: " << peak_mem << "("
+                << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl;
+    }
+
     return std::make_unique<table>
(out_tview, _stream, _mr); }(); #endif - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << "peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) - << " MB)" << std::endl; + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << "done, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } return {std::move(out_table), _out_metadata}; } @@ -1517,6 +1634,14 @@ table_with_metadata reader::impl::read_chunk() { printf("==================call read chunk\n"); prepare_data(); + + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << "done prepare data, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + return make_output_chunk(); } diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 3f93541aed5..d605975e1ad 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -46,7 +46,7 @@ class memory_stats_logger { [[nodiscard]] size_t peak_memory_usage() const noexcept { - return statistics_mr.get_bytes_counter().peak; + return statistics_mr.get_bytes_counter().value; } private: diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 41ffe3288c0..944f23e7764 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -778,6 +778,10 @@ void reader::impl::load_data() // lvl_stripe_data.clear(); // _file_itm_data.compinfo_ready = true; + + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << "load, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } } // namespace cudf::io::orc::detail From 5dcd61242b8773d218116379a94209f73d64f4f8 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 3 Mar 2024 09:24:56 -0800 Subject: [PATCH 151/321] Debugging memory leak Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index e1086c2df30..7412f9d2251 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -1216,6 +1216,8 @@ void reader::impl::decompress_and_decode() _stream); // stripe_data.clear(); // stripe_data.push_back(std::move(decomp_data)); + + // TODO: only reset each one if the new size/type are different. stripe_data[stripe_start - load_stripe_start] = std::move(decomp_data); for (int64_t i = 1; i < stripe_chunk.count; ++i) { stripe_data[i + stripe_start - load_stripe_start] = {}; @@ -1261,6 +1263,17 @@ void reader::impl::decompress_and_decode() << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } + // TODO: do not clear but reset each one. + // and only reset if the new size/type are different. 
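+  // Clearing releases the per-level output buffers left over from the previous decode
+  // pass before they are rebuilt below.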
+ _out_buffers[level].clear(); + + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + for (std::size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; for (std::size_t j = 0; j < num_stripes; ++j) { @@ -1289,7 +1302,8 @@ void reader::impl::decompress_and_decode() { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + std::cout << __LINE__ << ", buffer size: " << n_rows + << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } } @@ -1379,6 +1393,13 @@ void reader::impl::decompress_and_decode() }); _chunk_read_data.decoded_table = std::make_unique
(std::move(out_columns)); + // TODO: do not clear but reset each one. + // and only reset if the new size/type are different. + // This clear is just to check if there is memory leak. + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + _out_buffers[level].clear(); + } + { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); @@ -1489,6 +1510,7 @@ table_with_metadata reader::impl::make_output_chunk() auto out_table = [&] { if (_chunk_read_data.output_table_chunks.size() == 1) { _chunk_read_data.curr_output_table_chunk++; + printf("one chunk, no more table---------------------------------\n"); return std::move(_chunk_read_data.decoded_table); } @@ -1633,6 +1655,13 @@ bool reader::impl::has_next() table_with_metadata reader::impl::read_chunk() { printf("==================call read chunk\n"); + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << "\n\n\nstart read chunk, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + prepare_data(); { From 04acd0f381ca170a9ec267777c9705bd3a155d08 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 3 Mar 2024 10:18:24 -0800 Subject: [PATCH 152/321] Fix memory leak Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 7412f9d2251..5f6d02f43f1 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -1398,6 +1398,16 @@ void reader::impl::decompress_and_decode() // This clear is just to check if there is memory leak. for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { _out_buffers[level].clear(); + + auto& stripe_data = lvl_stripe_data[level]; + + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + stripe_data[stripe_start - load_stripe_start] = {}; + } else { + for (int64_t i = 0; i < stripe_chunk.count; ++i) { + stripe_data[i + stripe_start - load_stripe_start] = {}; + } + } } { @@ -1658,10 +1668,28 @@ table_with_metadata reader::impl::read_chunk() { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << "\n\n\nstart read chunk, peak_memory_usage: " << peak_mem << "(" + std::cout << "\n\n\n------------start read chunk, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } + { + static int count{0}; + ++count; + +#if 0 + if (count == 3) { + _file_itm_data.lvl_stripe_data.clear(); + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << "clear all, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + exit(0); + } +#endif + } + prepare_data(); { From 97f80c823eef8b79671523331ce77387ae4ba99b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 3 Mar 2024 19:33:36 -0800 Subject: [PATCH 153/321] Change comments Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 47bdb2427e8..c0cdeb1b107 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1158,7 +1158,8 @@ TEST_F(OrcChunkedReaderInputLimitTest, ListType) auto const input = 
cudf::table_view{{*lists_col}};

   // Although we set `stripe_size_rows` to be very large, the writer only writes
-  // 250k rows per stripe. Thus, we have 200 stripes in total.
+  // 250k rows (top level) per stripe due to having nested types.
+  // Thus, we have 200 stripes in total.
   input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows);

   {
@@ -1240,7 +1241,8 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList)
   auto const input = cudf::table_view{{*lists_col, *str_col, *double_col}};

   // Although we set `stripe_size_rows` to be very large, the writer only writes
-  // 250k rows per stripe. Thus, we have 200 stripes in total.
+  // 250k rows (top level) per stripe due to having nested types.
+  // Thus, we have 200 stripes in total.
   input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows);

   {

From e425e416b51826eb4d53b7c8139099b7e5c64690 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Sun, 3 Mar 2024 19:33:42 -0800
Subject: [PATCH 154/321] Change memory stats

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index d605975e1ad..3f93541aed5 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -46,7 +46,7 @@ class memory_stats_logger {

   [[nodiscard]] size_t peak_memory_usage() const noexcept
   {
-    return statistics_mr.get_bytes_counter().value;
+    return statistics_mr.get_bytes_counter().peak;
   }

 private:

From 5dcd61242b8773d218116379a94209f73d64f4f8 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Sun, 3 Mar 2024 19:57:35 -0800
Subject: [PATCH 155/321] Change read limit ratio

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_chunking.hpp |  2 +-
 cpp/tests/io/orc_chunked_reader_test.cu | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index 18fcbf25bdb..0769f46f1d1 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -212,7 +212,7 @@ struct chunk_read_data {
   std::size_t data_read_limit;       // approximate maximum size (in bytes) used to store
                                      // intermediate data, or 0 for no limit
   size_type output_row_granularity;  // TODO
-  static double constexpr load_limit_ratio{0.3};  // TODO
+  static double constexpr load_limit_ratio{0.4};  // TODO

   // Chunks of stripes that can be loaded into memory such that their data size is within a size
   // limit.
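To make the intent of this constant easier to follow: data_read_limit is a single byte
budget shared by the two passes of the chunked reader, and load_limit_ratio is the
fraction of that budget granted to loading raw stripe data, with the remainder reserved
for decompression and decode scratch. A minimal illustrative sketch of that split (the
helper below is hypothetical, not part of this patch):

  // Hypothetical helper, for illustration only.
  std::size_t load_limit(std::size_t data_read_limit)
  {
    if (data_read_limit == 0) { return 0; }  // 0 means "no limit" for either pass
    return static_cast<std::size_t>(static_cast<double>(data_read_limit) *
                                    chunk_read_data::load_limit_ratio);
  }

Raising the ratio from 0.3 to 0.4 therefore lets each load chunk hold more raw stripe
data at the cost of a smaller decode budget, which is what shifts the expected chunk
counts in the tests below.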
diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index c0cdeb1b107..7520a54e174 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1060,7 +1060,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, SingleFixedWidthColumn) } { - int constexpr expected[] = {17, 10, 9}; + int constexpr expected[] = {10, 13, 10}; input_limit_test_read( __LINE__, test_files, input, output_limit{0UL}, input_limit{2 * 1024 * 1024UL}, expected); } @@ -1096,7 +1096,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumns) } { - int constexpr expected[] = {17, 50, 14}; + int constexpr expected[] = {10, 50, 15}; input_limit_test_read( __LINE__, test_files, input, output_limit{0UL}, input_limit{2 * 1024 * 1024UL}, expected); } @@ -1163,13 +1163,13 @@ TEST_F(OrcChunkedReaderInputLimitTest, ListType) input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows); { - int constexpr expected[] = {2, 34, 2}; + int constexpr expected[] = {2, 40, 3}; input_limit_test_read( __LINE__, test_files, input, output_limit{0UL}, input_limit{5 * 1024 * 1024UL}, expected); } { - int constexpr expected[] = {8, 34, 8}; + int constexpr expected[] = {8, 40, 9}; input_limit_test_read(__LINE__, test_files, input, @@ -1246,13 +1246,13 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList) input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows); { - int constexpr expected[] = {11, 7, 5}; + int constexpr expected[] = {8, 8, 6}; input_limit_test_read( __LINE__, test_files, input, output_limit{0UL}, input_limit{128 * 1024 * 1024UL}, expected); } { - int constexpr expected[] = {21, 13, 14}; + int constexpr expected[] = {16, 15, 17}; input_limit_test_read(__LINE__, test_files, input, From c4f98ee5c94c7fe82c1ef1979ff3cb32272c5307 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 3 Mar 2024 20:52:37 -0800 Subject: [PATCH 156/321] Test read with very large file Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cu | 77 +++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 7520a54e174..b58ee3692d9 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1268,3 +1269,79 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList) __LINE__, test_files, input, output_limit{0UL}, input_limit{0UL}, expected); } } + +TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) +{ + using cudf::test::iterators::no_nulls; + + int64_t constexpr num_rows = 500'000'000l; + int constexpr rows_per_stripe = 1'000'000; + int constexpr num_reps = 5l; + int64_t constexpr total_rows = num_rows * num_reps; + static_assert(total_rows > std::numeric_limits::max()); + + auto const it = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 127; }); + auto const col = + cudf::test::fixed_width_column_wrapper( + it, it + num_rows); + auto const chunk_table = cudf::table_view{{col}}; + + std::vector data_buffer; + { + auto const write_opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&data_buffer}) + .stripe_size_rows(rows_per_stripe) + .build(); + + auto writer = cudf::io::orc_chunked_writer(write_opts); + for (int i = 0; i < num_reps; ++i) { + writer.write(chunk_table); + } + } + + // Test reading the metadata + auto const metadata = + 
cudf::io::read_orc_metadata(cudf::io::source_info{data_buffer.data(), data_buffer.size()});
+  EXPECT_EQ(metadata.num_rows(), total_rows);
+  EXPECT_EQ(metadata.num_stripes(), total_rows / rows_per_stripe);
+
+  printf("start test chunk\n");
+  fflush(stdout);
+
+  int constexpr num_rows_to_read = 5'000'000;
+  const auto num_rows_to_skip = metadata.num_rows() - num_rows_to_read;
+
+  // Check validity of the last 5 million rows.
+  const auto sequence_start = num_rows_to_skip % num_rows;
+  auto const skipped_col =
+    cudf::test::fixed_width_column_wrapper<int64_t>(
+      it + sequence_start, it + sequence_start + num_rows_to_read, no_nulls());
+  auto const expected = cudf::table_view{{skipped_col}};
+
+  auto const read_opts = cudf::io::orc_reader_options::builder(
+                           cudf::io::source_info{data_buffer.data(), data_buffer.size()})
+                           .use_index(false)
+                           .skip_rows(num_rows_to_skip)
+                           .build();
+  auto reader = cudf::io::chunked_orc_reader(
+    500'000UL /*output limit*/,
+    1'000'000UL /*input limit*/,
+    500'000 /*output granularity, or minimum number of rows for the output chunk*/,
+    read_opts);
+
+  auto num_chunks  = 0;
+  auto read_tables = std::vector<std::unique_ptr<cudf::table>>{};
+  auto tviews      = std::vector<cudf::table_view>{};
+
+  do {
+    auto chunk = reader.read_chunk();
+    ++num_chunks;
+    tviews.emplace_back(chunk.tbl->view());
+    read_tables.emplace_back(std::move(chunk.tbl));
+  } while (reader.has_next());
+
+  auto const read_result = cudf::concatenate(tviews);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view());
+
+  printf("num chunk: %d\n", num_chunks);
+}

From ae665a0ef81cca5423e1ddc543acbb8f41346390 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Sun, 3 Mar 2024 21:48:25 -0800
Subject: [PATCH 157/321] Support `skip_rows` and `num_rows`

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl.cu           | 11 ++++++++---
 cpp/src/io/orc/reader_impl.hpp          | 13 +++++++++----
 cpp/tests/io/orc_chunked_reader_test.cu |  8 +++++++-
 3 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 5f6d02f43f1..a4b46bb49f4 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -1460,6 +1460,8 @@ void reader::impl::prepare_data(int64_t skip_rows,
   // There are no columns in the table.
if (_selected_columns.num_levels() == 0) { return; }

+  std::cout << "call global, skip = " << skip_rows << std::endl;
+
   global_preprocess(skip_rows, num_rows_opt, stripes);

   if (!_chunk_read_data.more_table_chunk_to_output()) {
@@ -1625,7 +1627,10 @@ reader::impl::impl(std::size_t output_size_limit,
   _config{options.get_timestamp_type(),
           options.is_enabled_use_index(),
           options.is_enabled_use_np_dtypes(),
-          options.get_decimal128_columns()},
+          options.get_decimal128_columns(),
+          options.get_skip_rows(),
+          options.get_num_rows(),
+          options.get_stripes()},
   _col_meta{std::make_unique<reader_column_meta>()},
   _sources(std::move(sources)),
   _metadata{_sources, stream},
@@ -1656,7 +1661,7 @@ table_with_metadata reader::impl::read(int64_t skip_rows,

 bool reader::impl::has_next()
 {
   printf("==================query has next \n");
-  prepare_data();
+  prepare_data(_config.skip_rows, _config.num_read_rows, _config.selected_stripes);

   printf("has next: %d\n", (int)_chunk_read_data.has_next());
   return _chunk_read_data.has_next();
@@ -1690,7 +1695,7 @@ table_with_metadata reader::impl::read_chunk()
 #endif
   }

-  prepare_data();
+  prepare_data(_config.skip_rows, _config.num_read_rows, _config.selected_stripes);

   {
     _stream.synchronize();

diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index 3f93541aed5..4a32394c91f 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -190,10 +190,15 @@ class reader::impl {

   // Reader configs
   struct {
-    data_type timestamp_type;  // Override output timestamp resolution
-    bool use_index;            // Enable or disable attempt to use row index for parsing
-    bool use_np_dtypes;        // Enable or disable the conversion to numpy-compatible dtypes
-    std::vector<std::string> decimal128_columns;  // Control decimals conversion
+    data_type timestamp_type;  // override output timestamp resolution
+    bool use_index;            // enable or disable attempt to use row index for parsing
+    bool use_np_dtypes;        // enable or disable the conversion to numpy-compatible dtypes
+    std::vector<std::string> decimal128_columns;  // control decimals conversion
+
+    // User specified reading rows/stripes selection.
+    uint64_t const skip_rows;
+    std::optional<int64_t> num_read_rows;
+    std::vector<std::vector<size_type>> const selected_stripes;
   } const _config;

   // Intermediate data for internal processing.

diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu
index b58ee3692d9..4a67cda8757 100644
--- a/cpp/tests/io/orc_chunked_reader_test.cu
+++ b/cpp/tests/io/orc_chunked_reader_test.cu
@@ -1270,11 +1270,15 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList)
   }
 }

+#define LOCAL_TEST
+
+// This test is extremely heavy, thus it should be disabled by default.
+#ifdef LOCAL_TEST
 TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow)
 {
   using cudf::test::iterators::no_nulls;

-  int64_t constexpr num_rows = 500'000'000l;
+  int64_t constexpr num_rows = 1'000'000'000l;
   int constexpr rows_per_stripe = 1'000'000;
   int constexpr num_reps = 5l;
   int64_t constexpr total_rows = num_rows * num_reps;
   static_assert(total_rows > std::numeric_limits<cudf::size_type>::max());
@@ -1310,6 +1314,7 @@
   int constexpr num_rows_to_read = 5'000'000;
   const auto num_rows_to_skip = metadata.num_rows() - num_rows_to_read;
+  // - 123456 /*just shift the read data region back by a random offset*/;

   // Check validity of the last 5 million rows.
const auto sequence_start = num_rows_to_skip % num_rows; @@ -1345,3 +1350,4 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) printf("num chunk: %d\n", num_chunks); } +#endif From 883ccc04afe9bf79285261e23e2525d35131efec Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 4 Mar 2024 09:43:58 -0800 Subject: [PATCH 158/321] Fix test with very large file Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cu | 27 +++++++++---------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 4a67cda8757..326eaac73b9 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1277,17 +1277,16 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList) TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) { using cudf::test::iterators::no_nulls; + using int64s_col = cudf::test::fixed_width_column_wrapper; - int64_t constexpr num_rows = 1'000'000'000l; + int64_t constexpr num_rows = 500'000'000l; int constexpr rows_per_stripe = 1'000'000; - int constexpr num_reps = 5l; + int constexpr num_reps = 10l; int64_t constexpr total_rows = num_rows * num_reps; static_assert(total_rows > std::numeric_limits::max()); - auto const it = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 127; }); - auto const col = - cudf::test::fixed_width_column_wrapper( - it, it + num_rows); + auto const it = thrust::make_counting_iterator(int64_t{0}); + auto const col = int64s_col(it, it + num_rows); auto const chunk_table = cudf::table_view{{col}}; std::vector data_buffer; @@ -1309,19 +1308,14 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) EXPECT_EQ(metadata.num_rows(), total_rows); EXPECT_EQ(metadata.num_stripes(), total_rows / rows_per_stripe); - printf("start test chunk\n"); - fflush(stdout); - int constexpr num_rows_to_read = 5'000'000; const auto num_rows_to_skip = metadata.num_rows() - num_rows_to_read; // - 123456 /*just shift the read data region back by a random offset*/; // Check validity of the last 5 million rows. 
const auto sequence_start = num_rows_to_skip % num_rows; - auto const skipped_col = - cudf::test::fixed_width_column_wrapper( - it + sequence_start, it + sequence_start + num_rows_to_read, no_nulls()); - auto const expected = cudf::table_view{{skipped_col}}; + auto const skipped_col = int64s_col(it + sequence_start, it + sequence_start + num_rows_to_read); + auto const expected = cudf::table_view{{skipped_col}}; auto const read_opts = cudf::io::orc_reader_options::builder( cudf::io::source_info{data_buffer.data(), data_buffer.size()}) @@ -1329,8 +1323,8 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) .skip_rows(num_rows_to_skip) .build(); auto reader = cudf::io::chunked_orc_reader( - 500'000UL /*output limit*/, - 1'000'000UL /*input limit*/, + 600'000UL * sizeof(int64_t) /*output limit, equal to 600k int64_t rows */, + 8'000'000UL /*input limit, around size of 1 stripe's decoded data */, 500'000 /*output granularity, or minimum number of rows for the output chunk*/, read_opts); @@ -1346,8 +1340,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) } while (reader.has_next()); auto const read_result = cudf::concatenate(tviews); + EXPECT_EQ(num_chunks, 10); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view()); - - printf("num chunk: %d\n", num_chunks); } #endif From 625d0f4826e333e32d9fb0a07801f028fb5cf556 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 4 Mar 2024 10:28:23 -0800 Subject: [PATCH 159/321] Some refactors Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cu | 93 +++++++++++++++++++++---- 1 file changed, 78 insertions(+), 15 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 326eaac73b9..13f0311ab1c 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -58,13 +58,14 @@ auto const temp_env = reinterpret_cast( using int32s_col = cudf::test::fixed_width_column_wrapper; using int64s_col = cudf::test::fixed_width_column_wrapper; +using doubles_col = cudf::test::fixed_width_column_wrapper; using strings_col = cudf::test::strings_column_wrapper; using structs_col = cudf::test::structs_column_wrapper; using int32s_lists_col = cudf::test::lists_column_wrapper; auto write_file(std::vector>& input_columns, std::string const& filename, - bool nullable, + bool nullable = false, std::size_t stripe_size_bytes = cudf::io::default_stripe_size_bytes, cudf::size_type stripe_size_rows = cudf::io::default_stripe_size_rows) { @@ -161,7 +162,7 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadNoData) input_columns.emplace_back(int32s_col{}.release()); input_columns.emplace_back(int64s_col{}.release()); - auto const [expected, filepath] = write_file(input_columns, "chunked_read_empty", false); + auto const [expected, filepath] = write_file(input_columns, "chunked_read_empty"); auto const [result, num_chunks] = chunked_read(filepath, output_limit{1'000}); EXPECT_EQ(num_chunks, 1); EXPECT_EQ(result->num_rows(), 0); @@ -223,7 +224,7 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases) std::vector> input_columns; auto const value_iter = thrust::make_counting_iterator(0); input_columns.emplace_back(int32s_col(value_iter, value_iter + num_rows).release()); - return write_file(input_columns, "chunked_read_simple_boundary", false /*nullable*/); + return write_file(input_columns, "chunked_read_simple_boundary"); }(); // Test with zero limit: everything will be read in one chunk. 
@@ -540,7 +541,7 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsNoNulls) input_columns.emplace_back( std::move(cudf::gather(cudf::table_view{{template_lists}}, gather_map)->release().front())); - return write_file(input_columns, "chunked_read_with_lists_no_null", false /*nullable*/); + return write_file(input_columns, "chunked_read_with_lists_no_null"); }(); // Test with zero limit: everything will be read in one chunk. @@ -948,9 +949,8 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadNullCount) auto const sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); auto const validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 4 != 3; }); - cudf::test::fixed_width_column_wrapper col{sequence, sequence + num_rows, validity}; std::vector> cols; - cols.push_back(col.release()); + cols.push_back(int32s_col{sequence, sequence + num_rows, validity}.release()); auto const expected = std::make_unique(std::move(cols)); auto const filepath = temp_env->get_temp_filepath("chunked_reader_null_count.orc"); @@ -1047,7 +1047,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, SingleFixedWidthColumn) { auto constexpr num_rows = 1'000'000; auto const iter1 = thrust::make_constant_iterator(15); - auto const col1 = cudf::test::fixed_width_column_wrapper(iter1, iter1 + num_rows); + auto const col1 = doubles_col(iter1, iter1 + num_rows); auto const filename = std::string{"single_col_fixed_width"}; auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename)); @@ -1072,10 +1072,10 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumns) auto constexpr num_rows = 1'000'000; auto const iter1 = thrust::make_counting_iterator(0); - auto const col1 = cudf::test::fixed_width_column_wrapper(iter1, iter1 + num_rows); + auto const col1 = int32s_col(iter1, iter1 + num_rows); auto const iter2 = thrust::make_counting_iterator(0); - auto const col2 = cudf::test::fixed_width_column_wrapper(iter2, iter2 + num_rows); + auto const col2 = doubles_col(iter2, iter2 + num_rows); auto const strings = std::vector{"abc", "de", "fghi"}; auto const str_iter = cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { @@ -1270,15 +1270,77 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList) } } +TEST_F(OrcChunkedReaderInputLimitTest, ReadWithRowSelection) +{ + int64_t constexpr num_rows = 100'000'000l; + int constexpr rows_per_stripe = 100'000; + + auto const it = thrust::make_counting_iterator(0); + auto const col = int32s_col(it, it + num_rows); + auto const input = cudf::table_view{{col}}; + + auto const filepath = temp_env->get_temp_filepath("chunk_read_with_row_selection.orc"); + auto const write_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, input) + .stripe_size_rows(rows_per_stripe) + .build(); + cudf::io::write_orc(write_opts); + + // Verify metadata. + auto const metadata = cudf::io::read_orc_metadata(cudf::io::source_info{filepath}); + EXPECT_EQ(metadata.num_rows(), num_rows); + EXPECT_EQ(metadata.num_stripes(), num_rows / rows_per_stripe); + + int constexpr random_val = 123456; + + // Read some random number or rows that is not stripe size. + int constexpr num_rows_to_read = rows_per_stripe * 5 + random_val; + + // Just shift the read data region back by a random offset. 
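+  // The written column is an ascending sequence, so the expected chunk can be rebuilt
+  // directly from the skip offset computed below.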
+ const auto num_rows_to_skip = num_rows - num_rows_to_read - random_val; + + const auto sequence_start = num_rows_to_skip % num_rows; + auto const skipped_col = int32s_col(it + sequence_start, it + sequence_start + num_rows_to_read); + auto const expected = cudf::table_view{{skipped_col}}; + + auto const read_opts = cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}) + .use_index(false) + .skip_rows(num_rows_to_skip) + .num_rows(num_rows_to_read) + .build(); + + auto reader = cudf::io::chunked_orc_reader( + 60'000UL * sizeof(int) /*output limit, equal to 60k rows, less than rows in 1 stripe*/, + rows_per_stripe * sizeof(int) /*input limit, around size of 1 stripe's decoded data*/, + 50'000 /*output granularity, or minimum number of rows for the output chunk*/, + read_opts); + + auto num_chunks = 0; + auto read_tables = std::vector>{}; + auto tviews = std::vector{}; + + do { + auto chunk = reader.read_chunk(); + // Each output chunk should have either exactly 50k rows, or num_rows_to_read % 50k. + EXPECT_TRUE(chunk.tbl->num_rows() == 50000 || + chunk.tbl->num_rows() == num_rows_to_read % 50000); + + tviews.emplace_back(chunk.tbl->view()); + read_tables.emplace_back(std::move(chunk.tbl)); + ++num_chunks; + } while (reader.has_next()); + + auto const read_result = cudf::concatenate(tviews); + EXPECT_EQ(num_chunks, 13); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view()); +} + #define LOCAL_TEST // This test is extremely heavy, thus it should be disabled by default. #ifdef LOCAL_TEST TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) { - using cudf::test::iterators::no_nulls; - using int64s_col = cudf::test::fixed_width_column_wrapper; - int64_t constexpr num_rows = 500'000'000l; int constexpr rows_per_stripe = 1'000'000; int constexpr num_reps = 10l; @@ -1302,15 +1364,15 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) } } - // Test reading the metadata + // Verify metadata. auto const metadata = cudf::io::read_orc_metadata(cudf::io::source_info{data_buffer.data(), data_buffer.size()}); EXPECT_EQ(metadata.num_rows(), total_rows); EXPECT_EQ(metadata.num_stripes(), total_rows / rows_per_stripe); int constexpr num_rows_to_read = 5'000'000; - const auto num_rows_to_skip = metadata.num_rows() - num_rows_to_read; - // - 123456 /*just shift the read data region back by a random offset*/; + const auto num_rows_to_skip = metadata.num_rows() - num_rows_to_read - + 123456 /*just shift the read data region back by a random offset*/; // Check validity of the last 5 million rows. 
const auto sequence_start = num_rows_to_skip % num_rows; @@ -1321,6 +1383,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) cudf::io::source_info{data_buffer.data(), data_buffer.size()}) .use_index(false) .skip_rows(num_rows_to_skip) + .num_rows(num_rows_to_read) .build(); auto reader = cudf::io::chunked_orc_reader( 600'000UL * sizeof(int64_t) /*output limit, equal to 600k int64_t rows */, From 974bb7faf25c33b21d8409158e5f80a07023560e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 4 Mar 2024 11:06:52 -0800 Subject: [PATCH 160/321] Update debug info Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index a4b46bb49f4..1705369e7dd 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -1552,11 +1552,18 @@ table_with_metadata reader::impl::make_output_chunk() #endif - { + if (!_chunk_read_data.has_next()) { + static int count{0}; + count++; _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << "done, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + std::cout << "complete, " << count << ", peak_memory_usage: " << peak_mem + << " , MB = " << (peak_mem * 1.0) / (1024.0 * 1024.0) << std::endl; + } else { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << "done, partial, peak_memory_usage: " << peak_mem + << " , MB = " << (peak_mem * 1.0) / (1024.0 * 1024.0) << std::endl; } return {std::move(out_table), _out_metadata}; From bdb586ead08f044119b93d55e654e00b7f31b575 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 4 Mar 2024 11:06:59 -0800 Subject: [PATCH 161/321] Add a temporary test Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cu | 384 ++++++++++++++++++++++-- 1 file changed, 359 insertions(+), 25 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 13f0311ab1c..3a2ace205b4 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1182,7 +1182,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, ListType) TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList) { - int constexpr num_rows = 50'000'000; + int constexpr num_rows = 1'000'000; int constexpr list_size = 4; int constexpr str_size = 3; @@ -1241,33 +1241,367 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList) auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename)); auto const input = cudf::table_view{{*lists_col, *str_col, *double_col}}; - // Although we set `stripe_size_rows` to be very large, the writer only write - // 250k rows (top level) per stripe due to having nested type. - // Thus, we have 200 stripes in total. 
- input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows); + for (int iters = 1; iters <= 100; ++iters) { + { + auto const write_opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{test_files[0]}) + .stripe_size_rows(cudf::io::default_stripe_size_rows) + .build(); + + auto writer = cudf::io::orc_chunked_writer(write_opts); + for (int i = 0; i < iters; ++i) { + writer.write(input); + } + } - { - int constexpr expected[] = {8, 8, 6}; - input_limit_test_read( - __LINE__, test_files, input, output_limit{0UL}, input_limit{128 * 1024 * 1024UL}, expected); - } + // Although we set `stripe_size_rows` to be very large, the writer only write + // 250k rows (top level) per stripe due to having nested type. + // Thus, we have 200 stripes in total. + // input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows); - { - int constexpr expected[] = {16, 15, 17}; - input_limit_test_read(__LINE__, - test_files, - input, - output_limit{128 * 1024 * 1024UL}, - input_limit{128 * 1024 * 1024UL}, - expected); - } + if (0) { + int constexpr expected[] = {8, 8, 6}; + auto const [result, num_chunks] = + chunked_read(test_files[0], output_limit{0UL}, input_limit{128 * 1024 * 1024UL}); + EXPECT_EQ(expected[0], num_chunks); + printf("num_chunks: %d\n", (int)num_chunks); - // TODO: remove - { - int constexpr expected[] = {1, 1, 1}; - input_limit_test_read( - __LINE__, test_files, input, output_limit{0UL}, input_limit{0UL}, expected); - } + // TODO: equal + // CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, input); + + // input_limit_test_read( + // __LINE__, test_files, input, output_limit{0UL}, input_limit{128 * 1024 * 1024UL}, + // expected); + } + + // clang-format off + /* +complete, 1, peak_memory_usage: 24870400 , MB = 23.7183 +complete, 2, peak_memory_usage: 49739984 , MB = 47.4357 +complete, 3, peak_memory_usage: 74609600 , MB = 71.1533 +complete, 4, peak_memory_usage: 99479184 , MB = 94.8707 +complete, 5, peak_memory_usage: 124348528 , MB = 118.588 +complete, 6, peak_memory_usage: 149218128 , MB = 142.305 +complete, 7, peak_memory_usage: 174087728 , MB = 166.023 +complete, 8, peak_memory_usage: 198957312 , MB = 189.74 +complete, 9, peak_memory_usage: 223826672 , MB = 213.458 +complete, 10, peak_memory_usage: 248696256 , MB = 237.175 +complete, 11, peak_memory_usage: 224912432 , MB = 214.493 +complete, 12, peak_memory_usage: 225455472 , MB = 215.011 +complete, 13, peak_memory_usage: 225998192 , MB = 215.529 +complete, 14, peak_memory_usage: 226541072 , MB = 216.046 +complete, 15, peak_memory_usage: 227084080 , MB = 216.564 +complete, 16, peak_memory_usage: 227626832 , MB = 217.082 +complete, 17, peak_memory_usage: 228169712 , MB = 217.6 +complete, 18, peak_memory_usage: 228712592 , MB = 218.117 +complete, 19, peak_memory_usage: 248696256 , MB = 237.175 +complete, 20, peak_memory_usage: 229798352 , MB = 219.153 +complete, 21, peak_memory_usage: 230341600 , MB = 219.671 +complete, 22, peak_memory_usage: 230884096 , MB = 220.188 +complete, 23, peak_memory_usage: 231427152 , MB = 220.706 +complete, 24, peak_memory_usage: 231970080 , MB = 221.224 +complete, 25, peak_memory_usage: 232513136 , MB = 221.742 +complete, 26, peak_memory_usage: 233056016 , MB = 222.26 +complete, 27, peak_memory_usage: 233598624 , MB = 222.777 +complete, 28, peak_memory_usage: 248696256 , MB = 237.175 +complete, 29, peak_memory_usage: 234684480 , MB = 223.813 +complete, 30, peak_memory_usage: 235227984 , MB = 224.331 +complete, 31, peak_memory_usage: 235770208 , MB = 224.848 
+complete, 32, peak_memory_usage: 236313040 , MB = 225.366 +complete, 33, peak_memory_usage: 236855888 , MB = 225.883 +complete, 34, peak_memory_usage: 237399504 , MB = 226.402 +complete, 35, peak_memory_usage: 237941776 , MB = 226.919 +complete, 36, peak_memory_usage: 238485504 , MB = 227.438 +complete, 37, peak_memory_usage: 248696256 , MB = 237.175 +complete, 38, peak_memory_usage: 239570400 , MB = 228.472 +complete, 39, peak_memory_usage: 240113728 , MB = 228.99 +complete, 40, peak_memory_usage: 240656512 , MB = 229.508 +complete, 41, peak_memory_usage: 241198848 , MB = 230.025 +complete, 42, peak_memory_usage: 241742608 , MB = 230.544 +complete, 43, peak_memory_usage: 242285536 , MB = 231.061 +complete, 44, peak_memory_usage: 242828368 , MB = 231.579 +complete, 45, peak_memory_usage: 243371008 , MB = 232.097 +complete, 46, peak_memory_usage: 248696256 , MB = 237.175 +complete, 47, peak_memory_usage: 244456448 , MB = 233.132 +complete, 48, peak_memory_usage: 245000016 , MB = 233.65 +complete, 49, peak_memory_usage: 245542256 , MB = 234.167 +complete, 50, peak_memory_usage: 246085472 , MB = 234.685 +complete, 51, peak_memory_usage: 246628768 , MB = 235.204 +complete, 52, peak_memory_usage: 247171088 , MB = 235.721 +complete, 53, peak_memory_usage: 247714240 , MB = 236.239 +complete, 54, peak_memory_usage: 248257248 , MB = 236.757 +complete, 55, peak_memory_usage: 248799808 , MB = 237.274 +complete, 56, peak_memory_usage: 249342880 , MB = 237.792 +complete, 57, peak_memory_usage: 249885808 , MB = 238.31 +complete, 58, peak_memory_usage: 250428960 , MB = 238.828 +complete, 59, peak_memory_usage: 250971984 , MB = 239.346 +complete, 60, peak_memory_usage: 251514080 , MB = 239.863 +complete, 61, peak_memory_usage: 252057616 , MB = 240.381 +complete, 62, peak_memory_usage: 252599968 , MB = 240.898 +complete, 63, peak_memory_usage: 253142992 , MB = 241.416 +complete, 64, peak_memory_usage: 253686064 , MB = 241.934 +complete, 65, peak_memory_usage: 254227872 , MB = 242.451 +complete, 66, peak_memory_usage: 254771152 , MB = 242.969 +complete, 67, peak_memory_usage: 255313872 , MB = 243.486 +complete, 68, peak_memory_usage: 255856912 , MB = 244.004 +complete, 69, peak_memory_usage: 256400048 , MB = 244.522 +complete, 70, peak_memory_usage: 256943040 , MB = 245.04 +complete, 71, peak_memory_usage: 257485520 , MB = 245.557 +complete, 72, peak_memory_usage: 258029520 , MB = 246.076 +complete, 73, peak_memory_usage: 258572064 , MB = 246.594 +complete, 74, peak_memory_usage: 259115328 , MB = 247.112 +complete, 75, peak_memory_usage: 259657776 , MB = 247.629 +complete, 76, peak_memory_usage: 260200864 , MB = 248.147 +complete, 77, peak_memory_usage: 260742832 , MB = 248.664 +complete, 78, peak_memory_usage: 261286496 , MB = 249.182 +complete, 79, peak_memory_usage: 261828432 , MB = 249.699 +complete, 80, peak_memory_usage: 262371920 , MB = 250.217 +complete, 81, peak_memory_usage: 262914432 , MB = 250.735 +complete, 82, peak_memory_usage: 263458960 , MB = 251.254 +complete, 83, peak_memory_usage: 264000816 , MB = 251.771 +complete, 84, peak_memory_usage: 264543056 , MB = 252.288 +complete, 85, peak_memory_usage: 265085984 , MB = 252.806 +complete, 86, peak_memory_usage: 265630256 , MB = 253.325 +complete, 87, peak_memory_usage: 266171696 , MB = 253.841 +complete, 88, peak_memory_usage: 266714432 , MB = 254.359 +complete, 89, peak_memory_usage: 267257392 , MB = 254.877 +complete, 90, peak_memory_usage: 267800176 , MB = 255.394 +complete, 91, peak_memory_usage: 268343536 , MB = 255.912 +complete, 92, 
peak_memory_usage: 268886256 , MB = 256.43 +complete, 93, peak_memory_usage: 269429968 , MB = 256.948 +complete, 94, peak_memory_usage: 269971904 , MB = 257.465 +complete, 95, peak_memory_usage: 270516528 , MB = 257.985 +complete, 96, peak_memory_usage: 271058992 , MB = 258.502 +complete, 97, peak_memory_usage: 271601616 , MB = 259.019 +complete, 98, peak_memory_usage: 272145536 , MB = 259.538 +complete, 99, peak_memory_usage: 272686496 , MB = 260.054 +complete, 100, peak_memory_usage: 273230448 , MB = 260.573 + +num_chunks: 1 +num_chunks: 1 +num_chunks: 1 +num_chunks: 1 +num_chunks: 1 +num_chunks: 1 +num_chunks: 1 +num_chunks: 1 +num_chunks: 1 +num_chunks: 1 +num_chunks: 2 +num_chunks: 2 +num_chunks: 2 +num_chunks: 2 +num_chunks: 2 +num_chunks: 2 +num_chunks: 2 +num_chunks: 2 +num_chunks: 2 +num_chunks: 3 +num_chunks: 3 +num_chunks: 3 +num_chunks: 3 +num_chunks: 3 +num_chunks: 3 +num_chunks: 3 +num_chunks: 3 +num_chunks: 3 +num_chunks: 4 +num_chunks: 4 +num_chunks: 4 +num_chunks: 4 +num_chunks: 4 +num_chunks: 4 +num_chunks: 4 +num_chunks: 4 +num_chunks: 4 +num_chunks: 5 +num_chunks: 5 +num_chunks: 5 +num_chunks: 5 +num_chunks: 5 +num_chunks: 5 +num_chunks: 5 +num_chunks: 5 +num_chunks: 5 +num_chunks: 6 +num_chunks: 6 +num_chunks: 6 +num_chunks: 6 +num_chunks: 6 +num_chunks: 6 +num_chunks: 6 +num_chunks: 6 +num_chunks: 6 +num_chunks: 7 +num_chunks: 7 +num_chunks: 7 +num_chunks: 7 +num_chunks: 7 +num_chunks: 7 +num_chunks: 7 +num_chunks: 7 +num_chunks: 7 +num_chunks: 8 +num_chunks: 8 +num_chunks: 8 +num_chunks: 8 +num_chunks: 8 +num_chunks: 8 +num_chunks: 8 +num_chunks: 8 +num_chunks: 8 +num_chunks: 9 +num_chunks: 9 +num_chunks: 9 +num_chunks: 9 +num_chunks: 9 +num_chunks: 9 +num_chunks: 9 +num_chunks: 9 +num_chunks: 9 +num_chunks: 10 +num_chunks: 10 +num_chunks: 10 +num_chunks: 10 +num_chunks: 10 +num_chunks: 10 +num_chunks: 10 +num_chunks: 10 +num_chunks: 10 +num_chunks: 11 +num_chunks: 11 +num_chunks: 11 +num_chunks: 11 +num_chunks: 11 +num_chunks: 11 +num_chunks: 11 +num_chunks: 11 +num_chunks: 11 + + */ + // clang-format on + + printf("\n\n\n\n read full\n"); + fflush(stdout); + // TODO: remove + { + int constexpr expected[] = {1, 1, 1}; + // input_limit_test_read( + // __LINE__, test_files, input, output_limit{0UL}, input_limit{0UL}, expected); + auto const [result, num_chunks] = + chunked_read(test_files[0], output_limit{0UL}, input_limit{0UL}); + EXPECT_EQ(expected[0], num_chunks); + printf("num_chunks: %d\n", (int)num_chunks); + // TODO: equal + // CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, input); + } + + // clang-format off +/* +complete, 1, peak_memory_usage: 24870400 , MB = 23.7183 +complete, 2, peak_memory_usage: 49739984 , MB = 47.4357 +complete, 3, peak_memory_usage: 74609600 , MB = 71.1533 +complete, 4, peak_memory_usage: 99479184 , MB = 94.8707 +complete, 5, peak_memory_usage: 124348528 , MB = 118.588 +complete, 6, peak_memory_usage: 149218128 , MB = 142.305 +complete, 7, peak_memory_usage: 174087728 , MB = 166.023 +complete, 8, peak_memory_usage: 198957312 , MB = 189.74 +complete, 9, peak_memory_usage: 223826672 , MB = 213.458 +complete, 10, peak_memory_usage: 248696256 , MB = 237.175 +complete, 11, peak_memory_usage: 273565872 , MB = 260.893 +complete, 12, peak_memory_usage: 298435456 , MB = 284.61 +complete, 13, peak_memory_usage: 323304800 , MB = 308.327 +complete, 14, peak_memory_usage: 348174400 , MB = 332.045 +complete, 15, peak_memory_usage: 373044000 , MB = 355.762 +complete, 16, peak_memory_usage: 397913584 , MB = 379.48 +complete, 17, peak_memory_usage: 
422782944 , MB = 403.197 +complete, 18, peak_memory_usage: 447652528 , MB = 426.915 +complete, 19, peak_memory_usage: 472522144 , MB = 450.632 +complete, 20, peak_memory_usage: 497391728 , MB = 474.35 +complete, 21, peak_memory_usage: 522261072 , MB = 498.067 +complete, 22, peak_memory_usage: 547130672 , MB = 521.784 +complete, 23, peak_memory_usage: 572000272 , MB = 545.502 +complete, 24, peak_memory_usage: 596869856 , MB = 569.219 +complete, 25, peak_memory_usage: 621739216 , MB = 592.937 +complete, 26, peak_memory_usage: 646608800 , MB = 616.654 +complete, 27, peak_memory_usage: 671478416 , MB = 640.372 +complete, 28, peak_memory_usage: 696348000 , MB = 664.089 +complete, 29, peak_memory_usage: 721217344 , MB = 687.806 +complete, 30, peak_memory_usage: 746086944 , MB = 711.524 +complete, 31, peak_memory_usage: 770956544 , MB = 735.241 +complete, 32, peak_memory_usage: 795826128 , MB = 758.959 +complete, 33, peak_memory_usage: 820695488 , MB = 782.676 +complete, 34, peak_memory_usage: 845565072 , MB = 806.394 +complete, 35, peak_memory_usage: 870434688 , MB = 830.111 +complete, 36, peak_memory_usage: 895304272 , MB = 853.829 +complete, 37, peak_memory_usage: 920173616 , MB = 877.546 +complete, 38, peak_memory_usage: 945043216 , MB = 901.263 +complete, 39, peak_memory_usage: 969912816 , MB = 924.981 +complete, 40, peak_memory_usage: 994782400 , MB = 948.698 +complete, 41, peak_memory_usage: 1019651760 , MB = 972.416 +complete, 42, peak_memory_usage: 1044521344 , MB = 996.133 +complete, 43, peak_memory_usage: 1069390960 , MB = 1019.85 +complete, 44, peak_memory_usage: 1094260544 , MB = 1043.57 +complete, 45, peak_memory_usage: 1119129888 , MB = 1067.29 +complete, 46, peak_memory_usage: 1143999488 , MB = 1091 +complete, 47, peak_memory_usage: 1168869088 , MB = 1114.72 +complete, 48, peak_memory_usage: 1193738672 , MB = 1138.44 +complete, 49, peak_memory_usage: 1218608032 , MB = 1162.16 +complete, 50, peak_memory_usage: 1243477616 , MB = 1185.87 +complete, 51, peak_memory_usage: 1268347232 , MB = 1209.59 +complete, 52, peak_memory_usage: 1293216816 , MB = 1233.31 +complete, 53, peak_memory_usage: 1318086160 , MB = 1257.02 +complete, 54, peak_memory_usage: 1342955760 , MB = 1280.74 +complete, 55, peak_memory_usage: 1367825360 , MB = 1304.46 +complete, 56, peak_memory_usage: 1392694944 , MB = 1328.18 +complete, 57, peak_memory_usage: 1417564560 , MB = 1351.89 +complete, 58, peak_memory_usage: 1442433888 , MB = 1375.61 +complete, 59, peak_memory_usage: 1467303504 , MB = 1399.33 +complete, 60, peak_memory_usage: 1492173088 , MB = 1423.05 +complete, 61, peak_memory_usage: 1517042688 , MB = 1446.76 +complete, 62, peak_memory_usage: 1541912032 , MB = 1470.48 +complete, 63, peak_memory_usage: 1566781632 , MB = 1494.2 +complete, 64, peak_memory_usage: 1591651216 , MB = 1517.92 +complete, 65, peak_memory_usage: 1616520832 , MB = 1541.63 +complete, 66, peak_memory_usage: 1641390160 , MB = 1565.35 +complete, 67, peak_memory_usage: 1666259776 , MB = 1589.07 +complete, 68, peak_memory_usage: 1691129360 , MB = 1612.79 +complete, 69, peak_memory_usage: 1715998960 , MB = 1636.5 +complete, 70, peak_memory_usage: 1740868304 , MB = 1660.22 +complete, 71, peak_memory_usage: 1765737904 , MB = 1683.94 +complete, 72, peak_memory_usage: 1790607488 , MB = 1707.66 +complete, 73, peak_memory_usage: 1815477104 , MB = 1731.37 +complete, 74, peak_memory_usage: 1840346432 , MB = 1755.09 +complete, 75, peak_memory_usage: 1865216048 , MB = 1778.81 +complete, 76, peak_memory_usage: 1890085632 , MB = 1802.53 +complete, 77, 
peak_memory_usage: 1914955232 , MB = 1826.24 +complete, 78, peak_memory_usage: 1939824576 , MB = 1849.96 +complete, 79, peak_memory_usage: 1964694176 , MB = 1873.68 +complete, 80, peak_memory_usage: 1989563760 , MB = 1897.4 +complete, 81, peak_memory_usage: 2014433376 , MB = 1921.11 +complete, 82, peak_memory_usage: 2039302704 , MB = 1944.83 +complete, 83, peak_memory_usage: 2064172320 , MB = 1968.55 +complete, 84, peak_memory_usage: 2089041904 , MB = 1992.27 +complete, 85, peak_memory_usage: 2113911504 , MB = 2015.98 +complete, 86, peak_memory_usage: 2138780848 , MB = 2039.7 +complete, 87, peak_memory_usage: 2163650448 , MB = 2063.42 +complete, 88, peak_memory_usage: 2188520032 , MB = 2087.14 +complete, 89, peak_memory_usage: 2213389648 , MB = 2110.85 +complete, 90, peak_memory_usage: 2238258976 , MB = 2134.57 +complete, 91, peak_memory_usage: 2263128592 , MB = 2158.29 +complete, 92, peak_memory_usage: 2287998176 , MB = 2182.01 +complete, 93, peak_memory_usage: 2312867776 , MB = 2205.72 +complete, 94, peak_memory_usage: 2337737120 , MB = 2229.44 +complete, 95, peak_memory_usage: 2362606720 , MB = 2253.16 +complete, 96, peak_memory_usage: 2387476304 , MB = 2276.87 +complete, 97, peak_memory_usage: 2412345920 , MB = 2300.59 +complete, 98, peak_memory_usage: 2437215248 , MB = 2324.31 +complete, 99, peak_memory_usage: 2462084864 , MB = 2348.03 +complete, 100, peak_memory_usage: 2486954448 , MB = 2371.74 +*/ + // clang-format on + + } // end iters } TEST_F(OrcChunkedReaderInputLimitTest, ReadWithRowSelection) From 1bee174cd9f297c30f2034b4b246588a5a0514ce Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 4 Mar 2024 11:07:05 -0800 Subject: [PATCH 162/321] Revert "Add a temporary test" This reverts commit bdb586ead08f044119b93d55e654e00b7f31b575. --- cpp/tests/io/orc_chunked_reader_test.cu | 384 ++---------------------- 1 file changed, 25 insertions(+), 359 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 3a2ace205b4..13f0311ab1c 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1182,7 +1182,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, ListType) TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList) { - int constexpr num_rows = 1'000'000; + int constexpr num_rows = 50'000'000; int constexpr list_size = 4; int constexpr str_size = 3; @@ -1241,367 +1241,33 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList) auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename)); auto const input = cudf::table_view{{*lists_col, *str_col, *double_col}}; - for (int iters = 1; iters <= 100; ++iters) { - { - auto const write_opts = - cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{test_files[0]}) - .stripe_size_rows(cudf::io::default_stripe_size_rows) - .build(); - - auto writer = cudf::io::orc_chunked_writer(write_opts); - for (int i = 0; i < iters; ++i) { - writer.write(input); - } - } - - // Although we set `stripe_size_rows` to be very large, the writer only write - // 250k rows (top level) per stripe due to having nested type. - // Thus, we have 200 stripes in total. 
- // input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows); - - if (0) { - int constexpr expected[] = {8, 8, 6}; - auto const [result, num_chunks] = - chunked_read(test_files[0], output_limit{0UL}, input_limit{128 * 1024 * 1024UL}); - EXPECT_EQ(expected[0], num_chunks); - printf("num_chunks: %d\n", (int)num_chunks); - - // TODO: equal - // CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, input); + // Although we set `stripe_size_rows` to be very large, the writer only write + // 250k rows (top level) per stripe due to having nested type. + // Thus, we have 200 stripes in total. + input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows); - // input_limit_test_read( - // __LINE__, test_files, input, output_limit{0UL}, input_limit{128 * 1024 * 1024UL}, - // expected); - } + { + int constexpr expected[] = {8, 8, 6}; + input_limit_test_read( + __LINE__, test_files, input, output_limit{0UL}, input_limit{128 * 1024 * 1024UL}, expected); + } - // clang-format off - /* -complete, 1, peak_memory_usage: 24870400 , MB = 23.7183 -complete, 2, peak_memory_usage: 49739984 , MB = 47.4357 -complete, 3, peak_memory_usage: 74609600 , MB = 71.1533 -complete, 4, peak_memory_usage: 99479184 , MB = 94.8707 -complete, 5, peak_memory_usage: 124348528 , MB = 118.588 -complete, 6, peak_memory_usage: 149218128 , MB = 142.305 -complete, 7, peak_memory_usage: 174087728 , MB = 166.023 -complete, 8, peak_memory_usage: 198957312 , MB = 189.74 -complete, 9, peak_memory_usage: 223826672 , MB = 213.458 -complete, 10, peak_memory_usage: 248696256 , MB = 237.175 -complete, 11, peak_memory_usage: 224912432 , MB = 214.493 -complete, 12, peak_memory_usage: 225455472 , MB = 215.011 -complete, 13, peak_memory_usage: 225998192 , MB = 215.529 -complete, 14, peak_memory_usage: 226541072 , MB = 216.046 -complete, 15, peak_memory_usage: 227084080 , MB = 216.564 -complete, 16, peak_memory_usage: 227626832 , MB = 217.082 -complete, 17, peak_memory_usage: 228169712 , MB = 217.6 -complete, 18, peak_memory_usage: 228712592 , MB = 218.117 -complete, 19, peak_memory_usage: 248696256 , MB = 237.175 -complete, 20, peak_memory_usage: 229798352 , MB = 219.153 -complete, 21, peak_memory_usage: 230341600 , MB = 219.671 -complete, 22, peak_memory_usage: 230884096 , MB = 220.188 -complete, 23, peak_memory_usage: 231427152 , MB = 220.706 -complete, 24, peak_memory_usage: 231970080 , MB = 221.224 -complete, 25, peak_memory_usage: 232513136 , MB = 221.742 -complete, 26, peak_memory_usage: 233056016 , MB = 222.26 -complete, 27, peak_memory_usage: 233598624 , MB = 222.777 -complete, 28, peak_memory_usage: 248696256 , MB = 237.175 -complete, 29, peak_memory_usage: 234684480 , MB = 223.813 -complete, 30, peak_memory_usage: 235227984 , MB = 224.331 -complete, 31, peak_memory_usage: 235770208 , MB = 224.848 -complete, 32, peak_memory_usage: 236313040 , MB = 225.366 -complete, 33, peak_memory_usage: 236855888 , MB = 225.883 -complete, 34, peak_memory_usage: 237399504 , MB = 226.402 -complete, 35, peak_memory_usage: 237941776 , MB = 226.919 -complete, 36, peak_memory_usage: 238485504 , MB = 227.438 -complete, 37, peak_memory_usage: 248696256 , MB = 237.175 -complete, 38, peak_memory_usage: 239570400 , MB = 228.472 -complete, 39, peak_memory_usage: 240113728 , MB = 228.99 -complete, 40, peak_memory_usage: 240656512 , MB = 229.508 -complete, 41, peak_memory_usage: 241198848 , MB = 230.025 -complete, 42, peak_memory_usage: 241742608 , MB = 230.544 -complete, 43, peak_memory_usage: 242285536 , MB = 231.061 -complete, 44, 
peak_memory_usage: 242828368 , MB = 231.579 -complete, 45, peak_memory_usage: 243371008 , MB = 232.097 -complete, 46, peak_memory_usage: 248696256 , MB = 237.175 -complete, 47, peak_memory_usage: 244456448 , MB = 233.132 -complete, 48, peak_memory_usage: 245000016 , MB = 233.65 -complete, 49, peak_memory_usage: 245542256 , MB = 234.167 -complete, 50, peak_memory_usage: 246085472 , MB = 234.685 -complete, 51, peak_memory_usage: 246628768 , MB = 235.204 -complete, 52, peak_memory_usage: 247171088 , MB = 235.721 -complete, 53, peak_memory_usage: 247714240 , MB = 236.239 -complete, 54, peak_memory_usage: 248257248 , MB = 236.757 -complete, 55, peak_memory_usage: 248799808 , MB = 237.274 -complete, 56, peak_memory_usage: 249342880 , MB = 237.792 -complete, 57, peak_memory_usage: 249885808 , MB = 238.31 -complete, 58, peak_memory_usage: 250428960 , MB = 238.828 -complete, 59, peak_memory_usage: 250971984 , MB = 239.346 -complete, 60, peak_memory_usage: 251514080 , MB = 239.863 -complete, 61, peak_memory_usage: 252057616 , MB = 240.381 -complete, 62, peak_memory_usage: 252599968 , MB = 240.898 -complete, 63, peak_memory_usage: 253142992 , MB = 241.416 -complete, 64, peak_memory_usage: 253686064 , MB = 241.934 -complete, 65, peak_memory_usage: 254227872 , MB = 242.451 -complete, 66, peak_memory_usage: 254771152 , MB = 242.969 -complete, 67, peak_memory_usage: 255313872 , MB = 243.486 -complete, 68, peak_memory_usage: 255856912 , MB = 244.004 -complete, 69, peak_memory_usage: 256400048 , MB = 244.522 -complete, 70, peak_memory_usage: 256943040 , MB = 245.04 -complete, 71, peak_memory_usage: 257485520 , MB = 245.557 -complete, 72, peak_memory_usage: 258029520 , MB = 246.076 -complete, 73, peak_memory_usage: 258572064 , MB = 246.594 -complete, 74, peak_memory_usage: 259115328 , MB = 247.112 -complete, 75, peak_memory_usage: 259657776 , MB = 247.629 -complete, 76, peak_memory_usage: 260200864 , MB = 248.147 -complete, 77, peak_memory_usage: 260742832 , MB = 248.664 -complete, 78, peak_memory_usage: 261286496 , MB = 249.182 -complete, 79, peak_memory_usage: 261828432 , MB = 249.699 -complete, 80, peak_memory_usage: 262371920 , MB = 250.217 -complete, 81, peak_memory_usage: 262914432 , MB = 250.735 -complete, 82, peak_memory_usage: 263458960 , MB = 251.254 -complete, 83, peak_memory_usage: 264000816 , MB = 251.771 -complete, 84, peak_memory_usage: 264543056 , MB = 252.288 -complete, 85, peak_memory_usage: 265085984 , MB = 252.806 -complete, 86, peak_memory_usage: 265630256 , MB = 253.325 -complete, 87, peak_memory_usage: 266171696 , MB = 253.841 -complete, 88, peak_memory_usage: 266714432 , MB = 254.359 -complete, 89, peak_memory_usage: 267257392 , MB = 254.877 -complete, 90, peak_memory_usage: 267800176 , MB = 255.394 -complete, 91, peak_memory_usage: 268343536 , MB = 255.912 -complete, 92, peak_memory_usage: 268886256 , MB = 256.43 -complete, 93, peak_memory_usage: 269429968 , MB = 256.948 -complete, 94, peak_memory_usage: 269971904 , MB = 257.465 -complete, 95, peak_memory_usage: 270516528 , MB = 257.985 -complete, 96, peak_memory_usage: 271058992 , MB = 258.502 -complete, 97, peak_memory_usage: 271601616 , MB = 259.019 -complete, 98, peak_memory_usage: 272145536 , MB = 259.538 -complete, 99, peak_memory_usage: 272686496 , MB = 260.054 -complete, 100, peak_memory_usage: 273230448 , MB = 260.573 - -num_chunks: 1 -num_chunks: 1 -num_chunks: 1 -num_chunks: 1 -num_chunks: 1 -num_chunks: 1 -num_chunks: 1 -num_chunks: 1 -num_chunks: 1 -num_chunks: 1 -num_chunks: 2 -num_chunks: 2 -num_chunks: 2 -num_chunks: 
2 -num_chunks: 2 -num_chunks: 2 -num_chunks: 2 -num_chunks: 2 -num_chunks: 2 -num_chunks: 3 -num_chunks: 3 -num_chunks: 3 -num_chunks: 3 -num_chunks: 3 -num_chunks: 3 -num_chunks: 3 -num_chunks: 3 -num_chunks: 3 -num_chunks: 4 -num_chunks: 4 -num_chunks: 4 -num_chunks: 4 -num_chunks: 4 -num_chunks: 4 -num_chunks: 4 -num_chunks: 4 -num_chunks: 4 -num_chunks: 5 -num_chunks: 5 -num_chunks: 5 -num_chunks: 5 -num_chunks: 5 -num_chunks: 5 -num_chunks: 5 -num_chunks: 5 -num_chunks: 5 -num_chunks: 6 -num_chunks: 6 -num_chunks: 6 -num_chunks: 6 -num_chunks: 6 -num_chunks: 6 -num_chunks: 6 -num_chunks: 6 -num_chunks: 6 -num_chunks: 7 -num_chunks: 7 -num_chunks: 7 -num_chunks: 7 -num_chunks: 7 -num_chunks: 7 -num_chunks: 7 -num_chunks: 7 -num_chunks: 7 -num_chunks: 8 -num_chunks: 8 -num_chunks: 8 -num_chunks: 8 -num_chunks: 8 -num_chunks: 8 -num_chunks: 8 -num_chunks: 8 -num_chunks: 8 -num_chunks: 9 -num_chunks: 9 -num_chunks: 9 -num_chunks: 9 -num_chunks: 9 -num_chunks: 9 -num_chunks: 9 -num_chunks: 9 -num_chunks: 9 -num_chunks: 10 -num_chunks: 10 -num_chunks: 10 -num_chunks: 10 -num_chunks: 10 -num_chunks: 10 -num_chunks: 10 -num_chunks: 10 -num_chunks: 10 -num_chunks: 11 -num_chunks: 11 -num_chunks: 11 -num_chunks: 11 -num_chunks: 11 -num_chunks: 11 -num_chunks: 11 -num_chunks: 11 -num_chunks: 11 - - */ - // clang-format on - - printf("\n\n\n\n read full\n"); - fflush(stdout); - // TODO: remove - { - int constexpr expected[] = {1, 1, 1}; - // input_limit_test_read( - // __LINE__, test_files, input, output_limit{0UL}, input_limit{0UL}, expected); - auto const [result, num_chunks] = - chunked_read(test_files[0], output_limit{0UL}, input_limit{0UL}); - EXPECT_EQ(expected[0], num_chunks); - printf("num_chunks: %d\n", (int)num_chunks); - // TODO: equal - // CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, input); - } + { + int constexpr expected[] = {16, 15, 17}; + input_limit_test_read(__LINE__, + test_files, + input, + output_limit{128 * 1024 * 1024UL}, + input_limit{128 * 1024 * 1024UL}, + expected); + } - // clang-format off -/* -complete, 1, peak_memory_usage: 24870400 , MB = 23.7183 -complete, 2, peak_memory_usage: 49739984 , MB = 47.4357 -complete, 3, peak_memory_usage: 74609600 , MB = 71.1533 -complete, 4, peak_memory_usage: 99479184 , MB = 94.8707 -complete, 5, peak_memory_usage: 124348528 , MB = 118.588 -complete, 6, peak_memory_usage: 149218128 , MB = 142.305 -complete, 7, peak_memory_usage: 174087728 , MB = 166.023 -complete, 8, peak_memory_usage: 198957312 , MB = 189.74 -complete, 9, peak_memory_usage: 223826672 , MB = 213.458 -complete, 10, peak_memory_usage: 248696256 , MB = 237.175 -complete, 11, peak_memory_usage: 273565872 , MB = 260.893 -complete, 12, peak_memory_usage: 298435456 , MB = 284.61 -complete, 13, peak_memory_usage: 323304800 , MB = 308.327 -complete, 14, peak_memory_usage: 348174400 , MB = 332.045 -complete, 15, peak_memory_usage: 373044000 , MB = 355.762 -complete, 16, peak_memory_usage: 397913584 , MB = 379.48 -complete, 17, peak_memory_usage: 422782944 , MB = 403.197 -complete, 18, peak_memory_usage: 447652528 , MB = 426.915 -complete, 19, peak_memory_usage: 472522144 , MB = 450.632 -complete, 20, peak_memory_usage: 497391728 , MB = 474.35 -complete, 21, peak_memory_usage: 522261072 , MB = 498.067 -complete, 22, peak_memory_usage: 547130672 , MB = 521.784 -complete, 23, peak_memory_usage: 572000272 , MB = 545.502 -complete, 24, peak_memory_usage: 596869856 , MB = 569.219 -complete, 25, peak_memory_usage: 621739216 , MB = 592.937 -complete, 26, peak_memory_usage: 646608800 , 
MB = 616.654 -complete, 27, peak_memory_usage: 671478416 , MB = 640.372 -complete, 28, peak_memory_usage: 696348000 , MB = 664.089 -complete, 29, peak_memory_usage: 721217344 , MB = 687.806 -complete, 30, peak_memory_usage: 746086944 , MB = 711.524 -complete, 31, peak_memory_usage: 770956544 , MB = 735.241 -complete, 32, peak_memory_usage: 795826128 , MB = 758.959 -complete, 33, peak_memory_usage: 820695488 , MB = 782.676 -complete, 34, peak_memory_usage: 845565072 , MB = 806.394 -complete, 35, peak_memory_usage: 870434688 , MB = 830.111 -complete, 36, peak_memory_usage: 895304272 , MB = 853.829 -complete, 37, peak_memory_usage: 920173616 , MB = 877.546 -complete, 38, peak_memory_usage: 945043216 , MB = 901.263 -complete, 39, peak_memory_usage: 969912816 , MB = 924.981 -complete, 40, peak_memory_usage: 994782400 , MB = 948.698 -complete, 41, peak_memory_usage: 1019651760 , MB = 972.416 -complete, 42, peak_memory_usage: 1044521344 , MB = 996.133 -complete, 43, peak_memory_usage: 1069390960 , MB = 1019.85 -complete, 44, peak_memory_usage: 1094260544 , MB = 1043.57 -complete, 45, peak_memory_usage: 1119129888 , MB = 1067.29 -complete, 46, peak_memory_usage: 1143999488 , MB = 1091 -complete, 47, peak_memory_usage: 1168869088 , MB = 1114.72 -complete, 48, peak_memory_usage: 1193738672 , MB = 1138.44 -complete, 49, peak_memory_usage: 1218608032 , MB = 1162.16 -complete, 50, peak_memory_usage: 1243477616 , MB = 1185.87 -complete, 51, peak_memory_usage: 1268347232 , MB = 1209.59 -complete, 52, peak_memory_usage: 1293216816 , MB = 1233.31 -complete, 53, peak_memory_usage: 1318086160 , MB = 1257.02 -complete, 54, peak_memory_usage: 1342955760 , MB = 1280.74 -complete, 55, peak_memory_usage: 1367825360 , MB = 1304.46 -complete, 56, peak_memory_usage: 1392694944 , MB = 1328.18 -complete, 57, peak_memory_usage: 1417564560 , MB = 1351.89 -complete, 58, peak_memory_usage: 1442433888 , MB = 1375.61 -complete, 59, peak_memory_usage: 1467303504 , MB = 1399.33 -complete, 60, peak_memory_usage: 1492173088 , MB = 1423.05 -complete, 61, peak_memory_usage: 1517042688 , MB = 1446.76 -complete, 62, peak_memory_usage: 1541912032 , MB = 1470.48 -complete, 63, peak_memory_usage: 1566781632 , MB = 1494.2 -complete, 64, peak_memory_usage: 1591651216 , MB = 1517.92 -complete, 65, peak_memory_usage: 1616520832 , MB = 1541.63 -complete, 66, peak_memory_usage: 1641390160 , MB = 1565.35 -complete, 67, peak_memory_usage: 1666259776 , MB = 1589.07 -complete, 68, peak_memory_usage: 1691129360 , MB = 1612.79 -complete, 69, peak_memory_usage: 1715998960 , MB = 1636.5 -complete, 70, peak_memory_usage: 1740868304 , MB = 1660.22 -complete, 71, peak_memory_usage: 1765737904 , MB = 1683.94 -complete, 72, peak_memory_usage: 1790607488 , MB = 1707.66 -complete, 73, peak_memory_usage: 1815477104 , MB = 1731.37 -complete, 74, peak_memory_usage: 1840346432 , MB = 1755.09 -complete, 75, peak_memory_usage: 1865216048 , MB = 1778.81 -complete, 76, peak_memory_usage: 1890085632 , MB = 1802.53 -complete, 77, peak_memory_usage: 1914955232 , MB = 1826.24 -complete, 78, peak_memory_usage: 1939824576 , MB = 1849.96 -complete, 79, peak_memory_usage: 1964694176 , MB = 1873.68 -complete, 80, peak_memory_usage: 1989563760 , MB = 1897.4 -complete, 81, peak_memory_usage: 2014433376 , MB = 1921.11 -complete, 82, peak_memory_usage: 2039302704 , MB = 1944.83 -complete, 83, peak_memory_usage: 2064172320 , MB = 1968.55 -complete, 84, peak_memory_usage: 2089041904 , MB = 1992.27 -complete, 85, peak_memory_usage: 2113911504 , MB = 2015.98 -complete, 86, 
peak_memory_usage: 2138780848 , MB = 2039.7 -complete, 87, peak_memory_usage: 2163650448 , MB = 2063.42 -complete, 88, peak_memory_usage: 2188520032 , MB = 2087.14 -complete, 89, peak_memory_usage: 2213389648 , MB = 2110.85 -complete, 90, peak_memory_usage: 2238258976 , MB = 2134.57 -complete, 91, peak_memory_usage: 2263128592 , MB = 2158.29 -complete, 92, peak_memory_usage: 2287998176 , MB = 2182.01 -complete, 93, peak_memory_usage: 2312867776 , MB = 2205.72 -complete, 94, peak_memory_usage: 2337737120 , MB = 2229.44 -complete, 95, peak_memory_usage: 2362606720 , MB = 2253.16 -complete, 96, peak_memory_usage: 2387476304 , MB = 2276.87 -complete, 97, peak_memory_usage: 2412345920 , MB = 2300.59 -complete, 98, peak_memory_usage: 2437215248 , MB = 2324.31 -complete, 99, peak_memory_usage: 2462084864 , MB = 2348.03 -complete, 100, peak_memory_usage: 2486954448 , MB = 2371.74 -*/ - // clang-format on - - } // end iters + // TODO: remove + { + int constexpr expected[] = {1, 1, 1}; + input_limit_test_read( + __LINE__, test_files, input, output_limit{0UL}, input_limit{0UL}, expected); + } } TEST_F(OrcChunkedReaderInputLimitTest, ReadWithRowSelection) From 17096d33c4375594f1d3ad0f04ae570601cf56bc Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 4 Mar 2024 13:02:30 -0800 Subject: [PATCH 163/321] Fix format Signed-off-by: Nghia Truong --- cpp/tests/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 058b8555378..b294efbbad6 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -287,9 +287,7 @@ ConfigureTest( PERCENT 30 ) ConfigureTest( - ORC_TEST - io/orc_chunked_reader_test.cu - io/orc_test.cpp + ORC_TEST io/orc_chunked_reader_test.cu io/orc_test.cpp GPUS 1 PERCENT 30 ) From 18a4e9ff27d412b0b9a3b7f5c3753bb146897e88 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 5 Mar 2024 08:07:53 -0800 Subject: [PATCH 164/321] Temporarily fix use-after-free bug Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cu | 50 +++++++++++++------------ 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 13f0311ab1c..d1fa977d80f 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -112,34 +112,38 @@ auto chunked_read(std::string const& filepath, input_limit input_limit_bytes = input_limit{0}, output_row_granularity output_granularity = output_row_granularity{10'000}) { - auto const read_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build(); - auto reader = cudf::io::chunked_orc_reader(static_cast(output_limit_bytes), - static_cast(input_limit_bytes), - static_cast(output_granularity), - read_opts); - auto num_chunks = 0; auto out_tables = std::vector>{}; + auto out_tviews = std::vector{}; - do { - auto chunk = reader.read_chunk(); - // If the input file is empty, the first call to `read_chunk` will return an empty table. - // Thus, we only check for non-empty output table from the second call. - if (num_chunks > 0) { - CUDF_EXPECTS(chunk.tbl->num_rows() != 0, "Number of rows in the new chunk is zero."); + // TODO: remove this scope, when we get rid of mem stat in the reader. + // This is to avoid use-after-free of memory resource created by the mem stat object. 
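A minimal sketch of the destruction-order hazard this scope guards against, using only the rmm calls already appearing in this series (the function below is illustrative, not the test's own code):

#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/statistics_resource_adaptor.hpp>

void lifetime_hazard_sketch()
{
  auto* upstream = rmm::mr::get_current_device_resource();
  {
    // The reader's memory_stats_logger installs an adaptor much like this one.
    rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource> stats_mr{upstream};
    rmm::mr::set_current_device_resource(&stats_mr);
    // Every rmm allocation made here is owned by stats_mr.
    rmm::mr::set_current_device_resource(upstream);
  }
  // A buffer allocated inside the block but freed out here would be returned
  // through the already-destroyed adaptor: the use-after-free this test scope avoids.
}

The later "Final workaround" commit attacks the same problem from the reader's side, keeping the adaptor in a static inline std::unique_ptr so that it outlives any buffer it handed out.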
+ { + auto const read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build(); + auto reader = cudf::io::chunked_orc_reader(static_cast(output_limit_bytes), + static_cast(input_limit_bytes), + static_cast(output_granularity), + read_opts); + + do { + auto chunk = reader.read_chunk(); + // If the input file is empty, the first call to `read_chunk` will return an empty table. + // Thus, we only check for non-empty output table from the second call. + if (num_chunks > 0) { + CUDF_EXPECTS(chunk.tbl->num_rows() != 0, "Number of rows in the new chunk is zero."); + } + ++num_chunks; + out_tables.emplace_back(std::move(chunk.tbl)); + } while (reader.has_next()); + + if (num_chunks > 1) { + CUDF_EXPECTS(out_tables.front()->num_rows() != 0, "Number of rows in the new chunk is zero."); } - ++num_chunks; - out_tables.emplace_back(std::move(chunk.tbl)); - } while (reader.has_next()); - - if (num_chunks > 1) { - CUDF_EXPECTS(out_tables.front()->num_rows() != 0, "Number of rows in the new chunk is zero."); - } - auto out_tviews = std::vector{}; - for (auto const& tbl : out_tables) { - out_tviews.emplace_back(tbl->view()); + for (auto const& tbl : out_tables) { + out_tviews.emplace_back(tbl->view()); + } } return std::pair(cudf::concatenate(out_tviews), num_chunks); From 969781368e6c4ede43f32bf1edaadd4dab2d8014 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 5 Mar 2024 08:12:24 -0800 Subject: [PATCH 165/321] Revert "Temporarily fix use-after-free bug" This reverts commit 18a4e9ff27d412b0b9a3b7f5c3753bb146897e88. --- cpp/tests/io/orc_chunked_reader_test.cu | 50 ++++++++++++------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index d1fa977d80f..13f0311ab1c 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -112,38 +112,34 @@ auto chunked_read(std::string const& filepath, input_limit input_limit_bytes = input_limit{0}, output_row_granularity output_granularity = output_row_granularity{10'000}) { + auto const read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build(); + auto reader = cudf::io::chunked_orc_reader(static_cast(output_limit_bytes), + static_cast(input_limit_bytes), + static_cast(output_granularity), + read_opts); + auto num_chunks = 0; auto out_tables = std::vector>{}; - auto out_tviews = std::vector{}; - // TODO: remove this scope, when we get rid of mem stat in the reader. - // This is to avoid use-after-free of memory resource created by the mem stat object. - { - auto const read_opts = - cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build(); - auto reader = cudf::io::chunked_orc_reader(static_cast(output_limit_bytes), - static_cast(input_limit_bytes), - static_cast(output_granularity), - read_opts); - - do { - auto chunk = reader.read_chunk(); - // If the input file is empty, the first call to `read_chunk` will return an empty table. - // Thus, we only check for non-empty output table from the second call. 
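For orientation, these are the three knobs that `chunked_read` forwards to the reader, with the semantics these tests rely on (zero means unlimited, and the granularity defaults to 10'000 rows), in a condensed usage sketch:

auto const opts = cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build();
auto reader     = cudf::io::chunked_orc_reader(
  output_limit_bytes,      // cap on the decoded size of each table returned by read_chunk()
  input_limit_bytes,       // cap on device memory used while loading/decompressing stripes
  output_row_granularity,  // row-count unit used when slicing decoded data into output chunks
  opts);
do {
  auto chunk = reader.read_chunk();  // next slice of the file, as a table_with_metadata
  // ... consume chunk.tbl ...
} while (reader.has_next());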
- if (num_chunks > 0) { - CUDF_EXPECTS(chunk.tbl->num_rows() != 0, "Number of rows in the new chunk is zero."); - } - ++num_chunks; - out_tables.emplace_back(std::move(chunk.tbl)); - } while (reader.has_next()); - - if (num_chunks > 1) { - CUDF_EXPECTS(out_tables.front()->num_rows() != 0, "Number of rows in the new chunk is zero."); + do { + auto chunk = reader.read_chunk(); + // If the input file is empty, the first call to `read_chunk` will return an empty table. + // Thus, we only check for non-empty output table from the second call. + if (num_chunks > 0) { + CUDF_EXPECTS(chunk.tbl->num_rows() != 0, "Number of rows in the new chunk is zero."); } + ++num_chunks; + out_tables.emplace_back(std::move(chunk.tbl)); + } while (reader.has_next()); - for (auto const& tbl : out_tables) { - out_tviews.emplace_back(tbl->view()); - } + if (num_chunks > 1) { + CUDF_EXPECTS(out_tables.front()->num_rows() != 0, "Number of rows in the new chunk is zero."); + } + + auto out_tviews = std::vector{}; + for (auto const& tbl : out_tables) { + out_tviews.emplace_back(tbl->view()); } return std::pair(cudf::concatenate(out_tviews), num_chunks); From 001693577dc8205bd51c2ecc71223be0e0593fee Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 5 Mar 2024 08:12:45 -0800 Subject: [PATCH 166/321] This is indeed the fix for use-after-free bug Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cu | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 13f0311ab1c..2857b82d415 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -122,6 +122,10 @@ auto chunked_read(std::string const& filepath, auto num_chunks = 0; auto out_tables = std::vector>{}; + // TODO: remove this scope, when we get rid of mem stat in the reader. + // This is to avoid use-after-free of memory resource created by the mem stat object. + auto mr = rmm::mr::get_current_device_resource(); + do { auto chunk = reader.read_chunk(); // If the input file is empty, the first call to `read_chunk` will return an empty table. @@ -142,7 +146,10 @@ auto chunked_read(std::string const& filepath, out_tviews.emplace_back(tbl->view()); } - return std::pair(cudf::concatenate(out_tviews), num_chunks); + // return std::pair(cudf::concatenate(out_tviews), num_chunks); + + // TODO: remove this + return std::pair(cudf::concatenate(out_tviews, cudf::get_default_stream(), mr), num_chunks); } auto chunked_read(std::string const& filepath, From 759246d81c15106bbd67f149a4e309bdd94a4196 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 5 Mar 2024 13:21:15 -0800 Subject: [PATCH 167/321] Final workaround for use-after-free bug Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 4 ++-- cpp/src/io/orc/reader_impl.hpp | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 1705369e7dd..5f2a140ed4b 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -1631,6 +1631,7 @@ reader::impl::impl(std::size_t output_size_limit, rmm::mr::device_memory_resource* mr) : _stream(stream), _mr(mr), + mem_stats_logger(mr), _config{options.get_timestamp_type(), options.is_enabled_use_index(), options.is_enabled_use_np_dtypes(), @@ -1645,8 +1646,7 @@ reader::impl::impl(std::size_t output_size_limit, _chunk_read_data{ output_size_limit, data_read_limit, - output_row_granularity > 0 ? 
output_row_granularity : DEFAULT_OUTPUT_ROW_GRANULARITY}, - mem_stats_logger(mr) + output_row_granularity > 0 ? output_row_granularity : DEFAULT_OUTPUT_ROW_GRANULARITY} { printf("construct reader, limit = %d, %d, granularity %d \n", diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 4a32394c91f..de1d0ed68f5 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -36,22 +36,29 @@ namespace cudf::io::orc::detail { class memory_stats_logger { public: - explicit memory_stats_logger(rmm::mr::device_memory_resource* mr) - : existing_mr(mr), statistics_mr(rmm::mr::make_statistics_adaptor(existing_mr)) + explicit memory_stats_logger(rmm::mr::device_memory_resource* mr) : existing_mr(mr) { - rmm::mr::set_current_device_resource(&statistics_mr); + printf("existing mr: %p\n", mr); + + statistics_mr = + std::make_unique<rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>>( + existing_mr); + + rmm::mr::set_current_device_resource(statistics_mr.get()); } ~memory_stats_logger() { rmm::mr::set_current_device_resource(existing_mr); } [[nodiscard]] size_t peak_memory_usage() const noexcept { - return statistics_mr.get_bytes_counter().peak; + return statistics_mr->get_bytes_counter().peak; } private: rmm::mr::device_memory_resource* existing_mr; - rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource> statistics_mr; + static inline std::unique_ptr< + rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>> + statistics_mr; }; struct reader_column_meta; @@ -188,6 +195,8 @@ class reader::impl { rmm::cuda_stream_view const _stream; rmm::mr::device_memory_resource* const _mr; + memory_stats_logger mem_stats_logger; + // Reader configs struct { data_type timestamp_type; // override output timestamp resolution @@ -213,8 +222,6 @@ class reader::impl { std::vector<std::vector<column_buffer>> _out_buffers; static constexpr size_type DEFAULT_OUTPUT_ROW_GRANULARITY = 10'000; - - memory_stats_logger mem_stats_logger; }; } // namespace cudf::io::orc::detail From d5912b905fa556db05f7a63d59b328fe937a9dbc Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 5 Mar 2024 16:30:38 -0800 Subject: [PATCH 168/321] Split file Signed-off-by: Nghia Truong --- cpp/CMakeLists.txt | 1 + cpp/src/io/orc/reader_impl.cu | 1389 ------------------------ cpp/src/io/orc/reader_impl_decode.cu | 1451 ++++++++++++++++++++++++++ 3 files changed, 1452 insertions(+), 1389 deletions(-) create mode 100644 cpp/src/io/orc/reader_impl_decode.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index eb5360509d7..fc9854ebf7c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -389,6 +389,7 @@ add_library( src/io/orc/reader.cu src/io/orc/reader_impl.cu src/io/orc/reader_impl_chunking.cu + src/io/orc/reader_impl_decode.cu src/io/orc/reader_impl_helpers.cpp src/io/orc/stats_enc.cu src/io/orc/stripe_data.cu diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 5f2a140ed4b..d4ddbea347c 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -59,1395 +59,6 @@ namespace cudf::io::orc::detail { -namespace { - -// TODO: update -// TODO: compute num stripes from chunks -/** - * @brief Decompresses the stripe data, at stream granularity.
- * - * @param decompressor Block decompressor - * @param stripe_data List of source stripe column data - * @param stream_info List of stream to column mappings - * @param chunks Vector of list of column chunk descriptors - * @param row_groups Vector of list of row index descriptors - * @param num_stripes Number of stripes making up column chunks - * @param row_index_stride Distance between each row index - * @param use_base_stride Whether to use base stride obtained from meta or use the computed value - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Device buffer to decompressed page data - */ -rmm::device_buffer decompress_stripe_data( - chunk const& load_stripe_chunk, - chunk const& stripe_chunk, - stream_id_map const& compinfo_map, - OrcDecompressor const& decompressor, - host_span stripe_data, - host_span stream_info, - cudf::detail::hostdevice_2dvector& chunks, - cudf::detail::hostdevice_2dvector& row_groups, - size_type num_stripes, - size_type row_index_stride, - bool use_base_stride, - rmm::cuda_stream_view stream) -{ - // Count the exact number of compressed blocks - std::size_t num_compressed_blocks = 0; - std::size_t num_uncompressed_blocks = 0; - std::size_t total_decomp_size = 0; - - // printf("decompress #stripe: %d, ") - - // TODO: use lvl_stripe_stream_chunks - std::size_t count{0}; - for (auto const& info : stream_info) { - if (info.id.stripe_idx < stripe_chunk.start_idx || - info.id.stripe_idx >= stripe_chunk.start_idx + stripe_chunk.count) { - continue; - } - count++; - } - - cudf::detail::hostdevice_vector compinfo(0, count, stream); - - for (auto const& info : stream_info) { - if (info.id.stripe_idx < stripe_chunk.start_idx || - info.id.stripe_idx >= stripe_chunk.start_idx + stripe_chunk.count) { - continue; - } - -#ifdef PRINT_DEBUG - printf("collec stream again [%d, %d, %d, %d]: dst = %lu, length = %lu\n", - (int)info.id.stripe_idx, - (int)info.id.level, - (int)info.id.orc_cold_idx, - (int)info.id.kind, - info.dst_pos, - info.length); - fflush(stdout); -#endif - - compinfo.push_back(gpu::CompressedStreamInfo( - static_cast( - stripe_data[info.id.stripe_idx - load_stripe_chunk.start_idx].data()) + - info.dst_pos, - info.length)); - - // printf("line %d\n", __LINE__); - // fflush(stdout); - auto const& cached_comp_info = compinfo_map.at( - stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_col_idx, info.id.kind}); - // printf("line %d\n", __LINE__); - // fflush(stdout); - // auto const& cached_comp_info = - // compinfo_map[stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_cold_idx, - // info.id.kind}]; - auto& stream_comp_info = compinfo.back(); - stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; - stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks; - stream_comp_info.max_uncompressed_size = cached_comp_info.total_decomp_size; - - num_compressed_blocks += cached_comp_info.num_compressed_blocks; - num_uncompressed_blocks += cached_comp_info.num_uncompressed_blocks; - total_decomp_size += cached_comp_info.total_decomp_size; - } - - CUDF_EXPECTS( - not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)), - "Inconsistent info on compression blocks"); - -#ifdef XXX - std::size_t old_num_compressed_blocks = num_compressed_blocks; - std::size_t old_num_uncompressed_blocks = num_uncompressed_blocks; - std::size_t old_total_decomp_size = total_decomp_size; - - num_compressed_blocks = 0; - num_uncompressed_blocks = 
0; - total_decomp_size = 0; - for (std::size_t i = 0; i < compinfo.size(); ++i) { - num_compressed_blocks += compinfo[i].num_compressed_blocks; - num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks; - total_decomp_size += compinfo[i].max_uncompressed_size; - - auto const& info = stream_info[i]; - printf("compute info [%d, %d, %d, %d]: %lu | %lu | %lu\n", - (int)info.id.stripe_idx, - (int)info.id.level, - (int)info.id.orc_cold_idx, - (int)info.id.kind, - (size_t)compinfo[i].num_compressed_blocks, - (size_t)compinfo[i].num_uncompressed_blocks, - compinfo[i].max_uncompressed_size); - fflush(stdout); - } - - if (old_num_compressed_blocks != num_compressed_blocks || - old_num_uncompressed_blocks != num_uncompressed_blocks || - old_total_decomp_size != total_decomp_size) { - printf("invalid: %d - %d, %d - %d, %d - %d\n", - (int)old_num_compressed_blocks, - (int)num_compressed_blocks, - (int)old_num_uncompressed_blocks, - (int)num_uncompressed_blocks, - (int)old_total_decomp_size, - (int)total_decomp_size - - ); - } -#endif - - // Buffer needs to be padded. - // Required by `gpuDecodeOrcColumnData`. - rmm::device_buffer decomp_data( - cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream); - if (decomp_data.is_empty()) { return decomp_data; } - - rmm::device_uvector> inflate_in( - num_compressed_blocks + num_uncompressed_blocks, stream); - rmm::device_uvector> inflate_out( - num_compressed_blocks + num_uncompressed_blocks, stream); - rmm::device_uvector inflate_res(num_compressed_blocks, stream); - thrust::fill(rmm::exec_policy(stream), - inflate_res.begin(), - inflate_res.end(), - compression_result{0, compression_status::FAILURE}); - - // Parse again to populate the decompression input/output buffers - std::size_t decomp_offset = 0; - uint32_t max_uncomp_block_size = 0; - uint32_t start_pos = 0; - auto start_pos_uncomp = (uint32_t)num_compressed_blocks; - for (std::size_t i = 0; i < compinfo.size(); ++i) { - auto dst_base = static_cast(decomp_data.data()); - compinfo[i].uncompressed_data = dst_base + decomp_offset; - compinfo[i].dec_in_ctl = inflate_in.data() + start_pos; - compinfo[i].dec_out_ctl = inflate_out.data() + start_pos; - compinfo[i].dec_res = {inflate_res.data() + start_pos, compinfo[i].num_compressed_blocks}; - compinfo[i].copy_in_ctl = inflate_in.data() + start_pos_uncomp; - compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp; - - // stream_info[i].dst_pos = decomp_offset; - decomp_offset += compinfo[i].max_uncompressed_size; - start_pos += compinfo[i].num_compressed_blocks; - start_pos_uncomp += compinfo[i].num_uncompressed_blocks; - max_uncomp_block_size = - std::max(max_uncomp_block_size, compinfo[i].max_uncompressed_block_size); - } - compinfo.host_to_device_async(stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - stream); - - // Value for checking whether we decompress successfully. - // It doesn't need to be atomic as there is no race condition: we only write `true` if needed. 
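- // (More precisely: many device threads may store to this flag concurrently, but every store writes the same value `true` and nothing ever writes `false` back, so the final value is deterministic even without atomics.)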
- cudf::detail::hostdevice_vector any_block_failure(1, stream); - any_block_failure[0] = false; - any_block_failure.host_to_device_async(stream); - - // Dispatch batches of blocks to decompress - if (num_compressed_blocks > 0) { - device_span> inflate_in_view{inflate_in.data(), - num_compressed_blocks}; - device_span> inflate_out_view{inflate_out.data(), num_compressed_blocks}; - switch (decompressor.compression()) { - case compression_type::ZLIB: - if (nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE)) { - gpuinflate( - inflate_in_view, inflate_out_view, inflate_res, gzip_header_included::NO, stream); - } else { - nvcomp::batched_decompress(nvcomp::compression_type::DEFLATE, - inflate_in_view, - inflate_out_view, - inflate_res, - max_uncomp_block_size, - total_decomp_size, - stream); - } - break; - case compression_type::SNAPPY: - if (nvcomp::is_decompression_disabled(nvcomp::compression_type::SNAPPY)) { - gpu_unsnap(inflate_in_view, inflate_out_view, inflate_res, stream); - } else { - nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, - inflate_in_view, - inflate_out_view, - inflate_res, - max_uncomp_block_size, - total_decomp_size, - stream); - } - break; - case compression_type::ZSTD: - if (auto const reason = nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD); - reason) { - CUDF_FAIL("Decompression error: " + reason.value()); - } - nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, - inflate_in_view, - inflate_out_view, - inflate_res, - max_uncomp_block_size, - total_decomp_size, - stream); - break; - case compression_type::LZ4: - if (auto const reason = nvcomp::is_decompression_disabled(nvcomp::compression_type::LZ4); - reason) { - CUDF_FAIL("Decompression error: " + reason.value()); - } - nvcomp::batched_decompress(nvcomp::compression_type::LZ4, - inflate_in_view, - inflate_out_view, - inflate_res, - max_uncomp_block_size, - total_decomp_size, - stream); - break; - default: CUDF_FAIL("Unexpected decompression dispatch"); break; - } - - // TODO: proclam return type - - // Check if any block has been failed to decompress. - // Not using `thrust::any` or `thrust::count_if` to defer stream sync. - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(std::size_t{0}), - thrust::make_counting_iterator(inflate_res.size()), - [results = inflate_res.begin(), - any_block_failure = any_block_failure.device_ptr()] __device__(auto const idx) { - if (results[idx].status != compression_status::SUCCESS) { *any_block_failure = true; } - }); - } - - if (num_uncompressed_blocks > 0) { - device_span> copy_in_view{inflate_in.data() + num_compressed_blocks, - num_uncompressed_blocks}; - device_span> copy_out_view{inflate_out.data() + num_compressed_blocks, - num_uncompressed_blocks}; - gpu_copy_uncompressed_blocks(copy_in_view, copy_out_view, stream); - } - - // Copy without stream sync, thus need to wait for stream sync below to access. - any_block_failure.device_to_host_async(stream); - - gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); - compinfo.device_to_host_sync(stream); // This also sync stream for `any_block_failure`. 
- - // We can check on host after stream synchronize - CUDF_EXPECTS(not any_block_failure[0], "Error during decompression"); - - auto const num_columns = static_cast(chunks.size().second); - - // Update the stream information with the updated uncompressed info - // TBD: We could update the value from the information we already - // have in stream_info[], but using the gpu results also updates - // max_uncompressed_size to the actual uncompressed size, or zero if - // decompression failed. - for (size_type i = 0; i < num_stripes; ++i) { - for (size_type j = 0; j < num_columns; ++j) { - auto& chunk = chunks[i][j]; - for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) { - if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) { - chunk.streams[k] = compinfo[chunk.strm_id[k]].uncompressed_data; - chunk.strm_len[k] = compinfo[chunk.strm_id[k]].max_uncompressed_size; - } - } - } - } - - if (row_groups.size().first) { - chunks.host_to_device_async(stream); - row_groups.host_to_device_async(stream); - gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), - compinfo.device_ptr(), - chunks.base_device_ptr(), - num_columns, - num_stripes, - row_index_stride, - use_base_stride, - stream); - } - - return decomp_data; -} - -/** - * @brief Updates null mask of columns whose parent is a struct column. - * - * If struct column has null element, that row would be skipped while writing child column in ORC, - * so we need to insert the missing null elements in child column. There is another behavior from - * pyspark, where if the child column doesn't have any null elements, it will not have present - * stream, so in that case parent null mask need to be copied to child column. - * - * @param chunks Vector of list of column chunk descriptors - * @param out_buffers Output columns' device buffers - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource to use for device memory allocation - */ -void update_null_mask(cudf::detail::hostdevice_2dvector& chunks, - host_span out_buffers, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_stripes = chunks.size().first; - auto const num_columns = chunks.size().second; - bool is_mask_updated = false; - - for (std::size_t col_idx = 0; col_idx < num_columns; ++col_idx) { - if (chunks[0][col_idx].parent_validity_info.valid_map_base != nullptr) { - if (not is_mask_updated) { - chunks.device_to_host_sync(stream); - is_mask_updated = true; - } - - auto parent_valid_map_base = chunks[0][col_idx].parent_validity_info.valid_map_base; - auto child_valid_map_base = out_buffers[col_idx].null_mask(); - auto child_mask_len = - chunks[0][col_idx].column_num_rows - chunks[0][col_idx].parent_validity_info.null_count; - auto parent_mask_len = chunks[0][col_idx].column_num_rows; - - if (child_valid_map_base != nullptr) { - rmm::device_uvector dst_idx(child_mask_len, stream); - // Copy indexes at which the parent has valid value. 
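- // (Below, the child's validity bits are then scattered onto exactly those positions of an all-null, parent-length mask, so child rows that the writer skipped because the parent row was null stay null in the merged mask.)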
- thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + parent_mask_len, - dst_idx.begin(), - [parent_valid_map_base] __device__(auto idx) { - return bit_is_set(parent_valid_map_base, idx); - }); - - auto merged_null_mask = cudf::detail::create_null_mask( - parent_mask_len, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); - auto merged_mask = static_cast(merged_null_mask.data()); - uint32_t* dst_idx_ptr = dst_idx.data(); - // Copy child valid bits from child column to valid indexes, this will merge both child - // and parent null masks - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + dst_idx.size(), - [child_valid_map_base, dst_idx_ptr, merged_mask] __device__(auto idx) { - if (bit_is_set(child_valid_map_base, idx)) { - cudf::set_bit(merged_mask, dst_idx_ptr[idx]); - }; - }); - - out_buffers[col_idx].set_null_mask(std::move(merged_null_mask)); - - } else { - // Since child column doesn't have a mask, copy parent null mask - auto mask_size = bitmask_allocation_size_bytes(parent_mask_len); - out_buffers[col_idx].set_null_mask( - rmm::device_buffer(static_cast(parent_valid_map_base), mask_size, stream, mr)); - } - } - } - - if (is_mask_updated) { - // Update chunks with pointers to column data which might have been changed. - for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { - for (std::size_t col_idx = 0; col_idx < num_columns; ++col_idx) { - auto& chunk = chunks[stripe_idx][col_idx]; - chunk.valid_map_base = out_buffers[col_idx].null_mask(); - } - } - chunks.host_to_device_sync(stream); - } -} - -/** - * @brief Converts the stripe column data and outputs to columns. - * - * @param num_dicts Number of dictionary entries required - * @param skip_rows Number of rows to offset from start - * @param row_index_stride Distance between each row index - * @param level Current nesting level being processed - * @param tz_table Local time to UTC conversion table - * @param chunks Vector of list of column chunk descriptors - * @param row_groups Vector of list of row index descriptors - * @param out_buffers Output columns' device buffers - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ -void decode_stream_data(std::size_t num_dicts, - int64_t skip_rows, - size_type row_index_stride, - std::size_t level, - table_view const& tz_table, - cudf::detail::hostdevice_2dvector& chunks, - cudf::detail::device_2dspan row_groups, - std::vector& out_buffers, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_stripes = chunks.size().first; - auto const num_columns = chunks.size().second; - printf("decode %d stripess \n", (int)num_stripes); - - thrust::counting_iterator col_idx_it(0); - thrust::counting_iterator stripe_idx_it(0); - - // Update chunks with pointers to column data - std::for_each(stripe_idx_it, stripe_idx_it + num_stripes, [&](auto stripe_idx) { - std::for_each(col_idx_it, col_idx_it + num_columns, [&](auto col_idx) { - auto& chunk = chunks[stripe_idx][col_idx]; - chunk.column_data_base = out_buffers[col_idx].data(); - chunk.valid_map_base = out_buffers[col_idx].null_mask(); - }); - }); - - // Allocate global dictionary for deserializing - rmm::device_uvector global_dict(num_dicts, stream); - - chunks.host_to_device_sync(stream); - gpu::DecodeNullsAndStringDictionaries( - 
chunks.base_device_ptr(), global_dict.data(), num_columns, num_stripes, skip_rows, stream); - - if (level > 0) { - printf("update_null_mask\n"); - // Update nullmasks for children if parent was a struct and had null mask - update_null_mask(chunks, out_buffers, stream, mr); - } - - auto const tz_table_dptr = table_device_view::create(tz_table, stream); - rmm::device_scalar error_count(0, stream); - // Update the null map for child columns - - // printf( - // "num col: %d, num stripe: %d, skip row: %d, row_groups size: %d, row index stride: %d, " - // "level: " - // "%d\n", - // (int)num_columns, - // (int)num_stripes, - // (int)skip_rows, - // (int)row_groups.size().first, - // (int)row_index_stride, - // (int)level - // ); - - gpu::DecodeOrcColumnData(chunks.base_device_ptr(), - global_dict.data(), - row_groups, - num_columns, - num_stripes, - skip_rows, - *tz_table_dptr, - row_groups.size().first, - row_index_stride, - level, - error_count.data(), - stream); - chunks.device_to_host_async(stream); - // `value` synchronizes - auto const num_errors = error_count.value(stream); - CUDF_EXPECTS(num_errors == 0, "ORC data decode failed"); - - std::for_each(col_idx_it + 0, col_idx_it + num_columns, [&](auto col_idx) { - out_buffers[col_idx].null_count() = - std::accumulate(stripe_idx_it + 0, - stripe_idx_it + num_stripes, - 0, - [&](auto null_count, auto const stripe_idx) { - // printf( - // "null count: %d => %d\n", (int)stripe_idx, - // (int)chunks[stripe_idx][col_idx].null_count); - // printf("num child rows: %d \n", - // (int)chunks[stripe_idx][col_idx].num_child_rows); - - return null_count + chunks[stripe_idx][col_idx].null_count; - }); - }); -} - -/** - * @brief Compute the per-stripe prefix sum of null count, for each struct column in the current - * layer. - */ -void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, - cudf::host_span> prefix_sums, - rmm::cuda_stream_view stream) -{ - auto const num_stripes = chunks.size().first; - if (num_stripes == 0) return; - - auto const num_columns = chunks.size().second; - std::vector>> prefix_sums_to_update; - for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { - // Null counts sums are only needed for children of struct columns - if (chunks[0][col_idx].type_kind == STRUCT) { - prefix_sums_to_update.emplace_back(col_idx, prefix_sums[col_idx]); - } - } - auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( - prefix_sums_to_update, stream, rmm::mr::get_current_device_resource()); - - thrust::for_each(rmm::exec_policy(stream), - d_prefix_sums_to_update.begin(), - d_prefix_sums_to_update.end(), - [chunks = cudf::detail::device_2dspan{chunks}] __device__( - auto const& idx_psums) { - auto const col_idx = idx_psums.first; - auto const psums = idx_psums.second; - - thrust::transform( - thrust::seq, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + psums.size(), - psums.begin(), - [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); - - thrust::inclusive_scan(thrust::seq, psums.begin(), psums.end(), psums.begin()); - }); - // `prefix_sums_to_update` goes out of scope, copy has to be done before we return - stream.synchronize(); -} - -// TODO: this is called for each chunk of stripes. -/** - * @brief Aggregate child metadata from parent column chunks. 
- */ -void aggregate_child_meta(std::size_t stripe_start, - std::size_t level, - cudf::io::orc::detail::column_hierarchy const& selected_columns, - cudf::detail::host_2dspan chunks, - cudf::detail::host_2dspan row_groups, - host_span nested_cols, - host_span out_buffers, - reader_column_meta& col_meta) -{ - auto const num_of_stripes = chunks.size().first; - auto const num_of_rowgroups = row_groups.size().first; - auto const num_child_cols = selected_columns.levels[level + 1].size(); - auto const number_of_child_chunks = num_child_cols * num_of_stripes; - auto& num_child_rows = col_meta.num_child_rows; - auto& parent_column_data = col_meta.parent_column_data; - - // Reset the meta to store child column details. - num_child_rows.resize(selected_columns.levels[level + 1].size()); - std::fill(num_child_rows.begin(), num_child_rows.end(), 0); - parent_column_data.resize(number_of_child_chunks); - col_meta.parent_column_index.resize(number_of_child_chunks); - col_meta.child_start_row.resize(number_of_child_chunks); - col_meta.num_child_rows_per_stripe.resize(number_of_child_chunks); - col_meta.rwgrp_meta.resize(num_of_rowgroups * num_child_cols); - - auto child_start_row = cudf::detail::host_2dspan( - col_meta.child_start_row.data(), num_of_stripes, num_child_cols); - auto num_child_rows_per_stripe = cudf::detail::host_2dspan( - col_meta.num_child_rows_per_stripe.data(), num_of_stripes, num_child_cols); - auto rwgrp_meta = cudf::detail::host_2dspan( - col_meta.rwgrp_meta.data(), num_of_rowgroups, num_child_cols); - - int index = 0; // number of child column processed - - printf("\n\n"); - // For each parent column, update its child column meta for each stripe. - std::for_each(nested_cols.begin(), nested_cols.end(), [&](auto const p_col) { - // printf("p_col.id: %d\n", (int)p_col.id); - - auto const parent_col_idx = col_meta.orc_col_map[level][p_col.id]; - // printf(" level: %d, parent_col_idx: %d\n", (int)level, (int)parent_col_idx); - - int64_t start_row = 0; - auto processed_row_groups = 0; - - for (std::size_t stripe_id = 0; stripe_id < num_of_stripes; stripe_id++) { - // Aggregate num_rows and start_row from processed parent columns per row groups - if (num_of_rowgroups) { - // printf(" num_of_rowgroups: %d\n", (int)num_of_rowgroups); - - auto stripe_num_row_groups = chunks[stripe_id][parent_col_idx].num_rowgroups; - auto processed_child_rows = 0; - - for (std::size_t rowgroup_id = 0; rowgroup_id < stripe_num_row_groups; - rowgroup_id++, processed_row_groups++) { - auto const child_rows = row_groups[processed_row_groups][parent_col_idx].num_child_rows; - for (size_type id = 0; id < p_col.num_children; id++) { - auto const child_col_idx = index + id; - rwgrp_meta[processed_row_groups][child_col_idx].start_row = processed_child_rows; - rwgrp_meta[processed_row_groups][child_col_idx].num_rows = child_rows; - } - processed_child_rows += child_rows; - } - } - - // Aggregate start row, number of rows per chunk and total number of rows in a column - auto const child_rows = chunks[stripe_id][parent_col_idx].num_child_rows; - // printf(" stripe_id: %d: child_rows: %d\n", (int)stripe_id, (int)child_rows); - // printf(" p_col.num_children: %d\n", (int)p_col.num_children); - - for (size_type id = 0; id < p_col.num_children; id++) { - auto const child_col_idx = index + id; - - // TODO: Check for overflow here. 
- num_child_rows[child_col_idx] += child_rows; - num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows; - // start row could be different for each column when there is nesting at each stripe level - child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ? 0 : start_row; - // printf("update child_start_row (%d, %d): %d\n", - // (int)stripe_id, - // (int)child_col_idx, - // (int)start_row); - } - start_row += child_rows; - // printf(" start_row: %d\n", (int)start_row); - } - - // Parent column null mask and null count would be required for child column - // to adjust its nullmask. - auto type = out_buffers[parent_col_idx].type.id(); - auto parent_null_count = static_cast(out_buffers[parent_col_idx].null_count()); - auto parent_valid_map = out_buffers[parent_col_idx].null_mask(); - auto num_rows = out_buffers[parent_col_idx].size; - - for (size_type id = 0; id < p_col.num_children; id++) { - auto const child_col_idx = index + id; - col_meta.parent_column_index[child_col_idx] = parent_col_idx; - if (type == type_id::STRUCT) { - parent_column_data[child_col_idx] = {parent_valid_map, parent_null_count}; - // Number of rows in child will remain same as parent in case of struct column - num_child_rows[child_col_idx] = num_rows; - } else { - parent_column_data[child_col_idx] = {nullptr, 0}; - } - } - index += p_col.num_children; - }); -} - -/** - * @brief struct to store buffer data and size of list buffer - */ -struct list_buffer_data { - size_type* data; - size_type size; -}; - -// Generates offsets for list buffer from number of elements in a row. -void generate_offsets_for_list(host_span buff_data, rmm::cuda_stream_view stream) -{ - for (auto& list_data : buff_data) { - thrust::exclusive_scan(rmm::exec_policy_nosync(stream), - list_data.data, - list_data.data + list_data.size, - list_data.data); - } -} - -/** - * @brief TODO - * @param input - * @param size_limit - * @param stream - * @return - */ -std::vector find_table_splits(table_view const& input, - size_type segment_length, - std::size_t size_limit, - rmm::cuda_stream_view stream) -{ - printf("find table split, seg length = %d, limit = %d \n", segment_length, (int)size_limit); - - // If segment_length is zero: we don't have any limit on granularity. - // As such, set segment length to the number of rows. - if (segment_length == 0) { segment_length = input.num_rows(); } - - // If we have small number of rows, need to adjust segment_length before calling to - // `segmented_row_bit_count`. - segment_length = std::min(segment_length, input.num_rows()); - - // Default 10k rows. - auto const d_segmented_sizes = cudf::detail::segmented_row_bit_count( - input, segment_length, stream, rmm::mr::get_current_device_resource()); - - auto segmented_sizes = - cudf::detail::hostdevice_vector(d_segmented_sizes->size(), stream); - - // TODO: exec_policy_nosync - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(d_segmented_sizes->size()), - segmented_sizes.d_begin(), - [segment_length, - num_rows = input.num_rows(), - d_sizes = d_segmented_sizes->view().begin()] __device__(auto const segment_idx) { - // Since the number of rows may not divisible by segment_length, - // the last segment may be shorter than the others. 
- auto const current_length = - cuda::std::min(segment_length, num_rows - segment_length * segment_idx); - auto const size = d_sizes[segment_idx]; - return cumulative_size{current_length, static_cast(size)}; - }); - - // TODO: remove: - segmented_sizes.device_to_host_sync(stream); - printf("total row sizes by segment = %d:\n", (int)segment_length); - for (auto& size : segmented_sizes) { - printf("size: %ld, %zu\n", size.count, size.size_bytes / CHAR_BIT); - } - - // TODO: exec_policy_nosync - thrust::inclusive_scan(rmm::exec_policy(stream), - segmented_sizes.d_begin(), - segmented_sizes.d_end(), - segmented_sizes.d_begin(), - cumulative_size_sum{}); - segmented_sizes.device_to_host_sync(stream); - - // Since the segment sizes are in bits, we need to multiply CHAR_BIT with the output limit. - return find_splits(segmented_sizes, input.num_rows(), size_limit * CHAR_BIT); -} - -} // namespace - -// TODO: this should be called per chunk of stripes. -void reader::impl::decompress_and_decode() -{ - if (_file_itm_data.has_no_data()) { return; } - - auto const stripe_chunk = - _chunk_read_data.decode_stripe_chunks[_chunk_read_data.curr_decode_stripe_chunk++]; - auto const stripe_start = stripe_chunk.start_idx; - auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; - - auto const load_stripe_start = - _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk - 1].start_idx; - - printf("\ndecoding data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); - - auto const rows_to_skip = _file_itm_data.rows_to_skip; - // auto const rows_to_read = _file_itm_data.rows_to_read; - auto const& selected_stripes = _file_itm_data.selected_stripes; - - // auto const rows_to_skip = 0; - auto rows_to_read = 0; - for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { - auto const& stripe = selected_stripes[stripe_idx]; - auto const stripe_info = stripe.stripe_info; - // TODO: check overflow - // CUDF_EXPECTS(per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows < - // static_cast(std::numeric_limits::max()), - // "TODO"); - rows_to_read += static_cast(stripe_info->numberOfRows); - - if (_file_itm_data.rows_to_skip > 0) { - CUDF_EXPECTS(_file_itm_data.rows_to_skip < static_cast(stripe_info->numberOfRows), - "TODO"); - } - } - rows_to_read = std::min(rows_to_read - rows_to_skip, _file_itm_data.rows_to_read); - _file_itm_data.rows_to_skip = 0; - - // Set up table for converting timestamp columns from local to UTC time - auto const tz_table = [&, &selected_stripes = selected_stripes] { - auto const has_timestamp_column = std::any_of( - _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) { - return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) { - return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP; - }); - }); - - return has_timestamp_column ? 
cudf::detail::make_timezone_transition_table( - {}, selected_stripes[0].stripe_footer->writerTimezone, _stream) - : std::make_unique(); - }(); - - auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; - auto& null_count_prefix_sums = _file_itm_data.null_count_prefix_sums; - auto& lvl_chunks = _file_itm_data.lvl_data_chunks; - - null_count_prefix_sums.clear(); - - // TODO: move this to global step - lvl_chunks.resize(_selected_columns.num_levels()); - _out_buffers.clear(); - _out_buffers.resize(_selected_columns.num_levels()); - - // - // - // - // TODO: move this to reader_impl.cu, decomp and decode step - // std::size_t num_stripes = selected_stripes.size(); - std::size_t num_stripes = stripe_chunk.count; - - // Iterates through levels of nested columns, child column will be one level down - // compared to parent column. - auto& col_meta = *_col_meta; - -#if 0 - printf("num_child_rows: (size %d)\n", (int)_col_meta->num_child_rows.size()); - if (_col_meta->num_child_rows.size()) { - for (auto x : _col_meta->num_child_rows) { - printf("%d, ", (int)x); - } - printf("\n"); - - _col_meta->num_child_rows.clear(); - } - - printf("parent_column_data null count: (size %d)\n", (int)_col_meta->parent_column_data.size()); - if (_col_meta->parent_column_data.size()) { - for (auto x : _col_meta->parent_column_data) { - printf("%d, ", (int)x.null_count); - } - printf("\n"); - _col_meta->parent_column_data.clear(); - } - - printf("parent_column_index: (size %d)\n", (int)_col_meta->parent_column_index.size()); - if (_col_meta->parent_column_index.size()) { - for (auto x : _col_meta->parent_column_index) { - printf("%d, ", (int)x); - } - printf("\n"); - _col_meta->parent_column_index.clear(); - } - - printf("child_start_row: (size %d)\n", (int)_col_meta->child_start_row.size()); - if (_col_meta->child_start_row.size()) { - for (auto x : _col_meta->child_start_row) { - printf("%d, ", (int)x); - } - printf("\n"); - _col_meta->child_start_row.clear(); - } - - printf("num_child_rows_per_stripe: (size %d)\n", - (int)_col_meta->num_child_rows_per_stripe.size()); - if (_col_meta->num_child_rows_per_stripe.size()) { - for (auto x : _col_meta->num_child_rows_per_stripe) { - printf("%d, ", (int)x); - } - printf("\n"); - _col_meta->num_child_rows_per_stripe.clear(); - } - - printf("rwgrp_meta: (size %d)\n", (int)_col_meta->rwgrp_meta.size()); - if (_col_meta->rwgrp_meta.size()) { - for (auto x : _col_meta->rwgrp_meta) { - printf("(%d | %d), ", (int)x.start_row, (int)x.num_rows); - } - printf("\n"); - } - -#endif - - auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_chunks; - - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - printf("processing level = %d\n", (int)level); - - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } - - auto const& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; - auto const [stream_begin, stream_end] = get_range(stripe_stream_chunks, stripe_chunk); - - auto& columns_level = _selected_columns.levels[level]; - - // TODO: do it in global step - // Association between each ORC column and its cudf::column - std::vector nested_cols; - - // Get a list of column data types - std::vector column_types; - for (auto& col : columns_level) { - auto col_type = - to_cudf_type(_metadata.get_col_type(col.id).kind, - _config.use_np_dtypes, - 
_config.timestamp_type.id(), - to_cudf_decimal_type(_config.decimal128_columns, _metadata, col.id)); - CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); - if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or - col_type == type_id::DECIMAL128) { - // sign of the scale is changed since cuDF follows c++ libraries like CNL - // which uses negative scaling, but liborc and other libraries - // follow positive scaling. - auto const scale = - -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); - column_types.emplace_back(col_type, scale); - } else { - column_types.emplace_back(col_type); - } - - // Map each ORC column to its column - if (col_type == type_id::LIST or col_type == type_id::STRUCT) { - nested_cols.emplace_back(col); - } - } - - auto const num_columns = columns_level.size(); - auto& chunks = lvl_chunks[level]; - chunks = cudf::detail::hostdevice_2dvector(num_stripes, num_columns, _stream); - memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); - - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } - - const bool use_index = - _config.use_index && - // Do stripes have row group index - _metadata.is_row_grp_idx_present() && - // Only use if we don't have much work with complete columns & stripes - // TODO: Consider nrows, gpu, and tune the threshold - (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && - _metadata.get_row_index_stride() != 0 && num_columns * num_stripes < 8 * 128) && - // Only use if first row is aligned to a stripe boundary - // TODO: Fix logic to handle unaligned rows - (rows_to_skip == 0); - - printf(" use_index: %d\n", (int)use_index); - - // Logically view streams as columns - auto const& stream_info = _file_itm_data.lvl_stream_info[level]; - - null_count_prefix_sums.emplace_back(); - null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); - std::generate_n(std::back_inserter(null_count_prefix_sums.back()), - _selected_columns.levels[level].size(), - [&]() { - return cudf::detail::make_zeroed_device_uvector_async( - num_stripes, _stream, rmm::mr::get_current_device_resource()); - }); - - // Tracker for eventually deallocating compressed and uncompressed data - auto& stripe_data = lvl_stripe_data[level]; - - int64_t stripe_start_row = 0; - int64_t num_dict_entries = 0; - int64_t num_rowgroups = 0; - - // TODO: Stripe and stream idx must be by chunk. 
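- // (A worked instance of the scale negation earlier in this hunk, phrased against cudf's fixed_point API: ORC metadata stores DECIMAL(9,2) with scale = +2, the reader flips it to -2, and the stored integer 12345 then decodes as numeric::decimal64{numeric::scaled_integer<int64_t>{12345, numeric::scale_type{-2}}}, i.e. 123.45.)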
- // std::size_t stripe_idx = 0; - std::size_t stream_idx = 0; - - for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { - // for (auto const& stripe : selected_stripes) { - - printf("processing stripe_idx = %d\n", (int)stripe_idx); - auto const& stripe = selected_stripes[stripe_idx]; - auto const stripe_info = stripe.stripe_info; - auto const stripe_footer = stripe.stripe_footer; - - // printf("stripeinfo->indexLength: %d, data: %d\n", - // (int)stripe_info->indexLength, - // (int)stripe_info->dataLength); - - auto const total_data_size = gather_stream_info_and_column_desc(stripe_idx - stripe_start, - level, - stripe_info, - stripe_footer, - col_meta.orc_col_map[level], - _metadata.get_types(), - use_index, - level == 0, - &num_dict_entries, - &stream_idx, - std::nullopt, // stream_info - &chunks); - - auto const is_stripe_data_empty = total_data_size == 0; - printf("is_stripe_data_empty: %d\n", (int)is_stripe_data_empty); - - CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, - "Invalid index rowgroup stream data"); - - // TODO: Wrong? - // stripe load_stripe_start? - auto dst_base = static_cast(stripe_data[stripe_idx - load_stripe_start].data()); - - // printf("line %d\n", __LINE__); - // fflush(stdout); - - auto const num_rows_per_stripe = static_cast(stripe_info->numberOfRows); - printf(" num_rows_per_stripe : %d\n", (int)num_rows_per_stripe); - - auto const rowgroup_id = num_rowgroups; - auto stripe_num_rowgroups = 0; - if (use_index) { - stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / - _metadata.get_row_index_stride(); - } - - // printf("line %d\n", __LINE__); - // fflush(stdout); - - // Update chunks to reference streams pointers - for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { - auto& chunk = chunks[stripe_idx - stripe_start][col_idx]; - // start row, number of rows in a each stripe and total number of rows - // may change in lower levels of nesting - chunk.start_row = - (level == 0) - ? stripe_start_row - : col_meta.child_start_row[(stripe_idx - stripe_start) * num_columns + col_idx]; - chunk.num_rows = - (level == 0) - ? static_cast(stripe_info->numberOfRows) - : col_meta - .num_child_rows_per_stripe[(stripe_idx - stripe_start) * num_columns + col_idx]; - printf("col idx: %d, start_row: %d, num rows: %d\n", - (int)col_idx, - (int)chunk.start_row, - (int)chunk.num_rows); - - chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx]; - chunk.parent_validity_info = - (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx]; - chunk.parent_null_count_prefix_sums = - (level == 0) - ? nullptr - : null_count_prefix_sums[level - 1][col_meta.parent_column_index[col_idx]].data(); - chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind; - chunk.type_kind = - _metadata.per_file_metadata[stripe.source_idx].ff.types[columns_level[col_idx].id].kind; - - printf("type: %d\n", (int)chunk.type_kind); - - // num_child_rows for a struct column will be same, for other nested types it will be - // calculated. - chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; - chunk.dtype_id = column_types[col_idx].id(); - chunk.decimal_scale = _metadata.per_file_metadata[stripe.source_idx] - .ff.types[columns_level[col_idx].id] - .scale.value_or(0); - - chunk.rowgroup_id = rowgroup_id; - chunk.dtype_len = (column_types[col_idx].id() == type_id::STRING) - ? 
sizeof(string_index_pair) - : ((column_types[col_idx].id() == type_id::LIST) or - (column_types[col_idx].id() == type_id::STRUCT)) - ? sizeof(size_type) - : cudf::size_of(column_types[col_idx]); - chunk.num_rowgroups = stripe_num_rowgroups; - // printf("stripe_num_rowgroups: %d\n", (int)stripe_num_rowgroups); - - if (chunk.type_kind == orc::TIMESTAMP) { - chunk.timestamp_type_id = _config.timestamp_type.id(); - } - if (not is_stripe_data_empty) { - for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { - chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k] + stream_begin].dst_pos; - // printf("chunk.streams[%d] of chunk.strm_id[%d], stripe %d | %d, collect from %d\n", - // (int)k, - // (int)chunk.strm_id[k], - // (int)stripe_idx, - // (int)stripe_start, - // (int)(chunk.strm_id[k] + stream_begin)); - } - } - } - - // printf("line %d\n", __LINE__); - // fflush(stdout); - - stripe_start_row += num_rows_per_stripe; - num_rowgroups += stripe_num_rowgroups; - - // stripe_idx++; - } // for (stripe : selected_stripes) - - // printf("line %d\n", __LINE__); - // fflush(stdout); - - if (stripe_data.empty()) { continue; } - - // Process dataset chunk pages into output columns - auto row_groups = - cudf::detail::hostdevice_2dvector(num_rowgroups, num_columns, _stream); - if (level > 0 and row_groups.size().first) { - cudf::host_span row_groups_span(row_groups.base_host_ptr(), - num_rowgroups * num_columns); - auto& rw_grp_meta = col_meta.rwgrp_meta; - - // Update start row and num rows per row group - std::transform(rw_grp_meta.begin(), - rw_grp_meta.end(), - row_groups_span.begin(), - rw_grp_meta.begin(), - [&](auto meta, auto& row_grp) { - row_grp.num_rows = meta.num_rows; - row_grp.start_row = meta.start_row; - return meta; - }); - } - - // printf("line %d\n", __LINE__); - // fflush(stdout); - - // Setup row group descriptors if using indexes - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - // printf("decompress----------------------\n"); - // printf("line %d\n", __LINE__); - // fflush(stdout); - CUDF_EXPECTS(_chunk_read_data.curr_load_stripe_chunk > 0, "ERRRRR"); - - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } - - auto decomp_data = decompress_stripe_data( - _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk - 1], - stripe_chunk, - _file_itm_data.compinfo_map, - *_metadata.per_file_metadata[0].decompressor, - stripe_data, - stream_info, - chunks, - row_groups, - num_stripes, - _metadata.get_row_index_stride(), - level == 0, - _stream); - // stripe_data.clear(); - // stripe_data.push_back(std::move(decomp_data)); - - // TODO: only reset each one if the new size/type are different. 
- stripe_data[stripe_start - load_stripe_start] = std::move(decomp_data); - for (int64_t i = 1; i < stripe_chunk.count; ++i) { - stripe_data[i + stripe_start - load_stripe_start] = {}; - } - - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } - - // printf("line %d\n", __LINE__); - // fflush(stdout); - - } else { - // printf("no decompression----------------------\n"); - - if (row_groups.size().first) { - // printf("line %d\n", __LINE__); - // fflush(stdout); - chunks.host_to_device_async(_stream); - row_groups.host_to_device_async(_stream); - row_groups.host_to_device_async(_stream); - gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), - nullptr, - chunks.base_device_ptr(), - num_columns, - num_stripes, - _metadata.get_row_index_stride(), - level == 0, - _stream); - } - } - - // printf("line %d\n", __LINE__); - // fflush(stdout); - - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } - - // TODO: do not clear but reset each one. - // and only reset if the new size/type are different. - _out_buffers[level].clear(); - - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } - - for (std::size_t i = 0; i < column_types.size(); ++i) { - bool is_nullable = false; - for (std::size_t j = 0; j < num_stripes; ++j) { - if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { - printf(" is nullable\n"); - is_nullable = true; - break; - } - } - auto is_list_type = (column_types[i].id() == type_id::LIST); - auto n_rows = (level == 0) ? 
rows_to_read : col_meta.num_child_rows[i]; - - // printf(" create col, num rows: %d\n", (int)n_rows); - - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } - - // For list column, offset column will be always size + 1 - if (is_list_type) n_rows++; - _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr); - - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", buffer size: " << n_rows - << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } - } - - // printf("line %d\n", __LINE__); - // fflush(stdout); - - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } - - decode_stream_data(num_dict_entries, - rows_to_skip, - _metadata.get_row_index_stride(), - level, - tz_table->view(), - chunks, - row_groups, - _out_buffers[level], - _stream, - _mr); - - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } - - // printf("line %d\n", __LINE__); - // fflush(stdout); - - if (nested_cols.size()) { - printf("have nested col\n"); - - // Extract information to process nested child columns - scan_null_counts(chunks, null_count_prefix_sums[level], _stream); - - row_groups.device_to_host_sync(_stream); - aggregate_child_meta(stripe_start, - level, - _selected_columns, - chunks, - row_groups, - nested_cols, - _out_buffers[level], - col_meta); - - // ORC stores number of elements at each row, so we need to generate offsets from that - std::vector buff_data; - std::for_each( - _out_buffers[level].begin(), _out_buffers[level].end(), [&buff_data](auto& out_buffer) { - if (out_buffer.type.id() == type_id::LIST) { - auto data = static_cast(out_buffer.data()); - buff_data.emplace_back(list_buffer_data{data, out_buffer.size}); - } - }); - - if (not buff_data.empty()) { generate_offsets_for_list(buff_data, _stream); } - } - - // printf("line %d\n", __LINE__); - // fflush(stdout); - } // end loop level - - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } - - std::vector> out_columns; - _out_metadata = get_meta_with_user_data(); - std::transform( - _selected_columns.levels[0].begin(), - _selected_columns.levels[0].end(), - std::back_inserter(out_columns), - [&](auto const& orc_col_meta) { - _out_metadata.schema_info.emplace_back(""); - auto col_buffer = assemble_buffer( - orc_col_meta.id, 0, *_col_meta, _metadata, _selected_columns, _out_buffers, _stream, _mr); - return make_column(col_buffer, &_out_metadata.schema_info.back(), std::nullopt, _stream); - }); - _chunk_read_data.decoded_table = std::make_unique
(std::move(out_columns)); - - // TODO: do not clear but reset each one. - // and only reset if the new size/type are different. - // This clear is just to check if there is memory leak. - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - _out_buffers[level].clear(); - - auto& stripe_data = lvl_stripe_data[level]; - - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - stripe_data[stripe_start - load_stripe_start] = {}; - } else { - for (int64_t i = 0; i < stripe_chunk.count; ++i) { - stripe_data[i + stripe_start - load_stripe_start] = {}; - } - } - } - - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } - - // printf("col: \n"); - // cudf::test::print(_chunk_read_data.decoded_table->get_column(0).view()); - - // DEBUG only - // _chunk_read_data.output_size_limit = _chunk_read_data.data_read_limit / 3; - - _chunk_read_data.curr_output_table_chunk = 0; - _chunk_read_data.output_table_chunks = - _chunk_read_data.output_size_limit == 0 - ? std::vector{chunk{0, _chunk_read_data.decoded_table->num_rows()}} - : find_table_splits(_chunk_read_data.decoded_table->view(), - _chunk_read_data.output_row_granularity, - _chunk_read_data.output_size_limit, - _stream); - - auto& splits = _chunk_read_data.output_table_chunks; - printf("------------\nSplits decoded table (/total num rows = %d): \n", - (int)_chunk_read_data.decoded_table->num_rows()); - for (size_t idx = 0; idx < splits.size(); idx++) { - printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count); - } - fflush(stdout); - - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << "decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } -} - void reader::impl::prepare_data(int64_t skip_rows, std::optional const& num_rows_opt, std::vector> const& stripes) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu new file mode 100644 index 00000000000..4971f65debb --- /dev/null +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -0,0 +1,1451 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// #define PRINT_DEBUG + +// TODO: remove +#include + +#include +// +// +// +#include "io/comp/gpuinflate.hpp" +#include "io/comp/nvcomp_adapter.hpp" +#include "io/orc/reader_impl.hpp" +#include "io/orc/reader_impl_chunking.hpp" +#include "io/orc/reader_impl_helpers.hpp" +#include "io/utilities/config_utils.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::io::orc::detail { + +namespace { + +// TODO: update +// TODO: compute num stripes from chunks +/** + * @brief Decompresses the stripe data, at stream granularity. + * + * @param decompressor Block decompressor + * @param stripe_data List of source stripe column data + * @param stream_info List of stream to column mappings + * @param chunks Vector of list of column chunk descriptors + * @param row_groups Vector of list of row index descriptors + * @param num_stripes Number of stripes making up column chunks + * @param row_index_stride Distance between each row index + * @param use_base_stride Whether to use base stride obtained from meta or use the computed value + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Device buffer to decompressed page data + */ +rmm::device_buffer decompress_stripe_data( + chunk const& load_stripe_chunk, + chunk const& stripe_chunk, + stream_id_map const& compinfo_map, + OrcDecompressor const& decompressor, + host_span stripe_data, + host_span stream_info, + cudf::detail::hostdevice_2dvector& chunks, + cudf::detail::hostdevice_2dvector& row_groups, + size_type num_stripes, + size_type row_index_stride, + bool use_base_stride, + rmm::cuda_stream_view stream) +{ + // Count the exact number of compressed blocks + std::size_t num_compressed_blocks = 0; + std::size_t num_uncompressed_blocks = 0; + std::size_t total_decomp_size = 0; + + // printf("decompress #stripe: %d, ") + + // TODO: use lvl_stripe_stream_chunks + std::size_t count{0}; + for (auto const& info : stream_info) { + if (info.id.stripe_idx < stripe_chunk.start_idx || + info.id.stripe_idx >= stripe_chunk.start_idx + stripe_chunk.count) { + continue; + } + count++; + } + + cudf::detail::hostdevice_vector compinfo(0, count, stream); + + for (auto const& info : stream_info) { + if (info.id.stripe_idx < stripe_chunk.start_idx || + info.id.stripe_idx >= stripe_chunk.start_idx + stripe_chunk.count) { + continue; + } + +#ifdef PRINT_DEBUG + printf("collec stream again [%d, %d, %d, %d]: dst = %lu, length = %lu\n", + (int)info.id.stripe_idx, + (int)info.id.level, + (int)info.id.orc_cold_idx, + (int)info.id.kind, + info.dst_pos, + info.length); + fflush(stdout); +#endif + + compinfo.push_back(gpu::CompressedStreamInfo( + static_cast( + stripe_data[info.id.stripe_idx - load_stripe_chunk.start_idx].data()) + + info.dst_pos, + info.length)); + + // printf("line %d\n", __LINE__); + // fflush(stdout); + auto const& cached_comp_info = compinfo_map.at( + stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_col_idx, info.id.kind}); + // printf("line %d\n", __LINE__); + // fflush(stdout); + // auto const& cached_comp_info = + // compinfo_map[stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_cold_idx, + // info.id.kind}]; + auto& stream_comp_info = compinfo.back(); + stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; + stream_comp_info.num_uncompressed_blocks = 
cached_comp_info.num_uncompressed_blocks;
+    stream_comp_info.max_uncompressed_size   = cached_comp_info.total_decomp_size;
+
+    num_compressed_blocks += cached_comp_info.num_compressed_blocks;
+    num_uncompressed_blocks += cached_comp_info.num_uncompressed_blocks;
+    total_decomp_size += cached_comp_info.total_decomp_size;
+  }
+
+  CUDF_EXPECTS(
+    not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)),
+    "Inconsistent info on compression blocks");
+
+#ifdef XXX
+  std::size_t old_num_compressed_blocks   = num_compressed_blocks;
+  std::size_t old_num_uncompressed_blocks = num_uncompressed_blocks;
+  std::size_t old_total_decomp_size       = total_decomp_size;
+
+  num_compressed_blocks   = 0;
+  num_uncompressed_blocks = 0;
+  total_decomp_size       = 0;
+  for (std::size_t i = 0; i < compinfo.size(); ++i) {
+    num_compressed_blocks += compinfo[i].num_compressed_blocks;
+    num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks;
+    total_decomp_size += compinfo[i].max_uncompressed_size;
+
+    auto const& info = stream_info[i];
+    printf("compute info [%d, %d, %d, %d]:  %lu | %lu | %lu\n",
+           (int)info.id.stripe_idx,
+           (int)info.id.level,
+           (int)info.id.orc_col_idx,
+           (int)info.id.kind,
+           (size_t)compinfo[i].num_compressed_blocks,
+           (size_t)compinfo[i].num_uncompressed_blocks,
+           compinfo[i].max_uncompressed_size);
+    fflush(stdout);
+  }
+
+  if (old_num_compressed_blocks != num_compressed_blocks ||
+      old_num_uncompressed_blocks != num_uncompressed_blocks ||
+      old_total_decomp_size != total_decomp_size) {
+    printf("invalid: %d - %d, %d - %d, %d - %d\n",
+           (int)old_num_compressed_blocks,
+           (int)num_compressed_blocks,
+           (int)old_num_uncompressed_blocks,
+           (int)num_uncompressed_blocks,
+           (int)old_total_decomp_size,
+           (int)total_decomp_size);
+  }
+#endif
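The per-stream block counts consumed above are ultimately derived from the 3-byte header that precedes every compression chunk in an ORC stream: the header value is (length << 1) | is_original, stored little-endian, so a 100,000-byte compressed chunk is announced by the bytes 0x40 0x0D 0x03. A minimal self-contained sketch of that layout follows; the helper name and types are illustrative, not part of this patch.

#include <cstdint>
#include <cstdio>

// Hypothetical helper illustrating the ORC compression chunk header layout.
struct orc_block_header {
  uint32_t length;   // chunk length in bytes (23 bits)
  bool is_original;  // low bit set => chunk is stored uncompressed
};

orc_block_header parse_orc_block_header(uint8_t const* bytes)
{
  // Three bytes, little-endian: value = (length << 1) | is_original.
  uint32_t const raw = bytes[0] | (bytes[1] << 8) | (bytes[2] << 16);
  return {raw >> 1, (raw & 1u) != 0};
}

int main()
{
  uint8_t const header[3] = {0x40, 0x0D, 0x03};  // 100'000-byte compressed chunk
  auto const h            = parse_orc_block_header(header);
  std::printf("length = %u, is_original = %d\n", h.length, static_cast<int>(h.is_original));
  return 0;
}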
+
+  // Buffer needs to be padded.
+  // Required by `gpuDecodeOrcColumnData`.
+  rmm::device_buffer decomp_data(
+    cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream);
+  if (decomp_data.is_empty()) { return decomp_data; }
+
+  rmm::device_uvector<device_span<uint8_t const>> inflate_in(
+    num_compressed_blocks + num_uncompressed_blocks, stream);
+  rmm::device_uvector<device_span<uint8_t>> inflate_out(
+    num_compressed_blocks + num_uncompressed_blocks, stream);
+  rmm::device_uvector<compression_result> inflate_res(num_compressed_blocks, stream);
+  thrust::fill(rmm::exec_policy(stream),
+               inflate_res.begin(),
+               inflate_res.end(),
+               compression_result{0, compression_status::FAILURE});
+
+  // Parse again to populate the decompression input/output buffers
+  std::size_t decomp_offset      = 0;
+  uint32_t max_uncomp_block_size = 0;
+  uint32_t start_pos             = 0;
+  auto start_pos_uncomp          = (uint32_t)num_compressed_blocks;
+  for (std::size_t i = 0; i < compinfo.size(); ++i) {
+    auto dst_base                 = static_cast<uint8_t*>(decomp_data.data());
+    compinfo[i].uncompressed_data = dst_base + decomp_offset;
+    compinfo[i].dec_in_ctl        = inflate_in.data() + start_pos;
+    compinfo[i].dec_out_ctl       = inflate_out.data() + start_pos;
+    compinfo[i].dec_res     = {inflate_res.data() + start_pos, compinfo[i].num_compressed_blocks};
+    compinfo[i].copy_in_ctl  = inflate_in.data() + start_pos_uncomp;
+    compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp;
+
+    // stream_info[i].dst_pos = decomp_offset;
+    decomp_offset += compinfo[i].max_uncompressed_size;
+    start_pos += compinfo[i].num_compressed_blocks;
+    start_pos_uncomp += compinfo[i].num_uncompressed_blocks;
+    max_uncomp_block_size =
+      std::max(max_uncomp_block_size, compinfo[i].max_uncompressed_block_size);
+  }
+  compinfo.host_to_device_async(stream);
+  gpu::ParseCompressedStripeData(compinfo.device_ptr(),
+                                 compinfo.size(),
+                                 decompressor.GetBlockSize(),
+                                 decompressor.GetLog2MaxCompressionRatio(),
+                                 stream);
+
+  // Value for checking whether we decompress successfully.
+  // It doesn't need to be atomic as there is no race condition: we only write `true` if needed.
+  cudf::detail::hostdevice_vector<bool> any_block_failure(1, stream);
+  any_block_failure[0] = false;
+  any_block_failure.host_to_device_async(stream);
+
+  // Dispatch batches of blocks to decompress
+  if (num_compressed_blocks > 0) {
+    device_span<device_span<uint8_t const>> inflate_in_view{inflate_in.data(),
+                                                            num_compressed_blocks};
+    device_span<device_span<uint8_t>> inflate_out_view{inflate_out.data(), num_compressed_blocks};
+    switch (decompressor.compression()) {
+      case compression_type::ZLIB:
+        if (nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE)) {
+          gpuinflate(
+            inflate_in_view, inflate_out_view, inflate_res, gzip_header_included::NO, stream);
+        } else {
+          nvcomp::batched_decompress(nvcomp::compression_type::DEFLATE,
+                                     inflate_in_view,
+                                     inflate_out_view,
+                                     inflate_res,
+                                     max_uncomp_block_size,
+                                     total_decomp_size,
+                                     stream);
+        }
+        break;
+      case compression_type::SNAPPY:
+        if (nvcomp::is_decompression_disabled(nvcomp::compression_type::SNAPPY)) {
+          gpu_unsnap(inflate_in_view, inflate_out_view, inflate_res, stream);
+        } else {
+          nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY,
+                                     inflate_in_view,
+                                     inflate_out_view,
+                                     inflate_res,
+                                     max_uncomp_block_size,
+                                     total_decomp_size,
+                                     stream);
+        }
+        break;
+      case compression_type::ZSTD:
+        if (auto const reason = nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD);
+            reason) {
+          CUDF_FAIL("Decompression error: " + reason.value());
+        }
+        nvcomp::batched_decompress(nvcomp::compression_type::ZSTD,
+                                   inflate_in_view,
+                                   inflate_out_view,
+                                   inflate_res,
+                                   max_uncomp_block_size,
+                                   total_decomp_size,
+                                   stream);
+        break;
+      case compression_type::LZ4:
+        if (auto const reason = nvcomp::is_decompression_disabled(nvcomp::compression_type::LZ4);
+            reason) {
+          CUDF_FAIL("Decompression error: " + reason.value());
+        }
+        nvcomp::batched_decompress(nvcomp::compression_type::LZ4,
+                                   inflate_in_view,
+                                   inflate_out_view,
+                                   inflate_res,
+                                   max_uncomp_block_size,
+                                   total_decomp_size,
+                                   stream);
+        break;
+      default: CUDF_FAIL("Unexpected decompression dispatch"); break;
+    }
+
+    // TODO: proclaim return type
+
+    // Check if any block has failed to decompress.
+    // Not using `thrust::any` or `thrust::count_if` to defer stream sync.
+    thrust::for_each(
+      rmm::exec_policy(stream),
+      thrust::make_counting_iterator(std::size_t{0}),
+      thrust::make_counting_iterator(inflate_res.size()),
+      [results           = inflate_res.begin(),
+       any_block_failure = any_block_failure.device_ptr()] __device__(auto const idx) {
+        if (results[idx].status != compression_status::SUCCESS) { *any_block_failure = true; }
+      });
+  }
+
+  if (num_uncompressed_blocks > 0) {
+    device_span<device_span<uint8_t const>> copy_in_view{inflate_in.data() + num_compressed_blocks,
+                                                         num_uncompressed_blocks};
+    device_span<device_span<uint8_t>> copy_out_view{inflate_out.data() + num_compressed_blocks,
+                                                    num_uncompressed_blocks};
+    gpu_copy_uncompressed_blocks(copy_in_view, copy_out_view, stream);
+  }
+
+  // Copy without stream sync, thus need to wait for stream sync below to access.
+  any_block_failure.device_to_host_async(stream);
+
+  gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream);
+  compinfo.device_to_host_sync(stream);  // This also syncs the stream for `any_block_failure`.
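The `any_block_failure` flag above uses a pattern worth making explicit: many device threads may set the flag, but since every writer stores the same value, no atomics are required, and the single device-to-host copy is deferred until the later synchronization. A minimal standalone sketch of the same idea, with all names illustrative rather than taken from this patch:

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

#include <cstdio>
#include <vector>

int main()
{
  // Pretend per-block decompression statuses; nonzero means failure.
  std::vector<int> h_status{0, 0, 1, 0};
  thrust::device_vector<int> status(h_status.begin(), h_status.end());
  thrust::device_vector<bool> failed(1, false);

  auto const d_status = thrust::raw_pointer_cast(status.data());
  auto const d_failed = thrust::raw_pointer_cast(failed.data());

  // A non-atomic write is safe: every failing thread writes the same value.
  thrust::for_each(thrust::device,
                   thrust::make_counting_iterator(std::size_t{0}),
                   thrust::make_counting_iterator(status.size()),
                   [=] __device__(std::size_t idx) {
                     if (d_status[idx] != 0) { *d_failed = true; }
                   });

  bool const any_failed = failed[0];  // one device-to-host copy, synchronizes here
  std::printf("any_failed = %d\n", static_cast<int>(any_failed));
  return 0;
}

(Compile with nvcc and extended device lambdas enabled, as the surrounding code base already requires.)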
+ + // We can check on host after stream synchronize + CUDF_EXPECTS(not any_block_failure[0], "Error during decompression"); + + auto const num_columns = static_cast(chunks.size().second); + + // Update the stream information with the updated uncompressed info + // TBD: We could update the value from the information we already + // have in stream_info[], but using the gpu results also updates + // max_uncompressed_size to the actual uncompressed size, or zero if + // decompression failed. + for (size_type i = 0; i < num_stripes; ++i) { + for (size_type j = 0; j < num_columns; ++j) { + auto& chunk = chunks[i][j]; + for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) { + if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) { + chunk.streams[k] = compinfo[chunk.strm_id[k]].uncompressed_data; + chunk.strm_len[k] = compinfo[chunk.strm_id[k]].max_uncompressed_size; + } + } + } + } + + if (row_groups.size().first) { + chunks.host_to_device_async(stream); + row_groups.host_to_device_async(stream); + gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), + compinfo.device_ptr(), + chunks.base_device_ptr(), + num_columns, + num_stripes, + row_index_stride, + use_base_stride, + stream); + } + + return decomp_data; +} + +/** + * @brief Updates null mask of columns whose parent is a struct column. + * + * If struct column has null element, that row would be skipped while writing child column in ORC, + * so we need to insert the missing null elements in child column. There is another behavior from + * pyspark, where if the child column doesn't have any null elements, it will not have present + * stream, so in that case parent null mask need to be copied to child column. + * + * @param chunks Vector of list of column chunk descriptors + * @param out_buffers Output columns' device buffers + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource to use for device memory allocation + */ +void update_null_mask(cudf::detail::hostdevice_2dvector& chunks, + host_span out_buffers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_stripes = chunks.size().first; + auto const num_columns = chunks.size().second; + bool is_mask_updated = false; + + for (std::size_t col_idx = 0; col_idx < num_columns; ++col_idx) { + if (chunks[0][col_idx].parent_validity_info.valid_map_base != nullptr) { + if (not is_mask_updated) { + chunks.device_to_host_sync(stream); + is_mask_updated = true; + } + + auto parent_valid_map_base = chunks[0][col_idx].parent_validity_info.valid_map_base; + auto child_valid_map_base = out_buffers[col_idx].null_mask(); + auto child_mask_len = + chunks[0][col_idx].column_num_rows - chunks[0][col_idx].parent_validity_info.null_count; + auto parent_mask_len = chunks[0][col_idx].column_num_rows; + + if (child_valid_map_base != nullptr) { + rmm::device_uvector dst_idx(child_mask_len, stream); + // Copy indexes at which the parent has valid value. 
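+        // As a concrete illustration (a hedged example, not taken from the patch): with
+        // four parent rows and parent validity 1 0 1 1, ORC writes the child column with
+        // only three entries, say with validity 1 0 1. `dst_idx` then holds {0, 2, 3},
+        // and scattering the set child bits onto those positions over an ALL_NULL mask
+        // yields the merged child validity 1 0 0 1.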
+ thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + parent_mask_len, + dst_idx.begin(), + [parent_valid_map_base] __device__(auto idx) { + return bit_is_set(parent_valid_map_base, idx); + }); + + auto merged_null_mask = cudf::detail::create_null_mask( + parent_mask_len, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); + auto merged_mask = static_cast(merged_null_mask.data()); + uint32_t* dst_idx_ptr = dst_idx.data(); + // Copy child valid bits from child column to valid indexes, this will merge both child + // and parent null masks + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + dst_idx.size(), + [child_valid_map_base, dst_idx_ptr, merged_mask] __device__(auto idx) { + if (bit_is_set(child_valid_map_base, idx)) { + cudf::set_bit(merged_mask, dst_idx_ptr[idx]); + }; + }); + + out_buffers[col_idx].set_null_mask(std::move(merged_null_mask)); + + } else { + // Since child column doesn't have a mask, copy parent null mask + auto mask_size = bitmask_allocation_size_bytes(parent_mask_len); + out_buffers[col_idx].set_null_mask( + rmm::device_buffer(static_cast(parent_valid_map_base), mask_size, stream, mr)); + } + } + } + + if (is_mask_updated) { + // Update chunks with pointers to column data which might have been changed. + for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { + for (std::size_t col_idx = 0; col_idx < num_columns; ++col_idx) { + auto& chunk = chunks[stripe_idx][col_idx]; + chunk.valid_map_base = out_buffers[col_idx].null_mask(); + } + } + chunks.host_to_device_sync(stream); + } +} + +/** + * @brief Converts the stripe column data and outputs to columns. + * + * @param num_dicts Number of dictionary entries required + * @param skip_rows Number of rows to offset from start + * @param row_index_stride Distance between each row index + * @param level Current nesting level being processed + * @param tz_table Local time to UTC conversion table + * @param chunks Vector of list of column chunk descriptors + * @param row_groups Vector of list of row index descriptors + * @param out_buffers Output columns' device buffers + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ +void decode_stream_data(std::size_t num_dicts, + int64_t skip_rows, + size_type row_index_stride, + std::size_t level, + table_view const& tz_table, + cudf::detail::hostdevice_2dvector& chunks, + cudf::detail::device_2dspan row_groups, + std::vector& out_buffers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_stripes = chunks.size().first; + auto const num_columns = chunks.size().second; + printf("decode %d stripess \n", (int)num_stripes); + + thrust::counting_iterator col_idx_it(0); + thrust::counting_iterator stripe_idx_it(0); + + // Update chunks with pointers to column data + std::for_each(stripe_idx_it, stripe_idx_it + num_stripes, [&](auto stripe_idx) { + std::for_each(col_idx_it, col_idx_it + num_columns, [&](auto col_idx) { + auto& chunk = chunks[stripe_idx][col_idx]; + chunk.column_data_base = out_buffers[col_idx].data(); + chunk.valid_map_base = out_buffers[col_idx].null_mask(); + }); + }); + + // Allocate global dictionary for deserializing + rmm::device_uvector global_dict(num_dicts, stream); + + chunks.host_to_device_sync(stream); + gpu::DecodeNullsAndStringDictionaries( + 
chunks.base_device_ptr(), global_dict.data(), num_columns, num_stripes, skip_rows, stream); + + if (level > 0) { + printf("update_null_mask\n"); + // Update nullmasks for children if parent was a struct and had null mask + update_null_mask(chunks, out_buffers, stream, mr); + } + + auto const tz_table_dptr = table_device_view::create(tz_table, stream); + rmm::device_scalar error_count(0, stream); + // Update the null map for child columns + + // printf( + // "num col: %d, num stripe: %d, skip row: %d, row_groups size: %d, row index stride: %d, " + // "level: " + // "%d\n", + // (int)num_columns, + // (int)num_stripes, + // (int)skip_rows, + // (int)row_groups.size().first, + // (int)row_index_stride, + // (int)level + // ); + + gpu::DecodeOrcColumnData(chunks.base_device_ptr(), + global_dict.data(), + row_groups, + num_columns, + num_stripes, + skip_rows, + *tz_table_dptr, + row_groups.size().first, + row_index_stride, + level, + error_count.data(), + stream); + chunks.device_to_host_async(stream); + // `value` synchronizes + auto const num_errors = error_count.value(stream); + CUDF_EXPECTS(num_errors == 0, "ORC data decode failed"); + + std::for_each(col_idx_it + 0, col_idx_it + num_columns, [&](auto col_idx) { + out_buffers[col_idx].null_count() = + std::accumulate(stripe_idx_it + 0, + stripe_idx_it + num_stripes, + 0, + [&](auto null_count, auto const stripe_idx) { + // printf( + // "null count: %d => %d\n", (int)stripe_idx, + // (int)chunks[stripe_idx][col_idx].null_count); + // printf("num child rows: %d \n", + // (int)chunks[stripe_idx][col_idx].num_child_rows); + + return null_count + chunks[stripe_idx][col_idx].null_count; + }); + }); +} + +/** + * @brief Compute the per-stripe prefix sum of null count, for each struct column in the current + * layer. + */ +void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, + cudf::host_span> prefix_sums, + rmm::cuda_stream_view stream) +{ + auto const num_stripes = chunks.size().first; + if (num_stripes == 0) return; + + auto const num_columns = chunks.size().second; + std::vector>> prefix_sums_to_update; + for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { + // Null counts sums are only needed for children of struct columns + if (chunks[0][col_idx].type_kind == STRUCT) { + prefix_sums_to_update.emplace_back(col_idx, prefix_sums[col_idx]); + } + } + auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( + prefix_sums_to_update, stream, rmm::mr::get_current_device_resource()); + + thrust::for_each(rmm::exec_policy(stream), + d_prefix_sums_to_update.begin(), + d_prefix_sums_to_update.end(), + [chunks = cudf::detail::device_2dspan{chunks}] __device__( + auto const& idx_psums) { + auto const col_idx = idx_psums.first; + auto const psums = idx_psums.second; + + thrust::transform( + thrust::seq, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + psums.size(), + psums.begin(), + [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); + + thrust::inclusive_scan(thrust::seq, psums.begin(), psums.end(), psums.begin()); + }); + // `prefix_sums_to_update` goes out of scope, copy has to be done before we return + stream.synchronize(); +} + +// TODO: this is called for each chunk of stripes. +/** + * @brief Aggregate child metadata from parent column chunks. 
+ */ +void aggregate_child_meta(std::size_t stripe_start, + std::size_t level, + cudf::io::orc::detail::column_hierarchy const& selected_columns, + cudf::detail::host_2dspan chunks, + cudf::detail::host_2dspan row_groups, + host_span nested_cols, + host_span out_buffers, + reader_column_meta& col_meta) +{ + auto const num_of_stripes = chunks.size().first; + auto const num_of_rowgroups = row_groups.size().first; + auto const num_child_cols = selected_columns.levels[level + 1].size(); + auto const number_of_child_chunks = num_child_cols * num_of_stripes; + auto& num_child_rows = col_meta.num_child_rows; + auto& parent_column_data = col_meta.parent_column_data; + + // Reset the meta to store child column details. + num_child_rows.resize(selected_columns.levels[level + 1].size()); + std::fill(num_child_rows.begin(), num_child_rows.end(), 0); + parent_column_data.resize(number_of_child_chunks); + col_meta.parent_column_index.resize(number_of_child_chunks); + col_meta.child_start_row.resize(number_of_child_chunks); + col_meta.num_child_rows_per_stripe.resize(number_of_child_chunks); + col_meta.rwgrp_meta.resize(num_of_rowgroups * num_child_cols); + + auto child_start_row = cudf::detail::host_2dspan( + col_meta.child_start_row.data(), num_of_stripes, num_child_cols); + auto num_child_rows_per_stripe = cudf::detail::host_2dspan( + col_meta.num_child_rows_per_stripe.data(), num_of_stripes, num_child_cols); + auto rwgrp_meta = cudf::detail::host_2dspan( + col_meta.rwgrp_meta.data(), num_of_rowgroups, num_child_cols); + + int index = 0; // number of child column processed + + printf("\n\n"); + // For each parent column, update its child column meta for each stripe. + std::for_each(nested_cols.begin(), nested_cols.end(), [&](auto const p_col) { + // printf("p_col.id: %d\n", (int)p_col.id); + + auto const parent_col_idx = col_meta.orc_col_map[level][p_col.id]; + // printf(" level: %d, parent_col_idx: %d\n", (int)level, (int)parent_col_idx); + + int64_t start_row = 0; + auto processed_row_groups = 0; + + for (std::size_t stripe_id = 0; stripe_id < num_of_stripes; stripe_id++) { + // Aggregate num_rows and start_row from processed parent columns per row groups + if (num_of_rowgroups) { + // printf(" num_of_rowgroups: %d\n", (int)num_of_rowgroups); + + auto stripe_num_row_groups = chunks[stripe_id][parent_col_idx].num_rowgroups; + auto processed_child_rows = 0; + + for (std::size_t rowgroup_id = 0; rowgroup_id < stripe_num_row_groups; + rowgroup_id++, processed_row_groups++) { + auto const child_rows = row_groups[processed_row_groups][parent_col_idx].num_child_rows; + for (size_type id = 0; id < p_col.num_children; id++) { + auto const child_col_idx = index + id; + rwgrp_meta[processed_row_groups][child_col_idx].start_row = processed_child_rows; + rwgrp_meta[processed_row_groups][child_col_idx].num_rows = child_rows; + } + processed_child_rows += child_rows; + } + } + + // Aggregate start row, number of rows per chunk and total number of rows in a column + auto const child_rows = chunks[stripe_id][parent_col_idx].num_child_rows; + // printf(" stripe_id: %d: child_rows: %d\n", (int)stripe_id, (int)child_rows); + // printf(" p_col.num_children: %d\n", (int)p_col.num_children); + + for (size_type id = 0; id < p_col.num_children; id++) { + auto const child_col_idx = index + id; + + // TODO: Check for overflow here. 
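+        // (An editorial sketch for the TODO above, assuming these counters are int64_t:
+        //  one possible guard is to check, before the accumulation below, that
+        //    child_rows <= std::numeric_limits<int64_t>::max() - num_child_rows[child_col_idx]
+        //  and raise an error via CUDF_EXPECTS otherwise.)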
+        num_child_rows[child_col_idx] += child_rows;
+        num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows;
+        // start row could be different for each column when there is nesting at each stripe level
+        child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ? 0 : start_row;
+        // printf("update child_start_row (%d, %d): %d\n",
+        //        (int)stripe_id,
+        //        (int)child_col_idx,
+        //        (int)start_row);
+      }
+      start_row += child_rows;
+      // printf("    start_row: %d\n", (int)start_row);
+    }
+
+    // Parent column null mask and null count would be required for child column
+    // to adjust its nullmask.
+    auto type              = out_buffers[parent_col_idx].type.id();
+    auto parent_null_count = static_cast<size_type>(out_buffers[parent_col_idx].null_count());
+    auto parent_valid_map  = out_buffers[parent_col_idx].null_mask();
+    auto num_rows          = out_buffers[parent_col_idx].size;
+
+    for (size_type id = 0; id < p_col.num_children; id++) {
+      auto const child_col_idx                    = index + id;
+      col_meta.parent_column_index[child_col_idx] = parent_col_idx;
+      if (type == type_id::STRUCT) {
+        parent_column_data[child_col_idx] = {parent_valid_map, parent_null_count};
+        // Number of rows in child will remain same as parent in case of struct column
+        num_child_rows[child_col_idx] = num_rows;
+      } else {
+        parent_column_data[child_col_idx] = {nullptr, 0};
+      }
+    }
+    index += p_col.num_children;
+  });
+}
+
+/**
+ * @brief Struct to store buffer data and size of list buffer.
+ */
+struct list_buffer_data {
+  size_type* data;
+  size_type size;
+};
+
+// Generates offsets for list buffer from number of elements in a row.
+void generate_offsets_for_list(host_span<list_buffer_data> buff_data, rmm::cuda_stream_view stream)
+{
+  for (auto& list_data : buff_data) {
+    thrust::exclusive_scan(rmm::exec_policy_nosync(stream),
+                           list_data.data,
+                           list_data.data + list_data.size,
+                           list_data.data);
+  }
+}
+
+/**
+ * @brief Find splits of the input table such that each split range of rows has a cumulative
+ * data size within the given size limit.
+ *
+ * @param input The input table to find splits for
+ * @param segment_length Number of rows in each segment used for size computation
+ * @param size_limit The maximum size (in bytes) of each split
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return A vector of chunks, each storing the start index and row count of one split
+ */
+std::vector<chunk> find_table_splits(table_view const& input,
+                                     size_type segment_length,
+                                     std::size_t size_limit,
+                                     rmm::cuda_stream_view stream)
+{
+  printf("find table split, seg length = %d, limit = %d\n", segment_length, (int)size_limit);
+
+  // If segment_length is zero: we don't have any limit on granularity.
+  // As such, set segment length to the number of rows.
+  if (segment_length == 0) { segment_length = input.num_rows(); }
+
+  // If we have a small number of rows, adjust segment_length before calling
+  // `segmented_row_bit_count`.
+  segment_length = std::min(segment_length, input.num_rows());
+
+  // Default 10k rows.
+  auto const d_segmented_sizes = cudf::detail::segmented_row_bit_count(
+    input, segment_length, stream, rmm::mr::get_current_device_resource());
+
+  auto segmented_sizes =
+    cudf::detail::hostdevice_vector<cumulative_size>(d_segmented_sizes->size(), stream);
+
+  // TODO: exec_policy_nosync
+  thrust::transform(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator(0),
+    thrust::make_counting_iterator(d_segmented_sizes->size()),
+    segmented_sizes.d_begin(),
+    [segment_length,
+     num_rows = input.num_rows(),
+     d_sizes  = d_segmented_sizes->view().begin<size_type>()] __device__(auto const segment_idx) {
+      // Since the number of rows may not be divisible by segment_length,
+      // the last segment may be shorter than the others.
+      auto const current_length =
+        cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
+      auto const size = d_sizes[segment_idx];
+      return cumulative_size{current_length, static_cast<std::size_t>(size)};
+    });
+
+  // TODO: remove:
+  segmented_sizes.device_to_host_sync(stream);
+  printf("total row sizes by segment = %d:\n", (int)segment_length);
+  for (auto& size : segmented_sizes) {
+    printf("size: %ld, %zu\n", size.count, size.size_bytes / CHAR_BIT);
+  }
+
+  // TODO: exec_policy_nosync
+  thrust::inclusive_scan(rmm::exec_policy(stream),
+                         segmented_sizes.d_begin(),
+                         segmented_sizes.d_end(),
+                         segmented_sizes.d_begin(),
+                         cumulative_size_sum{});
+  segmented_sizes.device_to_host_sync(stream);
+
+  // Since the segment sizes are in bits, we need to multiply CHAR_BIT with the output limit.
+  return find_splits(segmented_sizes, input.num_rows(), size_limit * CHAR_BIT);
+}
+
+}  // namespace
+
+// TODO: this should be called per chunk of stripes.
+void reader::impl::decompress_and_decode()
+{
+  if (_file_itm_data.has_no_data()) { return; }
+
+  auto const stripe_chunk =
+    _chunk_read_data.decode_stripe_chunks[_chunk_read_data.curr_decode_stripe_chunk++];
+  auto const stripe_start = stripe_chunk.start_idx;
+  auto const stripe_end   = stripe_chunk.start_idx + stripe_chunk.count;
+
+  auto const load_stripe_start =
+    _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk - 1].start_idx;
+
+  printf("\ndecoding data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end);
+
+  auto const rows_to_skip      = _file_itm_data.rows_to_skip;
+  // auto const rows_to_read      = _file_itm_data.rows_to_read;
+  auto const& selected_stripes = _file_itm_data.selected_stripes;
+
+  // auto const rows_to_skip = 0;
+  int64_t rows_to_read = 0;
+  for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) {
+    auto const& stripe     = selected_stripes[stripe_idx];
+    auto const stripe_info = stripe.stripe_info;
+    // TODO: check overflow
+    // CUDF_EXPECTS(per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows <
+    //   static_cast<uint64_t>(std::numeric_limits<size_type>::max()),
+    //   "The number of rows in the stripe exceeds the size_type limit");
+    rows_to_read += static_cast<int64_t>(stripe_info->numberOfRows);
+
+    if (_file_itm_data.rows_to_skip > 0) {
+      CUDF_EXPECTS(_file_itm_data.rows_to_skip < static_cast<int64_t>(stripe_info->numberOfRows),
+                   "Number of rows to skip must be smaller than the stripe's row count");
+    }
+  }
+  rows_to_read                = std::min(rows_to_read - rows_to_skip, _file_itm_data.rows_to_read);
+  _file_itm_data.rows_to_skip = 0;
+
+  // Set up table for converting timestamp columns from local to UTC time
+  auto const tz_table = [&, &selected_stripes = selected_stripes] {
+    auto const has_timestamp_column = std::any_of(
+      _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) {
+        return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) {
+          return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP;
+        });
+      });
+
+    return has_timestamp_column ?
cudf::detail::make_timezone_transition_table( + {}, selected_stripes[0].stripe_footer->writerTimezone, _stream) + : std::make_unique(); + }(); + + auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; + auto& null_count_prefix_sums = _file_itm_data.null_count_prefix_sums; + auto& lvl_chunks = _file_itm_data.lvl_data_chunks; + + null_count_prefix_sums.clear(); + + // TODO: move this to global step + lvl_chunks.resize(_selected_columns.num_levels()); + _out_buffers.clear(); + _out_buffers.resize(_selected_columns.num_levels()); + + // + // + // + // TODO: move this to reader_impl.cu, decomp and decode step + // std::size_t num_stripes = selected_stripes.size(); + std::size_t num_stripes = stripe_chunk.count; + + // Iterates through levels of nested columns, child column will be one level down + // compared to parent column. + auto& col_meta = *_col_meta; + +#if 0 + printf("num_child_rows: (size %d)\n", (int)_col_meta->num_child_rows.size()); + if (_col_meta->num_child_rows.size()) { + for (auto x : _col_meta->num_child_rows) { + printf("%d, ", (int)x); + } + printf("\n"); + + _col_meta->num_child_rows.clear(); + } + + printf("parent_column_data null count: (size %d)\n", (int)_col_meta->parent_column_data.size()); + if (_col_meta->parent_column_data.size()) { + for (auto x : _col_meta->parent_column_data) { + printf("%d, ", (int)x.null_count); + } + printf("\n"); + _col_meta->parent_column_data.clear(); + } + + printf("parent_column_index: (size %d)\n", (int)_col_meta->parent_column_index.size()); + if (_col_meta->parent_column_index.size()) { + for (auto x : _col_meta->parent_column_index) { + printf("%d, ", (int)x); + } + printf("\n"); + _col_meta->parent_column_index.clear(); + } + + printf("child_start_row: (size %d)\n", (int)_col_meta->child_start_row.size()); + if (_col_meta->child_start_row.size()) { + for (auto x : _col_meta->child_start_row) { + printf("%d, ", (int)x); + } + printf("\n"); + _col_meta->child_start_row.clear(); + } + + printf("num_child_rows_per_stripe: (size %d)\n", + (int)_col_meta->num_child_rows_per_stripe.size()); + if (_col_meta->num_child_rows_per_stripe.size()) { + for (auto x : _col_meta->num_child_rows_per_stripe) { + printf("%d, ", (int)x); + } + printf("\n"); + _col_meta->num_child_rows_per_stripe.clear(); + } + + printf("rwgrp_meta: (size %d)\n", (int)_col_meta->rwgrp_meta.size()); + if (_col_meta->rwgrp_meta.size()) { + for (auto x : _col_meta->rwgrp_meta) { + printf("(%d | %d), ", (int)x.start_row, (int)x.num_rows); + } + printf("\n"); + } + +#endif + + auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_chunks; + + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + printf("processing level = %d\n", (int)level); + + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + + auto const& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; + auto const [stream_begin, stream_end] = get_range(stripe_stream_chunks, stripe_chunk); + + auto& columns_level = _selected_columns.levels[level]; + + // TODO: do it in global step + // Association between each ORC column and its cudf::column + std::vector nested_cols; + + // Get a list of column data types + std::vector column_types; + for (auto& col : columns_level) { + auto col_type = + to_cudf_type(_metadata.get_col_type(col.id).kind, + _config.use_np_dtypes, + 
_config.timestamp_type.id(), + to_cudf_decimal_type(_config.decimal128_columns, _metadata, col.id)); + CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); + if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or + col_type == type_id::DECIMAL128) { + // sign of the scale is changed since cuDF follows c++ libraries like CNL + // which uses negative scaling, but liborc and other libraries + // follow positive scaling. + auto const scale = + -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); + column_types.emplace_back(col_type, scale); + } else { + column_types.emplace_back(col_type); + } + + // Map each ORC column to its column + if (col_type == type_id::LIST or col_type == type_id::STRUCT) { + nested_cols.emplace_back(col); + } + } + + auto const num_columns = columns_level.size(); + auto& chunks = lvl_chunks[level]; + chunks = cudf::detail::hostdevice_2dvector(num_stripes, num_columns, _stream); + memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); + + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + + const bool use_index = + _config.use_index && + // Do stripes have row group index + _metadata.is_row_grp_idx_present() && + // Only use if we don't have much work with complete columns & stripes + // TODO: Consider nrows, gpu, and tune the threshold + (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && + _metadata.get_row_index_stride() != 0 && num_columns * num_stripes < 8 * 128) && + // Only use if first row is aligned to a stripe boundary + // TODO: Fix logic to handle unaligned rows + (rows_to_skip == 0); + + printf(" use_index: %d\n", (int)use_index); + + // Logically view streams as columns + auto const& stream_info = _file_itm_data.lvl_stream_info[level]; + + null_count_prefix_sums.emplace_back(); + null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); + std::generate_n(std::back_inserter(null_count_prefix_sums.back()), + _selected_columns.levels[level].size(), + [&]() { + return cudf::detail::make_zeroed_device_uvector_async( + num_stripes, _stream, rmm::mr::get_current_device_resource()); + }); + + // Tracker for eventually deallocating compressed and uncompressed data + auto& stripe_data = lvl_stripe_data[level]; + + int64_t stripe_start_row = 0; + int64_t num_dict_entries = 0; + int64_t num_rowgroups = 0; + + // TODO: Stripe and stream idx must be by chunk. 
+ // std::size_t stripe_idx = 0; + std::size_t stream_idx = 0; + + for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { + // for (auto const& stripe : selected_stripes) { + + printf("processing stripe_idx = %d\n", (int)stripe_idx); + auto const& stripe = selected_stripes[stripe_idx]; + auto const stripe_info = stripe.stripe_info; + auto const stripe_footer = stripe.stripe_footer; + + // printf("stripeinfo->indexLength: %d, data: %d\n", + // (int)stripe_info->indexLength, + // (int)stripe_info->dataLength); + + auto const total_data_size = gather_stream_info_and_column_desc(stripe_idx - stripe_start, + level, + stripe_info, + stripe_footer, + col_meta.orc_col_map[level], + _metadata.get_types(), + use_index, + level == 0, + &num_dict_entries, + &stream_idx, + std::nullopt, // stream_info + &chunks); + + auto const is_stripe_data_empty = total_data_size == 0; + printf("is_stripe_data_empty: %d\n", (int)is_stripe_data_empty); + + CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, + "Invalid index rowgroup stream data"); + + // TODO: Wrong? + // stripe load_stripe_start? + auto dst_base = static_cast(stripe_data[stripe_idx - load_stripe_start].data()); + + // printf("line %d\n", __LINE__); + // fflush(stdout); + + auto const num_rows_per_stripe = static_cast(stripe_info->numberOfRows); + printf(" num_rows_per_stripe : %d\n", (int)num_rows_per_stripe); + + auto const rowgroup_id = num_rowgroups; + auto stripe_num_rowgroups = 0; + if (use_index) { + stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / + _metadata.get_row_index_stride(); + } + + // printf("line %d\n", __LINE__); + // fflush(stdout); + + // Update chunks to reference streams pointers + for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { + auto& chunk = chunks[stripe_idx - stripe_start][col_idx]; + // start row, number of rows in a each stripe and total number of rows + // may change in lower levels of nesting + chunk.start_row = + (level == 0) + ? stripe_start_row + : col_meta.child_start_row[(stripe_idx - stripe_start) * num_columns + col_idx]; + chunk.num_rows = + (level == 0) + ? static_cast(stripe_info->numberOfRows) + : col_meta + .num_child_rows_per_stripe[(stripe_idx - stripe_start) * num_columns + col_idx]; + printf("col idx: %d, start_row: %d, num rows: %d\n", + (int)col_idx, + (int)chunk.start_row, + (int)chunk.num_rows); + + chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx]; + chunk.parent_validity_info = + (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx]; + chunk.parent_null_count_prefix_sums = + (level == 0) + ? nullptr + : null_count_prefix_sums[level - 1][col_meta.parent_column_index[col_idx]].data(); + chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind; + chunk.type_kind = + _metadata.per_file_metadata[stripe.source_idx].ff.types[columns_level[col_idx].id].kind; + + printf("type: %d\n", (int)chunk.type_kind); + + // num_child_rows for a struct column will be same, for other nested types it will be + // calculated. + chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; + chunk.dtype_id = column_types[col_idx].id(); + chunk.decimal_scale = _metadata.per_file_metadata[stripe.source_idx] + .ff.types[columns_level[col_idx].id] + .scale.value_or(0); + + chunk.rowgroup_id = rowgroup_id; + chunk.dtype_len = (column_types[col_idx].id() == type_id::STRING) + ? 
sizeof(string_index_pair) + : ((column_types[col_idx].id() == type_id::LIST) or + (column_types[col_idx].id() == type_id::STRUCT)) + ? sizeof(size_type) + : cudf::size_of(column_types[col_idx]); + chunk.num_rowgroups = stripe_num_rowgroups; + // printf("stripe_num_rowgroups: %d\n", (int)stripe_num_rowgroups); + + if (chunk.type_kind == orc::TIMESTAMP) { + chunk.timestamp_type_id = _config.timestamp_type.id(); + } + if (not is_stripe_data_empty) { + for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { + chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k] + stream_begin].dst_pos; + // printf("chunk.streams[%d] of chunk.strm_id[%d], stripe %d | %d, collect from %d\n", + // (int)k, + // (int)chunk.strm_id[k], + // (int)stripe_idx, + // (int)stripe_start, + // (int)(chunk.strm_id[k] + stream_begin)); + } + } + } + + // printf("line %d\n", __LINE__); + // fflush(stdout); + + stripe_start_row += num_rows_per_stripe; + num_rowgroups += stripe_num_rowgroups; + + // stripe_idx++; + } // for (stripe : selected_stripes) + + // printf("line %d\n", __LINE__); + // fflush(stdout); + + if (stripe_data.empty()) { continue; } + + // Process dataset chunk pages into output columns + auto row_groups = + cudf::detail::hostdevice_2dvector(num_rowgroups, num_columns, _stream); + if (level > 0 and row_groups.size().first) { + cudf::host_span row_groups_span(row_groups.base_host_ptr(), + num_rowgroups * num_columns); + auto& rw_grp_meta = col_meta.rwgrp_meta; + + // Update start row and num rows per row group + std::transform(rw_grp_meta.begin(), + rw_grp_meta.end(), + row_groups_span.begin(), + rw_grp_meta.begin(), + [&](auto meta, auto& row_grp) { + row_grp.num_rows = meta.num_rows; + row_grp.start_row = meta.start_row; + return meta; + }); + } + + // printf("line %d\n", __LINE__); + // fflush(stdout); + + // Setup row group descriptors if using indexes + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + // printf("decompress----------------------\n"); + // printf("line %d\n", __LINE__); + // fflush(stdout); + CUDF_EXPECTS(_chunk_read_data.curr_load_stripe_chunk > 0, "ERRRRR"); + + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + + auto decomp_data = decompress_stripe_data( + _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk - 1], + stripe_chunk, + _file_itm_data.compinfo_map, + *_metadata.per_file_metadata[0].decompressor, + stripe_data, + stream_info, + chunks, + row_groups, + num_stripes, + _metadata.get_row_index_stride(), + level == 0, + _stream); + // stripe_data.clear(); + // stripe_data.push_back(std::move(decomp_data)); + + // TODO: only reset each one if the new size/type are different. 
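The hunk below frees the compressed inputs eagerly once the decompressed buffer exists: moving `decomp_data` into the first slot releases whatever that slot previously held, and assigning an empty `rmm::device_buffer` clears the remaining slots. A minimal sketch of the pattern, with the function name and parameters being illustrative rather than part of this patch:

#include <rmm/device_buffer.hpp>

#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Replace the first slot with the freshly decompressed buffer and drop the
// now-unneeded compressed inputs held by the remaining slots.
void release_compressed_slots(std::vector<rmm::device_buffer>& stripe_data,
                              rmm::device_buffer&& decomp_data,
                              std::size_t first_slot,
                              std::int64_t num_slots)
{
  // Move assignment frees the old allocation in `first_slot`.
  stripe_data[first_slot] = std::move(decomp_data);
  // Assigning an empty buffer frees each remaining compressed input.
  for (std::int64_t i = 1; i < num_slots; ++i) {
    stripe_data[first_slot + i] = rmm::device_buffer{};
  }
}

This keeps at most one copy of the chunk's data resident at a time, which is the point of the peak-memory logging sprinkled through this function.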
+    stripe_data[stripe_start - load_stripe_start] = std::move(decomp_data);
+    for (int64_t i = 1; i < stripe_chunk.count; ++i) {
+      stripe_data[i + stripe_start - load_stripe_start] = {};
+    }
+
+    {
+      _stream.synchronize();
+      auto peak_mem = mem_stats_logger.peak_memory_usage();
+      std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "("
+                << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl;
+    }
+
+    // printf("line %d\n", __LINE__);
+    // fflush(stdout);
+
+  } else {
+    // printf("no decompression----------------------\n");
+
+    if (row_groups.size().first) {
+      // printf("line %d\n", __LINE__);
+      // fflush(stdout);
+      chunks.host_to_device_async(_stream);
+      row_groups.host_to_device_async(_stream);
+      gpu::ParseRowGroupIndex(row_groups.base_device_ptr(),
+                              nullptr,
+                              chunks.base_device_ptr(),
+                              num_columns,
+                              num_stripes,
+                              _metadata.get_row_index_stride(),
+                              level == 0,
+                              _stream);
+    }
+  }
+
+  // printf("line %d\n", __LINE__);
+  // fflush(stdout);
+
+  {
+    _stream.synchronize();
+    auto peak_mem = mem_stats_logger.peak_memory_usage();
+    std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "("
+              << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl;
+  }
+
+  // TODO: do not clear but reset each one.
+  // and only reset if the new size/type are different.
+  _out_buffers[level].clear();
+
+  {
+    _stream.synchronize();
+    auto peak_mem = mem_stats_logger.peak_memory_usage();
+    std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "("
+              << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl;
+  }
+
+  for (std::size_t i = 0; i < column_types.size(); ++i) {
+    bool is_nullable = false;
+    for (std::size_t j = 0; j < num_stripes; ++j) {
+      if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) {
+        printf("   is nullable\n");
+        is_nullable = true;
+        break;
+      }
+    }
+    auto is_list_type = (column_types[i].id() == type_id::LIST);
+    auto n_rows       = (level == 0) ?
rows_to_read : col_meta.num_child_rows[i]; + + // printf(" create col, num rows: %d\n", (int)n_rows); + + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + + // For list column, offset column will be always size + 1 + if (is_list_type) n_rows++; + _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr); + + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", buffer size: " << n_rows + << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + } + + // printf("line %d\n", __LINE__); + // fflush(stdout); + + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + + decode_stream_data(num_dict_entries, + rows_to_skip, + _metadata.get_row_index_stride(), + level, + tz_table->view(), + chunks, + row_groups, + _out_buffers[level], + _stream, + _mr); + + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + + // printf("line %d\n", __LINE__); + // fflush(stdout); + + if (nested_cols.size()) { + printf("have nested col\n"); + + // Extract information to process nested child columns + scan_null_counts(chunks, null_count_prefix_sums[level], _stream); + + row_groups.device_to_host_sync(_stream); + aggregate_child_meta(stripe_start, + level, + _selected_columns, + chunks, + row_groups, + nested_cols, + _out_buffers[level], + col_meta); + + // ORC stores number of elements at each row, so we need to generate offsets from that + std::vector buff_data; + std::for_each( + _out_buffers[level].begin(), _out_buffers[level].end(), [&buff_data](auto& out_buffer) { + if (out_buffer.type.id() == type_id::LIST) { + auto data = static_cast(out_buffer.data()); + buff_data.emplace_back(list_buffer_data{data, out_buffer.size}); + } + }); + + if (not buff_data.empty()) { generate_offsets_for_list(buff_data, _stream); } + } + + // printf("line %d\n", __LINE__); + // fflush(stdout); + } // end loop level + + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + + std::vector> out_columns; + _out_metadata = get_meta_with_user_data(); + std::transform( + _selected_columns.levels[0].begin(), + _selected_columns.levels[0].end(), + std::back_inserter(out_columns), + [&](auto const& orc_col_meta) { + _out_metadata.schema_info.emplace_back(""); + auto col_buffer = assemble_buffer( + orc_col_meta.id, 0, *_col_meta, _metadata, _selected_columns, _out_buffers, _stream, _mr); + return make_column(col_buffer, &_out_metadata.schema_info.back(), std::nullopt, _stream); + }); + _chunk_read_data.decoded_table = std::make_unique
(std::move(out_columns)); + + // TODO: do not clear but reset each one. + // and only reset if the new size/type are different. + // This clear is just to check if there is memory leak. + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + _out_buffers[level].clear(); + + auto& stripe_data = lvl_stripe_data[level]; + + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + stripe_data[stripe_start - load_stripe_start] = {}; + } else { + for (int64_t i = 0; i < stripe_chunk.count; ++i) { + stripe_data[i + stripe_start - load_stripe_start] = {}; + } + } + } + + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } + + // printf("col: \n"); + // cudf::test::print(_chunk_read_data.decoded_table->get_column(0).view()); + + // DEBUG only + // _chunk_read_data.output_size_limit = _chunk_read_data.data_read_limit / 3; + + _chunk_read_data.curr_output_table_chunk = 0; + _chunk_read_data.output_table_chunks = + _chunk_read_data.output_size_limit == 0 + ? std::vector{chunk{0, _chunk_read_data.decoded_table->num_rows()}} + : find_table_splits(_chunk_read_data.decoded_table->view(), + _chunk_read_data.output_row_granularity, + _chunk_read_data.output_size_limit, + _stream); + + auto& splits = _chunk_read_data.output_table_chunks; + printf("------------\nSplits decoded table (/total num rows = %d): \n", + (int)_chunk_read_data.decoded_table->num_rows()); + for (size_t idx = 0; idx < splits.size(); idx++) { + printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count); + } + fflush(stdout); + + { + _stream.synchronize(); + auto peak_mem = mem_stats_logger.peak_memory_usage(); + std::cout << "decomp and decode, peak_memory_usage: " << peak_mem << "(" + << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + } +} + +} // namespace cudf::io::orc::detail From c44f0ec03c6338664e77388a4ba1f908e41669de Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 5 Mar 2024 16:53:07 -0800 Subject: [PATCH 169/321] Change comment and docs Signed-off-by: Nghia Truong --- cpp/benchmarks/CMakeLists.txt | 1 + cpp/include/cudf/io/detail/orc.hpp | 27 +++++++++++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 5e7b13331a1..516338febca 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -255,6 +255,7 @@ ConfigureNVBench( # ################################################################################################## # * orc reader benchmark -------------------------------------------------------------------------- +# TODO: add back the removed file, and add new file ConfigureNVBench(ORC_READER_NVBENCH io/orc/orc_reader_input.cpp) # ################################################################################################## diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index c6176021a79..83035b32e04 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -79,10 +79,10 @@ class reader { }; /** - * @brief The reader class that supports iterative reading of a given file. + * @brief The reader class that supports iterative reading from an array of data sources. * * This class intentionally subclasses the `reader` class with private inheritance to hide the - * `reader::read()` API. 
As such, only chunked reading APIs are supported. + * base class `reader::read()` API. As such, only chunked reading APIs are supported through it. */ class chunked_reader : private reader { public: @@ -98,18 +98,25 @@ class chunked_reader : private reader { * * ``` * - * If `output_size_limit == 0` (i.e., no reading limit), a call to `read_chunk()` will read the - * whole file and return a table containing all rows. + * If `output_size_limit == 0` (i.e., no output limit) and `data_read_limit == 0` (no temporary + * memory size limit), a call to `read_chunk()` will read the whole data source and return a table + * containing all rows. * - * TODO: data read limit - * TODO: granularity + * The `output_size_limit` parameter controls the size of the output table to be returned per + * `read_chunk()` call. If the user specifies a 100 MB limit, the reader will attempt to return + * tables that have a total bytes size (over all columns) of 100 MB or less. + * This is a soft limit and the code will not fail if it cannot satisfy the limit. * - * @param output_size_limit Limit on total number of bytes to be returned per read, + * The `data_read_limit` parameter controls how much temporary memory is used in the entire + * process of loading, decompressing and decoding of data. Again, this is also a soft limit and + * the reader will try to make the best effort. + * + * @param output_size_limit Limit on total number of bytes to be returned per `read_chunk()` call, + * or `0` if there is no limit + * @param data_read_limit Limit on temporary memory usage for reading the data sources, * or `0` if there is no limit - * @param data_read_limit Limit on memory usage for the purposes of decompression and processing - * of input, or `0` if there is no limit * @param sources Input `datasource` objects to read the dataset from - * @param options Settings for controlling reading behavior + * @param options Settings for controlling reading behaviors * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ From b842118ad940521bf5f6c9ef1803c6e15faec212 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 6 Mar 2024 14:50:33 -0800 Subject: [PATCH 170/321] Add error check for `output_row_granularity` Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/io/orc/reader.cu b/cpp/src/io/orc/reader.cu index 4d285e6788d..79bcaae25f1 100644 --- a/cpp/src/io/orc/reader.cu +++ b/cpp/src/io/orc/reader.cu @@ -57,6 +57,7 @@ chunked_reader::chunked_reader(std::size_t output_size_limit, rmm::mr::device_memory_resource* mr) : reader() // TODO { + CUDF_EXPECTS(output_row_granularity > 0, "Invalid value of `output_row_granularity`."); _impl = std::make_unique(output_size_limit, data_read_limit, output_row_granularity, From 248f0ef382add0323b166137d88650fb9fca4bde Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 6 Mar 2024 15:12:14 -0800 Subject: [PATCH 171/321] Update docs Signed-off-by: Nghia Truong --- cpp/include/cudf/io/detail/orc.hpp | 36 +++++++++++++----------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index 83035b32e04..32fcafc1923 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -111,10 +111,18 @@ class chunked_reader : private reader { * process of loading, decompressing and decoding of data. 
Again, this is also a soft limit and * the reader will try to make the best effort. * + * Finally, the parameter `output_row_granularity` controls the changes in row number of the + * output chunk. For each call to `read_chunk()`, with respect to the given `data_read_limit`, a + * subset of stripes may be loaded, decompressed and decoded into an intermediate table. The + * reader will then subdivide that table into smaller tables for final output using + * `output_row_granularity` as the subdivision step. + * * @param output_size_limit Limit on total number of bytes to be returned per `read_chunk()` call, * or `0` if there is no limit * @param data_read_limit Limit on temporary memory usage for reading the data sources, * or `0` if there is no limit + * @param output_row_granularity The granularity parameter used for subdividing the decoded + * table for final output * @param sources Input `datasource` objects to read the dataset from * @param options Settings for controlling reading behaviors * @param stream CUDA stream used for device memory operations and kernel launches @@ -122,42 +130,28 @@ class chunked_reader : private reader { */ explicit chunked_reader(std::size_t output_size_limit, std::size_t data_read_limit, + size_type output_row_granularity, std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); - /** * @brief Constructor from size limits and an array of data sources with reader options. * - * The typical usage should be similar to this: - * ``` - * do { - * auto const chunk = reader.read_chunk(); - * // Process chunk - * } while (reader.has_next()); + * This constructor implicitly call the other constructor with `output_row_granularity` set to + * 10'000 rows. * - * ``` - * - * If `output_size_limit == 0` (i.e., no reading limit), a call to `read_chunk()` will read the - * whole file and return a table containing all rows. 
- * - * TODO: data read limit - * TODO: granularity - * - * @param output_size_limit Limit on total number of bytes to be returned per read, + * @param output_size_limit Limit on total number of bytes to be returned per `read_chunk()` call, + * or `0` if there is no limit + * @param data_read_limit Limit on temporary memory usage for reading the data sources, * or `0` if there is no limit - * @param data_read_limit Limit on memory usage for the purposes of decompression and processing - * of input, or `0` if there is no limit - * @param output_row_granularity TODO * @param sources Input `datasource` objects to read the dataset from - * @param options Settings for controlling reading behavior + * @param options Settings for controlling reading behaviors * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit chunked_reader(std::size_t output_size_limit, std::size_t data_read_limit, - size_type output_row_granularity, std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, From 497eea5f5bab5e1047e5aafe5c7c91c041296ff9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 6 Mar 2024 15:33:54 -0800 Subject: [PATCH 172/321] Update docs Signed-off-by: Nghia Truong --- cpp/include/cudf/io/detail/orc.hpp | 59 ++-------------- cpp/include/cudf/io/orc.hpp | 110 +++++++++++++++++------------ cpp/src/io/functions.cpp | 34 ++++++--- 3 files changed, 95 insertions(+), 108 deletions(-) diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index 32fcafc1923..d532cee5677 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -87,46 +87,9 @@ class reader { class chunked_reader : private reader { public: /** - * @brief Constructor from size limits and an array of data sources with reader options. - * - * The typical usage should be similar to this: - * ``` - * do { - * auto const chunk = reader.read_chunk(); - * // Process chunk - * } while (reader.has_next()); - * - * ``` - * - * If `output_size_limit == 0` (i.e., no output limit) and `data_read_limit == 0` (no temporary - * memory size limit), a call to `read_chunk()` will read the whole data source and return a table - * containing all rows. - * - * The `output_size_limit` parameter controls the size of the output table to be returned per - * `read_chunk()` call. If the user specifies a 100 MB limit, the reader will attempt to return - * tables that have a total bytes size (over all columns) of 100 MB or less. - * This is a soft limit and the code will not fail if it cannot satisfy the limit. - * - * The `data_read_limit` parameter controls how much temporary memory is used in the entire - * process of loading, decompressing and decoding of data. Again, this is also a soft limit and - * the reader will try to make the best effort. - * - * Finally, the parameter `output_row_granularity` controls the changes in row number of the - * output chunk. For each call to `read_chunk()`, with respect to the given `data_read_limit`, a - * subset of stripes may be loaded, decompressed and decoded into an intermediate table. The - * reader will then subdivide that table into smaller tables for final output using - * `output_row_granularity` as the subdivision step. 
- * - * @param output_size_limit Limit on total number of bytes to be returned per `read_chunk()` call, - * or `0` if there is no limit - * @param data_read_limit Limit on temporary memory usage for reading the data sources, - * or `0` if there is no limit - * @param output_row_granularity The granularity parameter used for subdividing the decoded - * table for final output - * @param sources Input `datasource` objects to read the dataset from - * @param options Settings for controlling reading behaviors - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation + * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t, size_type, + * std::vector>&&, orc_reader_options const&, + * rmm::cuda_stream_view, rmm::mr::device_memory_resource*) */ explicit chunked_reader(std::size_t output_size_limit, std::size_t data_read_limit, @@ -136,19 +99,9 @@ class chunked_reader : private reader { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** - * @brief Constructor from size limits and an array of data sources with reader options. - * - * This constructor implicitly call the other constructor with `output_row_granularity` set to - * 10'000 rows. - * - * @param output_size_limit Limit on total number of bytes to be returned per `read_chunk()` call, - * or `0` if there is no limit - * @param data_read_limit Limit on temporary memory usage for reading the data sources, - * or `0` if there is no limit - * @param sources Input `datasource` objects to read the dataset from - * @param options Settings for controlling reading behaviors - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation + * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t, + * std::vector>&&, orc_reader_options const&, + * rmm::cuda_stream_view, rmm::mr::device_memory_resource*) */ explicit chunked_reader(std::size_t output_size_limit, std::size_t data_read_limit, diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 19252e77b91..129541be156 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -424,73 +424,95 @@ class chunked_orc_reader { chunked_orc_reader() = default; /** - * @brief Constructor for chunked reader. + * @brief Constructor from size limits and an array of data sources with reader options. * - * This constructor requires the same `orc_reader_option` parameter as in - * `cudf::read_orc()`, and additional parameters to specify the size byte limits of the - * output table for each reading. + * The typical usage should be similar to this: + * ``` + * do { + * auto const chunk = reader.read_chunk(); + * // Process chunk + * } while (reader.has_next()); * - * TODO: data read limit + * ``` * - * @param output_size_limit Limit on total number of bytes to be returned per read, + * If `output_size_limit == 0` (i.e., no output limit) and `data_read_limit == 0` (no temporary + * memory size limit), a call to `read_chunk()` will read the whole data source and return a table + * containing all rows. + * + * The `output_size_limit` parameter controls the size of the output table to be returned per + * `read_chunk()` call. If the user specifies a 100 MB limit, the reader will attempt to return + * tables that have a total bytes size (over all columns) of 100 MB or less. 
+ * This is a soft limit and the code will not fail if it cannot satisfy the limit.
+ *
+ * The `data_read_limit` parameter controls how much temporary memory is used in the entire
+ * process of loading, decompressing and decoding of data. Again, this is also a soft limit and
+ * the reader will try to make the best effort.
+ *
+ * Finally, the parameter `output_row_granularity` controls the changes in row number of the
+ * output chunk. For each call to `read_chunk()`, with respect to the given `data_read_limit`, a
+ * subset of stripes may be loaded, decompressed and decoded into an intermediate table. The
+ * reader will then subdivide that table into smaller tables for final output using
+ * `output_row_granularity` as the subdivision step.
+ *
+ * @param output_size_limit Limit on total number of bytes to be returned per `read_chunk()` call,
+ *        or `0` if there is no limit
+ * @param data_read_limit Limit on temporary memory usage for reading the data sources,
+ *        or `0` if there is no limit
+ * @param output_row_granularity The granularity parameter used for subdividing the decoded
+ *        table for final output
+ * @param options Settings for controlling reading behaviors
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource to use for device memory allocation
+ */
-  chunked_orc_reader(std::size_t output_size_limit,
-                     orc_reader_options const& options,
-                     rmm::cuda_stream_view stream = cudf::get_default_stream(),
-                     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  explicit chunked_orc_reader(
+    std::size_t output_size_limit,
+    std::size_t data_read_limit,
+    size_type output_row_granularity,
+    orc_reader_options const& options,
+    rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

   /**
-   * @brief Constructor for chunked reader.
+   * @brief Constructor from size limits and an array of data sources with reader options.
    *
-   * This constructor requires the same `orc_reader_option` parameter as in
-   * `cudf::read_orc()`, and additional parameters to specify the size byte limits of the
-   * output table for each reading.
+   * This constructor implicitly calls the other constructor with `output_row_granularity` set to
+   * 10'000 rows.
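+   *
+   * A rough usage sketch (the limit values below are illustrative examples only,
+   * not recommendations):
+   * ```
+   * auto reader = cudf::io::chunked_orc_reader(
+   *   600 * 1024 * 1024, 512 * 1024 * 1024, options);  // 600 MB output, 512 MB temporary
+   * do {
+   *   auto chunk = reader.read_chunk();
+   *   // Process chunk...
+   * } while (reader.has_next());
+   * ```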
   *
-   * TODO: data read limit
-   *
-   * @param output_size_limit Limit on total number of bytes to be returned per read,
+   * @param output_size_limit Limit on total number of bytes to be returned per `read_chunk()` call,
+   *        or `0` if there is no limit
+   * @param data_read_limit Limit on temporary memory usage for reading the data sources,
    *        or `0` if there is no limit
-   * @param data_read_limit Limit on memory usage for the purposes of decompression and processing
-   *        of input, or `0` if there is no limit
-   * @param options The options used to read Parquet file
+   * @param options Settings for controlling reading behaviors
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource to use for device memory allocation
    */
-  chunked_orc_reader(std::size_t output_size_limit,
-                     std::size_t data_read_limit,
-                     orc_reader_options const& options,
-                     rmm::cuda_stream_view stream = cudf::get_default_stream(),
-                     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  explicit chunked_orc_reader(
+    std::size_t output_size_limit,
+    std::size_t data_read_limit,
+    orc_reader_options const& options,
+    rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

   /**
-   * @brief Constructor for chunked reader.
+   * @brief Constructor from output size limit and an array of data sources with reader options.
    *
-   * This constructor requires the same `orc_reader_option` parameter as in
-   * `cudf::read_orc()`, and additional parameters to specify the size byte limits of the
-   * output table for each reading.
+   * This constructor implicitly calls the other constructor with `data_read_limit` set to `0` and
+   * `output_row_granularity` set to 10'000 rows.
    *
-   * TODO: data read limit
-   *
-   * @param output_size_limit Limit on total number of bytes to be returned per read,
+   * @param output_size_limit Limit on total number of bytes to be returned per `read_chunk()` call,
    *        or `0` if there is no limit
-   * @param data_read_limit Limit on memory usage for the purposes of decompression and processing
-   *        of input, or `0` if there is no limit
-   * @param output_row_granularity TODO
-   * @param options The options used to read Parquet file
+   * @param options Settings for controlling reading behaviors
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource to use for device memory allocation
    */
-  chunked_orc_reader(std::size_t output_size_limit,
-                     std::size_t data_read_limit,
-                     size_type output_row_granularity,
-                     orc_reader_options const& options,
-                     rmm::cuda_stream_view stream = cudf::get_default_stream(),
-                     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
+  explicit chunked_orc_reader(
+    std::size_t output_size_limit,
+    orc_reader_options const& options,
+    rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
   /**
    * @brief Destructor, destroying the internal reader instance.
* diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 04799fabeef..378a37ce859 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -449,23 +449,19 @@ void write_orc(orc_writer_options const& options, rmm::cuda_stream_view stream) } /** - * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader + * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t, size_type, + * std::vector>&&, orc_reader_options const&, + * rmm::cuda_stream_view, rmm::mr::device_memory_resource*) */ -chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : chunked_orc_reader(output_size_limit, 0UL, options, stream, mr) -{ -} - chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, std::size_t data_read_limit, + size_type output_row_granularity, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) : reader{std::make_unique(output_size_limit, data_read_limit, + output_row_granularity, make_datasources(options.get_source()), options, stream, @@ -473,15 +469,18 @@ chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, { } +/** + * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t, + * std::vector>&&, orc_reader_options const&, + * rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + */ chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, std::size_t data_read_limit, - size_type output_row_granularity, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) : reader{std::make_unique(output_size_limit, data_read_limit, - output_row_granularity, make_datasources(options.get_source()), options, stream, @@ -489,6 +488,19 @@ chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, { } +/** + * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, + * std::vector>&&, orc_reader_options const&, + * rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + */ +chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : chunked_orc_reader(output_size_limit, 0UL, options, stream, mr) +{ +} + /** * @copydoc cudf::io::chunked_orc_reader::~chunked_orc_reader */ From 33aff9412329c08a17ed06e842fff32be78d3f67 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 6 Mar 2024 16:41:14 -0800 Subject: [PATCH 173/321] Cleanup and change docs Signed-off-by: Nghia Truong --- cpp/include/cudf/io/orc.hpp | 10 +++----- cpp/src/io/functions.cpp | 28 ++++------------------- cpp/src/io/orc/aggregate_orc_metadata.cpp | 12 ++++------ cpp/src/io/orc/reader.cu | 4 ++-- cpp/src/io/utilities/row_selection.cpp | 14 ++++++------ 5 files changed, 20 insertions(+), 48 deletions(-) diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 129541be156..99580bd9886 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -406,7 +406,7 @@ table_with_metadata read_orc( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief The chunked orc reader class to read ORC file iteratively in to a series of + * @brief The chunked orc reader class to read ORC file iteratively into a series of * tables, chunk by chunk. 
 *
 * This class is designed to address the reading issue when reading very large ORC files such
@@ -515,10 +515,6 @@ class chunked_orc_reader {
     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

   /**
    * @brief Destructor, destroying the internal reader instance.
-   *
-   * Since the declaration of the internal `reader` object does not exist in this header, this
-   * destructor needs to be defined in a separate source file which can access to that object's
-   * declaration.
    */
   ~chunked_orc_reader();

@@ -1179,7 +1175,7 @@ class chunked_orc_writer_options {
    */
   void set_stripe_size_bytes(size_t size_bytes)
   {
-    // CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size");
+    CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size");
     _stripe_size_bytes = size_bytes;
   }

@@ -1195,7 +1191,7 @@ class chunked_orc_writer_options {
    */
   void set_stripe_size_rows(size_type size_rows)
   {
-    // CUDF_EXPECTS(size_rows >= 512, "maximum stripe size cannot be smaller than 512");
+    CUDF_EXPECTS(size_rows >= 512, "maximum stripe size cannot be smaller than 512");
     _stripe_size_rows = size_rows;
   }

diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 378a37ce859..e8dbf97abd6 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -448,11 +448,6 @@ void write_orc(orc_writer_options const& options, rmm::cuda_stream_view stream)
   }
 }

-/**
- * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t, size_type,
- *          std::vector>&&, orc_reader_options const&,
- *          rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
- */
 chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit,
                                        std::size_t data_read_limit,
                                        size_type output_row_granularity,
@@ -469,11 +464,6 @@ chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit,
 {
 }

-/**
- * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t,
- *          std::vector>&&, orc_reader_options const&,
- *          rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
- */
 chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit,
                                        std::size_t data_read_limit,
                                        orc_reader_options const& options,
@@ -488,6 +478,6 @@ chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit,
 {
 }

-/**
- * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t,
- *          std::vector>&&, orc_reader_options const&,
- *          rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
- */
 chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit,
                                        orc_reader_options const& options,
                                        rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
   : chunked_orc_reader(output_size_limit, 0UL, options, stream, mr)
 {
 }

-/**
- * @copydoc cudf::io::chunked_orc_reader::~chunked_orc_reader
- */
+// This destructor destroys the internal reader instance.
+// Since the declaration of the internal `reader` object does not exist in the header, this
+// destructor needs to be defined in a separate source file which can access that object's
+// declaration.
chunked_orc_reader::~chunked_orc_reader() = default; -/** - * @copydoc cudf::io::chunked_orc_reader::has_next - */ bool chunked_orc_reader::has_next() const { CUDF_FUNC_RANGE(); @@ -516,9 +499,6 @@ bool chunked_orc_reader::has_next() const return reader->has_next(); } -/** - * @copydoc cudf::io::chunked_orc_reader::read_chunk - */ table_with_metadata chunked_orc_reader::read_chunk() const { CUDF_FUNC_RANGE(); diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 1e9cb50d532..9ec4488cbf2 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -168,10 +168,6 @@ aggregate_orc_metadata::select_stripes( }(); struct stripe_source_mapping { - stripe_source_mapping(int source_idx, std::vector&& stripe_info) - : source_idx(source_idx), stripe_info(std::move(stripe_info)) - { - } int source_idx; std::vector stripe_info; }; @@ -206,8 +202,8 @@ aggregate_orc_metadata::select_stripes( (int)rows_to_read); printf(" stripe to read: %d-%d\n", (int)src_file_idx, (int)stripe_idx); } - selected_stripes_mapping.emplace_back(static_cast(src_file_idx), - std::move(stripe_infos)); + selected_stripes_mapping.emplace_back( + stripe_source_mapping{static_cast(src_file_idx), std::move(stripe_infos)}); } } else { int64_t count = 0; @@ -232,8 +228,8 @@ aggregate_orc_metadata::select_stripes( } } - selected_stripes_mapping.emplace_back(static_cast(src_file_idx), - std::move(stripe_infos)); + selected_stripes_mapping.emplace_back( + stripe_source_mapping{static_cast(src_file_idx), std::move(stripe_infos)}); } // Need to remove skipped rows from the stripes which are not selected. rows_to_skip -= stripe_skip_rows; diff --git a/cpp/src/io/orc/reader.cu b/cpp/src/io/orc/reader.cu index 79bcaae25f1..5ffff3d7d40 100644 --- a/cpp/src/io/orc/reader.cu +++ b/cpp/src/io/orc/reader.cu @@ -42,7 +42,7 @@ chunked_reader::chunked_reader(std::size_t output_size_limit, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : reader() // TODO + : reader() { _impl = std::make_unique( output_size_limit, data_read_limit, std::move(sources), options, stream, mr); @@ -55,7 +55,7 @@ chunked_reader::chunked_reader(std::size_t output_size_limit, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : reader() // TODO + : reader() { CUDF_EXPECTS(output_row_granularity > 0, "Invalid value of `output_row_granularity`."); _impl = std::make_unique(output_size_limit, diff --git a/cpp/src/io/utilities/row_selection.cpp b/cpp/src/io/utilities/row_selection.cpp index f136cd11ff7..d91791b3371 100644 --- a/cpp/src/io/utilities/row_selection.cpp +++ b/cpp/src/io/utilities/row_selection.cpp @@ -26,17 +26,17 @@ namespace cudf::io::detail { std::pair skip_rows_num_rows_from_options( int64_t skip_rows, std::optional const& num_rows, int64_t num_source_rows) { - auto const rows_to_skip = std::min(skip_rows, num_source_rows); + auto const rows_to_skip = std::min(skip_rows, num_source_rows); + auto const num_rows_can_read = num_source_rows - rows_to_skip; + if (not num_rows.has_value()) { - CUDF_EXPECTS(num_source_rows - rows_to_skip <= std::numeric_limits::max(), - "The requested number of rows exceeds the column size limit", + CUDF_EXPECTS(num_rows_can_read <= static_cast(std::numeric_limits::max()), + "The requested number of rows exceeds the column size limit.", std::overflow_error); - return {rows_to_skip, num_source_rows - rows_to_skip}; + return {rows_to_skip, 
static_cast(num_rows_can_read)}; } // Limit the number of rows to the end of the input - return { - rows_to_skip, - static_cast(std::min(num_rows.value(), num_source_rows - rows_to_skip))}; + return {rows_to_skip, std::min(num_rows.value(), static_cast(num_rows_can_read))}; } } // namespace cudf::io::detail From d071f46c4b30941e10bb6c715b08c58030dc8d35 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 6 Mar 2024 19:42:16 -0800 Subject: [PATCH 174/321] Support 64bit size for `rows_to_read` Signed-off-by: Nghia Truong --- cpp/src/io/orc/aggregate_orc_metadata.cpp | 28 ++++++++++++++-------- cpp/src/io/orc/aggregate_orc_metadata.hpp | 2 +- cpp/src/io/orc/reader_impl.cu | 13 ++++++---- cpp/src/io/orc/reader_impl.hpp | 13 ++++++---- cpp/src/io/orc/reader_impl_chunking.cu | 9 ++++++- cpp/src/io/orc/reader_impl_chunking.hpp | 2 +- cpp/src/io/orc/reader_impl_decode.cu | 10 ++++++-- cpp/src/io/parquet/reader_impl_helpers.cpp | 5 ++-- cpp/src/io/utilities/row_selection.cpp | 15 +++++------- cpp/src/io/utilities/row_selection.hpp | 5 ++-- 10 files changed, 65 insertions(+), 37 deletions(-) diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 9ec4488cbf2..0a270877154 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -152,7 +152,7 @@ aggregate_orc_metadata::aggregate_orc_metadata( } } -std::tuple> +std::tuple> aggregate_orc_metadata::select_stripes( std::vector> const& user_specified_stripes, int64_t skip_rows, @@ -163,7 +163,7 @@ aggregate_orc_metadata::select_stripes( "Can't use both the row selection and the stripe selection"); auto [rows_to_skip, rows_to_read] = [&]() { - if (not user_specified_stripes.empty()) { return std::pair{0, 0}; } + if (not user_specified_stripes.empty()) { return std::pair{0, 0}; } return cudf::io::detail::skip_rows_num_rows_from_options(skip_rows, num_rows, get_num_rows()); }(); @@ -194,12 +194,15 @@ aggregate_orc_metadata::select_stripes( nullptr, static_cast(src_file_idx)}); - // TODO: change return type to int64_t - rows_to_read += static_cast( - per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows); - printf(" rows_to_read : %d / %d\n", - (int)per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows, - (int)rows_to_read); + auto const stripe_rows = + per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; + CUDF_EXPECTS(stripe_rows < static_cast(std::numeric_limits::max()), + "The number of rows in one stripe exceeds the column size limit.", + std::overflow_error); + rows_to_read += static_cast(stripe_rows); + + // TODO: remove below + printf(" rows_to_read : %d / %d\n", (int)stripe_rows, (int)rows_to_read); printf(" stripe to read: %d-%d\n", (int)src_file_idx, (int)stripe_idx); } selected_stripes_mapping.emplace_back( @@ -217,8 +220,13 @@ aggregate_orc_metadata::select_stripes( for (size_t stripe_idx = 0; stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size() && count < rows_to_skip + rows_to_read; ++stripe_idx) { - count += - static_cast(per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows); + auto const stripe_rows = + per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; + CUDF_EXPECTS(stripe_rows < static_cast(std::numeric_limits::max()), + "The number of rows in one stripe exceeds the column size limit.", + std::overflow_error); + count += static_cast(stripe_rows); + if (count > rows_to_skip || count == 0) { 
stripe_infos.push_back({&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr, diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp index 62d7a6a5c3f..613c08fb745 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.hpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp @@ -113,7 +113,7 @@ class aggregate_orc_metadata { * * Stripes are potentially selected from multiple files. */ - [[nodiscard]] std::tuple> select_stripes( + [[nodiscard]] std::tuple> select_stripes( std::vector> const& user_specified_stripes, int64_t skip_rows, std::optional const& num_rows, diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index d4ddbea347c..43d3a2d38f8 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -61,7 +61,8 @@ namespace cudf::io::orc::detail { void reader::impl::prepare_data(int64_t skip_rows, std::optional const& num_rows_opt, - std::vector> const& stripes) + std::vector> const& stripes, + read_mode mode) { // Selected columns at different levels of nesting are stored in different elements // of `selected_columns`; thus, size == 1 means no nested columns @@ -73,7 +74,7 @@ void reader::impl::prepare_data(int64_t skip_rows, std::cout << "call global, skip = " << skip_rows << std::endl; - global_preprocess(skip_rows, num_rows_opt, stripes); + global_preprocess(skip_rows, num_rows_opt, stripes, mode); if (!_chunk_read_data.more_table_chunk_to_output()) { if (!_chunk_read_data.more_stripe_to_decode() && _chunk_read_data.more_stripe_to_load()) { @@ -272,14 +273,15 @@ table_with_metadata reader::impl::read(int64_t skip_rows, std::optional const& num_rows_opt, std::vector> const& stripes) { - prepare_data(skip_rows, num_rows_opt, stripes); + prepare_data(skip_rows, num_rows_opt, stripes, read_mode::READ_ALL); return make_output_chunk(); } bool reader::impl::has_next() { printf("==================query has next \n"); - prepare_data(_config.skip_rows, _config.num_read_rows, _config.selected_stripes); + prepare_data( + _config.skip_rows, _config.num_read_rows, _config.selected_stripes, read_mode::CHUNKED_READ); printf("has next: %d\n", (int)_chunk_read_data.has_next()); return _chunk_read_data.has_next(); @@ -313,7 +315,8 @@ table_with_metadata reader::impl::read_chunk() #endif } - prepare_data(_config.skip_rows, _config.num_read_rows, _config.selected_stripes); + prepare_data( + _config.skip_rows, _config.num_read_rows, _config.selected_stripes, read_mode::CHUNKED_READ); { _stream.synchronize(); diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index de1d0ed68f5..48257659ebb 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -122,6 +122,9 @@ class reader::impl { table_with_metadata read_chunk(); private: + // TODO + enum class read_mode { READ_ALL, CHUNKED_READ }; + /** * @brief Perform all the necessary data preprocessing before creating an output table. 
   *
   * @param num_rows_opt Optional number of rows to read, or `std::nullopt` to read all rows
   * @param stripes Indices of individual stripes to load if non-empty
+   * @param mode Value indicating if the data sources are read all at once or chunk by chunk
    */
-  void prepare_data(int64_t skip_rows = 0,
-                    std::optional<int64_t> const& num_rows_opt = std::nullopt,
-                    std::vector<std::vector<size_type>> const& stripes = {});
+  void prepare_data(int64_t skip_rows,
+                    std::optional<int64_t> const& num_rows_opt,
+                    std::vector<std::vector<size_type>> const& stripes,
+                    read_mode mode);

   /**
    * @brief Perform a global preprocessing step that executes exactly once for the entire duration
    */
   void global_preprocess(uint64_t skip_rows,
                          std::optional<int64_t> const& num_rows_opt,
-                         std::vector<std::vector<size_type>> const& stripes);
+                         std::vector<std::vector<size_type>> const& stripes,
+                         read_mode mode);

   /**
    * @brief Load stripes from the input source and store the data in the internal buffers.
diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 944f23e7764..e9b6e92e9cb 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -339,7 +339,8 @@ std::pair get_range(std::vector const& input_chunks,

 void reader::impl::global_preprocess(uint64_t skip_rows,
                                      std::optional<int64_t> const& num_rows_opt,
-                                     std::vector<std::vector<size_type>> const& stripes)
+                                     std::vector<std::vector<size_type>> const& stripes,
+                                     read_mode mode)
 {
   if (_file_itm_data.global_preprocessed) { return; }
   _file_itm_data.global_preprocessed = true;
@@ -350,6 +351,12 @@ void reader::impl::global_preprocess(uint64_t skip_rows,
     _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream);
   if (_file_itm_data.has_no_data()) { return; }

+  CUDF_EXPECTS(
+    mode == read_mode::CHUNKED_READ ||
+      _file_itm_data.rows_to_read <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
+    "Number of rows to read exceeds the column size limit in READ_ALL mode.",
+    std::overflow_error);
+
   printf("input skip rows: %d, num rows: %d\n", (int)skip_rows, (int)num_rows_opt.value_or(-1));
   printf("actual skip rows: %d, num rows: %d\n",
          (int)_file_itm_data.rows_to_skip,
diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index 0769f46f1d1..f67407d3671 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -123,7 +123,7 @@ struct range {
  */
 struct file_intermediate_data {
   int64_t rows_to_skip;
-  size_type rows_to_read;
+  int64_t rows_to_read;
   std::vector selected_stripes;

   // Return true if no rows or stripes to read.
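+
+  // Note: `rows_to_read` is 64-bit because the rows of all selected stripes may
+  // together exceed the `size_type` column size limit; each decoded table chunk
+  // is still checked against that limit before decoding.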
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 4971f65debb..327a3124ebe 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -824,15 +824,17 @@ void reader::impl::decompress_and_decode() auto const& selected_stripes = _file_itm_data.selected_stripes; // auto const rows_to_skip = 0; - auto rows_to_read = 0; + int64_t rows_to_read = 0; for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { auto const& stripe = selected_stripes[stripe_idx]; auto const stripe_info = stripe.stripe_info; + // TODO: this is indeed not needed since we split stripes before this based on stripe row + // TODO: check overflow // CUDF_EXPECTS(per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows < // static_cast(std::numeric_limits::max()), // "TODO"); - rows_to_read += static_cast(stripe_info->numberOfRows); + rows_to_read += static_cast(stripe_info->numberOfRows); if (_file_itm_data.rows_to_skip > 0) { CUDF_EXPECTS(_file_itm_data.rows_to_skip < static_cast(stripe_info->numberOfRows), @@ -842,6 +844,10 @@ void reader::impl::decompress_and_decode() rows_to_read = std::min(rows_to_read - rows_to_skip, _file_itm_data.rows_to_read); _file_itm_data.rows_to_skip = 0; + CUDF_EXPECTS(rows_to_read <= static_cast(std::numeric_limits::max()), + "Number or rows to decode exceeds the column size limit.", + std::overflow_error); + // Set up table for converting timestamp columns from local to UTC time auto const tz_table = [&, &selected_stripes = selected_stripes] { auto const has_timestamp_column = std::any_of( diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 6f11debb8df..7ab6b2cdd26 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -461,8 +461,9 @@ aggregate_reader_metadata::select_row_groups( auto [rows_to_skip, rows_to_read] = [&]() { if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options( - skip_rows_opt, num_rows_opt, get_num_rows()); - return std::pair{static_cast(from_opts.first), from_opts.second}; + skip_rows_opt, std::optional{num_rows_opt.value()}, get_num_rows()); + return std::pair{static_cast(from_opts.first), + static_cast(from_opts.second)}; }(); if (!row_group_indices.empty()) { diff --git a/cpp/src/io/utilities/row_selection.cpp b/cpp/src/io/utilities/row_selection.cpp index d91791b3371..c0bbca39167 100644 --- a/cpp/src/io/utilities/row_selection.cpp +++ b/cpp/src/io/utilities/row_selection.cpp @@ -23,20 +23,17 @@ namespace cudf::io::detail { -std::pair skip_rows_num_rows_from_options( - int64_t skip_rows, std::optional const& num_rows, int64_t num_source_rows) +std::pair skip_rows_num_rows_from_options(int64_t skip_rows, + std::optional const& num_rows, + int64_t num_source_rows) { auto const rows_to_skip = std::min(skip_rows, num_source_rows); auto const num_rows_can_read = num_source_rows - rows_to_skip; - if (not num_rows.has_value()) { - CUDF_EXPECTS(num_rows_can_read <= static_cast(std::numeric_limits::max()), - "The requested number of rows exceeds the column size limit.", - std::overflow_error); - return {rows_to_skip, static_cast(num_rows_can_read)}; - } + if (not num_rows.has_value()) { return {rows_to_skip, num_rows_can_read}; } + // Limit the number of rows to the end of the input - return {rows_to_skip, std::min(num_rows.value(), static_cast(num_rows_can_read))}; + return 
{rows_to_skip, std::min(num_rows.value(), num_rows_can_read)}; } } // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/row_selection.hpp b/cpp/src/io/utilities/row_selection.hpp index 0b5d3aef8bd..7fdcc65d77b 100644 --- a/cpp/src/io/utilities/row_selection.hpp +++ b/cpp/src/io/utilities/row_selection.hpp @@ -34,7 +34,8 @@ namespace cudf::io::detail { * * @throw std::overflow_exception The requested number of rows exceeds the column size limit */ -std::pair skip_rows_num_rows_from_options( - int64_t skip_rows, std::optional const& num_rows, int64_t num_source_rows); +std::pair skip_rows_num_rows_from_options(int64_t skip_rows, + std::optional const& num_rows, + int64_t num_source_rows); } // namespace cudf::io::detail From 388adb3985ce7b94e55b22679c57585ab3956979 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 6 Mar 2024 20:37:20 -0800 Subject: [PATCH 175/321] Implement `cumulative_size_and_row` Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 61 +++++++++++++++++-------- cpp/src/io/orc/reader_impl_chunking.hpp | 18 ++++++-- cpp/src/io/orc/reader_impl_decode.cu | 2 +- 3 files changed, 59 insertions(+), 22 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index e9b6e92e9cb..2bd98ce7c42 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -191,9 +191,8 @@ std::size_t gather_stream_info_and_column_desc( * @brief Find the splits of the input data such that each split has cumulative size less than a * given `size_limit`. */ -std::vector find_splits(host_span sizes, - int64_t total_count, - size_t size_limit) +template +std::vector find_splits(host_span sizes, int64_t total_count, size_t size_limit) { // if (size_limit == 0) { // printf("0 limit: output chunk = 0, %d\n", (int)total_count); @@ -251,6 +250,12 @@ std::vector find_splits(host_span sizes, return splits; } + +template std::vector find_splits(host_span sizes, + int64_t total_count, + size_t size_limit); +template std::vector find_splits( + host_span sizes, int64_t total_count, size_t size_limit); #endif namespace { @@ -528,7 +533,8 @@ void reader::impl::global_preprocess(uint64_t skip_rows, chunk_read_data::load_limit_ratio); return tmp > 0UL ? tmp : 1UL; }(); - _chunk_read_data.load_stripe_chunks = find_splits(total_stripe_sizes, num_stripes, load_limit); + _chunk_read_data.load_stripe_chunks = + find_splits(total_stripe_sizes, num_stripes, load_limit); #ifndef PRINT_DEBUG auto& splits = _chunk_read_data.load_stripe_chunks; @@ -557,11 +563,10 @@ void reader::impl::load_data() if (_file_itm_data.has_no_data()) { return; } // auto const rows_to_read = _file_itm_data.rows_to_read; - // auto const& selected_stripes = _file_itm_data.selected_stripes; - - auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; - auto& lvl_stripe_sizes = _file_itm_data.lvl_stripe_sizes; - auto& read_info = _file_itm_data.data_read_info; + auto const& selected_stripes = _file_itm_data.selected_stripes; + auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; + auto& lvl_stripe_sizes = _file_itm_data.lvl_stripe_sizes; + auto& read_info = _file_itm_data.data_read_info; // std::size_t num_stripes = selected_stripes.size(); auto const stripe_chunk = @@ -625,8 +630,17 @@ void reader::impl::load_data() // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. 
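+  // This map ties each stream to its parsed compression info, so that the
+  // decompressed sizes computed for the size estimation below can be reused
+  // later when decoding.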
stream_id_map stream_compinfo_map; - cudf::detail::hostdevice_vector stripe_decomp_sizes(stripe_chunk.count, _stream); - std::fill(stripe_decomp_sizes.begin(), stripe_decomp_sizes.end(), cumulative_size{1, 0}); + cudf::detail::hostdevice_vector stripe_decomp_sizes(stripe_chunk.count, + _stream); + for (int64_t stripe_idx = 0; stripe_idx < stripe_chunk.count; ++stripe_idx) { + auto const& stripe = selected_stripes[stripe_idx]; + auto const stripe_info = stripe.stripe_info; + + stripe_decomp_sizes[stripe_idx] = cumulative_size_and_row{1, 0, stripe_info->numberOfRows}; + // printf("loading stripe with rows = %d\n", (int)stripe_info->numberOfRows); + } + // std::fill( + // stripe_decomp_sizes.begin(), stripe_decomp_sizes.end(), cumulative_size_and_row{1, 0, 0}); // Parse the decompressed sizes for each stripe. for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { @@ -735,6 +749,14 @@ void reader::impl::load_data() return; } + { + int count{0}; + for (auto& size : stripe_decomp_sizes) { + printf("decomp stripe size: %ld, %zu, %zu\n", size.count, size.size_bytes, size.rows); + if (count++ > 5) break; + } + } + // Compute the prefix sum of stripe data sizes. stripe_decomp_sizes.host_to_device_async(_stream); thrust::inclusive_scan(rmm::exec_policy(_stream), @@ -745,23 +767,26 @@ void reader::impl::load_data() stripe_decomp_sizes.device_to_host_sync(_stream); + { + int count{0}; + for (auto& size : stripe_decomp_sizes) { + printf( + "prefix sum decomp stripe size: %ld, %zu, %zu\n", size.count, size.size_bytes, size.rows); + if (count++ > 5) break; + } + } + auto const decode_limit = [&] { auto const tmp = static_cast(_chunk_read_data.data_read_limit * (1.0 - chunk_read_data::load_limit_ratio)); return tmp > 0UL ? tmp : 1UL; }(); _chunk_read_data.decode_stripe_chunks = - find_splits(stripe_decomp_sizes, stripe_chunk.count, decode_limit); + find_splits(stripe_decomp_sizes, stripe_chunk.count, decode_limit); for (auto& chunk : _chunk_read_data.decode_stripe_chunks) { chunk.start_idx += stripe_chunk.start_idx; } - int count{0}; - for (auto& size : stripe_decomp_sizes) { - printf("decomp size: %ld, %zu\n", size.count, size.size_bytes); - if (count++ > 5) break; - } - #ifndef PRINT_DEBUG auto& splits = _chunk_read_data.decode_stripe_chunks; printf("------------\nSplits decode_stripe_chunks (/%d): \n", (int)stripe_chunk.count); diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index f67407d3671..61b27ff7c54 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -252,6 +252,13 @@ struct cumulative_size { std::size_t size_bytes{0}; }; +// TODO +struct cumulative_size_and_row { + int64_t count{0}; + std::size_t size_bytes{0}; + std::size_t rows{0}; +}; + /** * @brief Functor to sum up cumulative sizes. */ @@ -260,15 +267,20 @@ struct cumulative_size_sum { { return cumulative_size{a.count + b.count, a.size_bytes + b.size_bytes}; } + + __device__ cumulative_size_and_row operator()(cumulative_size_and_row const& a, + cumulative_size_and_row const& b) const + { + return cumulative_size_and_row{a.count + b.count, a.size_bytes + b.size_bytes, a.rows + b.rows}; + } }; /** * @brief Find the splits of the input data such that each split has cumulative size less than a * given `size_limit`. 
*/ -std::vector find_splits(host_span sizes, - int64_t total_count, - size_t size_limit); +template +std::vector find_splits(host_span sizes, int64_t total_count, size_t size_limit); // TODO std::pair get_range(std::vector const& input_chunks, diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 327a3124ebe..a0abf546a22 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -799,7 +799,7 @@ std::vector find_table_splits(table_view const& input, segmented_sizes.device_to_host_sync(stream); // Since the segment sizes are in bits, we need to multiply CHAR_BIT with the output limit. - return find_splits(segmented_sizes, input.num_rows(), size_limit * CHAR_BIT); + return find_splits(segmented_sizes, input.num_rows(), size_limit * CHAR_BIT); } } // namespace From 7e451abafa68b5e9baebc3e49e8d7595c108c07a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 6 Mar 2024 21:36:40 -0800 Subject: [PATCH 176/321] Split if num rows exceeds size limit Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 2bd98ce7c42..2a6324da676 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -205,6 +205,8 @@ std::vector find_splits(host_span sizes, int64_t total_count, si int64_t cur_pos{0}; size_t cur_cumulative_size{0}; + [[maybe_unused]] size_t cur_cumulative_rows{0}; + auto const start = thrust::make_transform_iterator( sizes.begin(), [&](auto const& size) { return size.size_bytes - cur_cumulative_size; }); auto const end = start + static_cast(sizes.size()); @@ -220,13 +222,20 @@ std::vector find_splits(host_span sizes, int64_t total_count, si split_pos--; } + if constexpr (std::is_same_v) { + while (split_pos > 0 && sizes[split_pos].rows - cur_cumulative_rows > + static_cast(std::numeric_limits::max())) { + split_pos--; + } + } + // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in // a loop because all of the cumulative sizes for all the pages are sorted into one big list. // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in // the list twice. so we have to iterate until we skip past all of them. The idea is that we // either do this, or we have to call unique() on the input first. 
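+    // Note on the relaxed condition below: the row-limit walk-back added above may
+    // leave `split_pos` at an entry whose `count` is smaller than `cur_count`, so
+    // the advance condition uses `<=` instead of `==`.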
while (split_pos < (static_cast(sizes.size()) - 1) && - (split_pos < 0 || sizes[split_pos].count == cur_count)) { + (split_pos < 0 || sizes[split_pos].count <= cur_count)) { split_pos++; } @@ -235,6 +244,10 @@ std::vector find_splits(host_span sizes, int64_t total_count, si splits.emplace_back(chunk{start_idx, static_cast(cur_count - start_idx)}); cur_pos = split_pos; cur_cumulative_size = sizes[split_pos].size_bytes; + + if constexpr (std::is_same_v) { + cur_cumulative_rows = sizes[split_pos].rows; + } } // If the last chunk has size smaller than `merge_threshold` percent of the second last one, From 758e2d0f81fd2060fb2e3046efa5e9432edf71d4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 7 Mar 2024 09:03:21 -0800 Subject: [PATCH 177/321] Add test Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cu | 111 +++++++++++++++--------- 1 file changed, 72 insertions(+), 39 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 2857b82d415..7ef9d72d348 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1342,15 +1342,11 @@ TEST_F(OrcChunkedReaderInputLimitTest, ReadWithRowSelection) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view()); } -#define LOCAL_TEST - -// This test is extremely heavy, thus it should be disabled by default. -#ifdef LOCAL_TEST TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) { int64_t constexpr num_rows = 500'000'000l; int constexpr rows_per_stripe = 1'000'000; - int constexpr num_reps = 10l; + int constexpr num_reps = 10; int64_t constexpr total_rows = num_rows * num_reps; static_assert(total_rows > std::numeric_limits::max()); @@ -1371,46 +1367,83 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) } } + printf("size: %zu\n", data_buffer.size()); + // Verify metadata. auto const metadata = cudf::io::read_orc_metadata(cudf::io::source_info{data_buffer.data(), data_buffer.size()}); EXPECT_EQ(metadata.num_rows(), total_rows); EXPECT_EQ(metadata.num_stripes(), total_rows / rows_per_stripe); - int constexpr num_rows_to_read = 5'000'000; - const auto num_rows_to_skip = metadata.num_rows() - num_rows_to_read - - 123456 /*just shift the read data region back by a random offset*/; - - // Check validity of the last 5 million rows. - const auto sequence_start = num_rows_to_skip % num_rows; - auto const skipped_col = int64s_col(it + sequence_start, it + sequence_start + num_rows_to_read); - auto const expected = cudf::table_view{{skipped_col}}; - - auto const read_opts = cudf::io::orc_reader_options::builder( - cudf::io::source_info{data_buffer.data(), data_buffer.size()}) - .use_index(false) - .skip_rows(num_rows_to_skip) - .num_rows(num_rows_to_read) - .build(); - auto reader = cudf::io::chunked_orc_reader( - 600'000UL * sizeof(int64_t) /*output limit, equal to 600k int64_t rows */, - 8'000'000UL /*input limit, around size of 1 stripe's decoded data */, - 500'000 /*output granularity, or minimum number of rows for the output chunk*/, - read_opts); - - auto num_chunks = 0; - auto read_tables = std::vector>{}; - auto tviews = std::vector{}; + // Read with row selections and memory limit. + { + int constexpr num_rows_to_read = 5'000'000; + const auto num_rows_to_skip = + metadata.num_rows() - num_rows_to_read - + 123456 /*just shift the read data region back by a random offset*/; + + // Check validity of the last 5 million rows. 
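+    // The file written above repeats the same `num_rows`-long sequence `num_reps` times, so
+    // the expected value at any absolute row is recovered by wrapping the skip offset back
+    // into [0, num_rows) -- which is what `sequence_start` below computes.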
+ const auto sequence_start = num_rows_to_skip % num_rows; + auto const skipped_col = + int64s_col(it + sequence_start, it + sequence_start + num_rows_to_read); + auto const expected = cudf::table_view{{skipped_col}}; + + auto const read_opts = cudf::io::orc_reader_options::builder( + cudf::io::source_info{data_buffer.data(), data_buffer.size()}) + .use_index(false) + .skip_rows(num_rows_to_skip) + .num_rows(num_rows_to_read) + .build(); + auto reader = cudf::io::chunked_orc_reader( + 600'000UL * sizeof(int64_t) /* output limit, equal to 600k int64_t rows */, + 8'000'000UL /* input limit, around size of 1 stripe's decoded data */, + 500'000 /* output granularity, or minimum number of rows for the output chunk */, + read_opts); + + auto num_chunks = 0; + auto read_tables = std::vector>{}; + auto tviews = std::vector{}; + + do { + auto chunk = reader.read_chunk(); + ++num_chunks; + tviews.emplace_back(chunk.tbl->view()); + read_tables.emplace_back(std::move(chunk.tbl)); + } while (reader.has_next()); + + auto const read_result = cudf::concatenate(tviews); + EXPECT_EQ(num_chunks, 10); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view()); + } + + // Read with only output limit. + // There is no limit on the memory usage. + // However, the reader should be able to detect and load only enough stripes each time + // to avoid decoding a table having number of rows that exceeds the column size limit. + { + auto const read_opts = cudf::io::orc_reader_options::builder( + cudf::io::source_info{data_buffer.data(), data_buffer.size()}) + .use_index(false) + .build(); + auto reader = cudf::io::chunked_orc_reader( + 600'000UL * sizeof(int64_t) /* output limit, equal to 600k int64_t rows */, + 0UL /* no input limit */, + 500'000 /* output granularity, or minimum number of rows for the output chunk */, + read_opts); + + auto num_chunks = 0; + auto read_tables = std::vector>{}; + auto tviews = std::vector{}; + + do { + auto chunk = reader.read_chunk(); + ++num_chunks; + tviews.emplace_back(chunk.tbl->view()); + read_tables.emplace_back(std::move(chunk.tbl)); + } while (reader.has_next()); - do { - auto chunk = reader.read_chunk(); - ++num_chunks; - tviews.emplace_back(chunk.tbl->view()); - read_tables.emplace_back(std::move(chunk.tbl)); - } while (reader.has_next()); + EXPECT_EQ(num_chunks, 10); - auto const read_result = cudf::concatenate(tviews); - EXPECT_EQ(num_chunks, 10); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view()); + // Verify only the last chunk. 
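+    // One possible shape for that verification (a sketch only; assumes the final chunk does
+    // not straddle a boundary between two repetitions of the written sequence):
+    //   auto const& last      = tviews.back();
+    //   auto const last_start = (total_rows - last.num_rows()) % num_rows;
+    //   CUDF_TEST_EXPECT_COLUMNS_EQUAL(
+    //     int64s_col(it + last_start, it + last_start + last.num_rows()), last.column(0));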
+ } } -#endif From 5de81792ffb18d84ca6f59efcfb9e7153cc0cd3f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 7 Mar 2024 09:27:21 -0800 Subject: [PATCH 178/321] Changing skip and num rows Signed-off-by: Nghia Truong --- cpp/include/cudf/io/orc.hpp | 21 ++++++++++----------- cpp/src/io/orc/aggregate_orc_metadata.cpp | 4 ---- cpp/src/io/orc/reader_impl.cu | 4 ++-- cpp/src/io/orc/reader_impl.hpp | 12 ++++++------ cpp/src/io/orc/reader_impl_chunking.cu | 15 ++++++++------- cpp/tests/io/orc_chunked_reader_test.cu | 3 ++- 6 files changed, 28 insertions(+), 31 deletions(-) diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 99580bd9886..8bf5baef97b 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -57,10 +57,10 @@ class orc_reader_options { // List of individual stripes to read (ignored if empty) std::vector> _stripes; - // Rows to skip from the start; ORC stores the number of rows as uint64_t - uint64_t _skip_rows = 0; + // Rows to skip from the start + int64_t _skip_rows = 0; // Rows to read; `nullopt` is all - std::optional _num_rows; + std::optional _num_rows; // Whether to use row index to speed-up reading bool _use_index = true; @@ -124,7 +124,7 @@ class orc_reader_options { * * @return Number of rows to skip from the start */ - uint64_t get_skip_rows() const { return _skip_rows; } + int64_t get_skip_rows() const { return _skip_rows; } /** * @brief Returns number of row to read. @@ -132,7 +132,7 @@ class orc_reader_options { * @return Number of rows to read; `nullopt` if the option hasn't been set (in which case the file * is read until the end) */ - std::optional const& get_num_rows() const { return _num_rows; } + std::optional const& get_num_rows() const { return _num_rows; } /** * @brief Whether to use row index to speed-up reading. @@ -197,10 +197,9 @@ class orc_reader_options { * @throw cudf::logic_error if a negative value is passed * @throw cudf::logic_error if stripes have been previously set */ - void set_skip_rows(uint64_t rows) + void set_skip_rows(int64_t rows) { CUDF_EXPECTS(rows == 0 or _stripes.empty(), "Can't set both skip_rows along with stripes"); - CUDF_EXPECTS(rows <= std::numeric_limits::max(), "skip_rows is too large"); _skip_rows = rows; } @@ -212,7 +211,7 @@ class orc_reader_options { * @throw cudf::logic_error if a negative value is passed * @throw cudf::logic_error if stripes have been previously set */ - void set_num_rows(size_type nrows) + void set_num_rows(int64_t nrows) { CUDF_EXPECTS(nrows >= 0, "num_rows cannot be negative"); CUDF_EXPECTS(_stripes.empty(), "Can't set both num_rows and stripes"); @@ -270,7 +269,7 @@ class orc_reader_options_builder { * * @param src The source information used to read orc file */ - explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {}; + explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {} /** * @brief Sets names of the column to read. 
@@ -302,7 +301,7 @@ class orc_reader_options_builder { * @param rows Number of rows * @return this for chaining */ - orc_reader_options_builder& skip_rows(uint64_t rows) + orc_reader_options_builder& skip_rows(int64_t rows) { options.set_skip_rows(rows); return *this; @@ -314,7 +313,7 @@ class orc_reader_options_builder { * @param nrows Number of rows * @return this for chaining */ - orc_reader_options_builder& num_rows(size_type nrows) + orc_reader_options_builder& num_rows(int64_t nrows) { options.set_num_rows(nrows); return *this; diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 0a270877154..15049b8b732 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -200,10 +200,6 @@ aggregate_orc_metadata::select_stripes( "The number of rows in one stripe exceeds the column size limit.", std::overflow_error); rows_to_read += static_cast(stripe_rows); - - // TODO: remove below - printf(" rows_to_read : %d / %d\n", (int)stripe_rows, (int)rows_to_read); - printf(" stripe to read: %d-%d\n", (int)src_file_idx, (int)stripe_idx); } selected_stripes_mapping.emplace_back( stripe_source_mapping{static_cast(src_file_idx), std::move(stripe_infos)}); diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 43d3a2d38f8..e36d00a1fa6 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -60,7 +60,7 @@ namespace cudf::io::orc::detail { void reader::impl::prepare_data(int64_t skip_rows, - std::optional const& num_rows_opt, + std::optional const& num_rows_opt, std::vector> const& stripes, read_mode mode) { @@ -270,7 +270,7 @@ reader::impl::impl(std::size_t output_size_limit, } table_with_metadata reader::impl::read(int64_t skip_rows, - std::optional const& num_rows_opt, + std::optional const& num_rows_opt, std::vector> const& stripes) { prepare_data(skip_rows, num_rows_opt, stripes, read_mode::READ_ALL); diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 48257659ebb..853055f50ed 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -108,7 +108,7 @@ class reader::impl { * @return The set of columns along with metadata */ table_with_metadata read(int64_t skip_rows, - std::optional const& num_rows_opt, + std::optional const& num_rows_opt, std::vector> const& stripes); /** @@ -136,7 +136,7 @@ class reader::impl { * @param stripes Indices of individual stripes to load if non-empty */ void prepare_data(int64_t skip_rows, - std::optional const& num_rows_opt, + std::optional const& num_rows_opt, std::vector> const& stripes, read_mode mode); @@ -152,8 +152,8 @@ class reader::impl { * stripes for reading each time using the `load_data()` step. This is to ensure that loading * these stripes will not exceed a fixed portion the data read limit. */ - void global_preprocess(uint64_t skip_rows, - std::optional const& num_rows_opt, + void global_preprocess(int64_t skip_rows, + std::optional const& num_rows_opt, std::vector> const& stripes, read_mode mode); @@ -210,8 +210,8 @@ class reader::impl { std::vector decimal128_columns; // control decimals conversion // User specified reading rows/stripes selection. 
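  // Note on the types below: signed 64-bit (rather than size_type or uint64_t) presumably
  // lets the reader address inputs with more than 2^31 - 1 rows while keeping subtraction in
  // the row-clamping code well behaved; it mirrors the option types changed in orc.hpp above.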
- uint64_t const skip_rows; - std::optional num_read_rows; + int64_t const skip_rows; + std::optional num_read_rows; std::vector> const selected_stripes; } const _config; diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 2a6324da676..340b85b83e3 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -355,8 +355,8 @@ std::pair get_range(std::vector const& input_chunks, return {begin, end}; } -void reader::impl::global_preprocess(uint64_t skip_rows, - std::optional const& num_rows_opt, +void reader::impl::global_preprocess(int64_t skip_rows, + std::optional const& num_rows_opt, std::vector> const& stripes, read_mode mode) { @@ -375,10 +375,10 @@ void reader::impl::global_preprocess(uint64_t skip_rows, "Number or rows to read exceeds the column size limit in READ_ALL mode.", std::overflow_error); - printf("input skip rows: %d, num rows: %d\n", (int)skip_rows, (int)num_rows_opt.value_or(-1)); - printf("actual skip rows: %d, num rows: %d\n", - (int)_file_itm_data.rows_to_skip, - (int)_file_itm_data.rows_to_read); + printf("input skip rows: %lu, num rows: %lu\n", skip_rows, num_rows_opt.value_or(-1)); + printf("actual skip rows: %lu, num rows: %lu\n", + _file_itm_data.rows_to_skip, + _file_itm_data.rows_to_read); // auto const rows_to_skip = _file_itm_data.rows_to_skip; // auto const rows_to_read = _file_itm_data.rows_to_read; @@ -508,7 +508,8 @@ void reader::impl::global_preprocess(uint64_t skip_rows, _chunk_read_data.curr_load_stripe_chunk = 0; // Load all chunks if there is no read limit. - if (_chunk_read_data.data_read_limit == 0) { + if (_chunk_read_data.data_read_limit == 0 && + _file_itm_data.rows_to_read < static_cast(std::numeric_limits::max())) { printf("0 limit: output load stripe chunk = 0, %d\n", (int)num_stripes); _chunk_read_data.load_stripe_chunks = {chunk{0, static_cast(num_stripes)}}; return; diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 7ef9d72d348..30cf854b349 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1376,7 +1376,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) EXPECT_EQ(metadata.num_stripes(), total_rows / rows_per_stripe); // Read with row selections and memory limit. - { + if (0) { int constexpr num_rows_to_read = 5'000'000; const auto num_rows_to_skip = metadata.num_rows() - num_rows_to_read - @@ -1416,6 +1416,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view()); } + if (1) // Read with only output limit. // There is no limit on the memory usage. 
// However, the reader should be able to detect and load only enough stripes each time From 31f6b6d2c5b6708a2c986108d67705c64bab3e9c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 7 Mar 2024 11:28:16 -0800 Subject: [PATCH 179/321] Fix test Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 24 ++++++++++++++++++------ cpp/tests/io/orc_chunked_reader_test.cu | 18 ++++++++++-------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 340b85b83e3..9c4cc7cecd4 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -375,8 +375,8 @@ void reader::impl::global_preprocess(int64_t skip_rows, "Number or rows to read exceeds the column size limit in READ_ALL mode.", std::overflow_error); - printf("input skip rows: %lu, num rows: %lu\n", skip_rows, num_rows_opt.value_or(-1)); - printf("actual skip rows: %lu, num rows: %lu\n", + printf("input skip rows: %ld, num rows: %ld\n", skip_rows, num_rows_opt.value_or(-1l)); + printf("actual skip rows: %ld, num rows: %ld\n", _file_itm_data.rows_to_skip, _file_itm_data.rows_to_read); @@ -508,8 +508,7 @@ void reader::impl::global_preprocess(int64_t skip_rows, _chunk_read_data.curr_load_stripe_chunk = 0; // Load all chunks if there is no read limit. - if (_chunk_read_data.data_read_limit == 0 && - _file_itm_data.rows_to_read < static_cast(std::numeric_limits::max())) { + if (_chunk_read_data.data_read_limit == 0) { printf("0 limit: output load stripe chunk = 0, %d\n", (int)num_stripes); _chunk_read_data.load_stripe_chunks = {chunk{0, static_cast(num_stripes)}}; return; @@ -541,7 +540,7 @@ void reader::impl::global_preprocess(int64_t skip_rows, if (count > 5) break; } - // TODO: handle case for extremely large files. + // If `data_read_limit` is too small, make sure not to pass 0 byte limit to compute splits. auto const load_limit = [&] { auto const tmp = static_cast(_chunk_read_data.data_read_limit * chunk_read_data::load_limit_ratio); @@ -758,11 +757,18 @@ void reader::impl::load_data() // That is because the estimated `max_uncompressed_size` of stream data from // `ParseCompressedStripeData` is just the approximate of the maximum possible size, not the // actual size, which can be much smaller in practice. - if (_chunk_read_data.data_read_limit == 0) { + + // TODO: docs on handle size overflow + if (_chunk_read_data.data_read_limit == 0 && + _file_itm_data.rows_to_read < static_cast(std::numeric_limits::max())) { + printf("0 limit: output decode stripe chunk unchanged\n"); _chunk_read_data.decode_stripe_chunks = {stripe_chunk}; return; } + // TODO: remove + if (_chunk_read_data.data_read_limit == 0) { printf("0 limit but size overflow\n"); } + { int count{0}; for (auto& size : stripe_decomp_sizes) { @@ -791,6 +797,12 @@ void reader::impl::load_data() } auto const decode_limit = [&] { + // In this case, we have no read limit but have to split due to having large input in which + // the number of rows exceed column size limit. + // We will split based on row number, not data size. + if (_chunk_read_data.data_read_limit == 0) { return std::numeric_limits::max(); } + + // If `data_read_limit` is too small, make sure not to pass 0 byte limit to compute splits. auto const tmp = static_cast(_chunk_read_data.data_read_limit * (1.0 - chunk_read_data::load_limit_ratio)); return tmp > 0UL ? 
tmp : 1UL; diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 30cf854b349..4d4b029d211 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1350,8 +1350,11 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) int64_t constexpr total_rows = num_rows * num_reps; static_assert(total_rows > std::numeric_limits::max()); - auto const it = thrust::make_counting_iterator(int64_t{0}); - auto const col = int64s_col(it, it + num_rows); + using data_col = cudf::test::fixed_width_column_wrapper; + + auto const it = + cudf::detail::make_counting_transform_iterator(0l, [](int64_t i) { return i % 123456789l; }); + auto const col = data_col(it, it + num_rows); auto const chunk_table = cudf::table_view{{col}}; std::vector data_buffer; @@ -1384,9 +1387,8 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) // Check validity of the last 5 million rows. const auto sequence_start = num_rows_to_skip % num_rows; - auto const skipped_col = - int64s_col(it + sequence_start, it + sequence_start + num_rows_to_read); - auto const expected = cudf::table_view{{skipped_col}}; + auto const skipped_col = data_col(it + sequence_start, it + sequence_start + num_rows_to_read); + auto const expected = cudf::table_view{{skipped_col}}; auto const read_opts = cudf::io::orc_reader_options::builder( cudf::io::source_info{data_buffer.data(), data_buffer.size()}) @@ -1395,8 +1397,8 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) .num_rows(num_rows_to_read) .build(); auto reader = cudf::io::chunked_orc_reader( - 600'000UL * sizeof(int64_t) /* output limit, equal to 600k int64_t rows */, - 8'000'000UL /* input limit, around size of 1 stripe's decoded data */, + 600'000UL * sizeof(int32_t) /* output limit, equal to 600k int32_t rows */, + 4'000'000UL /* input limit, around size of 1 stripe's decoded data */, 500'000 /* output granularity, or minimum number of rows for the output chunk */, read_opts); @@ -1427,7 +1429,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) .use_index(false) .build(); auto reader = cudf::io::chunked_orc_reader( - 600'000UL * sizeof(int64_t) /* output limit, equal to 600k int64_t rows */, + 600'000UL * sizeof(int32_t) /* output limit, equal to 600k int64_t rows */, 0UL /* no input limit */, 500'000 /* output granularity, or minimum number of rows for the output chunk */, read_opts); From 07a095a1267d0ed1e47f1397c3c73a0175b45416 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 7 Mar 2024 12:53:01 -0800 Subject: [PATCH 180/321] Fix skip rows and num rows --- cpp/src/io/orc/reader_impl.cu | 7 ++++++- cpp/src/io/orc/reader_impl_chunking.cu | 1 + cpp/src/io/orc/reader_impl_chunking.hpp | 4 ++++ cpp/src/io/orc/reader_impl_decode.cu | 23 +++++++++++++++++------ cpp/tests/io/orc_chunked_reader_test.cu | 22 ++++++++++++---------- 5 files changed, 40 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index e36d00a1fa6..0048a6fd1b9 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -110,7 +110,7 @@ table_with_metadata reader::impl::make_output_chunk() if (_selected_columns.num_levels() == 0) { return {std::make_unique
<table>(), table_metadata{}}; }

   // If no rows or stripes to read, return empty columns
-  if (_file_itm_data.has_no_data() || !_chunk_read_data.more_table_chunk_to_output()) {
+  if (!_chunk_read_data.more_table_chunk_to_output()) {
     printf("has no next\n");
@@ -159,6 +159,11 @@ table_with_metadata reader::impl::make_output_chunk()
               << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl;
   }
 
+  // If this is the last slice, we also delete the decoded_table to free up memory.
+  if (!_chunk_read_data.more_table_chunk_to_output()) {
+    _chunk_read_data.decoded_table.reset(nullptr);
+  }
+
   return std::make_unique<table>
(out_tview, _stream, _mr); }(); diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 9c4cc7cecd4..5157425ea44 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -760,6 +760,7 @@ void reader::impl::load_data() // TODO: docs on handle size overflow if (_chunk_read_data.data_read_limit == 0 && + // TODO: rows_to_read is changed every decode, should we change this? _file_itm_data.rows_to_read < static_cast(std::numeric_limits::max())) { printf("0 limit: output decode stripe chunk unchanged\n"); _chunk_read_data.decode_stripe_chunks = {stripe_chunk}; diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 61b27ff7c54..0aef5285ecf 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -240,6 +240,10 @@ struct chunk_read_data { // Only has more chunk to output if: bool has_next() const { + printf("compute has_next: %d, %d, %d\n", + (int)more_stripe_to_load(), + (int)more_stripe_to_decode(), + (int)more_table_chunk_to_output()); return more_stripe_to_load() || more_stripe_to_decode() || more_table_chunk_to_output(); } }; diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index a0abf546a22..ede4a4b107a 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -783,11 +783,16 @@ std::vector find_table_splits(table_view const& input, return cumulative_size{current_length, static_cast(size)}; }); - // TODO: remove: - segmented_sizes.device_to_host_sync(stream); - printf("total row sizes by segment = %d:\n", (int)segment_length); - for (auto& size : segmented_sizes) { - printf("size: %ld, %zu\n", size.count, size.size_bytes / CHAR_BIT); + { + int count{0}; + // TODO: remove: + segmented_sizes.device_to_host_sync(stream); + printf("total row sizes by segment = %d:\n", (int)segment_length); + for (auto& size : segmented_sizes) { + printf("size: %ld, %zu\n", size.count, size.size_bytes / CHAR_BIT); + if (count > 5) break; + ++count; + } } // TODO: exec_policy_nosync @@ -841,8 +846,14 @@ void reader::impl::decompress_and_decode() "TODO"); } } - rows_to_read = std::min(rows_to_read - rows_to_skip, _file_itm_data.rows_to_read); + rows_to_read = std::min(rows_to_read, _file_itm_data.rows_to_read) - rows_to_skip; + CUDF_EXPECTS(rows_to_read > 0, "Invalid rows_to_read computation."); + + // rows_to_read -= rows_to_skip; _file_itm_data.rows_to_skip = 0; + _file_itm_data.rows_to_read -= rows_to_read; + + printf("decode, skip = %ld, read = %ld\n", rows_to_skip, rows_to_read); CUDF_EXPECTS(rows_to_read <= static_cast(std::numeric_limits::max()), "Number or rows to decode exceeds the column size limit.", diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 4d4b029d211..95437b9e8ca 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1344,16 +1344,18 @@ TEST_F(OrcChunkedReaderInputLimitTest, ReadWithRowSelection) TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) { + using data_type = int16_t; + using data_col = cudf::test::fixed_width_column_wrapper; + int64_t constexpr num_rows = 500'000'000l; int constexpr rows_per_stripe = 1'000'000; int constexpr num_reps = 10; int64_t constexpr total_rows = num_rows * num_reps; static_assert(total_rows > std::numeric_limits::max()); - using data_col = cudf::test::fixed_width_column_wrapper; - - auto const it = - 
cudf::detail::make_counting_transform_iterator(0l, [](int64_t i) { return i % 123456789l; }); + auto const it = cudf::detail::make_counting_transform_iterator(0l, [](int64_t i) { + return i % static_cast(std::numeric_limits::max() / 2); + }); auto const col = data_col(it, it + num_rows); auto const chunk_table = cudf::table_view{{col}}; @@ -1379,7 +1381,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) EXPECT_EQ(metadata.num_stripes(), total_rows / rows_per_stripe); // Read with row selections and memory limit. - if (0) { + if (1) { int constexpr num_rows_to_read = 5'000'000; const auto num_rows_to_skip = metadata.num_rows() - num_rows_to_read - @@ -1397,9 +1399,9 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) .num_rows(num_rows_to_read) .build(); auto reader = cudf::io::chunked_orc_reader( - 600'000UL * sizeof(int32_t) /* output limit, equal to 600k int32_t rows */, - 4'000'000UL /* input limit, around size of 1 stripe's decoded data */, - 500'000 /* output granularity, or minimum number of rows for the output chunk */, + 600'000UL * sizeof(data_type) /* output limit, equal to 600k rows */, + rows_per_stripe * sizeof(data_type) /* input limit, around size of 1 stripe's decoded data */, + rows_per_stripe / 2 /* output granularity, or minimum number of rows for the output chunk */, read_opts); auto num_chunks = 0; @@ -1414,11 +1416,11 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) } while (reader.has_next()); auto const read_result = cudf::concatenate(tviews); - EXPECT_EQ(num_chunks, 10); + EXPECT_EQ(num_chunks, 11); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view()); } - if (1) + if (0) // Read with only output limit. // There is no limit on the memory usage. // However, the reader should be able to detect and load only enough stripes each time From 6a6061ab74a88fba1f2fee6609ad16885490d79d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 7 Mar 2024 13:26:37 -0800 Subject: [PATCH 181/321] Add test --- cpp/tests/io/orc_chunked_reader_test.cu | 66 ++++++++++++++++--------- 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 95437b9e8ca..43f0fb07a0c 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1353,10 +1353,10 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) int64_t constexpr total_rows = num_rows * num_reps; static_assert(total_rows > std::numeric_limits::max()); - auto const it = cudf::detail::make_counting_transform_iterator(0l, [](int64_t i) { - return i % static_cast(std::numeric_limits::max() / 2); + auto const it = cudf::detail::make_counting_transform_iterator(0l, [num_rows](int64_t i) { + return (i % num_rows) % static_cast(std::numeric_limits::max() / 2); }); - auto const col = data_col(it, it + num_rows); + auto const col = data_col(it, it + num_rows); auto const chunk_table = cudf::table_view{{col}}; std::vector data_buffer; @@ -1372,7 +1372,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) } } - printf("size: %zu\n", data_buffer.size()); + printf("buffer size: %zu\n", data_buffer.size()); // Verify metadata. auto const metadata = @@ -1381,14 +1381,14 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) EXPECT_EQ(metadata.num_stripes(), total_rows / rows_per_stripe); // Read with row selections and memory limit. 
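  // Rough numbers behind this test (derived from the constants above): 500M rows per
  // repetition x 10 repetitions = 5 billion rows, well past the ~2.147 billion size_type
  // limit, written as 5000 stripes of 1M rows each.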
- if (1) { - int constexpr num_rows_to_read = 5'000'000; - const auto num_rows_to_skip = - metadata.num_rows() - num_rows_to_read - - 123456 /*just shift the read data region back by a random offset*/; + { + int64_t constexpr num_rows_to_read = 5'000'000l; + int64_t const num_rows_to_skip = + static_cast(metadata.num_rows()) - num_rows_to_read - + 123456l /*just shift the read data region back by a random offset*/; // Check validity of the last 5 million rows. - const auto sequence_start = num_rows_to_skip % num_rows; + auto const sequence_start = num_rows_to_skip % num_rows; auto const skipped_col = data_col(it + sequence_start, it + sequence_start + num_rows_to_read); auto const expected = cudf::table_view{{skipped_col}}; @@ -1420,9 +1420,9 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view()); } - if (0) - // Read with only output limit. - // There is no limit on the memory usage. +#define LOCAL_TEST +#ifdef LOCAL_TEST + // Read with only output limit -- there is no limit on the memory usage. // However, the reader should be able to detect and load only enough stripes each time // to avoid decoding a table having number of rows that exceeds the column size limit. { @@ -1431,24 +1431,44 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) .use_index(false) .build(); auto reader = cudf::io::chunked_orc_reader( - 600'000UL * sizeof(int32_t) /* output limit, equal to 600k int64_t rows */, + static_cast(rows_per_stripe * 5.7) * + sizeof(data_type) /* output limit, equal to 5.2M rows */, 0UL /* no input limit */, - 500'000 /* output granularity, or minimum number of rows for the output chunk */, + rows_per_stripe / 2 /* output granularity, or minimum number of rows for the output chunk */, read_opts); - auto num_chunks = 0; - auto read_tables = std::vector>{}; - auto tviews = std::vector{}; + int num_chunks = 0; + int64_t num_read_rows = 0; + int64_t test_rows_start = 0; + auto test_chunk = std::unique_ptr{}; do { - auto chunk = reader.read_chunk(); + auto chunk = reader.read_chunk(); + auto const chunk_rows = chunk.tbl->num_rows(); + + // Just randomly select one output chunk to verify. + if (num_chunks == 123) { + test_rows_start = num_read_rows; + test_chunk = std::move(chunk.tbl); + } + ++num_chunks; - tviews.emplace_back(chunk.tbl->view()); - read_tables.emplace_back(std::move(chunk.tbl)); + num_read_rows += chunk_rows; } while (reader.has_next()); - EXPECT_EQ(num_chunks, 10); + EXPECT_EQ(num_read_rows, total_rows); + + // Typically, we got a chunk having 5M rows. + // However, since the reader internally splits file stripes that are not multiple of 5 stripes, + // we may have some extra chunks that have less than 5M rows. + EXPECT_EQ(num_chunks, 1002); - // Verify only the last chunk. + // Verify the selected chunk. 
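+    // Where 1002 presumably comes from: a ~5.7M-row output limit admits 5 full 1M-row
+    // stripes per decode, so 5000 stripes give ~1000 chunks, plus a couple of short
+    // remainder chunks where a load boundary is not a multiple of 5 stripes.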
+ using namespace cudf::test::iterators; + auto const skipped_col = + data_col(it + test_rows_start, it + test_rows_start + test_chunk->num_rows(), no_nulls()); + auto const expected = cudf::table_view{{skipped_col}}; + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, test_chunk->view()); } +#endif // LOCAL_TEST } From d8c7c449f5025a959372bde0dbab1f25f1429a90 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 7 Mar 2024 14:09:58 -0800 Subject: [PATCH 182/321] Fix a bug --- cpp/src/io/orc/reader_impl_decode.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index ede4a4b107a..cab80235ea3 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -846,8 +846,8 @@ void reader::impl::decompress_and_decode() "TODO"); } } - rows_to_read = std::min(rows_to_read, _file_itm_data.rows_to_read) - rows_to_skip; - CUDF_EXPECTS(rows_to_read > 0, "Invalid rows_to_read computation."); + CUDF_EXPECTS(rows_to_read > rows_to_skip, "Invalid rows_to_read computation."); + rows_to_read = std::min(rows_to_read - rows_to_skip, _file_itm_data.rows_to_read); // rows_to_read -= rows_to_skip; _file_itm_data.rows_to_skip = 0; From c33ebce5c7520c4a643bcbd02e0780c10f412247 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 7 Mar 2024 14:56:50 -0800 Subject: [PATCH 183/321] Fix return order bug --- cpp/src/io/orc/reader_impl.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 0048a6fd1b9..51d14b739e3 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -159,12 +159,14 @@ table_with_metadata reader::impl::make_output_chunk() << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } + auto output = std::make_unique
<table>(out_tview, _stream, _mr);
+
   // If this is the last slice, we also delete the decoded_table to free up memory.
   if (!_chunk_read_data.more_table_chunk_to_output()) {
     _chunk_read_data.decoded_table.reset(nullptr);
   }
 
-  return std::make_unique<table>
(out_tview, _stream, _mr); + return output; }(); #endif From e7958cc078380b7350a9c6be717d7a0dc027ddf0 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 7 Mar 2024 14:58:15 -0800 Subject: [PATCH 184/321] Change local test Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 43f0fb07a0c..cded7a300de 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1420,7 +1420,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view()); } -#define LOCAL_TEST +// #define LOCAL_TEST #ifdef LOCAL_TEST // Read with only output limit -- there is no limit on the memory usage. // However, the reader should be able to detect and load only enough stripes each time @@ -1470,5 +1470,8 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) auto const expected = cudf::table_view{{skipped_col}}; CUDF_TEST_EXPECT_TABLES_EQUAL(expected, test_chunk->view()); } + + printf("done local test\n"); + fflush(stdout); #endif // LOCAL_TEST } From 52951875ba414e0ee45ff3233324158908a03e28 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 7 Mar 2024 16:29:40 -0800 Subject: [PATCH 185/321] Add changes in `hostdevice_vector.hpp` ahead of time Signed-off-by: Nghia Truong --- cpp/src/io/utilities/hostdevice_vector.hpp | 24 ++++++++-------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index c3e2c4cc8e2..af1591b709a 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -26,13 +26,9 @@ #include #include -#include +#include #include -#include - -#include - namespace cudf::detail { /** @@ -64,7 +60,6 @@ class hostdevice_vector { h_data.reserve(max_size); h_data.resize(initial_size); - current_size = initial_size; d_data.resize(max_size, stream); } @@ -72,20 +67,14 @@ class hostdevice_vector { { CUDF_EXPECTS(size() < capacity(), "Cannot insert data into hostdevice_vector because capacity has been exceeded."); - h_data[current_size++] = data; + h_data.push_back(data); } [[nodiscard]] size_t capacity() const noexcept { return d_data.size(); } - [[nodiscard]] size_t size() const noexcept { return current_size; } + [[nodiscard]] size_t size() const noexcept { return h_data.size(); } [[nodiscard]] size_t size_bytes() const noexcept { return sizeof(T) * size(); } [[nodiscard]] bool empty() const noexcept { return size() == 0; } - [[nodiscard]] T& front() { return h_data.front(); } - [[nodiscard]] T const& front() const { return front(); } - - [[nodiscard]] T& back() { return h_data.back(); } - [[nodiscard]] T const& back() const { return back(); } - [[nodiscard]] T& operator[](size_t i) { return h_data[i]; } [[nodiscard]] T const& operator[](size_t i) const { return h_data[i]; } @@ -98,6 +87,12 @@ class hostdevice_vector { [[nodiscard]] T* end() { return host_ptr(size()); } [[nodiscard]] T const* end() const { return host_ptr(size()); } + [[nodiscard]] T& front() { return h_data.front(); } + [[nodiscard]] T const& front() const { return front(); } + + [[nodiscard]] T& back() { return h_data.back(); } + [[nodiscard]] T const& back() const { return back(); } + [[nodiscard]] T* device_ptr(size_t offset = 0) { return d_data.data() + offset; } [[nodiscard]] T const* device_ptr(size_t 
offset = 0) const { return d_data.data() + offset; } @@ -181,7 +176,6 @@ class hostdevice_vector { private: cudf::detail::rmm_host_vector h_data; - size_t current_size = 0; rmm::device_uvector d_data; }; From fe2f55eea9bab2728eab1cb39664d5bbd4892198 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 7 Mar 2024 19:41:54 -0800 Subject: [PATCH 186/321] Fix style Signed-off-by: Nghia Truong --- cpp/benchmarks/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 516338febca..caf3b35b629 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -255,7 +255,7 @@ ConfigureNVBench( # ################################################################################################## # * orc reader benchmark -------------------------------------------------------------------------- -# TODO: add back the removed file, and add new file +# TODO: add back the removed file, and add new file ConfigureNVBench(ORC_READER_NVBENCH io/orc/orc_reader_input.cpp) # ################################################################################################## From 0ced9f43d38ae70df80cfe0c41724399d9f72f76 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 7 Mar 2024 21:27:31 -0800 Subject: [PATCH 187/321] Fix doxygen Signed-off-by: Nghia Truong --- cpp/include/cudf/io/detail/orc.hpp | 10 ++++++---- cpp/include/cudf/io/orc.hpp | 19 +++++++++---------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index d532cee5677..8cc41bd5057 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -88,8 +88,9 @@ class chunked_reader : private reader { public: /** * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t, size_type, - * std::vector>&&, orc_reader_options const&, - * rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + * orc_reader_options const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + * + * @param sources Input `datasource` objects to read the dataset from */ explicit chunked_reader(std::size_t output_size_limit, std::size_t data_read_limit, @@ -100,8 +101,9 @@ class chunked_reader : private reader { rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t, - * std::vector>&&, orc_reader_options const&, - * rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + * orc_reader_options const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + * + * @param sources Input `datasource` objects to read the dataset from */ explicit chunked_reader(std::size_t output_size_limit, std::size_t data_read_limit, diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 8bf5baef97b..259c5c1016a 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -423,7 +423,8 @@ class chunked_orc_reader { chunked_orc_reader() = default; /** - * @brief Constructor from size limits and an array of data sources with reader options. + * @brief Construct the reader from input/output size limits, output row granularity, along with + * other ORC reader options. 
 *
 * The typical usage should be similar to this:
 * ```
@@ -459,7 +460,6 @@ class chunked_orc_reader {
   * or `0` if there is no limit
   * @param output_row_granularity The granularity parameter used for subdividing the decoded
   * table for final output
-   * @param sources Input `datasource` objects to read the dataset from
   * @param options Settings for controlling reading behaviors
   * @param stream CUDA stream used for device memory operations and kernel launches
   * @param mr Device memory resource to use for device memory allocation
@@ -473,7 +473,7 @@ class chunked_orc_reader {
     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

  /**
-   * @brief Constructor from size limits and an array of data sources with reader options.
+   * @brief Construct the reader from input/output size limits along with other ORC reader options.
   *
   * This constructor implicitly calls the other constructor with `output_row_granularity` set to
   * 10'000 rows.
@@ -482,7 +482,6 @@ class chunked_orc_reader {
   * or `0` if there is no limit
   * @param data_read_limit Limit on temporary memory usage for reading the data sources,
   * or `0` if there is no limit
-   * @param sources Input `datasource` objects to read the dataset from
   * @param options Settings for controlling reading behaviors
   * @param stream CUDA stream used for device memory operations and kernel launches
   * @param mr Device memory resource to use for device memory allocation
@@ -495,14 +494,13 @@ class chunked_orc_reader {

  /**
-   * @brief Constructor from output size limit and an array of data sources with reader options.
+   * @brief Construct the reader from output size limits along with other ORC reader options.
   *
   * This constructor implicitly calls the other constructor with `data_read_limit` set to `0` and
   * `output_row_granularity` set to 10'000 rows.
   *
   * @param output_size_limit Limit on total number of bytes to be returned per `read_chunk()` call,
   * or `0` if there is no limit
-   * @param sources Input `datasource` objects to read the dataset from
   * @param options Settings for controlling reading behaviors
   * @param stream CUDA stream used for device memory operations and kernel launches
   * @param mr Device memory resource to use for device memory allocation
@@ -512,25 +510,26 @@ class chunked_orc_reader {
                      orc_reader_options const& options,
                      rmm::cuda_stream_view stream = cudf::get_default_stream(),
                      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
  /**
   * @brief Destructor, destroying the internal reader instance.
   */
  ~chunked_orc_reader();

  /**
-   * @brief Check if there is any data in the given file has not yet read.
+   * @brief Check if there is any data in the given data sources that has not yet been read.
   *
   * @return A boolean value indicating if there is any data left to read
   */
  [[nodiscard]] bool has_next() const;

  /**
-   * @brief Read a chunk of rows in the given ORC file.
+   * @brief Read a chunk of rows in the given data sources.
   *
   * The sequence of returned tables, if concatenated by their order, guarantees to form a complete
-   * dataset as reading the entire given file at once.
+   * dataset, as if reading the entire given data sources at once.
   *
-   * An empty table will be returned if the given file is empty, or all the data in the file has
+   * An empty table will be returned if the given sources are empty, or all the data has
   * been read and returned by the previous calls.
* * @return An output `cudf::table` along with its metadata From 223f078e1507c1472ce64ec8ecb2057eb47d6d78 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 7 Mar 2024 22:41:28 -0800 Subject: [PATCH 188/321] Rename struct Signed-off-by: Nghia Truong --- cpp/src/io/orc/aggregate_orc_metadata.cpp | 10 +++++----- cpp/src/io/orc/aggregate_orc_metadata.hpp | 2 +- cpp/src/io/orc/orc.hpp | 4 ++-- cpp/src/io/orc/reader_impl_chunking.hpp | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 15049b8b732..4f89142bdf9 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -152,7 +152,7 @@ aggregate_orc_metadata::aggregate_orc_metadata( } } -std::tuple> +std::tuple> aggregate_orc_metadata::select_stripes( std::vector> const& user_specified_stripes, int64_t skip_rows, @@ -169,7 +169,7 @@ aggregate_orc_metadata::select_stripes( struct stripe_source_mapping { int source_idx; - std::vector stripe_info; + std::vector stripe_info; }; std::vector selected_stripes_mapping; @@ -181,7 +181,7 @@ aggregate_orc_metadata::select_stripes( // Each vector entry represents a source file; each nested vector represents the // user_defined_stripes to get from that source file for (size_t src_file_idx = 0; src_file_idx < user_specified_stripes.size(); ++src_file_idx) { - std::vector stripe_infos; + std::vector stripe_infos; // Coalesce stripe info at the source file later since that makes downstream processing much // easier in impl::read @@ -211,7 +211,7 @@ aggregate_orc_metadata::select_stripes( for (size_t src_file_idx = 0; src_file_idx < per_file_metadata.size() && count < rows_to_skip + rows_to_read; ++src_file_idx) { - std::vector stripe_infos; + std::vector stripe_infos; for (size_t stripe_idx = 0; stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size() && count < rows_to_skip + rows_to_read; @@ -239,7 +239,7 @@ aggregate_orc_metadata::select_stripes( rows_to_skip -= stripe_skip_rows; } - std::vector output; + std::vector output; // Read each stripe's stripefooter metadata for (auto& mapping : selected_stripes_mapping) { diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp index 613c08fb745..65d1f0a7ad4 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.hpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp @@ -113,7 +113,7 @@ class aggregate_orc_metadata { * * Stripes are potentially selected from multiple files. 
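 *
 * @return A tuple of the number of rows to skip, the number of rows to read, and the stripes
 * selected for reading (inferred from the signature below; stripe footers are read as part
 * of the selection)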
*/ - [[nodiscard]] std::tuple> select_stripes( + [[nodiscard]] std::tuple> select_stripes( std::vector> const& user_specified_stripes, int64_t skip_rows, std::optional const& num_rows, diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 4a35aaf5107..fd55cbb6846 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -603,12 +603,12 @@ struct column_validity_info { */ class metadata { public: - struct OrcStripeInfo { + struct orc_stripe_info { StripeInformation const* stripe_info; StripeFooter const* stripe_footer; int source_idx; }; - std::vector stripe_info; + std::vector stripe_info; public: explicit metadata(datasource* const src, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 0aef5285ecf..95dd2fc13a2 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -124,7 +124,7 @@ struct range { struct file_intermediate_data { int64_t rows_to_skip; int64_t rows_to_read; - std::vector selected_stripes; + std::vector selected_stripes; // Return true if no rows or stripes to read. bool has_no_data() const { return rows_to_read == 0 || selected_stripes.empty(); } From 112131fcf60c34885446b750ac2cab6e34aa885c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 7 Mar 2024 22:42:48 -0800 Subject: [PATCH 189/321] Change error message Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader.cu b/cpp/src/io/orc/reader.cu index 5ffff3d7d40..af6a3a79817 100644 --- a/cpp/src/io/orc/reader.cu +++ b/cpp/src/io/orc/reader.cu @@ -57,7 +57,8 @@ chunked_reader::chunked_reader(std::size_t output_size_limit, rmm::mr::device_memory_resource* mr) : reader() { - CUDF_EXPECTS(output_row_granularity > 0, "Invalid value of `output_row_granularity`."); + CUDF_EXPECTS(output_row_granularity > 0, + "The value of `output_row_granularity` must be positive."); _impl = std::make_unique(output_size_limit, data_read_limit, output_row_granularity, From be544f5bb89435b690b38b0262bf1aca381a493b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Mar 2024 09:32:18 -0800 Subject: [PATCH 190/321] Reverse changes in parquet code Signed-off-by: Nghia Truong --- cpp/src/io/parquet/reader_impl.cpp | 16 ++++++++-------- cpp/src/io/parquet/reader_impl.hpp | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index c930aa7f969..89562514564 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -416,8 +416,8 @@ void reader::impl::populate_metadata(table_metadata& out_metadata) table_with_metadata reader::impl::read_chunk_internal( bool uses_custom_row_bounds, std::optional> filter) { - // If `_out_metadata` has been constructed, just copy it over. - auto out_metadata = _out_metadata ? table_metadata{*_out_metadata} : table_metadata{}; + // If `_output_metadata` has been constructed, just copy it over. + auto out_metadata = _output_metadata ? table_metadata{*_output_metadata} : table_metadata{}; out_metadata.schema_info.resize(_output_buffers.size()); // output cudf columns as determined by the top level schema @@ -448,8 +448,8 @@ table_with_metadata reader::impl::read_chunk_internal( metadata = std::make_optional(); metadata->set_convert_binary_to_strings(false); } - // Only construct `out_metadata` if `_out_metadata` has not been cached. 
- if (!_out_metadata) { + // Only construct `out_metadata` if `_output_metadata` has not been cached. + if (!_output_metadata) { column_name_info& col_name = out_metadata.schema_info[i]; out_columns.emplace_back(make_column(_output_buffers[i], &col_name, metadata, _stream)); } else { @@ -468,7 +468,7 @@ table_with_metadata reader::impl::finalize_output( { // Create empty columns as needed (this can happen if we've ended up with no actual data to read) for (size_t i = out_columns.size(); i < _output_buffers.size(); ++i) { - if (!_out_metadata) { + if (!_output_metadata) { column_name_info& col_name = out_metadata.schema_info[i]; out_columns.emplace_back(io::detail::empty_like(_output_buffers[i], &col_name, _stream, _mr)); } else { @@ -476,10 +476,10 @@ table_with_metadata reader::impl::finalize_output( } } - if (!_out_metadata) { + if (!_output_metadata) { populate_metadata(out_metadata); - // Finally, save the output table metadata into `_out_metadata` for reuse next time. - _out_metadata = std::make_unique(out_metadata); + // Finally, save the output table metadata into `_output_metadata` for reuse next time. + _output_metadata = std::make_unique(out_metadata); } // advance output chunk/subpass/pass info diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index e6d63166029..185419a5b46 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -364,7 +364,7 @@ class reader::impl { std::vector _output_column_schemas; // _output_buffers associated metadata - std::unique_ptr _out_metadata; + std::unique_ptr _output_metadata; bool _strings_to_categorical = false; From ead3124ff54c3ac1c91893204c9bd06c05f1670b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Mar 2024 10:39:58 -0800 Subject: [PATCH 191/321] Fix option access Signed-off-by: Nghia Truong --- cpp/src/io/parquet/reader_impl_helpers.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index ddf0f55b6af..6c3cba8059c 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -618,7 +618,9 @@ aggregate_reader_metadata::select_row_groups( auto [rows_to_skip, rows_to_read] = [&]() { if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options( - skip_rows_opt, std::optional{num_rows_opt.value()}, get_num_rows()); + skip_rows_opt, + num_rows_opt.has_value() ? 
std::optional{num_rows_opt.value()} : std::nullopt, + get_num_rows()); return std::pair{static_cast(from_opts.first), static_cast(from_opts.second)}; }(); From 74d14d11ff48939583d13351c513d5fd49c596be Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Mar 2024 12:16:25 -0800 Subject: [PATCH 192/321] Remove outdated test Signed-off-by: Nghia Truong --- cpp/tests/io/row_selection_test.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/cpp/tests/io/row_selection_test.cpp b/cpp/tests/io/row_selection_test.cpp index 0c259c81a23..ebadd870091 100644 --- a/cpp/tests/io/row_selection_test.cpp +++ b/cpp/tests/io/row_selection_test.cpp @@ -122,17 +122,4 @@ TEST_F(FromOptsTest, LimitOptionsToFileRows) } } -TEST_F(FromOptsTest, OverFlowDetection) -{ - auto const too_large_for_32bit = std::numeric_limits::max(); - - // Too many rows to read until the end of the file - EXPECT_THROW(skip_rows_num_rows_from_options(0, std::nullopt, too_large_for_32bit), - std::overflow_error); - - // Should work fine with num_rows - EXPECT_NO_THROW( - skip_rows_num_rows_from_options(1000, too_large_for_32bit - 100, too_large_for_32bit)); -} - CUDF_TEST_PROGRAM_MAIN() From 07103ad2a286bd47028a79168a52445e115502d9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Mar 2024 12:44:07 -0800 Subject: [PATCH 193/321] Wrap the debug print lines in `#ifdef/#endif` Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 69 +++++++------- cpp/src/io/orc/reader_impl.hpp | 2 + cpp/src/io/orc/reader_impl_chunking.cu | 121 +++++++----------------- cpp/src/io/orc/reader_impl_chunking.hpp | 3 + cpp/src/io/orc/reader_impl_decode.cu | 85 +++++++++++++---- cpp/tests/io/orc_chunked_reader_test.cu | 5 - 6 files changed, 139 insertions(+), 146 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 51d14b739e3..1061a7ec64f 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -14,12 +14,9 @@ * limitations under the License. */ -// #define PRINT_DEBUG - // TODO: remove #include -#include // // // @@ -72,46 +69,55 @@ void reader::impl::prepare_data(int64_t skip_rows, // There are no columns in the table. if (_selected_columns.num_levels() == 0) { return; } +#ifdef LOCAL_TEST std::cout << "call global, skip = " << skip_rows << std::endl; +#endif global_preprocess(skip_rows, num_rows_opt, stripes, mode); if (!_chunk_read_data.more_table_chunk_to_output()) { if (!_chunk_read_data.more_stripe_to_decode() && _chunk_read_data.more_stripe_to_load()) { +#ifdef LOCAL_TEST printf("load more data\n\n"); +#endif + load_data(); } if (_chunk_read_data.more_stripe_to_decode()) { +#ifdef LOCAL_TEST printf("decode more data\n\n"); +#endif + decompress_and_decode(); } } +#ifdef LOCAL_TEST printf("done load and decode data\n\n"); - - // decompress_and_decode(); - // while (_chunk_read_data.more_stripe_to_decode()) { - // decompress_and_decode(); - // _file_itm_data.out_buffers.push_back(std::move(_out_buffers)); - // } +#endif } table_with_metadata reader::impl::make_output_chunk() { +#ifdef LOCAL_TEST { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); std::cout << "start to make out, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif // There is no columns in the table. if (_selected_columns.num_levels() == 0) { return {std::make_unique
<table>(), table_metadata{}}; }

   // If no rows or stripes to read, return empty columns
   if (!_chunk_read_data.more_table_chunk_to_output()) {
+#ifdef LOCAL_TEST
     printf("has no next\n");
+#endif
+
     std::vector<std::unique_ptr<column>> out_columns;
     auto out_metadata = get_meta_with_user_data();
     std::transform(_selected_columns.levels[0].begin(),
@@ -130,20 +136,23 @@ table_with_metadata reader::impl::make_output_chunk()
     return {std::make_unique<table>
(std::move(out_columns)), std::move(out_metadata)}; } -#if 1 auto out_table = [&] { if (_chunk_read_data.output_table_chunks.size() == 1) { _chunk_read_data.curr_output_table_chunk++; +#ifdef LOCAL_TEST printf("one chunk, no more table---------------------------------\n"); +#endif return std::move(_chunk_read_data.decoded_table); } +#ifdef LOCAL_TEST { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); std::cout << "prepare to make out, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif auto const out_chunk = _chunk_read_data.output_table_chunks[_chunk_read_data.curr_output_table_chunk++]; @@ -152,12 +161,15 @@ table_with_metadata reader::impl::make_output_chunk() {static_cast(out_chunk.start_idx), static_cast(out_chunk.start_idx + out_chunk.count)}, _stream)[0]; + +#ifdef LOCAL_TEST { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); std::cout << "done make out, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif auto output = std::make_unique
(out_tview, _stream, _mr); @@ -169,8 +181,7 @@ table_with_metadata reader::impl::make_output_chunk() return output; }(); -#endif - +#ifdef LOCAL_TEST if (!_chunk_read_data.has_next()) { static int count{0}; count++; @@ -184,6 +195,7 @@ table_with_metadata reader::impl::make_output_chunk() std::cout << "done, partial, peak_memory_usage: " << peak_mem << " , MB = " << (peak_mem * 1.0) / (1024.0 * 1024.0) << std::endl; } +#endif return {std::move(out_table), _out_metadata}; } @@ -267,13 +279,6 @@ reader::impl::impl(std::size_t output_size_limit, data_read_limit, output_row_granularity > 0 ? output_row_granularity : DEFAULT_OUTPUT_ROW_GRANULARITY} { - printf("construct reader , limit = %d, %d, gradunarity %d \n", - - (int)output_size_limit, - (int)data_read_limit, - (int)output_row_granularity - - ); } table_with_metadata reader::impl::read(int64_t skip_rows, @@ -286,16 +291,23 @@ table_with_metadata reader::impl::read(int64_t skip_rows, bool reader::impl::has_next() { +#ifdef LOCAL_TEST printf("==================query has next \n"); +#endif + prepare_data( _config.skip_rows, _config.num_read_rows, _config.selected_stripes, read_mode::CHUNKED_READ); +#ifdef LOCAL_TEST printf("has next: %d\n", (int)_chunk_read_data.has_next()); +#endif + return _chunk_read_data.has_next(); } table_with_metadata reader::impl::read_chunk() { +#ifdef LOCAL_TEST printf("==================call read chunk\n"); { _stream.synchronize(); @@ -303,34 +315,19 @@ table_with_metadata reader::impl::read_chunk() std::cout << "\n\n\n------------start read chunk, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } - - { - static int count{0}; - ++count; - -#if 0 - if (count == 3) { - _file_itm_data.lvl_stripe_data.clear(); - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << "clear all, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } - exit(0); - } #endif - } prepare_data( _config.skip_rows, _config.num_read_rows, _config.selected_stripes, read_mode::CHUNKED_READ); +#ifdef LOCAL_TEST { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); std::cout << "done prepare data, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif return make_output_chunk(); } diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 853055f50ed..84033ca0778 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -38,7 +38,9 @@ class memory_stats_logger { public: explicit memory_stats_logger(rmm::mr::device_memory_resource* mr) : existing_mr(mr) { +#ifdef LOCAL_TEST printf("exist mr: %p\n", mr); +#endif statistics_mr = std::make_unique>( diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 5157425ea44..c4e094f47dd 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -14,8 +14,6 @@ * limitations under the License. */ -// #define PRINT_DEBUG - #include "io/comp/gpuinflate.hpp" #include "io/comp/nvcomp_adapter.hpp" #include "io/orc/reader_impl.hpp" @@ -108,8 +106,12 @@ std::size_t gather_stream_info_and_column_desc( // Ignore reading this stream from source. // cudf::logger().warn("Unexpected stream in the input ORC source. The stream will be // ignored."); + +#ifdef LOCAL_TEST printf("Unexpected stream in the input ORC source. 
The stream will be ignored\n"); fflush(stdout); +#endif + src_offset += stream.length; continue; } @@ -271,65 +273,6 @@ template std::vector find_splits( host_span sizes, int64_t total_count, size_t size_limit); #endif -namespace { - -#ifdef PRINT_DEBUG -/** - * @brief Verify the splits, checking if they are correct. - * - * We need to verify that: - * 1. All chunk must have count > 0 - * 2. Chunks are continuous. - * 3. sum(all sizes in a chunk) < size_limit - * 4. sum(all counts in all chunks) == total_count. - */ -void verify_splits(host_span splits, - host_span sizes, - size_type total_count, - size_t size_limit) -{ - chunk last_split{0, 0}; - int64_t count{0}; - size_t cur_cumulative_size{0}; - for (auto const& split : splits) { - CUDF_EXPECTS(split.count > 0, "Invalid split count."); - CUDF_EXPECTS(last_split.start_idx + last_split.count == split.start_idx, - "Invalid split start_idx."); - count += split.count; - last_split = split; - - if (split.count > 1) { - // printf("split: %ld - %ld, size: %zu, limit: %zu\n", - // split.start_idx, - // split.count, - // sizes[split.start_idx + split.count - 1].size_bytes - cur_cumulative_size, - // size_limit); - // fflush(stdout); - CUDF_EXPECTS( - sizes[split.start_idx + split.count - 1].size_bytes - cur_cumulative_size <= size_limit, - "Chunk total size exceeds limit."); - if (split.start_idx + split.count < total_count) { - // printf("wrong split: %ld - %ld, size: %zu, limit: %zu\n", - // split.start_idx, - // split.count + 1, - // sizes[split.start_idx + split.count].size_bytes - cur_cumulative_size, - // size_limit); - - CUDF_EXPECTS( - sizes[split.start_idx + split.count].size_bytes - cur_cumulative_size > size_limit, - "Invalid split."); - } - } - cur_cumulative_size = sizes[split.start_idx + split.count - 1].size_bytes; - } - CUDF_EXPECTS(last_split.start_idx + last_split.count == sizes.back().count, - "Invalid split start_idx."); - CUDF_EXPECTS(count == total_count, "Invalid total count."); -} -#endif - -} // namespace - /** * @brief Find range of the data span by a given chunk of chunks. * @@ -375,10 +318,12 @@ void reader::impl::global_preprocess(int64_t skip_rows, "Number or rows to read exceeds the column size limit in READ_ALL mode.", std::overflow_error); +#ifdef LOCAL_TEST printf("input skip rows: %ld, num rows: %ld\n", skip_rows, num_rows_opt.value_or(-1l)); printf("actual skip rows: %ld, num rows: %ld\n", _file_itm_data.rows_to_skip, _file_itm_data.rows_to_read); +#endif // auto const rows_to_skip = _file_itm_data.rows_to_skip; // auto const rows_to_read = _file_itm_data.rows_to_read; @@ -400,7 +345,9 @@ void reader::impl::global_preprocess(int64_t skip_rows, // Get the total number of stripes across all input files. auto const num_stripes = selected_stripes.size(); +#ifdef LOCAL_TEST printf("num load stripe: %d\n", (int)num_stripes); +#endif stripe_data_read_chunks.resize(num_stripes); lvl_stripe_stream_chunks.resize(_selected_columns.num_levels()); @@ -509,11 +456,15 @@ void reader::impl::global_preprocess(int64_t skip_rows, // Load all chunks if there is no read limit. 
if (_chunk_read_data.data_read_limit == 0) { +#ifdef LOCAL_TEST printf("0 limit: output load stripe chunk = 0, %d\n", (int)num_stripes); +#endif + _chunk_read_data.load_stripe_chunks = {chunk{0, static_cast(num_stripes)}}; return; } +#ifdef LOCAL_TEST printf("total stripe sizes:\n"); int count{0}; for (auto& size : total_stripe_sizes) { @@ -521,6 +472,7 @@ void reader::impl::global_preprocess(int64_t skip_rows, printf("size: %ld, %zu\n", size.count, size.size_bytes); if (count > 5) break; } +#endif // Compute the prefix sum of stripe data sizes. total_stripe_sizes.host_to_device_async(_stream); @@ -532,6 +484,7 @@ void reader::impl::global_preprocess(int64_t skip_rows, total_stripe_sizes.device_to_host_sync(_stream); +#ifdef LOCAL_TEST count = 0; printf("prefix sum total stripe sizes:\n"); for (auto& size : total_stripe_sizes) { @@ -539,6 +492,7 @@ void reader::impl::global_preprocess(int64_t skip_rows, printf("size: %ld, %zu\n", size.count, size.size_bytes); if (count > 5) break; } +#endif // If `data_read_limit` is too small, make sure not to pass 0 byte limit to compute splits. auto const load_limit = [&] { @@ -549,24 +503,13 @@ void reader::impl::global_preprocess(int64_t skip_rows, _chunk_read_data.load_stripe_chunks = find_splits(total_stripe_sizes, num_stripes, load_limit); -#ifndef PRINT_DEBUG +#ifdef LOCAL_TEST auto& splits = _chunk_read_data.load_stripe_chunks; printf("------------\nSplits (/total num stripe = %d): \n", (int)num_stripes); for (size_t idx = 0; idx < splits.size(); idx++) { printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count); } fflush(stdout); - - // std::cout << " total rows: " << _file_itm_data.rows_to_read << std::endl; - // print_cumulative_row_info(stripe_size_bytes, " ", _chunk_read_info.chunks); - - // We need to verify that: - // 1. All chunk must have count > 0 - // 2. Chunks are continuous. - // 3. sum(sizes of stripes in a chunk) < size_limit if chunk has more than 1 stripe - // 4. sum(number of stripes in all chunks) == total_num_stripes. - // TODO: enable only in debug. -// verify_splits(splits, total_stripe_sizes, num_stripes, _chunk_read_data.data_read_limit); #endif } @@ -587,7 +530,9 @@ void reader::impl::load_data() auto const stripe_start = stripe_chunk.start_idx; auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; +#ifdef LOCAL_TEST printf("\n\nloading data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); +#endif // Prepare the buffer to read raw data onto. // TODO: clear all old buffer. @@ -687,7 +632,7 @@ void reader::impl::load_data() info.length)); stream_compinfo_map[stream_id_info{ info.id.stripe_idx, info.id.level, info.id.orc_col_idx, info.id.kind}] = &compinfo.back(); -#ifdef PRINT_DEBUG +#ifdef LOCAL_TEST printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n", (int)info.id.stripe_idx, (int)info.id.level, @@ -716,7 +661,8 @@ void reader::impl::load_data() stream_compinfo->max_uncompressed_size}; stripe_decomp_sizes[stream_id.stripe_idx - stripe_chunk.start_idx].size_bytes += stream_compinfo->max_uncompressed_size; -#ifdef PRINT_DEBUG + +#ifdef LOCAL_TEST printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n", (int)stream_id.stripe_idx, (int)stream_id.level, @@ -733,8 +679,10 @@ void reader::impl::load_data() stream_compinfo_map.clear(); } else { +#ifdef LOCAL_TEST printf("no compression \n"); fflush(stdout); +#endif // Set decompression size equal to the input size. 
for (auto stream_idx = stream_begin; stream_idx < stream_end; ++stream_idx) { @@ -762,11 +710,15 @@ void reader::impl::load_data() if (_chunk_read_data.data_read_limit == 0 && // TODO: rows_to_read is changed every decode, should we change this? _file_itm_data.rows_to_read < static_cast(std::numeric_limits::max())) { +#ifdef LOCAL_TEST printf("0 limit: output decode stripe chunk unchanged\n"); +#endif + _chunk_read_data.decode_stripe_chunks = {stripe_chunk}; return; } +#ifdef LOCAL_TEST // TODO: remove if (_chunk_read_data.data_read_limit == 0) { printf("0 limit but size overflow\n"); } @@ -777,6 +729,7 @@ void reader::impl::load_data() if (count++ > 5) break; } } +#endif // Compute the prefix sum of stripe data sizes. stripe_decomp_sizes.host_to_device_async(_stream); @@ -788,6 +741,7 @@ void reader::impl::load_data() stripe_decomp_sizes.device_to_host_sync(_stream); +#ifdef LOCAL_TEST { int count{0}; for (auto& size : stripe_decomp_sizes) { @@ -796,6 +750,7 @@ void reader::impl::load_data() if (count++ > 5) break; } } +#endif auto const decode_limit = [&] { // In this case, we have no read limit but have to split due to having large input in which @@ -814,33 +769,23 @@ void reader::impl::load_data() chunk.start_idx += stripe_chunk.start_idx; } -#ifndef PRINT_DEBUG +#ifdef LOCAL_TEST auto& splits = _chunk_read_data.decode_stripe_chunks; printf("------------\nSplits decode_stripe_chunks (/%d): \n", (int)stripe_chunk.count); for (size_t idx = 0; idx < splits.size(); idx++) { printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count); } fflush(stdout); - - // std::cout << " total rows: " << _file_itm_data.rows_to_read << std::endl; - // print_cumulative_row_info(stripe_size_bytes, " ", _chunk_read_info.chunks); - - // We need to verify that: - // 1. All chunk must have count > 0 - // 2. Chunks are continuous. - // 3. sum(sizes of stripes in a chunk) < size_limit if chunk has more than 1 stripe - // 4. sum(number of stripes in all chunks) == total_num_stripes. - // TODO: enable only in debug. 
-// verify_splits(splits, stripe_decompression_sizes, stripe_chunk.count,
-// _file_itm_data.data_read_limit);
 #endif
 
   // lvl_stripe_data.clear();
   // _file_itm_data.compinfo_ready = true;
 
+#ifdef LOCAL_TEST
   auto peak_mem = mem_stats_logger.peak_memory_usage();
   std::cout << "load, peak_memory_usage: " << peak_mem << "("
             << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl;
+#endif
 }
 
 }  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index 95dd2fc13a2..7f2e0b15b8a 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -240,10 +240,13 @@ struct chunk_read_data {
   // Only has more chunk to output if:
   bool has_next() const
   {
+#ifdef LOCAL_TEST
     printf("compute has_next: %d, %d, %d\n",
            (int)more_stripe_to_load(),
            (int)more_stripe_to_decode(),
            (int)more_table_chunk_to_output());
+#endif
+
     return more_stripe_to_load() || more_stripe_to_decode() || more_table_chunk_to_output();
   }
 };
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index cab80235ea3..255fe8c0b0c 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -116,15 +116,15 @@ rmm::device_buffer decompress_stripe_data(
       continue;
     }
 
-#ifdef PRINT_DEBUG
-    printf("collect stream again [%d, %d, %d, %d]: dst = %lu, length = %lu\n",
-           (int)info.id.stripe_idx,
-           (int)info.id.level,
-           (int)info.id.orc_cold_idx,
-           (int)info.id.kind,
-           info.dst_pos,
-           info.length);
-    fflush(stdout);
+#ifdef LOCAL_TEST
+//    printf("collect stream again [%d, %d, %d, %d]: dst = %lu, length = %lu\n",
+//           (int)info.id.stripe_idx,
+//           (int)info.id.level,
+//           (int)info.id.orc_col_idx,
+//           (int)info.id.kind,
+//           info.dst_pos,
+//           info.length);
+//    fflush(stdout);
 #endif
 
     compinfo.push_back(gpu::CompressedStreamInfo(
@@ -485,7 +485,10 @@ void decode_stream_data(std::size_t num_dicts,
 {
   auto const num_stripes = chunks.size().first;
   auto const num_columns = chunks.size().second;
+
+#ifdef LOCAL_TEST
   printf("decode %d stripes \n", (int)num_stripes);
+#endif
 
   thrust::counting_iterator col_idx_it(0);
   thrust::counting_iterator stripe_idx_it(0);
@@ -507,7 +510,10 @@
     chunks.base_device_ptr(), global_dict.data(), num_columns, num_stripes, skip_rows, stream);
 
   if (level > 0) {
+#ifdef LOCAL_TEST
     printf("update_null_mask\n");
+#endif
+
     // Update nullmasks for children if parent was a struct and had null mask
     update_null_mask(chunks, out_buffers, stream, mr);
   }
@@ -643,7 +649,6 @@ void aggregate_child_meta(std::size_t stripe_start,
 
   int index = 0;  // number of child column processed
 
-  printf("\n\n");
   // For each parent column, update its child column meta for each stripe.
   std::for_each(nested_cols.begin(), nested_cols.end(), [&](auto const p_col) {
     // printf("p_col.id: %d\n", (int)p_col.id);
@@ -749,7 +754,9 @@ std::vector find_table_splits(table_view const& input,
                                      std::size_t size_limit,
                                      rmm::cuda_stream_view stream)
 {
+#ifdef LOCAL_TEST
   printf("find table split, seg length = %d, limit = %d \n", segment_length, (int)size_limit);
+#endif
 
   // If segment_length is zero: we don't have any limit on granularity.
   // As such, set segment length to the number of rows.
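// For reference, a minimal host-side sketch of the greedy splitting strategy that `find_splits`
// and `find_table_splits` in these patches rely on. This is an illustration only, not the
// patched code: `find_splits_sketch` is a hypothetical name, `chunk` and `cumulative_size` are
// the structs defined in reader_impl_chunking.hpp, and `sizes` is assumed to already hold the
// inclusive prefix sums produced by the `inclusive_scan` calls above. Each output chunk greedily
// absorbs elements until one more would push its byte total past `size_limit`, and always keeps
// at least one element so that a single oversized element cannot stall the loop.
std::vector<chunk> find_splits_sketch(cudf::host_span<cumulative_size const> sizes,
                                      std::size_t size_limit)
{
  std::vector<chunk> splits;
  int64_t begin          = 0;  // index of the first element of the chunk being built
  std::size_t base_bytes = 0;  // cumulative bytes already consumed by previous chunks
  while (begin < static_cast<int64_t>(sizes.size())) {
    auto end = begin;
    // Grow the chunk while the next element's prefix sum still fits under the limit.
    while (end + 1 < static_cast<int64_t>(sizes.size()) &&
           sizes[end + 1].size_bytes - base_bytes <= size_limit) {
      ++end;
    }
    splits.push_back(chunk{begin, end - begin + 1});
    base_bytes = sizes[end].size_bytes;
    begin      = end + 1;
  }
  return splits;
}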
@@ -783,6 +790,7 @@ std::vector find_table_splits(table_view const& input, return cumulative_size{current_length, static_cast(size)}; }); +#ifdef LOCAL_TEST { int count{0}; // TODO: remove: @@ -794,6 +802,7 @@ std::vector find_table_splits(table_view const& input, ++count; } } +#endif // TODO: exec_policy_nosync thrust::inclusive_scan(rmm::exec_policy(stream), @@ -822,7 +831,9 @@ void reader::impl::decompress_and_decode() auto const load_stripe_start = _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk - 1].start_idx; +#ifdef LOCAL_TEST printf("\ndecoding data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); +#endif auto const rows_to_skip = _file_itm_data.rows_to_skip; // auto const rows_to_read = _file_itm_data.rows_to_read; @@ -853,7 +864,9 @@ void reader::impl::decompress_and_decode() _file_itm_data.rows_to_skip = 0; _file_itm_data.rows_to_read -= rows_to_read; +#ifdef LOCAL_TEST printf("decode, skip = %ld, read = %ld\n", rows_to_skip, rows_to_read); +#endif CUDF_EXPECTS(rows_to_read <= static_cast(std::numeric_limits::max()), "Number or rows to decode exceeds the column size limit.", @@ -956,6 +969,7 @@ void reader::impl::decompress_and_decode() auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_chunks; for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { +#ifdef LOCAL_TEST printf("processing level = %d\n", (int)level); { @@ -964,6 +978,7 @@ void reader::impl::decompress_and_decode() std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif auto const& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; auto const [stream_begin, stream_end] = get_range(stripe_stream_chunks, stripe_chunk); @@ -1006,12 +1021,14 @@ void reader::impl::decompress_and_decode() chunks = cudf::detail::hostdevice_2dvector(num_stripes, num_columns, _stream); memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); +#ifdef LOCAL_TEST { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif const bool use_index = _config.use_index && @@ -1025,7 +1042,9 @@ void reader::impl::decompress_and_decode() // TODO: Fix logic to handle unaligned rows (rows_to_skip == 0); +#ifdef LOCAL_TEST printf(" use_index: %d\n", (int)use_index); +#endif // Logically view streams as columns auto const& stream_info = _file_itm_data.lvl_stream_info[level]; @@ -1051,9 +1070,10 @@ void reader::impl::decompress_and_decode() std::size_t stream_idx = 0; for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { - // for (auto const& stripe : selected_stripes) { - +#ifdef LOCAL_TEST printf("processing stripe_idx = %d\n", (int)stripe_idx); +#endif + auto const& stripe = selected_stripes[stripe_idx]; auto const stripe_info = stripe.stripe_info; auto const stripe_footer = stripe.stripe_footer; @@ -1076,7 +1096,9 @@ void reader::impl::decompress_and_decode() &chunks); auto const is_stripe_data_empty = total_data_size == 0; +#ifdef LOCAL_TEST printf("is_stripe_data_empty: %d\n", (int)is_stripe_data_empty); +#endif CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); @@ -1089,7 +1111,9 @@ void reader::impl::decompress_and_decode() // fflush(stdout); auto const num_rows_per_stripe = 
static_cast(stripe_info->numberOfRows); +#ifdef LOCAL_TEST printf(" num_rows_per_stripe : %d\n", (int)num_rows_per_stripe); +#endif auto const rowgroup_id = num_rowgroups; auto stripe_num_rowgroups = 0; @@ -1115,10 +1139,11 @@ void reader::impl::decompress_and_decode() ? static_cast(stripe_info->numberOfRows) : col_meta .num_child_rows_per_stripe[(stripe_idx - stripe_start) * num_columns + col_idx]; - printf("col idx: %d, start_row: %d, num rows: %d\n", - (int)col_idx, - (int)chunk.start_row, - (int)chunk.num_rows); + + // printf("col idx: %d, start_row: %d, num rows: %d\n", + // (int)col_idx, + // (int)chunk.start_row, + // (int)chunk.num_rows); chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx]; chunk.parent_validity_info = @@ -1131,7 +1156,7 @@ void reader::impl::decompress_and_decode() chunk.type_kind = _metadata.per_file_metadata[stripe.source_idx].ff.types[columns_level[col_idx].id].kind; - printf("type: %d\n", (int)chunk.type_kind); + // printf("type: %d\n", (int)chunk.type_kind); // num_child_rows for a struct column will be same, for other nested types it will be // calculated. @@ -1211,12 +1236,14 @@ void reader::impl::decompress_and_decode() // fflush(stdout); CUDF_EXPECTS(_chunk_read_data.curr_load_stripe_chunk > 0, "ERRRRR"); +#ifdef LOCAL_TEST { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif auto decomp_data = decompress_stripe_data( _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk - 1], @@ -1240,12 +1267,14 @@ void reader::impl::decompress_and_decode() stripe_data[i + stripe_start - load_stripe_start] = {}; } +#ifdef LOCAL_TEST { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif // printf("line %d\n", __LINE__); // fflush(stdout); @@ -1273,29 +1302,35 @@ void reader::impl::decompress_and_decode() // printf("line %d\n", __LINE__); // fflush(stdout); +#ifdef LOCAL_TEST { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif // TODO: do not clear but reset each one. // and only reset if the new size/type are different. 
_out_buffers[level].clear(); +#ifdef LOCAL_TEST { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif for (std::size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; for (std::size_t j = 0; j < num_stripes; ++j) { if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { +#ifdef LOCAL_TEST printf(" is nullable\n"); +#endif is_nullable = true; break; } @@ -1305,17 +1340,20 @@ void reader::impl::decompress_and_decode() // printf(" create col, num rows: %d\n", (int)n_rows); +#ifdef LOCAL_TEST { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif // For list column, offset column will be always size + 1 if (is_list_type) n_rows++; _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr); +#ifdef LOCAL_TEST { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); @@ -1323,17 +1361,20 @@ void reader::impl::decompress_and_decode() << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif } // printf("line %d\n", __LINE__); // fflush(stdout); +#ifdef LOCAL_TEST { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif decode_stream_data(num_dict_entries, rows_to_skip, @@ -1346,18 +1387,22 @@ void reader::impl::decompress_and_decode() _stream, _mr); +#ifdef LOCAL_TEST { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif // printf("line %d\n", __LINE__); // fflush(stdout); if (nested_cols.size()) { +#ifdef LOCAL_TEST printf("have nested col\n"); +#endif // Extract information to process nested child columns scan_null_counts(chunks, null_count_prefix_sums[level], _stream); @@ -1389,12 +1434,14 @@ void reader::impl::decompress_and_decode() // fflush(stdout); } // end loop level +#ifdef LOCAL_TEST { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif std::vector> out_columns; _out_metadata = get_meta_with_user_data(); @@ -1427,12 +1474,14 @@ void reader::impl::decompress_and_decode() } } +#ifdef LOCAL_TEST { _stream.synchronize(); auto peak_mem = mem_stats_logger.peak_memory_usage(); std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif // printf("col: \n"); // cudf::test::print(_chunk_read_data.decoded_table->get_column(0).view()); @@ -1449,6 +1498,7 @@ void reader::impl::decompress_and_decode() _chunk_read_data.output_size_limit, _stream); +#ifdef LOCAL_TEST auto& splits = _chunk_read_data.output_table_chunks; printf("------------\nSplits decoded table (/total num rows = %d): \n", (int)_chunk_read_data.decoded_table->num_rows()); @@ -1463,6 
+1513,7 @@ void reader::impl::decompress_and_decode() std::cout << "decomp and decode, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; } +#endif } } // namespace cudf::io::orc::detail diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index cded7a300de..862324e5aa8 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1036,8 +1036,6 @@ void input_limit_test_read(int test_location, for (size_t idx = 0; idx < test_files.size(); ++idx) { SCOPED_TRACE("Original line of failure: " + std::to_string(test_location) + ", file idx: " + std::to_string(idx)); - // TODO: remove - printf("file_idx %d\n", (int)idx); auto const [result, num_chunks] = chunked_read(test_files[idx], output_limit_bytes, input_limit_bytes); EXPECT_EQ(expected_chunk_counts[idx], num_chunks); @@ -1372,8 +1370,6 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) } } - printf("buffer size: %zu\n", data_buffer.size()); - // Verify metadata. auto const metadata = cudf::io::read_orc_metadata(cudf::io::source_info{data_buffer.data(), data_buffer.size()}); @@ -1420,7 +1416,6 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view()); } -// #define LOCAL_TEST #ifdef LOCAL_TEST // Read with only output limit -- there is no limit on the memory usage. // However, the reader should be able to detect and load only enough stripes each time From 971296f982585d015a501a3d9445f76a2d027f5f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Mar 2024 13:18:36 -0800 Subject: [PATCH 194/321] Update benchmark Signed-off-by: Nghia Truong --- cpp/benchmarks/io/orc/orc_reader_input.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp index 8254bf65fe2..3d83568e128 100644 --- a/cpp/benchmarks/io/orc/orc_reader_input.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp @@ -119,8 +119,9 @@ using d_type_list = nvbench:: using io_list = nvbench::enum_type_list; -using compression_list = - nvbench::enum_type_list; +using compression_list = nvbench::enum_type_list; // NVBENCH_BENCH_TYPES(BM_orc_read_data, // NVBENCH_TYPE_AXES(d_type_list, From 10b7ca7ad359772e2be457f58ef5a58b76e187fb Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Mar 2024 13:44:39 -0800 Subject: [PATCH 195/321] Revert changes in `orc_read_input.cpp` Signed-off-by: Nghia Truong --- cpp/benchmarks/CMakeLists.txt | 3 +- cpp/benchmarks/io/orc/orc_reader_input.cpp | 46 +++++++++++++--------- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index caf3b35b629..ef25278877e 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -255,8 +255,7 @@ ConfigureNVBench( # ################################################################################################## # * orc reader benchmark -------------------------------------------------------------------------- -# TODO: add back the removed file, and add new file -ConfigureNVBench(ORC_READER_NVBENCH io/orc/orc_reader_input.cpp) +ConfigureNVBench(ORC_READER_NVBENCH io/orc/orc_reader_input.cpp io/orc/orc_reader_options.cpp) # ################################################################################################## # * csv reader benchmark 
-------------------------------------------------------------------------- diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp index 3d83568e128..fdb7dbe59b8 100644 --- a/cpp/benchmarks/io/orc/orc_reader_input.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -88,6 +88,9 @@ void BM_orc_read_io_compression( nvbench::type_list, nvbench::enum_type>) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), + static_cast(data_type::FLOAT), + static_cast(data_type::DECIMAL), + static_cast(data_type::TIMESTAMP), static_cast(data_type::STRING), static_cast(data_type::LIST), static_cast(data_type::STRUCT)}); @@ -113,24 +116,29 @@ void BM_orc_read_io_compression( orc_read_common(num_rows_written, source_sink, state); } -using d_type_list = nvbench:: - enum_type_list; - -using io_list = - nvbench::enum_type_list; - -using compression_list = nvbench::enum_type_list; - -// NVBENCH_BENCH_TYPES(BM_orc_read_data, -// NVBENCH_TYPE_AXES(d_type_list, -// nvbench::enum_type_list)) -// .set_name("orc_read_decode") -// .set_type_axes_names({"data_type", "io"}) -// .set_min_samples(4) -// .add_int64_axis("cardinality", {0, 1000}) -// .add_int64_axis("run_length", {1, 32}); +using d_type_list = nvbench::enum_type_list; + +using io_list = nvbench::enum_type_list; + +using compression_list = + nvbench::enum_type_list; + +NVBENCH_BENCH_TYPES(BM_orc_read_data, + NVBENCH_TYPE_AXES(d_type_list, + nvbench::enum_type_list)) + .set_name("orc_read_decode") + .set_type_axes_names({"data_type", "io"}) + .set_min_samples(4) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); NVBENCH_BENCH_TYPES(BM_orc_read_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list)) .set_name("orc_read_io_compression") From 0b8a2b56610ce0df4bcb69a5c33bad49f0b4af85 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Mar 2024 14:08:51 -0800 Subject: [PATCH 196/321] Revert changes in `parquet/reader_impl_helpers.cpp` Signed-off-by: Nghia Truong --- cpp/src/io/parquet/reader_impl_helpers.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 6c3cba8059c..3ecc5beb9d3 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -618,9 +618,7 @@ aggregate_reader_metadata::select_row_groups( auto [rows_to_skip, rows_to_read] = [&]() { if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options( - skip_rows_opt, - num_rows_opt.has_value() ? 
std::optional<int64_t>{num_rows_opt.value()} : std::nullopt,
-    get_num_rows());
+    skip_rows_opt, num_rows_opt, get_num_rows());
   return std::pair{static_cast<int64_t>(from_opts.first), static_cast<size_type>(from_opts.second)};
 }();
 
From 589975120f2607b456dd2b2e8af38c7fe4a116df Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Fri, 8 Mar 2024 15:10:33 -0800
Subject: [PATCH 197/321] Implement chunked read benchmark

Signed-off-by: Nghia Truong
---
 cpp/benchmarks/io/orc/orc_reader_input.cpp | 79 +++++++++++++++++-----
 1 file changed, 63 insertions(+), 16 deletions(-)

diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp
index fdb7dbe59b8..0503ede62ed 100644
--- a/cpp/benchmarks/io/orc/orc_reader_input.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -24,31 +24,70 @@

 #include <nvbench/nvbench.cuh>

+namespace {
+
 // Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
 // run on most GPUs, but large enough to allow highest throughput
 constexpr int64_t data_size = 512 << 20;
 constexpr cudf::size_type num_cols = 64;

+template <typename Timer>
+void read_once(cudf::io::orc_reader_options const& options,
+               cudf::size_type num_rows_to_read,
+               Timer& timer)
+{
+  timer.start();
+  auto const result = cudf::io::read_orc(options);
+  timer.stop();
+
+  CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns");
+  CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows");
+}
+
+template <typename Timer>
+void chunked_read(cudf::io::orc_reader_options const& options,
+                  cudf::size_type num_rows_to_read,
+                  cudf::size_type appox_num_chunks,
+                  Timer& timer)
+{
+  // Create a chunked reader that has internal memory limits set to process around 10 chunks. 
+ auto const output_limit = static_cast(data_size / appox_num_chunks); + auto const input_limit = output_limit * 10; + + auto reader = cudf::io::chunked_orc_reader(output_limit, input_limit, options); + cudf::size_type num_rows{0}; + + timer.start(); + do { + auto chunk = reader.read_chunk(); + num_rows += chunk.tbl->num_rows(); + } while (reader.has_next()); + timer.stop(); + + CUDF_EXPECTS(num_rows == num_rows_to_read, "Unexpected number of rows"); +} + +template void orc_read_common(cudf::size_type num_rows_to_read, cuio_source_sink_pair& source_sink, nvbench::state& state) { - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(source_sink.make_source_info()); + auto const read_opts = + cudf::io::orc_reader_options::builder(source_sink.make_source_info()).build(); + cudf::size_type constexpr approx_num_chunks = 10; auto mem_stats_logger = cudf::memory_stats_logger(); // init stats logger state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec( - nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - try_drop_l3_cache(); + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch&, auto& timer) { + try_drop_l3_cache(); - timer.start(); - auto const result = cudf::io::read_orc(read_opts); - timer.stop(); - - CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns"); - CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); - }); + if constexpr (!is_chunked_read) { + read_once(read_opts, num_rows_to_read, timer); + } else { + chunked_read(read_opts, num_rows_to_read, approx_num_chunks, timer); + } + }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); @@ -57,6 +96,8 @@ void orc_read_common(cudf::size_type num_rows_to_read, state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); } +} // namespace + template void BM_orc_read_data(nvbench::state& state, nvbench::type_list, nvbench::enum_type>) @@ -79,7 +120,7 @@ void BM_orc_read_data(nvbench::state& state, return view.num_rows(); }(); - orc_read_common(num_rows_written, source_sink, state); + orc_read_common(num_rows_written, source_sink, state); } template @@ -113,7 +154,12 @@ void BM_orc_read_io_compression( return view.num_rows(); }(); - orc_read_common(num_rows_written, source_sink, state); + auto const is_chunked_read = static_cast(state.get_int64("chunked_read")); + if (is_chunked_read) { + orc_read_common(num_rows_written, source_sink, state); + } else { + orc_read_common(num_rows_written, source_sink, state); + } } using d_type_list = nvbench::enum_type_list Date: Fri, 8 Mar 2024 16:54:47 -0800 Subject: [PATCH 198/321] Remove redundant parameters, and rewrite docs Signed-off-by: Nghia Truong --- cpp/include/cudf/io/detail/orc.hpp | 3 +- cpp/src/io/functions.cpp | 2 +- cpp/src/io/orc/reader.cu | 5 +- cpp/src/io/orc/reader_impl.cu | 21 +++----- cpp/src/io/orc/reader_impl.hpp | 72 ++++++++++---------------- cpp/src/io/orc/reader_impl_chunking.cu | 8 ++- 6 files changed, 40 insertions(+), 71 deletions(-) diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index 8cc41bd5057..c07dbef11d7 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -72,10 +72,9 @@ class reader { /** * @brief Reads the entire dataset. 
* - * @param options Settings for controlling reading behavior * @return The set of columns along with table metadata */ - table_with_metadata read(orc_reader_options const& options); + table_with_metadata read(); }; /** diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index e8dbf97abd6..6a08e41d161 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -419,7 +419,7 @@ table_with_metadata read_orc(orc_reader_options const& options, auto datasources = make_datasources(options.get_source()); auto reader = std::make_unique(std::move(datasources), options, stream, mr); - return reader->read(options); + return reader->read(); } /** diff --git a/cpp/src/io/orc/reader.cu b/cpp/src/io/orc/reader.cu index af6a3a79817..ea0b43c0f93 100644 --- a/cpp/src/io/orc/reader.cu +++ b/cpp/src/io/orc/reader.cu @@ -31,10 +31,7 @@ reader::reader(std::vector>&& sources, { } -table_with_metadata reader::read(orc_reader_options const& options) -{ - return _impl->read(options.get_skip_rows(), options.get_num_rows(), options.get_stripes()); -} +table_with_metadata reader::read() { return _impl->read(); } chunked_reader::chunked_reader(std::size_t output_size_limit, std::size_t data_read_limit, diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 1061a7ec64f..1ed3e1347c0 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -56,14 +56,11 @@ namespace cudf::io::orc::detail { -void reader::impl::prepare_data(int64_t skip_rows, - std::optional const& num_rows_opt, - std::vector> const& stripes, - read_mode mode) +void reader::impl::prepare_data(read_mode mode) { // Selected columns at different levels of nesting are stored in different elements // of `selected_columns`; thus, size == 1 means no nested columns - CUDF_EXPECTS(skip_rows == 0 or _selected_columns.num_levels() == 1, + CUDF_EXPECTS(_config.skip_rows == 0 or _selected_columns.num_levels() == 1, "skip_rows is not supported by nested columns"); // There are no columns in the table. 
@@ -73,7 +70,7 @@ void reader::impl::prepare_data(int64_t skip_rows, std::cout << "call global, skip = " << skip_rows << std::endl; #endif - global_preprocess(skip_rows, num_rows_opt, stripes, mode); + global_preprocess(mode); if (!_chunk_read_data.more_table_chunk_to_output()) { if (!_chunk_read_data.more_stripe_to_decode() && _chunk_read_data.more_stripe_to_load()) { @@ -281,11 +278,9 @@ reader::impl::impl(std::size_t output_size_limit, { } -table_with_metadata reader::impl::read(int64_t skip_rows, - std::optional const& num_rows_opt, - std::vector> const& stripes) +table_with_metadata reader::impl::read() { - prepare_data(skip_rows, num_rows_opt, stripes, read_mode::READ_ALL); + prepare_data(read_mode::READ_ALL); return make_output_chunk(); } @@ -295,8 +290,7 @@ bool reader::impl::has_next() printf("==================query has next \n"); #endif - prepare_data( - _config.skip_rows, _config.num_read_rows, _config.selected_stripes, read_mode::CHUNKED_READ); + prepare_data(read_mode::CHUNKED_READ); #ifdef LOCAL_TEST printf("has next: %d\n", (int)_chunk_read_data.has_next()); @@ -317,8 +311,7 @@ table_with_metadata reader::impl::read_chunk() } #endif - prepare_data( - _config.skip_rows, _config.num_read_rows, _config.selected_stripes, read_mode::CHUNKED_READ); + prepare_data(read_mode::CHUNKED_READ); #ifdef LOCAL_TEST { diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 84033ca0778..b609c04affd 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -102,16 +102,9 @@ class reader::impl { rmm::mr::device_memory_resource* mr); /** - * @brief Read an entire set or a subset of data and returns a set of columns - * - * @param skip_rows Number of rows to skip from the start - * @param num_rows_opt Optional number of rows to read, or `std::nullopt` to read all rows - * @param stripes Indices of individual stripes to load if non-empty - * @return The set of columns along with metadata + * @copydoc cudf::io::orc::detail::reader::read */ - table_with_metadata read(int64_t skip_rows, - std::optional const& num_rows_opt, - std::vector> const& stripes); + table_with_metadata read(); /** * @copydoc cudf::io::chunked_orc_reader::has_next @@ -124,70 +117,59 @@ class reader::impl { table_with_metadata read_chunk(); private: - // TODO + /** + * @brief The enum indicating whether the data sources are read all at once or chunk by chunk. + */ enum class read_mode { READ_ALL, CHUNKED_READ }; /** * @brief Perform all the necessary data preprocessing before creating an output table. * * This is the proxy to call all other data preprocessing functions, which are prerequisite - * for generating an output table. + * for generating the output. * - * @param skip_rows Number of rows to skip from the start - * @param num_rows_opt Optional number of rows to read, or `std::nullopt` to read all rows - * @param stripes Indices of individual stripes to load if non-empty + * @param mode Value indicating if the data sources are read all at once or chunk by chunk */ - void prepare_data(int64_t skip_rows, - std::optional const& num_rows_opt, - std::vector> const& stripes, - read_mode mode); + void prepare_data(read_mode mode); /** * @brief Perform a global preprocessing step that executes exactly once for the entire duration * of the reader. * - * TODO: rewrite, not use "ensure". + * In this step, the metadata of all stripes in the data sources is parsed, and information about + * data streams of the selected columns in all stripes are generated. 
If the reader has a data
+   * read limit, sizes of these streams are used to split the list of all stripes into multiple
+   * subsets, each of which will be read into memory in the `load_data()` step. These subsets are
+   * computed such that memory usage will be capped around a fixed size limit.
    *
-   * In this step, the metadata of all stripes in the data source is parsed, and information about
-   * data streams for all selected columns in alls tripes are generated. If the reader has a data
-   * read limit, data size of all stripes are used to determine the chunks of consecutive
-   * stripes for reading each time using the `load_data()` step. This is to ensure that loading
-   * these stripes will not exceed a fixed portion the data read limit.
+   * @param mode Value indicating if the data sources are read all at once or chunk by chunk
    */
-  void global_preprocess(int64_t skip_rows,
-                         std::optional<int64_t> const& num_rows_opt,
-                         std::vector<std::vector<size_type>> const& stripes,
-                         read_mode mode);
+  void global_preprocess(read_mode mode);
 
   /**
-   * @brief Load stripes from the input source and store the data in the internal buffers.
+   * @brief Load stripes from the input data sources into memory.
    *
-   * If there is a data read limit, only a chunk of stripes are read at a time such that
-   * their total data size does not exceed a fixed portion of the limit. Then, the data is
-   * probed to determine the uncompressed sizes for these loaded stripes, which are in turn
-   * used to determine a subset of stripes to decompress and decode in the next step
-   * `decompress_and_decode()`.
-   * This is to ensure that loading data together with decompression and decoding will not exceed
-   * the data read limit.
+   * If there is a data read limit, only a subset of stripes is read at a time such that
+   * their total data size does not exceed a fixed size limit. Then, the data is probed to
+   * estimate its uncompressed sizes, which are in turn used to split that stripe subset into
+   * smaller subsets, each of which is decompressed and decoded in the next step
+   * `decompress_and_decode()`. This ensures that loading from the data sources together with
+   * decompression and decoding is capped around the given data read limit.
    */
   void load_data();
 
   /**
-   * @brief Decompress and decode the data in the internal buffers, and store the result into
-   * an internal table.
+   * @brief Decompress and decode stripe data in the internal buffers, and store the result into
+   * an intermediate table.
    *
-   * If there is a data read limit, only a chunk of stripes are decompressed and decoded at a time.
-   * Then, the result is stored in an internal table, and sizes of its rows are computed
-   * to determine slices of rows to return as the output table in the final step
-   * `make_output_chunk`.
+   * This function expects that the other preprocessing steps (`global_preprocess()` and
+   * `load_data()`) have already been done.
    */
   void decompress_and_decode();
 
   /**
    * @brief Create the output table from the intermediate table and return it along with metadata.
    *
-   * This function is called internally and expects all preprocessing steps have already been done.
-   *
    * @return The output table along with columns' metadata
    */
   table_with_metadata make_output_chunk();
@@ -204,7 +186,7 @@ class reader::impl {
 
   memory_stats_logger mem_stats_logger;
 
-  // Reader configs
+  // Reader configs. 
struct { data_type timestamp_type; // override output timestamp resolution bool use_index; // enable or disable attempt to use row index for parsing diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index c4e094f47dd..09adefad6d9 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -298,10 +298,7 @@ std::pair get_range(std::vector const& input_chunks, return {begin, end}; } -void reader::impl::global_preprocess(int64_t skip_rows, - std::optional const& num_rows_opt, - std::vector> const& stripes, - read_mode mode) +void reader::impl::global_preprocess(read_mode mode) { if (_file_itm_data.global_preprocessed) { return; } _file_itm_data.global_preprocessed = true; @@ -309,7 +306,8 @@ void reader::impl::global_preprocess(int64_t skip_rows, // Load stripes's metadata. std::tie( _file_itm_data.rows_to_skip, _file_itm_data.rows_to_read, _file_itm_data.selected_stripes) = - _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream); + _metadata.select_stripes( + _config.selected_stripes, _config.skip_rows, _config.num_read_rows, _stream); if (_file_itm_data.has_no_data()) { return; } CUDF_EXPECTS( From dff02358b9a91b22e9138fedb73535b3b35d9c78 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Mar 2024 17:57:33 -0800 Subject: [PATCH 199/321] Cleanup Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 40 ++++---------------------- cpp/src/io/orc/reader_impl.hpp | 4 ++- cpp/src/io/orc/reader_impl_chunking.cu | 12 +++++--- 3 files changed, 16 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 1ed3e1347c0..51bbd47d690 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -58,16 +58,11 @@ namespace cudf::io::orc::detail { void reader::impl::prepare_data(read_mode mode) { - // Selected columns at different levels of nesting are stored in different elements - // of `selected_columns`; thus, size == 1 means no nested columns - CUDF_EXPECTS(_config.skip_rows == 0 or _selected_columns.num_levels() == 1, - "skip_rows is not supported by nested columns"); - // There are no columns in the table. if (_selected_columns.num_levels() == 0) { return; } #ifdef LOCAL_TEST - std::cout << "call global, skip = " << skip_rows << std::endl; + std::cout << "call global, skip = " << _config.skip_rows << std::endl; #endif global_preprocess(mode); @@ -276,6 +271,10 @@ reader::impl::impl(std::size_t output_size_limit, data_read_limit, output_row_granularity > 0 ? output_row_granularity : DEFAULT_OUTPUT_ROW_GRANULARITY} { + // Selected columns at different levels of nesting are stored in different elements + // of `selected_columns`; thus, size == 1 means no nested columns. 
+ CUDF_EXPECTS(_config.skip_rows == 0 or _selected_columns.num_levels() == 1, + "skip_rows is not supported if having nested columns"); } table_with_metadata reader::impl::read() @@ -286,42 +285,13 @@ table_with_metadata reader::impl::read() bool reader::impl::has_next() { -#ifdef LOCAL_TEST - printf("==================query has next \n"); -#endif - prepare_data(read_mode::CHUNKED_READ); - -#ifdef LOCAL_TEST - printf("has next: %d\n", (int)_chunk_read_data.has_next()); -#endif - return _chunk_read_data.has_next(); } table_with_metadata reader::impl::read_chunk() { -#ifdef LOCAL_TEST - printf("==================call read chunk\n"); - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << "\n\n\n------------start read chunk, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } -#endif - prepare_data(read_mode::CHUNKED_READ); - -#ifdef LOCAL_TEST - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << "done prepare data, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } -#endif - return make_output_chunk(); } diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index b609c04affd..ae518bc2a5f 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -199,13 +199,15 @@ class reader::impl { std::vector> const selected_stripes; } const _config; - // Intermediate data for internal processing. + // Intermediate data for reading. std::unique_ptr const _col_meta; // Track of orc mapping and child details std::vector> const _sources; // Unused but owns data for `_metadata` aggregate_orc_metadata _metadata; column_hierarchy const _selected_columns; // Construct from `_metadata` thus declare after it file_intermediate_data _file_itm_data; chunk_read_data _chunk_read_data; + + // Intermediate data for output. 
std::unique_ptr _meta_with_user_data; table_metadata _out_metadata; std::vector> _out_buffers; diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 09adefad6d9..a719ec73c91 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -317,10 +317,14 @@ void reader::impl::global_preprocess(read_mode mode) std::overflow_error); #ifdef LOCAL_TEST - printf("input skip rows: %ld, num rows: %ld\n", skip_rows, num_rows_opt.value_or(-1l)); - printf("actual skip rows: %ld, num rows: %ld\n", - _file_itm_data.rows_to_skip, - _file_itm_data.rows_to_read); + { + auto const skip_rows = _config.skip_rows; + auto const num_rows_opt = _config.num_read_rows; + printf("input skip rows: %ld, num rows: %ld\n", skip_rows, num_rows_opt.value_or(-1l)); + printf("actual skip rows: %ld, num rows: %ld\n", + _file_itm_data.rows_to_skip, + _file_itm_data.rows_to_read); + } #endif // auto const rows_to_skip = _file_itm_data.rows_to_skip; From 28e631f56284bedb5e978846e2260934314629a5 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Mar 2024 18:22:02 -0800 Subject: [PATCH 200/321] Rename variables Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 21 ++++++++++----------- cpp/src/io/orc/reader_impl_chunking.hpp | 4 ++-- cpp/src/io/orc/reader_impl_decode.cu | 5 ++--- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index a719ec73c91..c00e7085bc3 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -68,7 +68,7 @@ namespace cudf::io::orc::detail { std::size_t gather_stream_info_and_column_desc( - int64_t stripe_index, + int64_t stripe_processing_order, std::size_t level, orc::StripeInformation const* stripeinfo, orc::StripeFooter const* stripefooter, @@ -77,7 +77,7 @@ std::size_t gather_stream_info_and_column_desc( bool use_index, bool apply_struct_map, int64_t* num_dictionary_entries, - std::size_t* stream_idx, + std::size_t* stream_processing_order, std::optional*> const& stream_info, std::optional*> const& chunks) { @@ -104,8 +104,7 @@ std::size_t gather_stream_info_and_column_desc( for (auto const& stream : stripefooter->streams) { if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { // Ignore reading this stream from source. - // cudf::logger().warn("Unexpected stream in the input ORC source. The stream will be - // ignored."); + CUDF_LOG_WARN("Unexpected stream in the input ORC source. The stream will be ignored."); #ifdef LOCAL_TEST printf("Unexpected stream in the input ORC source. The stream will be ignored\n"); @@ -126,14 +125,13 @@ std::size_t gather_stream_info_and_column_desc( auto const schema_type = types[column_id]; if (!schema_type.subtypes.empty() && schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { - // printf("present stream\n"); for (auto const& idx : schema_type.subtypes) { auto const child_idx = (idx < orc2gdf.size()) ? 
orc2gdf[idx] : -1; if (child_idx >= 0) { col = child_idx; if (chunks.has_value()) { - auto& chunk = (*chunks.value())[stripe_index][col]; - chunk.strm_id[gpu::CI_PRESENT] = *stream_idx; + auto& chunk = (*chunks.value())[stripe_processing_order][col]; + chunk.strm_id[gpu::CI_PRESENT] = *stream_processing_order; chunk.strm_len[gpu::CI_PRESENT] = stream.length; } } @@ -144,7 +142,7 @@ std::size_t gather_stream_info_and_column_desc( if (src_offset >= stripeinfo->indexLength || use_index) { auto const index_type = get_stream_index_type(stream.kind); if (index_type < gpu::CI_NUM_STREAMS) { - auto& chunk = (*chunks.value())[stripe_index][col]; + auto& chunk = (*chunks.value())[stripe_processing_order][col]; // printf("use stream id: %d, stripe: %d, level: %d, col idx: %d, kind: %d\n", // (int)(*stream_idx), // (int)stripe_index, @@ -152,7 +150,7 @@ std::size_t gather_stream_info_and_column_desc( // (int)column_id, // (int)stream.kind); - chunk.strm_id[index_type] = *stream_idx; + chunk.strm_id[index_type] = *stream_processing_order; chunk.strm_len[index_type] = stream.length; // NOTE: skip_count field is temporarily used to track the presence of index streams chunk.skip_count |= 1 << index_type; @@ -165,7 +163,7 @@ std::size_t gather_stream_info_and_column_desc( } } - (*stream_idx)++; + (*stream_processing_order)++; } else { // not chunks.has_value() // printf("collect stream id: stripe: %d, level: %d, col idx: %d, kind: %d\n", // (int)stripe_index, @@ -177,7 +175,8 @@ std::size_t gather_stream_info_and_column_desc( stripeinfo->offset + src_offset, dst_offset, stream.length, - stream_id_info{static_cast(stripe_index), level, column_id, stream.kind}); + stream_id_info{ + static_cast(stripe_processing_order), level, column_id, stream.kind}); } dst_offset += stream.length; diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 7f2e0b15b8a..8a09090cc78 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -298,7 +298,7 @@ std::pair get_range(std::vector const& input_chunks, * data, but not both. */ std::size_t gather_stream_info_and_column_desc( - int64_t stripe_index, + int64_t stripe_processing_order, std::size_t level, orc::StripeInformation const* stripeinfo, orc::StripeFooter const* stripefooter, @@ -307,7 +307,7 @@ std::size_t gather_stream_info_and_column_desc( bool use_index, bool apply_struct_map, int64_t* num_dictionary_entries, - std::size_t* stream_idx, + std::size_t* stream_processing_order, std::optional*> const& stream_info, std::optional*> const& chunks); diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 255fe8c0b0c..e7aa9709608 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -1066,8 +1066,7 @@ void reader::impl::decompress_and_decode() int64_t num_rowgroups = 0; // TODO: Stripe and stream idx must be by chunk. 
- // std::size_t stripe_idx = 0; - std::size_t stream_idx = 0; + std::size_t stream_processing_order = 0; for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { #ifdef LOCAL_TEST @@ -1091,7 +1090,7 @@ void reader::impl::decompress_and_decode() use_index, level == 0, &num_dict_entries, - &stream_idx, + &stream_processing_order, std::nullopt, // stream_info &chunks); From d8163db9acbc864bc059cb81022a4ca5939d6d47 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Mar 2024 20:42:36 -0800 Subject: [PATCH 201/321] Rename variable Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 20 ++++++++------- cpp/src/io/orc/reader_impl_chunking.hpp | 16 ++++++------ cpp/src/io/orc/reader_impl_decode.cu | 34 ++++++++++++------------- 3 files changed, 36 insertions(+), 34 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index c00e7085bc3..2a6af96983a 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -175,7 +175,7 @@ std::size_t gather_stream_info_and_column_desc( stripeinfo->offset + src_offset, dst_offset, stream.length, - stream_id_info{ + stream_source_info{ static_cast(stripe_processing_order), level, column_id, stream.kind}); } @@ -628,17 +628,18 @@ void reader::impl::load_data() for (auto stream_idx = stream_begin; stream_idx < stream_end; ++stream_idx) { auto const& info = stream_info[stream_idx]; compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.id.stripe_idx - stripe_start].data()) + + static_cast(stripe_data[info.source.stripe_idx - stripe_start].data()) + info.dst_pos, info.length)); - stream_compinfo_map[stream_id_info{ - info.id.stripe_idx, info.id.level, info.id.orc_col_idx, info.id.kind}] = &compinfo.back(); + stream_compinfo_map[stream_source_info{ + info.source.stripe_idx, info.source.level, info.source.orc_col_idx, info.source.kind}] = + &compinfo.back(); #ifdef LOCAL_TEST printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n", - (int)info.id.stripe_idx, - (int)info.id.level, - (int)info.id.orc_col_idx, - (int)info.id.kind, + (int)info.source.stripe_idx, + (int)info.source.level, + (int)info.source.orc_col_idx, + (int)info.source.kind, info.dst_pos, info.length); fflush(stdout); @@ -688,7 +689,8 @@ void reader::impl::load_data() // Set decompression size equal to the input size. 
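// [Editor's note -- annotation, not part of the patch] This "decompression
// size equal to the input size" path appears to be the one taken when the
// source stripes carry no compression: each stream then occupies exactly its
// on-disk length, so the loop below reduces the per-stripe decode-size
// estimate to a plain sum of raw stream lengths.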
for (auto stream_idx = stream_begin; stream_idx < stream_end; ++stream_idx) { auto const& info = stream_info[stream_idx]; - stripe_decomp_sizes[info.id.stripe_idx - stripe_chunk.start_idx].size_bytes += info.length; + stripe_decomp_sizes[info.source.stripe_idx - stripe_chunk.start_idx].size_bytes += + info.length; } } diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 8a09090cc78..2543929bd72 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -32,15 +32,15 @@ namespace cudf::io::orc::detail { /** * @brief Struct that store identification of an ORC streams */ -struct stream_id_info { - uint32_t stripe_idx; // global stripe id throughout the data source +struct stream_source_info { + uint32_t stripe_idx; // global stripe id throughout all data sources // TODO: change type below std::size_t level; // level of the nested column uint32_t orc_col_idx; // orc column id StreamKind kind; // stream kind struct hash { - std::size_t operator()(stream_id_info const& id) const + std::size_t operator()(stream_source_info const& id) const { auto const hasher = std::hash{}; return hasher(id.stripe_idx) ^ hasher(id.level) ^ @@ -49,7 +49,7 @@ struct stream_id_info { } }; struct equal_to { - bool operator()(stream_id_info const& lhs, stream_id_info const& rhs) const + bool operator()(stream_source_info const& lhs, stream_source_info const& rhs) const { return lhs.stripe_idx == rhs.stripe_idx && lhs.level == rhs.level && lhs.orc_col_idx == rhs.orc_col_idx && lhs.kind == rhs.kind; @@ -62,7 +62,7 @@ struct stream_id_info { */ template using stream_id_map = - std::unordered_map; + std::unordered_map; /** * @brief Struct that store identification of an ORC stream. @@ -72,8 +72,8 @@ struct orc_stream_info { explicit orc_stream_info(uint64_t offset_, std::size_t dst_pos_, uint32_t length_, - stream_id_info const& id_) - : offset(offset_), dst_pos(dst_pos_), length(length_), id(id_) + stream_source_info const& source_) + : offset(offset_), dst_pos(dst_pos_), length(length_), source(source_) { #ifdef PRINT_DEBUG printf(" construct stripe id [%d, %d, %d, %d]\n", @@ -89,7 +89,7 @@ struct orc_stream_info { std::size_t length; // stream length to read // Store location of the stream in the stripe, so we can look up where this stream comes from. 
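// [Editor's note -- annotation, not part of the patch] This member is the key
// that ties the two passes together: load_data() stores the parsed compression
// info into compinfo_map under this {stripe_idx, level, orc_col_idx, kind}
// source, and decompress_stripe_data() later calls compinfo_map.at(...) with
// the same key to retrieve it without re-parsing the compressed blocks.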
- stream_id_info id; + stream_source_info source; }; /** diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index e7aa9709608..13b928c23c4 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -101,8 +101,8 @@ rmm::device_buffer decompress_stripe_data( // TODO: use lvl_stripe_stream_chunks std::size_t count{0}; for (auto const& info : stream_info) { - if (info.id.stripe_idx < stripe_chunk.start_idx || - info.id.stripe_idx >= stripe_chunk.start_idx + stripe_chunk.count) { + if (info.source.stripe_idx < stripe_chunk.start_idx || + info.source.stripe_idx >= stripe_chunk.start_idx + stripe_chunk.count) { continue; } count++; @@ -111,17 +111,17 @@ rmm::device_buffer decompress_stripe_data( cudf::detail::hostdevice_vector compinfo(0, count, stream); for (auto const& info : stream_info) { - if (info.id.stripe_idx < stripe_chunk.start_idx || - info.id.stripe_idx >= stripe_chunk.start_idx + stripe_chunk.count) { + if (info.source.stripe_idx < stripe_chunk.start_idx || + info.source.stripe_idx >= stripe_chunk.start_idx + stripe_chunk.count) { continue; } #ifdef LOCAL_TEST // printf("collec stream again [%d, %d, %d, %d]: dst = %lu, length = %lu\n", -// (int)info.id.stripe_idx, -// (int)info.id.level, -// (int)info.id.orc_col_idx, -// (int)info.id.kind, +// (int)info.source.stripe_idx, +// (int)info.source.level, +// (int)info.source.orc_col_idx, +// (int)info.source.kind, // info.dst_pos, // info.length); // fflush(stdout); @@ -129,19 +129,19 @@ rmm::device_buffer decompress_stripe_data( compinfo.push_back(gpu::CompressedStreamInfo( static_cast( - stripe_data[info.id.stripe_idx - load_stripe_chunk.start_idx].data()) + + stripe_data[info.source.stripe_idx - load_stripe_chunk.start_idx].data()) + info.dst_pos, info.length)); // printf("line %d\n", __LINE__); // fflush(stdout); - auto const& cached_comp_info = compinfo_map.at( - stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_col_idx, info.id.kind}); + auto const& cached_comp_info = compinfo_map.at(stream_source_info{ + info.source.stripe_idx, info.source.level, info.source.orc_col_idx, info.source.kind}); // printf("line %d\n", __LINE__); // fflush(stdout); // auto const& cached_comp_info = - // compinfo_map[stream_id_info{info.id.stripe_idx, info.id.level, info.id.orc_cold_idx, - // info.id.kind}]; + // compinfo_map[stream_id_info{info.source.stripe_idx, info.source.level, + // info.source.orc_cold_idx, info.source.kind}]; auto& stream_comp_info = compinfo.back(); stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks; @@ -171,10 +171,10 @@ rmm::device_buffer decompress_stripe_data( auto const& info = stream_info[i]; printf("compute info [%d, %d, %d, %d]: %lu | %lu | %lu\n", - (int)info.id.stripe_idx, - (int)info.id.level, - (int)info.id.orc_cold_idx, - (int)info.id.kind, + (int)info.source.stripe_idx, + (int)info.source.level, + (int)info.source.orc_cold_idx, + (int)info.source.kind, (size_t)compinfo[i].num_compressed_blocks, (size_t)compinfo[i].num_uncompressed_blocks, compinfo[i].max_uncompressed_size); From 3f57b5ff70cd6af2df27f3d2234ebd9871e0cb28 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Mar 2024 20:45:47 -0800 Subject: [PATCH 202/321] Change variable name Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 2 +- cpp/src/io/orc/reader_impl_chunking.hpp | 12 ++---------- cpp/src/io/orc/reader_impl_decode.cu | 2 
+- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 2a6af96983a..2bacbf6e72e 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -587,7 +587,7 @@ void reader::impl::load_data() // TODO: This is subpass // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. - stream_id_map stream_compinfo_map; + stream_source_map stream_compinfo_map; cudf::detail::hostdevice_vector stripe_decomp_sizes(stripe_chunk.count, _stream); diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 2543929bd72..371509c5eb4 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -61,27 +61,19 @@ struct stream_source_info { * @brief Map to lookup a value from stream id. */ template -using stream_id_map = +using stream_source_map = std::unordered_map; /** * @brief Struct that store identification of an ORC stream. */ struct orc_stream_info { - // TODO: remove constructor explicit orc_stream_info(uint64_t offset_, std::size_t dst_pos_, uint32_t length_, stream_source_info const& source_) : offset(offset_), dst_pos(dst_pos_), length(length_), source(source_) { -#ifdef PRINT_DEBUG - printf(" construct stripe id [%d, %d, %d, %d]\n", - (int)stripe_idx, - (int)level, - (int)orc_col_idx, - (int)kind); -#endif } // Data info: uint64_t offset; // offset in data source @@ -133,7 +125,7 @@ struct file_intermediate_data { std::size_t num_stripes() const { return selected_stripes.size(); } // Store the compression information for each data stream. - stream_id_map compinfo_map; + stream_source_map compinfo_map; // The buffers to store raw data read from disk, initialized for each reading stripe chunks. // After decoding, such buffers can be released. 
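[Editor's note] As a quick illustration of how the renamed stream_source_map is
meant to be used -- a minimal sketch with made-up key values, assuming the
stream_source_info, stripe_level_comp_info, and stream_source_map definitions
from reader_impl_chunking.hpp above, and that orc::DATA is a valid StreamKind:

  // Cache of compression info, keyed by where a stream comes from.
  stream_source_map<stripe_level_comp_info> compinfo_map;

  // Key: stripe 0, nesting level 0, ORC column 3, DATA stream (values made up).
  auto const key = stream_source_info{0, 0, 3, orc::DATA};

  // Loading pass: record block counts and the decompressed-size estimate.
  compinfo_map[key] = stripe_level_comp_info{/*num_compressed_blocks=*/8,
                                             /*num_uncompressed_blocks=*/0,
                                             /*total_decomp_size=*/1 << 20};

  // Decoding pass: the same source key retrieves the cached info, so the
  // compressed stream headers need not be parsed a second time.
  auto const& cached = compinfo_map.at(key);

The XOR-of-hashes combiner in stream_source_info::hash is cheap but collides
for symmetric keys (XOR is commutative); that is acceptable here since
equal_to still resolves lookups exactly.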
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 13b928c23c4..f96efaa0174 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -80,7 +80,7 @@ namespace { rmm::device_buffer decompress_stripe_data( chunk const& load_stripe_chunk, chunk const& stripe_chunk, - stream_id_map const& compinfo_map, + stream_source_map const& compinfo_map, OrcDecompressor const& decompressor, host_span stripe_data, host_span stream_info, From 7a04022a3ac3b2e9830e3fc2e99092bf84cef20d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Mar 2024 21:31:19 -0800 Subject: [PATCH 203/321] Change data type Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 36 ++++++++++++------------- cpp/src/io/orc/reader_impl_chunking.hpp | 23 ++++++++-------- cpp/src/io/orc/reader_impl_decode.cu | 10 ++++--- 3 files changed, 35 insertions(+), 34 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 2bacbf6e72e..ab0c5171f08 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -68,7 +68,7 @@ namespace cudf::io::orc::detail { std::size_t gather_stream_info_and_column_desc( - int64_t stripe_processing_order, + std::size_t stripe_processing_order, std::size_t level, orc::StripeInformation const* stripeinfo, orc::StripeFooter const* stripefooter, @@ -175,8 +175,7 @@ std::size_t gather_stream_info_and_column_desc( stripeinfo->offset + src_offset, dst_offset, stream.length, - stream_source_info{ - static_cast(stripe_processing_order), level, column_id, stream.kind}); + stream_source_info{stripe_processing_order, level, column_id, stream.kind}); } dst_offset += stream.length; @@ -193,7 +192,9 @@ std::size_t gather_stream_info_and_column_desc( * given `size_limit`. 
*/ template -std::vector find_splits(host_span sizes, int64_t total_count, size_t size_limit) +std::vector find_splits(host_span sizes, + std::size_t total_count, + std::size_t size_limit) { // if (size_limit == 0) { // printf("0 limit: output chunk = 0, %d\n", (int)total_count); @@ -202,7 +203,7 @@ std::vector find_splits(host_span sizes, int64_t total_count, si CUDF_EXPECTS(size_limit > 0, "Invalid size limit"); std::vector splits; - int64_t cur_count{0}; + std::size_t cur_count{0}; int64_t cur_pos{0}; size_t cur_cumulative_size{0}; @@ -242,7 +243,7 @@ std::vector find_splits(host_span sizes, int64_t total_count, si auto const start_idx = cur_count; cur_count = sizes[split_pos].count; - splits.emplace_back(chunk{start_idx, static_cast(cur_count - start_idx)}); + splits.emplace_back(chunk{start_idx, cur_count - start_idx}); cur_pos = split_pos; cur_cumulative_size = sizes[split_pos].size_bytes; @@ -266,10 +267,10 @@ std::vector find_splits(host_span sizes, int64_t total_count, si } template std::vector find_splits(host_span sizes, - int64_t total_count, - size_t size_limit); + std::size_t total_count, + std::size_t size_limit); template std::vector find_splits( - host_span sizes, int64_t total_count, size_t size_limit); + host_span sizes, std::size_t total_count, std::size_t size_limit); #endif /** @@ -400,7 +401,7 @@ void reader::impl::global_preprocess(read_mode mode) auto const stripe_footer = stripe.stripe_footer; std::size_t total_stripe_size{0}; - auto const last_read_size = static_cast(read_info.size()); + auto const last_read_size = read_info.size(); for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto& stream_info = _file_itm_data.lvl_stream_info[level]; auto& stripe_sizes = lvl_stripe_sizes[level]; @@ -428,10 +429,8 @@ void reader::impl::global_preprocess(read_mode mode) stripe_sizes[stripe_idx] = stripe_size; total_stripe_size += stripe_size; - auto& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; - stripe_stream_chunks[stripe_idx] = - chunk{static_cast(stream_count), - static_cast(stream_info.size() - stream_count)}; + auto& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; + stripe_stream_chunks[stripe_idx] = chunk{stream_count, stream_info.size() - stream_count}; // Coalesce consecutive streams into one read while (not is_stripe_data_empty and stream_count < stream_info.size()) { @@ -448,9 +447,8 @@ void reader::impl::global_preprocess(read_mode mode) read_info.emplace_back(offset, len, d_dst, stripe.source_idx, stripe_idx, level); } } - total_stripe_sizes[stripe_idx] = {1, total_stripe_size}; - stripe_data_read_chunks[stripe_idx] = - chunk{last_read_size, static_cast(read_info.size() - last_read_size)}; + total_stripe_sizes[stripe_idx] = {1, total_stripe_size}; + stripe_data_read_chunks[stripe_idx] = chunk{last_read_size, read_info.size() - last_read_size}; } _chunk_read_data.curr_load_stripe_chunk = 0; @@ -461,7 +459,7 @@ void reader::impl::global_preprocess(read_mode mode) printf("0 limit: output load stripe chunk = 0, %d\n", (int)num_stripes); #endif - _chunk_read_data.load_stripe_chunks = {chunk{0, static_cast(num_stripes)}}; + _chunk_read_data.load_stripe_chunks = {chunk{0ul, num_stripes}}; return; } @@ -591,7 +589,7 @@ void reader::impl::load_data() cudf::detail::hostdevice_vector stripe_decomp_sizes(stripe_chunk.count, _stream); - for (int64_t stripe_idx = 0; stripe_idx < stripe_chunk.count; ++stripe_idx) { + for (std::size_t stripe_idx = 0; stripe_idx < stripe_chunk.count; ++stripe_idx) { auto const& stripe = 
selected_stripes[stripe_idx]; auto const stripe_info = stripe.stripe_info; diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 371509c5eb4..9e70ec246a1 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -33,11 +33,10 @@ namespace cudf::io::orc::detail { * @brief Struct that store identification of an ORC streams */ struct stream_source_info { - uint32_t stripe_idx; // global stripe id throughout all data sources - // TODO: change type below - std::size_t level; // level of the nested column - uint32_t orc_col_idx; // orc column id - StreamKind kind; // stream kind + std::size_t stripe_idx; // global stripe id throughout all data sources + std::size_t level; // level of the nested column + uint32_t orc_col_idx; // orc column id + StreamKind kind; // stream kind struct hash { std::size_t operator()(stream_source_info const& id) const @@ -98,8 +97,8 @@ struct stripe_level_comp_info { * @brief Struct that store information about a chunk of data. */ struct chunk { - int64_t start_idx; - int64_t count; + std::size_t start_idx; + std::size_t count; }; /** @@ -247,13 +246,13 @@ struct chunk_read_data { * @brief Struct to accumulate sizes of chunks of some data such as stripe or rows. */ struct cumulative_size { - int64_t count{0}; + std::size_t count{0}; std::size_t size_bytes{0}; }; // TODO struct cumulative_size_and_row { - int64_t count{0}; + std::size_t count{0}; std::size_t size_bytes{0}; std::size_t rows{0}; }; @@ -279,7 +278,9 @@ struct cumulative_size_sum { * given `size_limit`. */ template -std::vector find_splits(host_span sizes, int64_t total_count, size_t size_limit); +std::vector find_splits(host_span sizes, + std::size_t total_count, + std::size_t size_limit); // TODO std::pair get_range(std::vector const& input_chunks, @@ -290,7 +291,7 @@ std::pair get_range(std::vector const& input_chunks, * data, but not both. */ std::size_t gather_stream_info_and_column_desc( - int64_t stripe_processing_order, + std::size_t stripe_processing_order, std::size_t level, orc::StripeInformation const* stripeinfo, orc::StripeFooter const* stripefooter, diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index f96efaa0174..dba2a4b135e 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -787,7 +787,8 @@ std::vector find_table_splits(table_view const& input, auto const current_length = cuda::std::min(segment_length, num_rows - segment_length * segment_idx); auto const size = d_sizes[segment_idx]; - return cumulative_size{current_length, static_cast(size)}; + return cumulative_size{static_cast(current_length), + static_cast(size)}; }); #ifdef LOCAL_TEST @@ -1262,7 +1263,7 @@ void reader::impl::decompress_and_decode() // TODO: only reset each one if the new size/type are different. 
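// [Editor's note -- annotation, not part of the patch] Once decompression has
// run, the raw (still-compressed) buffers for this decode range are no longer
// needed: the first slot below is overwritten with the decompressed buffer and
// the remaining slots are assigned empty buffers, releasing their device
// memory early.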
stripe_data[stripe_start - load_stripe_start] = std::move(decomp_data); - for (int64_t i = 1; i < stripe_chunk.count; ++i) { + for (std::size_t i = 1; i < stripe_chunk.count; ++i) { stripe_data[i + stripe_start - load_stripe_start] = {}; } @@ -1467,7 +1468,7 @@ void reader::impl::decompress_and_decode() if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { stripe_data[stripe_start - load_stripe_start] = {}; } else { - for (int64_t i = 0; i < stripe_chunk.count; ++i) { + for (std::size_t i = 0; i < stripe_chunk.count; ++i) { stripe_data[i + stripe_start - load_stripe_start] = {}; } } @@ -1491,7 +1492,8 @@ void reader::impl::decompress_and_decode() _chunk_read_data.curr_output_table_chunk = 0; _chunk_read_data.output_table_chunks = _chunk_read_data.output_size_limit == 0 - ? std::vector{chunk{0, _chunk_read_data.decoded_table->num_rows()}} + ? std::vector{chunk{ + 0, static_cast(_chunk_read_data.decoded_table->num_rows())}} : find_table_splits(_chunk_read_data.decoded_table->view(), _chunk_read_data.output_row_granularity, _chunk_read_data.output_size_limit, From bcdfab89a1066c78e9dfd8744ccaee1d08f7c818 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Mar 2024 21:56:19 -0800 Subject: [PATCH 204/321] Change from chunk to range Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 15 ++-- cpp/src/io/orc/reader_impl_chunking.cu | 107 ++++++++++++------------ cpp/src/io/orc/reader_impl_chunking.hpp | 41 ++++----- cpp/src/io/orc/reader_impl_decode.cu | 60 +++++++------ 4 files changed, 105 insertions(+), 118 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 51bbd47d690..57770cec4fe 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -129,8 +129,8 @@ table_with_metadata reader::impl::make_output_chunk() } auto out_table = [&] { - if (_chunk_read_data.output_table_chunks.size() == 1) { - _chunk_read_data.curr_output_table_chunk++; + if (_chunk_read_data.output_table_ranges.size() == 1) { + _chunk_read_data.curr_output_table_range++; #ifdef LOCAL_TEST printf("one chunk, no more table---------------------------------\n"); #endif @@ -147,12 +147,11 @@ table_with_metadata reader::impl::make_output_chunk() #endif auto const out_chunk = - _chunk_read_data.output_table_chunks[_chunk_read_data.curr_output_table_chunk++]; - auto const out_tview = - cudf::detail::slice(_chunk_read_data.decoded_table->view(), - {static_cast(out_chunk.start_idx), - static_cast(out_chunk.start_idx + out_chunk.count)}, - _stream)[0]; + _chunk_read_data.output_table_ranges[_chunk_read_data.curr_output_table_range++]; + auto const out_tview = cudf::detail::slice( + _chunk_read_data.decoded_table->view(), + {static_cast(out_chunk.begin), static_cast(out_chunk.end)}, + _stream)[0]; #ifdef LOCAL_TEST { diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index ab0c5171f08..0333492d1c7 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -192,7 +192,7 @@ std::size_t gather_stream_info_and_column_desc( * given `size_limit`. 
*/ template -std::vector find_splits(host_span sizes, +std::vector find_splits(host_span sizes, std::size_t total_count, std::size_t size_limit) { @@ -202,7 +202,7 @@ std::vector find_splits(host_span sizes, // } CUDF_EXPECTS(size_limit > 0, "Invalid size limit"); - std::vector splits; + std::vector splits; std::size_t cur_count{0}; int64_t cur_pos{0}; size_t cur_cumulative_size{0}; @@ -241,9 +241,13 @@ std::vector find_splits(host_span sizes, split_pos++; } + // #ifdef LOCAL_TEST + // printf(" split_pos: %d\n", (int)split_pos); + // #endif + auto const start_idx = cur_count; cur_count = sizes[split_pos].count; - splits.emplace_back(chunk{start_idx, cur_count - start_idx}); + splits.emplace_back(range{start_idx, cur_count}); cur_pos = split_pos; cur_cumulative_size = sizes[split_pos].size_bytes; @@ -257,45 +261,39 @@ std::vector find_splits(host_span sizes, if (splits.size() > 1) { auto constexpr merge_threshold = 0.15; if (auto const last = splits.back(), second_last = splits[splits.size() - 2]; - last.count <= static_cast(merge_threshold * second_last.count)) { + (last.end - last.begin) <= + static_cast(merge_threshold * (second_last.end - second_last.begin))) { splits.pop_back(); - splits.back().count += last.count; + splits.back().end = last.end; } } return splits; } -template std::vector find_splits(host_span sizes, +template std::vector find_splits(host_span sizes, std::size_t total_count, std::size_t size_limit); -template std::vector find_splits( +template std::vector find_splits( host_span sizes, std::size_t total_count, std::size_t size_limit); #endif /** - * @brief Find range of the data span by a given chunk of chunks. + * @brief Find range of the data span by a given range of ranges. * - * @param input_chunks The list of all data chunks - * @param selected_chunks A chunk of chunks in the input_chunks - * @return The range of data span by the selected chunk of given chunks + * @param input_ranges The list of all data chunks + * @param selected_ranges A chunk of chunks in the input_chunks + * @return The range of data span by the selected range of given chunks */ -std::pair get_range(std::vector const& input_chunks, - chunk const& selected_chunks) +std::pair get_range(std::vector const& input_ranges, + range const& selected_ranges) { - // Range indices to input_chunks - auto const chunk_begin = selected_chunks.start_idx; - auto const chunk_end = selected_chunks.start_idx + selected_chunks.count; - - // The first and last chunk, according to selected_chunk. - auto const& first_chunk = input_chunks[chunk_begin]; - auto const& last_chunk = input_chunks[chunk_end - 1]; - - // The range of data covered from the first to the last chunk. - auto const begin = first_chunk.start_idx; - auto const end = last_chunk.start_idx + last_chunk.count; + // The first and last range, according to selected_chunk. + auto const& first_range = input_ranges[selected_ranges.begin]; + auto const& last_range = input_ranges[selected_ranges.end - 1]; - return {begin, end}; + // The range of data covered from the first to the last range. 
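// [Editor's note -- worked example, not part of the patch] With, say,
// input_ranges = {{0, 3}, {3, 7}, {7, 9}} (the stream ranges of three stripes)
// and selected_ranges = {1, 3} (stripes 1..2), the return below yields {3, 9}:
// every stream belonging to the selected stripes.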
+ return {first_range.begin, last_range.end}; } void reader::impl::global_preprocess(read_mode mode) @@ -337,8 +335,8 @@ void reader::impl::global_preprocess(read_mode mode) lvl_stripe_sizes.resize(_selected_columns.num_levels()); auto& read_info = _file_itm_data.data_read_info; - auto& stripe_data_read_chunks = _file_itm_data.stripe_data_read_chunks; - auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_chunks; + auto& stripe_data_read_chunks = _file_itm_data.stripe_data_read_ranges; + auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_ranges; // Logically view streams as columns _file_itm_data.lvl_stream_info.resize(_selected_columns.num_levels()); @@ -430,7 +428,7 @@ void reader::impl::global_preprocess(read_mode mode) total_stripe_size += stripe_size; auto& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; - stripe_stream_chunks[stripe_idx] = chunk{stream_count, stream_info.size() - stream_count}; + stripe_stream_chunks[stripe_idx] = range{stream_count, stream_info.size()}; // Coalesce consecutive streams into one read while (not is_stripe_data_empty and stream_count < stream_info.size()) { @@ -448,10 +446,10 @@ void reader::impl::global_preprocess(read_mode mode) } } total_stripe_sizes[stripe_idx] = {1, total_stripe_size}; - stripe_data_read_chunks[stripe_idx] = chunk{last_read_size, read_info.size() - last_read_size}; + stripe_data_read_chunks[stripe_idx] = range{last_read_size, read_info.size()}; } - _chunk_read_data.curr_load_stripe_chunk = 0; + _chunk_read_data.curr_load_stripe_range = 0; // Load all chunks if there is no read limit. if (_chunk_read_data.data_read_limit == 0) { @@ -459,7 +457,7 @@ void reader::impl::global_preprocess(read_mode mode) printf("0 limit: output load stripe chunk = 0, %d\n", (int)num_stripes); #endif - _chunk_read_data.load_stripe_chunks = {chunk{0ul, num_stripes}}; + _chunk_read_data.load_stripe_ranges = {range{0ul, num_stripes}}; return; } @@ -499,14 +497,14 @@ void reader::impl::global_preprocess(read_mode mode) chunk_read_data::load_limit_ratio); return tmp > 0UL ? tmp : 1UL; }(); - _chunk_read_data.load_stripe_chunks = + _chunk_read_data.load_stripe_ranges = find_splits(total_stripe_sizes, num_stripes, load_limit); #ifdef LOCAL_TEST - auto& splits = _chunk_read_data.load_stripe_chunks; + auto& splits = _chunk_read_data.load_stripe_ranges; printf("------------\nSplits (/total num stripe = %d): \n", (int)num_stripes); for (size_t idx = 0; idx < splits.size(); idx++) { - printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count); + printf("{%ld, %ld}\n", splits[idx].begin, splits[idx].end); } fflush(stdout); #endif @@ -525,9 +523,10 @@ void reader::impl::load_data() // std::size_t num_stripes = selected_stripes.size(); auto const stripe_chunk = - _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk++]; - auto const stripe_start = stripe_chunk.start_idx; - auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; + _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range++]; + auto const stripe_start = stripe_chunk.begin; + auto const stripe_end = stripe_chunk.end; + auto const stripe_count = stripe_end - stripe_start; #ifdef LOCAL_TEST printf("\n\nloading data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); @@ -537,7 +536,7 @@ void reader::impl::load_data() // TODO: clear all old buffer. 
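// [Editor's note -- annotation, not part of the patch] lvl_stripe_data is
// recycled between load ranges: the resize below re-targets it at just the
// stripes of the current range, and each slot is then overwritten with a
// fresh allocation sized for its stripe, at which point the previous range's
// buffers are released. The TODO presumably refers to releasing them up front
// instead of at overwrite time.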
for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto& stripe_data = lvl_stripe_data[level]; - stripe_data.resize(stripe_chunk.count); + stripe_data.resize(stripe_count); auto& stripe_sizes = lvl_stripe_sizes[level]; for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { @@ -550,7 +549,7 @@ void reader::impl::load_data() std::vector> host_read_buffers; std::vector, std::size_t>> read_tasks; - auto const& stripe_data_read_chunks = _file_itm_data.stripe_data_read_chunks; + auto const& stripe_data_read_chunks = _file_itm_data.stripe_data_read_ranges; auto const [read_begin, read_end] = get_range(stripe_data_read_chunks, stripe_chunk); for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) { @@ -581,15 +580,15 @@ void reader::impl::load_data() CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); } - auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_chunks; + auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_ranges; // TODO: This is subpass // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. stream_source_map stream_compinfo_map; - cudf::detail::hostdevice_vector stripe_decomp_sizes(stripe_chunk.count, + cudf::detail::hostdevice_vector stripe_decomp_sizes(stripe_count, _stream); - for (std::size_t stripe_idx = 0; stripe_idx < stripe_chunk.count; ++stripe_idx) { + for (std::size_t stripe_idx = 0; stripe_idx < stripe_count; ++stripe_idx) { auto const& stripe = selected_stripes[stripe_idx]; auto const stripe_info = stripe.stripe_info; @@ -659,7 +658,7 @@ void reader::impl::load_data() compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, stream_compinfo->num_uncompressed_blocks, stream_compinfo->max_uncompressed_size}; - stripe_decomp_sizes[stream_id.stripe_idx - stripe_chunk.start_idx].size_bytes += + stripe_decomp_sizes[stream_id.stripe_idx - stripe_start].size_bytes += stream_compinfo->max_uncompressed_size; #ifdef LOCAL_TEST @@ -687,8 +686,7 @@ void reader::impl::load_data() // Set decompression size equal to the input size. for (auto stream_idx = stream_begin; stream_idx < stream_end; ++stream_idx) { auto const& info = stream_info[stream_idx]; - stripe_decomp_sizes[info.source.stripe_idx - stripe_chunk.start_idx].size_bytes += - info.length; + stripe_decomp_sizes[info.source.stripe_idx - stripe_start].size_bytes += info.length; } } @@ -697,7 +695,7 @@ void reader::impl::load_data() } // end loop level // Decoding is reset to start from the first chunk in `decode_stripe_chunks`. - _chunk_read_data.curr_decode_stripe_chunk = 0; + _chunk_read_data.curr_decode_stripe_range = 0; // Decode all chunks if there is no read and no output limit. // In theory, we should just decode enough stripes for output one table chunk. @@ -715,7 +713,7 @@ void reader::impl::load_data() printf("0 limit: output decode stripe chunk unchanged\n"); #endif - _chunk_read_data.decode_stripe_chunks = {stripe_chunk}; + _chunk_read_data.decode_stripe_ranges = {stripe_chunk}; return; } @@ -764,17 +762,18 @@ void reader::impl::load_data() (1.0 - chunk_read_data::load_limit_ratio)); return tmp > 0UL ? 
tmp : 1UL; }(); - _chunk_read_data.decode_stripe_chunks = - find_splits(stripe_decomp_sizes, stripe_chunk.count, decode_limit); - for (auto& chunk : _chunk_read_data.decode_stripe_chunks) { - chunk.start_idx += stripe_chunk.start_idx; + _chunk_read_data.decode_stripe_ranges = + find_splits(stripe_decomp_sizes, stripe_count, decode_limit); + for (auto& chunk : _chunk_read_data.decode_stripe_ranges) { + chunk.begin += stripe_start; + chunk.end += stripe_start; } #ifdef LOCAL_TEST - auto& splits = _chunk_read_data.decode_stripe_chunks; - printf("------------\nSplits decode_stripe_chunks (/%d): \n", (int)stripe_chunk.count); + auto& splits = _chunk_read_data.decode_stripe_ranges; + printf("------------\nSplits decode_stripe_chunks (/%d): \n", (int)stripe_count); for (size_t idx = 0; idx < splits.size(); idx++) { - printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count); + printf("{%ld, %ld}\n", splits[idx].begin, splits[idx].end); } fflush(stdout); #endif diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 9e70ec246a1..ff54d542a2b 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -92,21 +92,12 @@ struct stripe_level_comp_info { std::size_t total_decomp_size{0}; }; -// TODO: remove this and use range instead -/** - * @brief Struct that store information about a chunk of data. - */ -struct chunk { - std::size_t start_idx; - std::size_t count; -}; - /** * @brief Struct that store information about a range of data. */ struct range { - int64_t begin; - int64_t end; + std::size_t begin; + std::size_t end; }; /** @@ -166,14 +157,14 @@ struct file_intermediate_data { // For each stripe, we perform a number of read for its streams. // Those reads are identified by a chunk of consecutive read info, stored in data_read_info. - std::vector stripe_data_read_chunks; + std::vector stripe_data_read_ranges; // Store info for each ORC stream at each nested level. std::vector> lvl_stream_info; // At each nested level, the streams for each stripe are stored consecutively in lvl_stream_info. // This is used to identify the range of streams for each stripe from that vector. - std::vector> lvl_stripe_stream_chunks; + std::vector> lvl_stripe_stream_ranges; // TODO rename std::vector>> null_count_prefix_sums; @@ -207,25 +198,25 @@ struct chunk_read_data { // Chunks of stripes that can be load into memory such that their data size is within a size // limit. - std::vector load_stripe_chunks; - std::size_t curr_load_stripe_chunk{0}; - bool more_stripe_to_load() const { return curr_load_stripe_chunk < load_stripe_chunks.size(); } + std::vector load_stripe_ranges; + std::size_t curr_load_stripe_range{0}; + bool more_stripe_to_load() const { return curr_load_stripe_range < load_stripe_ranges.size(); } // Chunks of stripes such that their decompression size is within a size limit. - std::vector decode_stripe_chunks; - std::size_t curr_decode_stripe_chunk{0}; + std::vector decode_stripe_ranges; + std::size_t curr_decode_stripe_range{0}; bool more_stripe_to_decode() const { - return curr_decode_stripe_chunk < decode_stripe_chunks.size(); + return curr_decode_stripe_range < decode_stripe_ranges.size(); } // Chunk of rows in the internal decoded table to output for each `read_chunk()`. 
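// [Editor's note -- annotation, not part of the patch] chunk_read_data thus
// tracks three nested levels of progress: load_stripe_ranges (stripes whose
// raw on-disk size fits the load budget), decode_stripe_ranges (stripes whose
// decompressed size fits the decode budget), and the output table ranges below
// (row slices of the decoded table kept under output_size_limit). With a
// hypothetical data_read_limit of 100 MB and the load_limit_ratio of 0.4 seen
// earlier, loading targets roughly 40 MB of raw data per range while
// decompression and decoding target the remaining ~60 MB.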
- std::vector output_table_chunks; - std::size_t curr_output_table_chunk{0}; + std::vector output_table_ranges; + std::size_t curr_output_table_range{0}; std::unique_ptr decoded_table; bool more_table_chunk_to_output() const { - return curr_output_table_chunk < output_table_chunks.size(); + return curr_output_table_range < output_table_ranges.size(); } // Only has more chunk to output if: @@ -278,13 +269,13 @@ struct cumulative_size_sum { * given `size_limit`. */ template -std::vector find_splits(host_span sizes, +std::vector find_splits(host_span sizes, std::size_t total_count, std::size_t size_limit); // TODO -std::pair get_range(std::vector const& input_chunks, - chunk const& selected_chunks); +std::pair get_range(std::vector const& input_ranges, + range const& selected_ranges); /** * @brief Function that populates descriptors for either individual streams or chunks of column diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index dba2a4b135e..87fdc40b351 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -78,8 +78,8 @@ namespace { * @return Device buffer to decompressed page data */ rmm::device_buffer decompress_stripe_data( - chunk const& load_stripe_chunk, - chunk const& stripe_chunk, + range const& load_stripe_range, + range const& stripe_range, stream_source_map const& compinfo_map, OrcDecompressor const& decompressor, host_span stripe_data, @@ -101,8 +101,7 @@ rmm::device_buffer decompress_stripe_data( // TODO: use lvl_stripe_stream_chunks std::size_t count{0}; for (auto const& info : stream_info) { - if (info.source.stripe_idx < stripe_chunk.start_idx || - info.source.stripe_idx >= stripe_chunk.start_idx + stripe_chunk.count) { + if (info.source.stripe_idx < stripe_range.begin || info.source.stripe_idx >= stripe_range.end) { continue; } count++; @@ -111,8 +110,7 @@ rmm::device_buffer decompress_stripe_data( cudf::detail::hostdevice_vector compinfo(0, count, stream); for (auto const& info : stream_info) { - if (info.source.stripe_idx < stripe_chunk.start_idx || - info.source.stripe_idx >= stripe_chunk.start_idx + stripe_chunk.count) { + if (info.source.stripe_idx < stripe_range.begin || info.source.stripe_idx >= stripe_range.end) { continue; } @@ -129,7 +127,7 @@ rmm::device_buffer decompress_stripe_data( compinfo.push_back(gpu::CompressedStreamInfo( static_cast( - stripe_data[info.source.stripe_idx - load_stripe_chunk.start_idx].data()) + + stripe_data[info.source.stripe_idx - load_stripe_range.begin].data()) + info.dst_pos, info.length)); @@ -749,7 +747,7 @@ void generate_offsets_for_list(host_span buff_data, rmm::cuda_ * @param stream * @return */ -std::vector find_table_splits(table_view const& input, +std::vector find_table_splits(table_view const& input, size_type segment_length, std::size_t size_limit, rmm::cuda_stream_view stream) @@ -825,12 +823,13 @@ void reader::impl::decompress_and_decode() if (_file_itm_data.has_no_data()) { return; } auto const stripe_chunk = - _chunk_read_data.decode_stripe_chunks[_chunk_read_data.curr_decode_stripe_chunk++]; - auto const stripe_start = stripe_chunk.start_idx; - auto const stripe_end = stripe_chunk.start_idx + stripe_chunk.count; + _chunk_read_data.decode_stripe_ranges[_chunk_read_data.curr_decode_stripe_range++]; + auto const stripe_start = stripe_chunk.begin; + auto const stripe_end = stripe_chunk.end; + auto const stripe_count = stripe_chunk.end - stripe_chunk.begin; auto const load_stripe_start = - 
_chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk - 1].start_idx; + _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1].begin; #ifdef LOCAL_TEST printf("\ndecoding data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); @@ -903,7 +902,6 @@ void reader::impl::decompress_and_decode() // // TODO: move this to reader_impl.cu, decomp and decode step // std::size_t num_stripes = selected_stripes.size(); - std::size_t num_stripes = stripe_chunk.count; // Iterates through levels of nested columns, child column will be one level down // compared to parent column. @@ -967,7 +965,7 @@ void reader::impl::decompress_and_decode() #endif - auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_chunks; + auto& lvl_stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges; for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { #ifdef LOCAL_TEST @@ -981,8 +979,8 @@ void reader::impl::decompress_and_decode() } #endif - auto const& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; - auto const [stream_begin, stream_end] = get_range(stripe_stream_chunks, stripe_chunk); + auto const& stripe_stream_ranges = lvl_stripe_stream_ranges[level]; + auto const [stream_begin, stream_end] = get_range(stripe_stream_ranges, stripe_chunk); auto& columns_level = _selected_columns.levels[level]; @@ -1019,7 +1017,7 @@ void reader::impl::decompress_and_decode() auto const num_columns = columns_level.size(); auto& chunks = lvl_chunks[level]; - chunks = cudf::detail::hostdevice_2dvector(num_stripes, num_columns, _stream); + chunks = cudf::detail::hostdevice_2dvector(stripe_count, num_columns, _stream); memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); #ifdef LOCAL_TEST @@ -1038,7 +1036,7 @@ void reader::impl::decompress_and_decode() // Only use if we don't have much work with complete columns & stripes // TODO: Consider nrows, gpu, and tune the threshold (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && - _metadata.get_row_index_stride() != 0 && num_columns * num_stripes < 8 * 128) && + _metadata.get_row_index_stride() != 0 && num_columns * stripe_count < 8 * 128) && // Only use if first row is aligned to a stripe boundary // TODO: Fix logic to handle unaligned rows (rows_to_skip == 0); @@ -1056,7 +1054,7 @@ void reader::impl::decompress_and_decode() _selected_columns.levels[level].size(), [&]() { return cudf::detail::make_zeroed_device_uvector_async( - num_stripes, _stream, rmm::mr::get_current_device_resource()); + stripe_count, _stream, rmm::mr::get_current_device_resource()); }); // Tracker for eventually deallocating compressed and uncompressed data @@ -1234,7 +1232,7 @@ void reader::impl::decompress_and_decode() // printf("decompress----------------------\n"); // printf("line %d\n", __LINE__); // fflush(stdout); - CUDF_EXPECTS(_chunk_read_data.curr_load_stripe_chunk > 0, "ERRRRR"); + CUDF_EXPECTS(_chunk_read_data.curr_load_stripe_range > 0, "ERRRRR"); #ifdef LOCAL_TEST { @@ -1246,7 +1244,7 @@ void reader::impl::decompress_and_decode() #endif auto decomp_data = decompress_stripe_data( - _chunk_read_data.load_stripe_chunks[_chunk_read_data.curr_load_stripe_chunk - 1], + _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1], stripe_chunk, _file_itm_data.compinfo_map, *_metadata.per_file_metadata[0].decompressor, @@ -1254,7 +1252,7 @@ void reader::impl::decompress_and_decode() stream_info, chunks, row_groups, - num_stripes, + stripe_count, 
_metadata.get_row_index_stride(), level == 0, _stream); @@ -1263,7 +1261,7 @@ void reader::impl::decompress_and_decode() // TODO: only reset each one if the new size/type are different. stripe_data[stripe_start - load_stripe_start] = std::move(decomp_data); - for (std::size_t i = 1; i < stripe_chunk.count; ++i) { + for (std::size_t i = 1; i < stripe_count; ++i) { stripe_data[i + stripe_start - load_stripe_start] = {}; } @@ -1292,7 +1290,7 @@ void reader::impl::decompress_and_decode() nullptr, chunks.base_device_ptr(), num_columns, - num_stripes, + stripe_count, _metadata.get_row_index_stride(), level == 0, _stream); @@ -1326,7 +1324,7 @@ void reader::impl::decompress_and_decode() for (std::size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; - for (std::size_t j = 0; j < num_stripes; ++j) { + for (std::size_t j = 0; j < stripe_count; ++j) { if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { #ifdef LOCAL_TEST printf(" is nullable\n"); @@ -1468,7 +1466,7 @@ void reader::impl::decompress_and_decode() if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { stripe_data[stripe_start - load_stripe_start] = {}; } else { - for (std::size_t i = 0; i < stripe_chunk.count; ++i) { + for (std::size_t i = 0; i < stripe_count; ++i) { stripe_data[i + stripe_start - load_stripe_start] = {}; } } @@ -1489,10 +1487,10 @@ void reader::impl::decompress_and_decode() // DEBUG only // _chunk_read_data.output_size_limit = _chunk_read_data.data_read_limit / 3; - _chunk_read_data.curr_output_table_chunk = 0; - _chunk_read_data.output_table_chunks = + _chunk_read_data.curr_output_table_range = 0; + _chunk_read_data.output_table_ranges = _chunk_read_data.output_size_limit == 0 - ? std::vector{chunk{ + ? std::vector{range{ 0, static_cast(_chunk_read_data.decoded_table->num_rows())}} : find_table_splits(_chunk_read_data.decoded_table->view(), _chunk_read_data.output_row_granularity, @@ -1500,11 +1498,11 @@ void reader::impl::decompress_and_decode() _stream); #ifdef LOCAL_TEST - auto& splits = _chunk_read_data.output_table_chunks; + auto& splits = _chunk_read_data.output_table_ranges; printf("------------\nSplits decoded table (/total num rows = %d): \n", (int)_chunk_read_data.decoded_table->num_rows()); for (size_t idx = 0; idx < splits.size(); idx++) { - printf("{%ld, %ld}\n", splits[idx].start_idx, splits[idx].count); + printf("{%ld, %ld}\n", splits[idx].begin, splits[idx].end); } fflush(stdout); From 98d82fc728cff400657509ef4eebf4e42c2ea732 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Mar 2024 22:30:46 -0800 Subject: [PATCH 205/321] Cleanup Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 2 +- cpp/src/io/orc/reader_impl_chunking.hpp | 31 +++++++++++-------------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 0333492d1c7..b54a4483523 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -442,7 +442,7 @@ void reader::impl::global_preprocess(read_mode mode) len += stream_info[stream_count].length; stream_count++; } - read_info.emplace_back(offset, len, d_dst, stripe.source_idx, stripe_idx, level); + read_info.emplace_back(offset, d_dst, len, stripe.source_idx, stripe_idx, level); } } total_stripe_sizes[stripe_idx] = {1, total_stripe_size}; diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index ff54d542a2b..5731bd242d9 100644 --- 
a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -24,13 +24,12 @@ #include #include -#include #include namespace cudf::io::orc::detail { /** - * @brief Struct that store identification of an ORC streams + * @brief Struct that store source information of an ORC streams. */ struct stream_source_info { std::size_t stripe_idx; // global stripe id throughout all data sources @@ -57,14 +56,14 @@ struct stream_source_info { }; /** - * @brief Map to lookup a value from stream id. + * @brief Map to lookup a value from stream source. */ template using stream_source_map = std::unordered_map; /** - * @brief Struct that store identification of an ORC stream. + * @brief Struct that store information of an ORC stream. */ struct orc_stream_info { explicit orc_stream_info(uint64_t offset_, @@ -79,7 +78,7 @@ struct orc_stream_info { std::size_t dst_pos; // offset to store data in memory relative to start of raw stripe data std::size_t length; // stream length to read - // Store location of the stream in the stripe, so we can look up where this stream comes from. + // Store source of the stream in the stripe, so we can look up where this stream comes from. stream_source_info source; }; @@ -101,7 +100,7 @@ struct range { }; /** - * @brief Struct to store file-level data that remains constant for all chunks being output. + * @brief Struct to store intermediate processing data loaded from data sources. */ struct file_intermediate_data { int64_t rows_to_skip; @@ -111,9 +110,6 @@ struct file_intermediate_data { // Return true if no rows or stripes to read. bool has_no_data() const { return rows_to_read == 0 || selected_stripes.empty(); } - // TODO: remove - std::size_t num_stripes() const { return selected_stripes.size(); } - // Store the compression information for each data stream. stream_source_map compinfo_map; @@ -129,26 +125,26 @@ struct file_intermediate_data { // Store information to identify where to read a chunk of data from source. // Each read corresponds to one or more consecutive streams combined. struct stream_data_read_info { - // TODO: remove constructor stream_data_read_info(uint64_t offset_, - std::size_t length_, std::size_t dst_pos_, + std::size_t length_, std::size_t source_idx_, std::size_t stripe_idx_, std::size_t level_) : offset(offset_), - length(length_), dst_pos(dst_pos_), + length(length_), source_idx(source_idx_), stripe_idx(stripe_idx_), level(level_) { } + uint64_t offset; // offset in data source std::size_t dst_pos; // offset to store data in memory relative to start of raw stripe data std::size_t length; // data length to read std::size_t source_idx; // the data source id - std::size_t stripe_idx; // stream id TODO: processing or source stripe id? + std::size_t stripe_idx; // global stripe index std::size_t level; // nested level }; @@ -190,10 +186,11 @@ struct chunk_read_data { } // TODO: const for 3 below? 
- std::size_t output_size_limit; // maximum size (in bytes) of an output chunk, or 0 for no limit - std::size_t data_read_limit; // approximate maximum size (in bytes) used for store - // intermediate data, or 0 for no limit - size_type output_row_granularity; // TODO + std::size_t const + output_size_limit; // maximum size (in bytes) of an output chunk, or 0 for no limit + std::size_t const data_read_limit; // approximate maximum size (in bytes) used for store + // intermediate data, or 0 for no limit + size_type const output_row_granularity; // TODO static double constexpr load_limit_ratio{0.4}; // TODO // Chunks of stripes that can be load into memory such that their data size is within a size From 1ec9dc0ef7cc0f10adc3602d3d75fd14b633f3f3 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 9 Mar 2024 19:19:13 -0800 Subject: [PATCH 206/321] Cleanup and rename variable Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.hpp | 1 + cpp/src/io/orc/reader_impl_chunking.cu | 46 ++++++++----------------- cpp/src/io/orc/reader_impl_chunking.hpp | 45 ++++++++++++++++++------ 3 files changed, 50 insertions(+), 42 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index ae518bc2a5f..b3f91e5e92a 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -212,6 +212,7 @@ class reader::impl { table_metadata _out_metadata; std::vector> _out_buffers; + // The default value used for subdividing the decoded table for final output. static constexpr size_type DEFAULT_OUTPUT_ROW_GRANULARITY = 10'000; }; diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index b54a4483523..44a247c2405 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -84,8 +84,8 @@ std::size_t gather_stream_info_and_column_desc( CUDF_EXPECTS(stream_info.has_value() ^ chunks.has_value(), "Either stream_info or chunks must be provided, but not both."); - uint64_t src_offset = 0; - uint64_t dst_offset = 0; + std::size_t src_offset = 0; + std::size_t dst_offset = 0; auto const get_stream_index_type = [](orc::StreamKind kind) { switch (kind) { @@ -186,20 +186,11 @@ std::size_t gather_stream_info_and_column_desc( return dst_offset; } -#if 1 -/** - * @brief Find the splits of the input data such that each split has cumulative size less than a - * given `size_limit`. - */ template -std::vector find_splits(host_span sizes, +std::vector find_splits(host_span cumulative_sizes, std::size_t total_count, std::size_t size_limit) { - // if (size_limit == 0) { - // printf("0 limit: output chunk = 0, %d\n", (int)total_count); - // return {chunk{0, total_count}}; - // } CUDF_EXPECTS(size_limit > 0, "Invalid size limit"); std::vector splits; @@ -210,8 +201,9 @@ std::vector find_splits(host_span sizes, [[maybe_unused]] size_t cur_cumulative_rows{0}; auto const start = thrust::make_transform_iterator( - sizes.begin(), [&](auto const& size) { return size.size_bytes - cur_cumulative_size; }); - auto const end = start + static_cast(sizes.size()); + cumulative_sizes.begin(), + [&](auto const& size) { return size.size_bytes - cur_cumulative_size; }); + auto const end = start + static_cast(cumulative_sizes.size()); while (cur_count < total_count) { int64_t split_pos = @@ -219,13 +211,13 @@ std::vector find_splits(host_span sizes, // If we're past the end, or if the returned bucket is bigger than the chunk_read_limit, move // back one. 
- if (static_cast(split_pos) >= sizes.size() || - (sizes[split_pos].size_bytes - cur_cumulative_size > size_limit)) { + if (static_cast(split_pos) >= cumulative_sizes.size() || + (cumulative_sizes[split_pos].size_bytes - cur_cumulative_size > size_limit)) { split_pos--; } if constexpr (std::is_same_v) { - while (split_pos > 0 && sizes[split_pos].rows - cur_cumulative_rows > + while (split_pos > 0 && cumulative_sizes[split_pos].rows - cur_cumulative_rows > static_cast(std::numeric_limits::max())) { split_pos--; } @@ -236,8 +228,8 @@ std::vector find_splits(host_span sizes, // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in // the list twice. so we have to iterate until we skip past all of them. The idea is that we // either do this, or we have to call unique() on the input first. - while (split_pos < (static_cast(sizes.size()) - 1) && - (split_pos < 0 || sizes[split_pos].count <= cur_count)) { + while (split_pos < (static_cast(cumulative_sizes.size()) - 1) && + (split_pos < 0 || cumulative_sizes[split_pos].count <= cur_count)) { split_pos++; } @@ -246,13 +238,13 @@ std::vector find_splits(host_span sizes, // #endif auto const start_idx = cur_count; - cur_count = sizes[split_pos].count; + cur_count = cumulative_sizes[split_pos].count; splits.emplace_back(range{start_idx, cur_count}); cur_pos = split_pos; - cur_cumulative_size = sizes[split_pos].size_bytes; + cur_cumulative_size = cumulative_sizes[split_pos].size_bytes; if constexpr (std::is_same_v) { - cur_cumulative_rows = sizes[split_pos].rows; + cur_cumulative_rows = cumulative_sizes[split_pos].rows; } } @@ -276,19 +268,11 @@ template std::vector find_splits(host_span find_splits( host_span sizes, std::size_t total_count, std::size_t size_limit); -#endif -/** - * @brief Find range of the data span by a given range of ranges. - * - * @param input_ranges The list of all data chunks - * @param selected_ranges A chunk of chunks in the input_chunks - * @return The range of data span by the selected range of given chunks - */ std::pair get_range(std::vector const& input_ranges, range const& selected_ranges) { - // The first and last range, according to selected_chunk. + // The first and last range. auto const& first_range = input_ranges[selected_ranges.begin]; auto const& last_range = input_ranges[selected_ranges.end - 1]; diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 5731bd242d9..ba3611d5757 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -83,7 +83,7 @@ struct orc_stream_info { }; /** - * @brief Struct that store compression information for a stripe at a specific nested level. + * @brief Compression information for a stripe at a specific nested level. */ struct stripe_level_comp_info { std::size_t num_compressed_blocks{0}; @@ -92,7 +92,7 @@ struct stripe_level_comp_info { }; /** - * @brief Struct that store information about a range of data. + * @brief Struct representing a range of data. */ struct range { std::size_t begin; @@ -100,7 +100,7 @@ struct range { }; /** - * @brief Struct to store intermediate processing data loaded from data sources. + * @brief Struct storing intermediate processing data loaded from data sources. */ struct file_intermediate_data { int64_t rows_to_skip; @@ -173,7 +173,7 @@ struct file_intermediate_data { }; /** - * @brief Struct to store all data necessary for chunked reading. + * @brief Struct collecting data necessary for chunked reading. 
*/ struct chunk_read_data { explicit chunk_read_data(std::size_t output_size_limit_, @@ -231,14 +231,17 @@ struct chunk_read_data { }; /** - * @brief Struct to accumulate sizes of chunks of some data such as stripe or rows. + * @brief Struct to accumulate counts and sizes of some types such as stripes or rows. */ struct cumulative_size { std::size_t count{0}; std::size_t size_bytes{0}; }; -// TODO +/** + * @brief Struct to accumulate counts, sizes, and number of rows of some types such as stripes or + * rows in tables. + */ struct cumulative_size_and_row { std::size_t count{0}; std::size_t size_bytes{0}; @@ -246,7 +249,7 @@ struct cumulative_size_and_row { }; /** - * @brief Functor to sum up cumulative sizes. + * @brief Functor to sum up cumulative data. */ struct cumulative_size_sum { __device__ cumulative_size operator()(cumulative_size const& a, cumulative_size const& b) const @@ -262,21 +265,41 @@ struct cumulative_size_sum { }; /** - * @brief Find the splits of the input data such that each split has cumulative size less than a - * given `size_limit`. + * @brief Find the splits of the input data such that each split range has cumulative size less than + * a given `size_limit`. + * + * Note that the given limit is just a soft limit. The function will always output ranges that + * have at least one count, even such ranges have sizes exceed the value of `size_limit`. + * + * @param cumulative_sizes The input cumulative sizes to compute split ranges + * @param total_count The total count in the entire input + * @param size_limit The given soft limit to compute splits + * @return A vector of ranges as splits of the input */ template -std::vector find_splits(host_span sizes, +std::vector find_splits(host_span cumulative_sizes, std::size_t total_count, std::size_t size_limit); -// TODO +/** + * @brief Expand a range of ranges into a simple range of data. + * + * @param input_ranges The list of all data ranges + * @param selected_ranges A range of ranges from `input_ranges` + * @return The range of data span by the selected range of ranges + */ std::pair get_range(std::vector const& input_ranges, range const& selected_ranges); /** * @brief Function that populates descriptors for either individual streams or chunks of column * data, but not both. + * + * This function is used in the global step, to gather information for streams of all stripes in + * the data sources (when `stream_info` is present). Later on, it is used again to populate column + * descriptors (`chunks` is present) during decompression and decoding. The two steps share + * most of the execution path thus this function takes mutually exclusive parameters `stream_info` + * or `chunks` depending on each use case. */ std::size_t gather_stream_info_and_column_desc( std::size_t stripe_processing_order, From 64c155aee1e5e0a9ee5fa273c980126e4f3812fa Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 9 Mar 2024 20:19:27 -0800 Subject: [PATCH 207/321] Further cleanup and rename variable Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 65 +++++++++----------------- 1 file changed, 22 insertions(+), 43 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 44a247c2405..4d815568e3d 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -105,12 +105,6 @@ std::size_t gather_stream_info_and_column_desc( if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { // Ignore reading this stream from source. 
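// [Editor's note -- annotation, not part of the patch] An orphaned stream --
// one whose column is absent from the orc2gdf mapping -- is skipped rather
// than treated as an error, but src_offset must still be advanced by the
// stream's length (see below) so the offsets computed for the remaining
// streams of the stripe stay correct.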
CUDF_LOG_WARN("Unexpected stream in the input ORC source. The stream will be ignored."); - -#ifdef LOCAL_TEST - printf("Unexpected stream in the input ORC source. The stream will be ignored\n"); - fflush(stdout); -#endif - src_offset += stream.length; continue; } @@ -142,14 +136,7 @@ std::size_t gather_stream_info_and_column_desc( if (src_offset >= stripeinfo->indexLength || use_index) { auto const index_type = get_stream_index_type(stream.kind); if (index_type < gpu::CI_NUM_STREAMS) { - auto& chunk = (*chunks.value())[stripe_processing_order][col]; - // printf("use stream id: %d, stripe: %d, level: %d, col idx: %d, kind: %d\n", - // (int)(*stream_idx), - // (int)stripe_index, - // (int)level, - // (int)column_id, - // (int)stream.kind); - + auto& chunk = (*chunks.value())[stripe_processing_order][col]; chunk.strm_id[index_type] = *stream_processing_order; chunk.strm_len[index_type] = stream.length; // NOTE: skip_count field is temporarily used to track the presence of index streams @@ -165,12 +152,6 @@ std::size_t gather_stream_info_and_column_desc( (*stream_processing_order)++; } else { // not chunks.has_value() - // printf("collect stream id: stripe: %d, level: %d, col idx: %d, kind: %d\n", - // (int)stripe_index, - // (int)level, - // (int)column_id, - // (int)stream.kind); - stream_info.value()->emplace_back( stripeinfo->offset + src_offset, dst_offset, @@ -196,50 +177,47 @@ std::vector find_splits(host_span cumulative_sizes, std::vector splits; std::size_t cur_count{0}; int64_t cur_pos{0}; - size_t cur_cumulative_size{0}; + std::size_t cur_cumulative_size{0}; - [[maybe_unused]] size_t cur_cumulative_rows{0}; + [[maybe_unused]] std::size_t cur_cumulative_rows{0}; auto const start = thrust::make_transform_iterator( cumulative_sizes.begin(), [&](auto const& size) { return size.size_bytes - cur_cumulative_size; }); - auto const end = start + static_cast(cumulative_sizes.size()); + auto const end = start + cumulative_sizes.size(); while (cur_count < total_count) { int64_t split_pos = thrust::distance(start, thrust::lower_bound(thrust::seq, start + cur_pos, end, size_limit)); - // If we're past the end, or if the returned bucket is bigger than the chunk_read_limit, move - // back one. - if (static_cast(split_pos) >= cumulative_sizes.size() || - (cumulative_sizes[split_pos].size_bytes - cur_cumulative_size > size_limit)) { + // If we're past the end, or if the returned range has size exceeds the given size limit, + // move back one position. + if (split_pos >= static_cast(cumulative_sizes.size()) || + (cumulative_sizes[split_pos].size_bytes > cur_cumulative_size + size_limit)) { split_pos--; } if constexpr (std::is_same_v) { - while (split_pos > 0 && cumulative_sizes[split_pos].rows - cur_cumulative_rows > - static_cast(std::numeric_limits::max())) { + // Similarly, while the returned range has total number of rows exceeds column size limit, + // move back one position. + while (split_pos > 0 && cumulative_sizes[split_pos].rows > + cur_cumulative_rows + + static_cast(std::numeric_limits::max())) { split_pos--; } } - // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in - // a loop because all of the cumulative sizes for all the pages are sorted into one big list. - // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in - // the list twice. so we have to iterate until we skip past all of them. The idea is that we - // either do this, or we have to call unique() on the input first. 
+ // In case we have moved back too in the steps above, far beyond the last split point: that + // means we cannot find any range that has size fits within the given size limit. + // In such case, we need to move forward until we move pass the last output range. while (split_pos < (static_cast(cumulative_sizes.size()) - 1) && (split_pos < 0 || cumulative_sizes[split_pos].count <= cur_count)) { split_pos++; } - // #ifdef LOCAL_TEST - // printf(" split_pos: %d\n", (int)split_pos); - // #endif - - auto const start_idx = cur_count; - cur_count = cumulative_sizes[split_pos].count; - splits.emplace_back(range{start_idx, cur_count}); + auto const start_count = cur_count; + cur_count = cumulative_sizes[split_pos].count; + splits.emplace_back(range{start_count, cur_count}); cur_pos = split_pos; cur_cumulative_size = cumulative_sizes[split_pos].size_bytes; @@ -248,10 +226,11 @@ std::vector find_splits(host_span cumulative_sizes, } } - // If the last chunk has size smaller than `merge_threshold` percent of the second last one, + // If the last range has size smaller than `merge_threshold` percent of the second last one, // merge it with the second last one. + // This is to prevent having too small trailing range. if (splits.size() > 1) { - auto constexpr merge_threshold = 0.15; + double constexpr merge_threshold = 0.15; if (auto const last = splits.back(), second_last = splits[splits.size() - 2]; (last.end - last.begin) <= static_cast(merge_threshold * (second_last.end - second_last.begin))) { From 5b361fb89545fbfcc7ef97cea1008b5edad9d151 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 9 Mar 2024 20:53:42 -0800 Subject: [PATCH 208/321] Cleanup Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 97 ++++++++++++-------------- 1 file changed, 46 insertions(+), 51 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 4d815568e3d..5112a628b9f 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -288,75 +288,65 @@ void reader::impl::global_preprocess(read_mode mode) } #endif - // auto const rows_to_skip = _file_itm_data.rows_to_skip; - // auto const rows_to_read = _file_itm_data.rows_to_read; - auto const& selected_stripes = _file_itm_data.selected_stripes; + auto const num_total_stripes = _file_itm_data.selected_stripes.size(); + auto const num_levels = _selected_columns.num_levels(); + +#ifdef LOCAL_TEST + printf("num load stripe: %d\n", (int)num_total_stripes); +#endif - auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; - auto& lvl_stripe_sizes = _file_itm_data.lvl_stripe_sizes; - lvl_stripe_data.resize(_selected_columns.num_levels()); - lvl_stripe_sizes.resize(_selected_columns.num_levels()); + // + // Pre allocate necessary memory for data processed in the next steps: + // + auto& stripe_data_read_chunks = _file_itm_data.stripe_data_read_ranges; + stripe_data_read_chunks.resize(num_total_stripes); - auto& read_info = _file_itm_data.data_read_info; - auto& stripe_data_read_chunks = _file_itm_data.stripe_data_read_ranges; + auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; + auto& lvl_stripe_sizes = _file_itm_data.lvl_stripe_sizes; + auto& lvl_stream_info = _file_itm_data.lvl_stream_info; auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_ranges; - // Logically view streams as columns - _file_itm_data.lvl_stream_info.resize(_selected_columns.num_levels()); - - // TODO: handle large number of stripes. 
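// A worked example of the trailing-range merge introduced above, under the same
// assumption of merge_threshold = 0.15: a final range of 2 stripes following a range of
// 20 stripes satisfies 2 <= 0.15 * 20 = 3, so it is folded into its predecessor.
#include <cstddef>
#include <vector>

struct rng { std::size_t begin; std::size_t end; };

inline void merge_small_tail(std::vector<rng>& splits, double merge_threshold)
{
  if (splits.size() > 1) {
    auto const last        = splits.back();
    auto const second_last = splits[splits.size() - 2];
    if (last.end - last.begin <=
        static_cast<std::size_t>(merge_threshold * (second_last.end - second_last.begin))) {
      splits.pop_back();
      splits.back().end = last.end;
    }
  }
}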
- // Get the total number of stripes across all input files. - auto const num_stripes = selected_stripes.size(); + lvl_stripe_data.resize(num_levels); + lvl_stripe_sizes.resize(num_levels); + lvl_stream_info.resize(num_levels); + lvl_stripe_stream_chunks.resize(num_levels); + _file_itm_data.lvl_data_chunks.resize(num_levels); + _out_buffers.resize(num_levels); -#ifdef LOCAL_TEST - printf("num load stripe: %d\n", (int)num_stripes); -#endif + auto& read_info = _file_itm_data.data_read_info; + auto& col_meta = *_col_meta; - stripe_data_read_chunks.resize(num_stripes); - lvl_stripe_stream_chunks.resize(_selected_columns.num_levels()); + for (std::size_t level = 0; level < num_levels; ++level) { + lvl_stripe_sizes[level].resize(num_total_stripes); + lvl_stripe_stream_chunks[level].resize(num_total_stripes); - // TODO: move this - auto& lvl_chunks = _file_itm_data.lvl_data_chunks; - lvl_chunks.resize(_selected_columns.num_levels()); - _out_buffers.resize(_selected_columns.num_levels()); - - // TODO: Check if these data depends on pass and subpass, instead of global pass. - // Prepare data. - // Iterates through levels of nested columns, child column will be one level down - // compared to parent column. - auto& col_meta = *_col_meta; - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - auto& columns_level = _selected_columns.levels[level]; // Association between each ORC column and its cudf::column col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); size_type col_id{0}; - for (auto& col : columns_level) { + for (auto const& col : _selected_columns.levels[level]) { // Map each ORC column to its column col_meta.orc_col_map[level][col.id] = col_id++; } - // auto& stripe_data = lvl_stripe_data[level]; - // stripe_data.resize(num_stripes); - - auto& stream_info = _file_itm_data.lvl_stream_info[level]; auto const num_columns = _selected_columns.levels[level].size(); - auto& stripe_sizes = lvl_stripe_sizes[level]; - stream_info.reserve(selected_stripes.size() * num_columns); // final size is unknown - stripe_sizes.resize(selected_stripes.size()); - if (read_info.capacity() < selected_stripes.size()) { - read_info.reserve(selected_stripes.size() * num_columns); // final size is unknown + // Try to reserve some memory, but the final size is unknown, + // since each column may have more than one stream. + lvl_stream_info[level].reserve(num_total_stripes * num_columns); + if (read_info.capacity() < num_total_stripes * num_columns) { + read_info.reserve(num_total_stripes * num_columns); } - - auto& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; - stripe_stream_chunks.resize(num_stripes); } - cudf::detail::hostdevice_vector total_stripe_sizes(num_stripes, _stream); + // + // Load all stripes' metadata. + // + cudf::detail::hostdevice_vector total_stripe_sizes(num_total_stripes, _stream); + auto const& selected_stripes = _file_itm_data.selected_stripes; // Compute input size for each stripe. 
- for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { + for (std::size_t stripe_idx = 0; stripe_idx < num_total_stripes; ++stripe_idx) { auto const& stripe = selected_stripes[stripe_idx]; auto const stripe_info = stripe.stripe_info; auto const stripe_footer = stripe.stripe_footer; @@ -412,15 +402,20 @@ void reader::impl::global_preprocess(read_mode mode) stripe_data_read_chunks[stripe_idx] = range{last_read_size, read_info.size()}; } + // + // Compute stripes' data sizes, and split list of all stripes into subsets that be loaded + // separately without blowing up memory: + // + _chunk_read_data.curr_load_stripe_range = 0; // Load all chunks if there is no read limit. if (_chunk_read_data.data_read_limit == 0) { #ifdef LOCAL_TEST - printf("0 limit: output load stripe chunk = 0, %d\n", (int)num_stripes); + printf("0 limit: output load stripe chunk = 0, %d\n", (int)num_total_stripes); #endif - _chunk_read_data.load_stripe_ranges = {range{0ul, num_stripes}}; + _chunk_read_data.load_stripe_ranges = {range{0ul, num_total_stripes}}; return; } @@ -461,11 +456,11 @@ void reader::impl::global_preprocess(read_mode mode) return tmp > 0UL ? tmp : 1UL; }(); _chunk_read_data.load_stripe_ranges = - find_splits(total_stripe_sizes, num_stripes, load_limit); + find_splits(total_stripe_sizes, num_total_stripes, load_limit); #ifdef LOCAL_TEST auto& splits = _chunk_read_data.load_stripe_ranges; - printf("------------\nSplits (/total num stripe = %d): \n", (int)num_stripes); + printf("------------\nSplits (/total num stripe = %d): \n", (int)num_total_stripes); for (size_t idx = 0; idx < splits.size(); idx++) { printf("{%ld, %ld}\n", splits[idx].begin, splits[idx].end); } From 1206ba1e21b9c4e0f62b32b6e510223c722a536c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 9 Mar 2024 21:04:57 -0800 Subject: [PATCH 209/321] Cleanup and rename variables Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 67 +++++++++++++------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 5112a628b9f..62cfa73cb47 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -288,7 +288,8 @@ void reader::impl::global_preprocess(read_mode mode) } #endif - auto const num_total_stripes = _file_itm_data.selected_stripes.size(); + auto const& selected_stripes = _file_itm_data.selected_stripes; + auto const num_total_stripes = selected_stripes.size(); auto const num_levels = _selected_columns.num_levels(); #ifdef LOCAL_TEST @@ -343,23 +344,22 @@ void reader::impl::global_preprocess(read_mode mode) // Load all stripes' metadata. // cudf::detail::hostdevice_vector total_stripe_sizes(num_total_stripes, _stream); - auto const& selected_stripes = _file_itm_data.selected_stripes; - // Compute input size for each stripe. 
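// The read-coalescing loop in the hunk above, reduced to a standalone sketch: streams
// whose file offsets are back-to-back collapse into a single read request. For example,
// streams at (0, 100), (100, 50), (200, 30) become reads (0, 150) and (200, 30).
#include <cstddef>
#include <vector>

struct stream_desc { std::size_t offset; std::size_t length; };

inline std::vector<stream_desc> coalesce_reads(std::vector<stream_desc> const& streams)
{
  std::vector<stream_desc> reads;
  std::size_t i = 0;
  while (i < streams.size()) {
    auto const offset = streams[i].offset;
    auto len          = streams[i].length;
    ++i;
    // Extend the read while the next stream starts exactly where this one ends.
    while (i < streams.size() && streams[i].offset == offset + len) {
      len += streams[i].length;
      ++i;
    }
    reads.push_back(stream_desc{offset, len});
  }
  return reads;
}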
- for (std::size_t stripe_idx = 0; stripe_idx < num_total_stripes; ++stripe_idx) { - auto const& stripe = selected_stripes[stripe_idx]; + for (std::size_t stripe_global_idx = 0; stripe_global_idx < num_total_stripes; + ++stripe_global_idx) { + auto const& stripe = selected_stripes[stripe_global_idx]; auto const stripe_info = stripe.stripe_info; auto const stripe_footer = stripe.stripe_footer; - std::size_t total_stripe_size{0}; + std::size_t stripe_size{0}; auto const last_read_size = read_info.size(); - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + for (std::size_t level = 0; level < num_levels; ++level) { auto& stream_info = _file_itm_data.lvl_stream_info[level]; auto& stripe_sizes = lvl_stripe_sizes[level]; - auto stream_count = stream_info.size(); - auto const stripe_size = - gather_stream_info_and_column_desc(stripe_idx, + auto stream_level_count = stream_info.size(); + auto const stripe_level_size = + gather_stream_info_and_column_desc(stripe_global_idx, level, stripe_info, stripe_footer, @@ -368,43 +368,42 @@ void reader::impl::global_preprocess(read_mode mode) false, // use_index, level == 0, nullptr, // num_dictionary_entries - nullptr, // stream_idx + nullptr, // stream_processing_order &stream_info, std::nullopt // chunks ); - auto const is_stripe_data_empty = stripe_size == 0; + auto const is_stripe_data_empty = stripe_level_size == 0; CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); - stripe_sizes[stripe_idx] = stripe_size; - total_stripe_size += stripe_size; + stripe_sizes[stripe_global_idx] = stripe_level_size; + stripe_size += stripe_level_size; - auto& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; - stripe_stream_chunks[stripe_idx] = range{stream_count, stream_info.size()}; + auto& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; + stripe_stream_chunks[stripe_global_idx] = range{stream_level_count, stream_info.size()}; // Coalesce consecutive streams into one read - while (not is_stripe_data_empty and stream_count < stream_info.size()) { - auto const d_dst = stream_info[stream_count].dst_pos; - auto const offset = stream_info[stream_count].offset; - auto len = stream_info[stream_count].length; - stream_count++; - - while (stream_count < stream_info.size() && - stream_info[stream_count].offset == offset + len) { - len += stream_info[stream_count].length; - stream_count++; + while (not is_stripe_data_empty and stream_level_count < stream_info.size()) { + auto const d_dst = stream_info[stream_level_count].dst_pos; + auto const offset = stream_info[stream_level_count].offset; + auto len = stream_info[stream_level_count].length; + stream_level_count++; + + while (stream_level_count < stream_info.size() && + stream_info[stream_level_count].offset == offset + len) { + len += stream_info[stream_level_count].length; + stream_level_count++; } - read_info.emplace_back(offset, d_dst, len, stripe.source_idx, stripe_idx, level); + read_info.emplace_back(offset, d_dst, len, stripe.source_idx, stripe_global_idx, level); } } - total_stripe_sizes[stripe_idx] = {1, total_stripe_size}; - stripe_data_read_chunks[stripe_idx] = range{last_read_size, read_info.size()}; + total_stripe_sizes[stripe_global_idx] = {1, stripe_size}; + stripe_data_read_chunks[stripe_global_idx] = range{last_read_size, read_info.size()}; } // - // Compute stripes' data sizes, and split list of all stripes into subsets that be loaded - // separately without blowing up memory: + // Split list of all stripes 
into subsets that be loaded separately without blowing up memory: // _chunk_read_data.curr_load_stripe_range = 0; @@ -429,9 +428,9 @@ void reader::impl::global_preprocess(read_mode mode) } #endif - // Compute the prefix sum of stripe data sizes. + // Compute the prefix sum of stripes' data sizes. total_stripe_sizes.host_to_device_async(_stream); - thrust::inclusive_scan(rmm::exec_policy(_stream), + thrust::inclusive_scan(rmm::exec_policy(_stream), // todo no sync total_stripe_sizes.d_begin(), total_stripe_sizes.d_end(), total_stripe_sizes.d_begin(), @@ -449,10 +448,10 @@ void reader::impl::global_preprocess(read_mode mode) } #endif - // If `data_read_limit` is too small, make sure not to pass 0 byte limit to compute splits. auto const load_limit = [&] { auto const tmp = static_cast(_chunk_read_data.data_read_limit * chunk_read_data::load_limit_ratio); + // Make sure not to pass 0 byte limit (due to round-off) to compute splits. return tmp > 0UL ? tmp : 1UL; }(); _chunk_read_data.load_stripe_ranges = From 71386a2a1e5813d6be421f055f6bfc457635b841 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 9 Mar 2024 22:13:42 -0800 Subject: [PATCH 210/321] Cleanup heavily Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 78 ++++++++++++-------------- 1 file changed, 36 insertions(+), 42 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 62cfa73cb47..44eb8b4b339 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -380,8 +380,8 @@ void reader::impl::global_preprocess(read_mode mode) stripe_sizes[stripe_global_idx] = stripe_level_size; stripe_size += stripe_level_size; - auto& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; - stripe_stream_chunks[stripe_global_idx] = range{stream_level_count, stream_info.size()}; + lvl_stripe_stream_chunks[level][stripe_global_idx] = + range{stream_level_count, stream_info.size()}; // Coalesce consecutive streams into one read while (not is_stripe_data_empty and stream_level_count < stream_info.size()) { @@ -472,62 +472,58 @@ void reader::impl::load_data() { if (_file_itm_data.has_no_data()) { return; } - // auto const rows_to_read = _file_itm_data.rows_to_read; - auto const& selected_stripes = _file_itm_data.selected_stripes; - auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; - auto& lvl_stripe_sizes = _file_itm_data.lvl_stripe_sizes; - auto& read_info = _file_itm_data.data_read_info; - - // std::size_t num_stripes = selected_stripes.size(); - auto const stripe_chunk = + auto const load_stripe_range = _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range++]; - auto const stripe_start = stripe_chunk.begin; - auto const stripe_end = stripe_chunk.end; + auto const stripe_start = load_stripe_range.begin; + auto const stripe_end = load_stripe_range.end; auto const stripe_count = stripe_end - stripe_start; + auto const num_levels = _selected_columns.num_levels(); + #ifdef LOCAL_TEST printf("\n\nloading data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); #endif + auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; + // Prepare the buffer to read raw data onto. - // TODO: clear all old buffer. 
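// The load-limit computation above, isolated: only a fraction of the user's read limit
// is budgeted for raw stripe loading, and a zero-byte limit (possible after round-off)
// must never reach find_splits. load_limit_ratio is assumed to lie in (0, 1].
#include <cstddef>

inline std::size_t compute_load_limit(std::size_t data_read_limit, double load_limit_ratio)
{
  auto const tmp = static_cast<std::size_t>(data_read_limit * load_limit_ratio);
  return tmp > 0 ? tmp : 1;
}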
for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto& stripe_data = lvl_stripe_data[level]; stripe_data.resize(stripe_count); - auto& stripe_sizes = lvl_stripe_sizes[level]; - for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { - // TODO: only do this if it was not allocated before. - stripe_data[stripe_idx - stripe_start] = rmm::device_buffer( - cudf::util::round_up_safe(stripe_sizes[stripe_idx], BUFFER_PADDING_MULTIPLE), _stream); + for (std::size_t idx = 0; idx < stripe_count; ++idx) { + auto const stripe_size = _file_itm_data.lvl_stripe_sizes[level][idx + stripe_start]; + stripe_data[idx] = rmm::device_buffer( + cudf::util::round_up_safe(stripe_size, BUFFER_PADDING_MULTIPLE), _stream); } } + // Load stripe data into memory. std::vector> host_read_buffers; std::vector, std::size_t>> read_tasks; - - auto const& stripe_data_read_chunks = _file_itm_data.stripe_data_read_ranges; - auto const [read_begin, read_end] = get_range(stripe_data_read_chunks, stripe_chunk); + auto const [read_begin, read_end] = + get_range(_file_itm_data.stripe_data_read_ranges, load_stripe_range); for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) { - auto const& read = read_info[read_idx]; - auto& stripe_data = lvl_stripe_data[read.level]; - auto dst_base = static_cast(stripe_data[read.stripe_idx - stripe_start].data()); + auto const& read_info = _file_itm_data.data_read_info[read_idx]; + auto const source = _metadata.per_file_metadata[read_info.source_idx].source; + auto const dst_base = static_cast( + lvl_stripe_data[read_info.level][read_info.stripe_idx - stripe_start].data()); - if (_metadata.per_file_metadata[read.source_idx].source->is_device_read_preferred( - read.length)) { + if (source->is_device_read_preferred(read_info.length)) { read_tasks.push_back( - std::pair(_metadata.per_file_metadata[read.source_idx].source->device_read_async( - read.offset, read.length, dst_base + read.dst_pos, _stream), - read.length)); + std::pair(source->device_read_async( + read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), + read_info.length)); } else { - auto buffer = - _metadata.per_file_metadata[read.source_idx].source->host_read(read.offset, read.length); - CUDF_EXPECTS(buffer->size() == read.length, "Unexpected discrepancy in bytes read."); - CUDF_CUDA_TRY(cudaMemcpyAsync( - dst_base + read.dst_pos, buffer->data(), read.length, cudaMemcpyDefault, _stream.value())); - // _stream.synchronize(); + auto buffer = source->host_read(read_info.offset, read_info.length); + CUDF_EXPECTS(buffer->size() == read_info.length, "Unexpected discrepancy in bytes read."); + CUDF_CUDA_TRY(cudaMemcpyAsync(dst_base + read_info.dst_pos, + buffer->data(), + read_info.length, + cudaMemcpyDefault, + _stream.value())); host_read_buffers.emplace_back(std::move(buffer)); } } @@ -537,8 +533,6 @@ void reader::impl::load_data() CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); } - auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_ranges; - // TODO: This is subpass // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. 
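// The per-stripe buffers above are padded to a multiple of BUFFER_PADDING_MULTIPLE; the
// rounding itself is plain integer arithmetic (multiple is assumed to be nonzero):
#include <cstddef>

inline std::size_t round_up(std::size_t value, std::size_t multiple)
{
  return ((value + multiple - 1) / multiple) * multiple;  // e.g. round_up(1001, 8) == 1008
}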
stream_source_map stream_compinfo_map; @@ -546,7 +540,7 @@ void reader::impl::load_data() cudf::detail::hostdevice_vector stripe_decomp_sizes(stripe_count, _stream); for (std::size_t stripe_idx = 0; stripe_idx < stripe_count; ++stripe_idx) { - auto const& stripe = selected_stripes[stripe_idx]; + auto const& stripe = _file_itm_data.selected_stripes[stripe_idx]; auto const stripe_info = stripe.stripe_info; stripe_decomp_sizes[stripe_idx] = cumulative_size_and_row{1, 0, stripe_info->numberOfRows}; @@ -564,9 +558,9 @@ void reader::impl::load_data() auto& stripe_data = lvl_stripe_data[level]; if (stripe_data.empty()) { continue; } - auto const& stripe_stream_chunks = lvl_stripe_stream_chunks[level]; - auto const [stream_begin, stream_end] = get_range(stripe_stream_chunks, stripe_chunk); - auto const num_streams = stream_end - stream_begin; + auto const [stream_begin, stream_end] = + get_range(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range); + auto const num_streams = stream_end - stream_begin; // Setup row group descriptors if using indexes if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { @@ -670,7 +664,7 @@ void reader::impl::load_data() printf("0 limit: output decode stripe chunk unchanged\n"); #endif - _chunk_read_data.decode_stripe_ranges = {stripe_chunk}; + _chunk_read_data.decode_stripe_ranges = {load_stripe_range}; return; } From 17c3393729e82ced96df0e1364d156a0765fb0d2 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 10 Mar 2024 10:16:11 -0700 Subject: [PATCH 211/321] Continue cleaning up Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 44 ++++++++++++++++---------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 44eb8b4b339..395fc345160 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -299,8 +299,8 @@ void reader::impl::global_preprocess(read_mode mode) // // Pre allocate necessary memory for data processed in the next steps: // - auto& stripe_data_read_chunks = _file_itm_data.stripe_data_read_ranges; - stripe_data_read_chunks.resize(num_total_stripes); + auto& stripe_data_read_ranges = _file_itm_data.stripe_data_read_ranges; + stripe_data_read_ranges.resize(num_total_stripes); auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; auto& lvl_stripe_sizes = _file_itm_data.lvl_stripe_sizes; @@ -343,6 +343,8 @@ void reader::impl::global_preprocess(read_mode mode) // // Load all stripes' metadata. // + + // Collect total data size for all data streams in each stripe. cudf::detail::hostdevice_vector total_stripe_sizes(num_total_stripes, _stream); for (std::size_t stripe_global_idx = 0; stripe_global_idx < num_total_stripes; @@ -399,7 +401,7 @@ void reader::impl::global_preprocess(read_mode mode) } } total_stripe_sizes[stripe_global_idx] = {1, stripe_size}; - stripe_data_read_chunks[stripe_global_idx] = range{last_read_size, read_info.size()}; + stripe_data_read_ranges[stripe_global_idx] = range{last_read_size, read_info.size()}; } // @@ -498,9 +500,18 @@ void reader::impl::load_data() } } - // Load stripe data into memory. + // + // Load stripe data into memory: + // + + // After loading data from sources into host buffers, we need to transfer (async) data to device. + // Such host buffers need to be kept alive until we sync device. std::vector> host_read_buffers; + + // If we load data directly from sources into device, we also need to the entire read tasks. 
+ // Thus, we need to keep all read tasks alive and sync all together. std::vector, std::size_t>> read_tasks; + auto const [read_begin, read_end] = get_range(_file_itm_data.stripe_data_read_ranges, load_stripe_range); @@ -533,28 +544,29 @@ void reader::impl::load_data() CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); } - // TODO: This is subpass - // TODO: Don't have to keep it for all stripe/level. Can reset it after each iter. + // + // Split list of all stripes into subsets that be loaded separately without blowing up memory: + // + + // A map from stripe source into `CompressedStreamInfo*` which are generated during parsing + // streams decompressed sizes. + // These pointers are then used to populate stripe/level decompressed sizes for later + // decompression and decoding. stream_source_map stream_compinfo_map; + // For estimating the decompressed sizes of the loaded stripes. cudf::detail::hostdevice_vector stripe_decomp_sizes(stripe_count, _stream); for (std::size_t stripe_idx = 0; stripe_idx < stripe_count; ++stripe_idx) { - auto const& stripe = _file_itm_data.selected_stripes[stripe_idx]; - auto const stripe_info = stripe.stripe_info; - + auto const& stripe = _file_itm_data.selected_stripes[stripe_idx]; + auto const stripe_info = stripe.stripe_info; stripe_decomp_sizes[stripe_idx] = cumulative_size_and_row{1, 0, stripe_info->numberOfRows}; - // printf("loading stripe with rows = %d\n", (int)stripe_info->numberOfRows); } - // std::fill( - // stripe_decomp_sizes.begin(), stripe_decomp_sizes.end(), cumulative_size_and_row{1, 0, 0}); - // Parse the decompressed sizes for each stripe. for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - auto& stream_info = _file_itm_data.lvl_stream_info[level]; - auto const num_columns = _selected_columns.levels[level].size(); + auto const& stream_info = _file_itm_data.lvl_stream_info[level]; + auto const num_columns = _selected_columns.levels[level].size(); - // Tracker for eventually deallocating compressed and uncompressed data auto& stripe_data = lvl_stripe_data[level]; if (stripe_data.empty()) { continue; } From 86e429f474fa39ddc858364bf5dfb98c850498f4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 10 Mar 2024 19:29:41 -0700 Subject: [PATCH 212/321] Cleanup and add docs Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 72 ++++++++++++-------------- 1 file changed, 32 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 395fc345160..982fae2efb8 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -548,21 +548,24 @@ void reader::impl::load_data() // Split list of all stripes into subsets that be loaded separately without blowing up memory: // - // A map from stripe source into `CompressedStreamInfo*` which are generated during parsing - // streams decompressed sizes. - // These pointers are then used to populate stripe/level decompressed sizes for later + // A map from stripe source into `CompressedStreamInfo*` pointer. + // These pointers are then used to retrieve stripe/level decompressed sizes for later // decompression and decoding. stream_source_map stream_compinfo_map; // For estimating the decompressed sizes of the loaded stripes. 
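// The keep-alive-and-sync pattern above in miniature: each async read hands back a
// future paired with its expected byte count, and every future must be drained and
// validated before the destination buffers are used.
#include <cstddef>
#include <future>
#include <stdexcept>
#include <utility>
#include <vector>

inline void wait_all(std::vector<std::pair<std::future<std::size_t>, std::size_t>>& tasks)
{
  for (auto& task : tasks) {
    if (task.first.get() != task.second) { throw std::runtime_error("short read"); }
  }
}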
cudf::detail::hostdevice_vector stripe_decomp_sizes(stripe_count, _stream); + std::size_t num_loaded_stripes{0}; for (std::size_t stripe_idx = 0; stripe_idx < stripe_count; ++stripe_idx) { auto const& stripe = _file_itm_data.selected_stripes[stripe_idx]; auto const stripe_info = stripe.stripe_info; stripe_decomp_sizes[stripe_idx] = cumulative_size_and_row{1, 0, stripe_info->numberOfRows}; + num_loaded_stripes += stripe_info->numberOfRows; } + auto& compinfo_map = _file_itm_data.compinfo_map; + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto const& stream_info = _file_itm_data.lvl_stream_info[level]; auto const num_columns = _selected_columns.levels[level].size(); @@ -574,26 +577,23 @@ void reader::impl::load_data() get_range(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range); auto const num_streams = stream_end - stream_begin; - // Setup row group descriptors if using indexes if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; - // Cannot be cached, since this is for streams in a loaded stripe chunk, while - // the latter decoding step will use a different stripe chunk. + // Cannot be cached as-is, since this is for streams in a loaded stripe range, while + // the latter decompression/decoding step will use a different stripe range. cudf::detail::hostdevice_vector compinfo(0, num_streams, _stream); - // TODO: Instead of all stream info, loop using read_chunk info to process - // only stream info of the curr_load_stripe_chunk. - for (auto stream_idx = stream_begin; stream_idx < stream_end; ++stream_idx) { auto const& info = stream_info[stream_idx]; - compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.source.stripe_idx - stripe_start].data()) + - info.dst_pos, - info.length)); + auto const dst_base = + static_cast(stripe_data[info.source.stripe_idx - stripe_start].data()); + + compinfo.push_back(gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length)); stream_compinfo_map[stream_source_info{ info.source.stripe_idx, info.source.level, info.source.orc_col_idx, info.source.kind}] = &compinfo.back(); + #ifdef LOCAL_TEST printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n", (int)info.source.stripe_idx, @@ -607,7 +607,6 @@ void reader::impl::load_data() } compinfo.host_to_device_async(_stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), compinfo.size(), decompressor.GetBlockSize(), @@ -615,9 +614,8 @@ void reader::impl::load_data() _stream); compinfo.device_to_host_sync(_stream); - auto& compinfo_map = _file_itm_data.compinfo_map; for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) { - // Cache these parsed numbers so they can be reused in the decoding step. + // Cache these parsed numbers so they can be reused in the decompression/decoding step. compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, stream_compinfo->num_uncompressed_blocks, stream_compinfo->max_uncompressed_size}; @@ -637,7 +635,7 @@ void reader::impl::load_data() #endif } - // Must clear map since the next level will have similar keys. + // Important: must clear this map since the next level will have similar keys. stream_compinfo_map.clear(); } else { @@ -657,21 +655,18 @@ void reader::impl::load_data() } // end loop level - // Decoding is reset to start from the first chunk in `decode_stripe_chunks`. + // Decoding range is reset to start from the first position in `decode_stripe_ranges`. 
_chunk_read_data.curr_decode_stripe_range = 0; - // Decode all chunks if there is no read and no output limit. - // In theory, we should just decode enough stripes for output one table chunk. - // However, we do not know the output size of each stripe after decompressing and decoding, - // thus we have to process all loaded chunks. - // That is because the estimated `max_uncompressed_size` of stream data from - // `ParseCompressedStripeData` is just the approximate of the maximum possible size, not the - // actual size, which can be much smaller in practice. - - // TODO: docs on handle size overflow + // Decode all loaded stripes if there is no read limit. + // In theory, we should just decode enough stripes for output one table chunk, instead of + // decoding all stripes like this. + // However, we do not know how many stripes are 'enough' because there is not any simple and + // cheap way to compute the exact decoded sizes of stripes. if (_chunk_read_data.data_read_limit == 0 && - // TODO: rows_to_read is changed every decode, should we change this? - _file_itm_data.rows_to_read < static_cast(std::numeric_limits::max())) { + // In addition to not have any read limit, we also need to check if the the total number of + // rows in the loaded stripes exceeds column size limit. + num_loaded_stripes < static_cast(std::numeric_limits::max())) { #ifdef LOCAL_TEST printf("0 limit: output decode stripe chunk unchanged\n"); #endif @@ -693,14 +688,13 @@ void reader::impl::load_data() } #endif - // Compute the prefix sum of stripe data sizes. + // Compute the prefix sum of stripe data sizes and rows. stripe_decomp_sizes.host_to_device_async(_stream); thrust::inclusive_scan(rmm::exec_policy(_stream), stripe_decomp_sizes.d_begin(), stripe_decomp_sizes.d_end(), stripe_decomp_sizes.d_begin(), cumulative_size_sum{}); - stripe_decomp_sizes.device_to_host_sync(_stream); #ifdef LOCAL_TEST @@ -715,9 +709,8 @@ void reader::impl::load_data() #endif auto const decode_limit = [&] { - // In this case, we have no read limit but have to split due to having large input in which - // the number of rows exceed column size limit. - // We will split based on row number, not data size. + // In this case, we have no read limit but have to split due to having number of rows in loaded + // stripes exceeds column size limit. So we will split based on row number, not data size. if (_chunk_read_data.data_read_limit == 0) { return std::numeric_limits::max(); } // If `data_read_limit` is too small, make sure not to pass 0 byte limit to compute splits. @@ -727,6 +720,10 @@ void reader::impl::load_data() }(); _chunk_read_data.decode_stripe_ranges = find_splits(stripe_decomp_sizes, stripe_count, decode_limit); + + // The split ranges always start from zero. + // We need to update the ranges to start from `stripe_start` which is covererd by the current + // range of loaded stripes. 
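// Host-side analogue of the device scan used above: an inclusive prefix sum over
// (count, size, rows) triples with a custom associative sum (std::inclusive_scan here
// stands in for thrust::inclusive_scan with cumulative_size_sum).
#include <cstddef>
#include <numeric>
#include <vector>

struct csr { std::size_t count; std::size_t size_bytes; std::size_t rows; };

inline void prefix_sum(std::vector<csr>& v)
{
  std::inclusive_scan(v.begin(), v.end(), v.begin(), [](csr const& a, csr const& b) {
    return csr{a.count + b.count, a.size_bytes + b.size_bytes, a.rows + b.rows};
  });
}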
for (auto& chunk : _chunk_read_data.decode_stripe_ranges) { chunk.begin += stripe_start; chunk.end += stripe_start; @@ -738,16 +735,11 @@ void reader::impl::load_data() for (size_t idx = 0; idx < splits.size(); idx++) { printf("{%ld, %ld}\n", splits[idx].begin, splits[idx].end); } - fflush(stdout); -#endif - // lvl_stripe_data.clear(); - // _file_itm_data.compinfo_ready = true; - -#ifdef LOCAL_TEST auto peak_mem = mem_stats_logger.peak_memory_usage(); std::cout << "load, peak_memory_usage: " << peak_mem << "(" << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; + fflush(stdout); #endif } From c50071959198cbaabc46df6d9e03ec622c2f12b5 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 10 Mar 2024 22:08:28 -0700 Subject: [PATCH 213/321] Rename variables Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 8 ++++---- cpp/src/io/orc/reader_impl_decode.cu | 28 +++++++++++++------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 982fae2efb8..d22fc9e30e6 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -305,12 +305,12 @@ void reader::impl::global_preprocess(read_mode mode) auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; auto& lvl_stripe_sizes = _file_itm_data.lvl_stripe_sizes; auto& lvl_stream_info = _file_itm_data.lvl_stream_info; - auto& lvl_stripe_stream_chunks = _file_itm_data.lvl_stripe_stream_ranges; + auto& lvl_stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges; lvl_stripe_data.resize(num_levels); lvl_stripe_sizes.resize(num_levels); lvl_stream_info.resize(num_levels); - lvl_stripe_stream_chunks.resize(num_levels); + lvl_stripe_stream_ranges.resize(num_levels); _file_itm_data.lvl_data_chunks.resize(num_levels); _out_buffers.resize(num_levels); @@ -319,7 +319,7 @@ void reader::impl::global_preprocess(read_mode mode) for (std::size_t level = 0; level < num_levels; ++level) { lvl_stripe_sizes[level].resize(num_total_stripes); - lvl_stripe_stream_chunks[level].resize(num_total_stripes); + lvl_stripe_stream_ranges[level].resize(num_total_stripes); // Association between each ORC column and its cudf::column col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); @@ -382,7 +382,7 @@ void reader::impl::global_preprocess(read_mode mode) stripe_sizes[stripe_global_idx] = stripe_level_size; stripe_size += stripe_level_size; - lvl_stripe_stream_chunks[level][stripe_global_idx] = + lvl_stripe_stream_ranges[level][stripe_global_idx] = range{stream_level_count, stream_info.size()}; // Coalesce consecutive streams into one read diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 87fdc40b351..441d0b507f6 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -78,8 +78,8 @@ namespace { * @return Device buffer to decompressed page data */ rmm::device_buffer decompress_stripe_data( - range const& load_stripe_range, - range const& stripe_range, + range const& loaded_stripe_range, + range const& decode_stripe_range, stream_source_map const& compinfo_map, OrcDecompressor const& decompressor, host_span stripe_data, @@ -96,12 +96,11 @@ rmm::device_buffer decompress_stripe_data( std::size_t num_uncompressed_blocks = 0; std::size_t total_decomp_size = 0; - // printf("decompress #stripe: %d, ") - - // TODO: use lvl_stripe_stream_chunks + // TODO: use lvl_stripe_stream_ranges std::size_t count{0}; for (auto const& 
info : stream_info) { - if (info.source.stripe_idx < stripe_range.begin || info.source.stripe_idx >= stripe_range.end) { + if (info.source.stripe_idx < decode_stripe_range.begin || + info.source.stripe_idx >= decode_stripe_range.end) { continue; } count++; @@ -110,7 +109,8 @@ rmm::device_buffer decompress_stripe_data( cudf::detail::hostdevice_vector compinfo(0, count, stream); for (auto const& info : stream_info) { - if (info.source.stripe_idx < stripe_range.begin || info.source.stripe_idx >= stripe_range.end) { + if (info.source.stripe_idx < decode_stripe_range.begin || + info.source.stripe_idx >= decode_stripe_range.end) { continue; } @@ -127,7 +127,7 @@ rmm::device_buffer decompress_stripe_data( compinfo.push_back(gpu::CompressedStreamInfo( static_cast( - stripe_data[info.source.stripe_idx - load_stripe_range.begin].data()) + + stripe_data[info.source.stripe_idx - loaded_stripe_range.begin].data()) + info.dst_pos, info.length)); @@ -822,11 +822,11 @@ void reader::impl::decompress_and_decode() { if (_file_itm_data.has_no_data()) { return; } - auto const stripe_chunk = + auto const stripe_range = _chunk_read_data.decode_stripe_ranges[_chunk_read_data.curr_decode_stripe_range++]; - auto const stripe_start = stripe_chunk.begin; - auto const stripe_end = stripe_chunk.end; - auto const stripe_count = stripe_chunk.end - stripe_chunk.begin; + auto const stripe_start = stripe_range.begin; + auto const stripe_end = stripe_range.end; + auto const stripe_count = stripe_range.end - stripe_range.begin; auto const load_stripe_start = _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1].begin; @@ -980,7 +980,7 @@ void reader::impl::decompress_and_decode() #endif auto const& stripe_stream_ranges = lvl_stripe_stream_ranges[level]; - auto const [stream_begin, stream_end] = get_range(stripe_stream_ranges, stripe_chunk); + auto const [stream_begin, stream_end] = get_range(stripe_stream_ranges, stripe_range); auto& columns_level = _selected_columns.levels[level]; @@ -1245,7 +1245,7 @@ void reader::impl::decompress_and_decode() auto decomp_data = decompress_stripe_data( _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1], - stripe_chunk, + stripe_range, _file_itm_data.compinfo_map, *_metadata.per_file_metadata[0].decompressor, stripe_data, From a03cb3de2adf190d80260362551e969f8e109677 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 10 Mar 2024 22:22:34 -0700 Subject: [PATCH 214/321] Change return type of `get_range` Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 11 ++++---- cpp/src/io/orc/reader_impl_chunking.hpp | 3 +- cpp/src/io/orc/reader_impl_decode.cu | 37 +++++++------------------ 3 files changed, 16 insertions(+), 35 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index d22fc9e30e6..92d0f2233db 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -248,8 +248,7 @@ template std::vector find_splits(host_span find_splits( host_span sizes, std::size_t total_count, std::size_t size_limit); -std::pair get_range(std::vector const& input_ranges, - range const& selected_ranges) +range get_range(std::vector const& input_ranges, range const& selected_ranges) { // The first and last range. 
auto const& first_range = input_ranges[selected_ranges.begin]; @@ -573,9 +572,9 @@ void reader::impl::load_data() auto& stripe_data = lvl_stripe_data[level]; if (stripe_data.empty()) { continue; } - auto const [stream_begin, stream_end] = + auto const stream_range = get_range(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range); - auto const num_streams = stream_end - stream_begin; + auto const num_streams = stream_range.end - stream_range.begin; if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; @@ -584,7 +583,7 @@ void reader::impl::load_data() // the latter decompression/decoding step will use a different stripe range. cudf::detail::hostdevice_vector compinfo(0, num_streams, _stream); - for (auto stream_idx = stream_begin; stream_idx < stream_end; ++stream_idx) { + for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { auto const& info = stream_info[stream_idx]; auto const dst_base = static_cast(stripe_data[info.source.stripe_idx - stripe_start].data()); @@ -645,7 +644,7 @@ void reader::impl::load_data() #endif // Set decompression size equal to the input size. - for (auto stream_idx = stream_begin; stream_idx < stream_end; ++stream_idx) { + for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { auto const& info = stream_info[stream_idx]; stripe_decomp_sizes[info.source.stripe_idx - stripe_start].size_bytes += info.length; } diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index ba3611d5757..a3883426787 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -288,8 +288,7 @@ std::vector find_splits(host_span cumulative_sizes, * @param selected_ranges A range of ranges from `input_ranges` * @return The range of data span by the selected range of ranges */ -std::pair get_range(std::vector const& input_ranges, - range const& selected_ranges); +range get_range(std::vector const& input_ranges, range const& selected_ranges); /** * @brief Function that populates descriptors for either individual streams or chunks of column diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 441d0b507f6..10f2192b019 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -79,7 +79,7 @@ namespace { */ rmm::device_buffer decompress_stripe_data( range const& loaded_stripe_range, - range const& decode_stripe_range, + range const& stream_range, stream_source_map const& compinfo_map, OrcDecompressor const& decompressor, host_span stripe_data, @@ -96,23 +96,11 @@ rmm::device_buffer decompress_stripe_data( std::size_t num_uncompressed_blocks = 0; std::size_t total_decomp_size = 0; - // TODO: use lvl_stripe_stream_ranges - std::size_t count{0}; - for (auto const& info : stream_info) { - if (info.source.stripe_idx < decode_stripe_range.begin || - info.source.stripe_idx >= decode_stripe_range.end) { - continue; - } - count++; - } + auto const num_streams = stream_range.end - stream_range.begin; + cudf::detail::hostdevice_vector compinfo(0, num_streams, stream); - cudf::detail::hostdevice_vector compinfo(0, count, stream); - - for (auto const& info : stream_info) { - if (info.source.stripe_idx < decode_stripe_range.begin || - info.source.stripe_idx >= decode_stripe_range.end) { - continue; - } + for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) 
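// Self-contained sketch of the simplified get_range contract above: a range of ranges is
// flattened into one contiguous range over the underlying data. For instance, with
// per-stripe stream ranges {0,3}, {3,7}, {7,9}, selecting stripes {1,3} yields {3,9}.
#include <cstddef>
#include <vector>

struct rrange { std::size_t begin; std::size_t end; };

inline rrange get_range_sketch(std::vector<rrange> const& input_ranges, rrange selected)
{
  return rrange{input_ranges[selected.begin].begin, input_ranges[selected.end - 1].end};
}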
{ + auto const& info = stream_info[stream_idx]; #ifdef LOCAL_TEST // printf("collec stream again [%d, %d, %d, %d]: dst = %lu, length = %lu\n", @@ -979,8 +967,8 @@ void reader::impl::decompress_and_decode() } #endif - auto const& stripe_stream_ranges = lvl_stripe_stream_ranges[level]; - auto const [stream_begin, stream_end] = get_range(stripe_stream_ranges, stripe_range); + auto const& stripe_stream_ranges = lvl_stripe_stream_ranges[level]; + auto const stream_range = get_range(stripe_stream_ranges, stripe_range); auto& columns_level = _selected_columns.levels[level]; @@ -1179,13 +1167,8 @@ void reader::impl::decompress_and_decode() } if (not is_stripe_data_empty) { for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { - chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k] + stream_begin].dst_pos; - // printf("chunk.streams[%d] of chunk.strm_id[%d], stripe %d | %d, collect from %d\n", - // (int)k, - // (int)chunk.strm_id[k], - // (int)stripe_idx, - // (int)stripe_start, - // (int)(chunk.strm_id[k] + stream_begin)); + chunk.streams[k] = + dst_base + stream_info[chunk.strm_id[k] + stream_range.begin].dst_pos; } } } @@ -1245,7 +1228,7 @@ void reader::impl::decompress_and_decode() auto decomp_data = decompress_stripe_data( _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1], - stripe_range, + get_range(_file_itm_data.lvl_stripe_stream_ranges[level], stripe_range), _file_itm_data.compinfo_map, *_metadata.per_file_metadata[0].decompressor, stripe_data, From cebb051b9cc61b655dfd2f68b23e61552fc68414 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 10 Mar 2024 22:33:57 -0700 Subject: [PATCH 215/321] More cleanup Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 6 ++--- cpp/src/io/orc/reader_impl_decode.cu | 31 +++++++++++--------------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 92d0f2233db..2c15e977c28 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -723,9 +723,9 @@ void reader::impl::load_data() // The split ranges always start from zero. // We need to update the ranges to start from `stripe_start` which is covererd by the current // range of loaded stripes. 
- for (auto& chunk : _chunk_read_data.decode_stripe_ranges) { - chunk.begin += stripe_start; - chunk.end += stripe_start; + for (auto& range : _chunk_read_data.decode_stripe_ranges) { + range.begin += stripe_start; + range.end += stripe_start; } #ifdef LOCAL_TEST diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 10f2192b019..629494569fc 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -71,14 +71,13 @@ namespace { * @param stream_info List of stream to column mappings * @param chunks Vector of list of column chunk descriptors * @param row_groups Vector of list of row index descriptors - * @param num_stripes Number of stripes making up column chunks * @param row_index_stride Distance between each row index * @param use_base_stride Whether to use base stride obtained from meta or use the computed value * @param stream CUDA stream used for device memory operations and kernel launches * @return Device buffer to decompressed page data */ rmm::device_buffer decompress_stripe_data( - range const& loaded_stripe_range, + range const& stripe_range, range const& stream_range, stream_source_map const& compinfo_map, OrcDecompressor const& decompressor, @@ -86,7 +85,6 @@ rmm::device_buffer decompress_stripe_data( host_span stream_info, cudf::detail::hostdevice_2dvector& chunks, cudf::detail::hostdevice_2dvector& row_groups, - size_type num_stripes, size_type row_index_stride, bool use_base_stride, rmm::cuda_stream_view stream) @@ -114,21 +112,14 @@ rmm::device_buffer decompress_stripe_data( #endif compinfo.push_back(gpu::CompressedStreamInfo( - static_cast( - stripe_data[info.source.stripe_idx - loaded_stripe_range.begin].data()) + + static_cast(stripe_data[info.source.stripe_idx - stripe_range.begin].data()) + info.dst_pos, info.length)); - // printf("line %d\n", __LINE__); - // fflush(stdout); auto const& cached_comp_info = compinfo_map.at(stream_source_info{ info.source.stripe_idx, info.source.level, info.source.orc_col_idx, info.source.kind}); - // printf("line %d\n", __LINE__); - // fflush(stdout); - // auto const& cached_comp_info = - // compinfo_map[stream_id_info{info.source.stripe_idx, info.source.level, - // info.source.orc_cold_idx, info.source.kind}]; - auto& stream_comp_info = compinfo.back(); + auto& stream_comp_info = compinfo.back(); + stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks; stream_comp_info.max_uncompressed_size = cached_comp_info.total_decomp_size; @@ -186,7 +177,10 @@ rmm::device_buffer decompress_stripe_data( // Required by `gpuDecodeOrcColumnData`. rmm::device_buffer decomp_data( cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream); - if (decomp_data.is_empty()) { return decomp_data; } + + // If total_decomp_size is zero, the data should not be compressed, and this function + // should not be called at all. 
+ CUDF_EXPECTS(!decomp_data.is_empty(), "Invalid decompression size"); rmm::device_uvector> inflate_in( num_compressed_blocks + num_uncompressed_blocks, stream); @@ -325,15 +319,16 @@ rmm::device_buffer decompress_stripe_data( // We can check on host after stream synchronize CUDF_EXPECTS(not any_block_failure[0], "Error during decompression"); - auto const num_columns = static_cast(chunks.size().second); + auto const num_stripes = stripe_range.end - stripe_range.begin; + auto const num_columns = chunks.size().second; // Update the stream information with the updated uncompressed info // TBD: We could update the value from the information we already // have in stream_info[], but using the gpu results also updates // max_uncompressed_size to the actual uncompressed size, or zero if // decompression failed. - for (size_type i = 0; i < num_stripes; ++i) { - for (size_type j = 0; j < num_columns; ++j) { + for (std::size_t i = 0; i < num_stripes; ++i) { + for (std::size_t j = 0; j < num_columns; ++j) { auto& chunk = chunks[i][j]; for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) { if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) { @@ -821,6 +816,7 @@ void reader::impl::decompress_and_decode() #ifdef LOCAL_TEST printf("\ndecoding data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); + printf("\n loaded stripe start %d \n", (int)load_stripe_start); #endif auto const rows_to_skip = _file_itm_data.rows_to_skip; @@ -1235,7 +1231,6 @@ void reader::impl::decompress_and_decode() stream_info, chunks, row_groups, - stripe_count, _metadata.get_row_index_stride(), level == 0, _stream); From a0492fde02469c52bee328306684eb9a57b8a54c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 11 Mar 2024 09:22:36 -0700 Subject: [PATCH 216/321] Fix num stripes Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_decode.cu | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 629494569fc..22dc684588a 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -77,8 +77,9 @@ namespace { * @return Device buffer to decompressed page data */ rmm::device_buffer decompress_stripe_data( - range const& stripe_range, + range const& loaded_stripe_range, range const& stream_range, + std::size_t num_decode_stripes, stream_source_map const& compinfo_map, OrcDecompressor const& decompressor, host_span stripe_data, @@ -112,7 +113,8 @@ rmm::device_buffer decompress_stripe_data( #endif compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.source.stripe_idx - stripe_range.begin].data()) + + static_cast( + stripe_data[info.source.stripe_idx - loaded_stripe_range.begin].data()) + info.dst_pos, info.length)); @@ -319,7 +321,6 @@ rmm::device_buffer decompress_stripe_data( // We can check on host after stream synchronize CUDF_EXPECTS(not any_block_failure[0], "Error during decompression"); - auto const num_stripes = stripe_range.end - stripe_range.begin; auto const num_columns = chunks.size().second; // Update the stream information with the updated uncompressed info @@ -327,7 +328,7 @@ rmm::device_buffer decompress_stripe_data( // have in stream_info[], but using the gpu results also updates // max_uncompressed_size to the actual uncompressed size, or zero if // decompression failed. 
- for (std::size_t i = 0; i < num_stripes; ++i) { + for (std::size_t i = 0; i < num_decode_stripes; ++i) { for (std::size_t j = 0; j < num_columns; ++j) { auto& chunk = chunks[i][j]; for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) { @@ -346,7 +347,7 @@ rmm::device_buffer decompress_stripe_data( compinfo.device_ptr(), chunks.base_device_ptr(), num_columns, - num_stripes, + num_decode_stripes, row_index_stride, use_base_stride, stream); @@ -1225,6 +1226,7 @@ void reader::impl::decompress_and_decode() auto decomp_data = decompress_stripe_data( _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1], get_range(_file_itm_data.lvl_stripe_stream_ranges[level], stripe_range), + stripe_count, _file_itm_data.compinfo_map, *_metadata.per_file_metadata[0].decompressor, stripe_data, From d2e892d7bc9198ef0cd317628e1d08d7695a616c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 11 Mar 2024 09:31:45 -0700 Subject: [PATCH 217/321] Update docs Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_decode.cu | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 22dc684588a..04b48742f64 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -61,11 +61,17 @@ namespace cudf::io::orc::detail { namespace { -// TODO: update -// TODO: compute num stripes from chunks /** - * @brief Decompresses the stripe data, at stream granularity. + * @brief Decompresses the stripe data, at stream granularity. * + * Only the streams in the provided `stream_range` are decoded. That range is determined in + * the previous steps, after splitting stripes into subsets to maintain memory usage to be + * under data read limit. + * + * @param loaded_stripe_range Range of stripes that are already loaded in memory + * @param stream_range Range of streams to be decoded + * @param num_decoded_stripes Number of stripes that the decoding streams belong to + * @param compinfo_map A map to lookup compression info of streams * @param decompressor Block decompressor * @param stripe_data List of source stripe column data * @param stream_info List of stream to column mappings @@ -74,7 +80,7 @@ namespace { * @param row_index_stride Distance between each row index * @param use_base_stride Whether to use base stride obtained from meta or use the computed value * @param stream CUDA stream used for device memory operations and kernel launches - * @return Device buffer to decompressed page data + * @return Device buffer to decompressed data */ rmm::device_buffer decompress_stripe_data( range const& loaded_stripe_range, @@ -208,13 +214,13 @@ rmm::device_buffer decompress_stripe_data( compinfo[i].copy_in_ctl = inflate_in.data() + start_pos_uncomp; compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp; - // stream_info[i].dst_pos = decomp_offset; decomp_offset += compinfo[i].max_uncompressed_size; start_pos += compinfo[i].num_compressed_blocks; start_pos_uncomp += compinfo[i].num_uncompressed_blocks; max_uncomp_block_size = std::max(max_uncomp_block_size, compinfo[i].max_uncompressed_block_size); } + compinfo.host_to_device_async(stream); gpu::ParseCompressedStripeData(compinfo.device_ptr(), compinfo.size(), @@ -290,8 +296,6 @@ rmm::device_buffer decompress_stripe_data( default: CUDF_FAIL("Unexpected decompression dispatch"); break; } - // TODO: proclam return type - // Check if any block has been failed to decompress. 
// Not using `thrust::any` or `thrust::count_if` to defer stream sync.
   thrust::for_each(

From 5e4b16f7c9cabadd25311d0fa43174a63cb14be0 Mon Sep 17 00:00:00 2001
From: Nghia Truong 
Date: Mon, 11 Mar 2024 10:59:10 -0700
Subject: [PATCH 218/321] Cleanup and add docs

Signed-off-by: Nghia Truong 
---
 cpp/src/io/orc/reader_impl_decode.cu | 35 +++++++++-------------------
 1 file changed, 11 insertions(+), 24 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 04b48742f64..13c0ab1f637 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -597,12 +597,10 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const&
   stream.synchronize();
 }
 
-// TODO: this is called for each chunk of stripes.
 /**
  * @brief Aggregate child metadata from parent column chunks.
  */
-void aggregate_child_meta(std::size_t stripe_start,
-                          std::size_t level,
+void aggregate_child_meta(std::size_t level,
                           cudf::io::orc::detail::column_hierarchy const& selected_columns,
                           cudf::detail::host_2dspan chunks,
                           cudf::detail::host_2dspan row_groups,
@@ -637,10 +635,7 @@ void aggregate_child_meta(std::size_t stripe_start,
 
   // For each parent column, update its child column meta for each stripe.
   std::for_each(nested_cols.begin(), nested_cols.end(), [&](auto const p_col) {
-    // printf("p_col.id: %d\n", (int)p_col.id);
-
     auto const parent_col_idx = col_meta.orc_col_map[level][p_col.id];
-    // printf("  level: %d, parent_col_idx: %d\n", (int)level, (int)parent_col_idx);
 
     int64_t start_row         = 0;
     auto processed_row_groups = 0;
@@ -648,8 +643,6 @@ void aggregate_child_meta(std::size_t stripe_start,
     for (std::size_t stripe_id = 0; stripe_id < num_of_stripes; stripe_id++) {
       // Aggregate num_rows and start_row from processed parent columns per row groups
       if (num_of_rowgroups) {
-        // printf("   num_of_rowgroups: %d\n", (int)num_of_rowgroups);
-
        auto stripe_num_row_groups = chunks[stripe_id][parent_col_idx].num_rowgroups;
        auto processed_child_rows  = 0;
 
@@ -667,24 +660,24 @@ void aggregate_child_meta(std::size_t stripe_start,
 
      // Aggregate start row, number of rows per chunk and total number of rows in a column
      auto const child_rows = chunks[stripe_id][parent_col_idx].num_child_rows;
-      // printf("  stripe_id: %d: child_rows: %d\n", (int)stripe_id, (int)child_rows);
-      // printf("  p_col.num_children: %d\n", (int)p_col.num_children);
 
      for (size_type id = 0; id < p_col.num_children; id++) {
        auto const child_col_idx = index + id;
 
-        // TODO: Check for overflow here.
        num_child_rows[child_col_idx] += child_rows;
+
+          // The number of rows in a child column should not exceed the column size limit,
+          // otherwise we will have size overflow.
+          // If that happens, we need to set a read limit to reduce the number of decoding stripes.
+          CUDF_EXPECTS(num_child_rows[child_col_idx] <=
+                         static_cast(std::numeric_limits::max()),
+                       "Number of rows in the child column exceeds column size limit.");
+
           num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows;
          // start row could be different for each column when there is nesting at each stripe level
          child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ?
0 : start_row;
-          // printf("update child_start_row (%d, %d): %d\n",
-          //        (int)stripe_id,
-          //        (int)child_col_idx,
-          //        (int)start_row);
         }
         start_row += child_rows;
-        // printf("    start_row: %d\n", (int)start_row);
       }
 
       // Parent column null mask and null count would be required for child column

From 3ec50ef9999106dd06e540815c39787cd824cb9f Mon Sep 17 00:00:00 2001
From: Nghia Truong 
Date: Mon, 11 Mar 2024 12:22:25 -0700
Subject: [PATCH 219/321] Cleanup, docs, and rename variables

Signed-off-by: Nghia Truong 
---
 cpp/src/io/orc/reader_impl_chunking.cu |  2 +
 cpp/src/io/orc/reader_impl_decode.cu   | 77 ++++++++++++++------------
 2 files changed, 43 insertions(+), 36 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 2c15e977c28..9ea03fd2e56 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -429,6 +429,7 @@ void reader::impl::global_preprocess(read_mode mode)
   }
 #endif
 
+  // TODO: exec_policy_nosync
   // Compute the prefix sum of stripes' data sizes.
   total_stripe_sizes.host_to_device_async(_stream);
   thrust::inclusive_scan(rmm::exec_policy(_stream),  // todo no sync
@@ -687,6 +688,7 @@ void reader::impl::load_data()
   }
 #endif
 
+  // TODO: exec_policy_nosync
   // Compute the prefix sum of stripe data sizes and rows.
   stripe_decomp_sizes.host_to_device_async(_stream);
   thrust::inclusive_scan(rmm::exec_policy(_stream),
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 13c0ab1f637..e85389e4c9a 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -576,6 +576,7 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const&
   auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async(
     prefix_sums_to_update, stream, rmm::mr::get_current_device_resource());
 
+  // TODO: exec_policy_nosync
   thrust::for_each(rmm::exec_policy(stream),
                    d_prefix_sums_to_update.begin(),
                    d_prefix_sums_to_update.end(),
@@ -722,11 +723,20 @@ void generate_offsets_for_list(host_span buff_data, rmm::cuda_
 }
 
 /**
- * @brief TODO
- * @param input
- * @param size_limit
- * @param stream
- * @return
+ * @brief Find the splits of the input table such that each split range has cumulative size less
+ * than a given `size_limit`.
+ *
+ * The parameter `segment_length` controls the granularity of the splits. The output ranges will
+ * always have row counts that are multiples of this value, except the last range, which contains
+ * the remaining rows.
+ *
+ * Similar to `find_splits`, the given limit is just a soft limit. The function will never output
+ * empty ranges, even when their sizes exceed the value of `size_limit`.
+ * + * @param input The input table to find splits + * @param size_limit A limit on the output size of each split range + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A vector of ranges as splits of the input */ std::vector find_table_splits(table_view const& input, size_type segment_length, @@ -738,14 +748,12 @@ std::vector find_table_splits(table_view const& input, #endif // If segment_length is zero: we don't have any limit on granularity. - // As such, set segment length to the number of rows. + // As such, set segment length equal to the number of rows. if (segment_length == 0) { segment_length = input.num_rows(); } - // If we have small number of rows, need to adjust segment_length before calling to - // `segmented_row_bit_count`. + // `segmented_row_bit_count` requires that `segment_length` is not larger than number of rows. segment_length = std::min(segment_length, input.num_rows()); - // Default 10k rows. auto const d_segmented_sizes = cudf::detail::segmented_row_bit_count( input, segment_length, stream, rmm::mr::get_current_device_resource()); @@ -798,7 +806,6 @@ std::vector find_table_splits(table_view const& input, } // namespace -// TODO: this should be called per chunk of stripes. void reader::impl::decompress_and_decode() { if (_file_itm_data.has_no_data()) { return; } @@ -809,6 +816,7 @@ void reader::impl::decompress_and_decode() auto const stripe_end = stripe_range.end; auto const stripe_count = stripe_range.end - stripe_range.begin; + // The start index of loaded stripes. They are different from decoding stripes. auto const load_stripe_start = _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1].begin; @@ -817,40 +825,36 @@ void reader::impl::decompress_and_decode() printf("\n loaded stripe start %d \n", (int)load_stripe_start); #endif - auto const rows_to_skip = _file_itm_data.rows_to_skip; - // auto const rows_to_read = _file_itm_data.rows_to_read; + auto const rows_to_skip = _file_itm_data.rows_to_skip; auto const& selected_stripes = _file_itm_data.selected_stripes; - // auto const rows_to_skip = 0; - int64_t rows_to_read = 0; + // Number of rows to decode in this decompressing/decoding step. + int64_t rows_to_decode = 0; for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { auto const& stripe = selected_stripes[stripe_idx]; - auto const stripe_info = stripe.stripe_info; - // TODO: this is indeed not needed since we split stripes before this based on stripe row - - // TODO: check overflow - // CUDF_EXPECTS(per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows < - // static_cast(std::numeric_limits::max()), - // "TODO"); - rows_to_read += static_cast(stripe_info->numberOfRows); - - if (_file_itm_data.rows_to_skip > 0) { - CUDF_EXPECTS(_file_itm_data.rows_to_skip < static_cast(stripe_info->numberOfRows), - "TODO"); + auto const stripe_rows = static_cast(stripe.stripe_info->numberOfRows); + rows_to_decode += stripe_rows; + + // The rows to skip should never be larger than number of rows in the first loaded stripes. + // This is just to make sure there was not any bug with it. 
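// To make that invariant concrete -- hypothetical row counts, for illustration only:
// with selected stripes holding 1000, 800, and 1200 rows, any valid rows_to_skip must
// land inside the first stripe:
//
//   int64_t const stripe_rows[]{1000, 800, 1200};
//   int64_t const rows_to_skip = 250;  // valid: 250 < stripe_rows[0]
//   // rows_to_skip >= 1000 would mean stripe 0 should not have been selected at all.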
+    if (rows_to_skip > 0) {
+      CUDF_EXPECTS(rows_to_skip < stripe_rows, "Invalid rows_to_skip computation.");
     }
   }
-  CUDF_EXPECTS(rows_to_read > rows_to_skip, "Invalid rows_to_read computation.");
-  rows_to_read = std::min(rows_to_read - rows_to_skip, _file_itm_data.rows_to_read);
-  // rows_to_read -= rows_to_skip;
+  CUDF_EXPECTS(rows_to_decode > rows_to_skip, "Invalid rows_to_decode computation.");
+  rows_to_decode = std::min(rows_to_decode - rows_to_skip, _file_itm_data.rows_to_read);
+
+  // After this step, we no longer have any rows to skip.
+  // The number of rows remaining to read in the future is also reduced accordingly.
   _file_itm_data.rows_to_skip = 0;
-  _file_itm_data.rows_to_read -= rows_to_read;
+  _file_itm_data.rows_to_read -= rows_to_decode;
 
 #ifdef LOCAL_TEST
-  printf("decode, skip = %ld, read = %ld\n", rows_to_skip, rows_to_read);
+  printf("decode, skip = %ld, decode = %ld\n", rows_to_skip, rows_to_decode);
 #endif
 
-  CUDF_EXPECTS(rows_to_read <= static_cast(std::numeric_limits::max()),
+  CUDF_EXPECTS(rows_to_decode <= static_cast(std::numeric_limits::max()),
                "Number or rows to decode exceeds the column size limit.",
                std::overflow_error);
@@ -1017,8 +1021,9 @@ void reader::impl::decompress_and_decode()
     _metadata.is_row_grp_idx_present() &&
     // Only use if we don't have much work with complete columns & stripes
     // TODO: Consider nrows, gpu, and tune the threshold
-    (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) &&
-     _metadata.get_row_index_stride() != 0 && num_columns * stripe_count < 8 * 128) &&
+    (rows_to_decode > _metadata.get_row_index_stride() &&
+     !(_metadata.get_row_index_stride() & 7) && _metadata.get_row_index_stride() != 0 &&
+     num_columns * stripe_count < 8 * 128) &&
     // Only use if first row is aligned to a stripe boundary
     // TODO: Fix logic to handle unaligned rows
     (rows_to_skip == 0);
@@ -1125,7 +1130,7 @@ void reader::impl::decompress_and_decode()
    // (int)chunk.start_row,
    // (int)chunk.num_rows);

-    chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx];
+    chunk.column_num_rows = (level == 0) ? rows_to_decode : col_meta.num_child_rows[col_idx];
     chunk.parent_validity_info =
       (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx];
     chunk.parent_null_count_prefix_sums =
@@ -1311,7 +1316,7 @@ void reader::impl::decompress_and_decode()
       }
     }
     auto is_list_type = (column_types[i].id() == type_id::LIST);
-    auto n_rows       = (level == 0) ? rows_to_read : col_meta.num_child_rows[i];
+    auto n_rows       = (level == 0) ?
rows_to_decode : col_meta.num_child_rows[i];

    // printf("  create col, num rows: %d\n", (int)n_rows);

From 73c1a193620e004373cac025e0853ce318d9c6c5 Mon Sep 17 00:00:00 2001
From: Nghia Truong 
Date: Mon, 11 Mar 2024 12:55:18 -0700
Subject: [PATCH 220/321] Update `hostdevice_vector.hpp`

Signed-off-by: Nghia Truong 
---
 cpp/src/io/utilities/hostdevice_vector.hpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index af1591b709a..0883ac3609f 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -53,14 +53,12 @@ class hostdevice_vector {
   }
 
   explicit hostdevice_vector(size_t initial_size, size_t max_size, rmm::cuda_stream_view stream)
-    : h_data({cudf::io::get_host_memory_resource(), stream}), d_data(0, stream)
+    : h_data({cudf::io::get_host_memory_resource(), stream}), d_data(max_size, stream)
   {
     CUDF_EXPECTS(initial_size <= max_size, "initial_size cannot be larger than max_size");
 
     h_data.reserve(max_size);
     h_data.resize(initial_size);
-
-    d_data.resize(max_size, stream);
   }
 
   void push_back(T const& data)

From a897155c95d4aed440463077801b84bbeb03bc1f Mon Sep 17 00:00:00 2001
From: Nghia Truong 
Date: Mon, 11 Mar 2024 13:11:29 -0700
Subject: [PATCH 221/321] Optimize `tz_table` parameter usage

Signed-off-by: Nghia Truong 
---
 cpp/src/io/orc/reader_impl_decode.cu | 74 ++++------------------------
 1 file changed, 9 insertions(+), 65 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index e85389e4c9a..ef0ae79dcfb 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -462,7 +462,7 @@ void decode_stream_data(std::size_t num_dicts,
                         int64_t skip_rows,
                         size_type row_index_stride,
                         std::size_t level,
-                        table_view const& tz_table,
+                        table_device_view const& d_tz_table,
                         cudf::detail::hostdevice_2dvector& chunks,
                         cudf::detail::device_2dspan row_groups,
                         std::vector& out_buffers,
@@ -504,7 +504,6 @@ void decode_stream_data(std::size_t num_dicts,
     update_null_mask(chunks, out_buffers, stream, mr);
   }
 
-  auto const tz_table_dptr = table_device_view::create(tz_table, stream);
   rmm::device_scalar error_count(0, stream);
   // Update the null map for child columns
@@ -526,7 +525,7 @@ void decode_stream_data(std::size_t num_dicts,
       num_columns,
       num_stripes,
       skip_rows,
-      *tz_table_dptr,
+      d_tz_table,
       row_groups.size().first,
       row_index_stride,
       level,
@@ -836,7 +835,8 @@ void reader::impl::decompress_and_decode()
     rows_to_decode += stripe_rows;
 
     // The rows to skip should never be larger than number of rows in the first loaded stripes.
-    // This is just to make sure there was not any bug with it.
+    // Technically, overflow here should never happen since `select_stripes` already checked it.
+    // This is just to make sure there was not any bug there.
     if (rows_to_skip > 0) {
       CUDF_EXPECTS(rows_to_skip < stripe_rows, "Invalid rows_to_skip computation.");
     }
@@ -854,6 +854,8 @@ void reader::impl::decompress_and_decode()
   printf("decode, skip = %ld, decode = %ld\n", rows_to_skip, rows_to_decode);
 #endif
 
+  // Technically, overflow here should never happen because the `load_data()` step
+  // already handled it by splitting the loaded stripe range into multiple decode ranges.
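// A rough sketch of the guarantee relied upon below -- a hypothetical splitter, not
// the reader's actual algorithm: while accumulating stripe row counts, a decode range
// is closed before the running total would cross the cudf column row limit.
//
//   int64_t constexpr row_limit = std::numeric_limits<cudf::size_type>::max();
//   int64_t acc = 0;
//   for (auto const rows : stripe_row_counts) {  // stripe_row_counts: hypothetical
//     if (acc + rows > row_limit) { acc = 0; }   // close range, start a new one
//     acc += rows;
//   }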
CUDF_EXPECTS(rows_to_decode <= static_cast(std::numeric_limits::max()), "Number or rows to decode exceeds the column size limit.", std::overflow_error); @@ -871,6 +873,7 @@ void reader::impl::decompress_and_decode() {}, selected_stripes[0].stripe_footer->writerTimezone, _stream) : std::make_unique(); }(); + auto const tz_table_dptr = table_device_view::create(tz_table->view(), _stream); auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; auto& null_count_prefix_sums = _file_itm_data.null_count_prefix_sums; @@ -891,66 +894,7 @@ void reader::impl::decompress_and_decode() // Iterates through levels of nested columns, child column will be one level down // compared to parent column. - auto& col_meta = *_col_meta; - -#if 0 - printf("num_child_rows: (size %d)\n", (int)_col_meta->num_child_rows.size()); - if (_col_meta->num_child_rows.size()) { - for (auto x : _col_meta->num_child_rows) { - printf("%d, ", (int)x); - } - printf("\n"); - - _col_meta->num_child_rows.clear(); - } - - printf("parent_column_data null count: (size %d)\n", (int)_col_meta->parent_column_data.size()); - if (_col_meta->parent_column_data.size()) { - for (auto x : _col_meta->parent_column_data) { - printf("%d, ", (int)x.null_count); - } - printf("\n"); - _col_meta->parent_column_data.clear(); - } - - printf("parent_column_index: (size %d)\n", (int)_col_meta->parent_column_index.size()); - if (_col_meta->parent_column_index.size()) { - for (auto x : _col_meta->parent_column_index) { - printf("%d, ", (int)x); - } - printf("\n"); - _col_meta->parent_column_index.clear(); - } - - printf("child_start_row: (size %d)\n", (int)_col_meta->child_start_row.size()); - if (_col_meta->child_start_row.size()) { - for (auto x : _col_meta->child_start_row) { - printf("%d, ", (int)x); - } - printf("\n"); - _col_meta->child_start_row.clear(); - } - - printf("num_child_rows_per_stripe: (size %d)\n", - (int)_col_meta->num_child_rows_per_stripe.size()); - if (_col_meta->num_child_rows_per_stripe.size()) { - for (auto x : _col_meta->num_child_rows_per_stripe) { - printf("%d, ", (int)x); - } - printf("\n"); - _col_meta->num_child_rows_per_stripe.clear(); - } - - printf("rwgrp_meta: (size %d)\n", (int)_col_meta->rwgrp_meta.size()); - if (_col_meta->rwgrp_meta.size()) { - for (auto x : _col_meta->rwgrp_meta) { - printf("(%d | %d), ", (int)x.start_row, (int)x.num_rows); - } - printf("\n"); - } - -#endif - + auto& col_meta = *_col_meta; auto& lvl_stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges; for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { @@ -1360,7 +1304,7 @@ void reader::impl::decompress_and_decode() rows_to_skip, _metadata.get_row_index_stride(), level, - tz_table->view(), + *tz_table_dptr, chunks, row_groups, _out_buffers[level], From 91f9cce3a51efc64bafae284d1142bcb47ab3746 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 11 Mar 2024 14:09:21 -0700 Subject: [PATCH 222/321] Make `null_count_prefix_sums` local to decoding step Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.hpp | 3 -- cpp/src/io/orc/reader_impl_decode.cu | 48 ++++++++++++------------- 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index a3883426787..2b35e51b6f1 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -162,9 +162,6 @@ struct file_intermediate_data { // This is used to identify the range of streams for each stripe from that vector. 
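  // For intuition -- hypothetical ranges, assuming get_range() composes the begin of
  // the first selected entry with the end of the last one: if stripe 0 owns streams
  // [0, 4) and stripe 1 owns streams [4, 7), selecting stripes [1, 2) yields [4, 7).
  //
  //   std::vector<range> const ranges{{0, 4}, {4, 7}};
  //   auto const r = get_range(ranges, range{1, 2});  // r.begin == 4, r.end == 7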
std::vector> lvl_stripe_stream_ranges; - // TODO rename - std::vector>> null_count_prefix_sums; - // For data processing, decompression, and decoding. // Each 'chunk' of data here corresponds to an orc column, in a stripe, at a nested level. std::vector> lvl_data_chunks; diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index ef0ae79dcfb..a9c4f3352f6 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -875,17 +875,17 @@ void reader::impl::decompress_and_decode() }(); auto const tz_table_dptr = table_device_view::create(tz_table->view(), _stream); - auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; - auto& null_count_prefix_sums = _file_itm_data.null_count_prefix_sums; - auto& lvl_chunks = _file_itm_data.lvl_data_chunks; + auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; + auto& lvl_chunks = _file_itm_data.lvl_data_chunks; - null_count_prefix_sums.clear(); + auto const num_levels = _selected_columns.num_levels(); // TODO: move this to global step lvl_chunks.resize(_selected_columns.num_levels()); - _out_buffers.clear(); - _out_buffers.resize(_selected_columns.num_levels()); + _out_buffers.resize(num_levels); + + std::vector>> null_count_prefix_sums(num_levels); // // // @@ -945,9 +945,10 @@ void reader::impl::decompress_and_decode() } } - auto const num_columns = columns_level.size(); - auto& chunks = lvl_chunks[level]; - chunks = cudf::detail::hostdevice_2dvector(stripe_count, num_columns, _stream); + auto const num_level_columns = columns_level.size(); + auto& chunks = lvl_chunks[level]; + chunks = + cudf::detail::hostdevice_2dvector(stripe_count, num_level_columns, _stream); memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); #ifdef LOCAL_TEST @@ -967,7 +968,7 @@ void reader::impl::decompress_and_decode() // TODO: Consider nrows, gpu, and tune the threshold (rows_to_decode > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && _metadata.get_row_index_stride() != 0 && - num_columns * stripe_count < 8 * 128) && + num_level_columns * stripe_count < 8 * 128) && // Only use if first row is aligned to a stripe boundary // TODO: Fix logic to handle unaligned rows (rows_to_skip == 0); @@ -979,14 +980,11 @@ void reader::impl::decompress_and_decode() // Logically view streams as columns auto const& stream_info = _file_itm_data.lvl_stream_info[level]; - null_count_prefix_sums.emplace_back(); - null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); - std::generate_n(std::back_inserter(null_count_prefix_sums.back()), - _selected_columns.levels[level].size(), - [&]() { - return cudf::detail::make_zeroed_device_uvector_async( - stripe_count, _stream, rmm::mr::get_current_device_resource()); - }); + null_count_prefix_sums[level].reserve(num_level_columns); + std::generate_n(std::back_inserter(null_count_prefix_sums[level]), num_level_columns, [&]() { + return cudf::detail::make_zeroed_device_uvector_async( + stripe_count, _stream, rmm::mr::get_current_device_resource()); + }); // Tracker for eventually deallocating compressed and uncompressed data auto& stripe_data = lvl_stripe_data[level]; @@ -1055,19 +1053,19 @@ void reader::impl::decompress_and_decode() // fflush(stdout); // Update chunks to reference streams pointers - for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { + for (std::size_t col_idx = 0; col_idx < num_level_columns; col_idx++) { auto& chunk = chunks[stripe_idx - stripe_start][col_idx]; // start row, number of rows in a each 
stripe and total number of rows // may change in lower levels of nesting chunk.start_row = (level == 0) ? stripe_start_row - : col_meta.child_start_row[(stripe_idx - stripe_start) * num_columns + col_idx]; + : col_meta.child_start_row[(stripe_idx - stripe_start) * num_level_columns + col_idx]; chunk.num_rows = (level == 0) ? static_cast(stripe_info->numberOfRows) - : col_meta - .num_child_rows_per_stripe[(stripe_idx - stripe_start) * num_columns + col_idx]; + : col_meta.num_child_rows_per_stripe[(stripe_idx - stripe_start) * num_level_columns + + col_idx]; // printf("col idx: %d, start_row: %d, num rows: %d\n", // (int)col_idx, @@ -1132,10 +1130,10 @@ void reader::impl::decompress_and_decode() // Process dataset chunk pages into output columns auto row_groups = - cudf::detail::hostdevice_2dvector(num_rowgroups, num_columns, _stream); + cudf::detail::hostdevice_2dvector(num_rowgroups, num_level_columns, _stream); if (level > 0 and row_groups.size().first) { cudf::host_span row_groups_span(row_groups.base_host_ptr(), - num_rowgroups * num_columns); + num_rowgroups * num_level_columns); auto& rw_grp_meta = col_meta.rwgrp_meta; // Update start row and num rows per row group @@ -1215,7 +1213,7 @@ void reader::impl::decompress_and_decode() gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), nullptr, chunks.base_device_ptr(), - num_columns, + num_level_columns, stripe_count, _metadata.get_row_index_stride(), level == 0, From dd7e850d83b6ed80f3710d870216f7c754da7ab1 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 11 Mar 2024 14:17:20 -0700 Subject: [PATCH 223/321] Make `lvl_chunks` local to decoding step and some cleanup Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 1 - cpp/src/io/orc/reader_impl_chunking.hpp | 4 ---- cpp/src/io/orc/reader_impl_decode.cu | 25 +++++++++++-------------- 3 files changed, 11 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 9ea03fd2e56..a6efbb3c3c4 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -310,7 +310,6 @@ void reader::impl::global_preprocess(read_mode mode) lvl_stripe_sizes.resize(num_levels); lvl_stream_info.resize(num_levels); lvl_stripe_stream_ranges.resize(num_levels); - _file_itm_data.lvl_data_chunks.resize(num_levels); _out_buffers.resize(num_levels); auto& read_info = _file_itm_data.data_read_info; diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 2b35e51b6f1..2aa48acb56b 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -162,10 +162,6 @@ struct file_intermediate_data { // This is used to identify the range of streams for each stripe from that vector. std::vector> lvl_stripe_stream_ranges; - // For data processing, decompression, and decoding. - // Each 'chunk' of data here corresponds to an orc column, in a stripe, at a nested level. 
- std::vector> lvl_data_chunks; - bool global_preprocessed{false}; }; diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index a9c4f3352f6..7b0b5c0b127 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -875,27 +875,24 @@ void reader::impl::decompress_and_decode() }(); auto const tz_table_dptr = table_device_view::create(tz_table->view(), _stream); - auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; - auto& lvl_chunks = _file_itm_data.lvl_data_chunks; - auto const num_levels = _selected_columns.num_levels(); - - // TODO: move this to global step - lvl_chunks.resize(_selected_columns.num_levels()); - _out_buffers.resize(num_levels); + // Column descriptors ('chunks'). + // Each 'chunk' of data here corresponds to an orc column, in a stripe, at a nested level. + std::vector> lvl_chunks(num_levels); + + // For computing null count. std::vector>> null_count_prefix_sums(num_levels); + // // // - // TODO: move this to reader_impl.cu, decomp and decode step - // std::size_t num_stripes = selected_stripes.size(); // Iterates through levels of nested columns, child column will be one level down // compared to parent column. - auto& col_meta = *_col_meta; - auto& lvl_stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges; + auto& col_meta = *_col_meta; + auto const& lvl_stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges; for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { #ifdef LOCAL_TEST @@ -987,7 +984,7 @@ void reader::impl::decompress_and_decode() }); // Tracker for eventually deallocating compressed and uncompressed data - auto& stripe_data = lvl_stripe_data[level]; + auto& stripe_data = _file_itm_data.lvl_stripe_data[level]; int64_t stripe_start_row = 0; int64_t num_dict_entries = 0; @@ -1169,7 +1166,7 @@ void reader::impl::decompress_and_decode() auto decomp_data = decompress_stripe_data( _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1], - get_range(_file_itm_data.lvl_stripe_stream_ranges[level], stripe_range), + stream_range, stripe_count, _file_itm_data.compinfo_map, *_metadata.per_file_metadata[0].decompressor, @@ -1379,7 +1376,7 @@ void reader::impl::decompress_and_decode() for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { _out_buffers[level].clear(); - auto& stripe_data = lvl_stripe_data[level]; + auto& stripe_data = _file_itm_data.lvl_stripe_data[level]; if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { stripe_data[stripe_start - load_stripe_start] = {}; From 89a2ac0c646cc91d9b7410428a8673da96a2921d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 11 Mar 2024 14:40:18 -0700 Subject: [PATCH 224/321] Reorder variables Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.hpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 2aa48acb56b..939beb03b2b 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -110,18 +110,6 @@ struct file_intermediate_data { // Return true if no rows or stripes to read. bool has_no_data() const { return rows_to_read == 0 || selected_stripes.empty(); } - // Store the compression information for each data stream. - stream_source_map compinfo_map; - - // The buffers to store raw data read from disk, initialized for each reading stripe chunks. 
- // After decoding, such buffers can be released. - // This can only be implemented after chunked output is ready. - std::vector> lvl_stripe_data; - - // Store the size of each stripe at each nested level. - // This is used to initialize the stripe_data buffers. - std::vector> lvl_stripe_sizes; - // Store information to identify where to read a chunk of data from source. // Each read corresponds to one or more consecutive streams combined. struct stream_data_read_info { @@ -155,6 +143,9 @@ struct file_intermediate_data { // Those reads are identified by a chunk of consecutive read info, stored in data_read_info. std::vector stripe_data_read_ranges; + // Store the compression information for each data stream. + stream_source_map compinfo_map; + // Store info for each ORC stream at each nested level. std::vector> lvl_stream_info; @@ -162,6 +153,15 @@ struct file_intermediate_data { // This is used to identify the range of streams for each stripe from that vector. std::vector> lvl_stripe_stream_ranges; + // The buffers to store raw data read from disk, initialized for each reading stripe chunks. + // After decoding, such buffers can be released. + // This can only be implemented after chunked output is ready. + std::vector> lvl_stripe_data; + + // Store the size of each stripe at each nested level. + // This is used to initialize the stripe_data buffers. + std::vector> lvl_stripe_sizes; + bool global_preprocessed{false}; }; From c585c44054297d8c45d3baec0ca8395f7c6415bc Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 11 Mar 2024 15:20:21 -0700 Subject: [PATCH 225/321] Cleanup and rename variables Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 54 +++++++++++++++----- cpp/src/io/orc/reader_impl_chunking.hpp | 23 ++++++++- cpp/src/io/orc/reader_impl_decode.cu | 66 ++++++------------------- 3 files changed, 79 insertions(+), 64 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index a6efbb3c3c4..f10c5b754c0 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -68,7 +68,7 @@ namespace cudf::io::orc::detail { std::size_t gather_stream_info_and_column_desc( - std::size_t stripe_processing_order, + std::size_t global_stripe_order, std::size_t level, orc::StripeInformation const* stripeinfo, orc::StripeFooter const* stripefooter, @@ -77,7 +77,7 @@ std::size_t gather_stream_info_and_column_desc( bool use_index, bool apply_struct_map, int64_t* num_dictionary_entries, - std::size_t* stream_processing_order, + std::size_t* local_stream_order, std::optional*> const& stream_info, std::optional*> const& chunks) { @@ -124,8 +124,8 @@ std::size_t gather_stream_info_and_column_desc( if (child_idx >= 0) { col = child_idx; if (chunks.has_value()) { - auto& chunk = (*chunks.value())[stripe_processing_order][col]; - chunk.strm_id[gpu::CI_PRESENT] = *stream_processing_order; + auto& chunk = (*chunks.value())[global_stripe_order][col]; + chunk.strm_id[gpu::CI_PRESENT] = *local_stream_order; chunk.strm_len[gpu::CI_PRESENT] = stream.length; } } @@ -136,8 +136,8 @@ std::size_t gather_stream_info_and_column_desc( if (src_offset >= stripeinfo->indexLength || use_index) { auto const index_type = get_stream_index_type(stream.kind); if (index_type < gpu::CI_NUM_STREAMS) { - auto& chunk = (*chunks.value())[stripe_processing_order][col]; - chunk.strm_id[index_type] = *stream_processing_order; + auto& chunk = (*chunks.value())[global_stripe_order][col]; + chunk.strm_id[index_type] = 
*local_stream_order; chunk.strm_len[index_type] = stream.length; // NOTE: skip_count field is temporarily used to track the presence of index streams chunk.skip_count |= 1 << index_type; @@ -150,13 +150,13 @@ std::size_t gather_stream_info_and_column_desc( } } - (*stream_processing_order)++; + (*local_stream_order)++; } else { // not chunks.has_value() stream_info.value()->emplace_back( stripeinfo->offset + src_offset, dst_offset, stream.length, - stream_source_info{stripe_processing_order, level, column_id, stream.kind}); + stream_source_info{global_stripe_order, level, column_id, stream.kind}); } dst_offset += stream.length; @@ -305,11 +305,15 @@ void reader::impl::global_preprocess(read_mode mode) auto& lvl_stripe_sizes = _file_itm_data.lvl_stripe_sizes; auto& lvl_stream_info = _file_itm_data.lvl_stream_info; auto& lvl_stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges; + auto& lvl_column_types = _file_itm_data.lvl_column_types; + auto& lvl_nested_cols = _file_itm_data.lvl_nested_cols; lvl_stripe_data.resize(num_levels); lvl_stripe_sizes.resize(num_levels); lvl_stream_info.resize(num_levels); lvl_stripe_stream_ranges.resize(num_levels); + lvl_column_types.resize(num_levels); + lvl_nested_cols.resize(num_levels); _out_buffers.resize(num_levels); auto& read_info = _file_itm_data.data_read_info; @@ -322,16 +326,44 @@ void reader::impl::global_preprocess(read_mode mode) // Association between each ORC column and its cudf::column col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); + auto const& columns_level = _selected_columns.levels[level]; size_type col_id{0}; - for (auto const& col : _selected_columns.levels[level]) { + + for (auto const& col : columns_level) { // Map each ORC column to its column col_meta.orc_col_map[level][col.id] = col_id++; - } - auto const num_columns = _selected_columns.levels[level].size(); + auto const col_type = + to_cudf_type(_metadata.get_col_type(col.id).kind, + _config.use_np_dtypes, + _config.timestamp_type.id(), + to_cudf_decimal_type(_config.decimal128_columns, _metadata, col.id)); + CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); + + auto& column_types = lvl_column_types[level]; + auto& nested_cols = lvl_nested_cols[level]; + + if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or + col_type == type_id::DECIMAL128) { + // sign of the scale is changed since cuDF follows c++ libraries like CNL + // which uses negative scaling, but liborc and other libraries + // follow positive scaling. + auto const scale = + -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); + column_types.emplace_back(col_type, scale); + } else { + column_types.emplace_back(col_type); + } + + // Map each ORC column to its column. + if (col_type == type_id::LIST or col_type == type_id::STRUCT) { + nested_cols.emplace_back(col); + } + } // Try to reserve some memory, but the final size is unknown, // since each column may have more than one stream. + auto const num_columns = columns_level.size(); lvl_stream_info[level].reserve(num_total_stripes * num_columns); if (read_info.capacity() < num_total_stripes * num_columns) { read_info.reserve(num_total_stripes * num_columns); diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 939beb03b2b..b2964e996c2 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -162,6 +162,12 @@ struct file_intermediate_data { // This is used to initialize the stripe_data buffers. 
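  // As a sketch -- a hypothetical loop, not the reader's exact code -- these sizes
  // drive the per-stripe raw-buffer allocations:
  //
  //   for (std::size_t s = 0; s < num_loaded_stripes; ++s) {
  //     lvl_stripe_data[level].emplace_back(lvl_stripe_sizes[level][s], stream);
  //   }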
std::vector> lvl_stripe_sizes; + // List of column data types at each nested level. + std::vector> lvl_column_types; + + // List of nested type columns at each nested level. + std::vector> lvl_nested_cols; + bool global_preprocessed{false}; }; @@ -292,9 +298,22 @@ range get_range(std::vector const& input_ranges, range const& selected_ra * descriptors (`chunks` is present) during decompression and decoding. The two steps share * most of the execution path thus this function takes mutually exclusive parameters `stream_info` * or `chunks` depending on each use case. + * + * @param global_stripe_order The global index of the current decoding stripe + * @param level The nested level of the current decoding column + * @param stripeinfo The pointer to current decoding stripe's information + * @param stripefooter The pointer to current decoding stripe's footer + * @param orc2gdf The mapping from ORC column ids to gdf column ids + * @param types The schema type + * @param use_index Whether to use the row index for parsing + * @param apply_struct_map Indicating if this is the root level + * @param num_dictionary_entries The number of dictionary entries + * @param local_stream_order For retrieving 0-based orders of streams in the current decoding step + * @param stream_info The vector of streams' information + * @param chunks The vector of column descriptors */ std::size_t gather_stream_info_and_column_desc( - std::size_t stripe_processing_order, + std::size_t global_stripe_order, std::size_t level, orc::StripeInformation const* stripeinfo, orc::StripeFooter const* stripefooter, @@ -303,7 +322,7 @@ std::size_t gather_stream_info_and_column_desc( bool use_index, bool apply_struct_map, int64_t* num_dictionary_entries, - std::size_t* stream_processing_order, + std::size_t* local_stream_order, std::optional*> const& stream_info, std::optional*> const& chunks); diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 7b0b5c0b127..195288fb1a9 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -891,8 +891,7 @@ void reader::impl::decompress_and_decode() // Iterates through levels of nested columns, child column will be one level down // compared to parent column. 
- auto& col_meta = *_col_meta; - auto const& lvl_stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges; + auto& col_meta = *_col_meta; for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { #ifdef LOCAL_TEST @@ -906,44 +905,16 @@ void reader::impl::decompress_and_decode() } #endif - auto const& stripe_stream_ranges = lvl_stripe_stream_ranges[level]; + auto const& stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges[level]; auto const stream_range = get_range(stripe_stream_ranges, stripe_range); auto& columns_level = _selected_columns.levels[level]; + auto& chunks = lvl_chunks[level]; - // TODO: do it in global step - // Association between each ORC column and its cudf::column - std::vector nested_cols; - - // Get a list of column data types - std::vector column_types; - for (auto& col : columns_level) { - auto col_type = - to_cudf_type(_metadata.get_col_type(col.id).kind, - _config.use_np_dtypes, - _config.timestamp_type.id(), - to_cudf_decimal_type(_config.decimal128_columns, _metadata, col.id)); - CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); - if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or - col_type == type_id::DECIMAL128) { - // sign of the scale is changed since cuDF follows c++ libraries like CNL - // which uses negative scaling, but liborc and other libraries - // follow positive scaling. - auto const scale = - -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); - column_types.emplace_back(col_type, scale); - } else { - column_types.emplace_back(col_type); - } - - // Map each ORC column to its column - if (col_type == type_id::LIST or col_type == type_id::STRUCT) { - nested_cols.emplace_back(col); - } - } - + auto const& column_types = _file_itm_data.lvl_column_types[level]; + auto const& nested_cols = _file_itm_data.lvl_nested_cols[level]; auto const num_level_columns = columns_level.size(); - auto& chunks = lvl_chunks[level]; + chunks = cudf::detail::hostdevice_2dvector(stripe_count, num_level_columns, _stream); memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); @@ -974,24 +945,19 @@ void reader::impl::decompress_and_decode() printf(" use_index: %d\n", (int)use_index); #endif - // Logically view streams as columns - auto const& stream_info = _file_itm_data.lvl_stream_info[level]; - null_count_prefix_sums[level].reserve(num_level_columns); std::generate_n(std::back_inserter(null_count_prefix_sums[level]), num_level_columns, [&]() { return cudf::detail::make_zeroed_device_uvector_async( stripe_count, _stream, rmm::mr::get_current_device_resource()); }); - // Tracker for eventually deallocating compressed and uncompressed data - auto& stripe_data = _file_itm_data.lvl_stripe_data[level]; - - int64_t stripe_start_row = 0; - int64_t num_dict_entries = 0; - int64_t num_rowgroups = 0; + auto& stripe_data = _file_itm_data.lvl_stripe_data[level]; + auto const& stream_info = _file_itm_data.lvl_stream_info[level]; - // TODO: Stripe and stream idx must be by chunk. 
- std::size_t stream_processing_order = 0; + int64_t stripe_start_row{0}; + int64_t num_dict_entries{0}; + int64_t num_rowgroups{0}; + std::size_t local_stream_order{0}; for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { #ifdef LOCAL_TEST @@ -1002,10 +968,8 @@ void reader::impl::decompress_and_decode() auto const stripe_info = stripe.stripe_info; auto const stripe_footer = stripe.stripe_footer; - // printf("stripeinfo->indexLength: %d, data: %d\n", - // (int)stripe_info->indexLength, - // (int)stripe_info->dataLength); - + // Gather only for the decoding stripes, thus the first parameter (`stripe_processing_order`) + // needs to be normalized to be 0-based. auto const total_data_size = gather_stream_info_and_column_desc(stripe_idx - stripe_start, level, stripe_info, @@ -1015,7 +979,7 @@ void reader::impl::decompress_and_decode() use_index, level == 0, &num_dict_entries, - &stream_processing_order, + &local_stream_order, std::nullopt, // stream_info &chunks); From f339b2362f346ecead176afd0d52cda17e00c4ae Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 11 Mar 2024 15:28:47 -0700 Subject: [PATCH 226/321] Reorder code Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_decode.cu | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 195288fb1a9..2d9c88da8d4 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -885,14 +885,7 @@ void reader::impl::decompress_and_decode() // For computing null count. std::vector>> null_count_prefix_sums(num_levels); - // - // - // - - // Iterates through levels of nested columns, child column will be one level down - // compared to parent column. 
auto& col_meta = *_col_meta; - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { #ifdef LOCAL_TEST printf("processing level = %d\n", (int)level); @@ -908,13 +901,15 @@ void reader::impl::decompress_and_decode() auto const& stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges[level]; auto const stream_range = get_range(stripe_stream_ranges, stripe_range); - auto& columns_level = _selected_columns.levels[level]; - auto& chunks = lvl_chunks[level]; + auto const& columns_level = _selected_columns.levels[level]; + auto const& stream_info = _file_itm_data.lvl_stream_info[level]; + auto const& column_types = _file_itm_data.lvl_column_types[level]; + auto const& nested_cols = _file_itm_data.lvl_nested_cols[level]; - auto const& column_types = _file_itm_data.lvl_column_types[level]; - auto const& nested_cols = _file_itm_data.lvl_nested_cols[level]; - auto const num_level_columns = columns_level.size(); + auto& stripe_data = _file_itm_data.lvl_stripe_data[level]; + auto& chunks = lvl_chunks[level]; + auto const num_level_columns = columns_level.size(); chunks = cudf::detail::hostdevice_2dvector(stripe_count, num_level_columns, _stream); memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); @@ -951,9 +946,6 @@ void reader::impl::decompress_and_decode() stripe_count, _stream, rmm::mr::get_current_device_resource()); }); - auto& stripe_data = _file_itm_data.lvl_stripe_data[level]; - auto const& stream_info = _file_itm_data.lvl_stream_info[level]; - int64_t stripe_start_row{0}; int64_t num_dict_entries{0}; int64_t num_rowgroups{0}; From 9a2cee0058880c0833827fa9c2627b209bc92c36 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 11 Mar 2024 15:52:36 -0700 Subject: [PATCH 227/321] More cleanup and code reordering Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_decode.cu | 102 +++++++++------------------ 1 file changed, 34 insertions(+), 68 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 2d9c88da8d4..32e85e41851 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -816,8 +816,10 @@ void reader::impl::decompress_and_decode() auto const stripe_count = stripe_range.end - stripe_range.begin; // The start index of loaded stripes. They are different from decoding stripes. - auto const load_stripe_start = - _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1].begin; + CUDF_EXPECTS(_chunk_read_data.curr_load_stripe_range > 0, "There is not any stripe loaded."); + auto const load_stripe_range = + _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1]; + auto const load_stripe_start = load_stripe_range.begin; #ifdef LOCAL_TEST printf("\ndecoding data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); @@ -946,6 +948,7 @@ void reader::impl::decompress_and_decode() stripe_count, _stream, rmm::mr::get_current_device_resource()); }); + // 0-based counters, used accross all decoding stripes in this step. int64_t stripe_start_row{0}; int64_t num_dict_entries{0}; int64_t num_rowgroups{0}; @@ -960,8 +963,8 @@ void reader::impl::decompress_and_decode() auto const stripe_info = stripe.stripe_info; auto const stripe_footer = stripe.stripe_footer; - // Gather only for the decoding stripes, thus the first parameter (`stripe_processing_order`) - // needs to be normalized to be 0-based. 
+ // Gather only for the decoding stripes, thus the first parameter (`global_stripe_order`) + // needs to be normalized to 0-based. auto const total_data_size = gather_stream_info_and_column_desc(stripe_idx - stripe_start, level, stripe_info, @@ -983,29 +986,20 @@ void reader::impl::decompress_and_decode() CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); - // TODO: Wrong? - // stripe load_stripe_start? - auto dst_base = static_cast(stripe_data[stripe_idx - load_stripe_start].data()); - - // printf("line %d\n", __LINE__); - // fflush(stdout); - + auto const dst_base = + static_cast(stripe_data[stripe_idx - load_stripe_start].data()); auto const num_rows_per_stripe = static_cast(stripe_info->numberOfRows); + auto const rowgroup_id = num_rowgroups; + auto const stripe_num_rowgroups = + use_index ? (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / + _metadata.get_row_index_stride() + : 0; + #ifdef LOCAL_TEST printf(" num_rows_per_stripe : %d\n", (int)num_rows_per_stripe); #endif - auto const rowgroup_id = num_rowgroups; - auto stripe_num_rowgroups = 0; - if (use_index) { - stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / - _metadata.get_row_index_stride(); - } - - // printf("line %d\n", __LINE__); - // fflush(stdout); - - // Update chunks to reference streams pointers + // Update chunks to reference streams pointers. for (std::size_t col_idx = 0; col_idx < num_level_columns; col_idx++) { auto& chunk = chunks[stripe_idx - stripe_start][col_idx]; // start row, number of rows in a each stripe and total number of rows @@ -1016,15 +1010,9 @@ void reader::impl::decompress_and_decode() : col_meta.child_start_row[(stripe_idx - stripe_start) * num_level_columns + col_idx]; chunk.num_rows = (level == 0) - ? static_cast(stripe_info->numberOfRows) + ? num_rows_per_stripe : col_meta.num_child_rows_per_stripe[(stripe_idx - stripe_start) * num_level_columns + col_idx]; - - // printf("col idx: %d, start_row: %d, num rows: %d\n", - // (int)col_idx, - // (int)chunk.start_row, - // (int)chunk.num_rows); - chunk.column_num_rows = (level == 0) ? rows_to_decode : col_meta.num_child_rows[col_idx]; chunk.parent_validity_info = (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx]; @@ -1036,8 +1024,6 @@ void reader::impl::decompress_and_decode() chunk.type_kind = _metadata.per_file_metadata[stripe.source_idx].ff.types[columns_level[col_idx].id].kind; - // printf("type: %d\n", (int)chunk.type_kind); - // num_child_rows for a struct column will be same, for other nested types it will be // calculated. chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; @@ -1054,7 +1040,6 @@ void reader::impl::decompress_and_decode() ? sizeof(size_type) : cudf::size_of(column_types[col_idx]); chunk.num_rowgroups = stripe_num_rowgroups; - // printf("stripe_num_rowgroups: %d\n", (int)stripe_num_rowgroups); if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _config.timestamp_type.id(); @@ -1067,21 +1052,13 @@ void reader::impl::decompress_and_decode() } } - // printf("line %d\n", __LINE__); - // fflush(stdout); - stripe_start_row += num_rows_per_stripe; num_rowgroups += stripe_num_rowgroups; - - // stripe_idx++; - } // for (stripe : selected_stripes) - - // printf("line %d\n", __LINE__); - // fflush(stdout); + } if (stripe_data.empty()) { continue; } - // Process dataset chunk pages into output columns + // Process dataset chunks into output columns. 
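    // For reference, the rowgroup tally accumulated above is a ceiling division per
    // stripe; with hypothetical numbers, for illustration only:
    //
    //   int64_t const num_rows_per_stripe = 25000;
    //   size_type const row_index_stride  = 10000;
    //   auto const n = (num_rows_per_stripe + row_index_stride - 1) / row_index_stride;
    //   // n == 3 rowgroups: two full rowgroups of 10000 rows and a final one of 5000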
   auto row_groups =
     cudf::detail::hostdevice_2dvector(num_rowgroups, num_level_columns, _stream);
   if (level > 0 and row_groups.size().first) {
@@ -1101,16 +1078,8 @@ void reader::impl::decompress_and_decode()
     });
   }

-  // printf("line %d\n", __LINE__);
-  // fflush(stdout);
-
-  // Setup row group descriptors if using indexes
+  // Setup row group descriptors if using indexes.
   if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
-    // printf("decompress----------------------\n");
-    // printf("line %d\n", __LINE__);
-    // fflush(stdout);
-    CUDF_EXPECTS(_chunk_read_data.curr_load_stripe_range > 0, "ERRRRR");
-
 #ifdef LOCAL_TEST
     {
       _stream.synchronize();
@@ -1120,23 +1089,20 @@ void reader::impl::decompress_and_decode()
     }
 #endif

-    auto decomp_data = decompress_stripe_data(
-      _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1],
-      stream_range,
-      stripe_count,
-      _file_itm_data.compinfo_map,
-      *_metadata.per_file_metadata[0].decompressor,
-      stripe_data,
-      stream_info,
-      chunks,
-      row_groups,
-      _metadata.get_row_index_stride(),
-      level == 0,
-      _stream);
-    // stripe_data.clear();
-    // stripe_data.push_back(std::move(decomp_data));
-
-    // TODO: only reset each one if the new size/type are different.
+    auto decomp_data = decompress_stripe_data(load_stripe_range,
+                                              stream_range,
+                                              stripe_count,
+                                              _file_itm_data.compinfo_map,
+                                              *_metadata.per_file_metadata[0].decompressor,
+                                              stripe_data,
+                                              stream_info,
+                                              chunks,
+                                              row_groups,
+                                              _metadata.get_row_index_stride(),
+                                              level == 0,
+                                              _stream);
+
+    // Just save the decompressed data and clear out the raw data to free up memory.
     stripe_data[stripe_start - load_stripe_start] = std::move(decomp_data);
     for (std::size_t i = 1; i < stripe_count; ++i) {
       stripe_data[i + stripe_start - load_stripe_start] = {};

From a0d152886b9d79f1bb92930ac1abc8a0f5a645ab Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 11 Mar 2024 15:59:43 -0700
Subject: [PATCH 228/321] Update docs

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_chunking.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index b2964e996c2..d43b5342eba 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -311,6 +311,7 @@ range get_range(std::vector const& input_ranges, range const& selected_ra
  * @param local_stream_order For retrieving 0-based orders of streams in the current decoding step
  * @param stream_info The vector of streams' information
  * @param chunks The vector of column descriptors
+ * @return The number of bytes in the gathered streams
  */
 std::size_t gather_stream_info_and_column_desc(
   std::size_t global_stripe_order,

From 75a96d1b0ba021fe3a2b6e86af771b4abeaa6b6b Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 11 Mar 2024 16:05:58 -0700
Subject: [PATCH 229/321] Change variable types

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_chunking.cu |  3 +-
 cpp/src/io/orc/reader_impl_decode.cu   | 40 +++++++-------------------
 2 files changed, 13 insertions(+), 30 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index f10c5b754c0..075002276b8 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -145,7 +145,8 @@ std::size_t gather_stream_info_and_column_desc(
       if (index_type == gpu::CI_DICTIONARY) {
         chunk.dictionary_start = *num_dictionary_entries;
         chunk.dict_len         = stripefooter->columns[column_id].dictionarySize;
-        *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize;
+        *num_dictionary_entries +=
+          static_cast(stripefooter->columns[column_id].dictionarySize);
       }
     }
   }
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 32e85e41851..45e9bcd7265 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -458,7 +458,7 @@ void update_null_mask(cudf::detail::hostdevice_2dvector& chunks
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to use for device memory allocation
  */
-void decode_stream_data(std::size_t num_dicts,
+void decode_stream_data(int64_t num_dicts,
                         int64_t skip_rows,
                         size_type row_index_stride,
                         std::size_t level,
@@ -951,7 +951,7 @@ void reader::impl::decompress_and_decode()
   // 0-based counters, used accross all decoding stripes in this step.
   int64_t stripe_start_row{0};
   int64_t num_dict_entries{0};
-  int64_t num_rowgroups{0};
+  uint32_t num_rowgroups{0};
   std::size_t local_stream_order{0};

   for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) {
@@ -989,8 +989,9 @@ void reader::impl::decompress_and_decode()
     auto const dst_base = static_cast(stripe_data[stripe_idx - load_stripe_start].data());
     auto const num_rows_per_stripe = static_cast(stripe_info->numberOfRows);
-    auto const rowgroup_id    = num_rowgroups;
-    auto const stripe_num_rowgroups =
+
+    uint32_t const rowgroup_id = num_rowgroups;
+    uint32_t const stripe_num_rowgroups =
       use_index ? (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) /
                     _metadata.get_row_index_stride()
                 : 0;
@@ -1117,15 +1118,8 @@ void reader::impl::decompress_and_decode()
     }
 #endif

-    // printf("line %d\n", __LINE__);
-    // fflush(stdout);
-
   } else {
-    // printf("no decompression----------------------\n");
-
     if (row_groups.size().first) {
-      // printf("line %d\n", __LINE__);
-      // fflush(stdout);
       chunks.host_to_device_async(_stream);
       row_groups.host_to_device_async(_stream);
       row_groups.host_to_device_async(_stream);
@@ -1140,9 +1134,6 @@ void reader::impl::decompress_and_decode()
     }
   }

-  // printf("line %d\n", __LINE__);
-  // fflush(stdout);
-
 #ifdef LOCAL_TEST
   {
     _stream.synchronize();
@@ -1152,8 +1143,6 @@ void reader::impl::decompress_and_decode()
   }
 #endif

-  // TODO: do not clear but reset each one.
-  // and only reset if the new size/type are different.
   _out_buffers[level].clear();

 #ifdef LOCAL_TEST
@@ -1176,10 +1165,9 @@ void reader::impl::decompress_and_decode()
         break;
       }
     }
-    auto is_list_type = (column_types[i].id() == type_id::LIST);
-    auto n_rows       = (level == 0) ? rows_to_decode : col_meta.num_child_rows[i];

-    // printf("   create col, num rows: %d\n", (int)n_rows);
+    auto const is_list_type = (column_types[i].id() == type_id::LIST);
+    auto const n_rows       = (level == 0) ? rows_to_decode : col_meta.num_child_rows[i];

 #ifdef LOCAL_TEST
     {
@@ -1190,9 +1178,9 @@ void reader::impl::decompress_and_decode()
     }
 #endif

-    // For list column, offset column will be always size + 1
-    if (is_list_type) n_rows++;
-    _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr);
+    // For list column, offset column will be always size + 1.
+    _out_buffers[level].emplace_back(
+      column_types[i], is_list_type ? n_rows + 1 : n_rows, is_nullable, _stream, _mr);

 #ifdef LOCAL_TEST
     {
@@ -1205,9 +1193,6 @@ void reader::impl::decompress_and_decode()
     }
 #endif
   }

-  // printf("line %d\n", __LINE__);
-  // fflush(stdout);
-
 #ifdef LOCAL_TEST
   {
     _stream.synchronize();
@@ -1237,15 +1222,12 @@ void reader::impl::decompress_and_decode()
   }
 #endif

-  // printf("line %d\n", __LINE__);
-  // fflush(stdout);
-
   if (nested_cols.size()) {
 #ifdef LOCAL_TEST
     printf("have nested col\n");
 #endif

-    // Extract information to process nested child columns
+    // Extract information to process nested child columns.
     scan_null_counts(chunks, null_count_prefix_sums[level], _stream);

     row_groups.device_to_host_sync(_stream);
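
A note on the `is_list_type ? n_rows + 1 : n_rows` sizing in the patch above: in cudf's columnar layout, the offsets child of a list column holds one more entry than the row count, so that row i spans [offsets[i], offsets[i+1]). A minimal standalone illustration of that invariant (the values are made up for the example):

    #include <cstdint>
    #include <vector>

    int main()
    {
      // 3 list rows -> 4 offset entries; row i covers [offsets[i], offsets[i+1]).
      std::vector<int32_t> offsets{0, 2, 2, 5};  // row sizes: 2, 0, 3
      // This is why the output buffer for a list column is allocated with
      // n_rows + 1 entries instead of n_rows.
      return 0;
    }
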
From 96274abe9f5fa1c5d120428ae5d79623b7e39fef Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 11 Mar 2024 16:16:35 -0700
Subject: [PATCH 230/321] More cleanup

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_decode.cu | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 45e9bcd7265..c61b5f00bf8 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -1246,9 +1246,6 @@ void reader::impl::decompress_and_decode()

       if (not buff_data.empty()) { generate_offsets_for_list(buff_data, _stream); }
     }
-
-    // printf("line %d\n", __LINE__);
-    // fflush(stdout);
   }  // end loop level

 #ifdef LOCAL_TEST
@@ -1260,6 +1257,7 @@ void reader::impl::decompress_and_decode()
   }
 #endif

+  // Now generate a table from the decoded result.
   std::vector> out_columns;
   _out_metadata = get_meta_with_user_data();
   std::transform(
@@ -1274,14 +1272,11 @@ void reader::impl::decompress_and_decode()
     });
   _chunk_read_data.decoded_table = std::make_unique<table>(std::move(out_columns));

-  // TODO: do not clear but reset each one.
-  // and only reset if the new size/type are different.
-  // This clear is just to check if there is memory leak.
+  // Free up memory.
   for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) {
     _out_buffers[level].clear();

     auto& stripe_data = _file_itm_data.lvl_stripe_data[level];
-
     if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
       stripe_data[stripe_start - load_stripe_start] = {};
     } else {
@@ -1300,12 +1295,6 @@ void reader::impl::decompress_and_decode()
   }
 #endif

-  // printf("col: \n");
-  // cudf::test::print(_chunk_read_data.decoded_table->get_column(0).view());
-
-  // DEBUG only
-  // _chunk_read_data.output_size_limit = _chunk_read_data.data_read_limit / 3;
-
   _chunk_read_data.curr_output_table_range = 0;
   _chunk_read_data.output_table_ranges =
     _chunk_read_data.output_size_limit == 0

From 246dd5bc17b6348f37511bc5b650b74da5c28553 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 11 Mar 2024 16:22:07 -0700
Subject: [PATCH 231/321] Complete cleaning up

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl.cu | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 57770cec4fe..7835180e0b4 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -130,10 +130,13 @@ table_with_metadata reader::impl::make_output_chunk()

   auto out_table = [&] {
     if (_chunk_read_data.output_table_ranges.size() == 1) {
+      // Must change the index of output range, so calling `has_next()` after that
+      // can return the correct answer.
       _chunk_read_data.curr_output_table_range++;
 #ifdef LOCAL_TEST
       printf("one chunk, no more table---------------------------------\n");
 #endif
+      // If there is no slicing, just hand over the decoded table.
       return std::move(_chunk_read_data.decoded_table);
     }

@@ -146,11 +149,11 @@ table_with_metadata reader::impl::make_output_chunk()
     }
 #endif

-    auto const out_chunk =
+    auto const out_range =
       _chunk_read_data.output_table_ranges[_chunk_read_data.curr_output_table_range++];
     auto const out_tview = cudf::detail::slice(
       _chunk_read_data.decoded_table->view(),
-      {static_cast(out_chunk.begin), static_cast(out_chunk.end)},
+      {static_cast(out_range.begin), static_cast(out_range.end)},
       _stream)[0];

 #ifdef LOCAL_TEST
@@ -164,7 +167,7 @@ table_with_metadata reader::impl::make_output_chunk()

     auto output = std::make_unique<table>(out_tview, _stream, _mr);

-    // If this is the last slice, we also delete the decoded_table to free up memory.
+    // If this is the last slice, we also delete the decoded table to free up memory.
     if (!_chunk_read_data.more_table_chunk_to_output()) {
       _chunk_read_data.decoded_table.reset(nullptr);
     }

From 027f899f2e57d36309de4acc9099dd44513e9091 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 11 Mar 2024 16:26:33 -0700
Subject: [PATCH 232/321] Revert error message

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 7835180e0b4..ab24b7c1eaf 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -276,7 +276,7 @@ reader::impl::impl(std::size_t output_size_limit,
   // Selected columns at different levels of nesting are stored in different elements
   // of `selected_columns`; thus, size == 1 means no nested columns.
   CUDF_EXPECTS(_config.skip_rows == 0 or _selected_columns.num_levels() == 1,
-               "skip_rows is not supported if having nested columns");
+               "skip_rows is not supported by nested column");
 }

 table_with_metadata reader::impl::read()

From 961d4680443e945f0938c7d8cc05f6f44bf1ff29 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 11 Mar 2024 16:29:27 -0700
Subject: [PATCH 233/321] Revert error handling that may be wrong

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_decode.cu | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index c61b5f00bf8..022e776ed10 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -186,9 +186,8 @@ rmm::device_buffer decompress_stripe_data(
   rmm::device_buffer decomp_data(
     cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream);

-  // If total_decomp_size is zero, the data should not be compressed, and this function
-  // should not be called at all.
-  CUDF_EXPECTS(!decomp_data.is_empty(), "Invalid decompression size");
+  // If total_decomp_size is zero, the input data may be just empty.
+  if (decomp_data.is_empty()) { return decomp_data; }

   rmm::device_uvector> inflate_in(
     num_compressed_blocks + num_uncompressed_blocks, stream);

From 30b589925bff2ff1ae5150780e7ba7b45de5fccc Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 11 Mar 2024 16:52:20 -0700
Subject: [PATCH 234/321] Fix spell

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_decode.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 022e776ed10..82ca39e6e57 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -856,7 +856,7 @@ void reader::impl::decompress_and_decode()
 #endif

   // Technically, overflow here should never happen because the `load_data()` step
-  // already handled it by spliting the loaded stripe range into multiple decode ranges.
+  // already handled it by splitting the loaded stripe range into multiple decode ranges.
   CUDF_EXPECTS(rows_to_decode <= static_cast(std::numeric_limits::max()),
                "Number or rows to decode exceeds the column size limit.",
                std::overflow_error);
@@ -947,7 +947,7 @@ void reader::impl::decompress_and_decode()
                    stripe_count, _stream, rmm::mr::get_current_device_resource());
   });

-  // 0-based counters, used accross all decoding stripes in this step.
+  // 0-based counters, used across all decoding stripes in this step.
   int64_t stripe_start_row{0};
   int64_t num_dict_entries{0};
   uint32_t num_rowgroups{0};

From 40b28faedb1220d357ef042d47eee5d6643f1ae0 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 11 Mar 2024 21:06:19 -0700
Subject: [PATCH 235/321] Update python code

Signed-off-by: Nghia Truong
---
 python/cudf/cudf/_lib/cpp/io/orc.pxd | 14 +++++++-------
 python/cudf/cudf/_lib/orc.pyx        |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd
index d5ac8574fe4..93e3f61142d 100644
--- a/python/cudf/cudf/_lib/cpp/io/orc.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2020-2023, NVIDIA CORPORATION.

-from libc.stdint cimport uint8_t
+from libc.stdint cimport uint8_t, int64_t
 from libcpp cimport bool
 from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr
@@ -21,8 +21,8 @@ cdef extern from "cudf/io/orc.hpp" \
         cudf_io_types.source_info get_source() except +
         vector[vector[size_type]] get_stripes() except +
-        size_type get_skip_rows() except +
-        size_type get_num_rows() except +
+        int64_t get_skip_rows() except +
+        optional[int64_t] get_num_rows() except +
         bool is_enabled_use_index() except +
         bool is_enabled_use_np_dtypes() except +
         data_type get_timestamp_type() except +
@@ -31,8 +31,8 @@ cdef extern from "cudf/io/orc.hpp" \

         void set_columns(vector[string] col_names) except +
         void set_stripes(vector[vector[size_type]] strps) except +
-        void set_skip_rows(size_type rows) except +
-        void set_num_rows(size_type nrows) except +
+        void set_skip_rows(int64_t rows) except +
+        void set_num_rows(int64_t nrows) except +
         void enable_use_index(bool val) except +
         void enable_use_np_dtypes(bool val) except +
         void set_timestamp_type(data_type type) except +
@@ -49,8 +49,8 @@ cdef extern from "cudf/io/orc.hpp" \
         orc_reader_options_builder& columns(vector[string] col_names) except +
         orc_reader_options_builder& \
             stripes(vector[vector[size_type]] strps) except +
-        orc_reader_options_builder& skip_rows(size_type rows) except +
-        orc_reader_options_builder& num_rows(size_type nrows) except +
+        orc_reader_options_builder& skip_rows(int64_t rows) except +
+        orc_reader_options_builder& num_rows(int64_t nrows) except +
         orc_reader_options_builder& use_index(bool val) except +
         orc_reader_options_builder& use_np_dtypes(bool val) except +
         orc_reader_options_builder& timestamp_type(data_type type) except +
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index 3fc9823b914..aaaeb558846 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -325,7 +325,7 @@ cdef int64_t get_skiprows_arg(object arg) except*:
         raise TypeError("skiprows must be an int >= 0")
     return arg

-cdef size_type get_num_rows_arg(object arg) except*:
+cdef int64_t get_num_rows_arg(object arg) except*:
     arg = -1 if arg is None else arg
     if not isinstance(arg, int) or arg < -1:
         raise TypeError("num_rows must be an int >= -1")
@@ -337,7 +337,7 @@ cdef orc_reader_options make_orc_reader_options(
     object column_names,
     object stripes,
     int64_t skip_rows,
-    size_type num_rows,
+    int64_t num_rows,
     type_id timestamp_type,
     bool use_index
 ) except*:
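
The C++ options mirror these Cython declarations, so after this patch a row window beyond the 32-bit limit can be requested from either side. A sketch of the corresponding C++ builder calls (the file path is a placeholder for the example):

    #include <cudf/io/orc.hpp>

    auto const opts =
      cudf::io::orc_reader_options::builder(cudf::io::source_info{"/tmp/example.orc"})
        .skip_rows(3'000'000'000LL)  // now int64_t, may exceed the int32 range
        .num_rows(1'000'000LL)
        .build();
    auto const result = cudf::io::read_orc(opts);
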
From 10945a6d212380253d6921ccbca8a0c78989ce91 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 11 Mar 2024 21:18:18 -0700
Subject: [PATCH 236/321] Update copyright year

Signed-off-by: Nghia Truong
---
 python/cudf/cudf/_lib/cpp/io/orc.pxd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd
index 93e3f61142d..93e481be760 100644
--- a/python/cudf/cudf/_lib/cpp/io/orc.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.

 from libc.stdint cimport uint8_t, int64_t
 from libcpp cimport bool

From de5cf15829d8f89b920b3df3a4a515c1d25b12e0 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Tue, 12 Mar 2024 09:13:02 -0700
Subject: [PATCH 237/321] Fix style

Signed-off-by: Nghia Truong
---
 python/cudf/cudf/_lib/cpp/io/orc.pxd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd
index 93e481be760..d5bb1726a43 100644
--- a/python/cudf/cudf/_lib/cpp/io/orc.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

-from libc.stdint cimport uint8_t, int64_t
+from libc.stdint cimport int64_t, uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr

From 10945a6d212380253d6921ccbca8a0c78989ce91 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Tue, 12 Mar 2024 09:50:19 -0700
Subject: [PATCH 238/321] Change benchmark

Signed-off-by: Nghia Truong
---
 cpp/benchmarks/io/orc/orc_reader_input.cpp | 37 +++++++++++++++++-----
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp
index 0503ede62ed..94327d460ae 100644
--- a/cpp/benchmarks/io/orc/orc_reader_input.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp
@@ -123,10 +123,8 @@ void BM_orc_read_data(nvbench::state& state,
   orc_read_common(num_rows_written, source_sink, state);
 }

-template
-void BM_orc_read_io_compression(
-  nvbench::state& state,
-  nvbench::type_list, nvbench::enum_type>)
+template
+void orc_read_io_compression(nvbench::state& state)
 {
   auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED),
                                          static_cast(data_type::FLOAT),
@@ -154,14 +152,29 @@ void BM_orc_read_io_compression(
     return view.num_rows();
   }();

-  auto const is_chunked_read = static_cast(state.get_int64("chunked_read"));
-  if (is_chunked_read) {
+  if constexpr (chunked_read) {
     orc_read_common(num_rows_written, source_sink, state);
   } else {
     orc_read_common(num_rows_written, source_sink, state);
   }
 }

+template
+void BM_orc_read_io_compression(
+  nvbench::state& state,
+  nvbench::type_list, nvbench::enum_type>)
+{
+  return orc_read_io_compression(state);
+}
+
+template
+void BM_orc_chunked_read_io_compression(
+  nvbench::state& state,
+  nvbench::type_list, nvbench::enum_type>)
+{
+  return orc_read_io_compression(state);
+}
+
 using d_type_list = nvbench::enum_type_list

From 7b890947acf9f9fbb411710431a2ea47f901e4a9 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Tue, 12 Mar 2024 10:17:23 -0700
Subject: [PATCH 239/321] Change benchmark

Signed-off-by: Nghia Truong
---
 cpp/benchmarks/io/orc/orc_reader_input.cpp | 36 +++++++++++-----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp
index 94327d460ae..140fbf7f8aa 100644
--- a/cpp/benchmarks/io/orc/orc_reader_input.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp
@@ -47,14 +47,11 @@ void read_once(cudf::io::orc_reader_options const& options,
 template
 void chunked_read(cudf::io::orc_reader_options const& options,
                   cudf::size_type num_rows_to_read,
-                  cudf::size_type appox_num_chunks,
+                  std::size_t output_limit,
+                  std::size_t read_limit,
                   Timer& timer)
 {
-  // Create a chunked reader that has an internal memory limits to process around 10 chunks.
-  auto const output_limit = static_cast(data_size / appox_num_chunks);
-  auto const input_limit  = output_limit * 10;
-
-  auto reader = cudf::io::chunked_orc_reader(output_limit, input_limit, options);
+  auto reader = cudf::io::chunked_orc_reader(output_limit, read_limit, options);

   cudf::size_type num_rows{0};
   timer.start();
@@ -74,20 +71,21 @@ void orc_read_common(cudf::size_type num_rows_to_read,
 {
   auto const read_opts =
     cudf::io::orc_reader_options::builder(source_sink.make_source_info()).build();
-  cudf::size_type constexpr approx_num_chunks = 10;

   auto mem_stats_logger = cudf::memory_stats_logger();  // init stats logger
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
-  state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
-             [&](nvbench::launch&, auto& timer) {
-               try_drop_l3_cache();
-
-               if constexpr (!is_chunked_read) {
-                 read_once(read_opts, num_rows_to_read, timer);
-               } else {
-                 chunked_read(read_opts, num_rows_to_read, approx_num_chunks, timer);
-               }
-             });
+  state.exec(
+    nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch&, auto& timer) {
+      try_drop_l3_cache();
+
+      if constexpr (!is_chunked_read) {
+        read_once(read_opts, num_rows_to_read, timer);
+      } else {
+        auto const output_limit = static_cast(state.get_int64("output_limit"));
+        auto const read_limit   = static_cast(state.get_int64("read_limit"));
+        chunked_read(read_opts, num_rows_to_read, output_limit, read_limit, timer);
+      }
+    });

   auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
   state.add_element_count(static_cast(data_size) / time, "bytes_per_second");
@@ -213,4 +211,6 @@ NVBENCH_BENCH_TYPES(BM_orc_chunked_read_io_compression,
   .set_type_axes_names({"io", "compression"})
   .set_min_samples(4)
   .add_int64_axis("cardinality", {0, 1000})
-  .add_int64_axis("run_length", {1, 32});
+  .add_int64_axis("run_length", {1, 32})
+  .add_int64_axis("output_limit", {0, 500'000})
+  .add_int64_axis("read_limit", {0, 500'000});

From bc34e40013cc5305d21b1b2a41fcbf4f5b15fd5e Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Tue, 12 Mar 2024 15:56:51 -0700
Subject: [PATCH 240/321] Fix python code

Signed-off-by: Nghia Truong
---
 python/cudf/cudf/_lib/orc.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index aaaeb558846..d3c75823471 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -329,7 +329,7 @@ cdef int64_t get_num_rows_arg(object arg) except*:
     arg = -1 if arg is None else arg
     if not isinstance(arg, int) or arg < -1:
         raise TypeError("num_rows must be an int >= -1")
-    return arg
+    return arg
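
For reference, the read loop that `chunked_read` above times follows the usual chunked-reader consumption pattern; roughly (the byte limits here are just the benchmark's new axis values):

    #include <cudf/io/orc.hpp>

    cudf::io::chunked_orc_reader reader(
      500'000 /*output_limit, bytes*/, 500'000 /*read_limit, bytes*/, options);
    cudf::size_type num_rows = 0;
    do {
      auto chunk = reader.read_chunk();   // returns a table_with_metadata
      num_rows += chunk.tbl->num_rows();  // consume, then drop the chunk to free memory
    } while (reader.has_next());
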
From de5ce7811622834bdf04887870642c0c759320a6 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Tue, 12 Mar 2024 16:07:04 -0700
Subject: [PATCH 241/321] Fix spell

Signed-off-by: Nghia Truong
---
 cpp/benchmarks/io/orc/orc_reader_input.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp
index 140fbf7f8aa..8514af28c63 100644
--- a/cpp/benchmarks/io/orc/orc_reader_input.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp
@@ -204,7 +204,7 @@ NVBENCH_BENCH_TYPES(BM_orc_read_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list))
   .add_int64_axis("cardinality", {0, 1000})
   .add_int64_axis("run_length", {1, 32});

-// Should have the same parameters as `BM_orc_read_io_compression` for comparision.
+// Should have the same parameters as `BM_orc_read_io_compression` for comparison.
 NVBENCH_BENCH_TYPES(BM_orc_chunked_read_io_compression,
                     NVBENCH_TYPE_AXES(io_list, compression_list))
   .set_name("orc_chunked_read_io_compression")

From 56750bd4d442d92f2e6463f9fd8b8b84c7afca06 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Tue, 12 Mar 2024 22:10:58 -0700
Subject: [PATCH 242/321] Disable mem stat

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl.cu  | 2 ++
 cpp/src/io/orc/reader_impl.hpp | 6 ++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index ab24b7c1eaf..82e1b1220ba 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -256,7 +256,9 @@ reader::impl::impl(std::size_t output_size_limit,
                    rmm::mr::device_memory_resource* mr)
   : _stream(stream),
     _mr(mr),
+#ifdef LOCAL_TEST
     mem_stats_logger(mr),
+#endif
     _config{options.get_timestamp_type(),
             options.is_enabled_use_index(),
             options.is_enabled_use_np_dtypes(),
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index b3f91e5e92a..2173b90d30a 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -34,13 +34,12 @@

 namespace cudf::io::orc::detail {

+#ifdef LOCAL_TEST
 class memory_stats_logger {
  public:
   explicit memory_stats_logger(rmm::mr::device_memory_resource* mr) : existing_mr(mr)
   {
-#ifdef LOCAL_TEST
     printf("exist mr: %p\n", mr);
-#endif

     statistics_mr =
       std::make_unique>(
@@ -62,6 +61,7 @@ class memory_stats_logger {
   rmm::mr::statistics_resource_adaptor>
     statistics_mr;
 };
+#endif

 struct reader_column_meta;

@@ -184,7 +184,9 @@ class reader::impl {
   rmm::cuda_stream_view const _stream;
   rmm::mr::device_memory_resource* const _mr;

+#ifdef LOCAL_TEST
   memory_stats_logger mem_stats_logger;
+#endif

   // Reader configs.
   struct {

From c97150ec59e409c698d0371470863b931eec06cf Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Thu, 14 Mar 2024 21:54:58 -0700
Subject: [PATCH 243/321] Change memory limits for data loading and decoding

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_chunking.cu  |  2 +-
 cpp/src/io/orc/reader_impl_chunking.hpp | 15 ++++++++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 075002276b8..f58b702ddac 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -748,7 +748,7 @@ void reader::impl::load_data()
     // If `data_read_limit` is too small, make sure not to pass 0 byte limit to compute splits.
     auto const tmp = static_cast(_chunk_read_data.data_read_limit *
-                                 (1.0 - chunk_read_data::load_limit_ratio));
+                                 chunk_read_data::decode_limit_ratio);
     return tmp > 0UL ? tmp : 1UL;
   }();
   _chunk_read_data.decode_stripe_ranges =
diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index d43b5342eba..7e1e08e2d91 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -184,13 +184,18 @@ struct chunk_read_data {
   {
   }

-  // TODO: const for 3 below?
   std::size_t const output_size_limit;  // maximum size (in bytes) of an output chunk, or 0 for no limit
-  std::size_t const data_read_limit;    // approximate maximum size (in bytes) used for store
-                                        // intermediate data, or 0 for no limit
-  size_type const output_row_granularity;  // TODO
-  static double constexpr load_limit_ratio{0.4};  // TODO
+  std::size_t const data_read_limit;    // approximate maximum size (in bytes) used for store
+                                        // intermediate data, or 0 for no limit
+  size_type const output_row_granularity;
+
+  // Memory limits for loading data and decoding are computed as
+  // `load/decode_limit_ratio * data_read_limit`.
+  // This is to maintain the total memory usage to be **around** the given `data_read_limit`.
+  // Note that sum of these limits may not be `1.0`, and their values are set empirically.
+  static double constexpr load_limit_ratio{0.25};
+  static double constexpr decode_limit_ratio{0.6};

   // Chunks of stripes that can be load into memory such that their data size is within a size
   // limit.
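
To make the effect of the new constants concrete, the byte budgets handed to the stripe-splitting steps derive from the user-facing `data_read_limit` roughly as follows; a standalone sketch of the arithmetic (not the reader code itself), including the "never pass a 0-byte limit" clamping seen above:

    #include <algorithm>
    #include <cstddef>

    // ~25% of the read limit for raw stripe data held in memory at once.
    std::size_t load_budget(std::size_t data_read_limit)
    {
      return std::max<std::size_t>(1, static_cast<std::size_t>(data_read_limit * 0.25));
    }

    // ~60% of the read limit for decompression/decoding intermediates.
    std::size_t decode_budget(std::size_t data_read_limit)
    {
      return std::max<std::size_t>(1, static_cast<std::size_t>(data_read_limit * 0.6));
    }
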
From 710734cbdb4b48f30dda6b071d0f6c7cc7ff2708 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Thu, 14 Mar 2024 22:10:12 -0700
Subject: [PATCH 244/321] Fix tests due to changing internal parameters

Signed-off-by: Nghia Truong
---
 cpp/tests/io/orc_chunked_reader_test.cu | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu
index 862324e5aa8..78f0894134c 100644
--- a/cpp/tests/io/orc_chunked_reader_test.cu
+++ b/cpp/tests/io/orc_chunked_reader_test.cu
@@ -1066,7 +1066,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, SingleFixedWidthColumn)
   }

   {
-    int constexpr expected[] = {10, 13, 10};
+    int constexpr expected[] = {17, 13, 10};
     input_limit_test_read(
       __LINE__, test_files, input, output_limit{0UL}, input_limit{2 * 1024 * 1024UL}, expected);
   }
@@ -1102,7 +1102,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumns)
   }

   {
-    int constexpr expected[] = {10, 50, 15};
+    int constexpr expected[] = {17, 50, 17};
     input_limit_test_read(
       __LINE__, test_files, input, output_limit{0UL}, input_limit{2 * 1024 * 1024UL}, expected);
   }
@@ -1169,7 +1169,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, ListType)
   input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows);

   {
-    int constexpr expected[] = {2, 40, 3};
+    int constexpr expected[] = {3, 40, 3};
     input_limit_test_read(
       __LINE__, test_files, input, output_limit{0UL}, input_limit{5 * 1024 * 1024UL}, expected);
   }
@@ -1252,13 +1252,13 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList)
   input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows);

   {
-    int constexpr expected[] = {8, 8, 6};
+    int constexpr expected[] = {13, 8, 6};
     input_limit_test_read(
       __LINE__, test_files, input, output_limit{0UL}, input_limit{128 * 1024 * 1024UL}, expected);
   }

   {
-    int constexpr expected[] = {16, 15, 17};
+    int constexpr expected[] = {13, 15, 17};
     input_limit_test_read(__LINE__,
                           test_files,
                           input,

From cb41a6525840a84dfbf97aca78dabd7ee6e7e6c2 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Fri, 15 Mar 2024 10:07:53 -0700
Subject: [PATCH 245/321] Cleanup

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl.cu          | 127 ++++---------------
 cpp/src/io/orc/reader_impl.hpp         |  48 +++-------
 cpp/src/io/orc/reader_impl_chunking.cu |  58 +++++------
 3 files changed, 53 insertions(+), 180 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 82e1b1220ba..140e4517862 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -14,45 +14,13 @@
  * limitations under the License.
  */

-// TODO: remove
-#include
-
-//
-//
-//
-#include "io/comp/gpuinflate.hpp"
-#include "io/comp/nvcomp_adapter.hpp"
 #include "io/orc/reader_impl.hpp"
 #include "io/orc/reader_impl_chunking.hpp"
 #include "io/orc/reader_impl_helpers.hpp"
-#include "io/utilities/config_utils.hpp"

 #include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
 #include
-#include

 namespace cudf::io::orc::detail {

@@ -61,55 +29,33 @@ void reader::impl::prepare_data(read_mode mode)
   // There are no columns in the table.
   if (_selected_columns.num_levels() == 0) { return; }

-#ifdef LOCAL_TEST
-  std::cout << "call global, skip = " << _config.skip_rows << std::endl;
-#endif
-
+  // This will be no-op if it was called before.
   global_preprocess(mode);

   if (!_chunk_read_data.more_table_chunk_to_output()) {
     if (!_chunk_read_data.more_stripe_to_decode() && _chunk_read_data.more_stripe_to_load()) {
-#ifdef LOCAL_TEST
-      printf("load more data\n\n");
-#endif
-
+      // Only load stripe data if:
+      // - There is more stripe to load, and
+      // - All loaded stripes were decoded, and
+      // - All the decoded results were output.
       load_data();
     }
-
     if (_chunk_read_data.more_stripe_to_decode()) {
-#ifdef LOCAL_TEST
-      printf("decode more data\n\n");
-#endif
-
+      // Only decompress/decode the loaded stripes if:
+      // - There are loaded stripes that were not decoded yet, and
+      // - All the decoded results were output.
       decompress_and_decode();
     }
   }
-
-#ifdef LOCAL_TEST
-  printf("done load and decode data\n\n");
-#endif
 }

 table_with_metadata reader::impl::make_output_chunk()
 {
-#ifdef LOCAL_TEST
-  {
-    _stream.synchronize();
-    auto peak_mem = mem_stats_logger.peak_memory_usage();
-    std::cout << "start to make out, peak_memory_usage: " << peak_mem << "("
-              << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl;
-  }
-#endif
-
   // There is no columns in the table.
   if (_selected_columns.num_levels() == 0) { return {std::make_unique<table>(), table_metadata{}}; }

-  // If no rows or stripes to read, return empty columns
+  // If no rows or stripes to read, return empty columns.
   if (!_chunk_read_data.more_table_chunk_to_output()) {
-#ifdef LOCAL_TEST
-    printf("has no next\n");
-#endif
-
     std::vector> out_columns;
     auto out_metadata = get_meta_with_user_data();
     std::transform(_selected_columns.levels[0].begin(),
@@ -128,43 +74,23 @@ table_with_metadata reader::impl::make_output_chunk()
     return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
   }

-  auto out_table = [&] {
+  auto const make_output_table = [&] {
     if (_chunk_read_data.output_table_ranges.size() == 1) {
-      // Must change the index of output range, so calling `has_next()` after that
-      // can return the correct answer.
+      // Must change the index of the current output range such that calling `has_next()` after
+      // this will return the correct answer (`false`, since there is only one range).
       _chunk_read_data.curr_output_table_range++;
-#ifdef LOCAL_TEST
-      printf("one chunk, no more table---------------------------------\n");
-#endif
-      // If there is no slicing, just hand over the decoded table.
-      return std::move(_chunk_read_data.decoded_table);
-    }
-
-#ifdef LOCAL_TEST
-    {
-      _stream.synchronize();
-      auto peak_mem = mem_stats_logger.peak_memory_usage();
-      std::cout << "prepare to make out, peak_memory_usage: " << peak_mem << "("
-                << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl;
+      // Just hand over the decoded table without slicing.
+      return std::move(_chunk_read_data.decoded_table);
     }
-#endif

+    // The range of rows in the decoded table to output.
     auto const out_range =
       _chunk_read_data.output_table_ranges[_chunk_read_data.curr_output_table_range++];
     auto const out_tview = cudf::detail::slice(
       _chunk_read_data.decoded_table->view(),
       {static_cast(out_range.begin), static_cast(out_range.end)},
       _stream)[0];
-
     auto output = std::make_unique<table>(out_tview, _stream, _mr);

     // If this is the last slice, we also delete the decoded table to free up memory.
     if (!_chunk_read_data.more_table_chunk_to_output()) {
       _chunk_read_data.decoded_table.reset(nullptr);
     }

     return output;
-  }();
-
-#ifdef LOCAL_TEST
-  if (!_chunk_read_data.has_next()) {
-    static int count{0};
-    count++;
-    _stream.synchronize();
-    auto peak_mem = mem_stats_logger.peak_memory_usage();
-    std::cout << "complete, " << count << ", peak_memory_usage: " << peak_mem
-              << " , MB = " << (peak_mem * 1.0) / (1024.0 * 1024.0) << std::endl;
-  } else {
-    _stream.synchronize();
-    auto peak_mem = mem_stats_logger.peak_memory_usage();
-    std::cout << "done, partial, peak_memory_usage: " << peak_mem
-              << " , MB = " << (peak_mem * 1.0) / (1024.0 * 1024.0) << std::endl;
-  }
-#endif
+  };

-  return {std::move(out_table), _out_metadata};
+  return {make_output_table(), table_metadata{_out_metadata} /*copy cached metadata*/};
 }

 table_metadata reader::impl::get_meta_with_user_data()
@@ -256,9 +166,6 @@ reader::impl::impl(std::size_t output_size_limit,
                    rmm::mr::device_memory_resource* mr)
   : _stream(stream),
     _mr(mr),
-#ifdef LOCAL_TEST
-    mem_stats_logger(mr),
-#endif
     _config{options.get_timestamp_type(),
             options.is_enabled_use_index(),
             options.is_enabled_use_np_dtypes(),
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index 2173b90d30a..45d60acb3db 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -24,7 +24,6 @@
 #include
 #include
-#include

 // TODO: remove
 #include
@@ -34,35 +33,6 @@

 namespace cudf::io::orc::detail {

-#ifdef LOCAL_TEST
-class memory_stats_logger {
- public:
-  explicit memory_stats_logger(rmm::mr::device_memory_resource* mr) : existing_mr(mr)
-  {
-    printf("exist mr: %p\n", mr);
-
-    statistics_mr =
-      std::make_unique>(
-        existing_mr);
-
-    rmm::mr::set_current_device_resource(statistics_mr.get());
-  }
-
-  ~memory_stats_logger() { rmm::mr::set_current_device_resource(existing_mr); }
-
-  [[nodiscard]] size_t peak_memory_usage() const noexcept
-  {
-    return statistics_mr->get_bytes_counter().peak;
-  }
-
- private:
-  rmm::mr::device_memory_resource* existing_mr;
-  static inline std::unique_ptr<
-    rmm::mr::statistics_resource_adaptor>
-    statistics_mr;
-};
-#endif
-
 struct reader_column_meta;

 /**
@@ -73,6 +43,9 @@ class reader::impl {
   /**
    * @brief Constructor from a dataset source with reader options.
    *
+   * This constructor will call the other constructor with `output_size_limit` and `data_read_limit`
+   * set to `0` and `output_row_granularity` set to `DEFAULT_OUTPUT_ROW_GRANULARITY`.
+   *
    * @param sources Dataset sources
    * @param options Settings for controlling reading behavior
    * @param stream CUDA stream used for device memory operations and kernel launches
@@ -84,7 +57,8 @@ class reader::impl {
                 rmm::mr::device_memory_resource* mr);

   /**
-   * @copydoc cudf::io::orc::detail::chunked_reader
+   * @copydoc cudf::io::orc::detail::chunked_reader::chunked_reader(std::size_t, std::size_t,
+   * orc_reader_options const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
    */
   explicit impl(std::size_t output_size_limit,
                 std::size_t data_read_limit,
@@ -93,6 +67,10 @@ class reader::impl {
                 rmm::cuda_stream_view stream,
                 rmm::mr::device_memory_resource* mr);

+  /**
+   * @copydoc cudf::io::orc::detail::chunked_reader::chunked_reader(std::size_t, std::size_t,
+   * size_type, orc_reader_options const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
+   */
   explicit impl(std::size_t output_size_limit,
                 std::size_t data_read_limit,
                 size_type output_row_granularity,
@@ -140,7 +118,7 @@ class reader::impl {
    * data streams of the selected columns in all stripes are generated. If the reader has a data
    * read limit, sizes of these streams are used to split the list of all stripes into multiple
    * subsets, each of which will be read into memory in the `load_data()` step. These subsets are
-   * computed such that memory usage will be capped around a fixed size limit.
+   * computed such that memory usage will be kept to be around a fixed size limit.
    *
    * @param mode Value indicating if the data sources are read all at once or chunk by chunk
    */
@@ -184,10 +162,6 @@ class reader::impl {
   rmm::cuda_stream_view const _stream;
   rmm::mr::device_memory_resource* const _mr;

-#ifdef LOCAL_TEST
-  memory_stats_logger mem_stats_logger;
-#endif
-
   // Reader configs.
   struct {
     data_type timestamp_type;  // override output timestamp resolution
@@ -215,7 +189,7 @@ class reader::impl {
   std::vector> _out_buffers;

   // The default value used for subdividing the decoded table for final output.
-  static constexpr size_type DEFAULT_OUTPUT_ROW_GRANULARITY = 10'000;
+  static inline constexpr size_type DEFAULT_OUTPUT_ROW_GRANULARITY = 10'000;
 };

 }  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index f58b702ddac..5635caa58ec 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -188,8 +188,8 @@ std::vector find_splits(host_span cumulative_sizes,
   auto const end = start + cumulative_sizes.size();

   while (cur_count < total_count) {
-    int64_t split_pos =
-      thrust::distance(start, thrust::lower_bound(thrust::seq, start + cur_pos, end, size_limit));
+    int64_t split_pos = static_cast(
+      thrust::distance(start, thrust::lower_bound(thrust::seq, start + cur_pos, end, size_limit)));

     // If we're past the end, or if the returned range has size exceeds the given size limit,
     // move back one position.
@@ -208,9 +208,9 @@ std::vector find_splits(host_span cumulative_sizes,
       }
     }

-    // In case we have moved back too in the steps above, far beyond the last split point: that
-    // means we cannot find any range that has size fits within the given size limit.
-    // In such case, we need to move forward until we move pass the last output range.
+    // In case we have moved back too much in the steps above, far beyond the last split point, that
+    // means we could not find any range that has size fits within the given size limit.
+    // In such situations, we need to move forward until we move past the last output range.
     while (split_pos < (static_cast(cumulative_sizes.size()) - 1) &&
            (split_pos < 0 || cumulative_sizes[split_pos].count <= cur_count)) {
       split_pos++;
@@ -227,7 +227,7 @@ std::vector find_splits(host_span cumulative_sizes,
     }
   }

-  // If the last range has size smaller than `merge_threshold` percent of the second last one,
+  // If the last range has size smaller than `merge_threshold` times the size of the second last one,
   // merge it with the second last one.
   // This is to prevent having too small trailing range.
   if (splits.size() > 1) {
@@ -243,6 +243,8 @@ std::vector find_splits(host_span cumulative_sizes,
   return splits;
 }

+// Since `find_splits` is a template function, we need to explicitly instantiate it so it can be
+// used outside of this TU.
 template std::vector find_splits(host_span sizes,
                                         std::size_t total_count,
                                         std::size_t size_limit);
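
The splitting strategy patched above, in isolation: given prefix-summed sizes, repeatedly binary-search for the largest prefix that still fits the limit and cut there, always making progress even when a single element exceeds the limit. A simplified std-only sketch of the idea (it omits the trailing-range merge and the {count, size} pairing of the real `find_splits`):

    #include <algorithm>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // Split inclusive prefix sums into half-open index ranges of at most `limit` bytes each.
    std::vector<std::pair<std::size_t, std::size_t>> split(
      std::vector<std::size_t> const& prefix_sums, std::size_t limit)
    {
      std::vector<std::pair<std::size_t, std::size_t>> ranges;
      std::size_t begin = 0, base = 0;
      while (begin < prefix_sums.size()) {
        // First element whose prefix exceeds the budget for this range.
        auto const it = std::upper_bound(prefix_sums.begin() + begin, prefix_sums.end(), base + limit);
        // Always take at least one element so an oversized element still forms its own range.
        auto const end =
          std::max<std::size_t>(begin + 1, static_cast<std::size_t>(it - prefix_sums.begin()));
        ranges.emplace_back(begin, end);
        base  = prefix_sums[end - 1];
        begin = end;
      }
      return ranges;
    }
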
@@ -264,7 +266,9 @@ void reader::impl::global_preprocess(read_mode mode)
   if (_file_itm_data.global_preprocessed) { return; }
   _file_itm_data.global_preprocessed = true;

-  // Load stripes's metadata.
+  //
+  // Load stripes' metadata:
+  //
   std::tie(
     _file_itm_data.rows_to_skip, _file_itm_data.rows_to_read, _file_itm_data.selected_stripes) =
     _metadata.select_stripes(
@@ -274,30 +278,15 @@ void reader::impl::global_preprocess(read_mode mode)
   CUDF_EXPECTS(
     mode == read_mode::CHUNKED_READ ||
       _file_itm_data.rows_to_read <= static_cast(std::numeric_limits::max()),
-    "Number or rows to read exceeds the column size limit in READ_ALL mode.",
+    "READ_ALL mode does not support reading number of rows more than cudf's column size limit.",
     std::overflow_error);

-#ifdef LOCAL_TEST
-  {
-    auto const skip_rows    = _config.skip_rows;
-    auto const num_rows_opt = _config.num_read_rows;
-    printf("input skip rows: %ld, num rows: %ld\n", skip_rows, num_rows_opt.value_or(-1l));
-    printf("actual skip rows: %ld, num rows: %ld\n",
-           _file_itm_data.rows_to_skip,
-           _file_itm_data.rows_to_read);
-  }
-#endif
-
   auto const& selected_stripes = _file_itm_data.selected_stripes;
   auto const num_total_stripes = selected_stripes.size();
   auto const num_levels        = _selected_columns.num_levels();

-#ifdef LOCAL_TEST
-  printf("num load stripe: %d\n", (int)num_total_stripes);
-#endif
-
   //
-  // Pre allocate necessary memory for data processed in the next steps:
+  // Pre allocate necessary memory for data processed in the other reading steps:
   //
   auto& stripe_data_read_ranges = _file_itm_data.stripe_data_read_ranges;
   stripe_data_read_ranges.resize(num_total_stripes);
@@ -320,6 +309,10 @@ void reader::impl::global_preprocess(read_mode mode)
   auto& read_info = _file_itm_data.data_read_info;
   auto& col_meta  = *_col_meta;

+  //
+  // Collect columns' types.
+  //
+
   for (std::size_t level = 0; level < num_levels; ++level) {
     lvl_stripe_sizes[level].resize(num_total_stripes);
     lvl_stripe_stream_ranges[level].resize(num_total_stripes);
@@ -372,10 +365,10 @@ void reader::impl::global_preprocess(read_mode mode)
   }

   //
-  // Load all stripes' metadata.
+  // Collect all data streams' information:
   //

-  // Collect total data size for all data streams in each stripe.
+  // Accumulate data size for data streams in each stripe.
   cudf::detail::hostdevice_vector total_stripe_sizes(num_total_stripes, _stream);

   for (std::size_t stripe_global_idx = 0; stripe_global_idx < num_total_stripes;
@@ -384,11 +377,10 @@ void reader::impl::global_preprocess(read_mode mode)
     auto const stripe_info   = stripe.stripe_info;
     auto const stripe_footer = stripe.stripe_footer;

-    std::size_t stripe_size{0};
+    std::size_t this_stripe_size{0};
     auto const last_read_size = read_info.size();
     for (std::size_t level = 0; level < num_levels; ++level) {
-      auto& stream_info  = _file_itm_data.lvl_stream_info[level];
-      auto& stripe_sizes = lvl_stripe_sizes[level];
+      auto& stream_info = _file_itm_data.lvl_stream_info[level];

       auto stream_level_count = stream_info.size();
       auto const stripe_level_size =
@@ -401,7 +393,7 @@ void reader::impl::global_preprocess(read_mode mode)
           false,  // use_index,
           level == 0,
           nullptr,  // num_dictionary_entries
          nullptr,  // local_stream_order
           &stream_info,
           std::nullopt  // chunks
         );
@@ -410,8 +402,8 @@ void reader::impl::global_preprocess(read_mode mode)
       CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0,
                    "Invalid index rowgroup stream data");

-      stripe_sizes[stripe_global_idx] = stripe_level_size;
-      stripe_size += stripe_level_size;
+      lvl_stripe_sizes[level][stripe_global_idx] = stripe_level_size;
+      this_stripe_size += stripe_level_size;

+      // Range of the streams in `stream_info` corresponding to this stripe at the current level.
       lvl_stripe_stream_ranges[level][stripe_global_idx] =
         range{stream_level_count, stream_info.size()};

-      // Coalesce consecutive streams into one read
+      // Coalesce consecutive streams into one read.
       while (not is_stripe_data_empty and stream_level_count < stream_info.size()) {
         auto const d_dst  = stream_info[stream_level_count].dst_pos;
         auto const offset = stream_info[stream_level_count].offset;
@@ -422,8 +423,11 @@ void reader::impl::global_preprocess(read_mode mode)
         }
         read_info.emplace_back(offset, d_dst, len, stripe.source_idx, stripe_global_idx, level);
       }
-    }
-    total_stripe_sizes[stripe_global_idx] = {1, stripe_size};
+    }  // end loop level
+
+    total_stripe_sizes[stripe_global_idx] = {1, this_stripe_size};
+
+    // Range of all stream reads in `read_info` corresponding to this stripe, in all levels.
     stripe_data_read_ranges[stripe_global_idx] = range{last_read_size, read_info.size()};
   }
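
The `range{begin, end}` bookkeeping above flattens per-stripe items (streams, reads) into one vector and records, for each stripe, the half-open index range of its items; selecting several consecutive stripes then reduces to merging the first and last ranges. A sketch of what the reader's `get_range()` helper computes, assuming the per-stripe ranges are consecutive:

    #include <cstddef>
    #include <vector>

    struct range { std::size_t begin, end; };  // half-open [begin, end)

    // Items belonging to stripes [selected.begin, selected.end) in the flattened vector.
    range get_range_sketch(std::vector<range> const& per_stripe, range selected)
    {
      return range{per_stripe[selected.begin].begin, per_stripe[selected.end - 1].end};
    }
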
stripe_data_read_ranges[stripe_global_idx] = range{last_read_size, read_info.size()}; } @@ -433,26 +437,12 @@ void reader::impl::global_preprocess(read_mode mode) _chunk_read_data.curr_load_stripe_range = 0; - // Load all chunks if there is no read limit. + // Load all stripes if there is no read limit. if (_chunk_read_data.data_read_limit == 0) { -#ifdef LOCAL_TEST - printf("0 limit: output load stripe chunk = 0, %d\n", (int)num_total_stripes); -#endif - - _chunk_read_data.load_stripe_ranges = {range{0ul, num_total_stripes}}; + _chunk_read_data.load_stripe_ranges = {range{0UL, num_total_stripes}}; return; } -#ifdef LOCAL_TEST - printf("total stripe sizes:\n"); - int count{0}; - for (auto& size : total_stripe_sizes) { - ++count; - printf("size: %ld, %zu\n", size.count, size.size_bytes); - if (count > 5) break; - } -#endif - // TODO: exec_policy_nosync // Compute the prefix sum of stripes' data sizes. total_stripe_sizes.host_to_device_async(_stream); @@ -461,36 +451,17 @@ void reader::impl::global_preprocess(read_mode mode) total_stripe_sizes.d_end(), total_stripe_sizes.d_begin(), cumulative_size_sum{}); - total_stripe_sizes.device_to_host_sync(_stream); -#ifdef LOCAL_TEST - count = 0; - printf("prefix sum total stripe sizes:\n"); - for (auto& size : total_stripe_sizes) { - ++count; - printf("size: %ld, %zu\n", size.count, size.size_bytes); - if (count > 5) break; - } -#endif - auto const load_limit = [&] { auto const tmp = static_cast(_chunk_read_data.data_read_limit * chunk_read_data::load_limit_ratio); - // Make sure not to pass 0 byte limit (due to round-off) to compute splits. + // Make sure not to pass 0 byte limit (due to round-off) to `find_splits`. return tmp > 0UL ? tmp : 1UL; }(); + _chunk_read_data.load_stripe_ranges = find_splits(total_stripe_sizes, num_total_stripes, load_limit); - -#ifdef LOCAL_TEST - auto& splits = _chunk_read_data.load_stripe_ranges; - printf("------------\nSplits (/total num stripe = %d): \n", (int)num_total_stripes); - for (size_t idx = 0; idx < splits.size(); idx++) { - printf("{%ld, %ld}\n", splits[idx].begin, splits[idx].end); - } - fflush(stdout); -#endif } // Load each chunk from `load_stripe_chunks`. From d68562c9a7ed1a9cb16f2bba28434ccb36c6185a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 15 Mar 2024 11:35:12 -0700 Subject: [PATCH 247/321] Fix a bug in stripe rows computation Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 39 ++++++++++++-------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index da7eacf24ed..ca6f64b94e4 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -464,7 +464,6 @@ void reader::impl::global_preprocess(read_mode mode) find_splits(total_stripe_sizes, num_total_stripes, load_limit); } -// Load each chunk from `load_stripe_chunks`. void reader::impl::load_data() { if (_file_itm_data.has_no_data()) { return; } @@ -475,16 +474,11 @@ void reader::impl::load_data() auto const stripe_end = load_stripe_range.end; auto const stripe_count = stripe_end - stripe_start; - auto const num_levels = _selected_columns.num_levels(); - -#ifdef LOCAL_TEST - printf("\n\nloading data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); -#endif - auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; + auto const num_levels = _selected_columns.num_levels(); // Prepare the buffer to read raw data onto. 
- for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + for (std::size_t level = 0; level < num_levels; ++level) { auto& stripe_data = lvl_stripe_data[level]; stripe_data.resize(stripe_count); @@ -499,12 +493,12 @@ void reader::impl::load_data() // Load stripe data into memory: // - // After loading data from sources into host buffers, we need to transfer (async) data to device. - // Such host buffers need to be kept alive until we sync device. + // If we load data from sources into host buffers, we need to transfer (async) data to device + // memory. Such host buffers need to be kept alive until we sync the transfers. std::vector> host_read_buffers; - // If we load data directly from sources into device, we also need to the entire read tasks. - // Thus, we need to keep all read tasks alive and sync all together. + // If we load data directly from sources into device, the loads are also async. + // Thus, we need to make sure to sync all them at the end. std::vector, std::size_t>> read_tasks; auto const [read_begin, read_end] = @@ -543,7 +537,7 @@ void reader::impl::load_data() // Split list of all stripes into subsets that be loaded separately without blowing up memory: // - // A map from stripe source into `CompressedStreamInfo*` pointer. + // A map from a stripe sources into `CompressedStreamInfo*` pointers. // These pointers are then used to retrieve stripe/level decompressed sizes for later // decompression and decoding. stream_source_map stream_compinfo_map; @@ -551,17 +545,20 @@ void reader::impl::load_data() // For estimating the decompressed sizes of the loaded stripes. cudf::detail::hostdevice_vector stripe_decomp_sizes(stripe_count, _stream); - std::size_t num_loaded_stripes{0}; - for (std::size_t stripe_idx = 0; stripe_idx < stripe_count; ++stripe_idx) { - auto const& stripe = _file_itm_data.selected_stripes[stripe_idx]; - auto const stripe_info = stripe.stripe_info; - stripe_decomp_sizes[stripe_idx] = cumulative_size_and_row{1, 0, stripe_info->numberOfRows}; - num_loaded_stripes += stripe_info->numberOfRows; + + // Number of rows in the loading stripes. + std::size_t num_loading_rows{0}; + + for (std::size_t idx = 0; idx < stripe_count; ++idx) { + auto const& stripe = _file_itm_data.selected_stripes[idx + stripe_start]; + auto const stripe_info = stripe.stripe_info; + stripe_decomp_sizes[idx] = cumulative_size_and_row{1, 0, stripe_info->numberOfRows}; + num_loading_rows += stripe_info->numberOfRows; } auto& compinfo_map = _file_itm_data.compinfo_map; - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + for (std::size_t level = 0; level < num_levels; ++level) { auto const& stream_info = _file_itm_data.lvl_stream_info[level]; auto const num_columns = _selected_columns.levels[level].size(); @@ -661,7 +658,7 @@ void reader::impl::load_data() if (_chunk_read_data.data_read_limit == 0 && // In addition to not have any read limit, we also need to check if the the total number of // rows in the loaded stripes exceeds column size limit. 
- num_loaded_stripes < static_cast(std::numeric_limits::max())) { + num_loading_rows < static_cast(std::numeric_limits::max())) { #ifdef LOCAL_TEST printf("0 limit: output decode stripe chunk unchanged\n"); #endif From 86ec4367951749db57ac23de18e33f56ec18eaa4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 15 Mar 2024 12:06:32 -0700 Subject: [PATCH 248/321] Cleanup `reader_impl_chunking.cu` Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 169 +++++------------------- cpp/src/io/orc/reader_impl_chunking.hpp | 22 +-- 2 files changed, 47 insertions(+), 144 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index ca6f64b94e4..e5d17204fb9 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -15,60 +15,26 @@ */ #include "io/comp/gpuinflate.hpp" -#include "io/comp/nvcomp_adapter.hpp" #include "io/orc/reader_impl.hpp" #include "io/orc/reader_impl_chunking.hpp" #include "io/orc/reader_impl_helpers.hpp" -#include "io/utilities/config_utils.hpp" -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include #include -#include #include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include +#include +#include #include -#include -#include -#include - -// -// -// -#include - -#include -// -// -// +#include namespace cudf::io::orc::detail { std::size_t gather_stream_info_and_column_desc( - std::size_t global_stripe_order, + std::size_t stripe_order, std::size_t level, orc::StripeInformation const* stripeinfo, orc::StripeFooter const* stripefooter, @@ -124,7 +90,7 @@ std::size_t gather_stream_info_and_column_desc( if (child_idx >= 0) { col = child_idx; if (chunks.has_value()) { - auto& chunk = (*chunks.value())[global_stripe_order][col]; + auto& chunk = (*chunks.value())[stripe_order][col]; chunk.strm_id[gpu::CI_PRESENT] = *local_stream_order; chunk.strm_len[gpu::CI_PRESENT] = stream.length; } @@ -136,7 +102,7 @@ std::size_t gather_stream_info_and_column_desc( if (src_offset >= stripeinfo->indexLength || use_index) { auto const index_type = get_stream_index_type(stream.kind); if (index_type < gpu::CI_NUM_STREAMS) { - auto& chunk = (*chunks.value())[global_stripe_order][col]; + auto& chunk = (*chunks.value())[stripe_order][col]; chunk.strm_id[index_type] = *local_stream_order; chunk.strm_len[index_type] = stream.length; // NOTE: skip_count field is temporarily used to track the presence of index streams @@ -157,7 +123,7 @@ std::size_t gather_stream_info_and_column_desc( stripeinfo->offset + src_offset, dst_offset, stream.length, - stream_source_info{global_stripe_order, level, column_id, stream.kind}); + stream_source_info{stripe_order, level, column_id, stream.kind}); } dst_offset += stream.length; @@ -432,9 +398,11 @@ void reader::impl::global_preprocess(read_mode mode) } // - // Split list of all stripes into subsets that be loaded separately without blowing up memory: + // Split range of all stripes into subranges that can be loaded separately without blowing up + // memory: // + // Load range is reset to start from the first position in `load_stripe_ranges`. _chunk_read_data.curr_load_stripe_range = 0; // Load all stripes if there is no read limit. @@ -497,27 +465,28 @@ void reader::impl::load_data() // memory. Such host buffers need to be kept alive until we sync the transfers. 
std::vector> host_read_buffers; - // If we load data directly from sources into device, the loads are also async. + // If we load data directly from sources into device memory, the loads are also async. // Thus, we need to make sure to sync all them at the end. std::vector, std::size_t>> read_tasks; + // Range of the read info (offset, length) to read for the current being loaded stripes. auto const [read_begin, read_end] = get_range(_file_itm_data.stripe_data_read_ranges, load_stripe_range); for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) { auto const& read_info = _file_itm_data.data_read_info[read_idx]; - auto const source = _metadata.per_file_metadata[read_info.source_idx].source; + auto const source_ptr = _metadata.per_file_metadata[read_info.source_idx].source; auto const dst_base = static_cast( lvl_stripe_data[read_info.level][read_info.stripe_idx - stripe_start].data()); - if (source->is_device_read_preferred(read_info.length)) { + if (source_ptr->is_device_read_preferred(read_info.length)) { read_tasks.push_back( - std::pair(source->device_read_async( + std::pair(source_ptr->device_read_async( read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), read_info.length)); } else { - auto buffer = source->host_read(read_info.offset, read_info.length); + auto buffer = source_ptr->host_read(read_info.offset, read_info.length); CUDF_EXPECTS(buffer->size() == read_info.length, "Unexpected discrepancy in bytes read."); CUDF_CUDA_TRY(cudaMemcpyAsync(dst_base + read_info.dst_pos, buffer->data(), @@ -528,13 +497,17 @@ void reader::impl::load_data() } } - if (host_read_buffers.size() > 0) { _stream.synchronize(); } + if (host_read_buffers.size() > 0) { + _stream.synchronize(); + host_read_buffers.clear(); + } for (auto& task : read_tasks) { CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); } // - // Split list of all stripes into subsets that be loaded separately without blowing up memory: + // Split range of loaded stripes into subranges that can be decoded separately without blowing up + // memory: // // A map from a stripe sources into `CompressedStreamInfo*` pointers. @@ -565,6 +538,7 @@ void reader::impl::load_data() auto& stripe_data = lvl_stripe_data[level]; if (stripe_data.empty()) { continue; } + // Range of all streams in the loaded stripes. auto const stream_range = get_range(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range); auto const num_streams = stream_range.end - stream_range.begin; @@ -572,8 +546,8 @@ void reader::impl::load_data() if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; - // Cannot be cached as-is, since this is for streams in a loaded stripe range, while - // the latter decompression/decoding step will use a different stripe range. + // Cannot be cached as-is, since this is for streams in the current loaded stripe range, + // while the decompression/decoding step would probably use just a subrange of it. 
cudf::detail::hostdevice_vector compinfo(0, num_streams, _stream); for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { @@ -585,17 +559,6 @@ void reader::impl::load_data() stream_compinfo_map[stream_source_info{ info.source.stripe_idx, info.source.level, info.source.orc_col_idx, info.source.kind}] = &compinfo.back(); - -#ifdef LOCAL_TEST - printf("collec stream [%d, %d, %d, %d]: dst = %lu, length = %lu\n", - (int)info.source.stripe_idx, - (int)info.source.level, - (int)info.source.orc_col_idx, - (int)info.source.kind, - info.dst_pos, - info.length); - fflush(stdout); -#endif } compinfo.host_to_device_async(_stream); @@ -613,38 +576,17 @@ void reader::impl::load_data() stream_compinfo->max_uncompressed_size}; stripe_decomp_sizes[stream_id.stripe_idx - stripe_start].size_bytes += stream_compinfo->max_uncompressed_size; - -#ifdef LOCAL_TEST - printf("cache info [%d, %d, %d, %d]: %lu | %lu | %lu\n", - (int)stream_id.stripe_idx, - (int)stream_id.level, - (int)stream_id.orc_col_idx, - (int)stream_id.kind, - (size_t)stream_compinfo->num_compressed_blocks, - (size_t)stream_compinfo->num_uncompressed_blocks, - stream_compinfo->max_uncompressed_size); - fflush(stdout); -#endif } - // Important: must clear this map since the next level will have similar keys. + // Important: must clear this map to reuse the (empty) map for processing the next level. stream_compinfo_map.clear(); - - } else { -#ifdef LOCAL_TEST - printf("no compression \n"); - fflush(stdout); -#endif - - // Set decompression size equal to the input size. + } else { // no decompression + // Set decompression sizes equal to the input sizes. for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { auto const& info = stream_info[stream_idx]; stripe_decomp_sizes[info.source.stripe_idx - stripe_start].size_bytes += info.length; } } - - // printf(" end level %d\n\n", (int)level); - } // end loop level // Decoding range is reset to start from the first position in `decode_stripe_ranges`. @@ -652,34 +594,18 @@ void reader::impl::load_data() // Decode all loaded stripes if there is no read limit. // In theory, we should just decode enough stripes for output one table chunk, instead of - // decoding all stripes like this. + // decoding all stripes like this, for better load-balancing and reduce memory usage. // However, we do not know how many stripes are 'enough' because there is not any simple and - // cheap way to compute the exact decoded sizes of stripes. + // cheap way to compute the exact decoded sizes of stripes without actually decoding them. if (_chunk_read_data.data_read_limit == 0 && - // In addition to not have any read limit, we also need to check if the the total number of + // In addition to read limit, we also need to check if the the total number of // rows in the loaded stripes exceeds column size limit. + // If that is the case, we cannot read all stripes at once. 
num_loading_rows < static_cast<int64_t>(std::numeric_limits<size_type>::max())) {
-#ifdef LOCAL_TEST
-    printf("0 limit: output decode stripe chunk unchanged\n");
-#endif
-
     _chunk_read_data.decode_stripe_ranges = {load_stripe_range};
     return;
   }
 
-#ifdef LOCAL_TEST
-  // TODO: remove
-  if (_chunk_read_data.data_read_limit == 0) { printf("0 limit but size overflow\n"); }
-
-  {
-    int count{0};
-    for (auto& size : stripe_decomp_sizes) {
-      printf("decomp stripe size: %ld, %zu, %zu\n", size.count, size.size_bytes, size.rows);
-      if (count++ > 5) break;
-    }
-  }
-#endif
-
   // TODO: exec_policy_nosync
   // Compute the prefix sum of stripe data sizes and rows.
   stripe_decomp_sizes.host_to_device_async(_stream);
@@ -690,50 +616,27 @@
       cumulative_size_sum{});
   stripe_decomp_sizes.device_to_host_sync(_stream);
 
-#ifdef LOCAL_TEST
-  {
-    int count{0};
-    for (auto& size : stripe_decomp_sizes) {
-      printf(
-        "prefix sum decomp stripe size: %ld, %zu, %zu\n", size.count, size.size_bytes, size.rows);
-      if (count++ > 5) break;
-    }
-  }
-#endif
-
   auto const decode_limit = [&] {
     // In this case, we have no read limit but have to split due to having number of rows in loaded
    // stripes exceeds column size limit. So we will split based on row number, not data size.
    if (_chunk_read_data.data_read_limit == 0) { return std::numeric_limits<std::size_t>::max(); }
 
    // If `data_read_limit` is too small, make sure not to pass 0 byte limit to `find_splits`.
    auto const tmp = static_cast<std::size_t>(_chunk_read_data.data_read_limit *
                                              chunk_read_data::decode_limit_ratio);
    return tmp > 0UL ? tmp : 1UL;
  }();
+
  _chunk_read_data.decode_stripe_ranges =
    find_splits(stripe_decomp_sizes, stripe_count, decode_limit);
 
  // The split ranges always start from zero.
-  // We need to update the ranges to start from `stripe_start` which is covererd by the current
-  // range of loaded stripes.
+  // We need to change these ranges to start from `stripe_start`, so that they become the correct
+  // subranges of the currently loaded stripe range.
  for (auto& range : _chunk_read_data.decode_stripe_ranges) {
    range.begin += stripe_start;
    range.end += stripe_start;
  }
-
-#ifdef LOCAL_TEST
-  auto& splits = _chunk_read_data.decode_stripe_ranges;
-  printf("------------\nSplits decode_stripe_chunks (/%d): \n", (int)stripe_count);
-  for (size_t idx = 0; idx < splits.size(); idx++) {
-    printf("{%ld, %ld}\n", splits[idx].begin, splits[idx].end);
-  }
-
-  auto peak_mem = mem_stats_logger.peak_memory_usage();
-  std::cout << "load, peak_memory_usage: " << peak_mem << "("
-            << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl;
-  fflush(stdout);
-#endif
 }
 
 }  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index 7e1e08e2d91..f77ca173d35 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -298,28 +298,28 @@ range get_range(std::vector<range> const& input_ranges, range const& selected_ra
 * @brief Function that populates descriptors for either individual streams or chunks of column
 * data, but not both.
 *
- * This function is used in the global step, to gather information for streams of all stripes in
- * the data sources (when `stream_info` is present). Later on, it is used again to populate column
- * descriptors (`chunks` is present) during decompression and decoding.
The two steps share
- * most of the execution path thus this function takes mutually exclusive parameters `stream_info`
- * or `chunks` depending on each use case.
+ * This function is first used in the global step, to gather information for streams of all
+ * stripes in the data sources (when `stream_info` is present). Later on, it is used again to
+ * populate column descriptors (`chunks` is present) during decompression and decoding. The two
+ * steps share most of the execution path; thus, this function takes mutually exclusive parameters
+ * `stream_info` or `chunks` depending on each use case.
 *
- * @param global_stripe_order The global index of the current decoding stripe
- * @param level The nested level of the current decoding column
- * @param stripeinfo The pointer to current decoding stripe's information
- * @param stripefooter The pointer to current decoding stripe's footer
+ * @param stripe_order The index of the current stripe, which can be a global index or a local
+ * decoding index
+ * @param level The current processing nested level
+ * @param stripeinfo The pointer to current stripe's information
+ * @param stripefooter The pointer to current stripe's footer
 * @param orc2gdf The mapping from ORC column ids to gdf column ids
 * @param types The schema type
 * @param use_index Whether to use the row index for parsing
 * @param apply_struct_map Indicating if this is the root level
 * @param num_dictionary_entries The number of dictionary entries
- * @param local_stream_order For retrieving 0-based orders of streams in the current decoding step
+ * @param local_stream_order For retrieving 0-based orders of streams in the decoding step
 * @param stream_info The vector of streams' information
 * @param chunks The vector of column descriptors
 * @return The number of bytes in the gathered streams
 */
std::size_t gather_stream_info_and_column_desc(
-  std::size_t global_stripe_order,
+  std::size_t stripe_order,
   std::size_t level,
   orc::StripeInformation const* stripeinfo,
   orc::StripeFooter const* stripefooter,
From 0f78b0d930b0f121b9bb8aa89e88d059ed0d62fb Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Fri, 15 Mar 2024 13:00:15 -0700
Subject: [PATCH 249/321] Cleanup `reader_impl_decode.cu`

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_decode.cu | 332 +++------------------------
 1 file changed, 30 insertions(+), 302 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 82ca39e6e57..2aab392cd6b 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -14,15 +14,6 @@
 * limitations under the License.
 */
 
-// #define PRINT_DEBUG
-
-// TODO: remove
-#include 
-
-#include 
-//
-//
-
 #include "io/comp/gpuinflate.hpp"
 #include "io/comp/nvcomp_adapter.hpp"
 #include "io/orc/reader_impl.hpp"
@@ -36,7 +27,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #include 
 #include 
 
@@ -45,7 +35,6 @@
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -55,7 +44,7 @@
 #include 
 #include 
 
-#include 
+#include 
 
 namespace cudf::io::orc::detail {
 
@@ -65,7 +54,7 @@ namespace {
 * @brief Decompresses the stripe data, at stream granularity.
 *
 * Only the streams in the provided `stream_range` are decoded. That range is determined in
- * the previous steps, after splitting stripes into subsets to maintain memory usage to be
+ * the previous steps, after splitting stripes into ranges to keep memory usage
 * under data read limit.
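 *
 * As an illustrative scenario: if stripes [0, 10) are currently loaded in memory but this call
 * is given a `stream_range` covering only stripes [3, 5), then just the streams belonging to
 * stripes 3 and 4 are decompressed.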
 *
 * @param loaded_stripe_range Range of stripes that are already loaded in memory
@@ -107,27 +96,14 @@ rmm::device_buffer decompress_stripe_data(
   for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) {
     auto const& info = stream_info[stream_idx];
 
-#ifdef LOCAL_TEST
-//     printf("collec stream again [%d, %d, %d, %d]: dst = %lu, length = %lu\n",
-//            (int)info.source.stripe_idx,
-//            (int)info.source.level,
-//            (int)info.source.orc_col_idx,
-//            (int)info.source.kind,
-//            info.dst_pos,
-//            info.length);
-//     fflush(stdout);
-#endif
-
     compinfo.push_back(gpu::CompressedStreamInfo(
       static_cast<uint8_t const*>(
         stripe_data[info.source.stripe_idx - loaded_stripe_range.begin].data()) +
         info.dst_pos,
       info.length));
-    auto const& cached_comp_info = compinfo_map.at(stream_source_info{
-      info.source.stripe_idx, info.source.level, info.source.orc_col_idx, info.source.kind});
-    auto& stream_comp_info = compinfo.back();
-
+    auto const& cached_comp_info = compinfo_map.at(info.source);
+    auto& stream_comp_info       = compinfo.back();
     stream_comp_info.num_compressed_blocks   = cached_comp_info.num_compressed_blocks;
     stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks;
     stream_comp_info.max_uncompressed_size   = cached_comp_info.total_decomp_size;
@@ -141,52 +117,12 @@ rmm::device_buffer decompress_stripe_data(
     not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)),
     "Inconsistent info on compression blocks");
 
-#ifdef XXX
-  std::size_t old_num_compressed_blocks   = num_compressed_blocks;
-  std::size_t old_num_uncompressed_blocks = num_uncompressed_blocks;
-  std::size_t old_total_decomp_size       = total_decomp_size;
-
-  num_compressed_blocks   = 0;
-  num_uncompressed_blocks = 0;
-  total_decomp_size       = 0;
-  for (std::size_t i = 0; i < compinfo.size(); ++i) {
-    num_compressed_blocks += compinfo[i].num_compressed_blocks;
-    num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks;
-    total_decomp_size += compinfo[i].max_uncompressed_size;
-
-    auto const& info = stream_info[i];
-    printf("compute info [%d, %d, %d, %d]: %lu | %lu | %lu\n",
-           (int)info.source.stripe_idx,
-           (int)info.source.level,
-           (int)info.source.orc_cold_idx,
-           (int)info.source.kind,
-           (size_t)compinfo[i].num_compressed_blocks,
-           (size_t)compinfo[i].num_uncompressed_blocks,
-           compinfo[i].max_uncompressed_size);
-    fflush(stdout);
-  }
-
-  if (old_num_compressed_blocks != num_compressed_blocks ||
-      old_num_uncompressed_blocks != num_uncompressed_blocks ||
-      old_total_decomp_size != total_decomp_size) {
-    printf("invalid: %d - %d, %d - %d, %d - %d\n",
-           (int)old_num_compressed_blocks,
-           (int)num_compressed_blocks,
-           (int)old_num_uncompressed_blocks,
-           (int)num_uncompressed_blocks,
-           (int)old_total_decomp_size,
-           (int)total_decomp_size
-
-    );
-  }
-#endif
-
-  // Buffer needs to be padded.
-  // Required by `gpuDecodeOrcColumnData`.
+  // Buffer needs to be padded. This is required by `gpuDecodeOrcColumnData`.
   rmm::device_buffer decomp_data(
     cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream);
 
   // If total_decomp_size is zero, the input data may be just empty.
+  // This is still a valid input, thus do not panic.
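  // (A zero-sized `rmm::device_buffer` is a valid result; the early return below simply hands
  // the empty buffer back to the caller. As a sketch of the padding arithmetic above, assuming
  // BUFFER_PADDING_MULTIPLE is 8: round_up_safe(1000, 8) == 1000, round_up_safe(1001, 8) == 1008.)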
if (decomp_data.is_empty()) { return decomp_data; } rmm::device_uvector> inflate_in( @@ -471,10 +407,6 @@ void decode_stream_data(int64_t num_dicts, auto const num_stripes = chunks.size().first; auto const num_columns = chunks.size().second; -#ifdef LOCAL_TEST - printf("decode %d stripess \n", (int)num_stripes); -#endif - thrust::counting_iterator col_idx_it(0); thrust::counting_iterator stripe_idx_it(0); @@ -495,29 +427,11 @@ void decode_stream_data(int64_t num_dicts, chunks.base_device_ptr(), global_dict.data(), num_columns, num_stripes, skip_rows, stream); if (level > 0) { -#ifdef LOCAL_TEST - printf("update_null_mask\n"); -#endif - // Update nullmasks for children if parent was a struct and had null mask update_null_mask(chunks, out_buffers, stream, mr); } rmm::device_scalar error_count(0, stream); - // Update the null map for child columns - - // printf( - // "num col: %d, num stripe: %d, skip row: %d, row_groups size: %d, row index stride: %d, " - // "level: " - // "%d\n", - // (int)num_columns, - // (int)num_stripes, - // (int)skip_rows, - // (int)row_groups.size().first, - // (int)row_index_stride, - // (int)level - // ); - gpu::DecodeOrcColumnData(chunks.base_device_ptr(), global_dict.data(), row_groups, @@ -541,12 +455,6 @@ void decode_stream_data(int64_t num_dicts, stripe_idx_it + num_stripes, 0, [&](auto null_count, auto const stripe_idx) { - // printf( - // "null count: %d => %d\n", (int)stripe_idx, - // (int)chunks[stripe_idx][col_idx].null_count); - // printf("num child rows: %d \n", - // (int)chunks[stripe_idx][col_idx].num_child_rows); - return null_count + chunks[stripe_idx][col_idx].null_count; }); }); @@ -721,17 +629,18 @@ void generate_offsets_for_list(host_span buff_data, rmm::cuda_ } /** - * @brief Find the splits of the input table such that each split range has cumulative size less + * @brief Find the splits of the input table such that each split range of rows has data size less * than a given `size_limit`. * * The parameter `segment_length` is to control the granularity of splits. The output ranges will * always have numbers of rows that are multiple of this value, except the last range that contains * the remaining rows. * - * Similar to `find_splits`, the given limit is just a soft limit. The function will never output + * Similar to `find_splits`, the given limit is just a soft limit. This function will never output * empty ranges, even they have sizes exceed the value of `size_limit`. * * @param input The input table to find splits + * @param segment_length Value to control granularity of the output ranges * @param size_limit A limit on the output size of each split range * @param stream CUDA stream used for device memory operations and kernel launches * @return A vector of ranges as splits of the input @@ -741,13 +650,8 @@ std::vector find_table_splits(table_view const& input, std::size_t size_limit, rmm::cuda_stream_view stream) { -#ifdef LOCAL_TEST - printf("find table split, seg length = %d, limit = %d \n", segment_length, (int)size_limit); -#endif - - // If segment_length is zero: we don't have any limit on granularity. - // As such, set segment length equal to the number of rows. - if (segment_length == 0) { segment_length = input.num_rows(); } + CUDF_EXPECTS(size_limit > 0, "Invalid size limit"); + CUDF_EXPECTS(segment_length > 0, "Invalid segment_length"); // `segmented_row_bit_count` requires that `segment_length` is not larger than number of rows. 
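  // For illustration, assuming an input of 1'000'000 rows and a segment_length of 10'000: the
  // table is measured as 100 segments, so every split boundary found below lands on a multiple
  // of 10'000 rows, except possibly the last range, which takes the remainder.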
segment_length = std::min(segment_length, input.num_rows()); @@ -776,20 +680,6 @@ std::vector find_table_splits(table_view const& input, static_cast(size)}; }); -#ifdef LOCAL_TEST - { - int count{0}; - // TODO: remove: - segmented_sizes.device_to_host_sync(stream); - printf("total row sizes by segment = %d:\n", (int)segment_length); - for (auto& size : segmented_sizes) { - printf("size: %ld, %zu\n", size.count, size.size_bytes / CHAR_BIT); - if (count > 5) break; - ++count; - } - } -#endif - // TODO: exec_policy_nosync thrust::inclusive_scan(rmm::exec_policy(stream), segmented_sizes.d_begin(), @@ -808,6 +698,8 @@ void reader::impl::decompress_and_decode() { if (_file_itm_data.has_no_data()) { return; } + CUDF_EXPECTS(_chunk_read_data.curr_load_stripe_range > 0, "There is not any stripe loaded."); + auto const stripe_range = _chunk_read_data.decode_stripe_ranges[_chunk_read_data.curr_decode_stripe_range++]; auto const stripe_start = stripe_range.begin; @@ -815,16 +707,10 @@ void reader::impl::decompress_and_decode() auto const stripe_count = stripe_range.end - stripe_range.begin; // The start index of loaded stripes. They are different from decoding stripes. - CUDF_EXPECTS(_chunk_read_data.curr_load_stripe_range > 0, "There is not any stripe loaded."); auto const load_stripe_range = _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1]; auto const load_stripe_start = load_stripe_range.begin; -#ifdef LOCAL_TEST - printf("\ndecoding data from stripe %d -> %d\n", (int)stripe_start, (int)stripe_end); - printf("\n loaded stripe start %d \n", (int)load_stripe_start); -#endif - auto const rows_to_skip = _file_itm_data.rows_to_skip; auto const& selected_stripes = _file_itm_data.selected_stripes; @@ -834,13 +720,6 @@ void reader::impl::decompress_and_decode() auto const& stripe = selected_stripes[stripe_idx]; auto const stripe_rows = static_cast(stripe.stripe_info->numberOfRows); rows_to_decode += stripe_rows; - - // The rows to skip should never be larger than number of rows in the first loaded stripes. - // Technically, overflow here should never happen since `select_stripes` already checked it. - // This is just to make sure there was not any bug there. - if (rows_to_skip > 0) { - CUDF_EXPECTS(rows_to_skip < stripe_rows, "Invalid rows_to_skip computation."); - } } CUDF_EXPECTS(rows_to_decode > rows_to_skip, "Invalid rows_to_decode computation."); @@ -851,18 +730,15 @@ void reader::impl::decompress_and_decode() _file_itm_data.rows_to_skip = 0; _file_itm_data.rows_to_read -= rows_to_decode; -#ifdef LOCAL_TEST - printf("decode, skip = %ld, decode = %ld\n", rows_to_skip, rows_to_decode); -#endif - // Technically, overflow here should never happen because the `load_data()` step // already handled it by splitting the loaded stripe range into multiple decode ranges. 
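  // (cudf::size_type is a 32-bit signed integer, so a single decode range can hold at most
  // 2'147'483'647 rows; the check below is a safety net in case that splitting logic regresses.)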
CUDF_EXPECTS(rows_to_decode <= static_cast(std::numeric_limits::max()), "Number or rows to decode exceeds the column size limit.", std::overflow_error); + // TODO: move this to global process // Set up table for converting timestamp columns from local to UTC time - auto const tz_table = [&, &selected_stripes = selected_stripes] { + auto const tz_table = [&, &writerTimezone = selected_stripes[0].stripe_footer->writerTimezone] { auto const has_timestamp_column = std::any_of( _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) { return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) { @@ -870,9 +746,9 @@ void reader::impl::decompress_and_decode() }); }); - return has_timestamp_column ? cudf::detail::make_timezone_transition_table( - {}, selected_stripes[0].stripe_footer->writerTimezone, _stream) - : std::make_unique(); + return has_timestamp_column + ? cudf::detail::make_timezone_transition_table({}, writerTimezone, _stream) + : std::make_unique(); }(); auto const tz_table_dptr = table_device_view::create(tz_table->view(), _stream); @@ -888,17 +764,6 @@ void reader::impl::decompress_and_decode() auto& col_meta = *_col_meta; for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { -#ifdef LOCAL_TEST - printf("processing level = %d\n", (int)level); - - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } -#endif - auto const& stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges[level]; auto const stream_range = get_range(stripe_stream_ranges, stripe_range); @@ -915,15 +780,6 @@ void reader::impl::decompress_and_decode() cudf::detail::hostdevice_2dvector(stripe_count, num_level_columns, _stream); memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); -#ifdef LOCAL_TEST - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } -#endif - const bool use_index = _config.use_index && // Do stripes have row group index @@ -937,10 +793,6 @@ void reader::impl::decompress_and_decode() // TODO: Fix logic to handle unaligned rows (rows_to_skip == 0); -#ifdef LOCAL_TEST - printf(" use_index: %d\n", (int)use_index); -#endif - null_count_prefix_sums[level].reserve(num_level_columns); std::generate_n(std::back_inserter(null_count_prefix_sums[level]), num_level_columns, [&]() { return cudf::detail::make_zeroed_device_uvector_async( @@ -954,16 +806,11 @@ void reader::impl::decompress_and_decode() std::size_t local_stream_order{0}; for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) { -#ifdef LOCAL_TEST - printf("processing stripe_idx = %d\n", (int)stripe_idx); -#endif - auto const& stripe = selected_stripes[stripe_idx]; auto const stripe_info = stripe.stripe_info; auto const stripe_footer = stripe.stripe_footer; - // Gather only for the decoding stripes, thus the first parameter (`global_stripe_order`) - // needs to be normalized to 0-based. + // The first parameter (`stripe_order`) must be normalized to 0-based. 
auto const total_data_size = gather_stream_info_and_column_desc(stripe_idx - stripe_start, level, stripe_info, @@ -978,27 +825,19 @@ void reader::impl::decompress_and_decode() &chunks); auto const is_stripe_data_empty = total_data_size == 0; -#ifdef LOCAL_TEST - printf("is_stripe_data_empty: %d\n", (int)is_stripe_data_empty); -#endif - CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); auto const dst_base = static_cast(stripe_data[stripe_idx - load_stripe_start].data()); - auto const num_rows_per_stripe = static_cast(stripe_info->numberOfRows); + auto const num_rows_in_stripe = static_cast(stripe_info->numberOfRows); uint32_t const rowgroup_id = num_rowgroups; uint32_t const stripe_num_rowgroups = - use_index ? (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / + use_index ? (num_rows_in_stripe + _metadata.get_row_index_stride() - 1) / _metadata.get_row_index_stride() : 0; -#ifdef LOCAL_TEST - printf(" num_rows_per_stripe : %d\n", (int)num_rows_per_stripe); -#endif - // Update chunks to reference streams pointers. for (std::size_t col_idx = 0; col_idx < num_level_columns; col_idx++) { auto& chunk = chunks[stripe_idx - stripe_start][col_idx]; @@ -1010,7 +849,7 @@ void reader::impl::decompress_and_decode() : col_meta.child_start_row[(stripe_idx - stripe_start) * num_level_columns + col_idx]; chunk.num_rows = (level == 0) - ? num_rows_per_stripe + ? num_rows_in_stripe : col_meta.num_child_rows_per_stripe[(stripe_idx - stripe_start) * num_level_columns + col_idx]; chunk.column_num_rows = (level == 0) ? rows_to_decode : col_meta.num_child_rows[col_idx]; @@ -1052,7 +891,7 @@ void reader::impl::decompress_and_decode() } } - stripe_start_row += num_rows_per_stripe; + stripe_start_row += num_rows_in_stripe; num_rowgroups += stripe_num_rowgroups; } @@ -1080,15 +919,6 @@ void reader::impl::decompress_and_decode() // Setup row group descriptors if using indexes. 
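    // (For scale, assuming the ORC default row index stride of 10'000 rows: a stripe of
    // 1'000'000 rows is described by 100 row groups, so the row group descriptors below hold
    // one entry per row group per column at this nesting level.)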
if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { -#ifdef LOCAL_TEST - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } -#endif - auto decomp_data = decompress_stripe_data(load_stripe_range, stream_range, stripe_count, @@ -1108,15 +938,6 @@ void reader::impl::decompress_and_decode() stripe_data[i + stripe_start - load_stripe_start] = {}; } -#ifdef LOCAL_TEST - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } -#endif - } else { if (row_groups.size().first) { chunks.host_to_device_async(_stream); @@ -1133,33 +954,12 @@ void reader::impl::decompress_and_decode() } } -#ifdef LOCAL_TEST - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } -#endif - - _out_buffers[level].clear(); - -#ifdef LOCAL_TEST - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } -#endif + _out_buffers[level].resize(0); for (std::size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; for (std::size_t j = 0; j < stripe_count; ++j) { if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { -#ifdef LOCAL_TEST - printf(" is nullable\n"); -#endif is_nullable = true; break; } @@ -1168,39 +968,11 @@ void reader::impl::decompress_and_decode() auto const is_list_type = (column_types[i].id() == type_id::LIST); auto const n_rows = (level == 0) ? rows_to_decode : col_meta.num_child_rows[i]; -#ifdef LOCAL_TEST - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } -#endif - // For list column, offset column will be always size + 1. _out_buffers[level].emplace_back( column_types[i], is_list_type ? 
n_rows + 1 : n_rows, is_nullable, _stream, _mr); - -#ifdef LOCAL_TEST - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", buffer size: " << n_rows - << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } -#endif } -#ifdef LOCAL_TEST - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } -#endif - decode_stream_data(num_dict_entries, rows_to_skip, _metadata.get_row_index_stride(), @@ -1212,20 +984,7 @@ void reader::impl::decompress_and_decode() _stream, _mr); -#ifdef LOCAL_TEST - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } -#endif - if (nested_cols.size()) { -#ifdef LOCAL_TEST - printf("have nested col\n"); -#endif - // Extract information to process nested child columns. scan_null_counts(chunks, null_count_prefix_sums[level], _stream); @@ -1247,15 +1006,6 @@ void reader::impl::decompress_and_decode() } } // end loop level -#ifdef LOCAL_TEST - { - _stream.synchronize(); - auto peak_mem = mem_stats_logger.peak_memory_usage(); - std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "(" - << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl; - } -#endif - // Now generate a table from the decoded result. std::vector> out_columns; _out_metadata = get_meta_with_user_data(); @@ -1271,9 +1021,9 @@ void reader::impl::decompress_and_decode() }); _chunk_read_data.decoded_table = std::make_unique
(std::move(out_columns));
 
-  // Free up memory.
+  // Free up temp memory used for decoding.
   for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) {
-    _out_buffers[level].clear();
+    _out_buffers[level].resize(0);
 
     auto& stripe_data = _file_itm_data.lvl_stripe_data[level];
 
     if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
@@ -1285,16 +1035,11 @@ void reader::impl::decompress_and_decode()
     }
   }
 
-#ifdef LOCAL_TEST
-  {
-    _stream.synchronize();
-    auto peak_mem = mem_stats_logger.peak_memory_usage();
-    std::cout << __LINE__ << ", decomp and decode, peak_memory_usage: " << peak_mem << "("
-              << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl;
-  }
-#endif
-
+  // Output table range is reset to start from the first position.
   _chunk_read_data.curr_output_table_range = 0;
+
+  // Split the decoded table into ranges that can be output as chunks whose sizes are within the
+  // given output size limit.
   _chunk_read_data.output_table_ranges =
     _chunk_read_data.output_size_limit == 0
       ? std::vector<range>{range{
          0, _chunk_read_data.decoded_table->num_rows()}}
      : find_table_splits(_chunk_read_data.decoded_table->view(),
                          _chunk_read_data.output_row_granularity,
                          _chunk_read_data.output_size_limit,
                          _stream);
-
-#ifdef LOCAL_TEST
-  auto& splits = _chunk_read_data.output_table_ranges;
-  printf("------------\nSplits decoded table (/total num rows = %d): \n",
-         (int)_chunk_read_data.decoded_table->num_rows());
-  for (size_t idx = 0; idx < splits.size(); idx++) {
-    printf("{%ld, %ld}\n", splits[idx].begin, splits[idx].end);
-  }
-  fflush(stdout);
-
-  {
-    _stream.synchronize();
-    auto peak_mem = mem_stats_logger.peak_memory_usage();
-    std::cout << "decomp and decode, peak_memory_usage: " << peak_mem << "("
-              << (peak_mem * 1.0) / (1024.0 * 1024.0) << " MB)" << std::endl;
-  }
-#endif
 }
 
 }  // namespace cudf::io::orc::detail
From f19794695aa2dbbcf4e571adf2b9ee9bd8f290d8 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Fri, 15 Mar 2024 13:24:37 -0700
Subject: [PATCH 250/321] Cleanup `reader_impl_chunking.hpp`

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_chunking.hpp | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index f77ca173d35..15cacea2e1d 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -220,16 +220,9 @@ struct chunk_read_data {
     return curr_output_table_range < output_table_ranges.size();
   }
 
-  // Only has more chunk to output if:
   bool has_next() const
   {
-#ifdef LOCAL_TEST
-    printf("compute has_next: %d, %d, %d\n",
-           (int)more_stripe_to_load(),
-           (int)more_stripe_to_decode(),
-           (int)more_table_chunk_to_output());
-#endif
-
+    // Only has more chunk to output if:
     return more_stripe_to_load() || more_stripe_to_decode() || more_table_chunk_to_output();
   }
 };
From 74d806bf8dfc6d541bfc26ece00716bff7a5db3c Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Fri, 15 Mar 2024 13:28:25 -0700
Subject: [PATCH 251/321] Change row selection test

Signed-off-by: Nghia Truong
---
 cpp/tests/io/orc_chunked_reader_test.cu | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu
index 78f0894134c..2be1513e5f2 100644
--- a/cpp/tests/io/orc_chunked_reader_test.cu
+++ b/cpp/tests/io/orc_chunked_reader_test.cu
@@ -1277,8 +1277,12 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList)
 
 TEST_F(OrcChunkedReaderInputLimitTest, ReadWithRowSelection)
 {
-  int64_t constexpr num_rows =
100'000'000l; + // `num_rows` should not be divisible by `stripe_size_rows`, to test the correctness of row + // selections. + int64_t constexpr num_rows = 100'517'687l; int constexpr rows_per_stripe = 100'000; + static_assert(num_rows % rows_per_stripe != 0, + "`num_rows` should not be divisible by `stripe_size_rows`."); auto const it = thrust::make_counting_iterator(0); auto const col = int32s_col(it, it + num_rows); @@ -1294,7 +1298,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, ReadWithRowSelection) // Verify metadata. auto const metadata = cudf::io::read_orc_metadata(cudf::io::source_info{filepath}); EXPECT_EQ(metadata.num_rows(), num_rows); - EXPECT_EQ(metadata.num_stripes(), num_rows / rows_per_stripe); + EXPECT_EQ(metadata.num_stripes(), num_rows / rows_per_stripe + 1); int constexpr random_val = 123456; From 2a67770bf6828e55e2c38b8fa1e2a44f87ea70d7 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 15 Mar 2024 13:31:19 -0700 Subject: [PATCH 252/321] Cleanup test Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 2be1513e5f2..173e9ce00d4 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1470,7 +1470,5 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, test_chunk->view()); } - printf("done local test\n"); - fflush(stdout); #endif // LOCAL_TEST } From f76f61e21280c4251c30b6f8c9fe159a145f0f9e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 15 Mar 2024 15:06:33 -0700 Subject: [PATCH 253/321] Construct timezone table in global step Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 16 ++++++++++++++++ cpp/src/io/orc/reader_impl_chunking.hpp | 3 +++ cpp/src/io/orc/reader_impl_decode.cu | 20 ++------------------ 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index e5d17204fb9..06800224865 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -19,6 +19,7 @@ #include "io/orc/reader_impl_chunking.hpp" #include "io/orc/reader_impl_helpers.hpp" +#include #include #include @@ -29,6 +30,7 @@ #include #include +#include #include namespace cudf::io::orc::detail { @@ -251,6 +253,20 @@ void reader::impl::global_preprocess(read_mode mode) auto const num_total_stripes = selected_stripes.size(); auto const num_levels = _selected_columns.num_levels(); + // Set up table for converting timestamp columns from local to UTC time + _file_itm_data.tz_table = [&] { + auto const has_timestamp_column = std::any_of( + _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) { + return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) { + return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP; + }); + }); + + return has_timestamp_column ? 
cudf::detail::make_timezone_transition_table( + {}, selected_stripes[0].stripe_footer->writerTimezone, _stream) + : std::make_unique(); + }(); + // // Pre allocate necessary memory for data processed in the other reading steps: // diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 15cacea2e1d..5f958d6d73f 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -168,6 +168,9 @@ struct file_intermediate_data { // List of nested type columns at each nested level. std::vector> lvl_nested_cols; + // Table for converting timestamp columns from local to UTC time. + std::unique_ptr tz_table; + bool global_preprocessed{false}; }; diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 2aab392cd6b..d3fc94ed760 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -22,7 +22,6 @@ #include "io/utilities/config_utils.hpp" #include -#include #include #include #include @@ -736,23 +735,8 @@ void reader::impl::decompress_and_decode() "Number or rows to decode exceeds the column size limit.", std::overflow_error); - // TODO: move this to global process - // Set up table for converting timestamp columns from local to UTC time - auto const tz_table = [&, &writerTimezone = selected_stripes[0].stripe_footer->writerTimezone] { - auto const has_timestamp_column = std::any_of( - _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) { - return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) { - return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP; - }); - }); - - return has_timestamp_column - ? cudf::detail::make_timezone_transition_table({}, writerTimezone, _stream) - : std::make_unique(); - }(); - auto const tz_table_dptr = table_device_view::create(tz_table->view(), _stream); - - auto const num_levels = _selected_columns.num_levels(); + auto const tz_table_dptr = table_device_view::create(_file_itm_data.tz_table->view(), _stream); + auto const num_levels = _selected_columns.num_levels(); _out_buffers.resize(num_levels); // Column descriptors ('chunks'). From de72389cd222dcf7b1821630720f9dc8345d6f02 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 15 Mar 2024 15:21:05 -0700 Subject: [PATCH 254/321] Use `rmm::exec_policy_nosync` Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 6 ++---- cpp/src/io/orc/reader_impl_decode.cu | 17 +++++++---------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 06800224865..7b89d63ec11 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -427,10 +427,9 @@ void reader::impl::global_preprocess(read_mode mode) return; } - // TODO: exec_policy_nosync // Compute the prefix sum of stripes' data sizes. total_stripe_sizes.host_to_device_async(_stream); - thrust::inclusive_scan(rmm::exec_policy(_stream), // todo no sync + thrust::inclusive_scan(rmm::exec_policy_nosync(_stream), total_stripe_sizes.d_begin(), total_stripe_sizes.d_end(), total_stripe_sizes.d_begin(), @@ -622,10 +621,9 @@ void reader::impl::load_data() return; } - // TODO: exec_policy_nosync // Compute the prefix sum of stripe data sizes and rows. 
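  // For illustration, assuming four loaded stripes with decompressed sizes of {5, 3, 8, 2} MB
  // and a decode limit of 10 MB: the inclusive scan yields {5, 8, 16, 18} MB, and `find_splits`
  // then cuts the range into {[0, 2), [2, 4)} so that each decode range stays near the limit.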
stripe_decomp_sizes.host_to_device_async(_stream); - thrust::inclusive_scan(rmm::exec_policy(_stream), + thrust::inclusive_scan(rmm::exec_policy_nosync(_stream), stripe_decomp_sizes.d_begin(), stripe_decomp_sizes.d_end(), stripe_decomp_sizes.d_begin(), diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index d3fc94ed760..ff4528aea5f 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -129,7 +129,7 @@ rmm::device_buffer decompress_stripe_data( rmm::device_uvector> inflate_out( num_compressed_blocks + num_uncompressed_blocks, stream); rmm::device_uvector inflate_res(num_compressed_blocks, stream); - thrust::fill(rmm::exec_policy(stream), + thrust::fill(rmm::exec_policy_nosync(stream), inflate_res.begin(), inflate_res.end(), compression_result{0, compression_status::FAILURE}); @@ -233,7 +233,7 @@ rmm::device_buffer decompress_stripe_data( // Check if any block has been failed to decompress. // Not using `thrust::any` or `thrust::count_if` to defer stream sync. thrust::for_each( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(std::size_t{0}), thrust::make_counting_iterator(inflate_res.size()), [results = inflate_res.begin(), @@ -332,7 +332,7 @@ void update_null_mask(cudf::detail::hostdevice_2dvector& chunks if (child_valid_map_base != nullptr) { rmm::device_uvector dst_idx(child_mask_len, stream); // Copy indexes at which the parent has valid value. - thrust::copy_if(rmm::exec_policy(stream), + thrust::copy_if(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + parent_mask_len, dst_idx.begin(), @@ -346,7 +346,7 @@ void update_null_mask(cudf::detail::hostdevice_2dvector& chunks uint32_t* dst_idx_ptr = dst_idx.data(); // Copy child valid bits from child column to valid indexes, this will merge both child // and parent null masks - thrust::for_each(rmm::exec_policy(stream), + thrust::for_each(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + dst_idx.size(), [child_valid_map_base, dst_idx_ptr, merged_mask] __device__(auto idx) { @@ -481,8 +481,7 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( prefix_sums_to_update, stream, rmm::mr::get_current_device_resource()); - // TODO: exec_policy_nosync - thrust::for_each(rmm::exec_policy(stream), + thrust::for_each(rmm::exec_policy_nosync(stream), d_prefix_sums_to_update.begin(), d_prefix_sums_to_update.end(), [chunks = cudf::detail::device_2dspan{chunks}] __device__( @@ -661,9 +660,8 @@ std::vector find_table_splits(table_view const& input, auto segmented_sizes = cudf::detail::hostdevice_vector(d_segmented_sizes->size(), stream); - // TODO: exec_policy_nosync thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(d_segmented_sizes->size()), segmented_sizes.d_begin(), @@ -679,8 +677,7 @@ std::vector find_table_splits(table_view const& input, static_cast(size)}; }); - // TODO: exec_policy_nosync - thrust::inclusive_scan(rmm::exec_policy(stream), + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), segmented_sizes.d_begin(), segmented_sizes.d_end(), segmented_sizes.d_begin(), From 28f7cfc56a8068385259ad1084adf0a32f96f000 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 20 Mar 2024 11:14:19 -0700 Subject: [PATCH 255/321] Optimize 
benchmark code Signed-off-by: Nghia Truong --- cpp/benchmarks/io/orc/orc_reader_input.cpp | 78 +++++++++------------- 1 file changed, 31 insertions(+), 47 deletions(-) diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp index 8514af28c63..e710219852e 100644 --- a/cpp/benchmarks/io/orc/orc_reader_input.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp @@ -31,39 +31,6 @@ namespace { constexpr int64_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; -template -void read_once(cudf::io::orc_reader_options const& options, - cudf::size_type num_rows_to_read, - Timer& timer) -{ - timer.start(); - auto const result = cudf::io::read_orc(options); - timer.stop(); - - CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns"); - CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); -} - -template -void chunked_read(cudf::io::orc_reader_options const& options, - cudf::size_type num_rows_to_read, - std::size_t output_limit, - std::size_t read_limit, - Timer& timer) -{ - auto reader = cudf::io::chunked_orc_reader(output_limit, read_limit, options); - cudf::size_type num_rows{0}; - - timer.start(); - do { - auto chunk = reader.read_chunk(); - num_rows += chunk.tbl->num_rows(); - } while (reader.has_next()); - timer.stop(); - - CUDF_EXPECTS(num_rows == num_rows_to_read, "Unexpected number of rows"); -} - template void orc_read_common(cudf::size_type num_rows_to_read, cuio_source_sink_pair& source_sink, @@ -74,18 +41,39 @@ void orc_read_common(cudf::size_type num_rows_to_read, auto mem_stats_logger = cudf::memory_stats_logger(); // init stats logger state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec( - nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch&, auto& timer) { - try_drop_l3_cache(); - if constexpr (!is_chunked_read) { - read_once(read_opts, num_rows_to_read, timer); - } else { + if constexpr (is_chunked_read) { + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch&, auto& timer) { + try_drop_l3_cache(); auto const output_limit = static_cast(state.get_int64("output_limit")); auto const read_limit = static_cast(state.get_int64("read_limit")); - chunked_read(read_opts, num_rows_to_read, output_limit, read_limit, timer); - } - }); + + auto reader = cudf::io::chunked_orc_reader(output_limit, read_limit, read_opts); + cudf::size_type num_rows{0}; + + timer.start(); + do { + auto chunk = reader.read_chunk(); + num_rows += chunk.tbl->num_rows(); + } while (reader.has_next()); + timer.stop(); + + CUDF_EXPECTS(num_rows == num_rows_to_read, "Unexpected number of rows"); + }); + } else { // not is_chunked_read + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch&, auto& timer) { + try_drop_l3_cache(); + + timer.start(); + auto const result = cudf::io::read_orc(read_opts); + timer.stop(); + + CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns"); + CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); + }); + } auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); @@ -150,11 +138,7 @@ void orc_read_io_compression(nvbench::state& state) return view.num_rows(); }(); - if constexpr (chunked_read) { - orc_read_common(num_rows_written, source_sink, state); - } else { - 
orc_read_common(num_rows_written, source_sink, state); - } + orc_read_common(num_rows_written, source_sink, state); } template From f527c994eec04fdc537a5d115e70966ea8e68a5e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 20 Mar 2024 15:34:36 -0700 Subject: [PATCH 256/321] Do not sync Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_decode.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index ff4528aea5f..4b9eecd884e 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -421,7 +421,7 @@ void decode_stream_data(int64_t num_dicts, // Allocate global dictionary for deserializing rmm::device_uvector global_dict(num_dicts, stream); - chunks.host_to_device_sync(stream); + chunks.host_to_device_async(stream); gpu::DecodeNullsAndStringDictionaries( chunks.base_device_ptr(), global_dict.data(), num_columns, num_stripes, skip_rows, stream); From 96f89a1589c6444e3f7dd8a2082f2240fa7e0a26 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 20 Mar 2024 16:33:35 -0700 Subject: [PATCH 257/321] Simplify code Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_decode.cu | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 4b9eecd884e..2a171a27852 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -791,8 +791,11 @@ void reader::impl::decompress_and_decode() auto const stripe_info = stripe.stripe_info; auto const stripe_footer = stripe.stripe_footer; + // Normalize stripe_idx to 0-based. + auto const stripe_local_idx = stripe_idx - stripe_start; + // The first parameter (`stripe_order`) must be normalized to 0-based. - auto const total_data_size = gather_stream_info_and_column_desc(stripe_idx - stripe_start, + auto const total_data_size = gather_stream_info_and_column_desc(stripe_local_idx, level, stripe_info, stripe_footer, @@ -821,18 +824,16 @@ void reader::impl::decompress_and_decode() // Update chunks to reference streams pointers. for (std::size_t col_idx = 0; col_idx < num_level_columns; col_idx++) { - auto& chunk = chunks[stripe_idx - stripe_start][col_idx]; + auto& chunk = chunks[stripe_local_idx][col_idx]; // start row, number of rows in a each stripe and total number of rows // may change in lower levels of nesting chunk.start_row = - (level == 0) - ? stripe_start_row - : col_meta.child_start_row[(stripe_idx - stripe_start) * num_level_columns + col_idx]; + (level == 0) ? stripe_start_row + : col_meta.child_start_row[stripe_local_idx * num_level_columns + col_idx]; chunk.num_rows = (level == 0) ? num_rows_in_stripe - : col_meta.num_child_rows_per_stripe[(stripe_idx - stripe_start) * num_level_columns + - col_idx]; + : col_meta.num_child_rows_per_stripe[stripe_local_idx * num_level_columns + col_idx]; chunk.column_num_rows = (level == 0) ? rows_to_decode : col_meta.num_child_rows[col_idx]; chunk.parent_validity_info = (level == 0) ? 
column_validity_info{} : col_meta.parent_column_data[col_idx]; From 20d8e81eb2c2a0b3a5c4d7bad745ed92a86d1797 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 28 Mar 2024 10:49:54 -0700 Subject: [PATCH 258/321] Add assertion to `num_rows` in parquet reader Signed-off-by: Nghia Truong --- cpp/src/io/parquet/reader_impl_helpers.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 7b890111dab..604b9b77a8a 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -631,6 +631,8 @@ aggregate_reader_metadata::select_row_groups( if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options( skip_rows_opt, num_rows_opt, get_num_rows()); + CUDF_EXPECTS(from_opts.second <= static_cast(std::numeric_limits::max()), + "Number of reading rows exceeds cudf's column size limit."); return std::pair{static_cast(from_opts.first), static_cast(from_opts.second)}; }(); From 99afb2edc59bd5cf99536ca3d7a3af658c4437df Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 28 Mar 2024 10:52:01 -0700 Subject: [PATCH 259/321] Fix comment Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 173e9ce00d4..ba57145465e 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1431,7 +1431,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) .build(); auto reader = cudf::io::chunked_orc_reader( static_cast(rows_per_stripe * 5.7) * - sizeof(data_type) /* output limit, equal to 5.2M rows */, + sizeof(data_type) /* output limit, equal to 5.7M rows */, 0UL /* no input limit */, rows_per_stripe / 2 /* output granularity, or minimum number of rows for the output chunk */, read_opts); From 38d87487bc44a646a42abd63bbc23102b92c61b7 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 28 Mar 2024 10:52:09 -0700 Subject: [PATCH 260/321] Add assertion to `skip_rows` Signed-off-by: Nghia Truong --- cpp/include/cudf/io/orc.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 259c5c1016a..a28011feb8f 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -199,6 +199,7 @@ class orc_reader_options { */ void set_skip_rows(int64_t rows) { + CUDF_EXPECTS(rows >= 0, "skip_rows cannot be negative"); CUDF_EXPECTS(rows == 0 or _stripes.empty(), "Can't set both skip_rows along with stripes"); _skip_rows = rows; } From 6e658dc0b37b8b38fd8520a1acd79da126310325 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 28 Mar 2024 11:02:05 -0700 Subject: [PATCH 261/321] Update docs Signed-off-by: Nghia Truong --- cpp/src/io/orc/aggregate_orc_metadata.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp index 65d1f0a7ad4..94f681fff0c 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.hpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp @@ -112,6 +112,16 @@ class aggregate_orc_metadata { * @brief Selects the stripes to read, based on the row/stripe selection parameters. * * Stripes are potentially selected from multiple files. 
+ *
+ * Upon parsing stripes' information, the numbers of rows to skip and rows to read are also
+ * updated to match the actual numbers for the stripes read from the data sources.
+ *
+ * @param user_specified_stripes The specified stripe indices to read
+ * @param skip_rows Number of rows to skip from reading
+ * @param num_rows Number of rows to read
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return A tuple of the corrected skip_rows and num_rows values along with a vector of
+ * stripes' metadata such as footer, data information, and source index
   */
  [[nodiscard]] std::tuple> select_stripes(
    std::vector<std::vector<size_type>> const& user_specified_stripes,
From 734dcf363ee2c7a5fcf91e311913f6b44933440d Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Thu, 28 Mar 2024 13:34:54 -0700
Subject: [PATCH 262/321] Separate `impl` class from `reader`, resulting in
 `reader_impl`, and also make `chunked_reader` independent from `reader`

Signed-off-by: Nghia Truong
---
 cpp/include/cudf/io/detail/orc.hpp     | 18 +++-----
 cpp/src/io/orc/reader.cu               | 28 ++++++------
 cpp/src/io/orc/reader_impl.cu          | 62 +++++++++++++-------------
 cpp/src/io/orc/reader_impl.hpp         | 36 +++++++--------
 cpp/src/io/orc/reader_impl_chunking.cu |  4 +-
 cpp/src/io/orc/reader_impl_decode.cu   |  2 +-
 6 files changed, 73 insertions(+), 77 deletions(-)

diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp
index c07dbef11d7..32b28692140 100644
--- a/cpp/include/cudf/io/detail/orc.hpp
+++ b/cpp/include/cudf/io/detail/orc.hpp
@@ -37,18 +37,15 @@ class chunked_orc_writer_options;
 
 namespace orc::detail {
 
+// Forward declaration of the internal reader class
+class reader_impl;
+
 /**
 * @brief Class to read ORC dataset data into columns.
 */
 class reader {
  protected:
-  class impl;
-  std::unique_ptr<impl> _impl;
-
-  /**
-   * @brief Default constructor, needed for subclassing.
-   */
-  reader();
+  std::unique_ptr<reader_impl> _impl;
 
 public:
  /**
@@ -79,11 +76,10 @@ class reader {
 
 /**
 * @brief The reader class that supports iterative reading from an array of data sources.
- *
- * This class intentionally subclasses the `reader` class with private inheritance to hide the
- * base class `reader::read()` API. As such, only chunked reading APIs are supported through it.
 */
-class chunked_reader : private reader {
+class chunked_reader {
+  std::unique_ptr<reader_impl> _impl;
+
 public:
  /**
   * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t, size_type,
diff --git a/cpp/src/io/orc/reader.cu b/cpp/src/io/orc/reader.cu
index ea0b43c0f93..bcc1c5b8649 100644
--- a/cpp/src/io/orc/reader.cu
+++ b/cpp/src/io/orc/reader.cu
@@ -19,15 +19,14 @@
 
 namespace cudf::io::orc::detail {
 
-// Constructor and destructor are defined within this translation unit.
-reader::reader() = default;
+// Destructors are defined within this translation unit.
reader::~reader() = default; reader::reader(std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : _impl{std::make_unique(std::move(sources), options, stream, mr)} + : _impl{std::make_unique(std::move(sources), options, stream, mr)} { } @@ -39,10 +38,9 @@ chunked_reader::chunked_reader(std::size_t output_size_limit, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : reader() + : _impl{std::make_unique( + output_size_limit, data_read_limit, std::move(sources), options, stream, mr)} { - _impl = std::make_unique( - output_size_limit, data_read_limit, std::move(sources), options, stream, mr); } chunked_reader::chunked_reader(std::size_t output_size_limit, @@ -52,17 +50,19 @@ chunked_reader::chunked_reader(std::size_t output_size_limit, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : reader() + : _impl{std::make_unique(output_size_limit, + data_read_limit, + output_row_granularity, + std::move(sources), + options, + stream, + mr)} { + // Although we internally accept non-positve value for `output_row_granularity` because we + // implicitly change such value into `DEFAULT_OUTPUT_ROW_GRANULARITY`. + // The user are not allowed to do so but instead required to specify an explicit positive number. CUDF_EXPECTS(output_row_granularity > 0, "The value of `output_row_granularity` must be positive."); - _impl = std::make_unique(output_size_limit, - data_read_limit, - output_row_granularity, - std::move(sources), - options, - stream, - mr); } chunked_reader::~chunked_reader() = default; diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 140e4517862..d2c881218cc 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -24,7 +24,7 @@ namespace cudf::io::orc::detail { -void reader::impl::prepare_data(read_mode mode) +void reader_impl::prepare_data(read_mode mode) { // There are no columns in the table. if (_selected_columns.num_levels() == 0) { return; } @@ -49,7 +49,7 @@ void reader::impl::prepare_data(read_mode mode) } } -table_with_metadata reader::impl::make_output_chunk() +table_with_metadata reader_impl::make_output_chunk() { // There is no columns in the table. if (_selected_columns.num_levels() == 0) { return {std::make_unique
(), table_metadata{}}; } @@ -104,7 +104,7 @@ table_with_metadata reader::impl::make_output_chunk() return {make_output_table(), table_metadata{_out_metadata} /*copy cached metadata*/}; } -table_metadata reader::impl::get_meta_with_user_data() +table_metadata reader_impl::get_meta_with_user_data() { if (_meta_with_user_data) { return table_metadata{*_meta_with_user_data}; } @@ -133,37 +133,37 @@ table_metadata reader::impl::get_meta_with_user_data() return out_metadata; } -reader::impl::impl(std::vector>&& sources, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : reader::impl::impl(0UL, 0UL, std::move(sources), options, stream, mr) +reader_impl::reader_impl(std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : reader_impl::reader_impl(0UL, 0UL, std::move(sources), options, stream, mr) { } -reader::impl::impl(std::size_t output_size_limit, - std::size_t data_read_limit, - std::vector>&& sources, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : reader::impl::impl(output_size_limit, - data_read_limit, - DEFAULT_OUTPUT_ROW_GRANULARITY, - std::move(sources), - options, - stream, - mr) +reader_impl::reader_impl(std::size_t output_size_limit, + std::size_t data_read_limit, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : reader_impl::reader_impl(output_size_limit, + data_read_limit, + DEFAULT_OUTPUT_ROW_GRANULARITY, + std::move(sources), + options, + stream, + mr) { } -reader::impl::impl(std::size_t output_size_limit, - std::size_t data_read_limit, - size_type output_row_granularity, - std::vector>&& sources, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +reader_impl::reader_impl(std::size_t output_size_limit, + std::size_t data_read_limit, + size_type output_row_granularity, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) : _stream(stream), _mr(mr), _config{options.get_timestamp_type(), @@ -188,19 +188,19 @@ reader::impl::impl(std::size_t output_size_limit, "skip_rows is not supported by nested column"); } -table_with_metadata reader::impl::read() +table_with_metadata reader_impl::read() { prepare_data(read_mode::READ_ALL); return make_output_chunk(); } -bool reader::impl::has_next() +bool reader_impl::has_next() { prepare_data(read_mode::CHUNKED_READ); return _chunk_read_data.has_next(); } -table_with_metadata reader::impl::read_chunk() +table_with_metadata reader_impl::read_chunk() { prepare_data(read_mode::CHUNKED_READ); return make_output_chunk(); diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 45d60acb3db..07beecb70d0 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -38,7 +38,7 @@ struct reader_column_meta; /** * @brief Implementation for ORC reader. */ -class reader::impl { +class reader_impl { public: /** * @brief Constructor from a dataset source with reader options. 
@@ -51,33 +51,33 @@ class reader::impl { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit impl(std::vector>&& sources, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + explicit reader_impl(std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::io::orc::detail::chunked_reader::chunked_reader(std::size_t, std::size_t, * orc_reader_options const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) */ - explicit impl(std::size_t output_size_limit, - std::size_t data_read_limit, - std::vector>&& sources, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + explicit reader_impl(std::size_t output_size_limit, + std::size_t data_read_limit, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::io::orc::detail::chunked_reader::chunked_reader(std::size_t, std::size_t, * size_type, orc_reader_options const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) */ - explicit impl(std::size_t output_size_limit, - std::size_t data_read_limit, - size_type output_row_granularity, - std::vector>&& sources, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + explicit reader_impl(std::size_t output_size_limit, + std::size_t data_read_limit, + size_type output_row_granularity, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::io::orc::detail::reader::read diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 7b89d63ec11..777a5a6f79f 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -229,7 +229,7 @@ range get_range(std::vector const& input_ranges, range const& selected_ra return {first_range.begin, last_range.end}; } -void reader::impl::global_preprocess(read_mode mode) +void reader_impl::global_preprocess(read_mode mode) { if (_file_itm_data.global_preprocessed) { return; } _file_itm_data.global_preprocessed = true; @@ -447,7 +447,7 @@ void reader::impl::global_preprocess(read_mode mode) find_splits(total_stripe_sizes, num_total_stripes, load_limit); } -void reader::impl::load_data() +void reader_impl::load_data() { if (_file_itm_data.has_no_data()) { return; } diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 2a171a27852..04808d6f0b0 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -690,7 +690,7 @@ std::vector find_table_splits(table_view const& input, } // namespace -void reader::impl::decompress_and_decode() +void reader_impl::decompress_and_decode() { if (_file_itm_data.has_no_data()) { return; } From 75c5b7c9fe0bc50d88099625e489c651b606417c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 28 Mar 2024 14:00:31 -0700 Subject: [PATCH 263/321] Fix spell Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader.cu b/cpp/src/io/orc/reader.cu index bcc1c5b8649..37eb7ab0fd7 100644 --- a/cpp/src/io/orc/reader.cu +++ 
b/cpp/src/io/orc/reader.cu
@@ -58,7 +58,7 @@ chunked_reader::chunked_reader(std::size_t output_size_limit,
                               stream,
                               mr)}
{
-  // Although we internally accept non-positve value for `output_row_granularity` because we
+  // Although we internally accept non-positive value for `output_row_granularity` because we
  // implicitly change such value into `DEFAULT_OUTPUT_ROW_GRANULARITY`.
  // The user is not allowed to do so but instead is required to specify an explicit positive number.
  CUDF_EXPECTS(output_row_granularity > 0,

From 861cdccd7fe5ba7b96884c7f095f1c1ead90025b Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Thu, 28 Mar 2024 15:22:32 -0700
Subject: [PATCH 264/321] Only update `total_stripe_sizes` if in chunked read mode

Signed-off-by: Nghia Truong

---
 cpp/src/io/orc/reader_impl_chunking.cu | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 777a5a6f79f..ec766465817 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -351,7 +351,8 @@ void reader_impl::global_preprocess(read_mode mode)
  //
  // Accumulate data size for data streams in each stripe.
-  cudf::detail::hostdevice_vector<cumulative_size> total_stripe_sizes(num_total_stripes, _stream);
+  cudf::detail::hostdevice_vector<cumulative_size> total_stripe_sizes(
+    mode == read_mode::CHUNKED_READ ? num_total_stripes : std::size_t{0}, _stream);

  for (std::size_t stripe_global_idx = 0; stripe_global_idx < num_total_stripes;
       ++stripe_global_idx) {
@@ -407,7 +408,9 @@ void reader_impl::global_preprocess(read_mode mode)
      }
    }  // end loop level

-    total_stripe_sizes[stripe_global_idx] = {1, this_stripe_size};
+    if (mode == read_mode::CHUNKED_READ) {
+      total_stripe_sizes[stripe_global_idx] = {1, this_stripe_size};
+    }

    // Range of all stream reads in `read_info` corresponding to this stripe, in all levels.
    stripe_data_read_ranges[stripe_global_idx] = range{last_read_size, read_info.size()};
@@ -422,7 +425,8 @@ void reader_impl::global_preprocess(read_mode mode)
  _chunk_read_data.curr_load_stripe_range = 0;

  // Load all stripes if there is no read limit.
-  if (_chunk_read_data.data_read_limit == 0) {
+  // In addition, if we are not in chunked read mode, we also load all stripe.
+  if (mode == read_mode::READ_ALL || _chunk_read_data.data_read_limit == 0) {
    _chunk_read_data.load_stripe_ranges = {range{0UL, num_total_stripes}};
    return;
  }

From ee5070147538f6a01dee3c3383831e5c818acbc9 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Thu, 28 Mar 2024 16:08:35 -0700
Subject: [PATCH 265/321] Implement optimized code path splitting stripe range in special cases

Signed-off-by: Nghia Truong

---
 cpp/src/io/orc/reader_impl.cu          |   2 +-
 cpp/src/io/orc/reader_impl.hpp         |   4 +-
 cpp/src/io/orc/reader_impl_chunking.cu | 125 ++++++++++++++++---------
 3 files changed, 84 insertions(+), 47 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index d2c881218cc..f2c8b53fac3 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -38,7 +38,7 @@ void reader_impl::prepare_data(read_mode mode)
    // - There are more stripes to load, and
    // - All loaded stripes were decoded, and
    // - All the decoded results were output.
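// A minimal sketch of how a caller drives the chunked interface being threaded
// through in these hunks; has_next()/read_chunk() are the members declared in this
// series, while the helper itself and its names are illustrative assumptions.
#include <vector>

template <typename ChunkedReader, typename Table>
std::vector<Table> drain_chunks(ChunkedReader& reader)
{
  std::vector<Table> chunks;
  while (reader.has_next()) {                  // runs prepare_data(CHUNKED_READ)
    chunks.emplace_back(reader.read_chunk());  // one size-bounded table per call
  }
  return chunks;
}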
-  load_data();
+  load_data(mode);
  }

  if (_chunk_read_data.more_stripe_to_decode()) {
    // Only decompress/decode the loaded stripes if:
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index 07beecb70d0..17ce7c23a58 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -133,8 +133,10 @@ class reader_impl {
   * smaller subsets, each of which is to be decompressed and decoded in the next step
   * `decompress_and_decode()`. This is to ensure that loading data from data sources together with
   * decompression and decoding will be capped around the given data read limit.
+   *
+   * @param mode Value indicating if the data sources are read all at once or chunk by chunk
   */
-  void load_data();
+  void load_data(read_mode mode);

  /**
   * @brief Decompress and decode stripe data in the internal buffers, and store the result into
diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index ec766465817..576f25fac6e 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -425,7 +425,7 @@ void reader_impl::global_preprocess(read_mode mode)
  _chunk_read_data.curr_load_stripe_range = 0;

  // Load all stripes if there is no read limit.
-  // In addition, if we are not in chunked read mode, we also load all stripe.
+  // In addition, if we are not in CHUNKED_READ mode, we also load all stripes.
  if (mode == read_mode::READ_ALL || _chunk_read_data.data_read_limit == 0) {
    _chunk_read_data.load_stripe_ranges = {range{0UL, num_total_stripes}};
    return;
@@ -451,7 +451,7 @@ void reader_impl::global_preprocess(read_mode mode)
  find_splits(total_stripe_sizes, num_total_stripes, load_limit);
}

-void reader_impl::load_data()
+void reader_impl::load_data(read_mode mode)
{
  if (_file_itm_data.has_no_data()) { return; }

@@ -524,6 +524,82 @@ void reader_impl::load_data(read_mode mode)
    CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read.");
  }

+  // Compute number of rows in the loading stripes.
+  auto const num_loading_rows = [&] {
+    std::size_t count{0};
+    for (std::size_t idx = 0; idx < stripe_count; ++idx) {
+      count += _file_itm_data.selected_stripes[idx + stripe_start].stripe_info->numberOfRows;
+    }
+    return count;
+  }();
+
+  // Decoding range is reset to start from the first position in `decode_stripe_ranges`.
+  _chunk_read_data.curr_decode_stripe_range = 0;
+
+  // Decode all loaded stripes if there is no read limit, or if we are not in chunked_read mode.
+  // In theory, we should just decode enough stripes for outputting one table chunk, instead of
+  // decoding all stripes like this, for better load-balancing and reduced memory usage.
+  // However, we do not know how many stripes are 'enough' because there is no simple and
+  // cheap way to compute the exact decoded sizes of stripes without actually decoding them.
+  if ((mode == read_mode::READ_ALL || _chunk_read_data.data_read_limit == 0) &&
+      // In addition to read limit, we also need to check if the total number of
+      // rows in the loaded stripes exceeds column size limit.
+      // If that is the case, we cannot read all stripes at once.
+      num_loading_rows < static_cast<std::size_t>(std::numeric_limits<size_type>::max())) {
+    _chunk_read_data.decode_stripe_ranges = {load_stripe_range};
+    return;
+  }
+
+  // For estimating the decompressed sizes of the loaded stripes.
+  // Only valid in CHUNKED_READ mode.
+  cudf::detail::hostdevice_vector<cumulative_size_and_row> stripe_decomp_sizes(
+    mode == read_mode::CHUNKED_READ ?
stripe_count : std::size_t{0}, _stream); + + // For mapping stripe to the number of rows in it. + // Only valid in READ_ALL mode. + // This is similar to store exactly the same data as for `stripe_decomp_size` but + // does not allocate device memory. + std::vector stripe_rows(mode == read_mode::READ_ALL ? stripe_count + : std::size_t{0}); + + // Fill up the `cumulative_size_and_row` array. + auto const stripe_sizes_rows_ptr = + mode == read_mode::CHUNKED_READ ? stripe_decomp_sizes.begin() : stripe_rows.data(); + for (std::size_t idx = 0; idx < stripe_count; ++idx) { + auto const& stripe = _file_itm_data.selected_stripes[idx + stripe_start]; + auto const stripe_info = stripe.stripe_info; + stripe_sizes_rows_ptr[idx] = + cumulative_size_and_row{1UL /*count*/, 0UL /*size_bytes*/, stripe_info->numberOfRows}; + } + + // This is the post-processing step after we've done with splitting `load_stripe_range` into + // `decode_stripe_ranges`. + auto const add_range_offset = [stripe_start](std::vector& new_ranges) { + // The split ranges always start from zero. + // We need to change these ranges to start from `stripe_start` which are the correct subranges + // of the current loaded stripe range. + for (auto& range : new_ranges) { + range.begin += stripe_start; + range.end += stripe_start; + } + }; + + // + // Optimized code path when we do not have any read limit but the number of rows in the + // loaded stripes exceeds column size limit. + // + if ((mode == read_mode::READ_ALL || _chunk_read_data.data_read_limit == 0) && + num_loading_rows >= static_cast(std::numeric_limits::max())) { + // Here we will split based on number of rows, not data size. + // Thus, we use a maximum possible value for size_limit. + _chunk_read_data.decode_stripe_ranges = find_splits( + cudf::host_span(stripe_sizes_rows_ptr, stripe_count), + stripe_count, + std::numeric_limits::max()); + add_range_offset(_chunk_read_data.decode_stripe_ranges); + return; + } + // // Split range of loaded stripes into subranges that can be decoded separately without blowing up // memory: @@ -534,20 +610,6 @@ void reader_impl::load_data() // decompression and decoding. stream_source_map stream_compinfo_map; - // For estimating the decompressed sizes of the loaded stripes. - cudf::detail::hostdevice_vector stripe_decomp_sizes(stripe_count, - _stream); - - // Number of rows in the loading stripes. - std::size_t num_loading_rows{0}; - - for (std::size_t idx = 0; idx < stripe_count; ++idx) { - auto const& stripe = _file_itm_data.selected_stripes[idx + stripe_start]; - auto const stripe_info = stripe.stripe_info; - stripe_decomp_sizes[idx] = cumulative_size_and_row{1, 0, stripe_info->numberOfRows}; - num_loading_rows += stripe_info->numberOfRows; - } - auto& compinfo_map = _file_itm_data.compinfo_map; for (std::size_t level = 0; level < num_levels; ++level) { @@ -608,23 +670,6 @@ void reader_impl::load_data() } } // end loop level - // Decoding range is reset to start from the first position in `decode_stripe_ranges`. - _chunk_read_data.curr_decode_stripe_range = 0; - - // Decode all loaded stripes if there is no read limit. - // In theory, we should just decode enough stripes for output one table chunk, instead of - // decoding all stripes like this, for better load-balancing and reduce memory usage. - // However, we do not know how many stripes are 'enough' because there is not any simple and - // cheap way to compute the exact decoded sizes of stripes without actually decoding them. 
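// A standalone sketch of the splitting technique that find_splits applies to the
// cumulative arrays built above: walk inclusive prefix sums and cut a new range
// whenever the pending size or row count would exceed its limit. The types and
// the greedy policy here are assumptions drawn from the surrounding code, not
// cudf's exact implementation (which binary-searches the prefix sums instead).
#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>

struct cum_size_row {
  std::size_t size_bytes;  // inclusive prefix-summed data size
  std::size_t rows;        // inclusive prefix-summed row count
};
struct split { std::size_t begin, end; };

std::vector<split> find_splits_sketch(std::vector<cum_size_row> const& cum,
                                      std::size_t size_limit)
{
  auto constexpr row_limit = static_cast<std::size_t>(std::numeric_limits<int32_t>::max());
  std::vector<split> out;
  if (cum.empty()) { return out; }
  std::size_t begin = 0, base_size = 0, base_rows = 0;
  for (std::size_t i = 0; i < cum.size(); ++i) {
    bool const over = cum[i].size_bytes - base_size > size_limit ||
                      cum[i].rows - base_rows > row_limit;
    if (over && i > begin) {  // always keep at least one element per range
      out.push_back({begin, i});
      begin     = i;
      base_size = cum[i - 1].size_bytes;
      base_rows = cum[i - 1].rows;
    }
  }
  out.push_back({begin, cum.size()});
  return out;
}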
- if (_chunk_read_data.data_read_limit == 0 && - // In addition to read limit, we also need to check if the the total number of - // rows in the loaded stripes exceeds column size limit. - // If that is the case, we cannot read all stripes at once. - num_loading_rows < static_cast(std::numeric_limits::max())) { - _chunk_read_data.decode_stripe_ranges = {load_stripe_range}; - return; - } - // Compute the prefix sum of stripe data sizes and rows. stripe_decomp_sizes.host_to_device_async(_stream); thrust::inclusive_scan(rmm::exec_policy_nosync(_stream), @@ -635,26 +680,16 @@ void reader_impl::load_data() stripe_decomp_sizes.device_to_host_sync(_stream); auto const decode_limit = [&] { - // In this case, we have no read limit but have to split due to having number of rows in loaded - // stripes exceeds column size limit. So we will split based on row number, not data size. - if (_chunk_read_data.data_read_limit == 0) { return std::numeric_limits::max(); } - - // If `data_read_limit` is too small, make sure not to pass 0 byte limit to `find_splits`. auto const tmp = static_cast(_chunk_read_data.data_read_limit * chunk_read_data::decode_limit_ratio); + // Make sure not to pass 0 byte limit to `find_splits`. return tmp > 0UL ? tmp : 1UL; }(); _chunk_read_data.decode_stripe_ranges = find_splits(stripe_decomp_sizes, stripe_count, decode_limit); - // The split ranges always start from zero. - // We need to change these ranges to start from `stripe_start` which are the correct subranges of - // the current loaded stripe range. - for (auto& range : _chunk_read_data.decode_stripe_ranges) { - range.begin += stripe_start; - range.end += stripe_start; - } + add_range_offset(_chunk_read_data.decode_stripe_ranges); } } // namespace cudf::io::orc::detail From b976d99d1d3e1b9806414e5ea1977ef0768b4d5e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 29 Mar 2024 09:58:02 -0700 Subject: [PATCH 266/321] Remove test Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cu | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index ba57145465e..39450bb2a9f 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1266,13 +1266,6 @@ TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList) input_limit{128 * 1024 * 1024UL}, expected); } - - // TODO: remove - { - int constexpr expected[] = {1, 1, 1}; - input_limit_test_read( - __LINE__, test_files, input, output_limit{0UL}, input_limit{0UL}, expected); - } } TEST_F(OrcChunkedReaderInputLimitTest, ReadWithRowSelection) From ec7303fbf1c6a2f833097e8047a4d6a285ae3736 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 29 Mar 2024 10:23:20 -0700 Subject: [PATCH 267/321] Add `read_mode` param to `decompress_and_decode`, and change comments Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 2 +- cpp/src/io/orc/reader_impl.hpp | 4 +++- cpp/src/io/orc/reader_impl_chunking.cu | 33 ++++++++++++++------------ cpp/src/io/orc/reader_impl_decode.cu | 2 +- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index f2c8b53fac3..566c8a059d8 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -44,7 +44,7 @@ void reader_impl::prepare_data(read_mode mode) // Only decompress/decode the loaded stripes if: // - There are loaded stripes that were not decoded yet, and // - All the decoded results were output. 
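// The decode_limit lambda simplified in the hunk above reduces to this clamp; a
// standalone sketch (the concrete ratio is only named, never defined, in these patches):
#include <algorithm>
#include <cstddef>

std::size_t decode_limit_sketch(std::size_t data_read_limit, double decode_limit_ratio)
{
  // Reserve a fraction of the read budget for decompressed data, and never hand
  // a 0-byte limit to the splitter.
  auto const scaled = static_cast<std::size_t>(data_read_limit * decode_limit_ratio);
  return std::max<std::size_t>(scaled, 1);
}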
-      decompress_and_decode();
+      decompress_and_decode(mode);
    }
  }
}
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index 17ce7c23a58..e8c3adfe1f9 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -144,8 +144,10 @@ class reader_impl {
   *
   * This function expects that the other preprocessing steps (`global_preprocess()` and
   * `load_data()`) have already been done.
+   *
+   * @param mode Value indicating if the data sources are read all at once or chunk by chunk
   */
-  void decompress_and_decode();
+  void decompress_and_decode(read_mode mode);

  /**
   * @brief Create the output table from the intermediate table and return it along with metadata.
diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 576f25fac6e..15ba652c4a4 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -351,6 +351,8 @@ void reader_impl::global_preprocess(read_mode mode)
  //
  // Accumulate data size for data streams in each stripe.
+  // This will be used for CHUNKED_READ mode only.
+  // If we are in READ_ALL mode, we do not need this since we just load all stripes.
  cudf::detail::hostdevice_vector<cumulative_size> total_stripe_sizes(
    mode == read_mode::CHUNKED_READ ? num_total_stripes : std::size_t{0}, _stream);

@@ -424,8 +426,7 @@ void reader_impl::global_preprocess(read_mode mode)
  // Load range is reset to start from the first position in `load_stripe_ranges`.
  _chunk_read_data.curr_load_stripe_range = 0;

-  // Load all stripes if there is no read limit.
-  // In addition, if we are not in CHUNKED_READ mode, we also load all stripes.
+  // Load all stripes if there is no read limit or if we are in READ_ALL mode.
  if (mode == read_mode::READ_ALL || _chunk_read_data.data_read_limit == 0) {
    _chunk_read_data.load_stripe_ranges = {range{0UL, num_total_stripes}};
    return;
@@ -536,33 +537,33 @@ void reader_impl::load_data(read_mode mode)
  // Decoding range is reset to start from the first position in `decode_stripe_ranges`.
  _chunk_read_data.curr_decode_stripe_range = 0;

-  // Decode all loaded stripes if there is no read limit, or if we are not in chunked_read mode.
-  // In theory, we should just decode enough stripes for outputting one table chunk, instead of
+  // Decode all loaded stripes if there is no read limit, or if we are in READ_ALL mode.
+  // In theory, we should just decode 'enough' stripes for outputting one table chunk, instead of
  // decoding all stripes like this, for better load-balancing and reduced memory usage.
-  // However, we do not know how many stripes are 'enough' because there is no simple and
-  // cheap way to compute the exact decoded sizes of stripes without actually decoding them.
+  // However, we do not have any good way to know how many stripes are 'enough'.
  if ((mode == read_mode::READ_ALL || _chunk_read_data.data_read_limit == 0) &&
      // In addition to read limit, we also need to check if the total number of
      // rows in the loaded stripes exceeds column size limit.
-      // If that is the case, we cannot read all stripes at once.
+      // If that is the case, we cannot decode all stripes at once.
      num_loading_rows < static_cast<std::size_t>(std::numeric_limits<size_type>::max())) {
    _chunk_read_data.decode_stripe_ranges = {load_stripe_range};
    return;
  }

  // For estimating the decompressed sizes of the loaded stripes.
-  // Only valid in CHUNKED_READ mode.
+  // Only used in CHUNKED_READ mode.
  cudf::detail::hostdevice_vector<cumulative_size_and_row> stripe_decomp_sizes(
    mode == read_mode::CHUNKED_READ ?
stripe_count : std::size_t{0}, _stream);

  // For mapping stripe to the number of rows in it.
-  // Only valid in READ_ALL mode.
+  // Only used in READ_ALL mode.
-  // This is similar to store exactly the same data as for `stripe_decomp_size` but
-  // does not allocate device memory.
+  // This is to store exactly the same data as for `stripe_decomp_size` above but here we do not
+  // need to allocate device memory.
  std::vector<cumulative_size_and_row> stripe_rows(mode == read_mode::READ_ALL ? stripe_count
                                                                               : std::size_t{0});

  // Fill up the `cumulative_size_and_row` array.
+  // Note: despite its name, `hostdevice_vector::begin()` mirrors `std::vector::data()` and
+  // returns a raw pointer.
  auto const stripe_sizes_rows_ptr =
    mode == read_mode::CHUNKED_READ ? stripe_decomp_sizes.begin() : stripe_rows.data();
@@ -584,14 +585,16 @@ void reader_impl::load_data(read_mode mode)
    }
  };

-  //
  // Optimized code path when we do not have any read limit but the number of rows in the
  // loaded stripes exceeds column size limit.
-  //
+  // Note that the values `max_uncompressed_size` for each stripe are not computed here.
+  // Instead, they will be computed on the fly during decoding to avoid the overhead of
+  // storing and retrieving from memory.
  if ((mode == read_mode::READ_ALL || _chunk_read_data.data_read_limit == 0) &&
      num_loading_rows >= static_cast<std::size_t>(std::numeric_limits<size_type>::max())) {
-    // Here we will split based on number of rows, not data size.
-    // Thus, we use a maximum possible value for size_limit.
+    // Here we will split stripe ranges based on stripes' number of rows, not their data size.
+    // Thus, we use a maximum possible value for data size limit.
+    // The function `find_splits` will automatically handle the row count limit.
    _chunk_read_data.decode_stripe_ranges = find_splits(
      cudf::host_span<cumulative_size_and_row const>(stripe_sizes_rows_ptr, stripe_count),
      stripe_count,
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 04808d6f0b0..90faedd0063 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -690,7 +690,7 @@ std::vector<range> find_table_splits(table_view const& input,

}  // namespace

-void reader_impl::decompress_and_decode()
+void reader_impl::decompress_and_decode(read_mode mode)
{
  if (_file_itm_data.has_no_data()) { return; }

From acd9689616df414dda50e5e6fea9708447e179ab Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Fri, 29 Mar 2024 10:48:18 -0700
Subject: [PATCH 268/321] Compute compinfo on the fly

Signed-off-by: Nghia Truong

---
 cpp/src/io/orc/reader_impl_decode.cu | 39 +++++++++++++++++++++------- 
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 90faedd0063..a33c31168a8 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -58,7 +58,7 @@ namespace {
 *
 * @param loaded_stripe_range Range of stripes that are already loaded in memory
 * @param stream_range Range of streams to be decoded
- * @param num_decoded_stripes Number of stripes that the decoding streams belong to
+ * @param num_decode_stripes Number of stripes that the decoding streams belong to
 * @param compinfo_map A map to lookup compression info of streams
 * @param decompressor Block decompressor
 * @param stripe_data List of source stripe column data
@@ -84,6 +84,9 @@ rmm::device_buffer decompress_stripe_data(
  bool use_base_stride,
  rmm::cuda_stream_view stream)
{
+  // Whether we have the compression info precomputed.
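// For reference, the shape of the cache consulted when that info is precomputed;
// a sketch with a simplified key type standing in for the real stream_source_info.
#include <cstddef>
#include <cstdint>
#include <unordered_map>

struct comp_info_sketch {
  std::size_t num_compressed_blocks;
  std::size_t num_uncompressed_blocks;
  std::size_t total_decomp_size;
};
using stream_key = std::uint64_t;  // stand-in for the (stripe, level, column, kind) key

// Filled once while loading (from parsed stream headers), then looked up during
// decoding so the same headers are not parsed a second time.
using comp_info_map_sketch = std::unordered_map<stream_key, comp_info_sketch>;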
+ auto const compinfo_ready = compinfo_map.size() > 0; + // Count the exact number of compressed blocks std::size_t num_compressed_blocks = 0; std::size_t num_uncompressed_blocks = 0; @@ -101,15 +104,33 @@ rmm::device_buffer decompress_stripe_data( info.dst_pos, info.length)); - auto const& cached_comp_info = compinfo_map.at(info.source); - auto& stream_comp_info = compinfo.back(); - stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; - stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks; - stream_comp_info.max_uncompressed_size = cached_comp_info.total_decomp_size; + if (compinfo_ready) { + auto const& cached_comp_info = compinfo_map.at(info.source); + auto& stream_comp_info = compinfo.back(); + stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; + stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks; + stream_comp_info.max_uncompressed_size = cached_comp_info.total_decomp_size; + + num_compressed_blocks += cached_comp_info.num_compressed_blocks; + num_uncompressed_blocks += cached_comp_info.num_uncompressed_blocks; + total_decomp_size += cached_comp_info.total_decomp_size; + } + } - num_compressed_blocks += cached_comp_info.num_compressed_blocks; - num_uncompressed_blocks += cached_comp_info.num_uncompressed_blocks; - total_decomp_size += cached_comp_info.total_decomp_size; + if (!compinfo_ready) { + compinfo.host_to_device_async(stream); + gpu::ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + stream); + compinfo.device_to_host_sync(stream); + + for (std::size_t i = 0; i < compinfo.size(); ++i) { + num_compressed_blocks += compinfo[i].num_compressed_blocks; + num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks; + total_decomp_size += compinfo[i].max_uncompressed_size; + } } CUDF_EXPECTS( From b9f07a253d3f6381a0585702773902019f396e66 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 29 Mar 2024 12:50:14 -0700 Subject: [PATCH 269/321] Separate `compinfo_map` into levels Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 18 +++++++++--------- cpp/src/io/orc/reader_impl_chunking.hpp | 10 ++++------ cpp/src/io/orc/reader_impl_decode.cu | 2 +- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 15ba652c4a4..6e92aa0511a 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -121,11 +121,10 @@ std::size_t gather_stream_info_and_column_desc( (*local_stream_order)++; } else { // not chunks.has_value() - stream_info.value()->emplace_back( - stripeinfo->offset + src_offset, - dst_offset, - stream.length, - stream_source_info{stripe_order, level, column_id, stream.kind}); + stream_info.value()->emplace_back(stripeinfo->offset + src_offset, + dst_offset, + stream.length, + stream_source_info{stripe_order, column_id, stream.kind}); } dst_offset += stream.length; @@ -275,6 +274,7 @@ void reader_impl::global_preprocess(read_mode mode) auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; auto& lvl_stripe_sizes = _file_itm_data.lvl_stripe_sizes; + auto& lvl_compinfo_map = _file_itm_data.lvl_compinfo_map; auto& lvl_stream_info = _file_itm_data.lvl_stream_info; auto& lvl_stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges; auto& lvl_column_types = _file_itm_data.lvl_column_types; @@ -282,6 +282,7 @@ 
void reader_impl::global_preprocess(read_mode mode) lvl_stripe_data.resize(num_levels); lvl_stripe_sizes.resize(num_levels); + lvl_compinfo_map.resize(num_levels); lvl_stream_info.resize(num_levels); lvl_stripe_stream_ranges.resize(num_levels); lvl_column_types.resize(num_levels); @@ -613,8 +614,6 @@ void reader_impl::load_data(read_mode mode) // decompression and decoding. stream_source_map stream_compinfo_map; - auto& compinfo_map = _file_itm_data.compinfo_map; - for (std::size_t level = 0; level < num_levels; ++level) { auto const& stream_info = _file_itm_data.lvl_stream_info[level]; auto const num_columns = _selected_columns.levels[level].size(); @@ -641,8 +640,7 @@ void reader_impl::load_data(read_mode mode) compinfo.push_back(gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length)); stream_compinfo_map[stream_source_info{ - info.source.stripe_idx, info.source.level, info.source.orc_col_idx, info.source.kind}] = - &compinfo.back(); + info.source.stripe_idx, info.source.orc_col_idx, info.source.kind}] = &compinfo.back(); } compinfo.host_to_device_async(_stream); @@ -653,6 +651,8 @@ void reader_impl::load_data(read_mode mode) _stream); compinfo.device_to_host_sync(_stream); + auto& compinfo_map = _file_itm_data.lvl_compinfo_map[level]; + compinfo_map.clear(); // clear cache of the last load for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) { // Cache these parsed numbers so they can be reused in the decompression/decoding step. compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 5f958d6d73f..3b193f13441 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -33,7 +33,6 @@ namespace cudf::io::orc::detail { */ struct stream_source_info { std::size_t stripe_idx; // global stripe id throughout all data sources - std::size_t level; // level of the nested column uint32_t orc_col_idx; // orc column id StreamKind kind; // stream kind @@ -41,16 +40,15 @@ struct stream_source_info { std::size_t operator()(stream_source_info const& id) const { auto const hasher = std::hash{}; - return hasher(id.stripe_idx) ^ hasher(id.level) ^ - hasher(static_cast(id.orc_col_idx)) ^ + return hasher(id.stripe_idx) ^ hasher(static_cast(id.orc_col_idx)) ^ hasher(static_cast(id.kind)); } }; struct equal_to { bool operator()(stream_source_info const& lhs, stream_source_info const& rhs) const { - return lhs.stripe_idx == rhs.stripe_idx && lhs.level == rhs.level && - lhs.orc_col_idx == rhs.orc_col_idx && lhs.kind == rhs.kind; + return lhs.stripe_idx == rhs.stripe_idx && lhs.orc_col_idx == rhs.orc_col_idx && + lhs.kind == rhs.kind; } }; }; @@ -144,7 +142,7 @@ struct file_intermediate_data { std::vector stripe_data_read_ranges; // Store the compression information for each data stream. - stream_source_map compinfo_map; + std::vector> lvl_compinfo_map; // Store info for each ORC stream at each nested level. 
std::vector> lvl_stream_info; diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index a33c31168a8..bc25f716640 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -925,7 +925,7 @@ void reader_impl::decompress_and_decode(read_mode mode) auto decomp_data = decompress_stripe_data(load_stripe_range, stream_range, stripe_count, - _file_itm_data.compinfo_map, + _file_itm_data.lvl_compinfo_map[level], *_metadata.per_file_metadata[0].decompressor, stripe_data, stream_info, From ab1afdcc4541d020d9b7d3fada64216223a49d1a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 29 Mar 2024 13:18:57 -0700 Subject: [PATCH 270/321] Revert "Separate `compinfo_map` into levels" This reverts commit b9f07a253d3f6381a0585702773902019f396e66. --- cpp/src/io/orc/reader_impl_chunking.cu | 18 +++++++++--------- cpp/src/io/orc/reader_impl_chunking.hpp | 10 ++++++---- cpp/src/io/orc/reader_impl_decode.cu | 2 +- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 6e92aa0511a..15ba652c4a4 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -121,10 +121,11 @@ std::size_t gather_stream_info_and_column_desc( (*local_stream_order)++; } else { // not chunks.has_value() - stream_info.value()->emplace_back(stripeinfo->offset + src_offset, - dst_offset, - stream.length, - stream_source_info{stripe_order, column_id, stream.kind}); + stream_info.value()->emplace_back( + stripeinfo->offset + src_offset, + dst_offset, + stream.length, + stream_source_info{stripe_order, level, column_id, stream.kind}); } dst_offset += stream.length; @@ -274,7 +275,6 @@ void reader_impl::global_preprocess(read_mode mode) auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; auto& lvl_stripe_sizes = _file_itm_data.lvl_stripe_sizes; - auto& lvl_compinfo_map = _file_itm_data.lvl_compinfo_map; auto& lvl_stream_info = _file_itm_data.lvl_stream_info; auto& lvl_stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges; auto& lvl_column_types = _file_itm_data.lvl_column_types; @@ -282,7 +282,6 @@ void reader_impl::global_preprocess(read_mode mode) lvl_stripe_data.resize(num_levels); lvl_stripe_sizes.resize(num_levels); - lvl_compinfo_map.resize(num_levels); lvl_stream_info.resize(num_levels); lvl_stripe_stream_ranges.resize(num_levels); lvl_column_types.resize(num_levels); @@ -614,6 +613,8 @@ void reader_impl::load_data(read_mode mode) // decompression and decoding. 
stream_source_map stream_compinfo_map; + auto& compinfo_map = _file_itm_data.compinfo_map; + for (std::size_t level = 0; level < num_levels; ++level) { auto const& stream_info = _file_itm_data.lvl_stream_info[level]; auto const num_columns = _selected_columns.levels[level].size(); @@ -640,7 +641,8 @@ void reader_impl::load_data(read_mode mode) compinfo.push_back(gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length)); stream_compinfo_map[stream_source_info{ - info.source.stripe_idx, info.source.orc_col_idx, info.source.kind}] = &compinfo.back(); + info.source.stripe_idx, info.source.level, info.source.orc_col_idx, info.source.kind}] = + &compinfo.back(); } compinfo.host_to_device_async(_stream); @@ -651,8 +653,6 @@ void reader_impl::load_data(read_mode mode) _stream); compinfo.device_to_host_sync(_stream); - auto& compinfo_map = _file_itm_data.lvl_compinfo_map[level]; - compinfo_map.clear(); // clear cache of the last load for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) { // Cache these parsed numbers so they can be reused in the decompression/decoding step. compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 3b193f13441..5f958d6d73f 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -33,6 +33,7 @@ namespace cudf::io::orc::detail { */ struct stream_source_info { std::size_t stripe_idx; // global stripe id throughout all data sources + std::size_t level; // level of the nested column uint32_t orc_col_idx; // orc column id StreamKind kind; // stream kind @@ -40,15 +41,16 @@ struct stream_source_info { std::size_t operator()(stream_source_info const& id) const { auto const hasher = std::hash{}; - return hasher(id.stripe_idx) ^ hasher(static_cast(id.orc_col_idx)) ^ + return hasher(id.stripe_idx) ^ hasher(id.level) ^ + hasher(static_cast(id.orc_col_idx)) ^ hasher(static_cast(id.kind)); } }; struct equal_to { bool operator()(stream_source_info const& lhs, stream_source_info const& rhs) const { - return lhs.stripe_idx == rhs.stripe_idx && lhs.orc_col_idx == rhs.orc_col_idx && - lhs.kind == rhs.kind; + return lhs.stripe_idx == rhs.stripe_idx && lhs.level == rhs.level && + lhs.orc_col_idx == rhs.orc_col_idx && lhs.kind == rhs.kind; } }; }; @@ -142,7 +144,7 @@ struct file_intermediate_data { std::vector stripe_data_read_ranges; // Store the compression information for each data stream. - std::vector> lvl_compinfo_map; + stream_source_map compinfo_map; // Store info for each ORC stream at each nested level. 
std::vector> lvl_stream_info; diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index bc25f716640..a33c31168a8 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -925,7 +925,7 @@ void reader_impl::decompress_and_decode(read_mode mode) auto decomp_data = decompress_stripe_data(load_stripe_range, stream_range, stripe_count, - _file_itm_data.lvl_compinfo_map[level], + _file_itm_data.compinfo_map, *_metadata.per_file_metadata[0].decompressor, stripe_data, stream_info, From bf5b11133ebd9a5cac709396940f91e7c24dceab Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 29 Mar 2024 13:21:19 -0700 Subject: [PATCH 271/321] Simplify code Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 15ba652c4a4..a592af3d88d 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -640,9 +640,7 @@ void reader_impl::load_data(read_mode mode) static_cast(stripe_data[info.source.stripe_idx - stripe_start].data()); compinfo.push_back(gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length)); - stream_compinfo_map[stream_source_info{ - info.source.stripe_idx, info.source.level, info.source.orc_col_idx, info.source.kind}] = - &compinfo.back(); + stream_compinfo_map[info.source] = &compinfo.back(); } compinfo.host_to_device_async(_stream); From 4d01ad754224fcfe673f4da35b46cd2f03e23cd5 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 29 Mar 2024 13:47:48 -0700 Subject: [PATCH 272/321] Remove local `stream_compinfo_map` Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 29 ++++++++++---------------- 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index a592af3d88d..1af0ad478ed 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -608,12 +608,8 @@ void reader_impl::load_data(read_mode mode) // memory: // - // A map from a stripe sources into `CompressedStreamInfo*` pointers. - // These pointers are then used to retrieve stripe/level decompressed sizes for later - // decompression and decoding. - stream_source_map stream_compinfo_map; - auto& compinfo_map = _file_itm_data.compinfo_map; + compinfo_map.clear(); // clear cache of the last load for (std::size_t level = 0; level < num_levels; ++level) { auto const& stream_info = _file_itm_data.lvl_stream_info[level]; @@ -630,19 +626,15 @@ void reader_impl::load_data(read_mode mode) if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; - // Cannot be cached as-is, since this is for streams in the current loaded stripe range, - // while the decompression/decoding step would probably use just a subrange of it. cudf::detail::hostdevice_vector compinfo(0, num_streams, _stream); - for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { auto const& info = stream_info[stream_idx]; auto const dst_base = static_cast(stripe_data[info.source.stripe_idx - stripe_start].data()); - compinfo.push_back(gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length)); - stream_compinfo_map[info.source] = &compinfo.back(); } + // Estimate the uncompressed data. 
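// What this estimation step recovers from the raw bytes: per the ORC format, every
// compression chunk is prefixed with a 3-byte little-endian header that encodes
// (chunk_length << 1) | is_uncompressed, which is enough to count blocks and to
// bound each chunk's decompressed size by the block size from the file postscript.
// A host-side sketch of that header decode (the GPU kernel below performs the same
// walk in parallel):
#include <cstddef>
#include <cstdint>

struct orc_block_header_sketch {
  std::size_t compressed_length;
  bool is_uncompressed;  // chunk stored verbatim; copied instead of decompressed
};

inline orc_block_header_sketch parse_orc_block_header(std::uint8_t const* p)
{
  auto const raw = static_cast<std::uint32_t>(p[0]) |
                   (static_cast<std::uint32_t>(p[1]) << 8) |
                   (static_cast<std::uint32_t>(p[2]) << 16);
  return {raw >> 1, (raw & 1u) != 0};
}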
compinfo.host_to_device_async(_stream); gpu::ParseCompressedStripeData(compinfo.device_ptr(), compinfo.size(), @@ -651,17 +643,18 @@ void reader_impl::load_data(read_mode mode) _stream); compinfo.device_to_host_sync(_stream); - for (auto& [stream_id, stream_compinfo] : stream_compinfo_map) { + for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { + auto const& info = stream_info[stream_idx]; + auto const stream_compinfo = compinfo[stream_idx - stream_range.begin]; + // Cache these parsed numbers so they can be reused in the decompression/decoding step. - compinfo_map[stream_id] = {stream_compinfo->num_compressed_blocks, - stream_compinfo->num_uncompressed_blocks, - stream_compinfo->max_uncompressed_size}; - stripe_decomp_sizes[stream_id.stripe_idx - stripe_start].size_bytes += - stream_compinfo->max_uncompressed_size; + compinfo_map[info.source] = {stream_compinfo.num_compressed_blocks, + stream_compinfo.num_uncompressed_blocks, + stream_compinfo.max_uncompressed_size}; + stripe_decomp_sizes[info.source.stripe_idx - stripe_start].size_bytes += + stream_compinfo.max_uncompressed_size; } - // Important: must clear this map to reuse the (empty) map for processing the next level. - stream_compinfo_map.clear(); } else { // no decompression // Set decompression sizes equal to the input sizes. for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { From a06cf49caf8412258edb2a52de039a0f65953764 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 29 Mar 2024 14:00:51 -0700 Subject: [PATCH 273/321] Optimize hashing by combining `orc_col_idx` and `kind` Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 5f958d6d73f..1f368e1211b 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -40,10 +40,10 @@ struct stream_source_info { struct hash { std::size_t operator()(stream_source_info const& id) const { + auto const col_kind = + static_cast(id.orc_col_idx) | (static_cast(id.kind) << 32); auto const hasher = std::hash{}; - return hasher(id.stripe_idx) ^ hasher(id.level) ^ - hasher(static_cast(id.orc_col_idx)) ^ - hasher(static_cast(id.kind)); + return hasher(id.stripe_idx) ^ hasher(id.level) ^ hasher(col_kind); } }; struct equal_to { From 54ed4fdcd60b85a8085cb255256061ed77869ff8 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 29 Mar 2024 14:21:01 -0700 Subject: [PATCH 274/321] Optimize by using one array of compinfo for all levels Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 1af0ad478ed..57e4b4c82a4 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -18,6 +18,7 @@ #include "io/orc/reader_impl.hpp" #include "io/orc/reader_impl_chunking.hpp" #include "io/orc/reader_impl_helpers.hpp" +#include "io/utilities/hostdevice_span.hpp" #include #include @@ -611,6 +612,19 @@ void reader_impl::load_data(read_mode mode) auto& compinfo_map = _file_itm_data.compinfo_map; compinfo_map.clear(); // clear cache of the last load + // Find the maximum number of streams in all levels of the loaded stripes. 
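// A fully-typed sketch of the combined-key hash introduced in the hunk above (the
// template arguments were lost to formatting; std::size_t is assumed throughout):
#include <cstddef>
#include <cstdint>
#include <functional>

inline std::size_t hash_stream_source_sketch(std::size_t stripe_idx,
                                             std::size_t level,
                                             std::uint32_t orc_col_idx,
                                             std::uint32_t kind)
{
  // Packing the 32-bit column id and the stream kind into one 64-bit word halves
  // the number of std::hash invocations. XOR-combining is cheap but commutative,
  // hence weak in general; it is acceptable here because the key fields occupy
  // distinct value ranges in practice.
  auto const col_kind =
    static_cast<std::size_t>(orc_col_idx) | (static_cast<std::size_t>(kind) << 32);
  auto const hasher = std::hash<std::size_t>{};
  return hasher(stripe_idx) ^ hasher(level) ^ hasher(col_kind);
}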
+ auto const max_num_streams = [&] { + std::size_t max_count{0}; + for (std::size_t level = 0; level < num_levels; ++level) { + auto const stream_range = + get_range(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range); + auto const num_streams = stream_range.end - stream_range.begin; + max_count = std::max(max_count, num_streams); + } + return max_count; + }(); + cudf::detail::hostdevice_vector hd_compinfo(max_num_streams, _stream); + for (std::size_t level = 0; level < num_levels; ++level) { auto const& stream_info = _file_itm_data.lvl_stream_info[level]; auto const num_columns = _selected_columns.levels[level].size(); @@ -626,12 +640,14 @@ void reader_impl::load_data(read_mode mode) if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; - cudf::detail::hostdevice_vector compinfo(0, num_streams, _stream); + auto compinfo = cudf::detail::hostdevice_span( + hd_compinfo.begin(), hd_compinfo.d_begin(), num_streams); for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { auto const& info = stream_info[stream_idx]; auto const dst_base = static_cast(stripe_data[info.source.stripe_idx - stripe_start].data()); - compinfo.push_back(gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length)); + compinfo[stream_idx - stream_range.begin] = + gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length); } // Estimate the uncompressed data. From 0447271eebf931e71f0f63a97df0bade939198fc Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 29 Mar 2024 14:42:38 -0700 Subject: [PATCH 275/321] Use only one array of compinfo for all levels Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 24 ++++++++++-------- cpp/src/io/orc/reader_impl_decode.cu | 34 +++++++++++++++++++++----- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 57e4b4c82a4..406e9558dae 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -612,18 +612,22 @@ void reader_impl::load_data(read_mode mode) auto& compinfo_map = _file_itm_data.compinfo_map; compinfo_map.clear(); // clear cache of the last load - // Find the maximum number of streams in all levels of the loaded stripes. - auto const max_num_streams = [&] { - std::size_t max_count{0}; - for (std::size_t level = 0; level < num_levels; ++level) { - auto const stream_range = - get_range(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range); - auto const num_streams = stream_range.end - stream_range.begin; - max_count = std::max(max_count, num_streams); + // For parsing decompression data. + // We create an array that is large enough to use for all levels, thus only need to allocate + // memory once. + auto hd_compinfo = [&] { + std::size_t max_num_streams{0}; + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + // Find the maximum number of streams in all levels of the loaded stripes. 
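// The immediately-invoked lambda pattern used here, isolated: run arbitrary logic
// while keeping the result const (names in this sketch are illustrative only).
#include <algorithm>
#include <cstddef>
#include <vector>

inline std::size_t max_element_sketch(std::vector<std::size_t> const& counts)
{
  auto const max_count = [&] {
    std::size_t m{0};
    for (auto const c : counts) { m = std::max(m, c); }
    return m;
  }();  // invoked on the spot, so max_count is never left uninitialized
  return max_count;
}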
+ for (std::size_t level = 0; level < num_levels; ++level) { + auto const stream_range = + get_range(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range); + auto const num_streams = stream_range.end - stream_range.begin; + max_num_streams = std::max(max_num_streams, num_streams); + } } - return max_count; + return cudf::detail::hostdevice_vector(max_num_streams, _stream); }(); - cudf::detail::hostdevice_vector hd_compinfo(max_num_streams, _stream); for (std::size_t level = 0; level < num_levels; ++level) { auto const& stream_info = _file_itm_data.lvl_stream_info[level]; diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index a33c31168a8..2221ef66fa1 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -20,6 +20,7 @@ #include "io/orc/reader_impl_chunking.hpp" #include "io/orc/reader_impl_helpers.hpp" #include "io/utilities/config_utils.hpp" +#include "io/utilities/hostdevice_span.hpp" #include #include @@ -74,6 +75,7 @@ rmm::device_buffer decompress_stripe_data( range const& loaded_stripe_range, range const& stream_range, std::size_t num_decode_stripes, + cudf::detail::hostdevice_span compinfo, stream_source_map const& compinfo_map, OrcDecompressor const& decompressor, host_span stripe_data, @@ -92,21 +94,18 @@ rmm::device_buffer decompress_stripe_data( std::size_t num_uncompressed_blocks = 0; std::size_t total_decomp_size = 0; - auto const num_streams = stream_range.end - stream_range.begin; - cudf::detail::hostdevice_vector compinfo(0, num_streams, stream); - for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { auto const& info = stream_info[stream_idx]; - compinfo.push_back(gpu::CompressedStreamInfo( + auto& stream_comp_info = compinfo[stream_idx - stream_range.begin]; + stream_comp_info = gpu::CompressedStreamInfo( static_cast( stripe_data[info.source.stripe_idx - loaded_stripe_range.begin].data()) + info.dst_pos, - info.length)); + info.length); if (compinfo_ready) { auto const& cached_comp_info = compinfo_map.at(info.source); - auto& stream_comp_info = compinfo.back(); stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks; stream_comp_info.max_uncompressed_size = cached_comp_info.total_decomp_size; @@ -759,15 +758,35 @@ void reader_impl::decompress_and_decode(read_mode mode) // Column descriptors ('chunks'). // Each 'chunk' of data here corresponds to an orc column, in a stripe, at a nested level. + // Unfortunately we cannot create one hostdevice_vector to use for all levels because + // currently we do not have hostdevice_2dspan exists. std::vector> lvl_chunks(num_levels); // For computing null count. std::vector>> null_count_prefix_sums(num_levels); + // For parsing decompression data. + // We create one hostdevice_vector that is large enough to use for all levels, + // thus only need to allocate memory once. + auto hd_compinfo = [&] { + std::size_t max_num_streams{0}; + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + // Find the maximum number of streams in all levels of the decoding stripes. 
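// The buffer-reuse pattern above in plain terms: size one allocation for the worst
// case across levels, then give each level a prefix view of that storage. Plain
// pointers stand in for cudf's hostdevice_vector/hostdevice_span in this sketch.
#include <cassert>
#include <cstddef>
#include <vector>

template <typename T>
struct view_sketch {
  T* data;
  std::size_t size;
};

template <typename T>
view_sketch<T> level_view(std::vector<T>& shared, std::size_t level_count)
{
  assert(level_count <= shared.size());  // shared was sized to the maximum level
  return {shared.data(), level_count};
}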
+ for (std::size_t level = 0; level < num_levels; ++level) { + auto const stream_range = + get_range(_file_itm_data.lvl_stripe_stream_ranges[level], stripe_range); + auto const num_streams = stream_range.end - stream_range.begin; + max_num_streams = std::max(max_num_streams, num_streams); + } + } + return cudf::detail::hostdevice_vector{max_num_streams, _stream}; + }(); + auto& col_meta = *_col_meta; for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto const& stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges[level]; auto const stream_range = get_range(stripe_stream_ranges, stripe_range); + auto const num_streams = stream_range.end - stream_range.begin; auto const& columns_level = _selected_columns.levels[level]; auto const& stream_info = _file_itm_data.lvl_stream_info[level]; @@ -922,9 +941,12 @@ void reader_impl::decompress_and_decode(read_mode mode) // Setup row group descriptors if using indexes. if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + auto compinfo = cudf::detail::hostdevice_span( + hd_compinfo.begin(), hd_compinfo.d_begin(), num_streams); auto decomp_data = decompress_stripe_data(load_stripe_range, stream_range, stripe_count, + compinfo, _file_itm_data.compinfo_map, *_metadata.per_file_metadata[0].decompressor, stripe_data, From 33c92a97edca3472700e0f20a0be5a7c73d41251 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 29 Mar 2024 16:23:02 -0700 Subject: [PATCH 276/321] Use only one `device_buffer` for storing all stripe data Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 45 ++++++++----- cpp/src/io/orc/reader_impl_chunking.hpp | 12 ++-- cpp/src/io/orc/reader_impl_decode.cu | 85 +++++++++++++------------ 3 files changed, 80 insertions(+), 62 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 406e9558dae..acd62b874af 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -274,14 +274,12 @@ void reader_impl::global_preprocess(read_mode mode) auto& stripe_data_read_ranges = _file_itm_data.stripe_data_read_ranges; stripe_data_read_ranges.resize(num_total_stripes); - auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; auto& lvl_stripe_sizes = _file_itm_data.lvl_stripe_sizes; auto& lvl_stream_info = _file_itm_data.lvl_stream_info; auto& lvl_stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges; auto& lvl_column_types = _file_itm_data.lvl_column_types; auto& lvl_nested_cols = _file_itm_data.lvl_nested_cols; - lvl_stripe_data.resize(num_levels); lvl_stripe_sizes.resize(num_levels); lvl_stream_info.resize(num_levels); lvl_stripe_stream_ranges.resize(num_levels); @@ -462,22 +460,28 @@ void reader_impl::load_data(read_mode mode) auto const stripe_start = load_stripe_range.begin; auto const stripe_end = load_stripe_range.end; auto const stripe_count = stripe_end - stripe_start; + auto const num_levels = _selected_columns.num_levels(); - auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; - auto const num_levels = _selected_columns.num_levels(); + auto& stripe_data_offsets = _file_itm_data.stripe_data_offsets; + stripe_data_offsets.resize(0); + stripe_data_offsets.reserve(num_levels * stripe_count); + stripe_data_offsets.push_back(0); + std::size_t offset{0}; - // Prepare the buffer to read raw data onto. + // Compute the offsets for the memory segments storing data of each stripe. 
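// The level-major indexing used for stripe_data_offsets below, isolated: a single
// offsets array addresses every (level, stripe) segment of the one backing buffer.
#include <cstddef>

inline std::size_t segment_offset_index(std::size_t level,
                                        std::size_t stripe_count,
                                        std::size_t stripe_idx,
                                        std::size_t stripe_start)
{
  return level * stripe_count + (stripe_idx - stripe_start);
}
// offsets[i] is a segment's begin and offsets[i + 1] - offsets[i] its size, so the
// array holds num_levels * stripe_count + 1 entries, counting the leading zero.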
for (std::size_t level = 0; level < num_levels; ++level) { - auto& stripe_data = lvl_stripe_data[level]; - stripe_data.resize(stripe_count); - for (std::size_t idx = 0; idx < stripe_count; ++idx) { - auto const stripe_size = _file_itm_data.lvl_stripe_sizes[level][idx + stripe_start]; - stripe_data[idx] = rmm::device_buffer( - cudf::util::round_up_safe(stripe_size, BUFFER_PADDING_MULTIPLE), _stream); + auto const stripe_size = _file_itm_data.lvl_stripe_sizes[level][idx + stripe_start]; + auto const stripe_data_size = cudf::util::round_up_safe(stripe_size, BUFFER_PADDING_MULTIPLE); + offset += stripe_data_size; + stripe_data_offsets.push_back(offset); } } + // Now we have the total data size of all stripes. Just create one buffer to load all data into. + auto& stripe_data = _file_itm_data.stripe_data; + stripe_data = rmm::device_buffer(stripe_data_offsets.back(), _stream); + // // Load stripe data into memory: // @@ -497,8 +501,11 @@ void reader_impl::load_data(read_mode mode) for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) { auto const& read_info = _file_itm_data.data_read_info[read_idx]; auto const source_ptr = _metadata.per_file_metadata[read_info.source_idx].source; - auto const dst_base = static_cast( - lvl_stripe_data[read_info.level][read_info.stripe_idx - stripe_start].data()); + + // `offset_idx` is the flattened index of the stripe offset in `stripe_data_offsets`. + auto const offset_idx = read_info.level * stripe_count + read_info.stripe_idx - stripe_start; + auto const stripe_offset = stripe_data_offsets[offset_idx]; + auto const dst_base = static_cast(stripe_data.data()) + stripe_offset; if (source_ptr->is_device_read_preferred(read_info.length)) { read_tasks.push_back( @@ -633,8 +640,9 @@ void reader_impl::load_data(read_mode mode) auto const& stream_info = _file_itm_data.lvl_stream_info[level]; auto const num_columns = _selected_columns.levels[level].size(); - auto& stripe_data = lvl_stripe_data[level]; - if (stripe_data.empty()) { continue; } + auto const level_data_size = + stripe_data_offsets[(level + 1) * stripe_count] - stripe_data_offsets[level * stripe_count]; + if (level_data_size == 0) { continue; } // Range of all streams in the loaded stripes. auto const stream_range = @@ -648,8 +656,11 @@ void reader_impl::load_data(read_mode mode) hd_compinfo.begin(), hd_compinfo.d_begin(), num_streams); for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { auto const& info = stream_info[stream_idx]; - auto const dst_base = - static_cast(stripe_data[info.source.stripe_idx - stripe_start].data()); + + // `offset_idx` is the flattened index of the stripe offset in `stripe_data_offsets`. + auto const offset_idx = level * stripe_count + info.source.stripe_idx - stripe_start; + auto const stripe_offset = stripe_data_offsets[offset_idx]; + auto const dst_base = static_cast(stripe_data.data()) + stripe_offset; compinfo[stream_idx - stream_range.begin] = gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length); } diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 1f368e1211b..cdc6faaf172 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -143,6 +143,13 @@ struct file_intermediate_data { // Those reads are identified by a chunk of consecutive read info, stored in data_read_info. std::vector stripe_data_read_ranges; + // The buffers to store raw data read from disk, initialized each time calling to `load_data()`. 
+ // After decoding, such buffers can be released. + rmm::device_buffer stripe_data; + + // Offsets into the buffer `stripe_data` for each loaded stripe. + std::vector stripe_data_offsets; + // Store the compression information for each data stream. stream_source_map compinfo_map; @@ -153,11 +160,6 @@ struct file_intermediate_data { // This is used to identify the range of streams for each stripe from that vector. std::vector> lvl_stripe_stream_ranges; - // The buffers to store raw data read from disk, initialized for each reading stripe chunks. - // After decoding, such buffers can be released. - // This can only be implemented after chunked output is ready. - std::vector> lvl_stripe_data; - // Store the size of each stripe at each nested level. // This is used to initialize the stripe_data buffers. std::vector> lvl_stripe_sizes; diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 2221ef66fa1..948925b50c9 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -62,7 +62,7 @@ namespace { * @param num_decode_stripes Number of stripes that the decoding streams belong to * @param compinfo_map A map to lookup compression info of streams * @param decompressor Block decompressor - * @param stripe_data List of source stripe column data + * @param stripe_data Stripe column data * @param stream_info List of stream to column mappings * @param chunks Vector of list of column chunk descriptors * @param row_groups Vector of list of row index descriptors @@ -70,6 +70,8 @@ namespace { * @param use_base_stride Whether to use base stride obtained from meta or use the computed value * @param stream CUDA stream used for device memory operations and kernel launches * @return Device buffer to decompressed data + * + * // TODO: add missing params */ rmm::device_buffer decompress_stripe_data( range const& loaded_stripe_range, @@ -78,7 +80,9 @@ rmm::device_buffer decompress_stripe_data( cudf::detail::hostdevice_span compinfo, stream_source_map const& compinfo_map, OrcDecompressor const& decompressor, - host_span stripe_data, + device_span stripe_data, + host_span stripe_data_offsets, + std::size_t offset_idx_start, host_span stream_info, cudf::detail::hostdevice_2dvector& chunks, cudf::detail::hostdevice_2dvector& row_groups, @@ -98,11 +102,11 @@ rmm::device_buffer decompress_stripe_data( auto const& info = stream_info[stream_idx]; auto& stream_comp_info = compinfo[stream_idx - stream_range.begin]; - stream_comp_info = gpu::CompressedStreamInfo( - static_cast( - stripe_data[info.source.stripe_idx - loaded_stripe_range.begin].data()) + - info.dst_pos, - info.length); + + auto const offset_idx = offset_idx_start + info.source.stripe_idx - loaded_stripe_range.begin; + auto const stripe_offset = stripe_data_offsets[offset_idx]; + auto const dst_base = &stripe_data.data()[stripe_offset]; + stream_comp_info = gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length); if (compinfo_ready) { auto const& cached_comp_info = compinfo_map.at(info.source); @@ -725,6 +729,7 @@ void reader_impl::decompress_and_decode(read_mode mode) // The start index of loaded stripes. They are different from decoding stripes. 
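// How the two kinds of ranges relate: each decode range is a subrange of the
// currently loaded range, so buffer lookups translate a global stripe index into
// a load-relative one. A sketch with an illustrative range type:
#include <cassert>
#include <cstddef>

struct range_sketch { std::size_t begin, end; };

inline std::size_t load_relative_index(std::size_t global_stripe_idx, range_sketch loaded)
{
  assert(loaded.begin <= global_stripe_idx && global_stripe_idx < loaded.end);
  return global_stripe_idx - loaded.begin;
}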
auto const load_stripe_range = _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1]; + auto const load_stripe_count = load_stripe_range.end - load_stripe_range.begin; auto const load_stripe_start = load_stripe_range.begin; auto const rows_to_skip = _file_itm_data.rows_to_skip; @@ -782,7 +787,13 @@ void reader_impl::decompress_and_decode(read_mode mode) return cudf::detail::hostdevice_vector{max_num_streams, _stream}; }(); - auto& col_meta = *_col_meta; + auto const& stripe_data_offsets = _file_itm_data.stripe_data_offsets; + auto const& stripe_data = _file_itm_data.stripe_data; + auto& col_meta = *_col_meta; + + // To store the output decompressed buffers, which need to be kept alive until we decode them. + std::vector decompressed_buffers; + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto const& stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges[level]; auto const stream_range = get_range(stripe_stream_ranges, stripe_range); @@ -793,8 +804,7 @@ void reader_impl::decompress_and_decode(read_mode mode) auto const& column_types = _file_itm_data.lvl_column_types[level]; auto const& nested_cols = _file_itm_data.lvl_nested_cols[level]; - auto& stripe_data = _file_itm_data.lvl_stripe_data[level]; - auto& chunks = lvl_chunks[level]; + auto& chunks = lvl_chunks[level]; auto const num_level_columns = columns_level.size(); chunks = @@ -852,8 +862,10 @@ void reader_impl::decompress_and_decode(read_mode mode) CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); - auto const dst_base = - static_cast(stripe_data[stripe_idx - load_stripe_start].data()); + // `offset_idx` is the flattened index of the stripe offset in `stripe_data_offsets`. + auto const offset_idx = level * load_stripe_count + stripe_idx - load_stripe_start; + auto const stripe_offset = stripe_data_offsets[offset_idx]; + auto const dst_base = static_cast(stripe_data.data()) + stripe_offset; auto const num_rows_in_stripe = static_cast(stripe_info->numberOfRows); uint32_t const rowgroup_id = num_rowgroups; @@ -917,7 +929,9 @@ void reader_impl::decompress_and_decode(read_mode mode) num_rowgroups += stripe_num_rowgroups; } - if (stripe_data.empty()) { continue; } + auto const level_data_size = + stripe_data_offsets[(level + 1) * stripe_count] - stripe_data_offsets[level * stripe_count]; + if (level_data_size == 0) { continue; } // Process dataset chunks into output columns. 
auto row_groups = @@ -943,26 +957,26 @@ void reader_impl::decompress_and_decode(read_mode mode) if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { auto compinfo = cudf::detail::hostdevice_span( hd_compinfo.begin(), hd_compinfo.d_begin(), num_streams); - auto decomp_data = decompress_stripe_data(load_stripe_range, - stream_range, - stripe_count, - compinfo, - _file_itm_data.compinfo_map, - *_metadata.per_file_metadata[0].decompressor, - stripe_data, - stream_info, - chunks, - row_groups, - _metadata.get_row_index_stride(), - level == 0, - _stream); + auto decomp_data = decompress_stripe_data( + load_stripe_range, + stream_range, + stripe_count, + compinfo, + _file_itm_data.compinfo_map, + *_metadata.per_file_metadata[0].decompressor, + device_span{static_cast(stripe_data.data()), + stripe_data.size()}, + stripe_data_offsets, + level * load_stripe_count, + stream_info, + chunks, + row_groups, + _metadata.get_row_index_stride(), + level == 0, + _stream); // Just save the decompressed data and clear out the raw data to free up memory. - stripe_data[stripe_start - load_stripe_start] = std::move(decomp_data); - for (std::size_t i = 1; i < stripe_count; ++i) { - stripe_data[i + stripe_start - load_stripe_start] = {}; - } - + decompressed_buffers.emplace_back(std::move(decomp_data)); } else { if (row_groups.size().first) { chunks.host_to_device_async(_stream); @@ -1049,15 +1063,6 @@ void reader_impl::decompress_and_decode(read_mode mode) // Free up temp memory used for decoding. for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { _out_buffers[level].resize(0); - - auto& stripe_data = _file_itm_data.lvl_stripe_data[level]; - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - stripe_data[stripe_start - load_stripe_start] = {}; - } else { - for (std::size_t i = 0; i < stripe_count; ++i) { - stripe_data[i + stripe_start - load_stripe_start] = {}; - } - } } // Output table range is reset to start from the first position. From 0a16bb433fa8b40e736dfa8709e20c9517913d0a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 30 Mar 2024 23:21:51 -0700 Subject: [PATCH 277/321] Revert "Use only one `device_buffer` for storing all stripe data" This reverts commit 33c92a97edca3472700e0f20a0be5a7c73d41251. 
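For context, the reverted scheme replaced the per-stripe `rmm::device_buffer`s with one allocation and located each (level, stripe) slice through a prefix sum of padded stripe sizes, flattened level-major. A minimal host-side sketch of that bookkeeping follows; `round_up`, `compute_offsets`, and `offset_index` are illustrative stand-ins, not cudf APIs.

#include <cstddef>
#include <vector>

// Round `size` up to the next multiple of `align`; stands in for
// cudf::util::round_up_safe in this sketch.
std::size_t round_up(std::size_t size, std::size_t align)
{
  return (size + align - 1) / align * align;
}

// sizes[level][stripe] is the raw byte size of one stripe's data at one
// nested level. The result has one extra element; back() is the total
// buffer size to allocate.
std::vector<std::size_t> compute_offsets(std::vector<std::vector<std::size_t>> const& sizes,
                                         std::size_t padding)
{
  std::vector<std::size_t> offsets{0};
  for (auto const& level_sizes : sizes) {
    for (auto const size : level_sizes) {
      offsets.push_back(offsets.back() + round_up(size, padding));
    }
  }
  return offsets;
}

// Level-major flattening: one offset slot per (level, stripe) pair, which is
// the `level * stripe_count + stripe_idx - stripe_start` indexing seen in the
// hunks below.
std::size_t offset_index(std::size_t level, std::size_t stripe_count, std::size_t stripe_idx)
{
  return level * stripe_count + stripe_idx;
}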
--- cpp/src/io/orc/reader_impl_chunking.cu | 45 +++++-------- cpp/src/io/orc/reader_impl_chunking.hpp | 12 ++-- cpp/src/io/orc/reader_impl_decode.cu | 85 ++++++++++++------------- 3 files changed, 62 insertions(+), 80 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index acd62b874af..406e9558dae 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -274,12 +274,14 @@ void reader_impl::global_preprocess(read_mode mode) auto& stripe_data_read_ranges = _file_itm_data.stripe_data_read_ranges; stripe_data_read_ranges.resize(num_total_stripes); + auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; auto& lvl_stripe_sizes = _file_itm_data.lvl_stripe_sizes; auto& lvl_stream_info = _file_itm_data.lvl_stream_info; auto& lvl_stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges; auto& lvl_column_types = _file_itm_data.lvl_column_types; auto& lvl_nested_cols = _file_itm_data.lvl_nested_cols; + lvl_stripe_data.resize(num_levels); lvl_stripe_sizes.resize(num_levels); lvl_stream_info.resize(num_levels); lvl_stripe_stream_ranges.resize(num_levels); @@ -460,28 +462,22 @@ void reader_impl::load_data(read_mode mode) auto const stripe_start = load_stripe_range.begin; auto const stripe_end = load_stripe_range.end; auto const stripe_count = stripe_end - stripe_start; - auto const num_levels = _selected_columns.num_levels(); - auto& stripe_data_offsets = _file_itm_data.stripe_data_offsets; - stripe_data_offsets.resize(0); - stripe_data_offsets.reserve(num_levels * stripe_count); - stripe_data_offsets.push_back(0); - std::size_t offset{0}; + auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data; + auto const num_levels = _selected_columns.num_levels(); - // Compute the offsets for the memory segments storing data of each stripe. + // Prepare the buffer to read raw data onto. for (std::size_t level = 0; level < num_levels; ++level) { + auto& stripe_data = lvl_stripe_data[level]; + stripe_data.resize(stripe_count); + for (std::size_t idx = 0; idx < stripe_count; ++idx) { - auto const stripe_size = _file_itm_data.lvl_stripe_sizes[level][idx + stripe_start]; - auto const stripe_data_size = cudf::util::round_up_safe(stripe_size, BUFFER_PADDING_MULTIPLE); - offset += stripe_data_size; - stripe_data_offsets.push_back(offset); + auto const stripe_size = _file_itm_data.lvl_stripe_sizes[level][idx + stripe_start]; + stripe_data[idx] = rmm::device_buffer( + cudf::util::round_up_safe(stripe_size, BUFFER_PADDING_MULTIPLE), _stream); } } - // Now we have the total data size of all stripes. Just create one buffer to load all data into. - auto& stripe_data = _file_itm_data.stripe_data; - stripe_data = rmm::device_buffer(stripe_data_offsets.back(), _stream); - // // Load stripe data into memory: // @@ -501,11 +497,8 @@ void reader_impl::load_data(read_mode mode) for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) { auto const& read_info = _file_itm_data.data_read_info[read_idx]; auto const source_ptr = _metadata.per_file_metadata[read_info.source_idx].source; - - // `offset_idx` is the flattened index of the stripe offset in `stripe_data_offsets`. 
- auto const offset_idx = read_info.level * stripe_count + read_info.stripe_idx - stripe_start; - auto const stripe_offset = stripe_data_offsets[offset_idx]; - auto const dst_base = static_cast(stripe_data.data()) + stripe_offset; + auto const dst_base = static_cast( + lvl_stripe_data[read_info.level][read_info.stripe_idx - stripe_start].data()); if (source_ptr->is_device_read_preferred(read_info.length)) { read_tasks.push_back( @@ -640,9 +633,8 @@ void reader_impl::load_data(read_mode mode) auto const& stream_info = _file_itm_data.lvl_stream_info[level]; auto const num_columns = _selected_columns.levels[level].size(); - auto const level_data_size = - stripe_data_offsets[(level + 1) * stripe_count] - stripe_data_offsets[level * stripe_count]; - if (level_data_size == 0) { continue; } + auto& stripe_data = lvl_stripe_data[level]; + if (stripe_data.empty()) { continue; } // Range of all streams in the loaded stripes. auto const stream_range = @@ -656,11 +648,8 @@ void reader_impl::load_data(read_mode mode) hd_compinfo.begin(), hd_compinfo.d_begin(), num_streams); for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { auto const& info = stream_info[stream_idx]; - - // `offset_idx` is the flattened index of the stripe offset in `stripe_data_offsets`. - auto const offset_idx = level * stripe_count + info.source.stripe_idx - stripe_start; - auto const stripe_offset = stripe_data_offsets[offset_idx]; - auto const dst_base = static_cast(stripe_data.data()) + stripe_offset; + auto const dst_base = + static_cast(stripe_data[info.source.stripe_idx - stripe_start].data()); compinfo[stream_idx - stream_range.begin] = gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length); } diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index cdc6faaf172..1f368e1211b 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -143,13 +143,6 @@ struct file_intermediate_data { // Those reads are identified by a chunk of consecutive read info, stored in data_read_info. std::vector stripe_data_read_ranges; - // The buffers to store raw data read from disk, initialized each time calling to `load_data()`. - // After decoding, such buffers can be released. - rmm::device_buffer stripe_data; - - // Offsets into the buffer `stripe_data` for each loaded stripe. - std::vector stripe_data_offsets; - // Store the compression information for each data stream. stream_source_map compinfo_map; @@ -160,6 +153,11 @@ struct file_intermediate_data { // This is used to identify the range of streams for each stripe from that vector. std::vector> lvl_stripe_stream_ranges; + // The buffers to store raw data read from disk, initialized for each reading stripe chunks. + // After decoding, such buffers can be released. + // This can only be implemented after chunked output is ready. + std::vector> lvl_stripe_data; + // Store the size of each stripe at each nested level. // This is used to initialize the stripe_data buffers. 
std::vector> lvl_stripe_sizes; diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 948925b50c9..2221ef66fa1 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -62,7 +62,7 @@ namespace { * @param num_decode_stripes Number of stripes that the decoding streams belong to * @param compinfo_map A map to lookup compression info of streams * @param decompressor Block decompressor - * @param stripe_data Stripe column data + * @param stripe_data List of source stripe column data * @param stream_info List of stream to column mappings * @param chunks Vector of list of column chunk descriptors * @param row_groups Vector of list of row index descriptors @@ -70,8 +70,6 @@ namespace { * @param use_base_stride Whether to use base stride obtained from meta or use the computed value * @param stream CUDA stream used for device memory operations and kernel launches * @return Device buffer to decompressed data - * - * // TODO: add missing params */ rmm::device_buffer decompress_stripe_data( range const& loaded_stripe_range, @@ -80,9 +78,7 @@ rmm::device_buffer decompress_stripe_data( cudf::detail::hostdevice_span compinfo, stream_source_map const& compinfo_map, OrcDecompressor const& decompressor, - device_span stripe_data, - host_span stripe_data_offsets, - std::size_t offset_idx_start, + host_span stripe_data, host_span stream_info, cudf::detail::hostdevice_2dvector& chunks, cudf::detail::hostdevice_2dvector& row_groups, @@ -102,11 +98,11 @@ rmm::device_buffer decompress_stripe_data( auto const& info = stream_info[stream_idx]; auto& stream_comp_info = compinfo[stream_idx - stream_range.begin]; - - auto const offset_idx = offset_idx_start + info.source.stripe_idx - loaded_stripe_range.begin; - auto const stripe_offset = stripe_data_offsets[offset_idx]; - auto const dst_base = &stripe_data.data()[stripe_offset]; - stream_comp_info = gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length); + stream_comp_info = gpu::CompressedStreamInfo( + static_cast( + stripe_data[info.source.stripe_idx - loaded_stripe_range.begin].data()) + + info.dst_pos, + info.length); if (compinfo_ready) { auto const& cached_comp_info = compinfo_map.at(info.source); @@ -729,7 +725,6 @@ void reader_impl::decompress_and_decode(read_mode mode) // The start index of loaded stripes. They are different from decoding stripes. auto const load_stripe_range = _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1]; - auto const load_stripe_count = load_stripe_range.end - load_stripe_range.begin; auto const load_stripe_start = load_stripe_range.begin; auto const rows_to_skip = _file_itm_data.rows_to_skip; @@ -787,13 +782,7 @@ void reader_impl::decompress_and_decode(read_mode mode) return cudf::detail::hostdevice_vector{max_num_streams, _stream}; }(); - auto const& stripe_data_offsets = _file_itm_data.stripe_data_offsets; - auto const& stripe_data = _file_itm_data.stripe_data; - auto& col_meta = *_col_meta; - - // To store the output decompressed buffers, which need to be kept alive until we decode them. 
- std::vector decompressed_buffers; - + auto& col_meta = *_col_meta; for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto const& stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges[level]; auto const stream_range = get_range(stripe_stream_ranges, stripe_range); @@ -804,7 +793,8 @@ void reader_impl::decompress_and_decode(read_mode mode) auto const& column_types = _file_itm_data.lvl_column_types[level]; auto const& nested_cols = _file_itm_data.lvl_nested_cols[level]; - auto& chunks = lvl_chunks[level]; + auto& stripe_data = _file_itm_data.lvl_stripe_data[level]; + auto& chunks = lvl_chunks[level]; auto const num_level_columns = columns_level.size(); chunks = @@ -862,10 +852,8 @@ void reader_impl::decompress_and_decode(read_mode mode) CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); - // `offset_idx` is the flattened index of the stripe offset in `stripe_data_offsets`. - auto const offset_idx = level * load_stripe_count + stripe_idx - load_stripe_start; - auto const stripe_offset = stripe_data_offsets[offset_idx]; - auto const dst_base = static_cast(stripe_data.data()) + stripe_offset; + auto const dst_base = + static_cast(stripe_data[stripe_idx - load_stripe_start].data()); auto const num_rows_in_stripe = static_cast(stripe_info->numberOfRows); uint32_t const rowgroup_id = num_rowgroups; @@ -929,9 +917,7 @@ void reader_impl::decompress_and_decode(read_mode mode) num_rowgroups += stripe_num_rowgroups; } - auto const level_data_size = - stripe_data_offsets[(level + 1) * stripe_count] - stripe_data_offsets[level * stripe_count]; - if (level_data_size == 0) { continue; } + if (stripe_data.empty()) { continue; } // Process dataset chunks into output columns. auto row_groups = @@ -957,26 +943,26 @@ void reader_impl::decompress_and_decode(read_mode mode) if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { auto compinfo = cudf::detail::hostdevice_span( hd_compinfo.begin(), hd_compinfo.d_begin(), num_streams); - auto decomp_data = decompress_stripe_data( - load_stripe_range, - stream_range, - stripe_count, - compinfo, - _file_itm_data.compinfo_map, - *_metadata.per_file_metadata[0].decompressor, - device_span{static_cast(stripe_data.data()), - stripe_data.size()}, - stripe_data_offsets, - level * load_stripe_count, - stream_info, - chunks, - row_groups, - _metadata.get_row_index_stride(), - level == 0, - _stream); + auto decomp_data = decompress_stripe_data(load_stripe_range, + stream_range, + stripe_count, + compinfo, + _file_itm_data.compinfo_map, + *_metadata.per_file_metadata[0].decompressor, + stripe_data, + stream_info, + chunks, + row_groups, + _metadata.get_row_index_stride(), + level == 0, + _stream); // Just save the decompressed data and clear out the raw data to free up memory. - decompressed_buffers.emplace_back(std::move(decomp_data)); + stripe_data[stripe_start - load_stripe_start] = std::move(decomp_data); + for (std::size_t i = 1; i < stripe_count; ++i) { + stripe_data[i + stripe_start - load_stripe_start] = {}; + } + } else { if (row_groups.size().first) { chunks.host_to_device_async(_stream); @@ -1063,6 +1049,15 @@ void reader_impl::decompress_and_decode(read_mode mode) // Free up temp memory used for decoding. 
for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { _out_buffers[level].resize(0); + + auto& stripe_data = _file_itm_data.lvl_stripe_data[level]; + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + stripe_data[stripe_start - load_stripe_start] = {}; + } else { + for (std::size_t i = 0; i < stripe_count; ++i) { + stripe_data[i + stripe_start - load_stripe_start] = {}; + } + } } // Output table range is reset to start from the first position. From 3f8a2202c047434ef6be42f32a5a4ded92c31fa6 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 1 Apr 2024 21:29:33 -0700 Subject: [PATCH 278/321] Revert changes to `reader` class Signed-off-by: Nghia Truong --- cpp/include/cudf/io/detail/orc.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index 32b28692140..bf042b35fe0 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -44,7 +44,7 @@ class reader_impl; * @brief Class to read ORC dataset data into columns. */ class reader { - protected: + private: std::unique_ptr _impl; public: @@ -64,7 +64,7 @@ class reader { /** * @brief Destructor explicitly declared to avoid inlining in header */ - virtual ~reader(); + ~reader(); /** * @brief Reads the entire dataset. @@ -78,6 +78,7 @@ class reader { * @brief The reader class that supports iterative reading from an array of data sources. */ class chunked_reader { + private: std::unique_ptr _impl; public: From cbb3858b85bc8aa4eba17a868170680611c6c8ca Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 1 Apr 2024 22:01:42 -0700 Subject: [PATCH 279/321] Use byte count instead of bit count Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_decode.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 2221ef66fa1..cd22a9e3703 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -692,7 +692,7 @@ std::vector find_table_splits(table_view const& input, // the last segment may be shorter than the others. auto const current_length = cuda::std::min(segment_length, num_rows - segment_length * segment_idx); - auto const size = d_sizes[segment_idx]; + auto const size = d_sizes[segment_idx] / CHAR_BIT; // divide by CHAR_BIT to get size in bytes return cumulative_size{static_cast(current_length), static_cast(size)}; }); @@ -704,8 +704,7 @@ std::vector find_table_splits(table_view const& input, cumulative_size_sum{}); segmented_sizes.device_to_host_sync(stream); - // Since the segment sizes are in bits, we need to multiply CHAR_BIT with the output limit. 
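// Arithmetic note on this change: the old code scaled the limit up to bits
// (`size_limit * CHAR_BIT`, deleted below), while the new code scales each
// segment size down to bytes at the point it is stored (the `/ CHAR_BIT`
// division above). With integer math these differ only by rounding: each
// per-segment division floors away up to CHAR_BIT - 1 bits, e.g.
//
//   4097 /* bits */ / CHAR_BIT == 512  /* bytes, not 513 */
//
// so the byte-based cumulative sums can undercount by a fraction of a byte
// per segment, which is negligible at row-group granularity.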
-  return find_splits<cumulative_size>(segmented_sizes, input.num_rows(), size_limit * CHAR_BIT);
+  return find_splits<cumulative_size>(segmented_sizes, input.num_rows(), size_limit);
 }

}  // namespace

From 1c62ba790fa003717879031b423b44de046947cd Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 1 Apr 2024 22:01:54 -0700
Subject: [PATCH 280/321] Change bench limits

---
 cpp/benchmarks/io/orc/orc_reader_input.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp
index e710219852e..9ddcaeb36e4 100644
--- a/cpp/benchmarks/io/orc/orc_reader_input.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp
@@ -188,6 +188,8 @@ NVBENCH_BENCH_TYPES(BM_orc_read_io_compression, NVBENCH_TYPE_AXES(io_list, compr
   .add_int64_axis("cardinality", {0, 1000})
   .add_int64_axis("run_length", {1, 32});

+std::size_t constexpr MB_bytes{1024 * 1024};
+
 // Should have the same parameters as `BM_orc_read_io_compression` for comparison.
 NVBENCH_BENCH_TYPES(BM_orc_chunked_read_io_compression,
                     NVBENCH_TYPE_AXES(io_list, compression_list))
@@ -196,5 +198,6 @@ NVBENCH_BENCH_TYPES(BM_orc_chunked_read_io_compression,
   .set_min_samples(4)
   .add_int64_axis("cardinality", {0, 1000})
   .add_int64_axis("run_length", {1, 32})
-  .add_int64_axis("output_limit", {0, 500'000})
-  .add_int64_axis("read_limit", {0, 500'000});
+  // The input has approximately 520MB and 127K rows.
- .add_int64_axis("output_limit", {100 * MB_bytes, 500 * MB_bytes}) - .add_int64_axis("read_limit", {100 * MB_bytes, 500 * MB_bytes}); + .add_int64_axis("output_limit", {100 * Mbytes, 500 * Mbytes}) + .add_int64_axis("read_limit", {100 * Mbytes, 500 * Mbytes}); From c426e4c1a98529383be9ee812fca46104d44f4b3 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 3 Apr 2024 10:16:49 -0700 Subject: [PATCH 283/321] Fix/add comment and cleanup Signed-off-by: Nghia Truong --- cpp/include/cudf/io/orc.hpp | 2 +- cpp/tests/io/orc_chunked_reader_test.cu | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index a28011feb8f..f1d20cc2094 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -477,7 +477,7 @@ class chunked_orc_reader { * @brief Construct the reader from input/output size limits along with other ORC reader options. * * This constructor implicitly call the other constructor with `output_row_granularity` set to - * 10'000 rows. + * `DEFAULT_OUTPUT_ROW_GRANULARITY` rows. * * @param output_size_limit Limit on total number of bytes to be returned per `read_chunk()` call, * or `0` if there is no limit diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 39450bb2a9f..59727c6d5fc 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -44,9 +44,6 @@ #include -#include -#include - namespace { enum class output_limit : std::size_t {}; enum class input_limit : std::size_t {}; @@ -1413,6 +1410,7 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view()); } + // The test below requires a huge amount of memory, thus it is disabled by default. #ifdef LOCAL_TEST // Read with only output limit -- there is no limit on the memory usage. // However, the reader should be able to detect and load only enough stripes each time From faea7bc6922393f24b4bc67ed0e62a06529f7557 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 3 Apr 2024 10:21:08 -0700 Subject: [PATCH 284/321] Use pointers instead of optionals Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 27 ++++++++++++------------- cpp/src/io/orc/reader_impl_chunking.hpp | 4 ++-- cpp/src/io/orc/reader_impl_decode.cu | 2 +- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 406e9558dae..c1209569285 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -47,10 +47,10 @@ std::size_t gather_stream_info_and_column_desc( bool apply_struct_map, int64_t* num_dictionary_entries, std::size_t* local_stream_order, - std::optional*> const& stream_info, - std::optional*> const& chunks) + std::vector* stream_info, + cudf::detail::hostdevice_2dvector* chunks) { - CUDF_EXPECTS(stream_info.has_value() ^ chunks.has_value(), + CUDF_EXPECTS((stream_info == nullptr) ^ (chunks == nullptr), "Either stream_info or chunks must be provided, but not both."); std::size_t src_offset = 0; @@ -92,8 +92,8 @@ std::size_t gather_stream_info_and_column_desc( auto const child_idx = (idx < orc2gdf.size()) ? 
orc2gdf[idx] : -1; if (child_idx >= 0) { col = child_idx; - if (chunks.has_value()) { - auto& chunk = (*chunks.value())[stripe_order][col]; + if (chunks) { + auto& chunk = (*chunks)[stripe_order][col]; chunk.strm_id[gpu::CI_PRESENT] = *local_stream_order; chunk.strm_len[gpu::CI_PRESENT] = stream.length; } @@ -101,11 +101,11 @@ std::size_t gather_stream_info_and_column_desc( } } } else if (col != -1) { - if (chunks.has_value()) { + if (chunks) { if (src_offset >= stripeinfo->indexLength || use_index) { auto const index_type = get_stream_index_type(stream.kind); if (index_type < gpu::CI_NUM_STREAMS) { - auto& chunk = (*chunks.value())[stripe_order][col]; + auto& chunk = (*chunks)[stripe_order][col]; chunk.strm_id[index_type] = *local_stream_order; chunk.strm_len[index_type] = stream.length; // NOTE: skip_count field is temporarily used to track the presence of index streams @@ -121,12 +121,11 @@ std::size_t gather_stream_info_and_column_desc( } (*local_stream_order)++; - } else { // not chunks.has_value() - stream_info.value()->emplace_back( - stripeinfo->offset + src_offset, - dst_offset, - stream.length, - stream_source_info{stripe_order, level, column_id, stream.kind}); + } else { // chunks == nullptr + stream_info->emplace_back(stripeinfo->offset + src_offset, + dst_offset, + stream.length, + stream_source_info{stripe_order, level, column_id, stream.kind}); } dst_offset += stream.length; @@ -381,7 +380,7 @@ void reader_impl::global_preprocess(read_mode mode) nullptr, // num_dictionary_entries nullptr, // local_stream_order &stream_info, - std::nullopt // chunks + nullptr // chunks ); auto const is_stripe_data_empty = stripe_level_size == 0; diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 1f368e1211b..4fcb75b89b6 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -325,7 +325,7 @@ std::size_t gather_stream_info_and_column_desc( bool apply_struct_map, int64_t* num_dictionary_entries, std::size_t* local_stream_order, - std::optional*> const& stream_info, - std::optional*> const& chunks); + std::vector* stream_info, + cudf::detail::hostdevice_2dvector* chunks); } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index cd22a9e3703..602a73ff78d 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -844,7 +844,7 @@ void reader_impl::decompress_and_decode(read_mode mode) level == 0, &num_dict_entries, &local_stream_order, - std::nullopt, // stream_info + nullptr, // stream_info &chunks); auto const is_stripe_data_empty = total_data_size == 0; From 7bfcdf536a7dddc6b54632de82e8d9f0ff473045 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 3 Apr 2024 14:01:34 -0700 Subject: [PATCH 285/321] Require `output_row_granularity` to be positive all the time Signed-off-by: Nghia Truong --- cpp/include/cudf/io/orc.hpp | 4 +++- cpp/src/io/orc/reader.cu | 5 ----- cpp/src/io/orc/reader_impl.cu | 5 +---- cpp/src/io/orc/reader_impl_chunking.hpp | 2 ++ 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index f1d20cc2094..ac7d086c950 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -464,6 +464,8 @@ class chunked_orc_reader { * @param options Settings for controlling reading behaviors * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource 
to use for device memory allocation + * + * @throw cudf::logic_error if `output_row_granularity` is non-positive */ explicit chunked_orc_reader( std::size_t output_size_limit, @@ -498,7 +500,7 @@ class chunked_orc_reader { * @brief Construct the reader from output size limits along with other ORC reader options. * * This constructor implicitly call the other constructor with `data_read_limit` set to `0` and - * `output_row_granularity` set to 10'000 rows. + * `output_row_granularity` set to `DEFAULT_OUTPUT_ROW_GRANULARITY` rows. * * @param output_size_limit Limit on total number of bytes to be returned per `read_chunk()` call, * or `0` if there is no limit diff --git a/cpp/src/io/orc/reader.cu b/cpp/src/io/orc/reader.cu index 37eb7ab0fd7..006e70467b5 100644 --- a/cpp/src/io/orc/reader.cu +++ b/cpp/src/io/orc/reader.cu @@ -58,11 +58,6 @@ chunked_reader::chunked_reader(std::size_t output_size_limit, stream, mr)} { - // Although we internally accept non-positive value for `output_row_granularity` because we - // implicitly change such value into `DEFAULT_OUTPUT_ROW_GRANULARITY`. - // The user are not allowed to do so but instead required to specify an explicit positive number. - CUDF_EXPECTS(output_row_granularity > 0, - "The value of `output_row_granularity` must be positive."); } chunked_reader::~chunked_reader() = default; diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 566c8a059d8..f63fd1fbeef 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -177,10 +177,7 @@ reader_impl::reader_impl(std::size_t output_size_limit, _sources(std::move(sources)), _metadata{_sources, stream}, _selected_columns{_metadata.select_columns(options.get_columns())}, - _chunk_read_data{ - output_size_limit, - data_read_limit, - output_row_granularity > 0 ? output_row_granularity : DEFAULT_OUTPUT_ROW_GRANULARITY} + _chunk_read_data{output_size_limit, data_read_limit, output_row_granularity} { // Selected columns at different levels of nesting are stored in different elements // of `selected_columns`; thus, size == 1 means no nested columns. 
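The net effect of this patch: the positivity check moves out of the public constructor (deleted above) and into `chunk_read_data` itself (the hunk that follows), so every construction path is validated in one place. A stand-alone sketch of the pattern, using plain C++ stand-ins rather than the real cudf types (`chunk_read_data_sketch` is hypothetical):

#include <stdexcept>

// Enforce the invariant in the lowest-level type's constructor so that all
// higher-level entry points inherit the check automatically.
struct chunk_read_data_sketch {
  explicit chunk_read_data_sketch(int output_row_granularity_)
    : output_row_granularity{output_row_granularity_}
  {
    if (output_row_granularity <= 0) {
      throw std::logic_error("The value of `output_row_granularity` must be positive.");
    }
  }
  int output_row_granularity;
};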
diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 4fcb75b89b6..7faedbcd399 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -185,6 +185,8 @@ struct chunk_read_data { data_read_limit{data_read_limit_}, output_row_granularity{output_row_granularity_} { + CUDF_EXPECTS(output_row_granularity > 0, + "The value of `output_row_granularity` must be positive."); } std::size_t const From a915f33a06e8e6ba146e71f2c82242f570addd9f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 3 Apr 2024 15:52:24 -0700 Subject: [PATCH 286/321] Reorganize code, removing constructors Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 16 +++-- cpp/src/io/orc/reader_impl_chunking.hpp | 84 +++++++++---------------- 2 files changed, 42 insertions(+), 58 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index c1209569285..7a1fb3dfb48 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -122,10 +122,11 @@ std::size_t gather_stream_info_and_column_desc( (*local_stream_order)++; } else { // chunks == nullptr - stream_info->emplace_back(stripeinfo->offset + src_offset, - dst_offset, - stream.length, - stream_source_info{stripe_order, level, column_id, stream.kind}); + stream_info->emplace_back( + orc_stream_info{stripeinfo->offset + src_offset, + dst_offset, + stream.length, + stream_source_info{stripe_order, level, column_id, stream.kind}}); } dst_offset += stream.length; @@ -406,7 +407,12 @@ void reader_impl::global_preprocess(read_mode mode) len += stream_info[stream_level_count].length; stream_level_count++; } - read_info.emplace_back(offset, d_dst, len, stripe.source_idx, stripe_global_idx, level); + read_info.emplace_back(stream_data_read_info{offset, + d_dst, + len, + static_cast(stripe.source_idx), + stripe_global_idx, + level}); } } // end loop level diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 7faedbcd399..8e78514c72d 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -28,6 +28,34 @@ namespace cudf::io::orc::detail { +/** + * @brief Struct representing a range of data. + */ +struct range { + std::size_t begin{0}; + std::size_t end{0}; +}; + +// Store information to identify where to read a chunk of data from source. +// Each read corresponds to one or more consecutive streams combined. +struct stream_data_read_info { + uint64_t offset; // offset in data source + std::size_t dst_pos; // offset to store data in memory relative to start of raw stripe data + std::size_t length; // data length to read + std::size_t source_idx; // the data source id + std::size_t stripe_idx; // global stripe index + std::size_t level; // nested level +}; + +/** + * @brief Compression information for a stripe at a specific nested level. + */ +struct stripe_level_comp_info { + std::size_t num_compressed_blocks{0}; + std::size_t num_uncompressed_blocks{0}; + std::size_t total_decomp_size{0}; +}; + /** * @brief Struct that store source information of an ORC streams. */ @@ -66,13 +94,6 @@ using stream_source_map = * @brief Struct that store information of an ORC stream. 
*/ struct orc_stream_info { - explicit orc_stream_info(uint64_t offset_, - std::size_t dst_pos_, - uint32_t length_, - stream_source_info const& source_) - : offset(offset_), dst_pos(dst_pos_), length(length_), source(source_) - { - } // Data info: uint64_t offset; // offset in data source std::size_t dst_pos; // offset to store data in memory relative to start of raw stripe data @@ -82,23 +103,6 @@ struct orc_stream_info { stream_source_info source; }; -/** - * @brief Compression information for a stripe at a specific nested level. - */ -struct stripe_level_comp_info { - std::size_t num_compressed_blocks{0}; - std::size_t num_uncompressed_blocks{0}; - std::size_t total_decomp_size{0}; -}; - -/** - * @brief Struct representing a range of data. - */ -struct range { - std::size_t begin; - std::size_t end; -}; - /** * @brief Struct storing intermediate processing data loaded from data sources. */ @@ -110,39 +114,13 @@ struct file_intermediate_data { // Return true if no rows or stripes to read. bool has_no_data() const { return rows_to_read == 0 || selected_stripes.empty(); } - // Store information to identify where to read a chunk of data from source. - // Each read corresponds to one or more consecutive streams combined. - struct stream_data_read_info { - stream_data_read_info(uint64_t offset_, - std::size_t dst_pos_, - std::size_t length_, - std::size_t source_idx_, - std::size_t stripe_idx_, - std::size_t level_) - : offset(offset_), - dst_pos(dst_pos_), - length(length_), - source_idx(source_idx_), - stripe_idx(stripe_idx_), - level(level_) - { - } - - uint64_t offset; // offset in data source - std::size_t dst_pos; // offset to store data in memory relative to start of raw stripe data - std::size_t length; // data length to read - std::size_t source_idx; // the data source id - std::size_t stripe_idx; // global stripe index - std::size_t level; // nested level - }; - - // Identify what data to read from source. - std::vector data_read_info; - // For each stripe, we perform a number of read for its streams. // Those reads are identified by a chunk of consecutive read info, stored in data_read_info. std::vector stripe_data_read_ranges; + // Identify what data to read from source. + std::vector data_read_info; + // Store the compression information for each data stream. 
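// Side note on the "removing constructors" part of this patch: with no
// user-declared constructors these structs become aggregates, so call sites
// can brace-initialize a temporary and move it in, which is exactly what the
// emplace_back hunks above switch to. Compact illustration with a
// hypothetical stand-in type:
//
//   struct read_info_sketch { std::uint64_t offset; std::size_t length; };
//   std::vector<read_info_sketch> reads;
//   reads.emplace_back(read_info_sketch{/*offset=*/128, /*length=*/4096});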
stream_source_map<stripe_level_comp_info> compinfo_map;

From 69d70c587e4b6b904b619ddeb33b548addeebfd3 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Wed, 3 Apr 2024 15:53:59 -0700
Subject: [PATCH 287/321] Rename functor

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_chunking.cu  | 4 ++--
 cpp/src/io/orc/reader_impl_chunking.hpp | 2 +-
 cpp/src/io/orc/reader_impl_decode.cu    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 7a1fb3dfb48..c5257f3dc13 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -444,7 +444,7 @@ void reader_impl::global_preprocess(read_mode mode)
                          total_stripe_sizes.d_begin(),
                          total_stripe_sizes.d_end(),
                          total_stripe_sizes.d_begin(),
-                         cumulative_size_sum{});
+                         cumulative_size_plus{});
   total_stripe_sizes.device_to_host_sync(_stream);

   auto const load_limit = [&] {
@@ -695,7 +695,7 @@ void reader_impl::load_data(read_mode mode)
                          stripe_decomp_sizes.d_begin(),
                          stripe_decomp_sizes.d_end(),
                          stripe_decomp_sizes.d_begin(),
-                         cumulative_size_sum{});
+                         cumulative_size_plus{});
   stripe_decomp_sizes.device_to_host_sync(_stream);

   auto const decode_limit = [&] {
diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index 8e78514c72d..58b67760b66 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -231,7 +231,7 @@ struct cumulative_size_and_row {
 /**
  * @brief Functor to sum up cumulative data.
  */
-struct cumulative_size_sum {
+struct cumulative_size_plus {
   __device__ cumulative_size operator()(cumulative_size const& a, cumulative_size const& b) const
   {
     return cumulative_size{a.count + b.count, a.size_bytes + b.size_bytes};
   }
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 602a73ff78d..455d96691e8 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -701,7 +701,7 @@ std::vector<range> find_table_splits(table_view const& input,
                          segmented_sizes.d_begin(),
                          segmented_sizes.d_end(),
                          segmented_sizes.d_begin(),
-                         cumulative_size_sum{});
+                         cumulative_size_plus{});
   segmented_sizes.device_to_host_sync(stream);

   return find_splits<cumulative_size>(segmented_sizes, input.num_rows(), size_limit);
 }

}  // namespace

From 4e94d531f0f52bf44e62c7f2ce849fe06eeb6cc7 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Wed, 3 Apr 2024 15:57:16 -0700
Subject: [PATCH 288/321] Using `host_span` instead of `const&`

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_chunking.cu  | 2 +-
 cpp/src/io/orc/reader_impl_chunking.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index c5257f3dc13..6c8f2c14adf 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -220,7 +220,7 @@ template std::vector<range> find_splits<cumulative_size>(host_span<cumulative_size const> sizes,
 template std::vector<range> find_splits<cumulative_size_and_row>(
   host_span<cumulative_size_and_row const> sizes, std::size_t total_count, std::size_t size_limit);

-range get_range(std::vector<range> const& input_ranges, range const& selected_ranges)
+range get_range(host_span<range const> input_ranges, range const& selected_ranges)
 {
   // The first and last range.
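// Put concretely: a "range of ranges" selects one contiguous span of the
// underlying flat array, from the begin of the first selected range to the
// end of the last. Host-side sketch (range_t and get_range_sketch are
// hypothetical stand-ins):
//
//   struct range_t { std::size_t begin, end; };
//   range_t get_range_sketch(std::vector<range_t> const& in, range_t sel)
//   {
//     return {in[sel.begin].begin, in[sel.end - 1].end};
//   }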
auto const& first_range = input_ranges[selected_ranges.begin]; diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 3b61bc067a9..58b67760b66 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -268,7 +268,7 @@ std::vector find_splits(host_span cumulative_sizes, * @param selected_ranges A range of ranges from `input_ranges` * @return The range of data span by the selected range of ranges */ -range get_range(std::vector const& input_ranges, range const& selected_ranges); +range get_range(host_span input_ranges, range const& selected_ranges); /** * @brief Function that populates descriptors for either individual streams or chunks of column From 8c056541ae5e0df53234e132d2a5f396278a1a0e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 23 Apr 2024 09:20:09 -0700 Subject: [PATCH 289/321] Use `device_async_resource_ref` --- cpp/include/cudf/io/detail/orc.hpp | 8 ++++---- cpp/include/cudf/io/orc.hpp | 8 ++++---- cpp/src/io/functions.cpp | 6 +++--- cpp/src/io/orc/reader.cu | 6 +++--- cpp/src/io/orc/reader_impl.cu | 4 ++-- cpp/src/io/orc/reader_impl.hpp | 8 ++++---- 6 files changed, 20 insertions(+), 20 deletions(-) diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index 0000fc8a9c4..4d610891858 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -85,7 +85,7 @@ class chunked_reader { public: /** * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t, size_type, - * orc_reader_options const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + * orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref) * * @param sources Input `datasource` objects to read the dataset from */ @@ -95,10 +95,10 @@ class chunked_reader { std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t, - * orc_reader_options const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + * orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref) * * @param sources Input `datasource` objects to read the dataset from */ @@ -107,7 +107,7 @@ class chunked_reader { std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Destructor explicitly-declared to avoid inlined in header. diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 33f430a9b1c..f4e63a1d84e 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -473,8 +473,8 @@ class chunked_orc_reader { std::size_t data_read_limit, size_type output_row_granularity, orc_reader_options const& options, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct the reader from input/output size limits along with other ORC reader options. 
@@ -495,7 +495,7 @@ class chunked_orc_reader { std::size_t data_read_limit, orc_reader_options const& options, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct the reader from output size limits along with other ORC reader options. @@ -513,7 +513,7 @@ class chunked_orc_reader { std::size_t output_size_limit, orc_reader_options const& options, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Destructor, destroying the internal reader instance. diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index d23caaec45d..4819e5e7b78 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -445,7 +445,7 @@ chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, size_type output_row_granularity, orc_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) : reader{std::make_unique(output_size_limit, data_read_limit, output_row_granularity, @@ -460,7 +460,7 @@ chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, std::size_t data_read_limit, orc_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) : reader{std::make_unique(output_size_limit, data_read_limit, make_datasources(options.get_source()), @@ -473,7 +473,7 @@ chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, orc_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) : chunked_orc_reader(output_size_limit, 0UL, options, stream, mr) { } diff --git a/cpp/src/io/orc/reader.cu b/cpp/src/io/orc/reader.cu index 006e70467b5..d4b0c3383af 100644 --- a/cpp/src/io/orc/reader.cu +++ b/cpp/src/io/orc/reader.cu @@ -25,7 +25,7 @@ reader::~reader() = default; reader::reader(std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) : _impl{std::make_unique(std::move(sources), options, stream, mr)} { } @@ -37,7 +37,7 @@ chunked_reader::chunked_reader(std::size_t output_size_limit, std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) : _impl{std::make_unique( output_size_limit, data_read_limit, std::move(sources), options, stream, mr)} { @@ -49,7 +49,7 @@ chunked_reader::chunked_reader(std::size_t output_size_limit, std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) : _impl{std::make_unique(output_size_limit, data_read_limit, output_row_granularity, diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 4a3ac4833f6..4d874cfc1b9 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -144,7 +144,7 @@ reader_impl::reader_impl(std::size_t output_size_limit, std::vector>&& sources, orc_reader_options const& options, 
rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) : reader_impl::reader_impl(output_size_limit, data_read_limit, DEFAULT_OUTPUT_ROW_GRANULARITY, @@ -161,7 +161,7 @@ reader_impl::reader_impl(std::size_t output_size_limit, std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) : _stream(stream), _mr(mr), _config{options.get_timestamp_type(), diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index a872e5a091c..310102079fb 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -59,18 +59,18 @@ class reader_impl { /** * @copydoc cudf::io::orc::detail::chunked_reader::chunked_reader(std::size_t, std::size_t, - * orc_reader_options const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + * orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref) */ explicit reader_impl(std::size_t output_size_limit, std::size_t data_read_limit, std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::io::orc::detail::chunked_reader::chunked_reader(std::size_t, std::size_t, - * size_type, orc_reader_options const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + * size_type, orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref) */ explicit reader_impl(std::size_t output_size_limit, std::size_t data_read_limit, @@ -78,7 +78,7 @@ class reader_impl { std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::io::orc::detail::reader::read From 3b4d7f22903902bad7bbd43447d3e00b5c72be63 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 23 Apr 2024 10:26:10 -0700 Subject: [PATCH 290/321] Rename `global_preprocess` into `preprocess_file` --- cpp/src/io/orc/reader_impl.cu | 2 +- cpp/src/io/orc/reader_impl.hpp | 6 +++--- cpp/src/io/orc/reader_impl_chunking.cu | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 4d874cfc1b9..289573b38ca 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -30,7 +30,7 @@ void reader_impl::prepare_data(read_mode mode) if (_selected_columns.num_levels() == 0) { return; } // This will be no-op if it was called before. - global_preprocess(mode); + preprocess_file(mode); if (!_chunk_read_data.more_table_chunk_to_output()) { if (!_chunk_read_data.more_stripe_to_decode() && _chunk_read_data.more_stripe_to_load()) { diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 310102079fb..fd78a35792c 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -112,8 +112,8 @@ class reader_impl { void prepare_data(read_mode mode); /** - * @brief Perform a global preprocessing step that executes exactly once for the entire duration - * of the reader. + * @brief Perform a preprocessing step on the input data sources that executes exactly once + * for the entire duration of the reader. * * In this step, the metadata of all stripes in the data sources is parsed, and information about * data streams of the selected columns in all stripes are generated. 
If the reader has a data @@ -123,7 +123,7 @@ class reader_impl { * * @param mode Value indicating if the data sources are read all at once or chunk by chunk */ - void global_preprocess(read_mode mode); + void preprocess_file(read_mode mode); /** * @brief Load stripes from the input data sources into memory. diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 6c8f2c14adf..c05082a377a 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -230,7 +230,7 @@ range get_range(host_span input_ranges, range const& selected_range return {first_range.begin, last_range.end}; } -void reader_impl::global_preprocess(read_mode mode) +void reader_impl::preprocess_file(read_mode mode) { if (_file_itm_data.global_preprocessed) { return; } _file_itm_data.global_preprocessed = true; From 33f6d158fcb23ff26b13dd953f01691474ad14db Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 23 Apr 2024 12:32:48 -0700 Subject: [PATCH 291/321] Optimize memory usage --- cpp/src/io/orc/reader_impl_chunking.cu | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index c05082a377a..6ad67701f0b 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -351,11 +351,15 @@ void reader_impl::preprocess_file(read_mode mode) // Collect all data streams' information: // + // Load all stripes if we are in READ_ALL mode or there is no read limit. + auto const load_all_stripes = + mode == read_mode::READ_ALL || _chunk_read_data.data_read_limit == 0; + // Accumulate data size for data streams in each stripe. // This will be used for CHUNKED_READ mode only. // If we are in READ_ALL mode, we do not need this since we just load all stripes. cudf::detail::hostdevice_vector total_stripe_sizes( - mode == read_mode::CHUNKED_READ ? num_total_stripes : std::size_t{0}, _stream); + load_all_stripes ? std::size_t{0} : num_total_stripes, _stream); for (std::size_t stripe_global_idx = 0; stripe_global_idx < num_total_stripes; ++stripe_global_idx) { @@ -416,9 +420,7 @@ void reader_impl::preprocess_file(read_mode mode) } } // end loop level - if (mode == read_mode::CHUNKED_READ) { - total_stripe_sizes[stripe_global_idx] = {1, this_stripe_size}; - } + if (!load_all_stripes) { total_stripe_sizes[stripe_global_idx] = {1, this_stripe_size}; } // Range of all stream reads in `read_info` corresponding to this stripe, in all levels. stripe_data_read_ranges[stripe_global_idx] = range{last_read_size, read_info.size()}; @@ -432,8 +434,7 @@ void reader_impl::preprocess_file(read_mode mode) // Load range is reset to start from the first position in `load_stripe_ranges`. _chunk_read_data.curr_load_stripe_range = 0; - // Load all stripes if there is no read limit or if we are in READ_ALL mode. 
- if (mode == read_mode::READ_ALL || _chunk_read_data.data_read_limit == 0) { + if (load_all_stripes) { _chunk_read_data.load_stripe_ranges = {range{0UL, num_total_stripes}}; return; } From 5a253bd04bea1a1ea508c89acce50559521d3b23 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 23 Apr 2024 13:29:05 -0700 Subject: [PATCH 292/321] Fix overflow handling Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 69 ++++++++++++++------------ 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 6ad67701f0b..776e000f12f 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -544,6 +544,9 @@ void reader_impl::load_data(read_mode mode) // Decoding range is reset to start from the first position in `decode_stripe_ranges`. _chunk_read_data.curr_decode_stripe_range = 0; + auto constexpr column_size_limit = + static_cast(std::numeric_limits::max()); + // Decode all loaded stripes if there is no read limit, or if we are in READ_ALL mode. // In theory, we should just decode 'enough' stripes for output one table chunk, instead of // decoding all stripes like this, for better load-balancing and reduce memory usage. @@ -552,34 +555,11 @@ void reader_impl::load_data(read_mode mode) // In addition to read limit, we also need to check if the the total number of // rows in the loaded stripes exceeds column size limit. // If that is the case, we cannot decode all stripes at once. - num_loading_rows < static_cast(std::numeric_limits::max())) { + num_loading_rows < column_size_limit) { _chunk_read_data.decode_stripe_ranges = {load_stripe_range}; return; } - // For estimating the decompressed sizes of the loaded stripes. - // Only used in CHUNKED_READ mode. - cudf::detail::hostdevice_vector stripe_decomp_sizes( - mode == read_mode::CHUNKED_READ ? stripe_count : std::size_t{0}, _stream); - - // For mapping stripe to the number of rows in it. - // Only used in READ_ALL mode. - // This is to store exactly the same data as for `stripe_decomp_size` above but here we do not - // need to allocate device memory. - std::vector stripe_rows(mode == read_mode::READ_ALL ? stripe_count - : std::size_t{0}); - - // Fill up the `cumulative_size_and_row` array. - // Note: `hostdevice_vector::begin()` mirrors `std::vector::data()` using incorrect name. - auto const stripe_sizes_rows_ptr = - mode == read_mode::CHUNKED_READ ? stripe_decomp_sizes.begin() : stripe_rows.data(); - for (std::size_t idx = 0; idx < stripe_count; ++idx) { - auto const& stripe = _file_itm_data.selected_stripes[idx + stripe_start]; - auto const stripe_info = stripe.stripe_info; - stripe_sizes_rows_ptr[idx] = - cumulative_size_and_row{1UL /*count*/, 0UL /*size_bytes*/, stripe_info->numberOfRows}; - } - // This is the post-processing step after we've done with splitting `load_stripe_range` into // `decode_stripe_ranges`. auto const add_range_offset = [stripe_start](std::vector& new_ranges) { @@ -593,19 +573,29 @@ void reader_impl::load_data(read_mode mode) }; // Optimized code path when we do not have any read limit but the number of rows in the - // loaded stripes exceeds column size limit. + // loaded stripes exceeds cudf's column size limit. // Note that the values `max_uncompressed_size` for each stripe are not computed here. // Instead, they will be computed on the fly during decoding to avoid the overhead of // storing and retrieving from memory. 
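// The overflow handling below, restated as a stand-alone host-side sketch:
// when the loaded stripes hold more rows than a cudf column can carry,
// decode ranges are cut purely by cumulative row count, keeping at least one
// stripe per range (split_by_rows and range_sketch are hypothetical, not
// cudf API):
#include <cstddef>
#include <cstdint>
#include <vector>

struct range_sketch {
  std::size_t begin, end;
};

std::vector<range_sketch> split_by_rows(std::vector<std::uint64_t> const& stripe_rows,
                                        std::uint64_t row_limit)
{
  std::vector<range_sketch> out;
  std::size_t begin = 0;
  std::uint64_t acc = 0;
  for (std::size_t i = 0; i < stripe_rows.size(); ++i) {
    // Close the current range before it would exceed the limit; `i > begin`
    // guarantees progress when a single stripe alone exceeds the limit.
    if (acc + stripe_rows[i] > row_limit && i > begin) {
      out.push_back({begin, i});
      begin = i;
      acc   = 0;
    }
    acc += stripe_rows[i];
  }
  out.push_back({begin, stripe_rows.size()});
  return out;
}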
if ((mode == read_mode::READ_ALL || _chunk_read_data.data_read_limit == 0) && - num_loading_rows >= static_cast(std::numeric_limits::max())) { - // Here we will split stripe ranges based on stripes' number of rows, not their data size. - // Thus, we use a maximum possible value for data size limit. - // The function `find_splits` will automatically handle row count limit. - _chunk_read_data.decode_stripe_ranges = find_splits( - cudf::host_span(stripe_sizes_rows_ptr, stripe_count), - stripe_count, - std::numeric_limits::max()); + num_loading_rows >= column_size_limit) { + std::vector cumulative_stripe_rows(stripe_count); + std::size_t rows{0}; + + for (std::size_t idx = 0; idx < stripe_count; ++idx) { + auto const& stripe = _file_itm_data.selected_stripes[idx + stripe_start]; + auto const stripe_info = stripe.stripe_info; + rows += stripe_info->numberOfRows; + + // Here we will split stripe ranges based only on stripes' number of rows, not data size. + // Thus, we override the cumulative `size_bytes` using the prefix sum of rows in stripe and + // will use the column size limit (`std::numeric_limits::max()`) as split limit. + cumulative_stripe_rows[idx] = + cumulative_size_and_row{idx + 1UL /*count*/, rows /*size_bytes*/, rows}; + } + + _chunk_read_data.decode_stripe_ranges = + find_splits(cumulative_stripe_rows, stripe_count, column_size_limit); add_range_offset(_chunk_read_data.decode_stripe_ranges); return; } @@ -615,6 +605,19 @@ void reader_impl::load_data(read_mode mode) // memory: // + // For estimating the decompressed sizes of the loaded stripes. + cudf::detail::hostdevice_vector stripe_decomp_sizes(stripe_count, + _stream); + + // Fill up the `cumulative_size_and_row` array with initial values. + // Note: `hostdevice_vector::begin()` mirrors `std::vector::data()` using incorrect name. + for (std::size_t idx = 0; idx < stripe_count; ++idx) { + auto const& stripe = _file_itm_data.selected_stripes[idx + stripe_start]; + auto const stripe_info = stripe.stripe_info; + stripe_decomp_sizes[idx] = + cumulative_size_and_row{1UL /*count*/, 0UL /*size_bytes*/, stripe_info->numberOfRows}; + } + auto& compinfo_map = _file_itm_data.compinfo_map; compinfo_map.clear(); // clear cache of the last load From 7a9c43671c29d6683287c90028afff6af42a1cdd Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 23 Apr 2024 13:53:02 -0700 Subject: [PATCH 293/321] Remove `reader.cu` Signed-off-by: Nghia Truong --- cpp/CMakeLists.txt | 1 - cpp/src/io/orc/reader.cu | 69 ----------------------------------- cpp/src/io/orc/reader_impl.cu | 46 +++++++++++++++++++++++ 3 files changed, 46 insertions(+), 70 deletions(-) delete mode 100644 cpp/src/io/orc/reader.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1a5827ef144..648cbf0a428 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -393,7 +393,6 @@ add_library( src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu src/io/orc/orc.cpp - src/io/orc/reader.cu src/io/orc/reader_impl.cu src/io/orc/reader_impl_chunking.cu src/io/orc/reader_impl_decode.cu diff --git a/cpp/src/io/orc/reader.cu b/cpp/src/io/orc/reader.cu deleted file mode 100644 index d4b0c3383af..00000000000 --- a/cpp/src/io/orc/reader.cu +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "io/orc/reader_impl.hpp" -#include "io/orc/reader_impl_helpers.hpp" - -namespace cudf::io::orc::detail { - -// Destructor are defined within this translation unit. -reader::~reader() = default; - -reader::reader(std::vector>&& sources, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - : _impl{std::make_unique(std::move(sources), options, stream, mr)} -{ -} - -table_with_metadata reader::read() { return _impl->read(); } - -chunked_reader::chunked_reader(std::size_t output_size_limit, - std::size_t data_read_limit, - std::vector>&& sources, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - : _impl{std::make_unique( - output_size_limit, data_read_limit, std::move(sources), options, stream, mr)} -{ -} - -chunked_reader::chunked_reader(std::size_t output_size_limit, - std::size_t data_read_limit, - size_type output_row_granularity, - std::vector>&& sources, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - : _impl{std::make_unique(output_size_limit, - data_read_limit, - output_row_granularity, - std::move(sources), - options, - stream, - mr)} -{ -} - -chunked_reader::~chunked_reader() = default; - -bool chunked_reader::has_next() const { return _impl->has_next(); } - -table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); } - -} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 289573b38ca..e9c34896425 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -201,4 +201,50 @@ table_with_metadata reader_impl::read_chunk() return make_output_chunk(); } +chunked_reader::chunked_reader(std::size_t output_size_limit, + std::size_t data_read_limit, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : _impl{std::make_unique( + output_size_limit, data_read_limit, std::move(sources), options, stream, mr)} +{ +} + +chunked_reader::chunked_reader(std::size_t output_size_limit, + std::size_t data_read_limit, + size_type output_row_granularity, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : _impl{std::make_unique(output_size_limit, + data_read_limit, + output_row_granularity, + std::move(sources), + options, + stream, + mr)} +{ +} + +chunked_reader::~chunked_reader() = default; + +bool chunked_reader::has_next() const { return _impl->has_next(); } + +table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); } + +reader::reader(std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : _impl{std::make_unique(std::move(sources), options, stream, mr)} +{ +} + +reader::~reader() = default; + +table_with_metadata reader::read() { return _impl->read(); } + } // namespace cudf::io::orc::detail From 
219372213a9f7d96cfc8094bebf240aa4e6edbe0 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 23 Apr 2024 14:06:17 -0700 Subject: [PATCH 294/321] Add a test Signed-off-by: Nghia Truong --- cpp/tests/io/orc_chunked_reader_test.cu | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 59727c6d5fc..1c1b53ea17f 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -174,6 +174,18 @@ TEST_F(OrcChunkedReaderTest, TestChunkedReadNoData) CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } +TEST_F(OrcChunkedReaderTest, TestChunkedReadInvalidParameter) +{ + std::vector> input_columns; + input_columns.emplace_back(int32s_col{}.release()); + input_columns.emplace_back(int64s_col{}.release()); + + auto const [expected, filepath] = write_file(input_columns, "chunked_read_invalid"); + EXPECT_THROW( + chunked_read(filepath, output_limit{1'000}, output_row_granularity{-1} /*invalid value*/), + cudf::logic_error); +} + TEST_F(OrcChunkedReaderTest, TestChunkedReadSimpleData) { auto constexpr num_rows = 40'000; From 4d3ddd18c4582a921ca632abdc5b4b81809eb160 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 23 Apr 2024 14:40:56 -0700 Subject: [PATCH 295/321] Rewrite benchmark Signed-off-by: Nghia Truong --- cpp/benchmarks/io/orc/orc_reader_input.cpp | 45 ++++++++++++---------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp index 58b43367382..1f73374ac20 100644 --- a/cpp/benchmarks/io/orc/orc_reader_input.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp @@ -30,6 +30,7 @@ namespace { // run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; +constexpr std::size_t Mbytes{1024 * 1024}; template void orc_read_common(cudf::size_type num_rows_to_read, @@ -46,10 +47,12 @@ void orc_read_common(cudf::size_type num_rows_to_read, state.exec( nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch&, auto& timer) { try_drop_l3_cache(); - auto const output_limit = static_cast(state.get_int64("output_limit")); - auto const read_limit = static_cast(state.get_int64("read_limit")); + auto const output_limit_MB = + static_cast(state.get_int64("chunk_read_limit_MB")); + auto const read_limit_MB = static_cast(state.get_int64("pass_read_limit_MB")); - auto reader = cudf::io::chunked_orc_reader(output_limit, read_limit, read_opts); + auto reader = + cudf::io::chunked_orc_reader(output_limit_MB * Mbytes, read_limit_MB * Mbytes, read_opts); cudf::size_type num_rows{0}; timer.start(); @@ -120,15 +123,21 @@ void orc_read_io_compression(nvbench::state& state) static_cast(data_type::LIST), static_cast(data_type::STRUCT)}); - cudf::size_type const cardinality = state.get_int64("cardinality"); - cudf::size_type const run_length = state.get_int64("run_length"); + auto const [cardinality, run_length] = [&]() -> std::pair { + if constexpr (chunked_read) { + return {0, 4}; + } else { + return {static_cast(state.get_int64("cardinality")), + static_cast(state.get_int64("run_length"))}; + } + }(); cuio_source_sink_pair source_sink(IOType); auto const num_rows_written = [&]() { auto const tbl = create_random_table( cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + 
data_profile_builder{}.cardinality(cardinality).avg_run_length(run_length)); auto const view = tbl->view(); cudf::io::orc_writer_options opts = @@ -149,12 +158,12 @@ void BM_orc_read_io_compression( return orc_read_io_compression(state); } -template -void BM_orc_chunked_read_io_compression( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +void BM_orc_chunked_read_io_compression(nvbench::state& state, + nvbench::type_list>) { - return orc_read_io_compression(state); + // Only run benchmark using HOST_BUFFER IO. + return orc_read_io_compression(state); } using d_type_list = nvbench::enum_type_list Date: Tue, 23 Apr 2024 14:52:37 -0700 Subject: [PATCH 296/321] Rename parameters Signed-off-by: Nghia Truong --- cpp/include/cudf/io/detail/orc.hpp | 8 +++--- cpp/include/cudf/io/orc.hpp | 36 ++++++++++++------------- cpp/src/io/functions.cpp | 20 +++++++------- cpp/src/io/orc/reader_impl.cu | 28 +++++++++---------- cpp/src/io/orc/reader_impl.hpp | 10 +++---- cpp/src/io/orc/reader_impl_chunking.cu | 10 +++---- cpp/src/io/orc/reader_impl_chunking.hpp | 12 ++++----- cpp/src/io/orc/reader_impl_decode.cu | 4 +-- 8 files changed, 64 insertions(+), 64 deletions(-) diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index 4d610891858..597ddd9cf0a 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -89,8 +89,8 @@ class chunked_reader { * * @param sources Input `datasource` objects to read the dataset from */ - explicit chunked_reader(std::size_t output_size_limit, - std::size_t data_read_limit, + explicit chunked_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, size_type output_row_granularity, std::vector>&& sources, orc_reader_options const& options, @@ -102,8 +102,8 @@ class chunked_reader { * * @param sources Input `datasource` objects to read the dataset from */ - explicit chunked_reader(std::size_t output_size_limit, - std::size_t data_read_limit, + explicit chunked_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index f4e63a1d84e..8140f8897b7 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -407,7 +407,7 @@ table_with_metadata read_orc( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** - * @brief The chunked orc reader class to read ORC file iteratively into a series of + * @brief The chunked orc reader class to read an ORC file iteratively into a series of * tables, chunk by chunk. * * This class is designed to address the reading issue when reading very large ORC files such @@ -437,28 +437,28 @@ class chunked_orc_reader { * * ``` * - * If `output_size_limit == 0` (i.e., no output limit) and `data_read_limit == 0` (no temporary + * If `chunk_read_limit == 0` (i.e., no output limit) and `pass_read_limit == 0` (no temporary * memory size limit), a call to `read_chunk()` will read the whole data source and return a table * containing all rows. * - * The `output_size_limit` parameter controls the size of the output table to be returned per + * The `chunk_read_limit` parameter controls the size of the output table to be returned per * `read_chunk()` call. If the user specifies a 100 MB limit, the reader will attempt to return * tables that have a total bytes size (over all columns) of 100 MB or less. 
* This is a soft limit and the code will not fail if it cannot satisfy the limit.
  *
-   * The `data_read_limit` parameter controls how much temporary memory is used in the entire
+   * The `pass_read_limit` parameter controls how much temporary memory is used in the entire
   * process of loading, decompressing and decoding of data. Again, this is also a soft limit and
   * the reader will try to make the best effort.
   *
   * Finally, the parameter `output_row_granularity` controls the changes in row number of the
-   * output chunk. For each call to `read_chunk()`, with respect to the given `data_read_limit`, a
+   * output chunk. For each call to `read_chunk()`, with respect to the given `pass_read_limit`, a
   * subset of stripes may be loaded, decompressed and decoded into an intermediate table. The
   * reader will then subdivide that table into smaller tables for final output using
   * `output_row_granularity` as the subdivision step.
   *
-   * @param output_size_limit Limit on total number of bytes to be returned per `read_chunk()` call,
+   * @param chunk_read_limit Limit on total number of bytes to be returned per `read_chunk()` call,
   * or `0` if there is no limit
-   * @param data_read_limit Limit on temporary memory usage for reading the data sources,
+   * @param pass_read_limit Limit on temporary memory usage for reading the data sources,
   * or `0` if there is no limit
   * @param output_row_granularity The granularity parameter used for subdividing the decoded
   * table for final output
@@ -469,8 +469,8 @@ class chunked_orc_reader {
   * @throw cudf::logic_error if `output_row_granularity` is non-positive
   */
  explicit chunked_orc_reader(
-    std::size_t output_size_limit,
-    std::size_t data_read_limit,
+    std::size_t chunk_read_limit,
+    std::size_t pass_read_limit,
    size_type output_row_granularity,
    orc_reader_options const& options,
    rmm::cuda_stream_view stream = cudf::get_default_stream(),
@@ -482,37 +482,37 @@ class chunked_orc_reader {
   * This constructor implicitly call the other constructor with `output_row_granularity` set to
   * `DEFAULT_OUTPUT_ROW_GRANULARITY` rows.
   *
-   * @param output_size_limit Limit on total number of bytes to be returned per `read_chunk()` call,
+   * @param chunk_read_limit Limit on total number of bytes to be returned per `read_chunk()` call,
   * or `0` if there is no limit
-   * @param data_read_limit Limit on temporary memory usage for reading the data sources,
+   * @param pass_read_limit Limit on temporary memory usage for reading the data sources,
   * or `0` if there is no limit
   * @param options Settings for controlling reading behaviors
   * @param stream CUDA stream used for device memory operations and kernel launches
   * @param mr Device memory resource to use for device memory allocation
   */
  explicit chunked_orc_reader(
-    std::size_t output_size_limit,
-    std::size_t data_read_limit,
+    std::size_t chunk_read_limit,
+    std::size_t pass_read_limit,
    orc_reader_options const& options,
-    rmm::cuda_stream_view stream = cudf::get_default_stream(),
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
  /**
   * @brief Construct the reader from output size limits along with other ORC reader options.
   *
-  * This constructor implicitly call the other constructor with `data_read_limit` set to `0` and
+  * This constructor implicitly calls the other constructor with `pass_read_limit` set to `0` and
   * `output_row_granularity` set to `DEFAULT_OUTPUT_ROW_GRANULARITY` rows.
* - * @param output_size_limit Limit on total number of bytes to be returned per `read_chunk()` call, + * @param chunk_read_limit Limit on total number of bytes to be returned per `read_chunk()` call, * or `0` if there is no limit * @param options Settings for controlling reading behaviors * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit chunked_orc_reader( - std::size_t output_size_limit, + std::size_t chunk_read_limit, orc_reader_options const& options, - rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 4819e5e7b78..74b5a654382 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -440,14 +440,14 @@ void write_orc(orc_writer_options const& options, rmm::cuda_stream_view stream) writer->write(options.get_table()); } -chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, - std::size_t data_read_limit, +chunked_orc_reader::chunked_orc_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, size_type output_row_granularity, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) - : reader{std::make_unique(output_size_limit, - data_read_limit, + : reader{std::make_unique(chunk_read_limit, + pass_read_limit, output_row_granularity, make_datasources(options.get_source()), options, @@ -456,13 +456,13 @@ chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, { } -chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, - std::size_t data_read_limit, +chunked_orc_reader::chunked_orc_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) - : reader{std::make_unique(output_size_limit, - data_read_limit, + : reader{std::make_unique(chunk_read_limit, + pass_read_limit, make_datasources(options.get_source()), options, stream, @@ -470,11 +470,11 @@ chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, { } -chunked_orc_reader::chunked_orc_reader(std::size_t output_size_limit, +chunked_orc_reader::chunked_orc_reader(std::size_t chunk_read_limit, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) - : chunked_orc_reader(output_size_limit, 0UL, options, stream, mr) + : chunked_orc_reader(chunk_read_limit, 0UL, options, stream, mr) { } diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index e9c34896425..63cc0226ea3 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -139,14 +139,14 @@ reader_impl::reader_impl(std::vector>&& sources, { } -reader_impl::reader_impl(std::size_t output_size_limit, - std::size_t data_read_limit, +reader_impl::reader_impl(std::size_t chunk_read_limit, + std::size_t pass_read_limit, std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) - : reader_impl::reader_impl(output_size_limit, - data_read_limit, + : reader_impl::reader_impl(chunk_read_limit, + pass_read_limit, DEFAULT_OUTPUT_ROW_GRANULARITY, std::move(sources), options, @@ -155,8 +155,8 @@ reader_impl::reader_impl(std::size_t output_size_limit, { } 
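// A usage sketch (not part of this patch) driving the limits renamed here through the public
// API; `read_orc_in_chunks` and the `process()` consumer are illustrative stand-ins, and the
// file name is hypothetical.
namespace {
void read_orc_in_chunks(void (*process)(cudf::table_view const&))
{
  auto const opts =
    cudf::io::orc_reader_options::builder(cudf::io::source_info{"input.orc"}).build();
  // ~512 MB cap per returned chunk, ~2 GB cap on temporary memory per pass; both are soft.
  auto reader = cudf::io::chunked_orc_reader(
    std::size_t{512} << 20 /*chunk_read_limit*/, std::size_t{2} << 30 /*pass_read_limit*/, opts);
  while (reader.has_next()) {
    auto chunk = reader.read_chunk();  // a cudf::io::table_with_metadata
    process(chunk.tbl->view());        // hand each output chunk to the consumer
  }
}
}  // namespace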
-reader_impl::reader_impl(std::size_t output_size_limit, - std::size_t data_read_limit, +reader_impl::reader_impl(std::size_t chunk_read_limit, + std::size_t pass_read_limit, size_type output_row_granularity, std::vector>&& sources, orc_reader_options const& options, @@ -175,7 +175,7 @@ reader_impl::reader_impl(std::size_t output_size_limit, _sources(std::move(sources)), _metadata{_sources, stream}, _selected_columns{_metadata.select_columns(options.get_columns())}, - _chunk_read_data{output_size_limit, data_read_limit, output_row_granularity} + _chunk_read_data{chunk_read_limit, pass_read_limit, output_row_granularity} { // Selected columns at different levels of nesting are stored in different elements // of `selected_columns`; thus, size == 1 means no nested columns. @@ -201,26 +201,26 @@ table_with_metadata reader_impl::read_chunk() return make_output_chunk(); } -chunked_reader::chunked_reader(std::size_t output_size_limit, - std::size_t data_read_limit, +chunked_reader::chunked_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) : _impl{std::make_unique( - output_size_limit, data_read_limit, std::move(sources), options, stream, mr)} + chunk_read_limit, pass_read_limit, std::move(sources), options, stream, mr)} { } -chunked_reader::chunked_reader(std::size_t output_size_limit, - std::size_t data_read_limit, +chunked_reader::chunked_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, size_type output_row_granularity, std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) - : _impl{std::make_unique(output_size_limit, - data_read_limit, + : _impl{std::make_unique(chunk_read_limit, + pass_read_limit, output_row_granularity, std::move(sources), options, diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index fd78a35792c..b9ec4a74a31 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -44,7 +44,7 @@ class reader_impl { /** * @brief Constructor from a dataset source with reader options. * - * This constructor will call the other constructor with `output_size_limit` and `data_read_limit` + * This constructor will call the other constructor with `chunk_read_limit` and `pass_read_limit` * set to `0` and `output_row_granularity` set to `DEFAULT_OUTPUT_ROW_GRANULARITY`. 
* * @param sources Dataset sources @@ -61,8 +61,8 @@ class reader_impl { * @copydoc cudf::io::orc::detail::chunked_reader::chunked_reader(std::size_t, std::size_t, * orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref) */ - explicit reader_impl(std::size_t output_size_limit, - std::size_t data_read_limit, + explicit reader_impl(std::size_t chunk_read_limit, + std::size_t pass_read_limit, std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, @@ -72,8 +72,8 @@ class reader_impl { * @copydoc cudf::io::orc::detail::chunked_reader::chunked_reader(std::size_t, std::size_t, * size_type, orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref) */ - explicit reader_impl(std::size_t output_size_limit, - std::size_t data_read_limit, + explicit reader_impl(std::size_t chunk_read_limit, + std::size_t pass_read_limit, size_type output_row_granularity, std::vector>&& sources, orc_reader_options const& options, diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 776e000f12f..7e96b251868 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -353,7 +353,7 @@ void reader_impl::preprocess_file(read_mode mode) // Load all stripes if we are in READ_ALL mode or there is no read limit. auto const load_all_stripes = - mode == read_mode::READ_ALL || _chunk_read_data.data_read_limit == 0; + mode == read_mode::READ_ALL || _chunk_read_data.pass_read_limit == 0; // Accumulate data size for data streams in each stripe. // This will be used for CHUNKED_READ mode only. @@ -449,7 +449,7 @@ void reader_impl::preprocess_file(read_mode mode) total_stripe_sizes.device_to_host_sync(_stream); auto const load_limit = [&] { - auto const tmp = static_cast(_chunk_read_data.data_read_limit * + auto const tmp = static_cast(_chunk_read_data.pass_read_limit * chunk_read_data::load_limit_ratio); // Make sure not to pass 0 byte limit (due to round-off) to `find_splits`. return tmp > 0UL ? tmp : 1UL; @@ -551,7 +551,7 @@ void reader_impl::load_data(read_mode mode) // In theory, we should just decode 'enough' stripes for output one table chunk, instead of // decoding all stripes like this, for better load-balancing and reduce memory usage. // However, we do not have any good way to know how many stripes are 'enough'. - if ((mode == read_mode::READ_ALL || _chunk_read_data.data_read_limit == 0) && + if ((mode == read_mode::READ_ALL || _chunk_read_data.pass_read_limit == 0) && // In addition to read limit, we also need to check if the the total number of // rows in the loaded stripes exceeds column size limit. // If that is the case, we cannot decode all stripes at once. @@ -577,7 +577,7 @@ void reader_impl::load_data(read_mode mode) // Note that the values `max_uncompressed_size` for each stripe are not computed here. // Instead, they will be computed on the fly during decoding to avoid the overhead of // storing and retrieving from memory. 
- if ((mode == read_mode::READ_ALL || _chunk_read_data.data_read_limit == 0) && + if ((mode == read_mode::READ_ALL || _chunk_read_data.pass_read_limit == 0) && num_loading_rows >= column_size_limit) { std::vector cumulative_stripe_rows(stripe_count); std::size_t rows{0}; @@ -703,7 +703,7 @@ void reader_impl::load_data(read_mode mode) stripe_decomp_sizes.device_to_host_sync(_stream); auto const decode_limit = [&] { - auto const tmp = static_cast(_chunk_read_data.data_read_limit * + auto const tmp = static_cast(_chunk_read_data.pass_read_limit * chunk_read_data::decode_limit_ratio); // Make sure not to pass 0 byte limit to `find_splits`. return tmp > 0UL ? tmp : 1UL; diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 58b67760b66..7e58188a0a7 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -159,8 +159,8 @@ struct chunk_read_data { explicit chunk_read_data(std::size_t output_size_limit_, std::size_t data_read_limit_, size_type output_row_granularity_) - : output_size_limit{output_size_limit_}, - data_read_limit{data_read_limit_}, + : chunk_read_limit{output_size_limit_}, + pass_read_limit{data_read_limit_}, output_row_granularity{output_row_granularity_} { CUDF_EXPECTS(output_row_granularity > 0, @@ -168,14 +168,14 @@ struct chunk_read_data { } std::size_t const - output_size_limit; // maximum size (in bytes) of an output chunk, or 0 for no limit - std::size_t const data_read_limit; // approximate maximum size (in bytes) used for store + chunk_read_limit; // maximum size (in bytes) of an output chunk, or 0 for no limit + std::size_t const pass_read_limit; // approximate maximum size (in bytes) used for store // intermediate data, or 0 for no limit size_type const output_row_granularity; // Memory limits for loading data and decoding are computed as - // `load/decode_limit_ratio * data_read_limit`. - // This is to maintain the total memory usage to be **around** the given `data_read_limit`. + // `load/decode_limit_ratio * pass_read_limit`. + // This is to maintain the total memory usage to be **around** the given `pass_read_limit`. // Note that sum of these limits may not be `1.0`, and their values are set empirically. static double constexpr load_limit_ratio{0.25}; static double constexpr decode_limit_ratio{0.6}; diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 91dbff9689e..2ac3f0dfb3b 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -1066,12 +1066,12 @@ void reader_impl::decompress_and_decode(read_mode mode) // Split the decoded table into ranges that be output into chunks having size within the given // output size limit. _chunk_read_data.output_table_ranges = - _chunk_read_data.output_size_limit == 0 + _chunk_read_data.chunk_read_limit == 0 ? 
std::vector{range{ 0, static_cast(_chunk_read_data.decoded_table->num_rows())}} : find_table_splits(_chunk_read_data.decoded_table->view(), _chunk_read_data.output_row_granularity, - _chunk_read_data.output_size_limit, + _chunk_read_data.chunk_read_limit, _stream); } From b5343dc1a82abfe587e2d1ed6f67d0ee8dd07f9f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 23 Apr 2024 14:55:35 -0700 Subject: [PATCH 297/321] Rename parameter Signed-off-by: Nghia Truong --- cpp/src/io/orc/aggregate_orc_metadata.cpp | 7 ++++--- cpp/src/io/orc/aggregate_orc_metadata.hpp | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index da49fc84d06..ac0dd10856c 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -156,15 +156,16 @@ std::tuple> aggregate_orc_metadata::select_stripes( std::vector> const& user_specified_stripes, int64_t skip_rows, - std::optional const& num_rows, + std::optional const& num_read_rows, rmm::cuda_stream_view stream) { - CUDF_EXPECTS((skip_rows == 0 and not num_rows.has_value()) or user_specified_stripes.empty(), + CUDF_EXPECTS((skip_rows == 0 and not num_read_rows.has_value()) or user_specified_stripes.empty(), "Can't use both the row selection and the stripe selection"); auto [rows_to_skip, rows_to_read] = [&]() { if (not user_specified_stripes.empty()) { return std::pair{0, 0}; } - return cudf::io::detail::skip_rows_num_rows_from_options(skip_rows, num_rows, get_num_rows()); + return cudf::io::detail::skip_rows_num_rows_from_options( + skip_rows, num_read_rows, get_num_rows()); }(); struct stripe_source_mapping { diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp index 94f681fff0c..5da5af58b9b 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.hpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp @@ -118,7 +118,7 @@ class aggregate_orc_metadata { * * @param user_specified_stripes The specified stripe indices to read * @param skip_rows Number of rows to skip from reading - * @param num_rows Number of rows to read + * @param num_read_rows Number of rows to read * @param stream CUDA stream used for device memory operations and kernel launches * @return A tuple of the corrected skip_rows and num_rows values along with a vector of * stripes' metadata such as footer, data information, and source index @@ -126,7 +126,7 @@ class aggregate_orc_metadata { [[nodiscard]] std::tuple> select_stripes( std::vector> const& user_specified_stripes, int64_t skip_rows, - std::optional const& num_rows, + std::optional const& num_read_rows, rmm::cuda_stream_view stream); /** From bdc92a0f5bed1e1683e5e4dfea5b22a9ea832747 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 23 Apr 2024 15:47:06 -0700 Subject: [PATCH 298/321] Rename functions Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 14 +++++++------- cpp/src/io/orc/reader_impl.hpp | 14 +++++++------- cpp/src/io/orc/reader_impl_chunking.cu | 2 +- cpp/src/io/orc/reader_impl_chunking.hpp | 8 ++++---- cpp/src/io/orc/reader_impl_decode.cu | 4 ++-- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 63cc0226ea3..b877caff09a 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -32,19 +32,19 @@ void reader_impl::prepare_data(read_mode mode) // This will be no-op if it was called before. 
preprocess_file(mode);
 
-  if (!_chunk_read_data.more_table_chunk_to_output()) {
-    if (!_chunk_read_data.more_stripe_to_decode() && _chunk_read_data.more_stripe_to_load()) {
+  if (!_chunk_read_data.more_table_chunks_to_output()) {
+    if (!_chunk_read_data.more_stripes_to_decode() && _chunk_read_data.more_stripes_to_load()) {
       // Only load stripe data if:
       // - There is more stripe to load, and
       // - All loaded stripes were decoded, and
       // - All the decoded results were output.
-      load_data(mode);
+      load_next_stripe_data(mode);
     }
-    if (_chunk_read_data.more_stripe_to_decode()) {
+    if (_chunk_read_data.more_stripes_to_decode()) {
       // Only decompress/decode the loaded stripes if:
       // - There are loaded stripes that were not decoded yet, and
       // - All the decoded results were output.
-      decompress_and_decode(mode);
+      decompress_and_decode_stripes(mode);
     }
   }
 }
 
@@ -55,7 +55,7 @@ table_with_metadata reader_impl::make_output_chunk()
   if (_selected_columns.num_levels() == 0) { return {std::make_unique<table>
(), table_metadata{}}; }
 
   // If no rows or stripes to read, return empty columns.
-  if (!_chunk_read_data.more_table_chunk_to_output()) {
+  if (!_chunk_read_data.more_table_chunks_to_output()) {
     std::vector<std::unique_ptr<column>> out_columns;
     auto out_metadata = get_meta_with_user_data();
     std::transform(_selected_columns.levels[0].begin(),
@@ -94,7 +94,7 @@ table_with_metadata reader_impl::make_output_chunk()
   auto output = std::make_unique<table>
(out_tview, _stream, _mr); // If this is the last slice, we also delete the decoded table to free up memory. - if (!_chunk_read_data.more_table_chunk_to_output()) { + if (!_chunk_read_data.more_table_chunks_to_output()) { _chunk_read_data.decoded_table.reset(nullptr); } diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index b9ec4a74a31..a07ef5d917d 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -118,8 +118,8 @@ class reader_impl { * In this step, the metadata of all stripes in the data sources is parsed, and information about * data streams of the selected columns in all stripes are generated. If the reader has a data * read limit, sizes of these streams are used to split the list of all stripes into multiple - * subsets, each of which will be read into memory in the `load_data()` step. These subsets are - * computed such that memory usage will be kept to be around a fixed size limit. + * subsets, each of which will be read into memory in the `load_next_stripe_data()` step. These + * subsets are computed such that memory usage will be kept to be around a fixed size limit. * * @param mode Value indicating if the data sources are read all at once or chunk by chunk */ @@ -132,23 +132,23 @@ class reader_impl { * their total data size does not exceed a fixed size limit. Then, the data is probed to * estimate its uncompressed sizes, which are in turn used to split that stripe subset into * smaller subsets, each of which to be decompressed and decoded in the next step - * `decompress_and_decode()`. This is to ensure that loading data from data sources together with - * decompression and decoding will be capped around the given data read limit. + * `decompress_and_decode_stripes()`. This is to ensure that loading data from data sources + * together with decompression and decoding will be capped around the given data read limit. * * @param mode Value indicating if the data sources are read all at once or chunk by chunk */ - void load_data(read_mode mode); + void load_next_stripe_data(read_mode mode); /** * @brief Decompress and decode stripe data in the internal buffers, and store the result into * an intermediate table. * * This function expects that the other preprocessing steps (`global preprocess()` and - * `load_data()`) have already been done. + * `load_next_stripe_data()`) have already been done. * * @param mode Value indicating if the data sources are read all at once or chunk by chunk */ - void decompress_and_decode(read_mode mode); + void decompress_and_decode_stripes(read_mode mode); /** * @brief Create the output table from the intermediate table and return it along with metadata. diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 7e96b251868..3afb626de72 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -459,7 +459,7 @@ void reader_impl::preprocess_file(read_mode mode) find_splits(total_stripe_sizes, num_total_stripes, load_limit); } -void reader_impl::load_data(read_mode mode) +void reader_impl::load_next_stripe_data(read_mode mode) { if (_file_itm_data.has_no_data()) { return; } diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 7e58188a0a7..19db24b70f5 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -184,12 +184,12 @@ struct chunk_read_data { // limit. 
std::vector load_stripe_ranges; std::size_t curr_load_stripe_range{0}; - bool more_stripe_to_load() const { return curr_load_stripe_range < load_stripe_ranges.size(); } + bool more_stripes_to_load() const { return curr_load_stripe_range < load_stripe_ranges.size(); } // Chunks of stripes such that their decompression size is within a size limit. std::vector decode_stripe_ranges; std::size_t curr_decode_stripe_range{0}; - bool more_stripe_to_decode() const + bool more_stripes_to_decode() const { return curr_decode_stripe_range < decode_stripe_ranges.size(); } @@ -198,7 +198,7 @@ struct chunk_read_data { std::vector output_table_ranges; std::size_t curr_output_table_range{0}; std::unique_ptr decoded_table; - bool more_table_chunk_to_output() const + bool more_table_chunks_to_output() const { return curr_output_table_range < output_table_ranges.size(); } @@ -206,7 +206,7 @@ struct chunk_read_data { bool has_next() const { // Only has more chunk to output if: - return more_stripe_to_load() || more_stripe_to_decode() || more_table_chunk_to_output(); + return more_stripes_to_load() || more_stripes_to_decode() || more_table_chunks_to_output(); } }; diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 2ac3f0dfb3b..5ab8516276d 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -710,7 +710,7 @@ std::vector find_table_splits(table_view const& input, } // namespace -void reader_impl::decompress_and_decode(read_mode mode) +void reader_impl::decompress_and_decode_stripes(read_mode mode) { if (_file_itm_data.has_no_data()) { return; } @@ -746,7 +746,7 @@ void reader_impl::decompress_and_decode(read_mode mode) _file_itm_data.rows_to_skip = 0; _file_itm_data.rows_to_read -= rows_to_decode; - // Technically, overflow here should never happen because the `load_data()` step + // Technically, overflow here should never happen because the `load_next_stripe_data()` step // already handled it by splitting the loaded stripe range into multiple decode ranges. CUDF_EXPECTS(rows_to_decode <= static_cast(std::numeric_limits::max()), "Number or rows to decode exceeds the column size limit.", From 252d546a9058c3af8072cc360687a77e5d479344 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 23 Apr 2024 16:08:08 -0700 Subject: [PATCH 299/321] Fix format Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index b877caff09a..048794c3c05 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -51,7 +51,7 @@ void reader_impl::prepare_data(read_mode mode) table_with_metadata reader_impl::make_output_chunk() { - // There is no columns in the table. + // There are no columns in the table. if (_selected_columns.num_levels() == 0) { return {std::make_unique
(), table_metadata{}}; } // If no rows or stripes to read, return empty columns. @@ -119,7 +119,9 @@ table_metadata reader_impl::get_meta_with_user_data() std::transform(meta.ff.metadata.cbegin(), meta.ff.metadata.cend(), std::inserter(kv_map, kv_map.end()), - [](auto const& kv) { return std::pair{kv.name, kv.value}; }); + [](auto const& kv) { + return std::pair{kv.name, kv.value}; + }); return kv_map; }); out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), From 8ebbb2cb2791a4cc3177d12915b6e18850c9ca11 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 23 Apr 2024 19:59:44 -0700 Subject: [PATCH 300/321] Change comments Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 3afb626de72..0cb73a5f329 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -198,7 +198,7 @@ std::vector find_splits(host_span cumulative_sizes, // If the last range has size smaller than `merge_threshold` the size of the second last one, // merge it with the second last one. - // This is to prevent having too small trailing range. + // This is to prevent having the last range too small. if (splits.size() > 1) { double constexpr merge_threshold = 0.15; if (auto const last = splits.back(), second_last = splits[splits.size() - 2]; @@ -427,8 +427,8 @@ void reader_impl::preprocess_file(read_mode mode) } // - // Split range of all stripes into subranges that can be loaded separately without blowing up - // memory: + // Split range of all stripes into subranges that can be loaded separately while maintaining + // the memory usage under the given pass limit: // // Load range is reset to start from the first position in `load_stripe_ranges`. @@ -588,7 +588,7 @@ void reader_impl::load_next_stripe_data(read_mode mode) rows += stripe_info->numberOfRows; // Here we will split stripe ranges based only on stripes' number of rows, not data size. - // Thus, we override the cumulative `size_bytes` using the prefix sum of rows in stripe and + // Thus, we override the cumulative `size_bytes` using the prefix sum of rows in stripes and // will use the column size limit (`std::numeric_limits::max()`) as split limit. cumulative_stripe_rows[idx] = cumulative_size_and_row{idx + 1UL /*count*/, rows /*size_bytes*/, rows}; From a793eb73134f375a4178d8585913a8cc499e2462 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 23 Apr 2024 20:11:14 -0700 Subject: [PATCH 301/321] Change comments and rename variable Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 2 +- cpp/src/io/orc/reader_impl_chunking.hpp | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 0cb73a5f329..39026783135 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -704,7 +704,7 @@ void reader_impl::load_next_stripe_data(read_mode mode) auto const decode_limit = [&] { auto const tmp = static_cast(_chunk_read_data.pass_read_limit * - chunk_read_data::decode_limit_ratio); + chunk_read_data::decompress_and_decode_limit_ratio); // Make sure not to pass 0 byte limit to `find_splits`. return tmp > 0UL ? 
tmp : 1UL; }(); diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 19db24b70f5..d7ddf9d50f9 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -57,7 +57,7 @@ struct stripe_level_comp_info { }; /** - * @brief Struct that store source information of an ORC streams. + * @brief Struct that stores source information of an ORC streams. */ struct stream_source_info { std::size_t stripe_idx; // global stripe id throughout all data sources @@ -91,7 +91,7 @@ using stream_source_map = std::unordered_map; /** - * @brief Struct that store information of an ORC stream. + * @brief Struct that stores information of an ORC stream. */ struct orc_stream_info { // Data info: @@ -114,8 +114,8 @@ struct file_intermediate_data { // Return true if no rows or stripes to read. bool has_no_data() const { return rows_to_read == 0 || selected_stripes.empty(); } - // For each stripe, we perform a number of read for its streams. - // Those reads are identified by a chunk of consecutive read info, stored in data_read_info. + // For each stripe, we perform a number of reads for its streams. + // Those reads are identified by a chunk of consecutive read info stored in `data_read_info`. std::vector stripe_data_read_ranges; // Identify what data to read from source. @@ -174,19 +174,19 @@ struct chunk_read_data { size_type const output_row_granularity; // Memory limits for loading data and decoding are computed as - // `load/decode_limit_ratio * pass_read_limit`. + // `*_limit_ratio * pass_read_limit`. // This is to maintain the total memory usage to be **around** the given `pass_read_limit`. // Note that sum of these limits may not be `1.0`, and their values are set empirically. static double constexpr load_limit_ratio{0.25}; - static double constexpr decode_limit_ratio{0.6}; + static double constexpr decompress_and_decode_limit_ratio{0.6}; - // Chunks of stripes that can be load into memory such that their data size is within a size - // limit. + // Chunks of stripes that can be loaded into memory such that their data size is within the user + // specified limit. std::vector load_stripe_ranges; std::size_t curr_load_stripe_range{0}; bool more_stripes_to_load() const { return curr_load_stripe_range < load_stripe_ranges.size(); } - // Chunks of stripes such that their decompression size is within a size limit. + // Chunks of stripes such that their decompression size is within the user specified size limit. std::vector decode_stripe_ranges; std::size_t curr_decode_stripe_range{0}; bool more_stripes_to_decode() const From 1a7c3a959a5e3f44fa71132cde011231d91cfc21 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 24 Apr 2024 09:49:40 -0700 Subject: [PATCH 302/321] Change comments Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl.cu | 5 +- cpp/src/io/orc/reader_impl.hpp | 2 +- cpp/src/io/orc/reader_impl_chunking.cu | 63 ++++++++++++++++---------- 3 files changed, 43 insertions(+), 27 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 048794c3c05..ae077886015 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -24,6 +24,7 @@ namespace cudf::io::orc::detail { +// This is just the proxy to call all other data preprocessing functions. void reader_impl::prepare_data(read_mode mode) { // There are no columns in the table. 
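// For orientation, the per-pass budgets derived from `pass_read_limit` via the ratios in
// reader_impl_chunking.hpp reduce to this standalone sketch; `load_budget` and `decode_budget`
// are hypothetical helpers (assuming <algorithm> and <cstddef>), and the production code applies
// the same never-zero guard before calling `find_splits`.
std::size_t load_budget(std::size_t pass_read_limit)
{
  // load_limit_ratio{0.25}: at most a quarter of the pass budget goes to raw stream loading.
  auto const tmp = static_cast<std::size_t>(0.25 * pass_read_limit);
  return std::max(tmp, std::size_t{1});  // never hand a 0-byte limit to find_splits
}
std::size_t decode_budget(std::size_t pass_read_limit)
{
  // decompress_and_decode_limit_ratio{0.6}: the bulk is reserved for decompression and decoding.
  auto const tmp = static_cast<std::size_t>(0.6 * pass_read_limit);
  return std::max(tmp, std::size_t{1});
}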
@@ -119,9 +120,7 @@ table_metadata reader_impl::get_meta_with_user_data() std::transform(meta.ff.metadata.cbegin(), meta.ff.metadata.cend(), std::inserter(kv_map, kv_map.end()), - [](auto const& kv) { - return std::pair{kv.name, kv.value}; - }); + [](auto const& kv) { return std::pair{kv.name, kv.value}; }); return kv_map; }); out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index a07ef5d917d..cd30e0e2b91 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -118,7 +118,7 @@ class reader_impl { * In this step, the metadata of all stripes in the data sources is parsed, and information about * data streams of the selected columns in all stripes are generated. If the reader has a data * read limit, sizes of these streams are used to split the list of all stripes into multiple - * subsets, each of which will be read into memory in the `load_next_stripe_data()` step. These + * subsets, each of which will be loaded into memory in the `load_next_stripe_data()` step. These * subsets are computed such that memory usage will be kept to be around a fixed size limit. * * @param mode Value indicating if the data sources are read all at once or chunk by chunk diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 39026783135..4d4e45718d0 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -220,7 +220,7 @@ template std::vector find_splits(host_span find_splits( host_span sizes, std::size_t total_count, std::size_t size_limit); -range get_range(host_span input_ranges, range const& selected_ranges) +inline range get_range(host_span input_ranges, range const& selected_ranges) { // The first and last range. auto const& first_range = input_ranges[selected_ranges.begin]; @@ -230,6 +230,11 @@ range get_range(host_span input_ranges, range const& selected_range return {first_range.begin, last_range.end}; } +// In this step, the metadata of all stripes in the data sources is parsed, and information about +// data streams of the selected columns in all stripes are generated. If the reader has a data +// read limit, sizes of these streams are used to split the list of all stripes into multiple +// subsets, each of which will be loaded into memory in the `load_next_stripe_data()` step. These +// subsets are computed such that memory usage will be kept to be around a fixed size limit. void reader_impl::preprocess_file(read_mode mode) { if (_file_itm_data.global_preprocessed) { return; } @@ -293,9 +298,8 @@ void reader_impl::preprocess_file(read_mode mode) auto& col_meta = *_col_meta; // - // Collect columns' types. + // Collect columns' types: // - for (std::size_t level = 0; level < num_levels; ++level) { lvl_stripe_sizes[level].resize(num_total_stripes); lvl_stripe_stream_ranges[level].resize(num_total_stripes); @@ -355,9 +359,9 @@ void reader_impl::preprocess_file(read_mode mode) auto const load_all_stripes = mode == read_mode::READ_ALL || _chunk_read_data.pass_read_limit == 0; - // Accumulate data size for data streams in each stripe. - // This will be used for CHUNKED_READ mode only. - // If we are in READ_ALL mode, we do not need this since we just load all stripes. + // Accumulate data size for data streams in each stripe, used for chunking. + // This will be used only for CHUNKED_READ mode when there is a read limit. + // Otherwise, we do not need this since we just load all stripes. 
cudf::detail::hostdevice_vector total_stripe_sizes( load_all_stripes ? std::size_t{0} : num_total_stripes, _stream); @@ -459,6 +463,12 @@ void reader_impl::preprocess_file(read_mode mode) find_splits(total_stripe_sizes, num_total_stripes, load_limit); } +// If there is a data read limit, only a subset of stripes are read at a time such that +// their total data size does not exceed a fixed size limit. Then, the data is probed to +// estimate its uncompressed sizes, which are in turn used to split that stripe subset into +// smaller subsets, each of which to be decompressed and decoded in the next step +// `decompress_and_decode_stripes()`. This is to ensure that loading data from data sources +// together with decompression and decoding will be capped around the given data read limit. void reader_impl::load_next_stripe_data(read_mode mode) { if (_file_itm_data.has_no_data()) { return; } @@ -494,7 +504,7 @@ void reader_impl::load_next_stripe_data(read_mode mode) // If we load data directly from sources into device memory, the loads are also async. // Thus, we need to make sure to sync all them at the end. - std::vector, std::size_t>> read_tasks; + std::vector, std::size_t>> device_read_tasks; // Range of the read info (offset, length) to read for the current being loaded stripes. auto const [read_begin, read_end] = @@ -507,7 +517,7 @@ void reader_impl::load_next_stripe_data(read_mode mode) lvl_stripe_data[read_info.level][read_info.stripe_idx - stripe_start].data()); if (source_ptr->is_device_read_preferred(read_info.length)) { - read_tasks.push_back( + device_read_tasks.push_back( std::pair(source_ptr->device_read_async( read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), read_info.length)); @@ -524,11 +534,11 @@ void reader_impl::load_next_stripe_data(read_mode mode) } } - if (host_read_buffers.size() > 0) { + if (host_read_buffers.size() > 0) { // if there was host read _stream.synchronize(); - host_read_buffers.clear(); + host_read_buffers.clear(); // its data was copied to device memory after stream sync } - for (auto& task : read_tasks) { + for (auto& task : device_read_tasks) { // if there was device read CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); } @@ -541,25 +551,32 @@ void reader_impl::load_next_stripe_data(read_mode mode) return count; }(); - // Decoding range is reset to start from the first position in `decode_stripe_ranges`. + // Decoding range needs to be reset to start from the first position in `decode_stripe_ranges`. _chunk_read_data.curr_decode_stripe_range = 0; + // The cudf's column size limit. auto constexpr column_size_limit = static_cast(std::numeric_limits::max()); - // Decode all loaded stripes if there is no read limit, or if we are in READ_ALL mode. + // Decode all loaded stripes if there is no read limit, or if we are in READ_ALL mode, + // and the number of loading rows is less than the column size limit. // In theory, we should just decode 'enough' stripes for output one table chunk, instead of // decoding all stripes like this, for better load-balancing and reduce memory usage. // However, we do not have any good way to know how many stripes are 'enough'. if ((mode == read_mode::READ_ALL || _chunk_read_data.pass_read_limit == 0) && - // In addition to read limit, we also need to check if the the total number of - // rows in the loaded stripes exceeds column size limit. - // If that is the case, we cannot decode all stripes at once. 
+ // In addition to read limit, we also need to check if the total number of + // rows in the loaded stripes exceeds the column size limit. + // If that is the case, we cannot decode all stripes at once into a cudf table. num_loading_rows < column_size_limit) { _chunk_read_data.decode_stripe_ranges = {load_stripe_range}; return; } + // From here, we have reading mode that is either: + // - READ_ALL but the number of reading rows exceeds column size limit, or + // - CHUNKED_READ without read limit but the number of reading rows exceeds column size limit, or + // - CHUNKED_READ with a pass read limit. + // This is the post-processing step after we've done with splitting `load_stripe_range` into // `decode_stripe_ranges`. auto const add_range_offset = [stripe_start](std::vector& new_ranges) { @@ -573,7 +590,7 @@ void reader_impl::load_next_stripe_data(read_mode mode) }; // Optimized code path when we do not have any read limit but the number of rows in the - // loaded stripes exceeds cudf's column size limit. + // loaded stripes exceeds column size limit. // Note that the values `max_uncompressed_size` for each stripe are not computed here. // Instead, they will be computed on the fly during decoding to avoid the overhead of // storing and retrieving from memory. @@ -587,9 +604,9 @@ void reader_impl::load_next_stripe_data(read_mode mode) auto const stripe_info = stripe.stripe_info; rows += stripe_info->numberOfRows; - // Here we will split stripe ranges based only on stripes' number of rows, not data size. + // We will split stripe ranges based only on stripes' number of rows, not data size. // Thus, we override the cumulative `size_bytes` using the prefix sum of rows in stripes and - // will use the column size limit (`std::numeric_limits::max()`) as split limit. + // will use the column size limit as the split size limit. cumulative_stripe_rows[idx] = cumulative_size_and_row{idx + 1UL /*count*/, rows /*size_bytes*/, rows}; } @@ -601,16 +618,16 @@ void reader_impl::load_next_stripe_data(read_mode mode) } // - // Split range of loaded stripes into subranges that can be decoded separately without blowing up - // memory: + // Split range of loaded stripes into subranges that can be decoded separately such that the + // memory usage is maintained around the given limit: // - // For estimating the decompressed sizes of the loaded stripes. + // This is for estimating the decompressed sizes of the loaded stripes. cudf::detail::hostdevice_vector stripe_decomp_sizes(stripe_count, _stream); // Fill up the `cumulative_size_and_row` array with initial values. - // Note: `hostdevice_vector::begin()` mirrors `std::vector::data()` using incorrect name. + // Note: `hostdevice_vector::begin()` mirrors `std::vector::data()` using incorrect API name. 
   for (std::size_t idx = 0; idx < stripe_count; ++idx) {
     auto const& stripe     = _file_itm_data.selected_stripes[idx + stripe_start];
     auto const stripe_info = stripe.stripe_info;

From 6c3bb4ff0ae15e66d61c4f098fe596ff271a5144 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Wed, 24 Apr 2024 09:54:02 -0700
Subject: [PATCH 303/321] Inline a small function

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_chunking.cu  | 10 ----------
 cpp/src/io/orc/reader_impl_chunking.hpp | 26 ++++++++++++++++---------
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 4d4e45718d0..9f1b6b137cb 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -220,16 +220,6 @@ template std::vector<range> find_splits<cumulative_size>(host_span<cumulative_size const>
 template std::vector<range> find_splits<cumulative_size_and_row>(
   host_span<cumulative_size_and_row const> sizes, std::size_t total_count, std::size_t size_limit);
 
-inline range get_range(host_span<range const> input_ranges, range const& selected_ranges)
-{
-  // The first and last range.
-  auto const& first_range = input_ranges[selected_ranges.begin];
-  auto const& last_range  = input_ranges[selected_ranges.end - 1];
-
-  // The range of data covered from the first to the last range.
-  return {first_range.begin, last_range.end};
-}
-
 // In this step, the metadata of all stripes in the data sources is parsed, and information about
 // data streams of the selected columns in all stripes are generated. If the reader has a data
 // read limit, sizes of these streams are used to split the list of all stripes into multiple
diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index d7ddf9d50f9..6bc36002391 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -36,6 +36,23 @@ struct range {
   std::size_t end{0};
 };
 
+/**
+ * @brief Expand a range of ranges into a simple range of data.
+ *
+ * @param input_ranges The list of all data ranges
+ * @param selected_ranges A range of ranges from `input_ranges`
+ * @return The range of data spanned by the selected range of ranges
+ */
+inline range get_range(host_span<range const> input_ranges, range const& selected_ranges)
+{
+  // The first and last range.
+  auto const& first_range = input_ranges[selected_ranges.begin];
+  auto const& last_range  = input_ranges[selected_ranges.end - 1];
+
+  // The range of data covered from the first to the last range.
+  return {first_range.begin, last_range.end};
+}
+
 // Store information to identify where to read a chunk of data from source.
 // Each read corresponds to one or more consecutive streams combined.
 struct stream_data_read_info {
@@ -261,15 +278,6 @@ std::vector<range> find_splits(host_span<T const> cumulative_sizes,
                                std::size_t total_count,
                                std::size_t size_limit);
 
-/**
- * @brief Expand a range of ranges into a simple range of data.
- *
- * @param input_ranges The list of all data ranges
- * @param selected_ranges A range of ranges from `input_ranges`
- * @return The range of data span by the selected range of ranges
- */
-range get_range(host_span<range const> input_ranges, range const& selected_ranges);
-
 /**
  * @brief Function that populates descriptors for either individual streams or chunks of column
  * data, but not both.
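A side note on `find_splits`, whose declaration stays in the header above: it partitions a prefix-sum array of sizes into ranges that each stay under a soft size limit. The standalone sketch below models that greedy policy in simplified form; the real implementation in reader_impl_chunking.cu additionally binary-searches the cumulative sizes, tracks element counts, and merges an overly small trailing range, and every size literal here is invented purely for illustration.

// Simplified, standalone model of the find_splits() policy (not the actual cudf code).
#include <cstddef>
#include <cstdio>
#include <vector>

struct cumulative_size {
  std::size_t count{0};
  std::size_t size_bytes{0};  // prefix sum: total bytes up to and including this element
};
struct range {
  std::size_t begin{0};
  std::size_t end{0};
};

std::vector<range> find_splits_sketch(std::vector<cumulative_size> const& sizes,
                                      std::size_t size_limit)
{
  std::vector<range> splits;
  std::size_t cur_pos = 0, cur_cumulative = 0;
  while (cur_pos < sizes.size()) {
    // Greedily extend the current range while it stays within the size limit,
    // but always keep at least one element so that we make progress.
    auto split_pos = cur_pos;
    while (split_pos + 1 < sizes.size() &&
           sizes[split_pos + 1].size_bytes - cur_cumulative <= size_limit) {
      ++split_pos;
    }
    splits.push_back(range{cur_pos, split_pos + 1});
    cur_cumulative = sizes[split_pos].size_bytes;
    cur_pos        = split_pos + 1;
  }
  return splits;
}

int main()
{
  // Four stripes of 40, 60, 10, and 100 bytes, stored as prefix sums.
  std::vector<cumulative_size> const sizes{{1, 40}, {2, 100}, {3, 110}, {4, 210}};
  for (auto const [begin, end] : find_splits_sketch(sizes, /*size_limit=*/100)) {
    std::printf("[%zu, %zu)\n", begin, end);  // prints [0, 2), [2, 3), [3, 4)
  }
  return 0;
}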
From 673b03426670f92179152bc126278c8580a620aa Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Wed, 24 Apr 2024 16:03:10 -0700
Subject: [PATCH 304/321] Fix format

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index ae077886015..cf7b6fecbc6 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -120,7 +120,9 @@ table_metadata reader_impl::get_meta_with_user_data()
       std::transform(meta.ff.metadata.cbegin(),
                      meta.ff.metadata.cend(),
                      std::inserter(kv_map, kv_map.end()),
-                     [](auto const& kv) { return std::pair{kv.name, kv.value}; });
+                     [](auto const& kv) {
+                       return std::pair{kv.name, kv.value};
+                     });
       return kv_map;
     });
   out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(),

From 767e35fa5832b25b68c1dfbf5dbce530c7ccf489 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Thu, 25 Apr 2024 20:28:44 -0700
Subject: [PATCH 305/321] Allocate `null_count_prefix_sums` as just one buffer

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_decode.cu | 87 +++++++++++++++-------------
 1 file changed, 47 insertions(+), 40 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 5ab8516276d..e93ed357919 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -485,40 +485,38 @@ void decode_stream_data(int64_t num_dicts,
  * layer.
 */
 void scan_null_counts(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc> const& chunks,
-                      cudf::host_span<rmm::device_uvector<uint32_t>> prefix_sums,
+                      uint32_t* d_prefix_sums,
                       rmm::cuda_stream_view stream)
 {
   auto const num_stripes = chunks.size().first;
   if (num_stripes == 0) return;
 
   auto const num_columns = chunks.size().second;
-  std::vector<std::pair<size_type, cudf::device_span<uint32_t>>> prefix_sums_to_update;
+  std::vector<std::pair<size_type, uint32_t*>> prefix_sums_to_update;
   for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) {
     // Null counts sums are only needed for children of struct columns
     if (chunks[0][col_idx].type_kind == STRUCT) {
-      prefix_sums_to_update.emplace_back(col_idx, prefix_sums[col_idx]);
+      prefix_sums_to_update.emplace_back(col_idx, d_prefix_sums + num_stripes * col_idx);
     }
   }
   auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async(
     prefix_sums_to_update, stream, rmm::mr::get_current_device_resource());
 
-  thrust::for_each(rmm::exec_policy_nosync(stream),
-                   d_prefix_sums_to_update.begin(),
-                   d_prefix_sums_to_update.end(),
-                   [chunks = cudf::detail::device_2dspan<gpu::ColumnDesc const>{chunks}] __device__(
-                     auto const& idx_psums) {
-                     auto const col_idx = idx_psums.first;
-                     auto const psums   = idx_psums.second;
-
-                     thrust::transform(
-                       thrust::seq,
-                       thrust::make_counting_iterator(0),
-                       thrust::make_counting_iterator(0) + psums.size(),
-                       psums.begin(),
-                       [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; });
-
-                     thrust::inclusive_scan(thrust::seq, psums.begin(), psums.end(), psums.begin());
-                   });
+  thrust::for_each(
+    rmm::exec_policy_nosync(stream),
+    d_prefix_sums_to_update.begin(),
+    d_prefix_sums_to_update.end(),
+    [num_stripes, chunks = cudf::detail::device_2dspan<gpu::ColumnDesc const>{chunks}] __device__(
+      auto const& idx_psums) {
+      auto const col_idx = idx_psums.first;
+      auto const psums   = idx_psums.second;
+      thrust::transform(thrust::seq,
+                        thrust::make_counting_iterator(0ul),
+                        thrust::make_counting_iterator(num_stripes),
+                        psums,
+                        [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; });
+      thrust::inclusive_scan(thrust::seq, psums, psums + num_stripes, psums);
+    });
   // `prefix_sums_to_update` goes out of scope; the copy has to be done before we return
   stream.synchronize();
 }
 
@@ -763,7 +761,18 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
   std::vector<cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>> lvl_chunks(num_levels);
 
   // For computing null count.
-  std::vector<std::vector<rmm::device_uvector<uint32_t>>> null_count_prefix_sums(num_levels);
+  auto null_count_prefix_sums = [&] {
+    auto const num_total_cols = std::accumulate(
+      _selected_columns.levels.begin(),
+      _selected_columns.levels.end(),
+      std::size_t{0},
+      [](auto const& sum, auto const& cols_level) { return sum + cols_level.size(); });
+
+    return cudf::detail::make_zeroed_device_uvector_async<uint32_t>(
+      num_total_cols * stripe_count, _stream, rmm::mr::get_current_device_resource());
+  }();
+  std::size_t num_processed_lvl_columns      = 0;
+  std::size_t num_processed_prev_lvl_columns = 0;
 
   // For parsing decompression data.
   // We create one hostdevice_vector that is large enough to use for all levels,
@@ -796,9 +805,9 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
     auto& stripe_data = _file_itm_data.lvl_stripe_data[level];
     auto& chunks      = lvl_chunks[level];
 
-    auto const num_level_columns = columns_level.size();
+    auto const num_lvl_columns = columns_level.size();
     chunks =
-      cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>(stripe_count, num_level_columns, _stream);
+      cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>(stripe_count, num_lvl_columns, _stream);
     memset(chunks.base_host_ptr(), 0, chunks.size_bytes());
 
     const bool use_index =
@@ -809,17 +818,11 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
       // TODO: Consider nrows, gpu, and tune the threshold
       (rows_to_decode > _metadata.get_row_index_stride() &&
        !(_metadata.get_row_index_stride() & 7) && _metadata.get_row_index_stride() != 0 &&
-       num_level_columns * stripe_count < 8 * 128) &&
+       num_lvl_columns * stripe_count < 8 * 128) &&
       // Only use if first row is aligned to a stripe boundary
       // TODO: Fix logic to handle unaligned rows
       (rows_to_skip == 0);
 
-    null_count_prefix_sums[level].reserve(num_level_columns);
-    std::generate_n(std::back_inserter(null_count_prefix_sums[level]), num_level_columns, [&]() {
-      return cudf::detail::make_zeroed_device_uvector_async<uint32_t>(
-        stripe_count, _stream, rmm::mr::get_current_device_resource());
-    });
-
     // 0-based counters, used across all decoding stripes in this step.
     int64_t stripe_start_row{0};
     int64_t num_dict_entries{0};
@@ -863,24 +866,25 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
         : 0;
 
       // Update chunks to reference streams pointers.
-      for (std::size_t col_idx = 0; col_idx < num_level_columns; col_idx++) {
+      for (std::size_t col_idx = 0; col_idx < num_lvl_columns; col_idx++) {
        auto& chunk = chunks[stripe_local_idx][col_idx];
 
         // start row, number of rows in each stripe and total number of rows
         // may change in lower levels of nesting
         chunk.start_row =
           (level == 0)
             ? stripe_start_row
-            : col_meta.child_start_row[stripe_local_idx * num_level_columns + col_idx];
+            : col_meta.child_start_row[stripe_local_idx * num_lvl_columns + col_idx];
         chunk.num_rows =
           (level == 0)
             ? num_rows_in_stripe
-            : col_meta.num_child_rows_per_stripe[stripe_local_idx * num_level_columns + col_idx];
+            : col_meta.num_child_rows_per_stripe[stripe_local_idx * num_lvl_columns + col_idx];
         chunk.column_num_rows = (level == 0) ? rows_to_decode : col_meta.num_child_rows[col_idx];
         chunk.parent_validity_info =
           (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx];
         chunk.parent_null_count_prefix_sums =
-          (level == 0)
-            ? nullptr
-            : null_count_prefix_sums[level - 1][col_meta.parent_column_index[col_idx]].data();
+          (level == 0) ? nullptr
+                       : null_count_prefix_sums.data() + (num_processed_prev_lvl_columns +
+                                                          col_meta.parent_column_index[col_idx]) *
+                                                           stripe_count;
         chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind;
         chunk.type_kind =
           _metadata.per_file_metadata[stripe.source_idx].ff.types[columns_level[col_idx].id].kind;
@@ -921,10 +925,10 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
 
     // Process dataset chunks into output columns.
     auto row_groups =
-      cudf::detail::hostdevice_2dvector<gpu::RowGroup>(num_rowgroups, num_level_columns, _stream);
+      cudf::detail::hostdevice_2dvector<gpu::RowGroup>(num_rowgroups, num_lvl_columns, _stream);
     if (level > 0 and row_groups.size().first) {
       cudf::host_span<gpu::RowGroup> row_groups_span(row_groups.base_host_ptr(),
-                                                     num_rowgroups * num_level_columns);
+                                                     num_rowgroups * num_lvl_columns);
       auto& rw_grp_meta = col_meta.rwgrp_meta;
 
       // Update start row and num rows per row group
@@ -971,7 +975,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
       gpu::ParseRowGroupIndex(row_groups.base_device_ptr(),
                               nullptr,
                               chunks.base_device_ptr(),
-                              num_level_columns,
+                              num_lvl_columns,
                               stripe_count,
                               _metadata.get_row_index_stride(),
                               level == 0,
@@ -1011,7 +1015,8 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
 
     if (nested_cols.size()) {
       // Extract information to process nested child columns.
-      scan_null_counts(chunks, null_count_prefix_sums[level], _stream);
+      scan_null_counts(
+        chunks, null_count_prefix_sums.data() + num_processed_lvl_columns * stripe_count, _stream);
 
       row_groups.device_to_host_sync(_stream);
       aggregate_child_meta(
@@ -1029,6 +1034,8 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
 
       if (not buff_data.empty()) { generate_offsets_for_list(buff_data, _stream); }
     }
+    num_processed_prev_lvl_columns = num_processed_lvl_columns;
+    num_processed_lvl_columns += num_lvl_columns;
   }  // end loop level
 
   // Now generate a table from the decoded result.

From ca15afce96eb91ef92db1ca4459dacfea5cffad6 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Thu, 25 Apr 2024 20:35:56 -0700
Subject: [PATCH 306/321] Change initialization style

Signed-off-by: Nghia Truong
---
 cpp/benchmarks/io/orc/orc_reader_input.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp
index 1f73374ac20..19810a7d5f1 100644
--- a/cpp/benchmarks/io/orc/orc_reader_input.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp
@@ -30,7 +30,7 @@ namespace {
 // run on most GPUs, but large enough to allow highest throughput
 constexpr int64_t data_size = 512 << 20;
 constexpr cudf::size_type num_cols = 64;
-constexpr std::size_t Mbytes{1024 * 1024};
+constexpr std::size_t Mbytes = 1024 * 1024;

From f34b7b62fb4d7c284085dabef502835866e851cd Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Thu, 25 Apr 2024 20:36:02 -0700
Subject: [PATCH 307/321] Change comment

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_decode.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index e93ed357919..3a07007de96 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -757,7 +757,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
 
   // Column descriptors ('chunks').
   // Each 'chunk' of data here corresponds to an orc column, in a stripe, at a nested level.
   // Unfortunately we cannot create one hostdevice_vector to use for all levels because
-  // currently we do not have hostdevice_2dspan exists.
+  // currently we do not have a hostdevice_2dspan class.
   std::vector<cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>> lvl_chunks(num_levels);
 
   // For computing null count.

From 1d19ede1c80ea83c3ecea36a24cf2d68f2f90353 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Thu, 25 Apr 2024 20:36:13 -0700
Subject: [PATCH 308/321] Reserve vector

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/aggregate_orc_metadata.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp
index ac0dd10856c..0bc1c2209d4 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.cpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp
@@ -183,6 +183,7 @@ aggregate_orc_metadata::select_stripes(
     // user_defined_stripes to get from that source file
     for (size_t src_file_idx = 0; src_file_idx < user_specified_stripes.size(); ++src_file_idx) {
       std::vector stripe_infos;
+      stripe_infos.reserve(user_specified_stripes[src_file_idx].size());
 
       // Coalesce stripe info at the source file later since that makes downstream processing much
       // easier in impl::read
@@ -213,6 +214,7 @@ aggregate_orc_metadata::select_stripes(
          src_file_idx < per_file_metadata.size() && count < rows_to_skip + rows_to_read;
          ++src_file_idx) {
       std::vector stripe_infos;
+      stripe_infos.reserve(per_file_metadata[src_file_idx].ff.stripes.size());
 
       for (size_t stripe_idx = 0;
            stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size() &&
           count < rows_to_skip + rows_to_read;

From 3ed1e2e4f501f2e8b178b9cd702728def0eb4bb9 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Thu, 25 Apr 2024 20:54:29 -0700
Subject: [PATCH 309/321] Change variable order

Signed-off-by: Nghia Truong
---
 cpp/benchmarks/io/orc/orc_reader_input.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp
index 19810a7d5f1..b7c214a8374 100644
--- a/cpp/benchmarks/io/orc/orc_reader_input.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp
@@ -28,8 +28,8 @@ namespace {
 
 // Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
 // run on most GPUs, but large enough to allow highest throughput
-constexpr int64_t data_size = 512 << 20;
 constexpr cudf::size_type num_cols = 64;
+constexpr std::size_t data_size = 512 << 20;
 constexpr std::size_t Mbytes = 1024 * 1024;

From 633558665707efaa7f4fb702e036ba3aa84d773e Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 29 Apr 2024 13:05:51 -0700
Subject: [PATCH 310/321] Move data to output

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/aggregate_orc_metadata.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp
index 0bc1c2209d4..94a4d146b35 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.cpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp
@@ -267,7 +267,9 @@ aggregate_orc_metadata::select_stripes(
       if (stripe->indexLength == 0) { row_grp_idx_present = false; }
     }
 
-    output.insert(output.end(), mapping.stripe_info.begin(), mapping.stripe_info.end());
+    output.insert(output.end(),
+                  std::make_move_iterator(mapping.stripe_info.begin()),
+                  std::make_move_iterator(mapping.stripe_info.end()));
   }
 
   return {rows_to_skip, rows_to_read, std::move(output)};
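A brief note on the `std::make_move_iterator` change above: with plain iterators, `insert()` copy-constructs each element into `output`, while the wrapped iterators make it move-construct the elements instead, which matters once they own heap allocations. A minimal standalone illustration, with `std::string` standing in for the stripe-info elements (which are not shown here):

#include <iterator>
#include <string>
#include <vector>

int main()
{
  std::vector<std::string> src{"stripe0", "stripe1"};
  std::vector<std::string> out;

  // Each element is move-constructed into `out`; the strings in `src` are left
  // in a valid but unspecified (typically empty) state.
  out.insert(out.end(),
             std::make_move_iterator(src.begin()),
             std::make_move_iterator(src.end()));

  return out.size() == 2 ? 0 : 1;
}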
From 437c9c0c55c675d8de0dd3c6c22325efbf6281ef Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 29 Apr 2024 13:06:00 -0700
Subject: [PATCH 311/321] Rename function

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_chunking.cu  | 6 +++---
 cpp/src/io/orc/reader_impl_chunking.hpp | 7 ++++---
 cpp/src/io/orc/reader_impl_decode.cu    | 4 ++--
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 9f1b6b137cb..9925982f3e7 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -498,7 +498,7 @@ void reader_impl::load_next_stripe_data(read_mode mode)
 
   // Range of the read info (offset, length) to read for the stripes currently being loaded.
   auto const [read_begin, read_end] =
-    get_range(_file_itm_data.stripe_data_read_ranges, load_stripe_range);
+    merge_selected_ranges(_file_itm_data.stripe_data_read_ranges, load_stripe_range);
 
   for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) {
     auto const& read_info = _file_itm_data.data_read_info[read_idx];
@@ -637,7 +637,7 @@ void reader_impl::load_next_stripe_data(read_mode mode)
     // Find the maximum number of streams in all levels of the loaded stripes.
     for (std::size_t level = 0; level < num_levels; ++level) {
       auto const stream_range =
-        get_range(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range);
+        merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range);
       auto const num_streams = stream_range.end - stream_range.begin;
       max_num_streams        = std::max(max_num_streams, num_streams);
     }
@@ -654,7 +654,7 @@ void reader_impl::load_next_stripe_data(read_mode mode)
 
     // Range of all streams in the loaded stripes.
     auto const stream_range =
-      get_range(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range);
+      merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range);
     auto const num_streams = stream_range.end - stream_range.begin;
 
     if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index 6bc36002391..9bc1adcb294 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -43,7 +43,8 @@ struct range {
  * @param selected_ranges A range of ranges from `input_ranges`
  * @return The range of data spanned by the selected range of ranges
 */
-inline range get_range(host_span<range const> input_ranges, range const& selected_ranges)
+inline range merge_selected_ranges(host_span<range const> input_ranges,
+                                   range const& selected_ranges)
 {
   // The first and last range.
   auto const& first_range = input_ranges[selected_ranges.begin];
@@ -128,8 +129,8 @@ struct file_intermediate_data {
   int64_t rows_to_read;
   std::vector selected_stripes;
 
-  // Return true if no rows or stripes to read.
-  bool has_no_data() const { return rows_to_read == 0 || selected_stripes.empty(); }
+  // Check if there is data to read.
+  bool has_data() const { return rows_to_read > 0 && !selected_stripes.empty(); }
 
   // For each stripe, we perform a number of reads for its streams.
   // Those reads are identified by a chunk of consecutive read info stored in `data_read_info`.
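For clarity, a standalone usage sketch of the `merge_selected_ranges` helper defined above; `std::vector` stands in for `host_span` so the snippet compiles on its own, and the stream counts are invented:

#include <cstddef>
#include <vector>

struct range {
  std::size_t begin{0};
  std::size_t end{0};
};

// Same logic as the inlined helper: collapse a range of ranges into one flat range.
range merge_selected_ranges(std::vector<range> const& input_ranges, range const& selected)
{
  return {input_ranges[selected.begin].begin, input_ranges[selected.end - 1].end};
}

int main()
{
  // Stripe 0 owns stream reads [0, 3); stripe 1 owns stream reads [3, 5).
  std::vector<range> const stripe_stream_ranges{{0, 3}, {3, 5}};
  // Selecting the stripe range [0, 2) must therefore cover stream reads [0, 5).
  auto const r = merge_selected_ranges(stripe_stream_ranges, {0, 2});
  return (r.begin == 0 && r.end == 5) ? 0 : 1;
}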
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 3a07007de96..7eb2240fade 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -783,7 +783,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // Find the maximum number of streams in all levels of the decoding stripes. for (std::size_t level = 0; level < num_levels; ++level) { auto const stream_range = - get_range(_file_itm_data.lvl_stripe_stream_ranges[level], stripe_range); + merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], stripe_range); auto const num_streams = stream_range.end - stream_range.begin; max_num_streams = std::max(max_num_streams, num_streams); } @@ -794,7 +794,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) auto& col_meta = *_col_meta; for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { auto const& stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges[level]; - auto const stream_range = get_range(stripe_stream_ranges, stripe_range); + auto const stream_range = merge_selected_ranges(stripe_stream_ranges, stripe_range); auto const num_streams = stream_range.end - stream_range.begin; auto const& columns_level = _selected_columns.levels[level]; From cc174bbe629f271261cd602cea013bb7a5644577 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 29 Apr 2024 13:06:46 -0700 Subject: [PATCH 312/321] Change `has_no_data` into `has_data` Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 4 ++-- cpp/src/io/orc/reader_impl_decode.cu | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 9925982f3e7..7dffd334a68 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -237,7 +237,7 @@ void reader_impl::preprocess_file(read_mode mode) _file_itm_data.rows_to_skip, _file_itm_data.rows_to_read, _file_itm_data.selected_stripes) = _metadata.select_stripes( _config.selected_stripes, _config.skip_rows, _config.num_read_rows, _stream); - if (_file_itm_data.has_no_data()) { return; } + if (!_file_itm_data.has_data()) { return; } CUDF_EXPECTS( mode == read_mode::CHUNKED_READ || @@ -461,7 +461,7 @@ void reader_impl::preprocess_file(read_mode mode) // together with decompression and decoding will be capped around the given data read limit. 
 void reader_impl::load_next_stripe_data(read_mode mode)
 {
-  if (_file_itm_data.has_no_data()) { return; }
+  if (!_file_itm_data.has_data()) { return; }
 
   auto const load_stripe_range =
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 7eb2240fade..5387aced269 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -710,7 +710,7 @@ std::vector<range> find_table_splits(table_view const& input,
 
 void reader_impl::decompress_and_decode_stripes(read_mode mode)
 {
-  if (_file_itm_data.has_no_data()) { return; }
+  if (!_file_itm_data.has_data()) { return; }
 
   CUDF_EXPECTS(_chunk_read_data.curr_load_stripe_range > 0, "No stripe has been loaded.");

From 1d19ede1c80ea83c3ecea36a24cf2d68f2f90353 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 29 Apr 2024 13:11:47 -0700
Subject: [PATCH 313/321] Rename variable

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_chunking.cu  | 4 ++--
 cpp/src/io/orc/reader_impl_chunking.hpp | 7 +++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 7dffd334a68..fbb98548068 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -170,7 +170,7 @@ std::vector<range> find_splits(host_span<T const> cumulative_sizes,
       if constexpr (std::is_same_v<T, cumulative_size_and_row>) {
         // Similarly, while the returned range's total number of rows exceeds the column size
         // limit, move back one position.
-        while (split_pos > 0 && cumulative_sizes[split_pos].rows >
+        while (split_pos > 0 && cumulative_sizes[split_pos].num_rows >
                                   cur_cumulative_rows +
                                     static_cast<std::size_t>(std::numeric_limits<size_type>::max())) {
           split_pos--;
@@ -192,7 +192,7 @@ std::vector<range> find_splits(host_span<T const> cumulative_sizes,
     cur_cumulative_size = cumulative_sizes[split_pos].size_bytes;
     if constexpr (std::is_same_v<T, cumulative_size_and_row>) {
-      cur_cumulative_rows = cumulative_sizes[split_pos].rows;
+      cur_cumulative_rows = cumulative_sizes[split_pos].num_rows;
     }
   }
diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index 9bc1adcb294..e20200cfeea 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -34,6 +34,8 @@ namespace cudf::io::orc::detail {
 struct range {
   std::size_t begin{0};
   std::size_t end{0};
+
+  [[nodiscard]] auto size() const { return end - begin; }
 };
 
 /**
@@ -243,7 +245,7 @@ struct cumulative_size {
 struct cumulative_size_and_row {
   std::size_t count{0};
   std::size_t size_bytes{0};
-  std::size_t rows{0};
+  std::size_t num_rows{0};
 };
 
 /**
@@ -258,7 +260,8 @@ struct cumulative_size_plus {
   __device__ cumulative_size_and_row operator()(cumulative_size_and_row const& a,
                                                 cumulative_size_and_row const& b) const
   {
-    return cumulative_size_and_row{a.count + b.count, a.size_bytes + b.size_bytes, a.rows + b.rows};
+    return cumulative_size_and_row{
+      a.count + b.count, a.size_bytes + b.size_bytes, a.num_rows + b.num_rows};
   }
 };

From ad6923665b92f30f7ae5aefb826dc1e02f29234b Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 29 Apr 2024 13:25:43 -0700
Subject: [PATCH 314/321] Implement `size` for range

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_chunking.cu | 12 ++++--------
 cpp/src/io/orc/reader_impl_decode.cu   |  8 +++-----
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index fbb98548068..1e57139d0fb 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -202,8 +202,7 @@ std::vector<range> find_splits(host_span<T const> cumulative_sizes,
   if (splits.size() > 1) {
     double constexpr merge_threshold = 0.15;
     if (auto const last = splits.back(), second_last = splits[splits.size() - 2];
-        (last.end - last.begin) <=
-          static_cast<std::size_t>(merge_threshold * (second_last.end - second_last.begin))) {
+        last.size() <= static_cast<std::size_t>(merge_threshold * second_last.size())) {
       splits.pop_back();
       splits.back().end = last.end;
     }
@@ -466,8 +465,7 @@ void reader_impl::load_next_stripe_data(read_mode mode)
   auto const load_stripe_range =
     _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range++];
   auto const stripe_start = load_stripe_range.begin;
-  auto const stripe_end   = load_stripe_range.end;
-  auto const stripe_count = stripe_end - stripe_start;
+  auto const stripe_count = load_stripe_range.size();
 
   auto& lvl_stripe_data  = _file_itm_data.lvl_stripe_data;
   auto const num_levels  = _selected_columns.num_levels();
@@ -638,8 +636,7 @@ void reader_impl::load_next_stripe_data(read_mode mode)
     for (std::size_t level = 0; level < num_levels; ++level) {
       auto const stream_range =
         merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range);
-      auto const num_streams = stream_range.end - stream_range.begin;
-      max_num_streams        = std::max(max_num_streams, num_streams);
+      max_num_streams = std::max(max_num_streams, stream_range.size());
     }
   }
   return cudf::detail::hostdevice_vector<gpu::CompressedStreamInfo>(max_num_streams, _stream);
@@ -655,13 +652,12 @@ void reader_impl::load_next_stripe_data(read_mode mode)
 
   // Range of all streams in the loaded stripes.
   auto const stream_range =
     merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range);
-  auto const num_streams = stream_range.end - stream_range.begin;
 
   if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
     auto const& decompressor = *_metadata.per_file_metadata[0].decompressor;
 
     auto compinfo = cudf::detail::hostdevice_span<gpu::CompressedStreamInfo>(
-      hd_compinfo.begin(), hd_compinfo.d_begin(), num_streams);
+      hd_compinfo.begin(), hd_compinfo.d_begin(), stream_range.size());
     for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) {
       auto const& info = stream_info[stream_idx];
       auto const dst_base =
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 5387aced269..e1fd61cd582 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -718,7 +718,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
     _chunk_read_data.decode_stripe_ranges[_chunk_read_data.curr_decode_stripe_range++];
   auto const stripe_start = stripe_range.begin;
   auto const stripe_end   = stripe_range.end;
-  auto const stripe_count = stripe_range.end - stripe_range.begin;
+  auto const stripe_count = stripe_range.size();
 
   // The start index of loaded stripes. They are different from decoding stripes.
   auto const load_stripe_range =
@@ -784,8 +784,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
     for (std::size_t level = 0; level < num_levels; ++level) {
       auto const stream_range =
         merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], stripe_range);
-      auto const num_streams = stream_range.end - stream_range.begin;
-      max_num_streams        = std::max(max_num_streams, num_streams);
+      max_num_streams = std::max(max_num_streams, stream_range.size());
     }
   }
   return cudf::detail::hostdevice_vector<gpu::CompressedStreamInfo>{max_num_streams, _stream};
@@ -795,7 +794,6 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
   for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) {
     auto const& stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges[level];
     auto const stream_range          = merge_selected_ranges(stripe_stream_ranges, stripe_range);
-    auto const num_streams           = stream_range.end - stream_range.begin;
 
     auto const& columns_level = _selected_columns.levels[level];
     auto const& stream_info   = _file_itm_data.lvl_stream_info[level];
@@ -946,7 +944,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
     // Setup row group descriptors if using indexes.
     if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
       auto compinfo = cudf::detail::hostdevice_span<gpu::CompressedStreamInfo>(
-        hd_compinfo.begin(), hd_compinfo.d_begin(), num_streams);
+        hd_compinfo.begin(), hd_compinfo.d_begin(), stream_range.size());
       auto decomp_data = decompress_stripe_data(load_stripe_range,
                                                 stream_range,
                                                 stripe_count,

From 890abb4e970eab7dc7ec059292e7089db1eccd51 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 29 Apr 2024 13:27:19 -0700
Subject: [PATCH 315/321] Change docs

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_chunking.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index e20200cfeea..0b38a11e2f5 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -29,7 +29,7 @@ namespace cudf::io::orc::detail {
 
 /**
- * @brief Struct representing a range of data.
+ * @brief Struct representing a range of data offsets.
 */
 struct range {
   std::size_t begin{0};

From cb21a6df264291900aa50464837844ee7ed3cef7 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 29 Apr 2024 13:28:40 -0700
Subject: [PATCH 316/321] Rename `_config` into `_options`

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl.cu          | 22 +++++++++++-----------
 cpp/src/io/orc/reader_impl.hpp         |  2 +-
 cpp/src/io/orc/reader_impl_chunking.cu |  8 ++++----
 cpp/src/io/orc/reader_impl_decode.cu   |  4 ++--
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index cf7b6fecbc6..621d4c67691 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -66,9 +66,9 @@ table_with_metadata reader_impl::make_output_chunk()
         out_metadata.schema_info.emplace_back("");
         return create_empty_column(col_meta.id,
                                    _metadata,
-                                   _config.decimal128_columns,
-                                   _config.use_np_dtypes,
-                                   _config.timestamp_type,
+                                   _options.decimal128_columns,
+                                   _options.use_np_dtypes,
+                                   _options.timestamp_type,
                                    out_metadata.schema_info.back(),
                                    _stream);
       });
@@ -167,13 +167,13 @@ reader_impl::reader_impl(std::size_t chunk_read_limit,
                          rmm::device_async_resource_ref mr)
   : _stream(stream),
     _mr(mr),
-    _config{options.get_timestamp_type(),
-            options.is_enabled_use_index(),
-            options.is_enabled_use_np_dtypes(),
-            options.get_decimal128_columns(),
-            options.get_skip_rows(),
-            options.get_num_rows(),
-            options.get_stripes()},
+    _options{options.get_timestamp_type(),
+             options.is_enabled_use_index(),
+             options.is_enabled_use_np_dtypes(),
+             options.get_decimal128_columns(),
+             options.get_skip_rows(),
+             options.get_num_rows(),
+             options.get_stripes()},
     _col_meta{std::make_unique<reader_column_meta>()},
     _sources(std::move(sources)),
     _metadata{_sources, stream},
@@ -182,7 +182,7 @@ reader_impl::reader_impl(std::size_t chunk_read_limit,
 {
   // Selected columns at different levels of nesting are stored in different elements
   // of `selected_columns`; thus, size == 1 means no nested columns.
-  CUDF_EXPECTS(_config.skip_rows == 0 or _selected_columns.num_levels() == 1,
+  CUDF_EXPECTS(_options.skip_rows == 0 or _selected_columns.num_levels() == 1,
                "skip_rows is not supported by nested column");
 }

diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index cd30e0e2b91..4f433653e1b 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -178,7 +178,7 @@ class reader_impl {
     int64_t const skip_rows;
     std::optional<int64_t> num_read_rows;
     std::vector<std::vector<size_type>> const selected_stripes;
-  } const _config;
+  } const _options;
 
   // Intermediate data for reading.
   std::unique_ptr<reader_column_meta> const _col_meta;  // Keeps track of orc mapping and child details
diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 1e57139d0fb..98caa677aee 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -235,7 +235,7 @@ void reader_impl::preprocess_file(read_mode mode)
   std::tie(
     _file_itm_data.rows_to_skip, _file_itm_data.rows_to_read, _file_itm_data.selected_stripes) =
     _metadata.select_stripes(
-      _config.selected_stripes, _config.skip_rows, _config.num_read_rows, _stream);
+      _options.selected_stripes, _options.skip_rows, _options.num_read_rows, _stream);
   if (!_file_itm_data.has_data()) { return; }
 
   CUDF_EXPECTS(
@@ -305,9 +305,9 @@ void reader_impl::preprocess_file(read_mode mode)
 
     auto const col_type =
       to_cudf_type(_metadata.get_col_type(col.id).kind,
-                   _config.use_np_dtypes,
-                   _config.timestamp_type.id(),
-                   to_cudf_decimal_type(_config.decimal128_columns, _metadata, col.id));
+                   _options.use_np_dtypes,
+                   _options.timestamp_type.id(),
+                   to_cudf_decimal_type(_options.decimal128_columns, _metadata, col.id));
     CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
 
     auto& column_types = lvl_column_types[level];
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index e1fd61cd582..1e00779506c 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -809,7 +809,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
     memset(chunks.base_host_ptr(), 0, chunks.size_bytes());
 
     const bool use_index =
-      _config.use_index &&
+      _options.use_index &&
       // Do stripes have row group index
       _metadata.is_row_grp_idx_present() &&
       // Only use if we don't have much work with complete columns & stripes
@@ -905,7 +905,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
       chunk.num_rowgroups = stripe_num_rowgroups;
       if (chunk.type_kind == orc::TIMESTAMP) {
-        chunk.timestamp_type_id = _config.timestamp_type.id();
+        chunk.timestamp_type_id = _options.timestamp_type.id();
       }
       if (not is_stripe_data_empty) {
         for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {

From 4e64eb708e94efe156572f336ff8330de5c5b7a6 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 29 Apr 2024 13:33:23 -0700
Subject: [PATCH 317/321] Change comments

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index 4f433653e1b..94b294087b8 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -194,6 +194,9 @@ class reader_impl {
   std::vector<std::vector<column_buffer>> _out_buffers;
 
   // The default value used for subdividing the decoded table for final output.
+  // Larger values will reduce the computation time but will make the output table less granular.
+  // Smaller values (minimum is `1`) will increase the computation time but the output tables will
+  // have sizes closer to the given `chunk_read_limit`.
   static inline constexpr size_type DEFAULT_OUTPUT_ROW_GRANULARITY = 10'000;
 };

From d42ed14487442eadfb787a896def8b509ebd9af8 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Mon, 29 Apr 2024 13:34:41 -0700
Subject: [PATCH 318/321] Change `cumulative_size_and_row` to subclass `cumulative_size`

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_chunking.hpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index 96751876751..0ba61004e10 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -242,9 +242,7 @@ struct cumulative_size {
  * @brief Struct to accumulate counts, sizes, and number of rows of some types such as stripes or
  * rows in tables.
 */
-struct cumulative_size_and_row {
-  std::size_t count{0};
-  std::size_t size_bytes{0};
+struct cumulative_size_and_row : public cumulative_size {
   std::size_t num_rows{0};
 };
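A side note on the subclassing above: under C++17 aggregate rules, a struct with a public base class and no user-provided constructors remains an aggregate, so both the nested-brace form and the brace-elided flat form (which `cumulative_size_plus::operator()` continues to use after this change) stay valid. A minimal standalone check:

#include <cstddef>

struct cumulative_size {
  std::size_t count{0};
  std::size_t size_bytes{0};
};
struct cumulative_size_and_row : public cumulative_size {
  std::size_t num_rows{0};
};

int main()
{
  cumulative_size_and_row a{{1, 1024}, 100};  // explicit base-class initializer
  cumulative_size_and_row b{1, 1024, 100};    // brace elision, as in cumulative_size_plus
  return (a.size_bytes == b.size_bytes && a.num_rows == b.num_rows) ? 0 : 1;
}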
From a0ca33378ec39d90325154d21f5b359782abcc80 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Wed, 1 May 2024 21:34:14 -0700
Subject: [PATCH 319/321] Address some review comments

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_chunking.cu  | 18 ++++++++----------
 cpp/src/io/orc/reader_impl_chunking.hpp |  2 +-
 cpp/src/io/orc/reader_impl_decode.cu    | 21 ++++++++++-----------
 3 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 98caa677aee..eecef1b0334 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -142,7 +142,7 @@ std::vector<range> find_splits(host_span<T const> cumulative_sizes,
                                std::size_t total_count,
                                std::size_t size_limit)
 {
-  CUDF_EXPECTS(size_limit > 0, "Invalid size limit");
+  CUDF_EXPECTS(size_limit > 0, "Invalid size limit", std::invalid_argument);
 
   std::vector<range> splits;
   std::size_t cur_count{0};
@@ -445,7 +445,7 @@ void reader_impl::preprocess_file(read_mode mode)
     auto const tmp = static_cast<std::size_t>(_chunk_read_data.pass_read_limit *
                                               chunk_read_data::load_limit_ratio);
     // Make sure not to pass 0 byte limit (due to round-off) to `find_splits`.
-    return tmp > 0UL ? tmp : 1UL;
+    return std::max(tmp, 1UL);
   }();
 
   _chunk_read_data.load_stripe_ranges =
@@ -531,13 +531,11 @@ void reader_impl::load_next_stripe_data(read_mode mode)
   }
 
   // Compute the number of rows in the stripes being loaded.
-  auto const num_loading_rows = [&] {
-    std::size_t count{0};
-    for (std::size_t idx = 0; idx < stripe_count; ++idx) {
-      count += _file_itm_data.selected_stripes[idx + stripe_start].stripe_info->numberOfRows;
-    }
-    return count;
-  }();
+  auto const num_loading_rows = std::accumulate(
+    _file_itm_data.selected_stripes.begin() + stripe_start,
+    _file_itm_data.selected_stripes.begin() + stripe_start + stripe_count,
+    std::size_t{0},
+    [](std::size_t count, const auto& stripe) { return count + stripe.stripe_info->numberOfRows; });
@@ -709,7 +707,7 @@ void reader_impl::load_next_stripe_data(read_mode mode)
    auto const tmp = static_cast<std::size_t>(_chunk_read_data.pass_read_limit *
                                 chunk_read_data::decompress_and_decode_limit_ratio);
     // Make sure not to pass 0 byte limit to `find_splits`.
-    return tmp > 0UL ? tmp : 1UL;
+    return std::max(tmp, 1UL);
   }();
 
   _chunk_read_data.decode_stripe_ranges =
diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index 96751876751..0ba61004e10 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -272,7 +272,7 @@ struct cumulative_size_plus {
 *
  * @param cumulative_sizes The input cumulative sizes to compute split ranges
  * @param total_count The total count in the entire input
- * @param size_limit The given soft limit to compute splits
+ * @param size_limit The given soft limit to compute splits; must be positive
 * @return A vector of ranges as splits of the input
 */
 template
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 1e00779506c..ec936b85761 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -88,7 +88,7 @@ rmm::device_buffer decompress_stripe_data(
   rmm::cuda_stream_view stream)
 {
   // Whether we have the compression info precomputed.
-  auto const compinfo_ready = compinfo_map.size() > 0;
+  auto const compinfo_ready = not compinfo_map.empty();
 
   // Count the exact number of compressed blocks
   std::size_t num_compressed_blocks = 0;
@@ -667,8 +667,11 @@ std::vector<range> find_table_splits(table_view const& input,
                                      std::size_t size_limit,
                                      rmm::cuda_stream_view stream)
 {
-  CUDF_EXPECTS(size_limit > 0, "Invalid size limit");
-  CUDF_EXPECTS(segment_length > 0, "Invalid segment_length");
+  if (size_limit == 0) {
+    return std::vector<range>{range{0, static_cast<std::size_t>(input.num_rows())}};
+  }
+
+  CUDF_EXPECTS(segment_length > 0, "Invalid segment_length", std::invalid_argument);
 
   // `segmented_row_bit_count` requires that `segment_length` is not larger than number of rows.
   segment_length = std::min(segment_length, input.num_rows());
@@ -1070,14 +1073,10 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
 
   // Split the decoded table into ranges that can be output as chunks whose sizes stay within the
   // given output size limit.
-  _chunk_read_data.output_table_ranges =
-    _chunk_read_data.chunk_read_limit == 0
-      ? std::vector<range>{range{
-          0, static_cast<std::size_t>(_chunk_read_data.decoded_table->num_rows())}}
-      : find_table_splits(_chunk_read_data.decoded_table->view(),
-                          _chunk_read_data.output_row_granularity,
-                          _chunk_read_data.chunk_read_limit,
-                          _stream);
+  _chunk_read_data.output_table_ranges = find_table_splits(_chunk_read_data.decoded_table->view(),
+                                                           _chunk_read_data.output_row_granularity,
+                                                           _chunk_read_data.chunk_read_limit,
+                                                           _stream);
 }
 
 }  // namespace cudf::io::orc::detail

From cf18e4c0481e9831fe31c5649dd5753c72fc42a1 Mon Sep 17 00:00:00 2001
From: Nghia Truong
Date: Thu, 2 May 2024 13:04:32 -0700
Subject: [PATCH 320/321] Remove handling for `READ_ALL` when number of rows
 exceeds 2B rows

Signed-off-by: Nghia Truong
---
 cpp/src/io/orc/reader_impl_chunking.cu | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index eecef1b0334..1a682ccbeef 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -241,7 +241,8 @@ void reader_impl::preprocess_file(read_mode mode)
   CUDF_EXPECTS(
     mode == read_mode::CHUNKED_READ ||
       _file_itm_data.rows_to_read <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
-    "READ_ALL mode does not support reading number of rows more than cudf's column size limit.",
+    "READ_ALL mode does not support reading number of rows more than cudf's column size limit. "
" + "For reading large number of rows, please use chunked_reader.", std::overflow_error); auto const& selected_stripes = _file_itm_data.selected_stripes; @@ -553,15 +554,19 @@ void reader_impl::load_next_stripe_data(read_mode mode) // In addition to read limit, we also need to check if the total number of // rows in the loaded stripes exceeds the column size limit. // If that is the case, we cannot decode all stripes at once into a cudf table. - num_loading_rows < column_size_limit) { + num_loading_rows <= column_size_limit) { _chunk_read_data.decode_stripe_ranges = {load_stripe_range}; return; } // From here, we have reading mode that is either: - // - READ_ALL but the number of reading rows exceeds column size limit, or // - CHUNKED_READ without read limit but the number of reading rows exceeds column size limit, or // - CHUNKED_READ with a pass read limit. + // READ_ALL mode with number of rows more than cudf's column size limit should be handled early in + // `preprocess_file`. We just check again to make sure such situations never happen here. + CUDF_EXPECTS( + mode != read_mode::READ_ALL, + "READ_ALL mode does not support reading number of rows more than cudf's column size limit."); // This is the post-processing step after we've done with splitting `load_stripe_range` into // `decode_stripe_ranges`. @@ -580,8 +585,7 @@ void reader_impl::load_next_stripe_data(read_mode mode) // Note that the values `max_uncompressed_size` for each stripe are not computed here. // Instead, they will be computed on the fly during decoding to avoid the overhead of // storing and retrieving from memory. - if ((mode == read_mode::READ_ALL || _chunk_read_data.pass_read_limit == 0) && - num_loading_rows >= column_size_limit) { + if (_chunk_read_data.pass_read_limit == 0 && num_loading_rows > column_size_limit) { std::vector cumulative_stripe_rows(stripe_count); std::size_t rows{0}; From 42601b2bcb281bbe7ed6f7d81fbc9628604ec9c9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 2 May 2024 14:50:33 -0700 Subject: [PATCH 321/321] Rename parameter Signed-off-by: Nghia Truong --- cpp/src/io/orc/reader_impl_chunking.cu | 8 ++++---- cpp/src/io/orc/reader_impl_chunking.hpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 1a682ccbeef..5034aa14a95 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -37,7 +37,7 @@ namespace cudf::io::orc::detail { std::size_t gather_stream_info_and_column_desc( - std::size_t stripe_order, + std::size_t stripe_id, std::size_t level, orc::StripeInformation const* stripeinfo, orc::StripeFooter const* stripefooter, @@ -93,7 +93,7 @@ std::size_t gather_stream_info_and_column_desc( if (child_idx >= 0) { col = child_idx; if (chunks) { - auto& chunk = (*chunks)[stripe_order][col]; + auto& chunk = (*chunks)[stripe_id][col]; chunk.strm_id[gpu::CI_PRESENT] = *local_stream_order; chunk.strm_len[gpu::CI_PRESENT] = stream.length; } @@ -105,7 +105,7 @@ std::size_t gather_stream_info_and_column_desc( if (src_offset >= stripeinfo->indexLength || use_index) { auto const index_type = get_stream_index_type(stream.kind); if (index_type < gpu::CI_NUM_STREAMS) { - auto& chunk = (*chunks)[stripe_order][col]; + auto& chunk = (*chunks)[stripe_id][col]; chunk.strm_id[index_type] = *local_stream_order; chunk.strm_len[index_type] = stream.length; // NOTE: skip_count field is temporarily used to track the presence of index streams @@ -126,7 +126,7 @@ 
             orc_stream_info{stripeinfo->offset + src_offset,
                             dst_offset,
                             stream.length,
-                            stream_source_info{stripe_order, level, column_id, stream.kind}});
+                            stream_source_info{stripe_id, level, column_id, stream.kind}});
       }
 
       dst_offset += stream.length;
diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index 0ba61004e10..4ef68ee8d86 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -290,7 +290,7 @@ std::vector<range> find_splits(host_span<T const> cumulative_sizes,
  * steps share most of the execution path thus this function takes mutually exclusive parameters
 * `stream_info` or `chunks` depending on each use case.
 *
- * @param stripe_order The index of the current stripe, can be global index or local decoding index
+ * @param stripe_id The index of the current stripe, can be global index or local decoding index
 * @param level The current processing nested level
 * @param stripeinfo The pointer to current stripe's information
 * @param stripefooter The pointer to current stripe's footer
@@ -305,7 +305,7 @@ std::vector<range> find_splits(host_span<T const> cumulative_sizes,
 std::size_t gather_stream_info_and_column_desc(
-  std::size_t stripe_order,
+  std::size_t stripe_id,
   std::size_t level,
   orc::StripeInformation const* stripeinfo,
   orc::StripeFooter const* stripefooter,