From 63a2d880cb60fa0400304e64704b3c456298beaf Mon Sep 17 00:00:00 2001 From: db Date: Sun, 23 Apr 2023 17:49:33 -0500 Subject: [PATCH 001/114] Rework of level decoding to be considerably more parallel. Previously, it was only 1 warp wide. Now it is block-wide. Only integrated into the gpuComputePageSizes() kernel. gpuDecodePages() will be a followup PR. --- .../cudf/detail/utilities/integer_utils.hpp | 4 +- cpp/src/io/parquet/page_data.cu | 413 +++++++++++------- cpp/src/io/parquet/page_hdr.cu | 8 +- cpp/src/io/parquet/parquet_gpu.hpp | 8 + cpp/src/io/parquet/reader_impl.hpp | 8 + cpp/src/io/parquet/reader_impl_preprocess.cu | 50 ++- cpp/src/io/parquet/rle_stream.cuh | 352 +++++++++++++++ cpp/src/io/utilities/block_utils.cuh | 7 + 8 files changed, 665 insertions(+), 185 deletions(-) create mode 100644 cpp/src/io/parquet/rle_stream.cuh diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 40faae7e9f4..3e4979c0c38 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -44,7 +44,7 @@ namespace util { * `modulus` is positive. The safety is in regard to rollover. */ template -S round_up_safe(S number_to_round, S modulus) +constexpr S round_up_safe(S number_to_round, S modulus) { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } @@ -67,7 +67,7 @@ S round_up_safe(S number_to_round, S modulus) * `modulus` is positive and does not check for overflow. */ template -S round_down_safe(S number_to_round, S modulus) noexcept +constexpr S round_down_safe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; auto rounded_down = number_to_round - remainder; diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 8cb01d5a34b..c8995ec2625 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -15,6 +15,7 @@ */ #include "parquet_gpu.hpp" +#include "rle_stream.cuh" #include #include @@ -46,9 +47,10 @@ namespace gpu { namespace { -constexpr int block_size = 128; -constexpr int non_zero_buffer_size = block_size * 2; - +constexpr int preprocess_block_size = num_rle_stream_decode_threads; // 512 +constexpr int decode_block_size = 128; +constexpr int non_zero_buffer_size = decode_block_size * 2; +constexpr int rolling_lvl_index(int index, int size) { return index % size; } constexpr int rolling_index(int index) { return index & (non_zero_buffer_size - 1); } struct page_state_s { @@ -82,11 +84,14 @@ struct page_state_s { int32_t input_value_count; // how many values of the input we've processed int32_t input_row_count; // how many rows of the input we've processed int32_t input_leaf_count; // how many leaf values of the input we've processed - uint32_t rep[non_zero_buffer_size]; // circular buffer of repetition level values - uint32_t def[non_zero_buffer_size]; // circular buffer of definition level values + uint32_t* rep; // circular buffer of repetition level values + uint32_t* def; // circular buffer of definition level values + int level_decode_buf_size; // size of rep/ref const uint8_t* lvl_start[NUM_LEVEL_TYPES]; // [def,rep] - int32_t lvl_count[NUM_LEVEL_TYPES]; // how many of each of the streams we've decoded - int32_t row_index_lower_bound; // lower bound of row indices we should process + const uint8_t* abs_lvl_start[NUM_LEVEL_TYPES]; // [def,rep] + const uint8_t* abs_lvl_end[NUM_LEVEL_TYPES]; // [def,rep] + int32_t 
lvl_count[NUM_LEVEL_TYPES]; // how many of each of the streams we've decoded + int32_t row_index_lower_bound; // lower bound of row indices we should process // a shared-memory cache of frequently used data when decoding. The source of this data is // normally stored in global memory which can yield poor performance. So, when possible @@ -144,32 +149,6 @@ inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row return page_begin >= begin && page_end <= end; } -/** - * @brief Read a 32-bit varint integer - * - * @param[in,out] cur The current data position, updated after the read - * @param[in] end The end data position - * - * @return The 32-bit value read - */ -inline __device__ uint32_t get_vlq32(const uint8_t*& cur, const uint8_t* end) -{ - uint32_t v = *cur++; - if (v >= 0x80 && cur < end) { - v = (v & 0x7f) | ((*cur++) << 7); - if (v >= (0x80 << 7) && cur < end) { - v = (v & ((0x7f << 7) | 0x7f)) | ((*cur++) << 14); - if (v >= (0x80 << 14) && cur < end) { - v = (v & ((0x7f << 14) | (0x7f << 7) | 0x7f)) | ((*cur++) << 21); - if (v >= (0x80 << 21) && cur < end) { - v = (v & ((0x7f << 21) | (0x7f << 14) | (0x7f << 7) | 0x7f)) | ((*cur++) << 28); - } - } - } - } - return v; -} - /** * @brief Parse the beginning of the level section (definition or repetition), * initializes the initial RLE run & value, and returns the section length @@ -184,18 +163,22 @@ inline __device__ uint32_t get_vlq32(const uint8_t*& cur, const uint8_t* end) __device__ uint32_t InitLevelSection(page_state_s* s, const uint8_t* cur, const uint8_t* end, - level_type lvl) + level_type lvl, + bool is_decode_step, + rle_stream* decoders) { int32_t len; int level_bits = s->col.level_bits[lvl]; Encoding encoding = lvl == level_type::DEFINITION ? s->page.definition_level_encoding : s->page.repetition_level_encoding; + auto start = cur; if (level_bits == 0) { len = 0; s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value s->initial_rle_value[lvl] = 0; s->lvl_start[lvl] = cur; + s->abs_lvl_start[lvl] = cur; } else if (encoding == Encoding::RLE) { // V2 only uses RLE encoding, so only perform check here if (s->page.def_lvl_bytes || s->page.rep_lvl_bytes) { @@ -207,6 +190,7 @@ __device__ uint32_t InitLevelSection(page_state_s* s, len = 0; s->error = 2; } + s->abs_lvl_start[lvl] = cur; if (!s->error) { uint32_t run = get_vlq32(cur, end); s->initial_rle_run[lvl] = run; @@ -220,17 +204,22 @@ __device__ uint32_t InitLevelSection(page_state_s* s, s->initial_rle_value[lvl] = v; } s->lvl_start[lvl] = cur; - if (cur > end) { s->error = 2; } } + + if (cur > end) { s->error = 2; } } else if (encoding == Encoding::BIT_PACKED) { len = (s->page.num_input_values * level_bits + 7) >> 3; s->initial_rle_run[lvl] = ((s->page.num_input_values + 7) >> 3) * 2 + 1; // literal run s->initial_rle_value[lvl] = 0; s->lvl_start[lvl] = cur; + s->abs_lvl_start[lvl] = cur; } else { s->error = 3; len = 0; } + + s->abs_lvl_end[lvl] = start + len; + return static_cast(len); } @@ -980,15 +969,21 @@ static __device__ void gpuOutputGeneric( * @param[in] chunks The global list of chunks * @param[in] min_row Crop all rows below min_row * @param[in] num_rows Maximum number of rows to read - * @param[in] is_decode_step If we are setting up for the decode step (instead of the preprocess - * step) + * @param[in] is_decode_step If we are setting up for the decode step (instead of the preprocess) + * @param[in] level_decode_buf Buffer space to use for repetition and definition levels + * @param[in] level_decode_buf_size Size of 
the level decode buffers + * @param[in] decoders rle_stream decoders which will be used for decoding levels. Optional. + * Currently only used by gpuComputePageSizes step) */ static __device__ bool setupLocalPageInfo(page_state_s* const s, PageInfo const* p, device_span chunks, size_t min_row, size_t num_rows, - bool is_decode_step) + bool is_decode_step, + uint32_t* level_decode_buf[level_type::NUM_LEVEL_TYPES], + int level_decode_buf_size, + rle_stream* decoders = nullptr) { int t = threadIdx.x; int chunk_idx; @@ -1005,7 +1000,7 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, chunk_idx = s->page.chunk_idx; if (!t) { s->col = chunks[chunk_idx]; } - // if we can use the decode cache, set it up now + // if we can use the nesting decode cache, set it up now auto const can_use_decode_cache = s->page.nesting_info_size <= max_cacheable_nesting_decode_info; if (can_use_decode_cache) { int depth = 0; @@ -1028,6 +1023,13 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, if (!t) { s->nesting_info = can_use_decode_cache ? s->nesting_decode_cache : s->page.nesting_decode; } + + if (!t) { + s->rep = level_decode_buf[level_type::REPETITION]; + s->def = level_decode_buf[level_type::DEFINITION]; + s->level_decode_buf_size = level_decode_buf_size; + } + __syncthreads(); // zero counts @@ -1202,9 +1204,9 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, s->first_output_value = 0; // Find the compressed size of repetition levels - cur += InitLevelSection(s, cur, end, level_type::REPETITION); + cur += InitLevelSection(s, cur, end, level_type::REPETITION, is_decode_step, decoders); // Find the compressed size of definition levels - cur += InitLevelSection(s, cur, end, level_type::DEFINITION); + cur += InitLevelSection(s, cur, end, level_type::DEFINITION, is_decode_step, decoders); s->dict_bits = 0; s->dict_base = nullptr; @@ -1382,11 +1384,8 @@ inline __device__ void get_nesting_bounds(int& start_depth, int32_t target_input_value_count, int t) { - start_depth = -1; - end_depth = -1; - d = -1; if (input_value_count + t < target_input_value_count) { - int index = rolling_index(input_value_count + t); + int index = rolling_lvl_index(input_value_count + t, s->level_decode_buf_size); d = s->def[index]; // if we have repetition (there are list columns involved) we have to // bound what nesting levels we apply values to @@ -1401,6 +1400,10 @@ inline __device__ void get_nesting_bounds(int& start_depth, start_depth = 0; end_depth = s->col.max_nesting_depth - 1; } + } else { + start_depth = -1; + end_depth = -1; + d = -1; } } @@ -1617,116 +1620,149 @@ __device__ void gpuDecodeLevels(page_state_s* s, } /** - * @brief Process a batch of incoming repetition/definition level values to generate - * per-nesting level output column size for this page. + * @brief Returns the total size in bytes of string char data in the page. + * + * This function expects the dictionary position to be at 0 and will traverse + * the entire thing. * - * Each page represents one piece of the overall output column. The total output (cudf) - * column sizes are the sum of the values in each individual page. + * Operates on a single warp only. 
Expects t < 32 * - * @param[in] s The local page info - * @param[in] target_input_value_count The # of repetition/definition levels to process up to - * @param[in] t Thread index - * @param[in] bounds_set Whether or not s->row_index_lower_bound, s->first_row and s->num_rows - * have been computed for this page (they will only be set in the second/trim pass). + * @param s The local page info + * @param t Thread index + */ +__device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) +{ + size_type target_pos = s->num_input_values; + size_type str_len = 0; + if (s->dict_base) { + auto const [new_target_pos, len] = gpuDecodeDictionaryIndices(s, nullptr, target_pos, t); + target_pos = new_target_pos; + str_len = len; + } else if ((s->col.data_type & 7) == BYTE_ARRAY) { + str_len = gpuInitStringDescriptors(s, nullptr, target_pos, t); + } + if (!t) { *(volatile int32_t*)&s->dict_pos = target_pos; } + return str_len; +} + +/** + * @brief Update output column sizes for every nesting level based on a batch + * of incoming decoded definition and repetition level values. + * + * If bounds_set is true, computes skipped_values and skipped_leaf_values for the + * page to indicate where we need to skip to based on min/max row. + * + * Operates at the block level. + * + * @param s The local page info + * @param target_value_count The target value count to process up to + * @param t Thread index + * @param bounds_set A boolean indicating whether or not min/max row bounds have been set */ static __device__ void gpuUpdatePageSizes(page_state_s* s, - int32_t target_input_value_count, + int target_value_count, int t, bool bounds_set) { // max nesting depth of the column int const max_depth = s->col.max_nesting_depth; + + constexpr int num_warps = preprocess_block_size / 32; + constexpr int max_batch_size = num_warps * 32; + + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + // how many input level values we've processed in the page so far - int input_value_count = s->input_value_count; - // how many leaf values we've processed in the page so far - int input_leaf_count = s->input_leaf_count; + int value_count = s->input_value_count; // how many rows we've processed in the page so far - int input_row_count = s->input_row_count; + int row_count = s->input_row_count; + // how many leaf values we've processed in the page so far + int leaf_count = s->input_leaf_count; + // whether or not we need to continue checking for the first row + bool skipped_values_set = s->page.skipped_values >= 0; - while (input_value_count < target_input_value_count) { - int start_depth, end_depth, d; - get_nesting_bounds( - start_depth, end_depth, d, s, input_value_count, target_input_value_count, t); + while (value_count < target_value_count) { + int const batch_size = min(max_batch_size, target_value_count - value_count); - // count rows and leaf values - int const is_new_row = start_depth == 0 ? 1 : 0; - uint32_t const warp_row_count_mask = ballot(is_new_row); - int const is_new_leaf = (d >= s->nesting_info[max_depth - 1].max_def_level) ? 1 : 0; - uint32_t const warp_leaf_count_mask = ballot(is_new_leaf); + // start/end depth + int start_depth, end_depth, d; + get_nesting_bounds(start_depth, end_depth, d, s, value_count, value_count + batch_size, t); - // is this thread within row bounds? 
on the first pass we don't know the bounds, so we will be - // computing the full size of the column. on the second pass, we will know our actual row - // bounds, so the computation will cap sizes properly. + // is this thread within row bounds? in the non skip_rows/num_rows case this will always + // be true. int in_row_bounds = 1; + + // if we are in the skip_rows/num_rows case, we need to check against these limits if (bounds_set) { - // absolute row index - int32_t thread_row_index = - input_row_count + ((__popc(warp_row_count_mask & ((1 << t) - 1)) + is_new_row) - 1); - in_row_bounds = thread_row_index >= s->row_index_lower_bound && - thread_row_index < (s->first_row + s->num_rows) - ? 1 - : 0; - - uint32_t const row_bounds_mask = ballot(in_row_bounds); - int const first_thread_in_range = __ffs(row_bounds_mask) - 1; - - // if we've found the beginning of the first row, mark down the position - // in the def/repetition buffer (skipped_values) and the data buffer (skipped_leaf_values) - if (!t && first_thread_in_range >= 0 && s->page.skipped_values < 0) { - // how many values we've skipped in the rep/def levels - s->page.skipped_values = input_value_count + first_thread_in_range; - // how many values we've skipped in the actual data stream - s->page.skipped_leaf_values = - input_leaf_count + __popc(warp_leaf_count_mask & ((1 << first_thread_in_range) - 1)); + // get absolute thread row index + int const is_new_row = start_depth == 0; + int thread_row_count, block_row_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(is_new_row, thread_row_count, block_row_count); + __syncthreads(); + + // get absolute thread leaf index + int const is_new_leaf = (d >= s->nesting_info[max_depth - 1].max_def_level); + int thread_leaf_count, block_leaf_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(is_new_leaf, thread_leaf_count, block_leaf_count); + __syncthreads(); + + // if this thread is in row bounds + int const row_index = (thread_row_count + row_count) - 1; + in_row_bounds = + (row_index >= s->row_index_lower_bound) && (row_index < (s->first_row + s->num_rows)); + + // if we have not set skipped values yet, see if we found the first in-bounds row + if (!skipped_values_set) { + int local_count, global_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(in_row_bounds, local_count, global_count); + __syncthreads(); + + // we found it + if (global_count > 0) { + // this is the thread that represents the first row. + if (local_count == 1) { + s->page.skipped_values = value_count + t; + s->page.skipped_leaf_values = + leaf_count + (is_new_leaf ? thread_leaf_count - 1 : thread_leaf_count); + } + skipped_values_set = true; + } } + + row_count += block_row_count; + leaf_count += block_leaf_count; } // increment value counts across all nesting depths for (int s_idx = 0; s_idx < max_depth; s_idx++) { - PageNestingInfo* pni = &s->page.nesting[s_idx]; - - // if we are within the range of nesting levels we should be adding value indices for - int const in_nesting_bounds = - (s_idx >= start_depth && s_idx <= end_depth && in_row_bounds) ? 
1 : 0; - uint32_t const count_mask = ballot(in_nesting_bounds); - if (!t) { pni->batch_size += __popc(count_mask); } + int const in_nesting_bounds = (s_idx >= start_depth && s_idx <= end_depth && in_row_bounds); + int const count = block_reduce(temp_storage.reduce_storage).Sum(in_nesting_bounds); + __syncthreads(); + if (!t) { + PageNestingInfo* pni = &s->page.nesting[s_idx]; + pni->batch_size += count; + } } - input_value_count += min(32, (target_input_value_count - input_value_count)); - input_row_count += __popc(warp_row_count_mask); - input_leaf_count += __popc(warp_leaf_count_mask); + value_count += batch_size; } - // update final page value count + // update final outputs if (!t) { - s->input_value_count = target_input_value_count; - s->input_leaf_count = input_leaf_count; - s->input_row_count = input_row_count; - } -} + s->input_value_count = value_count; -/** - * @brief Returns the total size in bytes of string char data in the page. - * - * This function expects the dictionary position to be at 0 and will traverse - * the entire thing. - * - * @param s The local page info - * @param t Thread index - */ -__device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) -{ - size_type target_pos = s->num_input_values; - size_type str_len = 0; - if (s->dict_base) { - auto const [new_target_pos, len] = gpuDecodeDictionaryIndices(s, nullptr, target_pos, t); - target_pos = new_target_pos; - str_len = len; - } else if ((s->col.data_type & 7) == BYTE_ARRAY) { - str_len = gpuInitStringDescriptors(s, nullptr, target_pos, t); + // only used in the skip_rows/num_rows case + s->input_leaf_count = leaf_count; + s->input_row_count = row_count; } - if (!t) { *(volatile int32_t*)&s->dict_pos = target_pos; } - return str_len; } /** @@ -1744,7 +1780,7 @@ __device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) * @param compute_string_sizes Whether or not we should be computing string sizes * (PageInfo::str_bytes) as part of the pass */ -__global__ void __launch_bounds__(block_size) +__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageSizes(PageInfo* pages, device_span chunks, size_t min_row, @@ -1759,7 +1795,46 @@ __global__ void __launch_bounds__(block_size) int t = threadIdx.x; PageInfo* pp = &pages[page_idx]; - if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, false)) { return; } + // whether or not we have repetition levels (lists) + bool has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + + // the level stream decoders + __shared__ rle_run def_runs[run_buffer_size]; + __shared__ rle_run rep_runs[run_buffer_size]; + rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; + + // setup page info + if (!setupLocalPageInfo(s, + pp, + chunks, + min_row, + num_rows, + false, + pp->lvl_decode_buf, + LEVEL_DECODE_BUF_SIZE, + decoders)) { + return; + } + + // initialize the stream decoders (requires values computed in setupLocalPageInfo) + int const max_batch_size = s->level_decode_buf_size; + uint32_t* def_decode = s->def; + uint32_t* rep_decode = s->rep; + decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], + s->abs_lvl_start[level_type::DEFINITION], + s->abs_lvl_end[level_type::DEFINITION], + max_batch_size, + def_decode, + s->page.num_input_values); + if (has_repetition) { + decoders[level_type::REPETITION].init(s->col.level_bits[level_type::REPETITION], + s->abs_lvl_start[level_type::REPETITION], + s->abs_lvl_end[level_type::REPETITION], + max_batch_size, + rep_decode, + 
s->page.num_input_values); + } + __syncthreads(); if (!t) { s->page.skipped_values = -1; @@ -1779,7 +1854,6 @@ __global__ void __launch_bounds__(block_size) // we only need to preprocess hierarchies with repetition in them (ie, hierarchies // containing lists anywhere within). - bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; compute_string_sizes = compute_string_sizes && ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4); @@ -1829,40 +1903,32 @@ __global__ void __launch_bounds__(block_size) } depth += blockDim.x; } - __syncthreads(); - // optimization : it might be useful to have a version of gpuDecodeStream that could go wider than - // 1 warp. Currently it only uses 1 warp so that it can overlap work with the value decoding step - // when in the actual value decoding kernel. However, during this preprocess step we have no such - // limits - we could go as wide as block_size - if (t < 32) { - constexpr int batch_size = 32; - int target_input_count = batch_size; - while (!s->error && s->input_value_count < s->num_input_values) { - // decode repetition and definition levels. these will attempt to decode at - // least up to the target, but may decode a few more. - if (has_repetition) { - gpuDecodeStream(s->rep, s, target_input_count, t, level_type::REPETITION); - } - gpuDecodeStream(s->def, s, target_input_count, t, level_type::DEFINITION); - __syncwarp(); - - // we may have decoded different amounts from each stream, so only process what we've been - int actual_input_count = has_repetition ? min(s->lvl_count[level_type::REPETITION], - s->lvl_count[level_type::DEFINITION]) - : s->lvl_count[level_type::DEFINITION]; - - // process what we got back - gpuUpdatePageSizes(s, actual_input_count, t, !is_base_pass); - target_input_count = actual_input_count + batch_size; - __syncwarp(); + // the core loop. decode batches of level stream data using rle_stream objects + // and pass the results to gpuUpdatePageSizes + int processed = 0; + while (processed < s->page.num_input_values) { + // TODO: it would not take much more work to make it so that we could run both of these + // decodes concurrently. there are a couple of shared variables internally that would have to + // get dealt with but that's about it. + if (has_repetition) { + decoders[level_type::REPETITION].decode_next(t); + __syncthreads(); } + // the # of rep/def levels will always be the same size + processed += decoders[level_type::DEFINITION].decode_next(t); + __syncthreads(); - // retrieve total string size. - // TODO: investigate if it is possible to do this with a separate warp at the same time levels - // are being decoded above. - if (compute_string_sizes) { s->page.str_bytes = gpuDecodeTotalPageStringSize(s, t); } + // update page sizes + gpuUpdatePageSizes(s, processed, t, !is_base_pass); + __syncthreads(); + } + + // retrieve total string size. 
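+  // note: gpuDecodeTotalPageStringSize below expects s->dict_pos to be 0; that should
+  // hold at this point because the loop above consumed only level data, not value data.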
+ // TODO: make this block-based instead of just 1 warp + if (compute_string_sizes) { + if (t < 32) { s->page.str_bytes = gpuDecodeTotalPageStringSize(s, t); } } // update output results: @@ -1925,7 +1991,7 @@ struct null_count_back_copier { * @param min_row Row index to start reading at * @param num_rows Maximum number of rows to read */ -__global__ void __launch_bounds__(block_size) gpuDecodePageData( +__global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; @@ -1938,7 +2004,20 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( int out_thread0; [[maybe_unused]] null_count_back_copier _{s, t}; - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } + __shared__ uint32_t def_buf[non_zero_buffer_size]; + __shared__ uint32_t rep_buf[non_zero_buffer_size]; + uint32_t* level_decode_buf[level_type::NUM_LEVEL_TYPES] = {def_buf, rep_buf}; + + if (!setupLocalPageInfo(s, + &pages[page_idx], + chunks, + min_row, + num_rows, + true, + level_decode_buf, + non_zero_buffer_size)) { + return; + } bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; @@ -1973,10 +2052,10 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( int src_pos = s->src_pos; if (t < out_thread0) { - target_pos = - min(src_pos + 2 * (block_size - out_thread0), s->nz_count + (block_size - out_thread0)); + target_pos = min(src_pos + 2 * (decode_block_size - out_thread0), + s->nz_count + (decode_block_size - out_thread0)); } else { - target_pos = min(s->nz_count, src_pos + block_size - out_thread0); + target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0); if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); } } __syncthreads(); @@ -2104,7 +2183,7 @@ void ComputePageSizes(hostdevice_vector& pages, bool compute_string_sizes, rmm::cuda_stream_view stream) { - dim3 dim_block(block_size, 1); + dim3 dim_block(preprocess_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page // computes: @@ -2127,7 +2206,7 @@ void __host__ DecodePageData(hostdevice_vector& pages, { CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); - dim3 dim_block(block_size, 1); + dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page gpuDecodePageData<<>>( diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index ffb4cb60a20..c7d27914375 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -365,9 +365,11 @@ __global__ void __launch_bounds__(128) // this computation is only valid for flat schemas. for nested schemas, // they will be recomputed in the preprocess step by examining repetition and // definition levels - bs->page.chunk_row = 0; - bs->page.num_rows = 0; - bs->page.str_bytes = 0; + bs->page.chunk_row = 0; + bs->page.num_rows = 0; + bs->page.skipped_values = -1; + bs->page.skipped_leaf_values = 0; + bs->page.str_bytes = 0; } num_values = bs->ck.num_values; page_info = bs->ck.page_info; diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 4b577929e82..54119cc7e00 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -45,6 +45,10 @@ constexpr int MAX_DICT_BITS = 24; // Total number of unsigned 24 bit values constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1; +// level decode buffer size. 
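+// (this is the scratch space where the preprocess kernel stages decoded repetition and
+// definition level values for each page)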
+// at size 4096, each page requires 32kb of memory +constexpr int LEVEL_DECODE_BUF_SIZE = 4096; + /** * @brief Struct representing an input column in the file. */ @@ -193,6 +197,9 @@ struct PageInfo { int32_t nesting_info_size; PageNestingInfo* nesting; PageNestingDecodeInfo* nesting_decode; + + // level decode buffers + uint32_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; }; /** @@ -284,6 +291,7 @@ struct file_intermediate_data { hostdevice_vector pages_info{}; hostdevice_vector page_nesting_info{}; hostdevice_vector page_nesting_decode_info{}; + rmm::device_buffer level_decode_data; }; /** diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 9b40610b141..7e22179a421 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -181,6 +181,14 @@ class reader::impl { */ void allocate_nesting_info(); + /** + * @brief Allocate space for use when decoding definition/repetition levels/ + * + * One large contiguous buffer of data allocated and + * distributed among the PageInfo structs. + */ + void allocate_level_decode_space(); + /** * @brief Read a chunk of data and return an output table. * diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index bcb02466a68..be3e0d3ce33 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -565,9 +565,6 @@ void reader::impl::allocate_nesting_info() page_nesting_decode_info = hostdevice_vector{total_page_nesting_infos, _stream}; - // retrieve from the gpu so we can update - pages.device_to_host(_stream, true); - // update pointers in the PageInfos int target_page_index = 0; int src_info_index = 0; @@ -593,9 +590,6 @@ void reader::impl::allocate_nesting_info() target_page_index += chunks[idx].num_data_pages; } - // copy back to the gpu - pages.host_to_device(_stream); - // fill in int nesting_info_index = 0; std::map, std::vector>> depth_remapping; @@ -673,6 +667,28 @@ void reader::impl::allocate_nesting_info() page_nesting_decode_info.host_to_device(_stream); } +void reader::impl::allocate_level_decode_space() +{ + auto& pages = _file_itm_data.pages_info; + + // TODO: this could be made smaller if we ignored dictionary pages and pages with no + // repetition data. 
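+  // a worked example of the sizing below, assuming the default LEVEL_DECODE_BUF_SIZE of
+  // 4096: 4096 levels * 2 level types (definition + repetition) * sizeof(uint32_t)
+  // = 32kb of scratch per page, matching the comment on LEVEL_DECODE_BUF_SIZE.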
+ size_t const per_page_decode_buf_size = LEVEL_DECODE_BUF_SIZE * 2 * sizeof(uint32_t); + auto const decode_buf_size = per_page_decode_buf_size * pages.size(); + _file_itm_data.level_decode_data = rmm::device_buffer(decode_buf_size, _stream, _mr); + + // distribute the buffers + uint32_t* buf = static_cast(_file_itm_data.level_decode_data.data()); + for (size_t idx = 0; idx < pages.size(); idx++) { + auto& p = pages[idx]; + + p.lvl_decode_buf[gpu::level_type::DEFINITION] = buf; + buf += LEVEL_DECODE_BUF_SIZE; + p.lvl_decode_buf[gpu::level_type::REPETITION] = buf; + buf += LEVEL_DECODE_BUF_SIZE; + } +} + std::pair>> reader::impl::create_and_read_column_chunks( cudf::host_span const row_groups_info, size_type num_rows) { @@ -776,7 +792,7 @@ void reader::impl::load_and_decompress_data( auto& raw_page_data = _file_itm_data.raw_page_data; auto& decomp_page_data = _file_itm_data.decomp_page_data; auto& chunks = _file_itm_data.chunks; - auto& pages_info = _file_itm_data.pages_info; + auto& pages = _file_itm_data.pages_info; auto const [has_compressed_data, read_rowgroup_tasks] = create_and_read_column_chunks(row_groups_info, num_rows); @@ -787,13 +803,13 @@ void reader::impl::load_and_decompress_data( // Process dataset chunk pages into output columns auto const total_pages = count_page_headers(chunks, _stream); - pages_info = hostdevice_vector(total_pages, total_pages, _stream); + pages = hostdevice_vector(total_pages, total_pages, _stream); if (total_pages > 0) { // decoding of column/page information - decode_page_headers(chunks, pages_info, _stream); + decode_page_headers(chunks, pages, _stream); if (has_compressed_data) { - decomp_page_data = decompress_page_data(chunks, pages_info, _stream); + decomp_page_data = decompress_page_data(chunks, pages, _stream); // Free compressed data for (size_t c = 0; c < chunks.size(); c++) { if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } @@ -815,9 +831,17 @@ void reader::impl::load_and_decompress_data( // create it ourselves. // std::vector output_info = build_output_column_info(); - // nesting information (sizes, etc) stored -per page- - // note : even for flat schemas, we allocate 1 level of "nesting" info - allocate_nesting_info(); + // the following two allocate functions modify the page data + pages.device_to_host(_stream, true); + { + // nesting information (sizes, etc) stored -per page- + // note : even for flat schemas, we allocate 1 level of "nesting" info + allocate_nesting_info(); + + // level decode space + allocate_level_decode_space(); + } + pages.host_to_device(_stream); } } diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh new file mode 100644 index 00000000000..00e0c47e67d --- /dev/null +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "parquet_gpu.hpp" +#include + +namespace cudf {namespace io { +namespace parquet { +namespace gpu { + +// TODO: consider if these should be template parameters to rle_stream +constexpr int num_rle_stream_decode_threads = 512; +constexpr int num_rle_stream_decode_warps = (num_rle_stream_decode_threads / 32) - 1; +constexpr int run_buffer_size = (num_rle_stream_decode_warps * 2) + 2; +constexpr int rolling_run_index(int index) { return index % run_buffer_size; } + +/** + * @brief Read a 32-bit varint integer + * + * @param[in,out] cur The current data position, updated after the read + * @param[in] end The end data position + * + * @return The 32-bit value read + */ +inline __device__ uint32_t get_vlq32(const uint8_t*& cur, const uint8_t* end) +{ + uint32_t v = *cur++; + if (v >= 0x80 && cur < end) { + v = (v & 0x7f) | ((*cur++) << 7); + if (v >= (0x80 << 7) && cur < end) { + v = (v & ((0x7f << 7) | 0x7f)) | ((*cur++) << 14); + if (v >= (0x80 << 14) && cur < end) { + v = (v & ((0x7f << 14) | (0x7f << 7) | 0x7f)) | ((*cur++) << 21); + if (v >= (0x80 << 21) && cur < end) { + v = (v & ((0x7f << 21) | (0x7f << 14) | (0x7f << 7) | 0x7f)) | ((*cur++) << 28); + } + } + } + } + return v; +} + +// an individual batch. processed by a warp. +// batches should be in shared memory. +struct rle_batch { + uint8_t const* run_start; // start of the run we are part of + int run_offset; // value offset of this batch from the start of the run + uint32_t* output; + int level_run; + int size; + + __device__ inline void decode(uint8_t const*const end, int level_bits, int lane, int warp_id) + { + int output_pos = 0; + int remain = size; + + // for bitpacked/literal runs, total size is always a multiple of 8. so we need to take care if + // we are not starting/ending exactly on a run boundary + uint8_t const* cur; + if(level_run & 1){ + int const effective_offset = cudf::util::round_down_safe(run_offset, 8); + int const lead_values = (run_offset - effective_offset); + output_pos -= lead_values; + remain += lead_values; + cur = run_start + ((effective_offset >> 3) * level_bits); + } + + // if this is a repeated run, compute the repeated value + int _level_val; + if(!(level_run & 1)){ + _level_val = run_start[0]; + if(level_bits > 8){ + _level_val |= run_start[0] << 8; + } + } + + // process + while(remain > 0){ + int batch_len = min(32, remain); + + // if this is a literal run. each thread computes it's own level_val + if (level_run & 1) { + int const batch_len8 = (batch_len + 7) >> 3; + if (lane < batch_len) { + int bitpos = lane * level_bits; + uint8_t const* cur_thread = cur + (bitpos >> 3); + bitpos &= 7; + _level_val = 0; + if (cur_thread < end){ + _level_val = cur_thread[0]; + } + cur_thread++; + if (level_bits > 8 - bitpos && cur_thread < end) { + _level_val |= cur_thread[0] << 8; + cur_thread++; + if (level_bits > 16 - bitpos && cur_thread < end){ + _level_val |= cur_thread[0] << 16; + } + } + _level_val = (_level_val >> bitpos) & ((1 << level_bits) - 1); + } + + cur += batch_len8 * level_bits; + } + + // store level_val + if (lane < batch_len && (lane + output_pos) >= 0) { + output[lane + output_pos] = _level_val; + } + remain -= batch_len; + output_pos += batch_len; + } + } +}; + +// a single rle run. 
may be broken up into multiple rle_batches +struct rle_run { + int size; // total size of the run + int output_pos; + uint8_t const* start; + int level_run; // level_run header value + int remaining; + + __device__ __inline__ rle_batch next_batch(uint32_t *const output, int max_size) + { + int batch_len = min(max_size, remaining); + int const run_offset = size - remaining; + remaining -= batch_len; + return rle_batch{start, run_offset, output, level_run, batch_len}; + } +}; + +// a stream of rle_runs +struct rle_stream { + int level_bits; + uint8_t const* start; + uint8_t const* cur; + uint8_t const* end; + + int max_output_values; + int total_values; + int cur_values; + + uint32_t *output; + + rle_run *runs; + int run_index; + int run_count; + int output_pos; + bool spill; + + int next_batch_run_start; + int next_batch_run_count; + + __device__ rle_stream(rle_run* _runs) : runs(_runs){} + + __device__ void init(int _level_bits, uint8_t const* _start, uint8_t const* _end,int _max_output_values, uint32_t* _output, int _total_values) + { + level_bits = _level_bits; + start = _start; + cur = _start; + end = _end; + + max_output_values = _max_output_values; + output = _output; + + run_index = 0; + run_count = 0; + output_pos = 0; + spill = false; + next_batch_run_start = 0; + next_batch_run_count = 0; + + total_values = _total_values; + cur_values = 0; + } + + __device__ inline thrust::pair get_run_batch() + { + return {next_batch_run_start, next_batch_run_count}; + } + + // fill in up to num_rle_stream_decode_warps runs or until we reach the max_count limit. + // this function is the critical hotspot. please be very careful altering it. + __device__ inline void fill_run_batch(int max_count) + { + // if we spilled over, we've already got a run at the beginning + next_batch_run_start = spill ? run_index - 1 : run_index; + spill = false; + + // generate runs until we either run out of warps to decode them with, or + // we cross the output limit. + while(run_count < num_rle_stream_decode_warps && output_pos < max_count && cur < end){ + auto& run = runs[rolling_run_index(run_index)]; + + // Encoding::RLE + + // bytes for the varint header + uint8_t const *_cur = cur; + int const level_run = get_vlq32(_cur, end); + int run_bytes = _cur - cur; + + // literal run + if(level_run & 1){ + int const run_size = (level_run >> 1) * 8; + run.size = run_size; + int const run_size8 = (run_size + 7) >> 3; + run_bytes += run_size8 * level_bits; + } + // repeated value run + else { + run.size = (level_run >> 1); + run_bytes++; + // can this ever be > 16? it effectively encodes nesting depth so that would require + // a nesting depth > 64k. + if(level_bits > 8){ + run_bytes++; + } + } + run.output_pos = output_pos; + run.start = _cur; + run.level_run = level_run; + run.remaining = run.size; + cur += run_bytes; + + output_pos += run.size; + run_count++; + run_index++; + } + + next_batch_run_count = run_count; + + // if we've reached the output limit on the last run + if(output_pos >= max_count){ + // first, see if we've spilled over + auto& src = runs[rolling_run_index(run_index - 1)]; + int spill_count = output_pos - max_count; + + // a spill has occurred in the current run. spill the extra values over into the beginning of the next run. + if(spill_count > 0){ + auto &spill_run = runs[rolling_run_index(run_index)]; + spill_run = src; + spill_run.output_pos = 0; + spill_run.remaining = spill_count; + + run_count = 1; + run_index++; + output_pos = spill_run.remaining; + spill = true; + } + // no actual spill needed. 
just reset the output pos + else { + output_pos = 0; + run_count = 0; + } + } + // didn't cross the limit, so reset the run count + else { + run_count = 0; + } + } + + __device__ __inline__ int decode_next(int t) + { + int const output_count = min(max_output_values, (total_values - cur_values)); + + // special case. if level_bits == 0, just return all zeros. this should tremendously speed up + // a very common case: columns with no nulls, especially if they are non-nested + if(level_bits == 0){ + int written = 0; + while(written < output_count){ + int const batch_size = min(num_rle_stream_decode_threads, output_count - written); + if(t < batch_size){ + output[written + t] = 0; + } + written += batch_size; + } + cur_values += output_count; + return output_count; + } + + // otherwise, full decode. + int const warp_id = t / 32; + int const warp_decode_id = warp_id - 1; + int const warp_lane = t % 32; + + __shared__ int run_start; + __shared__ int num_runs; + __shared__ int values_processed; + if(!t){ + // carryover from the last call. + thrust::tie(run_start, num_runs) = get_run_batch(); + values_processed = 0; + } + __syncthreads(); + + do { + // warp 0 reads ahead and generates batches of runs to be decoded by remaining warps. + if(!warp_id){ + // fill the next set of runs. fill_runs will generally be the bottleneck for any + // kernel that uses an rle_stream. + if(warp_lane == 0){ + fill_run_batch(output_count); + } + } + // remaining warps decode the runs + else if(warp_decode_id < num_runs){ + // each warp handles 1 run, regardless of size. + // TODO: having each warp handle exactly 32 values would be ideal. as an example, the repetition levels + // for one of the list benchmarks decodes in ~3ms total, while the definition levels take ~11ms - the + // difference is entirely due to long runs in the definition levels. + auto& run = runs[rolling_run_index(run_start + warp_decode_id)]; + auto batch = run.next_batch(output + run.output_pos, min(run.remaining, (output_count - run.output_pos))); + batch.decode(end, level_bits, warp_lane, warp_decode_id); + // last warp updates total values processed + if(warp_lane == 0 && warp_decode_id == num_runs - 1){ + values_processed = run.output_pos + batch.size; + } + } + __syncthreads(); + + // if we haven't run out of space, retrieve the next batch. otherwise leave it for the next call. + if(!t && values_processed < output_count){ + thrust::tie(run_start, num_runs) = get_run_batch(); + } + __syncthreads(); + } while(num_runs > 0 && values_processed < output_count); + + cur_values += values_processed; + + // valid for every thread + return values_processed; + } +}; + +} // namespace gpu +} // namespace parquet +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/utilities/block_utils.cuh b/cpp/src/io/utilities/block_utils.cuh index d73f0ebc9b7..724d91a1510 100644 --- a/cpp/src/io/utilities/block_utils.cuh +++ b/cpp/src/io/utilities/block_utils.cuh @@ -26,6 +26,13 @@ inline __device__ T shuffle(T var, int lane = 0) return __shfl_sync(~0, var, lane); } +template +inline __device__ T shuffle_ptr(T var, int lane = 0) +{ + uintptr_t ptr_val = reinterpret_cast(var); + return reinterpret_cast(shuffle(ptr_val)); +} + template inline __device__ T shuffle_xor(T var, uint32_t delta) { From 9211bccfbd83e692f0fcccff84ac84d1b6ee7570 Mon Sep 17 00:00:00 2001 From: db Date: Mon, 24 Apr 2023 14:50:10 -0500 Subject: [PATCH 002/114] Style formatting. 
--- .../cudf/detail/utilities/integer_utils.hpp | 2 +- cpp/src/io/parquet/page_hdr.cu | 2 +- cpp/src/io/parquet/rle_stream.cuh | 198 +++++++++--------- cpp/src/io/utilities/block_utils.cuh | 2 +- 4 files changed, 100 insertions(+), 104 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 3e4979c0c38..ccc89b2dce3 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -1,7 +1,7 @@ /* * Copyright 2019 BlazingDB, Inc. * Copyright 2019 Eyal Rozenberg - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index c7d27914375..76af22e068c 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 00e0c47e67d..c32233b26c3 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -17,14 +17,15 @@ #include "parquet_gpu.hpp" #include -namespace cudf {namespace io { +namespace cudf { +namespace io { namespace parquet { namespace gpu { // TODO: consider if these should be template parameters to rle_stream constexpr int num_rle_stream_decode_threads = 512; -constexpr int num_rle_stream_decode_warps = (num_rle_stream_decode_threads / 32) - 1; -constexpr int run_buffer_size = (num_rle_stream_decode_warps * 2) + 2; +constexpr int num_rle_stream_decode_warps = (num_rle_stream_decode_threads / 32) - 1; +constexpr int run_buffer_size = (num_rle_stream_decode_warps * 2) + 2; constexpr int rolling_run_index(int index) { return index % run_buffer_size; } /** @@ -56,23 +57,23 @@ inline __device__ uint32_t get_vlq32(const uint8_t*& cur, const uint8_t* end) // an individual batch. processed by a warp. // batches should be in shared memory. struct rle_batch { - uint8_t const* run_start; // start of the run we are part of - int run_offset; // value offset of this batch from the start of the run + uint8_t const* run_start; // start of the run we are part of + int run_offset; // value offset of this batch from the start of the run uint32_t* output; int level_run; int size; - __device__ inline void decode(uint8_t const*const end, int level_bits, int lane, int warp_id) + __device__ inline void decode(uint8_t const* const end, int level_bits, int lane, int warp_id) { int output_pos = 0; - int remain = size; + int remain = size; // for bitpacked/literal runs, total size is always a multiple of 8. 
so we need to take care if // we are not starting/ending exactly on a run boundary uint8_t const* cur; - if(level_run & 1){ + if (level_run & 1) { int const effective_offset = cudf::util::round_down_safe(run_offset, 8); - int const lead_values = (run_offset - effective_offset); + int const lead_values = (run_offset - effective_offset); output_pos -= lead_values; remain += lead_values; cur = run_start + ((effective_offset >> 3) * level_bits); @@ -80,46 +81,38 @@ struct rle_batch { // if this is a repeated run, compute the repeated value int _level_val; - if(!(level_run & 1)){ + if (!(level_run & 1)) { _level_val = run_start[0]; - if(level_bits > 8){ - _level_val |= run_start[0] << 8; - } + if (level_bits > 8) { _level_val |= run_start[0] << 8; } } // process - while(remain > 0){ + while (remain > 0) { int batch_len = min(32, remain); // if this is a literal run. each thread computes it's own level_val if (level_run & 1) { int const batch_len8 = (batch_len + 7) >> 3; if (lane < batch_len) { - int bitpos = lane * level_bits; + int bitpos = lane * level_bits; uint8_t const* cur_thread = cur + (bitpos >> 3); bitpos &= 7; _level_val = 0; - if (cur_thread < end){ - _level_val = cur_thread[0]; - } + if (cur_thread < end) { _level_val = cur_thread[0]; } cur_thread++; if (level_bits > 8 - bitpos && cur_thread < end) { _level_val |= cur_thread[0] << 8; cur_thread++; - if (level_bits > 16 - bitpos && cur_thread < end){ - _level_val |= cur_thread[0] << 16; - } + if (level_bits > 16 - bitpos && cur_thread < end) { _level_val |= cur_thread[0] << 16; } } _level_val = (_level_val >> bitpos) & ((1 << level_bits) - 1); } cur += batch_len8 * level_bits; } - + // store level_val - if (lane < batch_len && (lane + output_pos) >= 0) { - output[lane + output_pos] = _level_val; - } + if (lane < batch_len && (lane + output_pos) >= 0) { output[lane + output_pos] = _level_val; } remain -= batch_len; output_pos += batch_len; } @@ -127,16 +120,16 @@ struct rle_batch { }; // a single rle run. 
may be broken up into multiple rle_batches -struct rle_run { - int size; // total size of the run +struct rle_run { + int size; // total size of the run int output_pos; uint8_t const* start; - int level_run; // level_run header value + int level_run; // level_run header value int remaining; - __device__ __inline__ rle_batch next_batch(uint32_t *const output, int max_size) + __device__ __inline__ rle_batch next_batch(uint32_t* const output, int max_size) { - int batch_len = min(max_size, remaining); + int batch_len = min(max_size, remaining); int const run_offset = size - remaining; remaining -= batch_len; return rle_batch{start, run_offset, output, level_run, batch_len}; @@ -148,15 +141,15 @@ struct rle_stream { int level_bits; uint8_t const* start; uint8_t const* cur; - uint8_t const* end; + uint8_t const* end; int max_output_values; int total_values; int cur_values; - uint32_t *output; + uint32_t* output; - rle_run *runs; + rle_run* runs; int run_index; int run_count; int output_pos; @@ -165,27 +158,32 @@ struct rle_stream { int next_batch_run_start; int next_batch_run_count; - __device__ rle_stream(rle_run* _runs) : runs(_runs){} + __device__ rle_stream(rle_run* _runs) : runs(_runs) {} - __device__ void init(int _level_bits, uint8_t const* _start, uint8_t const* _end,int _max_output_values, uint32_t* _output, int _total_values) + __device__ void init(int _level_bits, + uint8_t const* _start, + uint8_t const* _end, + int _max_output_values, + uint32_t* _output, + int _total_values) { level_bits = _level_bits; - start = _start; - cur = _start; - end = _end; + start = _start; + cur = _start; + end = _end; max_output_values = _max_output_values; - output = _output; - - run_index = 0; - run_count = 0; - output_pos = 0; - spill = false; + output = _output; + + run_index = 0; + run_count = 0; + output_pos = 0; + spill = false; next_batch_run_start = 0; next_batch_run_count = 0; total_values = _total_values; - cur_values = 0; + cur_values = 0; } __device__ inline thrust::pair get_run_batch() @@ -199,24 +197,24 @@ struct rle_stream { { // if we spilled over, we've already got a run at the beginning next_batch_run_start = spill ? run_index - 1 : run_index; - spill = false; + spill = false; - // generate runs until we either run out of warps to decode them with, or + // generate runs until we either run out of warps to decode them with, or // we cross the output limit. - while(run_count < num_rle_stream_decode_warps && output_pos < max_count && cur < end){ + while (run_count < num_rle_stream_decode_warps && output_pos < max_count && cur < end) { auto& run = runs[rolling_run_index(run_index)]; - + // Encoding::RLE - + // bytes for the varint header - uint8_t const *_cur = cur; + uint8_t const* _cur = cur; int const level_run = get_vlq32(_cur, end); - int run_bytes = _cur - cur; + int run_bytes = _cur - cur; // literal run - if(level_run & 1){ - int const run_size = (level_run >> 1) * 8; - run.size = run_size; + if (level_run & 1) { + int const run_size = (level_run >> 1) * 8; + run.size = run_size; int const run_size8 = (run_size + 7) >> 3; run_bytes += run_size8 * level_bits; } @@ -226,14 +224,12 @@ struct rle_stream { run_bytes++; // can this ever be > 16? it effectively encodes nesting depth so that would require // a nesting depth > 64k. 
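      // (a repeated-value run stores the value itself inline after the varint header:
      // 1 byte normally, 2 bytes when level_bits > 8, hence the extra run_bytes increments)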
- if(level_bits > 8){ - run_bytes++; - } - } + if (level_bits > 8) { run_bytes++; } + } run.output_pos = output_pos; - run.start = _cur; - run.level_run = level_run; - run.remaining = run.size; + run.start = _cur; + run.level_run = level_run; + run.remaining = run.size; cur += run_bytes; output_pos += run.size; @@ -244,27 +240,28 @@ struct rle_stream { next_batch_run_count = run_count; // if we've reached the output limit on the last run - if(output_pos >= max_count){ + if (output_pos >= max_count) { // first, see if we've spilled over - auto& src = runs[rolling_run_index(run_index - 1)]; + auto& src = runs[rolling_run_index(run_index - 1)]; int spill_count = output_pos - max_count; - // a spill has occurred in the current run. spill the extra values over into the beginning of the next run. - if(spill_count > 0){ - auto &spill_run = runs[rolling_run_index(run_index)]; - spill_run = src; + // a spill has occurred in the current run. spill the extra values over into the beginning of + // the next run. + if (spill_count > 0) { + auto& spill_run = runs[rolling_run_index(run_index)]; + spill_run = src; spill_run.output_pos = 0; - spill_run.remaining = spill_count; - + spill_run.remaining = spill_count; + run_count = 1; run_index++; output_pos = spill_run.remaining; - spill = true; - } + spill = true; + } // no actual spill needed. just reset the output pos else { output_pos = 0; - run_count = 0; + run_count = 0; } } // didn't cross the limit, so reset the run count @@ -279,66 +276,65 @@ struct rle_stream { // special case. if level_bits == 0, just return all zeros. this should tremendously speed up // a very common case: columns with no nulls, especially if they are non-nested - if(level_bits == 0){ + if (level_bits == 0) { int written = 0; - while(written < output_count){ + while (written < output_count) { int const batch_size = min(num_rle_stream_decode_threads, output_count - written); - if(t < batch_size){ - output[written + t] = 0; - } + if (t < batch_size) { output[written + t] = 0; } written += batch_size; - } + } cur_values += output_count; return output_count; } - + // otherwise, full decode. - int const warp_id = t / 32; + int const warp_id = t / 32; int const warp_decode_id = warp_id - 1; - int const warp_lane = t % 32; - + int const warp_lane = t % 32; + __shared__ int run_start; __shared__ int num_runs; __shared__ int values_processed; - if(!t){ + if (!t) { // carryover from the last call. thrust::tie(run_start, num_runs) = get_run_batch(); - values_processed = 0; + values_processed = 0; } __syncthreads(); do { // warp 0 reads ahead and generates batches of runs to be decoded by remaining warps. - if(!warp_id){ + if (!warp_id) { // fill the next set of runs. fill_runs will generally be the bottleneck for any // kernel that uses an rle_stream. - if(warp_lane == 0){ - fill_run_batch(output_count); - } + if (warp_lane == 0) { fill_run_batch(output_count); } } // remaining warps decode the runs - else if(warp_decode_id < num_runs){ + else if (warp_decode_id < num_runs) { // each warp handles 1 run, regardless of size. - // TODO: having each warp handle exactly 32 values would be ideal. as an example, the repetition levels - // for one of the list benchmarks decodes in ~3ms total, while the definition levels take ~11ms - the - // difference is entirely due to long runs in the definition levels. 
- auto& run = runs[rolling_run_index(run_start + warp_decode_id)]; - auto batch = run.next_batch(output + run.output_pos, min(run.remaining, (output_count - run.output_pos))); + // TODO: having each warp handle exactly 32 values would be ideal. as an example, the + // repetition levels for one of the list benchmarks decodes in ~3ms total, while the + // definition levels take ~11ms - the difference is entirely due to long runs in the + // definition levels. + auto& run = runs[rolling_run_index(run_start + warp_decode_id)]; + auto batch = run.next_batch(output + run.output_pos, + min(run.remaining, (output_count - run.output_pos))); batch.decode(end, level_bits, warp_lane, warp_decode_id); // last warp updates total values processed - if(warp_lane == 0 && warp_decode_id == num_runs - 1){ + if (warp_lane == 0 && warp_decode_id == num_runs - 1) { values_processed = run.output_pos + batch.size; } } __syncthreads(); - - // if we haven't run out of space, retrieve the next batch. otherwise leave it for the next call. - if(!t && values_processed < output_count){ + + // if we haven't run out of space, retrieve the next batch. otherwise leave it for the next + // call. + if (!t && values_processed < output_count) { thrust::tie(run_start, num_runs) = get_run_batch(); } __syncthreads(); - } while(num_runs > 0 && values_processed < output_count); - + } while (num_runs > 0 && values_processed < output_count); + cur_values += values_processed; // valid for every thread diff --git a/cpp/src/io/utilities/block_utils.cuh b/cpp/src/io/utilities/block_utils.cuh index 724d91a1510..830523a288e 100644 --- a/cpp/src/io/utilities/block_utils.cuh +++ b/cpp/src/io/utilities/block_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 2a2f6b2831eb6cf4ccafca968b9758819d92141d Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 25 Apr 2023 09:42:16 -0700 Subject: [PATCH 003/114] checkpoint --- cpp/src/io/parquet/page_data.cu | 322 ++++++++++++++++++++++++++++- cpp/src/io/parquet/parquet_gpu.hpp | 28 ++- cpp/src/io/parquet/reader_impl.cpp | 2 + 3 files changed, 331 insertions(+), 21 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index c8995ec2625..96de9ee37d3 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -1765,6 +1765,296 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, } } +__device__ std::pair page_bounds(page_state_s* const s, + size_t min_row, + size_t num_rows, + bool is_bounds_pg, + bool has_repetition, + int t) +{ + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + // the level stream decoders + __shared__ rle_run def_runs[run_buffer_size]; + __shared__ rle_run rep_runs[run_buffer_size]; + rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; + + // decode batches of level stream data using rle_stream objects and use the results to + // calculate start and end value positions in the encoded string data. 
+ int const max_depth = s->col.max_nesting_depth; + int const max_def = s->nesting_info[max_depth - 1].max_def_level; + + // can skip all this if we know there are no nulls + if (max_def == 0) { return {0, s->page.num_input_values}; } + + int start_value = 0; + int end_value = s->page.num_input_values; + auto const pp = &s->page; + auto const col = &s->col; + + // initialize the stream decoders (requires values computed in setupLocalPageInfo) + int const max_batch_size = s->level_decode_buf_size; + uint32_t* def_decode = s->def; + uint32_t* rep_decode = s->rep; + decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], + s->abs_lvl_start[level_type::DEFINITION], + s->abs_lvl_end[level_type::DEFINITION], + max_batch_size, + def_decode, + s->page.num_input_values); + // only need repetition if this is a bounds page. otherwise all we need is def level info + // to count the nulls. + if (has_repetition && is_bounds_pg) { + decoders[level_type::REPETITION].init(s->col.level_bits[level_type::REPETITION], + s->abs_lvl_start[level_type::REPETITION], + s->abs_lvl_end[level_type::REPETITION], + max_batch_size, + rep_decode, + s->page.num_input_values); + } + + int processed = 0; + + // if this is a bounds page, we need to do extra work to find the start and/or end value index + if (is_bounds_pg) { + __shared__ int skipped_leaf_values; + __shared__ int end_val_idx; + + // need these for skip_rows case + auto const page_start_row = col->start_row + pp->chunk_row; + auto const max_row = min_row + num_rows; + auto const begin_row = page_start_row >= min_row ? 0 : min_row - page_start_row; + auto const max_page_rows = pp->num_rows - begin_row; + auto const page_rows = page_start_row + begin_row + max_page_rows <= max_row + ? max_page_rows + : max_row - (page_start_row + begin_row); + auto const end_row = begin_row + page_rows; + + int row_count = 0; + int leaf_count = 0; + bool skipped_values_set = false; + bool end_value_set = false; + + while (processed < s->page.num_input_values) { + int start_val = processed; + + if (has_repetition) { + decoders[level_type::REPETITION].decode_next(t); + __syncthreads(); + } + + // the # of rep/def levels will always be the same size + processed += decoders[level_type::DEFINITION].decode_next(t); + __syncthreads(); + + // do something with the level data + while (start_val < processed) { + int idx_t = start_val + t; + int idx = rolling_lvl_index(idx_t, s->level_decode_buf_size); + + // get absolute thread row index + int is_new_row = idx_t < processed && (!has_repetition || s->rep[idx] == 0); + int thread_row_count, block_row_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(is_new_row, thread_row_count, block_row_count); + __syncthreads(); + + // get absolute thread leaf index + int const is_new_leaf = idx_t < processed && (s->def[idx] >= max_def); + int thread_leaf_count, block_leaf_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(is_new_leaf, thread_leaf_count, block_leaf_count); + __syncthreads(); + + // if we have not set skipped values yet, see if we found the first in-bounds row + if (!skipped_values_set && row_count + block_row_count > begin_row) { + // if this thread is in row bounds + int const row_index = (thread_row_count + row_count) - 1; + int in_row_bounds = + idx_t < processed && (row_index >= begin_row) && (row_index < end_row); + + int local_count, global_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(in_row_bounds, local_count, global_count); + __syncthreads(); + + // we found it + 
if (global_count > 0) { + // this is the thread that represents the first row. + if (local_count == 1) { + skipped_leaf_values = + leaf_count + (is_new_leaf ? thread_leaf_count - 1 : thread_leaf_count); + } + skipped_values_set = true; + } + } + + // test if row_count will exceed end_row in this batch + if (!end_value_set && row_count + block_row_count > end_row) { + // if this thread exceeds row bounds + int const row_index = (thread_row_count + row_count) - 1; + int exceeds_row_bounds = row_index >= end_row; + + int local_count, global_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(exceeds_row_bounds, local_count, global_count); + __syncthreads(); + + // we found it + if (global_count > 0) { + // this is the thread that represents the end row. + if (local_count == 1) { + end_val_idx = leaf_count + (is_new_leaf ? thread_leaf_count - 1 : thread_leaf_count); + } + end_value_set = true; + } + } + + row_count += block_row_count; + leaf_count += block_leaf_count; + + start_val += preprocess_block_size; + } + __syncthreads(); + + if (skipped_values_set) { start_value = skipped_leaf_values; } + if (end_value_set) { end_value = end_val_idx; } + } + } + // already filtered out unwanted pages, so need to count all non-null values in this page + else { + int num_nulls = 0; + while (processed < s->page.num_input_values) { + int start_val = processed; + processed += decoders[level_type::DEFINITION].decode_next(t); + __syncthreads(); + + while (start_val < processed) { + int idx_t = start_val + t; + if (idx_t < processed) { + int idx = rolling_lvl_index(idx_t, s->level_decode_buf_size); + if (s->def[idx] < max_def) { num_nulls++; } + } + start_val += preprocess_block_size; + } + __syncthreads(); + } + + int const null_count = block_reduce(temp_storage.reduce_storage).Sum(num_nulls); + + if (t == 0) { pp->num_nulls = null_count; } + __syncthreads(); + + end_value -= pp->num_nulls; + } + + return {start_value, end_value}; +} + +__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + __shared__ __align__(16) page_state_s state_g; + + page_state_s* const s = &state_g; + int page_idx = blockIdx.x; + int t = threadIdx.x; + PageInfo* pp = &pages[page_idx]; + + // only count if it's a string column + auto const col = &chunks[pp->chunk_idx]; + uint32_t dtype = col->data_type & 7; + uint32_t dtype_len_out = col->data_type >> 3; + if (dtype != BYTE_ARRAY || dtype_len_out == 4) { return; } + + // whether or not we have repetition levels (lists) + bool has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + + // the level stream decoders + __shared__ rle_run def_runs[run_buffer_size]; + __shared__ rle_run rep_runs[run_buffer_size]; + rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; + + // setup page info + if (!setupLocalPageInfo(s, + pp, + chunks, + min_row, + num_rows, + false, + pp->lvl_decode_buf, + LEVEL_DECODE_BUF_SIZE, + decoders)) { + return; + } + + if (!t) { + s->page.num_nulls = 0; + s->page.str_bytes = 0; + } + __syncthreads(); + + bool is_bounds_pg = is_bounds_page(s, min_row, num_rows); + + // if we're skipping this page anyway, no need to count it + if (!is_bounds_pg && !is_page_contained(s, min_row, num_rows)) { 
return; } + + // find start/end value indices + auto const [start_value, end_value] = + page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, t); + if (t == 0) printf("%05d: start_val %d end_val %d\n", blockIdx.x, start_value, end_value); + + // now process string info in the range [start_value, end_value) + // set up for decoding strings...can be either plain or dictionary + uint8_t const* data = s->data_start; + uint8_t const* const end = s->data_end; + uint8_t const* dict_base = nullptr; + int dict_bits = 0; + int dict_size = 0; + size_t str_bytes = 0; + + switch (pp->encoding) { + case Encoding::PLAIN_DICTIONARY: + case Encoding::RLE_DICTIONARY: + // RLE-packed dictionary indices, first byte indicates index length in bits + if (col->str_dict_index) { + // String dictionary: use index + dict_base = reinterpret_cast(col->str_dict_index); + dict_size = col->page_info[0].num_input_values * sizeof(string_index_pair); + } else { + dict_base = col->page_info[0].page_data; // dictionary is always stored in the first page + dict_size = col->page_info[0].uncompressed_page_size; + } + + dict_bits = (data < end) ? *data++ : 0; + + // FIXME: how to throw? set error and return? + if (dict_bits > 32 || !dict_base) { printf("error\n"); } + + str_bytes = countDictEntries( + data, dict_base, dict_bits, dict_size, (end - data), start_value, end_value, t); + break; + case Encoding::PLAIN: + dict_size = static_cast(end - data); + str_bytes = countPlainEntries(data, dict_size, start_value, end_value, t); + break; + } + + if (t == 0) { pp->str_bytes = str_bytes; } +} + /** * @brief Kernel for computing per-page column size information for all nesting levels. * @@ -1859,9 +2149,9 @@ __global__ void __launch_bounds__(preprocess_block_size) // early out optimizations: - // - if this is a flat hierarchy (no lists) and is not a string column. in this case we don't need - // to do the expensive work of traversing the level data to determine sizes. we can just compute - // it directly. + // - if this is a flat hierarchy (no lists) and is not a string column. in this case we don't + // need to do the expensive work of traversing the level data to determine sizes. we can just + // compute it directly. if (!has_repetition && !compute_string_sizes) { int depth = 0; while (depth < s->page.num_output_nesting_levels) { @@ -1875,8 +2165,8 @@ __global__ void __launch_bounds__(preprocess_block_size) return; } - // in the trim pass, for anything with lists, we only need to fully process bounding pages (those - // at the beginning or the end of the row bounds) + // in the trim pass, for anything with lists, we only need to fully process bounding pages + // (those at the beginning or the end of the row bounds) if (!is_base_pass && !is_bounds_page(s, min_row, num_rows)) { int depth = 0; while (depth < s->page.num_output_nesting_levels) { @@ -2099,13 +2389,13 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( // if (!has_repetition) { dst_pos -= s->first_row; } - // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values - // before first_row) in the flat hierarchy case. + // target_pos will always be properly bounded by num_rows, but dst_pos may be negative + // (values before first_row) in the flat hierarchy case. if (src_pos < target_pos && dst_pos >= 0) { // src_pos represents the logical row position we want to read from. But in the case of - // nested hierarchies, there is no 1:1 mapping of rows to values. 
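That missing 1:1 mapping is exactly what skipped_leaf_values captures. A small host model of the relationship, using the convention that a repetition level of 0 starts a new row (this scalar loop is a hypothetical stand-in for the block-wide scans the kernels actually use):

#include <cassert>
#include <cstddef>
#include <vector>

// number of leaf values to skip so that decoding starts at row `skip_rows`
int skipped_leaf_values(std::vector<int> const& rep, int skip_rows)
{
  int rows = 0;
  for (std::size_t i = 0; i < rep.size(); ++i) {
    if (rep[i] == 0 && rows++ == skip_rows) { return static_cast<int>(i); }
  }
  return static_cast<int>(rep.size());
}

int main()
{
  // two rows: [a, b, c] and [d]; skipping one row skips three leaf values
  std::vector<int> const rep = {0, 1, 1, 0};
  assert(skipped_leaf_values(rep, 1) == 3);
}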
So our true read position - // has to take into account the # of values we have to skip in the page to get to the - // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. + // nested hierarchies, there is no 1:1 mapping of rows to values. So our true read + // position has to take into account the # of values we have to skip in the page to get to + // the desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. uint32_t val_src_pos = src_pos + skipped_leaf_values; // nesting level that is storing actual leaf values @@ -2172,6 +2462,18 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( } // anonymous namespace +void ComputePageStringSizes(hostdevice_vector& pages, + hostdevice_vector const& chunks, + size_t min_row, + size_t num_rows, + rmm::cuda_stream_view stream) +{ + dim3 dim_block(preprocess_block_size, 1); + dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + gpuComputePageStringSizes<<>>( + pages.device_ptr(), chunks, min_row, num_rows); +} + /** * @copydoc cudf::io::parquet::gpu::ComputePageSizes */ diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 54119cc7e00..0fa4deb79e9 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -275,7 +275,7 @@ struct ColumnChunkDesc { int8_t converted_type; // converted type enum LogicalType logical_type; // logical type int8_t decimal_precision; // Decimal precision - int32_t ts_clock_rate; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) + int32_t ts_clock_rate; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index; // my input column index int32_t src_col_schema; // my schema index in the file @@ -364,16 +364,16 @@ struct slot_type; struct EncColumnChunk { parquet_column_device_view const* col_desc; //!< Column description size_type col_desc_id; - PageFragment* fragments; //!< First fragment in chunk - uint8_t* uncompressed_bfr; //!< Uncompressed page data - uint8_t* compressed_bfr; //!< Compressed page data - statistics_chunk const* stats; //!< Fragment statistics - uint32_t bfr_size; //!< Uncompressed buffer size - uint32_t compressed_size; //!< Compressed buffer size - uint32_t max_page_data_size; //!< Max data size (excluding header) of any page in this chunk - uint32_t page_headers_size; //!< Sum of size of all page headers - size_type start_row; //!< First row of chunk - uint32_t num_rows; //!< Number of rows in chunk + PageFragment* fragments; //!< First fragment in chunk + uint8_t* uncompressed_bfr; //!< Uncompressed page data + uint8_t* compressed_bfr; //!< Compressed page data + statistics_chunk const* stats; //!< Fragment statistics + uint32_t bfr_size; //!< Uncompressed buffer size + uint32_t compressed_size; //!< Compressed buffer size + uint32_t max_page_data_size; //!< Max data size (excluding header) of any page in this chunk + uint32_t page_headers_size; //!< Sum of size of all page headers + size_type start_row; //!< First row of chunk + uint32_t num_rows; //!< Number of rows in chunk size_type num_values; //!< Number of values in chunk. 
Different from num_rows for nested types uint32_t first_fragment; //!< First fragment of chunk EncPage* pages; //!< Ptr to pages that belong to this chunk @@ -438,6 +438,12 @@ void BuildStringDictionaryIndex(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream); +void ComputePageStringSizes(hostdevice_vector& pages, + hostdevice_vector const& chunks, + size_t min_row, + size_t num_rows, + rmm::cuda_stream_view stream); + /** * @brief Compute page output size information. * diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 9f1644dfd45..80c8aaf2b97 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -37,6 +37,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); + gpu::ComputePageStringSizes(pages, chunks, skip_rows, num_rows, _stream); + // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector // to store all per-chunk pointers to nested data/nullmask. `chunk_offsets[i]` will store the // offset into `chunk_nested_data`/`chunk_nested_valids` for the array of pointers for chunk `i` From 2b1f7d5f1c9a986bdaf9812aa829d63e1f2bf40b Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 25 Apr 2023 10:29:15 -0700 Subject: [PATCH 004/114] checkpoint --- cpp/src/io/parquet/page_data.cu | 148 ++++++++++++++++++++++++++++++-- 1 file changed, 140 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 96de9ee37d3..87c1315e08b 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -1956,16 +1956,147 @@ __device__ std::pair page_bounds(page_state_s* const s, return {start_value, end_value}; } +__device__ size_t countDictEntries(uint8_t const* data, + uint8_t const* dict_base, + int dict_bits, + int dict_size, + int data_size, + int start_value, + int end_value, + int t) +{ + uint8_t const* ptr = data; + uint8_t const* const end = data + data_size; + size_t str_len = 0; // total sum for runs + size_t l_str_len = 0; // partial sums across literal runs + int pos = 0; + + int dict_run = 0; + int dict_val = 0; + + while (pos < end_value && ptr <= end) { + if (dict_run <= 1) { + dict_run = (ptr < end) ? get_vlq32(ptr, end) : 0; + if (!(dict_run & 1)) { + // Repeated value + int bytecnt = (dict_bits + 7) >> 3; + if (ptr + bytecnt <= end) { + int32_t run_val = ptr[0]; + if (bytecnt > 1) { + run_val |= ptr[1] << 8; + if (bytecnt > 2) { + run_val |= ptr[2] << 16; + if (bytecnt > 3) { run_val |= ptr[3] << 24; } + } + } + dict_val = run_val & ((1 << dict_bits) - 1); + } + ptr += bytecnt; + } + } + + int batch_len; + if (dict_run & 1) { + // Literal batch: must output a multiple of 8, except for the last batch + int batch_len_div8; + batch_len = max(min(128, (int)(dict_run >> 1) * 8), 1); + batch_len_div8 = (batch_len + 7) >> 3; + dict_run -= batch_len_div8 * 2; + ptr += batch_len_div8 * dict_bits; + } else { + batch_len = dict_run >> 1; + dict_run = 0; + } + + int is_literal = dict_run & 1; + + // compute dictionary index. 
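In the literal-run branch below, each thread pulls one dict_bits-wide index out of the packed byte stream. The extraction restated as a scalar function (bit_packed_at is a hypothetical name; Parquet packs indices LSB-first):

#include <cassert>
#include <cstddef>
#include <cstdint>

uint32_t bit_packed_at(uint8_t const* buf, std::size_t nbytes, int bit_width, int i)
{
  int64_t const first_bit = int64_t{i} * bit_width;
  uint32_t v = 0;
  int got = 0;
  for (std::size_t byte = first_bit / 8; got < bit_width && byte < nbytes; ++byte) {
    // skip leading bits only in the first byte touched
    int const skip = (got == 0) ? static_cast<int>(first_bit % 8) : 0;
    v |= (uint32_t{buf[byte]} >> skip) << got;
    got += 8 - skip;
  }
  uint32_t const mask = bit_width >= 32 ? 0xffffffffu : (1u << bit_width) - 1;
  return v & mask;
}

int main()
{
  uint8_t const buf[] = {0xca};  // 0b11001010: 3-bit indices 2 and 1 in the low six bits
  assert(bit_packed_at(buf, 1, 3, 0) == 2);
  assert(bit_packed_at(buf, 1, 3, 1) == 1);
}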
+ if (is_literal) { + int dict_idx = 0; + if (t < batch_len) { + dict_idx = dict_val; + int32_t ofs = (t - ((batch_len + 7) & ~7)) * dict_bits; + const uint8_t* p = ptr + (ofs >> 3); + ofs &= 7; + if (p < end) { + uint32_t c = 8 - ofs; + dict_idx = (*p++) >> ofs; + if (c < dict_bits && p < end) { + dict_idx |= (*p++) << c; + c += 8; + if (c < dict_bits && p < end) { + dict_idx |= (*p++) << c; + c += 8; + if (c < dict_bits && p < end) { dict_idx |= (*p++) << c; } + } + } + dict_idx &= (1 << dict_bits) - 1; + } + + if (pos + t < end_value) { + uint32_t const dict_pos = (dict_bits > 0) ? dict_idx * sizeof(string_index_pair) : 0; + if (pos + t >= start_value && dict_pos < (uint32_t)dict_size) { + const auto* src = reinterpret_cast(dict_base + dict_pos); + l_str_len += src->second; + } + } + } + } else { + int start_off = (pos < start_value && pos + batch_len > start_value) ? start_value - pos : 0; + batch_len = min(batch_len, end_value - pos); + if (t == 0) { + uint32_t const dict_pos = (dict_bits > 0) ? dict_val * sizeof(string_index_pair) : 0; + if (pos + batch_len > start_value && dict_pos < (uint32_t)dict_size) { + const auto* src = reinterpret_cast(dict_base + dict_pos); + str_len += (batch_len - start_off) * src->second; + } + } + } + + pos += batch_len; + // if (t == 0) printf("pos %d str_len %ld\n", pos, str_len); + } + + using block_reduce = cub::BlockReduce; + typename block_reduce::TempStorage reduce_storage; + str_len += block_reduce(reduce_storage).Sum(l_str_len); + + return str_len; +} + +__device__ size_t +countPlainEntries(uint8_t const* data, int data_size, int start_value, int end_value, int t) +{ + int pos = 0; + size_t total_len = 0; + + // This step is purely serial + if (!t) { + const uint8_t* cur = data; + int k = 0; + + while (pos < end_value && k < data_size) { + int len; + if (k + 4 <= data_size) { + len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); + k += 4; + if (k + len > data_size) { len = 0; } + } else { + len = 0; + } + + k += len; + if (pos >= start_value) { total_len += len; } + pos++; + } + } + + return total_len; +} + __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { - using block_reduce = cub::BlockReduce; - using block_scan = cub::BlockScan; - __shared__ union { - typename block_reduce::TempStorage reduce_storage; - typename block_scan::TempStorage scan_storage; - } temp_storage; - __shared__ __align__(16) page_state_s state_g; page_state_s* const s = &state_g; @@ -2048,7 +2179,8 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz break; case Encoding::PLAIN: dict_size = static_cast(end - data); - str_bytes = countPlainEntries(data, dict_size, start_value, end_value, t); + str_bytes = is_bounds_pg ? 
countPlainEntries(data, dict_size, start_value, end_value, t) + : dict_size - sizeof(int) * (pp->num_input_values - pp->num_nulls); break; } From 65696849033cca04c4dd0884c987d3b7b450ff51 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 25 Apr 2023 12:11:27 -0700 Subject: [PATCH 005/114] fix is_bounds_page() --- cpp/src/io/parquet/page_data.cu | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 87c1315e08b..e9d6b3d0b2c 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -126,7 +126,7 @@ inline __device__ bool is_bounds_page(page_state_s* const s, size_t start_row, s size_t const begin = start_row; size_t const end = start_row + num_rows; - return ((page_begin <= begin && page_end >= begin) || (page_begin <= end && page_end >= end)); + return ((page_begin < begin && page_end > begin) || (page_begin < end && page_end > end)); } /** @@ -2145,14 +2145,25 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // find start/end value indices auto const [start_value, end_value] = page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, t); - if (t == 0) printf("%05d: start_val %d end_val %d\n", blockIdx.x, start_value, end_value); +#if 0 + if (t == 0) + printf("%05d: start_val %d end_val %d is_bounds %d is_contained %d (%ld,%ld] (%ld,%ld]\n", + blockIdx.x, + start_value, + end_value, + is_bounds_pg, + is_page_contained(s, min_row, num_rows), + min_row, + min_row + num_rows, + col->start_row + pp->chunk_row, + col->start_row + pp->chunk_row + pp->num_rows); +#endif // now process string info in the range [start_value, end_value) // set up for decoding strings...can be either plain or dictionary uint8_t const* data = s->data_start; uint8_t const* const end = s->data_end; uint8_t const* dict_base = nullptr; - int dict_bits = 0; int dict_size = 0; size_t str_bytes = 0; @@ -2169,13 +2180,13 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz dict_size = col->page_info[0].uncompressed_page_size; } - dict_bits = (data < end) ? *data++ : 0; - - // FIXME: how to throw? set error and return? 
- if (dict_bits > 32 || !dict_base) { printf("error\n"); } + if (s->dict_bits > 32 || !dict_base) { + printf("%03d: error %d %p\n", t, s->dict_bits, dict_base); + CUDF_UNREACHABLE("invalid dictionary bit size"); + } str_bytes = countDictEntries( - data, dict_base, dict_bits, dict_size, (end - data), start_value, end_value, t); + data, dict_base, s->dict_bits, dict_size, (end - data), start_value, end_value, t); break; case Encoding::PLAIN: dict_size = static_cast(end - data); From 2f8836b3391c0deaf8761b8c883ab321bb683815 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 25 Apr 2023 12:23:55 -0700 Subject: [PATCH 006/114] pass decoders into page_bounds --- cpp/src/io/parquet/page_data.cu | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index e9d6b3d0b2c..3c351e600ad 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -1770,6 +1770,7 @@ __device__ std::pair page_bounds(page_state_s* const s, size_t num_rows, bool is_bounds_pg, bool has_repetition, + rle_stream* decoders, int t) { using block_reduce = cub::BlockReduce; @@ -1779,11 +1780,6 @@ __device__ std::pair page_bounds(page_state_s* const s, typename block_scan::TempStorage scan_storage; } temp_storage; - // the level stream decoders - __shared__ rle_run def_runs[run_buffer_size]; - __shared__ rle_run rep_runs[run_buffer_size]; - rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; - // decode batches of level stream data using rle_stream objects and use the results to // calculate start and end value positions in the encoded string data. int const max_depth = s->col.max_nesting_depth; @@ -2144,7 +2140,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // find start/end value indices auto const [start_value, end_value] = - page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, t); + page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders, t); #if 0 if (t == 0) printf("%05d: start_val %d end_val %d is_bounds %d is_contained %d (%ld,%ld] (%ld,%ld]\n", From db7e2a46b6392197e6a4ba3b7dd165df567d67ca Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 25 Apr 2023 13:44:48 -0700 Subject: [PATCH 007/114] copy over changes from string_cols --- cpp/src/io/json/reader_impl.cu | 2 +- cpp/src/io/parquet/parquet_gpu.hpp | 10 + cpp/src/io/parquet/reader_impl.cpp | 76 ++++-- cpp/src/io/parquet/reader_impl.hpp | 2 - cpp/src/io/parquet/reader_impl_helpers.hpp | 1 + cpp/src/io/utilities/column_buffer.cpp | 275 ++++++++++++++++----- cpp/src/io/utilities/column_buffer.hpp | 88 ++++++- 7 files changed, 356 insertions(+), 98 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 7ae8deb8055..66c17fbe9ec 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -554,7 +554,7 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, for (size_t i = 0; i < num_columns; ++i) { out_buffers[i].null_count() = num_records - h_valid_counts[i]; - auto out_column = make_column(out_buffers[i], nullptr, std::nullopt, stream); + auto out_column = io::detail::make_column(out_buffers[i], nullptr, std::nullopt, stream); if (out_column->type().id() == type_id::STRING) { // Need to remove escape character in case of '\"' and '\\' out_columns.emplace_back(cudf::strings::detail::replace( diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 
0fa4deb79e9..e6f2abbb82c 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -113,6 +113,7 @@ struct PageNestingDecodeInfo { int32_t valid_count; int32_t value_count; uint8_t* data_out; + uint8_t* string_out; bitmask_type* valid_map; }; @@ -188,6 +189,7 @@ struct PageInfo { // for string columns only, the size of all the chars in the string for // this page. only valid/computed during the base preprocess pass int32_t str_bytes; + int64_t str_offset; // offset into string data for this page // nesting information (input/output) for each page. this array contains // input column nesting information, output column nesting information and @@ -242,6 +244,7 @@ struct ColumnChunkDesc { str_dict_index(nullptr), valid_map_base{nullptr}, column_data_base{nullptr}, + column_string_base{nullptr}, codec(codec_), converted_type(converted_type_), logical_type(logical_type_), @@ -271,6 +274,7 @@ struct ColumnChunkDesc { string_index_pair* str_dict_index; // index for string dictionary bitmask_type** valid_map_base; // base pointers of valid bit map for this column void** column_data_base; // base pointers of column data + void** column_string_base; // base pointers of column string data int8_t codec; // compressed codec enum int8_t converted_type; // converted type enum LogicalType logical_type; // logical type @@ -475,6 +479,12 @@ void ComputePageSizes(hostdevice_vector& pages, bool compute_string_sizes, rmm::cuda_stream_view stream); +void ComputePageStringSizes(hostdevice_vector& pages, + hostdevice_vector const& chunks, + size_t min_row, + size_t num_rows, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for reading the column data stored in the pages * diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 80c8aaf2b97..4286756fbec 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -37,14 +37,39 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); + // Here's the plan. Compute string sizes in case it hasn't already been done. can work out + // if this is redundant later. Then allocate buffers for the string data, and offsets to the + // first string. pass this to decode, where string data will be written to the buffer rather + // than to _strings. also need to allocate a size_type buffer to hold strings offsets, which + // will be calculated as we're writing the data. once done, we'll have for each string column + // a char array with the contiguous string data, and a size_type array of offsets. use these + // as child columns and create string column. no need to call create_strings_column now. 
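The "offsets to the first string" step of this plan reduces to a per-column exclusive prefix sum over page str_bytes, taken in page order; the host loop that follows is exactly that, pared down here to hypothetical minimal types:

#include <cstdint>
#include <vector>

struct page_sizes { int col; int64_t str_bytes; int64_t str_offset; };

// after the loop, col_sizes holds each column's total char-buffer size
void assign_string_offsets(std::vector<page_sizes>& pages, std::vector<int64_t>& col_sizes)
{
  for (auto& p : pages) {
    p.str_offset = col_sizes[p.col];  // exclusive prefix sum per column
    col_sizes[p.col] += p.str_bytes;
  }
}

int main()
{
  std::vector<page_sizes> pages{{0, 10, 0}, {0, 5, 0}};
  std::vector<int64_t> col_sizes(1, 0);
  assign_string_offsets(pages, col_sizes);
  // pages[1].str_offset == 10, col_sizes[0] == 15
}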
+ gpu::ComputePageStringSizes(pages, chunks, skip_rows, num_rows, _stream); + // TODO do the following on device with thrust/kernel to avoid the pages round trip + pages.device_to_host(_stream, true); + std::vector col_sizes(_input_columns.size(), 0L); + for (auto& page : pages) { + if ((page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) == 0) { + auto const& col = chunks[page.chunk_idx]; + uint32_t dtype = col.data_type & 7; + uint32_t dtype_len_out = col.data_type >> 3; + if (dtype == BYTE_ARRAY && dtype_len_out != 4) { + size_t const offset = col_sizes[col.src_col_index]; + page.str_offset = offset; + col_sizes[col.src_col_index] = offset + page.str_bytes; + } + } + } + // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector // to store all per-chunk pointers to nested data/nullmask. `chunk_offsets[i]` will store the // offset into `chunk_nested_data`/`chunk_nested_valids` for the array of pointers for chunk `i` - auto chunk_nested_valids = hostdevice_vector(sum_max_depths, _stream); - auto chunk_nested_data = hostdevice_vector(sum_max_depths, _stream); - auto chunk_offsets = std::vector(); + auto chunk_nested_valids = hostdevice_vector(sum_max_depths, _stream); + auto chunk_nested_data = hostdevice_vector(sum_max_depths, _stream); + auto chunk_nested_str_data = hostdevice_vector(sum_max_depths, _stream); + auto chunk_offsets = std::vector(); // Update chunks with pointers to column data. for (size_t c = 0, page_count = 0, chunk_off = 0; c < chunks.size(); c++) { @@ -65,6 +90,9 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) auto data = chunk_nested_data.host_ptr(chunk_off); chunks[c].column_data_base = chunk_nested_data.device_ptr(chunk_off); + auto str_data = chunk_nested_str_data.host_ptr(chunk_off); + chunks[c].column_string_base = chunk_nested_str_data.device_ptr(chunk_off); + chunk_off += max_depth; // fill in the arrays on the host. 
there are some important considerations to @@ -107,6 +135,11 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) if (owning_schema == 0 || owning_schema == input_col.schema_idx) { valids[idx] = out_buf.null_mask(); data[idx] = out_buf.data(); + // only do string buffer for leaf + if (out_buf.string_size() == 0 && col_sizes[chunks[c].src_col_index] > 0) { + out_buf.create_string_data(col_sizes[chunks[c].src_col_index], _stream); + } + str_data[idx] = out_buf.string_data(); out_buf.user_data |= static_cast(input_col.schema_idx) & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; } else { @@ -119,9 +152,11 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) page_count += chunks[c].max_num_pages; } + pages.host_to_device(_stream); // FIXME: get rid of this eventually chunks.host_to_device(_stream); chunk_nested_valids.host_to_device(_stream); chunk_nested_data.host_to_device(_stream); + chunk_nested_str_data.host_to_device(_stream); gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _stream); @@ -145,21 +180,28 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) auto& out_buf = (*cols)[input_col.nesting[l_idx]]; cols = &out_buf.children; - if (out_buf.type.id() != type_id::LIST || - (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED)) { - continue; + if (out_buf.type.id() == type_id::LIST && + (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED) == 0) { + CUDF_EXPECTS(l_idx < input_col.nesting_depth() - 1, "Encountered a leaf list column"); + auto& child = (*cols)[input_col.nesting[l_idx + 1]]; + + // the final offset for a list at level N is the size of it's child + int offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size; + CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), + &offset, + sizeof(offset), + cudaMemcpyDefault, + _stream.value())); + out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; + } else if (out_buf.type.id() == type_id::STRING) { + // need to cap off the string offsets column + size_type sz = col_sizes[idx]; + cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, + &sz, + sizeof(size_type), + cudaMemcpyDefault, + _stream.value()); } - CUDF_EXPECTS(l_idx < input_col.nesting_depth() - 1, "Encountered a leaf list column"); - auto& child = (*cols)[input_col.nesting[l_idx + 1]]; - - // the final offset for a list at level N is the size of it's child - int offset = child.type.id() == type_id::LIST ? 
child.size - 1 : child.size; - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), - &offset, - sizeof(offset), - cudaMemcpyDefault, - _stream.value())); - out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } } diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 7e22179a421..9c9ad6f85f4 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -24,8 +24,6 @@ #include "parquet_gpu.hpp" #include "reader_impl_helpers.hpp" -#include - #include #include #include diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index cdc5896803c..dfd2f2b644c 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -29,6 +29,7 @@ namespace cudf::io::detail::parquet { using namespace cudf::io::parquet; +using column_buffer = cudf::io::detail::utilities::column_buffer; /** * @brief Function that translates Parquet datatype to cuDF type enum diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index d0783fe8a01..f0b18d3fc54 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -29,24 +29,43 @@ namespace cudf { namespace io { namespace detail { +namespace utilities { -void column_buffer::create(size_type _size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +void column_buffer_with_pointers::create_strings(size_type size, rmm::cuda_stream_view stream) +{ + // The contents of _strings will never be directly returned to the user. + // Due to the fact that make_strings_column copies the input data to + // produce its outputs, _strings is actually a temporary. As a result, we + // do not pass the provided mr to the call to + // make_zeroed_device_uvector_async here and instead let it use the + // default rmm memory resource. + _strings = std::make_unique>( + cudf::detail::make_zeroed_device_uvector_async( + size, stream, rmm::mr::get_current_device_resource())); +} + +void column_buffer_with_strings::create_strings(size_type num_bytes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + _string_data = rmm::device_buffer(num_bytes, stream, mr); +} + +template +void column_buffer::create(size_type _size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size = _size; switch (type.id()) { case type_id::STRING: - // The contents of _strings will never be directly returned to the user. - // Due to the fact that make_strings_column copies the input data to - // produce its outputs, _strings is actually a temporary. As a result, we - // do not pass the provided mr to the call to - // make_zeroed_device_uvector_async here and instead let it use the - // default rmm memory resource. - _strings = std::make_unique>( - cudf::detail::make_zeroed_device_uvector_async( - size, stream, rmm::mr::get_current_device_resource())); + if constexpr (!contains_strings) { this->create_strings(size, stream); } + + if constexpr (contains_strings) { + // size + 1 for final offset. _string_data will be initialized later. 
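The size + 1 is the usual offsets convention: N strings need N + 1 offsets bracketing them in a single char buffer, so the final entry doubles as the total byte count. Illustrated host-side:

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

int main()
{
  std::string const chars = "foobarbaz";
  std::vector<int32_t> const offsets = {0, 3, 6, 9};  // 3 strings, 4 offsets
  auto const nth = [&](int i) { return chars.substr(offsets[i], offsets[i + 1] - offsets[i]); };
  assert(nth(1) == "bar");
  assert(offsets.back() == static_cast<int32_t>(chars.size()));
}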
+ _data = create_data(data_type{type_id::INT32}, size + 1, stream, mr); + } break; // list columns store a buffer of int32's as offsets to represent @@ -73,26 +92,36 @@ namespace { * @param buff The old output buffer * @param new_buff The new output buffer */ -void copy_buffer_data(column_buffer const& buff, column_buffer& new_buff) +template +void copy_buffer_data(column_buffer const& buff, + column_buffer& new_buff) { new_buff.name = buff.name; new_buff.user_data = buff.user_data; for (auto const& child : buff.children) { - auto& new_child = new_buff.children.emplace_back(column_buffer(child.type, child.is_nullable)); + auto& new_child = new_buff.children.emplace_back( + column_buffer(child.type, child.is_nullable)); copy_buffer_data(child, new_child); } } } // namespace -column_buffer column_buffer::empty_like(column_buffer const& input) +template +column_buffer column_buffer::empty_like( + column_buffer const& input) { auto new_buff = column_buffer(input.type, input.is_nullable); copy_buffer_data(input, new_buff); return new_buff; } -std::unique_ptr make_column(column_buffer& buffer, +// force instantiation of both column_buffers +template class column_buffer; +template class column_buffer; + +template +std::unique_ptr make_column(column_buffer& buffer, column_name_info* schema_info, std::optional const& schema, rmm::cuda_stream_view stream) @@ -100,46 +129,6 @@ std::unique_ptr make_column(column_buffer& buffer, if (schema_info != nullptr) { schema_info->name = buffer.name; } switch (buffer.type.id()) { - case type_id::STRING: - if (schema.value_or(reader_column_schema{}).is_enabled_convert_binary_to_strings()) { - if (schema_info != nullptr) { - schema_info->children.push_back(column_name_info{"offsets"}); - schema_info->children.push_back(column_name_info{"chars"}); - } - - // make_strings_column allocates new memory, it does not simply move - // from the inputs, so we need to pass it the memory resource given to - // the buffer on construction so that the memory is allocated using the - // resource that the calling code expected. 
- return make_strings_column(*buffer._strings, stream, buffer.mr); - } else { - // convert to binary - auto const string_col = make_strings_column(*buffer._strings, stream, buffer.mr); - auto const num_rows = string_col->size(); - auto const null_count = string_col->null_count(); - auto col_content = string_col->release(); - - // convert to uint8 column, strings are currently stored as int8 - auto contents = - col_content.children[strings_column_view::chars_column_index].release()->release(); - auto data = contents.data.release(); - - auto uint8_col = std::make_unique( - data_type{type_id::UINT8}, data->size(), std::move(*data), rmm::device_buffer{}, 0); - - if (schema_info != nullptr) { - schema_info->children.push_back(column_name_info{"offsets"}); - schema_info->children.push_back(column_name_info{"binary"}); - } - - return make_lists_column( - num_rows, - std::move(col_content.children[strings_column_view::offsets_column_index]), - std::move(uint8_col), - null_count, - std::move(*col_content.null_mask)); - } - case type_id::LIST: { // make offsets column auto offsets = @@ -160,7 +149,8 @@ std::unique_ptr make_column(column_buffer& buffer, // make child column CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); - auto child = make_column(buffer.children[0], child_info, child_schema, stream); + auto child = cudf::io::detail::make_column( + buffer.children[0], child_info, child_schema, stream); // make the final list column (note : size is the # of offsets, so our actual # of rows is 1 // less) @@ -189,8 +179,8 @@ std::unique_ptr make_column(column_buffer& buffer, ? std::make_optional(schema->child(i)) : std::nullopt; - output_children.emplace_back( - make_column(buffer.children[i], child_info, child_schema, stream)); + output_children.emplace_back(cudf::io::detail::make_column( + buffer.children[i], child_info, child_schema, stream)); } return make_structs_column(buffer.size, @@ -211,10 +201,8 @@ std::unique_ptr make_column(column_buffer& buffer, } } -/** - * @copydoc cudf::io::detail::empty_like - */ -std::unique_ptr empty_like(column_buffer& buffer, +template +std::unique_ptr empty_like(column_buffer& buffer, column_name_info* schema_info, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -235,7 +223,8 @@ std::unique_ptr empty_like(column_buffer& buffer, // make child column CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); - auto child = empty_like(buffer.children[0], child_info, stream, mr); + auto child = + cudf::io::detail::empty_like(buffer.children[0], child_info, stream, mr); // make the final list column return make_lists_column( @@ -248,13 +237,14 @@ std::unique_ptr empty_like(column_buffer& buffer, std::transform(buffer.children.begin(), buffer.children.end(), std::back_inserter(output_children), - [&](column_buffer& col) { + [&](auto& col) { column_name_info* child_info = nullptr; if (schema_info != nullptr) { schema_info->children.push_back(column_name_info{""}); child_info = &schema_info->children.back(); } - return empty_like(col, child_info, stream, mr); + return cudf::io::detail::empty_like( + col, child_info, stream, mr); }); return make_structs_column( @@ -265,6 +255,159 @@ std::unique_ptr empty_like(column_buffer& buffer, } } +} // namespace utilities + +template <> +std::unique_ptr make_column(utilities::column_buffer& buffer, + column_name_info* schema_info, + std::optional const& schema, + rmm::cuda_stream_view stream) +{ + if (schema_info != nullptr) { schema_info->name = buffer.name; 
} + + if (buffer.type.id() == type_id::STRING) { + if (schema.value_or(reader_column_schema{}).is_enabled_convert_binary_to_strings()) { + if (schema_info != nullptr) { + schema_info->children.push_back(column_name_info{"offsets"}); + schema_info->children.push_back(column_name_info{"chars"}); + } + + // make_strings_column allocates new memory, it does not simply move + // from the inputs, so we need to pass it the memory resource given to + // the buffer on construction so that the memory is allocated using the + // resource that the calling code expected. + return make_strings_column(*buffer._strings, stream, buffer.mr); + } else { + // convert to binary + auto const string_col = make_strings_column(*buffer._strings, stream, buffer.mr); + auto const num_rows = string_col->size(); + auto const null_count = string_col->null_count(); + auto col_content = string_col->release(); + + // convert to uint8 column, strings are currently stored as int8 + auto contents = + col_content.children[strings_column_view::chars_column_index].release()->release(); + auto data = contents.data.release(); + + auto uint8_col = std::make_unique( + data_type{type_id::UINT8}, data->size(), std::move(*data), rmm::device_buffer{}, 0); + + if (schema_info != nullptr) { + schema_info->children.push_back(column_name_info{"offsets"}); + schema_info->children.push_back(column_name_info{"binary"}); + } + + return make_lists_column( + num_rows, + std::move(col_content.children[strings_column_view::offsets_column_index]), + std::move(uint8_col), + null_count, + std::move(*col_content.null_mask)); + } + } + + // not a string + return utilities::make_column(buffer, schema_info, schema, stream); +} + +template <> +std::unique_ptr make_column(utilities::column_buffer& buffer, + column_name_info* schema_info, + std::optional const& schema, + rmm::cuda_stream_view stream) +{ + if (schema_info != nullptr) { schema_info->name = buffer.name; } + + if (buffer.type.id() == type_id::STRING) { + auto make_string_col = [stream](auto& buffer) { + // no need for copies, just transfer ownership of the data_buffers to the columns + auto mr = buffer._string_data.memory_resource(); + auto state = mask_state::UNALLOCATED; + auto str_col = + buffer._string_data.size() == 0 + ? 
make_empty_column(data_type{type_id::INT8}) + : std::make_unique(data_type{type_id::INT8}, + buffer.size, + std::move(buffer._string_data), + cudf::detail::create_null_mask(buffer.size, state, stream, mr), + state_null_count(state, buffer.size), + std::vector>{}); + auto offsets_col = + std::make_unique(data_type{type_to_id()}, + buffer.size + 1, + std::move(buffer._data), + cudf::detail::create_null_mask(buffer.size + 1, state, stream, mr), + state_null_count(state, buffer.size + 1), + std::vector>{}); + + return make_strings_column(buffer.size, + std::move(offsets_col), + std::move(str_col), + buffer.null_count(), + std::move(buffer._null_mask)); + }; + + if (schema.value_or(reader_column_schema{}).is_enabled_convert_binary_to_strings()) { + if (schema_info != nullptr) { + schema_info->children.push_back(column_name_info{"offsets"}); + schema_info->children.push_back(column_name_info{"chars"}); + } + + return make_string_col(buffer); + } else { + // convert to binary + auto const string_col = make_string_col(buffer); + auto const num_rows = string_col->size(); + auto const null_count = string_col->null_count(); + auto col_content = string_col->release(); + + // convert to uint8 column, strings are currently stored as int8 + auto contents = + col_content.children[strings_column_view::chars_column_index].release()->release(); + auto data = contents.data.release(); + + auto uint8_col = std::make_unique( + data_type{type_id::UINT8}, data->size(), std::move(*data), rmm::device_buffer{}, 0); + + if (schema_info != nullptr) { + schema_info->children.push_back(column_name_info{"offsets"}); + schema_info->children.push_back(column_name_info{"binary"}); + } + + return make_lists_column( + num_rows, + std::move(col_content.children[strings_column_view::offsets_column_index]), + std::move(uint8_col), + null_count, + std::move(*col_content.null_mask)); + } + } + + // not a string + return utilities::make_column(buffer, schema_info, schema, stream); +} + +/** + * @copydoc cudf::io::detail::empty_like + */ +template <> +std::unique_ptr empty_like(utilities::column_buffer& buffer, + column_name_info* schema_info, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return utilities::empty_like(buffer, schema_info, stream, mr); +} + +template <> +std::unique_ptr empty_like(utilities::column_buffer& buffer, + column_name_info* schema_info, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return utilities::empty_like(buffer, schema_info, stream, mr); +} + } // namespace detail } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 1535cc5c06a..a186fde3ac6 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -62,11 +62,42 @@ inline rmm::device_buffer create_data(data_type type, using string_index_pair = thrust::pair; +namespace utilities { + +struct column_buffer_with_pointers { + void create_strings(size_type _size, rmm::cuda_stream_view stream); + + std::optional str_data() + { + return _strings ? std::optional(_strings->data()) : std::nullopt; + } + std::optional str_data_size() const + { + return _strings ? 
std::optional(_strings->size()) : std::nullopt; + } + + std::unique_ptr> _strings; +}; + +struct column_buffer_with_strings { + void create_strings(size_type _size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + + void* string_data() { return _string_data.data(); } + size_t string_size() const { return _string_data.size(); } + + rmm::device_buffer _string_data{}; +}; + /** * @brief Class for holding device memory buffers to column data that eventually * will be used to create a column. */ -struct column_buffer { +template +struct column_buffer : std::conditional::type { column_buffer() = default; // construct without a known size. call create() later to actually @@ -85,19 +116,47 @@ struct column_buffer { } // move constructor - column_buffer(column_buffer&& col) = default; - column_buffer& operator=(column_buffer&& col) = default; + column_buffer(column_buffer&& col) = default; + column_buffer& operator=(column_buffer&& col) = default; // copy constructor - column_buffer(column_buffer const& col) = delete; - column_buffer& operator=(column_buffer const& col) = delete; + column_buffer(column_buffer const& col) = delete; + column_buffer& operator=(column_buffer const& col) = delete; // instantiate a column of known type with a specified size. Allows deferred creation for // preprocessing steps such as in the Parquet reader void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); - auto data() { return _strings ? _strings->data() : _data.data(); } - auto data_size() const { return _strings ? _strings->size() : _data.size(); } + template + typename std::enable_if::type create_string_data(size_t num_bytes, + rmm::cuda_stream_view stream) + { + this->create_strings(num_bytes, stream, mr); + } + + template + typename std::enable_if::type data() + { + return _data.data(); + } + + template + typename std::enable_if::type data() + { + return this->str_data().value_or(_data.data()); + } + + template + typename std::enable_if::type data_size() const + { + return _data.size(); + } + + template + typename std::enable_if::type data_size() const + { + return this->str_data_size().value_or(_data.size()); + } template auto null_mask() @@ -110,9 +169,8 @@ struct column_buffer { // Create a new column_buffer that has empty data but with the same basic information as the // input column, including same type, nullability, name, and user_data. - static column_buffer empty_like(column_buffer const& input); + static column_buffer empty_like(column_buffer const& input); - std::unique_ptr> _strings; rmm::device_buffer _data{}; rmm::device_buffer _null_mask{}; size_type _null_count{0}; @@ -120,13 +178,17 @@ struct column_buffer { data_type type{type_id::EMPTY}; bool is_nullable{false}; size_type size{0}; - std::vector children; + std::vector> children; uint32_t user_data{0}; // arbitrary user data std::string name; rmm::mr::device_memory_resource* mr; }; +} // namespace utilities + +using column_buffer = utilities::column_buffer; + /** * @brief Creates a column from an existing set of device memory buffers. 
* @@ -138,7 +200,8 @@ struct column_buffer { * * @return `std::unique_ptr` Column from the existing device data */ -std::unique_ptr make_column(column_buffer& buffer, +template +std::unique_ptr make_column(utilities::column_buffer& buffer, column_name_info* schema_info, std::optional const& schema, rmm::cuda_stream_view stream); @@ -158,7 +221,8 @@ std::unique_ptr make_column(column_buffer& buffer, * * @return `std::unique_ptr` Column from the existing device data */ -std::unique_ptr empty_like(column_buffer& buffer, +template +std::unique_ptr empty_like(utilities::column_buffer& buffer, column_name_info* schema_info, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From 90e214c613720f5ff0ceef9db2c636c7d1a0fcd9 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 25 Apr 2023 21:29:30 -0700 Subject: [PATCH 008/114] works except skip_rows --- cpp/src/io/parquet/page_data.cu | 233 +++++++++++++++++++++++++++++--- 1 file changed, 212 insertions(+), 21 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 3c351e600ad..5a8f9a20b10 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -21,8 +21,10 @@ #include #include +#include #include #include +#include #include #include @@ -109,6 +111,70 @@ struct page_state_buffers_s { uint32_t str_len[non_zero_buffer_size]; // String length for plain encoding of strings }; +// stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. +// copies from src to dst in 16B chunks per thread. +__device__ void wideStrcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) +{ + using cudf::detail::warp_size; + using cudf::strings::detail::load_uint4; + + constexpr size_t out_datatype_size = sizeof(uint4); + constexpr size_t in_datatype_size = sizeof(uint); + + auto const alignment_offset = reinterpret_cast(dst) % out_datatype_size; + uint4* out_chars_aligned = reinterpret_cast(dst - alignment_offset); + auto const in_start = src; + + // Both `out_start_aligned` and `out_end_aligned` are indices into `dst`. + // `out_start_aligned` is the first 16B aligned memory location after `dst + 4`. + // `out_end_aligned` is the last 16B aligned memory location before `len - 4`. Characters + // between `[out_start_aligned, out_end_aligned)` will be copied using uint4. + // `dst + 4` and `len - 4` are used instead of `dst` and `len` to avoid + // `load_uint4` reading beyond string boundaries. + // use signed int since out_end_aligned can be negative. + int64_t out_start_aligned = (in_datatype_size + alignment_offset + out_datatype_size - 1) / + out_datatype_size * out_datatype_size - + alignment_offset; + int64_t out_end_aligned = + (len - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size - + alignment_offset; + + for (int64_t ichar = out_start_aligned + lane_id * out_datatype_size; ichar < out_end_aligned; + ichar += warp_size * out_datatype_size) { + *(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) = + load_uint4((const char*)in_start + ichar); + } + + // Tail logic: copy characters of the current string outside + // `[out_start_aligned, out_end_aligned)`. + if (out_end_aligned <= out_start_aligned) { + // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the + // entire string. + for (int64_t ichar = lane_id; ichar < len; ichar += warp_size) { + dst[ichar] = in_start[ichar]; + } + } else { + // Copy characters in range `[0, out_start_aligned)`. 
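The head/body/tail split here is the standard aligned-copy pattern; it can be easier to follow next to a deliberately simpler variant that only vectorizes when both pointers happen to share 4-byte alignment (warp_memcpy is hypothetical; the real wideStrcpy goes further, using 16-byte stores plus load_uint4 to tolerate unaligned sources):

#include <cstdint>

__device__ void warp_memcpy(uint8_t* dst, uint8_t const* src, size_t len, int lane)
{
  bool const aligned =
    ((reinterpret_cast<std::uintptr_t>(dst) | reinterpret_cast<std::uintptr_t>(src)) & 3) == 0;
  if (aligned) {
    size_t const words = len / 4;
    for (size_t i = lane; i < words; i += 32) {  // 4-byte body
      reinterpret_cast<uint32_t*>(dst)[i] = reinterpret_cast<uint32_t const*>(src)[i];
    }
    for (size_t i = words * 4 + lane; i < len; i += 32) { dst[i] = src[i]; }  // tail bytes
  } else {
    for (size_t i = lane; i < len; i += 32) { dst[i] = src[i]; }  // byte fallback
  }
}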
+ if (lane_id < out_start_aligned) { dst[lane_id] = in_start[lane_id]; } + // Copy characters in range `[out_end_aligned, len)`. + int64_t ichar = out_end_aligned + lane_id; + if (ichar < len) { dst[ichar] = in_start[ichar]; } + } +} + +// data parallel strcpy +__device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) +{ + using cudf::detail::warp_size; + if (len > 64) { + wideStrcpy(dst, src, len, lane_id); + } else { + for (int i = lane_id; i < len; i += warp_size) { + dst[i] = src[i]; + } + } +} + /** * @brief Returns whether or not a page spans either the beginning or the end of the * specified row bounds @@ -1186,18 +1252,29 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, output_offset = nesting_info->page_start_value; } - nesting_info->data_out = static_cast(s->col.column_data_base[idx]); + if (s->col.column_data_base != nullptr) { + nesting_info->data_out = static_cast(s->col.column_data_base[idx]); + nesting_info->string_out = static_cast(s->col.column_string_base[idx]); - if (nesting_info->data_out != nullptr) { - // anything below max depth with a valid data pointer must be a list, so the - // element size is the size of the offset type. - uint32_t len = idx < max_depth - 1 ? sizeof(cudf::size_type) : s->dtype_len; - nesting_info->data_out += (output_offset * len); - } - nesting_info->valid_map = s->col.valid_map_base[idx]; - if (nesting_info->valid_map != nullptr) { - nesting_info->valid_map += output_offset >> 5; - nesting_info->valid_map_offset = (int32_t)(output_offset & 0x1f); + nesting_info->data_out = static_cast(s->col.column_data_base[idx]); + + if (nesting_info->data_out != nullptr) { + // anything below max depth with a valid data pointer must be a list, so the + // element size is the size of the offset type. + uint32_t len = idx < max_depth - 1 ? sizeof(cudf::size_type) : s->dtype_len; + // if this is a string column, then dtype_len is a lie. data will be offsets rather + // than (ptr,len) tuples. 
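Concretely, under this scheme each decoded string contributes one int32 offset to data_out and its bytes to the page's slice of the column-wide char buffer. A host model of that write pattern (write_page_strings and all of its parameters are hypothetical; the running offset plays the role of last_offset in the decode kernel later in this patch):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

void write_page_strings(std::vector<std::string> const& vals,
                        int64_t page_str_offset,        // this page's slice of the char buffer
                        std::vector<int32_t>& offsets,  // column-wide offsets column
                        std::vector<char>& chars,       // column-wide char buffer
                        std::size_t first_out_row)
{
  int64_t off = page_str_offset;
  for (std::size_t i = 0; i < vals.size(); ++i) {
    offsets[first_out_row + i] = static_cast<int32_t>(off);  // absolute offset, not page-local
    std::memcpy(chars.data() + off, vals[i].data(), vals[i].size());
    off += static_cast<int64_t>(vals[i].size());
  }
}

int main()
{
  std::vector<std::string> const vals = {"foo", "quux"};
  std::vector<int32_t> offsets(3);  // the real buffer holds size + 1 entries
  std::vector<char> chars(16);
  write_page_strings(vals, 9, offsets, chars, 0);
  assert(offsets[0] == 9 && offsets[1] == 12);
}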
+ if (data_type == BYTE_ARRAY && s->dtype_len != 4) { len = sizeof(cudf::size_type); } + nesting_info->data_out += (output_offset * len); + } + if (nesting_info->string_out != nullptr) { + nesting_info->string_out += s->page.str_offset; + } + nesting_info->valid_map = s->col.valid_map_base[idx]; + if (nesting_info->valid_map != nullptr) { + nesting_info->valid_map += output_offset >> 5; + nesting_info->valid_map_offset = (int32_t)(output_offset & 0x1f); + } } } } @@ -1786,7 +1863,7 @@ __device__ std::pair page_bounds(page_state_s* const s, int const max_def = s->nesting_info[max_depth - 1].max_def_level; // can skip all this if we know there are no nulls - if (max_def == 0) { return {0, s->page.num_input_values}; } + if (max_def == 0 && !is_bounds_pg) { return {0, s->num_input_values}; } int start_value = 0; int end_value = s->page.num_input_values; @@ -1817,6 +1894,7 @@ __device__ std::pair page_bounds(page_state_s* const s, int processed = 0; // if this is a bounds page, we need to do extra work to find the start and/or end value index + // TODO calculate num_nulls if (is_bounds_pg) { __shared__ int skipped_leaf_values; __shared__ int end_val_idx; @@ -1831,6 +1909,9 @@ __device__ std::pair page_bounds(page_state_s* const s, : max_row - (page_start_row + begin_row); auto const end_row = begin_row + page_rows; + // short circuit for no nulls + if (max_def == 0 && !has_repetition) { return {begin_row, page_rows}; } + int row_count = 0; int leaf_count = 0; bool skipped_values_set = false; @@ -1963,6 +2044,7 @@ __device__ size_t countDictEntries(uint8_t const* data, { uint8_t const* ptr = data; uint8_t const* const end = data + data_size; + int const bytecnt = (dict_bits + 7) >> 3; size_t str_len = 0; // total sum for runs size_t l_str_len = 0; // partial sums across literal runs int pos = 0; @@ -1975,7 +2057,6 @@ __device__ size_t countDictEntries(uint8_t const* data, dict_run = (ptr < end) ? 
get_vlq32(ptr, end) : 0; if (!(dict_run & 1)) { // Repeated value - int bytecnt = (dict_bits + 7) >> 3; if (ptr + bytecnt <= end) { int32_t run_val = ptr[0]; if (bytecnt > 1) { @@ -1995,7 +2076,7 @@ __device__ size_t countDictEntries(uint8_t const* data, if (dict_run & 1) { // Literal batch: must output a multiple of 8, except for the last batch int batch_len_div8; - batch_len = max(min(128, (int)(dict_run >> 1) * 8), 1); + batch_len = max(min(preprocess_block_size, (int)(dict_run >> 1) * 8), 1); batch_len_div8 = (batch_len + 7) >> 3; dict_run -= batch_len_div8 * 2; ptr += batch_len_div8 * dict_bits; @@ -2050,14 +2131,14 @@ __device__ size_t countDictEntries(uint8_t const* data, } pos += batch_len; - // if (t == 0) printf("pos %d str_len %ld\n", pos, str_len); } + __syncthreads(); using block_reduce = cub::BlockReduce; - typename block_reduce::TempStorage reduce_storage; - str_len += block_reduce(reduce_storage).Sum(l_str_len); + __shared__ typename block_reduce::TempStorage reduce_storage; + size_t sum_l = block_reduce(reduce_storage).Sum(l_str_len); - return str_len; + return str_len + sum_l; } __device__ size_t @@ -2141,8 +2222,12 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // find start/end value indices auto const [start_value, end_value] = page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders, t); + + // need to save num_nulls calculated in page_bounds in this page + // FIXME: num_nulls is only correct for !is_bounds_pg...need to fix this + if (t==0) { pp->num_nulls = s->page.num_nulls; } #if 0 - if (t == 0) + if (t == 0 && col->src_col_index == 0) printf("%05d: start_val %d end_val %d is_bounds %d is_contained %d (%ld,%ld] (%ld,%ld]\n", blockIdx.x, start_value, @@ -2191,7 +2276,11 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz break; } - if (t == 0) { pp->str_bytes = str_bytes; } + if (t == 0) { + // TODO check for overflow + pp->str_bytes = str_bytes; + // printf("%05d: string size %ld %d\n", blockIdx.x, str_bytes, col->src_col_index); + } } /** @@ -2425,6 +2514,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( { __shared__ __align__(16) page_state_s state_g; __shared__ __align__(16) page_state_buffers_s state_buffers; + __shared__ __align__(4) size_type last_offset; page_state_s* const s = &state_g; page_state_buffers_s* const sb = &state_buffers; @@ -2450,6 +2540,14 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + // offsets is global...but the output is local, so account for that below + if (t == 0) { last_offset = s->page.str_offset; } + + // choose a character parallel string copy when the average string is longer than a warp + auto const use_char_ll = (s->page.str_bytes / s->page.num_input_values) > cudf::detail::warp_size; + + __syncthreads(); + // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. 
// // corner case: in the case of lists, we can have pages that contain "0" rows if the current row @@ -2528,6 +2626,69 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( // if (!has_repetition) { dst_pos -= s->first_row; } + // need to do this before we branch on src_pos/dst_pos so we don't deadlock + if (dtype == BYTE_ARRAY && s->dtype_len != 4) { + int leaf_level_index = s->col.max_nesting_depth - 1; + int me = t - out_thread0; + if (me < 32) { + for (int i = 0; i < decode_block_size - out_thread0; i += 32) { + auto [ptr, len] = src_pos + i < target_pos + ? gpuGetStringData(s, sb, src_pos + i) + : cuda::std::pair{nullptr, 0}; + + __shared__ cub::WarpScan::TempStorage temp_storage; + size_type offset; + cub::WarpScan(temp_storage).ExclusiveSum(len, offset); + offset += last_offset; + + dst_pos = sb->nz_idx[rolling_index(src_pos + i)]; + if (!has_repetition) { dst_pos -= s->first_row; } + + if (use_char_ll) { + // TODO: might want separate kernel for string page decoding so we don't waste all + // this shared memory on non-string columns. + __shared__ __align__(8) uint8_t const* pointers[32]; + __shared__ __align__(4) size_type offsets[32]; + __shared__ __align__(4) int dsts[32]; + __shared__ __align__(4) int lengths[32]; + + offsets[me] = offset; + pointers[me] = reinterpret_cast(ptr); + dsts[me] = dst_pos; + lengths[me] = len; + __syncwarp(); + + for (int ss = 0; ss < 32 && ss + i + s->src_pos < target_pos; ss++) { + if (dsts[me] >= 0) { + auto offptr = + reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + + dsts[ss]; + *offptr = offsets[ss]; + auto str_ptr = nesting_info_base[leaf_level_index].string_out + offsets[ss] - + s->page.str_offset; + ll_strcpy(str_ptr, pointers[ss], lengths[ss], me); + } + } + + } else { + if (src_pos + i < target_pos && dst_pos >= 0) { + auto offptr = + reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + + dst_pos; + *offptr = offset; + auto str_ptr = + nesting_info_base[leaf_level_index].string_out + offset - s->page.str_offset; + memcpy(str_ptr, ptr, len); + } + __syncwarp(); + } + + if (me == 31) { last_offset = offset + len; } + __syncwarp(); + } + } + } + // target_pos will always be properly bounded by num_rows, but dst_pos may be negative // (values before first_row) in the flat hierarchy case. if (src_pos < target_pos && dst_pos >= 0) { @@ -2555,7 +2716,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( gpuOutputByteArrayAsInt(ptr, len, static_cast<__int128_t*>(dst)); } } else { - gpuOutputString(s, sb, val_src_pos, dst); + // test for string hashes + if (dtype_len == 4) { gpuOutputString(s, sb, val_src_pos, dst); } } } else if (dtype == BOOLEAN) { gpuOutputBoolean(sb, val_src_pos, static_cast(dst)); @@ -2597,6 +2759,35 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( } __syncthreads(); } + + // if there are nulls and this is a string column, clean up the offsets array. + // but if there's a list parent, then no need. + if (s->page.num_input_values != s->nz_count) { + int dtype = s->col.data_type & 7; + if (dtype == BYTE_ARRAY && s->dtype_len != 4) { + int leaf_level_index = s->col.max_nesting_depth - 1; + auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); + + if (nesting_info_base[leaf_level_index].null_count > 0) { + // if nz_count is 0, then it's all nulls. 
set all offsets to str_offset + if (s->nz_count == 0) { + for (int i = t; i < s->page.num_input_values; i += decode_block_size) { + offptr[i] = s->page.str_offset; + } + } + // just some nulls, do this serially for now + else if (t == 0) { + if (offptr[s->num_input_values - 1] == 0) { + offptr[s->num_input_values - 1] = s->page.str_offset + s->page.str_bytes; + } + for (int i = s->num_input_values - 2; i > 0; i--) { + if (offptr[i] == 0) { offptr[i] = offptr[i + 1]; } + } + offptr[0] = s->page.str_offset; + } + } + } + } } } // anonymous namespace From 567a0ab0dba11f5fb061c471a92cf466b5d40089 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 25 Apr 2023 22:06:28 -0700 Subject: [PATCH 009/114] fix bug with skip_rows --- cpp/src/io/parquet/page_data.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 5a8f9a20b10..96bfcd3e2aa 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -2632,8 +2632,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( int me = t - out_thread0; if (me < 32) { for (int i = 0; i < decode_block_size - out_thread0; i += 32) { - auto [ptr, len] = src_pos + i < target_pos - ? gpuGetStringData(s, sb, src_pos + i) + auto [ptr, len] = src_pos + i < target_pos && dst_pos >= 0 + ? gpuGetStringData(s, sb, src_pos + skipped_leaf_values + i) : cuda::std::pair{nullptr, 0}; __shared__ cub::WarpScan::TempStorage temp_storage; From fb45e8c7301be789da157263475a0ff431449f07 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 25 Apr 2023 22:07:42 -0700 Subject: [PATCH 010/114] debug prints --- cpp/src/io/parquet/reader_impl.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 4286756fbec..c6ad2996126 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -62,6 +62,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) } } } + // for (size_t i=0; i < col_sizes.size(); i++) + // printf("col %ld size %ld\n", i, col_sizes[i]); // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector // to store all per-chunk pointers to nested data/nullmask. 
`chunk_offsets[i]` will store the @@ -201,6 +203,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) sizeof(size_type), cudaMemcpyDefault, _stream.value()); + // printf("col %ld sz %d colsize %d\n", idx, out_buf.size, sz); } } } From 6d897523702bfc28da61edb7e04c3b64ce3cc6d5 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 26 Apr 2023 10:32:58 -0700 Subject: [PATCH 011/114] fix bug in page_bounds --- cpp/src/io/parquet/page_data.cu | 39 +++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 96bfcd3e2aa..1f7c4ce85b5 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -1910,7 +1910,7 @@ __device__ std::pair page_bounds(page_state_s* const s, auto const end_row = begin_row + page_rows; // short circuit for no nulls - if (max_def == 0 && !has_repetition) { return {begin_row, page_rows}; } + if (max_def == 0 && !has_repetition) { return {begin_row, end_row}; } int row_count = 0; int leaf_count = 0; @@ -2222,22 +2222,24 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // find start/end value indices auto const [start_value, end_value] = page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders, t); - + // need to save num_nulls calculated in page_bounds in this page // FIXME: num_nulls is only correct for !is_bounds_pg...need to fix this - if (t==0) { pp->num_nulls = s->page.num_nulls; } + if (t == 0) { pp->num_nulls = s->page.num_nulls; } #if 0 - if (t == 0 && col->src_col_index == 0) - printf("%05d: start_val %d end_val %d is_bounds %d is_contained %d (%ld,%ld] (%ld,%ld]\n", - blockIdx.x, - start_value, - end_value, - is_bounds_pg, - is_page_contained(s, min_row, num_rows), - min_row, - min_row + num_rows, - col->start_row + pp->chunk_row, - col->start_row + pp->chunk_row + pp->num_rows); + if (t == 0) + printf( + "%05d: col %d start_val %d end_val %d is_bounds %d is_contained %d (%ld,%ld] (%ld,%ld]\n", + blockIdx.x, + col->src_col_index, + start_value, + end_value, + is_bounds_pg, + is_page_contained(s, min_row, num_rows), + min_row, + min_row + num_rows, + col->start_row + pp->chunk_row, + col->start_row + pp->chunk_row + pp->num_rows); #endif // now process string info in the range [start_value, end_value) @@ -2679,6 +2681,15 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( auto str_ptr = nesting_info_base[leaf_level_index].string_out + offset - s->page.str_offset; memcpy(str_ptr, ptr, len); +#if 0 + printf("%05d,%03d: src %d dst %d len %ld offset %d\n", + blockIdx.x, + t, + src_pos + i, + dst_pos, + len, + offset); +#endif } __syncwarp(); } From 5035703a88cc4c2c88613520234b1eb185eb4e08 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 26 Apr 2023 11:53:42 -0700 Subject: [PATCH 012/114] optimization for countDictEntries --- cpp/src/io/parquet/page_data.cu | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 1f7c4ce85b5..609d2fa9807 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -2086,13 +2086,16 @@ __device__ size_t countDictEntries(uint8_t const* data, } int is_literal = dict_run & 1; + //if (t == 0 && blockIdx.x == 1) printf("batch_len %d is_lit %d\n", batch_len, is_literal); // compute dictionary index. 
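      // [illustrative note, not part of the patch] the RLE/bit-packed hybrid puts a
      // varint header before each run; an odd header marks a literal run of packed
      // indices, an even one a repeated value. In the literal branch below each
      // thread extracts its own dict_bits-wide index relative to the already
      // advanced ptr, e.g. with dict_bits == 3 and batch_len == 8, thread tt == 0
      // gets ofs == (0 - 8) * 3 == -24, i.e. p == ptr - 3 and bit offset 0, the
      // first index of the batch.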
if (is_literal) { int dict_idx = 0; - if (t < batch_len) { + // reverse threads so thread 0 can process a repeat run while upper threads do literals + int tt = preprocess_block_size - 1 - t; + if (tt < batch_len) { dict_idx = dict_val; - int32_t ofs = (t - ((batch_len + 7) & ~7)) * dict_bits; + int32_t ofs = (tt - ((batch_len + 7) & ~7)) * dict_bits; const uint8_t* p = ptr + (ofs >> 3); ofs &= 7; if (p < end) { @@ -2110,9 +2113,9 @@ __device__ size_t countDictEntries(uint8_t const* data, dict_idx &= (1 << dict_bits) - 1; } - if (pos + t < end_value) { + if (pos + tt < end_value) { uint32_t const dict_pos = (dict_bits > 0) ? dict_idx * sizeof(string_index_pair) : 0; - if (pos + t >= start_value && dict_pos < (uint32_t)dict_size) { + if (pos + tt >= start_value && dict_pos < (uint32_t)dict_size) { const auto* src = reinterpret_cast(dict_base + dict_pos); l_str_len += src->second; } From 37f7d463e03ade85633abdb360ce5d73d672d5bd Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 26 Apr 2023 15:38:21 -0700 Subject: [PATCH 013/114] fix another skip_rows bug, and round robin the countDictEntries calc --- cpp/src/io/parquet/page_data.cu | 39 +++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 609d2fa9807..71847f17005 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -2045,9 +2045,9 @@ __device__ size_t countDictEntries(uint8_t const* data, uint8_t const* ptr = data; uint8_t const* const end = data + data_size; int const bytecnt = (dict_bits + 7) >> 3; - size_t str_len = 0; // total sum for runs - size_t l_str_len = 0; // partial sums across literal runs - int pos = 0; + size_t l_str_len = 0; // partial sums across threads + int pos = 0; // current value index in the data stream + int t0 = 0; // thread 0 for this batch int dict_run = 0; int dict_val = 0; @@ -2086,16 +2086,18 @@ __device__ size_t countDictEntries(uint8_t const* data, } int is_literal = dict_run & 1; - //if (t == 0 && blockIdx.x == 1) printf("batch_len %d is_lit %d\n", batch_len, is_literal); + // if (t == 0 && blockIdx.x == 1) printf("batch_len %d is_lit %d\n", batch_len, is_literal); + + // calculate my thread id for this batch. way to round-robin the work. + int mytid = t - t0; + if (mytid < 0) mytid += preprocess_block_size; // compute dictionary index. if (is_literal) { int dict_idx = 0; - // reverse threads so thread 0 can process a repeat run while upper threads do literals - int tt = preprocess_block_size - 1 - t; - if (tt < batch_len) { + if (mytid < batch_len) { dict_idx = dict_val; - int32_t ofs = (tt - ((batch_len + 7) & ~7)) * dict_bits; + int32_t ofs = (mytid - ((batch_len + 7) & ~7)) * dict_bits; const uint8_t* p = ptr + (ofs >> 3); ofs &= 7; if (p < end) { @@ -2113,26 +2115,31 @@ __device__ size_t countDictEntries(uint8_t const* data, dict_idx &= (1 << dict_bits) - 1; } - if (pos + tt < end_value) { + if (pos + mytid < end_value) { uint32_t const dict_pos = (dict_bits > 0) ? dict_idx * sizeof(string_index_pair) : 0; - if (pos + tt >= start_value && dict_pos < (uint32_t)dict_size) { + if (pos + mytid >= start_value && dict_pos < (uint32_t)dict_size) { const auto* src = reinterpret_cast(dict_base + dict_pos); l_str_len += src->second; } } } + + t0 += batch_len; } else { int start_off = (pos < start_value && pos + batch_len > start_value) ? 
start_value - pos : 0; batch_len = min(batch_len, end_value - pos); - if (t == 0) { + if (mytid == 0) { uint32_t const dict_pos = (dict_bits > 0) ? dict_val * sizeof(string_index_pair) : 0; if (pos + batch_len > start_value && dict_pos < (uint32_t)dict_size) { const auto* src = reinterpret_cast(dict_base + dict_pos); - str_len += (batch_len - start_off) * src->second; + l_str_len += (batch_len - start_off) * src->second; } } + + t0 += 1; } + t0 = t0 % preprocess_block_size; pos += batch_len; } __syncthreads(); @@ -2141,7 +2148,7 @@ __device__ size_t countDictEntries(uint8_t const* data, __shared__ typename block_reduce::TempStorage reduce_storage; size_t sum_l = block_reduce(reduce_storage).Sum(l_str_len); - return str_len + sum_l; + return sum_l; } __device__ size_t @@ -2637,6 +2644,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( int me = t - out_thread0; if (me < 32) { for (int i = 0; i < decode_block_size - out_thread0; i += 32) { + dst_pos = sb->nz_idx[rolling_index(src_pos + i)]; + if (!has_repetition) { dst_pos -= s->first_row; } + auto [ptr, len] = src_pos + i < target_pos && dst_pos >= 0 ? gpuGetStringData(s, sb, src_pos + skipped_leaf_values + i) : cuda::std::pair{nullptr, 0}; @@ -2646,9 +2656,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( cub::WarpScan(temp_storage).ExclusiveSum(len, offset); offset += last_offset; - dst_pos = sb->nz_idx[rolling_index(src_pos + i)]; - if (!has_repetition) { dst_pos -= s->first_row; } - if (use_char_ll) { // TODO: might want separate kernel for string page decoding so we don't waste all // this shared memory on non-string columns. From 19396bf1a6f44a8557c42facf4807438ba7cf722 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 26 Apr 2023 16:27:23 -0700 Subject: [PATCH 014/114] fix for chunked reads --- cpp/src/io/parquet/page_data.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 71847f17005..67c29e1733e 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -2191,6 +2191,9 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz int t = threadIdx.x; PageInfo* pp = &pages[page_idx]; + // reset str_bytes to 0 in case it's already been calculated + if (t == 0) { pp->str_bytes = 0; } + // only count if it's a string column auto const col = &chunks[pp->chunk_idx]; uint32_t dtype = col->data_type & 7; From 37804940bdcc4cbd91c05df02a75c52a65c065ba Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 26 Apr 2023 23:30:29 -0700 Subject: [PATCH 015/114] fix bug with setting the offsets for null values...chunked reader still not quite happy --- cpp/src/io/parquet/page_data.cu | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 67c29e1733e..f17ac4910c9 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -1896,7 +1896,9 @@ __device__ std::pair page_bounds(page_state_s* const s, // if this is a bounds page, we need to do extra work to find the start and/or end value index // TODO calculate num_nulls if (is_bounds_pg) { + __shared__ int skipped_values; __shared__ int skipped_leaf_values; + __shared__ int last_input_value; __shared__ int end_val_idx; // need these for skip_rows case @@ -1964,6 +1966,7 @@ __device__ std::pair page_bounds(page_state_s* const s, if (global_count > 0) { // this is the thread that represents the first 
row. if (local_count == 1) { + skipped_values = idx_t; skipped_leaf_values = leaf_count + (is_new_leaf ? thread_leaf_count - 1 : thread_leaf_count); } @@ -1986,6 +1989,7 @@ __device__ std::pair page_bounds(page_state_s* const s, if (global_count > 0) { // this is the thread that represents the end row. if (local_count == 1) { + last_input_value = idx_t; end_val_idx = leaf_count + (is_new_leaf ? thread_leaf_count - 1 : thread_leaf_count); } end_value_set = true; @@ -2002,6 +2006,16 @@ __device__ std::pair page_bounds(page_state_s* const s, if (skipped_values_set) { start_value = skipped_leaf_values; } if (end_value_set) { end_value = end_val_idx; } } + + if (t == 0) { + int const v0 = skipped_values_set ? skipped_values : 0; + int const vn = end_value_set ? last_input_value : s->num_input_values; + int const total_values = vn - v0; + int const total_leaf_values = end_value - start_value; + int const num_nulls = total_values - total_leaf_values; + pp->num_nulls = num_nulls; + // printf("%05d: input vals in page %d nz %d nc %d\n", blockIdx.x, total_values, total_leaf_values, num_nulls); + } } // already filtered out unwanted pages, so need to count all non-null values in this page else { @@ -2786,25 +2800,27 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( // if there are nulls and this is a string column, clean up the offsets array. // but if there's a list parent, then no need. - if (s->page.num_input_values != s->nz_count) { + if (s->page.num_nulls != 0) { int dtype = s->col.data_type & 7; if (dtype == BYTE_ARRAY && s->dtype_len != 4) { - int leaf_level_index = s->col.max_nesting_depth - 1; + int const value_count = s->nz_count + s->page.num_nulls; + int const leaf_level_index = s->col.max_nesting_depth - 1; + auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); if (nesting_info_base[leaf_level_index].null_count > 0) { // if nz_count is 0, then it's all nulls. 
set all offsets to str_offset if (s->nz_count == 0) { - for (int i = t; i < s->page.num_input_values; i += decode_block_size) { + for (int i = t; i < value_count; i += decode_block_size) { offptr[i] = s->page.str_offset; } } // just some nulls, do this serially for now else if (t == 0) { - if (offptr[s->num_input_values - 1] == 0) { - offptr[s->num_input_values - 1] = s->page.str_offset + s->page.str_bytes; + if (offptr[value_count - 1] == 0) { + offptr[value_count - 1] = s->page.str_offset + s->page.str_bytes; } - for (int i = s->num_input_values - 2; i > 0; i--) { + for (int i = value_count - 2; i > 0; i--) { if (offptr[i] == 0) { offptr[i] = offptr[i + 1]; } } offptr[0] = s->page.str_offset; From 4373b8fc63716aacff94663d912f35c2eef16a58 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 27 Apr 2023 14:05:03 -0700 Subject: [PATCH 016/114] fix edge case where skip_rows ends on a page boundary --- cpp/src/io/parquet/page_data.cu | 100 +++++++++++++++++++++++------ cpp/src/io/parquet/parquet_gpu.hpp | 6 +- 2 files changed, 83 insertions(+), 23 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index f17ac4910c9..fd14022be38 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -1975,7 +1975,7 @@ __device__ std::pair page_bounds(page_state_s* const s, } // test if row_count will exceed end_row in this batch - if (!end_value_set && row_count + block_row_count > end_row) { + if (!end_value_set && row_count + block_row_count >= end_row) { // if this thread exceeds row bounds int const row_index = (thread_row_count + row_count) - 1; int exceeds_row_bounds = row_index >= end_row; @@ -2002,19 +2002,31 @@ __device__ std::pair page_bounds(page_state_s* const s, start_val += preprocess_block_size; } __syncthreads(); - - if (skipped_values_set) { start_value = skipped_leaf_values; } - if (end_value_set) { end_value = end_val_idx; } } + start_value = skipped_values_set ? skipped_leaf_values : 0; + end_value = end_value_set ? end_val_idx : leaf_count; + if (t == 0) { - int const v0 = skipped_values_set ? skipped_values : 0; - int const vn = end_value_set ? last_input_value : s->num_input_values; - int const total_values = vn - v0; + int const v0 = skipped_values_set ? skipped_values : 0; + int const vn = end_value_set ? 
last_input_value : s->num_input_values; + int const total_values = vn - v0; int const total_leaf_values = end_value - start_value; - int const num_nulls = total_values - total_leaf_values; - pp->num_nulls = num_nulls; - // printf("%05d: input vals in page %d nz %d nc %d\n", blockIdx.x, total_values, total_leaf_values, num_nulls); + int const num_nulls = total_values - total_leaf_values; + pp->num_nulls = num_nulls; + pp->num_valids = total_leaf_values; +#if 0 + printf("%05d: input vals in page %d,%d lc %d v0 %d vn %d %d nz %d nc %d\n", + blockIdx.x, + skipped_values_set, + end_value_set, + leaf_count, + v0, + vn, + total_values, + total_leaf_values, + num_nulls); +#endif } } // already filtered out unwanted pages, so need to count all non-null values in this page @@ -2038,7 +2050,10 @@ __device__ std::pair page_bounds(page_state_s* const s, int const null_count = block_reduce(temp_storage.reduce_storage).Sum(num_nulls); - if (t == 0) { pp->num_nulls = null_count; } + if (t == 0) { + pp->num_nulls = null_count; + pp->num_valids = pp->num_input_values - null_count; + } __syncthreads(); end_value -= pp->num_nulls; @@ -2252,7 +2267,10 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // need to save num_nulls calculated in page_bounds in this page // FIXME: num_nulls is only correct for !is_bounds_pg...need to fix this - if (t == 0) { pp->num_nulls = s->page.num_nulls; } + if (t == 0) { + pp->num_nulls = s->page.num_nulls; + pp->num_valids = s->page.num_valids; + } #if 0 if (t == 0) printf( @@ -2688,7 +2706,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( __syncwarp(); for (int ss = 0; ss < 32 && ss + i + s->src_pos < target_pos; ss++) { - if (dsts[me] >= 0) { + if (dsts[ss] >= 0) { auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dsts[ss]; @@ -2696,6 +2714,18 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( auto str_ptr = nesting_info_base[leaf_level_index].string_out + offsets[ss] - s->page.str_offset; ll_strcpy(str_ptr, pointers[ss], lengths[ss], me); +#if 0 + if (is_bounds_page(s, min_row, num_rows)) { + if (me == 0) + printf("%05d,%03d: src %d dst %d len %d offset %d\n", + blockIdx.x, + me, + src_pos + i + ss, + dsts[ss], + lengths[ss], + offsets[ss]); + } +#endif } } @@ -2709,13 +2739,15 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( nesting_info_base[leaf_level_index].string_out + offset - s->page.str_offset; memcpy(str_ptr, ptr, len); #if 0 - printf("%05d,%03d: src %d dst %d len %ld offset %d\n", - blockIdx.x, - t, - src_pos + i, - dst_pos, - len, - offset); + if (is_bounds_page(s, min_row, num_rows)) { + printf("%05d,%03d: src %d dst %d len %ld offset %d\n", + blockIdx.x, + t, + src_pos + i, + dst_pos, + len, + offset); + } #endif } __syncwarp(); @@ -2800,10 +2832,26 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( // if there are nulls and this is a string column, clean up the offsets array. // but if there's a list parent, then no need. 
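    // [illustrative note, not part of the patch] decode never writes an offset for a
    // null row, so (assuming the offsets buffer starts zeroed, which the 0-tests
    // below rely on) nulls are back-filled from the right: starts {100, 0, 102, 0, 0}
    // for rows {"ab", null, "cde", null, null} with str_offset 100 and str_bytes 5
    // become {100, 102, 102, 105, 105}, giving every null a length of
    // offptr[i + 1] - offptr[i] == 0.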
+#if 0 + if ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4) { + int const leaf_level_index = s->col.max_nesting_depth - 1; + if (t == 0 && is_bounds_page(s, min_row, num_rows)) { + printf("%05d: nz %d nulls %d valids %d iv %d nival %d nivalid %d\n", + blockIdx.x, + s->nz_count, + s->page.num_nulls, + s->page.num_valids, + s->num_input_values, + nesting_info_base[leaf_level_index].value_count, + nesting_info_base[leaf_level_index].valid_count); + } + } +#endif + if (s->page.num_nulls != 0) { int dtype = s->col.data_type & 7; if (dtype == BYTE_ARRAY && s->dtype_len != 4) { - int const value_count = s->nz_count + s->page.num_nulls; + int const value_count = s->page.num_valids + s->page.num_nulls; int const leaf_level_index = s->col.max_nesting_depth - 1; auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); @@ -2826,6 +2874,16 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( offptr[0] = s->page.str_offset; } } + __syncthreads(); +#if 0 + if (t == 0) + printf("%05d: offptr %p/%p %d %d\n", + blockIdx.x, + offptr, + offptr + value_count, + offptr[value_count - 2], + offptr[value_count - 1]); +#endif } } } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index e6f2abbb82c..b6b63ce473f 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -166,7 +166,9 @@ struct PageInfo { int32_t num_input_values; int32_t chunk_row; // starting row of this page relative to the start of the chunk int32_t num_rows; // number of rows in this page - int32_t num_nulls; // number of null values (V2 header) + // the next two are calculated in gpuComputePageStringSizes + int32_t num_nulls; // number of null values (V2 header), but recalculated for string cols + int32_t num_valids; // number of non-null values, taking into account skip_rows/num_rows int32_t chunk_idx; // column chunk this page belongs to int32_t src_col_schema; // schema index of this column uint8_t flags; // PAGEINFO_FLAGS_XXX @@ -189,7 +191,7 @@ struct PageInfo { // for string columns only, the size of all the chars in the string for // this page. only valid/computed during the base preprocess pass int32_t str_bytes; - int64_t str_offset; // offset into string data for this page + int32_t str_offset; // offset into string data for this page // nesting information (input/output) for each page. this array contains // input column nesting information, output column nesting information and From 3a39970bce22fdb2da50af1ddfde09f883c1dab7 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 27 Apr 2023 17:03:54 -0700 Subject: [PATCH 017/114] move test for long strings --- cpp/src/io/parquet/reader_impl.cpp | 42 ++++++++++++++++++------------ 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index c6ad2996126..7739103b9ac 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -44,26 +44,32 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // will be calculated as we're writing the data. once done, we'll have for each string column // a char array with the contiguous string data, and a size_type array of offsets. use these // as child columns and create string column. no need to call create_strings_column now. 
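  // [illustrative note, not part of the patch] data_type packs the physical type in
  // its low 3 bits and the output length above them (see the & 7 and >> 3 uses
  // below), so the predicate keeps byte-array columns whose output is not a fixed
  // 4-byte value. e.g. three pages of one string column with str_bytes {10, 0, 7}
  // are assigned str_offset {0, 10, 10} by the running sum that follows, leaving
  // col_sizes for that column at 17.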
+ auto const has_strings = std::any_of(pages.begin(), pages.end(), [&chunks](auto const& page) { + auto const& chunk = chunks[page.chunk_idx]; + return (chunk.data_type & 7) == BYTE_ARRAY && (chunk.data_type >> 3) != 4; + }); - gpu::ComputePageStringSizes(pages, chunks, skip_rows, num_rows, _stream); - - // TODO do the following on device with thrust/kernel to avoid the pages round trip - pages.device_to_host(_stream, true); std::vector col_sizes(_input_columns.size(), 0L); - for (auto& page : pages) { - if ((page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) == 0) { - auto const& col = chunks[page.chunk_idx]; - uint32_t dtype = col.data_type & 7; - uint32_t dtype_len_out = col.data_type >> 3; - if (dtype == BYTE_ARRAY && dtype_len_out != 4) { - size_t const offset = col_sizes[col.src_col_index]; - page.str_offset = offset; - col_sizes[col.src_col_index] = offset + page.str_bytes; + if (has_strings) { + gpu::ComputePageStringSizes(pages, chunks, skip_rows, num_rows, _stream); + + // TODO do the following on device with thrust/kernel to avoid the pages round trip + pages.device_to_host(_stream, true); + for (auto& page : pages) { + if ((page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) == 0) { + auto const& col = chunks[page.chunk_idx]; + uint32_t dtype = col.data_type & 7; + uint32_t dtype_len_out = col.data_type >> 3; + if (dtype == BYTE_ARRAY && dtype_len_out != 4) { + size_t const offset = col_sizes[col.src_col_index]; + page.str_offset = offset; + col_sizes[col.src_col_index] = offset + page.str_bytes; + } } } + // for (size_t i=0; i < col_sizes.size(); i++) + // printf("col %ld size %ld\n", i, col_sizes[i]); } - // for (size_t i=0; i < col_sizes.size(); i++) - // printf("col %ld size %ld\n", i, col_sizes[i]); // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector // to store all per-chunk pointers to nested data/nullmask. `chunk_offsets[i]` will store the @@ -154,7 +160,9 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) page_count += chunks[c].max_num_pages; } - pages.host_to_device(_stream); // FIXME: get rid of this eventually + if (has_strings) { + pages.host_to_device(_stream); // FIXME: get rid of this eventually + } chunks.host_to_device(_stream); chunk_nested_valids.host_to_device(_stream); chunk_nested_data.host_to_device(_stream); @@ -370,7 +378,7 @@ table_with_metadata reader::impl::finalize_output(table_metadata& out_metadata, // Return user metadata out_metadata.per_file_user_data = _metadata->get_key_value_metadata(); out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), - out_metadata.per_file_user_data[0].end()}; + out_metadata.per_file_user_data[0].end()}; // Finally, save the output table metadata into `_output_metadata` for reuse next time. 
_output_metadata = std::make_unique(out_metadata); From 743b3f596420e2d99bc1f67bd3a5e825d74a1593 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 27 Apr 2023 17:54:05 -0700 Subject: [PATCH 018/114] more string tweaks --- cpp/src/io/parquet/page_data.cu | 145 +++++++++++++++++--------------- 1 file changed, 75 insertions(+), 70 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index fd14022be38..be34155fc73 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -2587,13 +2587,14 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; - // offsets is global...but the output is local, so account for that below - if (t == 0) { last_offset = s->page.str_offset; } - - // choose a character parallel string copy when the average string is longer than a warp - auto const use_char_ll = (s->page.str_bytes / s->page.num_input_values) > cudf::detail::warp_size; + int const dtype = s->col.data_type & 7; + bool const is_string_col = dtype == BYTE_ARRAY && s->dtype_len != 4; - __syncthreads(); + // offsets is global...but the output is local, so account for that below + if (is_string_col) { + if (t == 0) { last_offset = s->page.str_offset; } + __syncthreads(); + } // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. // @@ -2654,7 +2655,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } } else { // WARP1..WARP3: Decode values - int dtype = s->col.data_type & 7; src_pos += t - out_thread0; // the position in the output column/buffer @@ -2674,9 +2674,13 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( if (!has_repetition) { dst_pos -= s->first_row; } // need to do this before we branch on src_pos/dst_pos so we don't deadlock - if (dtype == BYTE_ARRAY && s->dtype_len != 4) { - int leaf_level_index = s->col.max_nesting_depth - 1; - int me = t - out_thread0; + if (is_string_col) { + // choose a character parallel string copy when the average string is longer than a warp + auto const use_char_ll = s->page.num_valids > 0 && + (s->page.str_bytes / s->page.num_valids) > cudf::detail::warp_size; + int const leaf_level_index = s->col.max_nesting_depth - 1; + int const me = t - out_thread0; + if (me < 32) { for (int i = 0; i < decode_block_size - out_thread0; i += 32) { dst_pos = sb->nz_idx[rolling_index(src_pos + i)]; @@ -2757,71 +2761,72 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( __syncwarp(); } } - } - - // target_pos will always be properly bounded by num_rows, but dst_pos may be negative - // (values before first_row) in the flat hierarchy case. - if (src_pos < target_pos && dst_pos >= 0) { - // src_pos represents the logical row position we want to read from. But in the case of - // nested hierarchies, there is no 1:1 mapping of rows to values. So our true read - // position has to take into account the # of values we have to skip in the page to get to - // the desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. 
- uint32_t val_src_pos = src_pos + skipped_leaf_values; - - // nesting level that is storing actual leaf values - int leaf_level_index = s->col.max_nesting_depth - 1; - - uint32_t dtype_len = s->dtype_len; - void* dst = - nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; - if (dtype == BYTE_ARRAY) { - if (s->col.converted_type == DECIMAL) { - auto const [ptr, len] = gpuGetStringData(s, sb, val_src_pos); - auto const decimal_precision = s->col.decimal_precision; - if (decimal_precision <= MAX_DECIMAL32_PRECISION) { - gpuOutputByteArrayAsInt(ptr, len, static_cast(dst)); - } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { - gpuOutputByteArrayAsInt(ptr, len, static_cast(dst)); - } else { - gpuOutputByteArrayAsInt(ptr, len, static_cast<__int128_t*>(dst)); - } - } else { - // test for string hashes - if (dtype_len == 4) { gpuOutputString(s, sb, val_src_pos, dst); } - } - } else if (dtype == BOOLEAN) { - gpuOutputBoolean(sb, val_src_pos, static_cast(dst)); - } else if (s->col.converted_type == DECIMAL) { - switch (dtype) { - case INT32: gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); break; - case INT64: gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); break; - default: - if (s->dtype_len_in <= sizeof(int32_t)) { - gpuOutputFixedLenByteArrayAsInt(s, sb, val_src_pos, static_cast(dst)); - } else if (s->dtype_len_in <= sizeof(int64_t)) { - gpuOutputFixedLenByteArrayAsInt(s, sb, val_src_pos, static_cast(dst)); + } else { + // target_pos will always be properly bounded by num_rows, but dst_pos may be negative + // (values before first_row) in the flat hierarchy case. + if (src_pos < target_pos && dst_pos >= 0) { + // src_pos represents the logical row position we want to read from. But in the case of + // nested hierarchies, there is no 1:1 mapping of rows to values. So our true read + // position has to take into account the # of values we have to skip in the page to get to + // the desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. 
+ uint32_t val_src_pos = src_pos + skipped_leaf_values; + + // nesting level that is storing actual leaf values + int leaf_level_index = s->col.max_nesting_depth - 1; + + uint32_t dtype_len = s->dtype_len; + void* dst = + nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; + if (dtype == BYTE_ARRAY) { + if (s->col.converted_type == DECIMAL) { + auto const [ptr, len] = gpuGetStringData(s, sb, val_src_pos); + auto const decimal_precision = s->col.decimal_precision; + if (decimal_precision <= MAX_DECIMAL32_PRECISION) { + gpuOutputByteArrayAsInt(ptr, len, static_cast(dst)); + } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { + gpuOutputByteArrayAsInt(ptr, len, static_cast(dst)); } else { - gpuOutputFixedLenByteArrayAsInt(s, sb, val_src_pos, static_cast<__int128_t*>(dst)); + gpuOutputByteArrayAsInt(ptr, len, static_cast<__int128_t*>(dst)); } - break; - } - } else if (dtype == INT96) { - gpuOutputInt96Timestamp(s, sb, val_src_pos, static_cast(dst)); - } else if (dtype_len == 8) { - if (s->dtype_len_in == 4) { - // Reading INT32 TIME_MILLIS into 64-bit DURATION_MILLISECONDS - // TIME_MILLIS is the only duration type stored as int32: - // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#deprecated-time-convertedtype + } else { + // test for string hashes + if (dtype_len == 4) { gpuOutputString(s, sb, val_src_pos, dst); } + } + } else if (dtype == BOOLEAN) { + gpuOutputBoolean(sb, val_src_pos, static_cast(dst)); + } else if (s->col.converted_type == DECIMAL) { + switch (dtype) { + case INT32: gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); break; + case INT64: gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); break; + default: + if (s->dtype_len_in <= sizeof(int32_t)) { + gpuOutputFixedLenByteArrayAsInt(s, sb, val_src_pos, static_cast(dst)); + } else if (s->dtype_len_in <= sizeof(int64_t)) { + gpuOutputFixedLenByteArrayAsInt(s, sb, val_src_pos, static_cast(dst)); + } else { + gpuOutputFixedLenByteArrayAsInt( + s, sb, val_src_pos, static_cast<__int128_t*>(dst)); + } + break; + } + } else if (dtype == INT96) { + gpuOutputInt96Timestamp(s, sb, val_src_pos, static_cast(dst)); + } else if (dtype_len == 8) { + if (s->dtype_len_in == 4) { + // Reading INT32 TIME_MILLIS into 64-bit DURATION_MILLISECONDS + // TIME_MILLIS is the only duration type stored as int32: + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#deprecated-time-convertedtype + gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); + } else if (s->ts_scale) { + gpuOutputInt64Timestamp(s, sb, val_src_pos, static_cast(dst)); + } else { + gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); + } + } else if (dtype_len == 4) { gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); - } else if (s->ts_scale) { - gpuOutputInt64Timestamp(s, sb, val_src_pos, static_cast(dst)); } else { - gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); + gpuOutputGeneric(s, sb, val_src_pos, static_cast(dst), dtype_len); } - } else if (dtype_len == 4) { - gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); - } else { - gpuOutputGeneric(s, sb, val_src_pos, static_cast(dst), dtype_len); } } From 08b68d7ac9b165b1996b6235f63c74abceb0163a Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 27 Apr 2023 18:02:12 -0700 Subject: [PATCH 019/114] change offsets to size_type --- cpp/src/io/parquet/reader_impl.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 
7739103b9ac..ae5764add1d 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -49,7 +49,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) return (chunk.data_type & 7) == BYTE_ARRAY && (chunk.data_type >> 3) != 4; }); - std::vector col_sizes(_input_columns.size(), 0L); + std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { gpu::ComputePageStringSizes(pages, chunks, skip_rows, num_rows, _stream); @@ -61,14 +61,14 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) uint32_t dtype = col.data_type & 7; uint32_t dtype_len_out = col.data_type >> 3; if (dtype == BYTE_ARRAY && dtype_len_out != 4) { - size_t const offset = col_sizes[col.src_col_index]; + size_type const offset = col_sizes[col.src_col_index]; page.str_offset = offset; col_sizes[col.src_col_index] = offset + page.str_bytes; } } } // for (size_t i=0; i < col_sizes.size(); i++) - // printf("col %ld size %ld\n", i, col_sizes[i]); + // printf("col %ld size %d\n", i, col_sizes[i]); } // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector @@ -205,7 +205,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { // need to cap off the string offsets column - size_type sz = col_sizes[idx]; + size_type const sz = col_sizes[idx]; cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, &sz, sizeof(size_type), @@ -378,7 +378,7 @@ table_with_metadata reader::impl::finalize_output(table_metadata& out_metadata, // Return user metadata out_metadata.per_file_user_data = _metadata->get_key_value_metadata(); out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), - out_metadata.per_file_user_data[0].end()}; + out_metadata.per_file_user_data[0].end()}; // Finally, save the output table metadata into `_output_metadata` for reuse next time. _output_metadata = std::make_unique(out_metadata); From b79c9ec0e4160cca6b7b8de65b4909fbb69137bb Mon Sep 17 00:00:00 2001 From: db Date: Mon, 1 May 2023 13:38:07 -0500 Subject: [PATCH 020/114] Remove definition and repetition levels from page_data_s struct to deal with a performance issue introduced in gpuDecodePageData by previously changing them to be pointers instead of hardcoded arrays. 
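A minimal sketch of the indexing pattern this commit switches to (rolling_lvl_index
as introduced in the diff below):

    template <int lvl_buf_size>
    constexpr int rolling_lvl_index(int index)
    {
      return index % lvl_buf_size;
    }

Because lvl_buf_size is now a template parameter rather than a size read from
page_state_s, the modulo is by a compile-time constant and the compiler can
strength-reduce it instead of issuing a divide per level value.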
--- cpp/src/io/parquet/page_data.cu | 117 +++++++++++++++----------------- 1 file changed, 55 insertions(+), 62 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index c8995ec2625..ded49cd989c 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -50,8 +50,12 @@ namespace { constexpr int preprocess_block_size = num_rle_stream_decode_threads; // 512 constexpr int decode_block_size = 128; constexpr int non_zero_buffer_size = decode_block_size * 2; -constexpr int rolling_lvl_index(int index, int size) { return index % size; } constexpr int rolling_index(int index) { return index & (non_zero_buffer_size - 1); } +template +constexpr int rolling_lvl_index(int index) +{ + return index % lvl_buf_size; +} struct page_state_s { const uint8_t* data_start; @@ -84,9 +88,6 @@ struct page_state_s { int32_t input_value_count; // how many values of the input we've processed int32_t input_row_count; // how many rows of the input we've processed int32_t input_leaf_count; // how many leaf values of the input we've processed - uint32_t* rep; // circular buffer of repetition level values - uint32_t* def; // circular buffer of definition level values - int level_decode_buf_size; // size of rep/ref const uint8_t* lvl_start[NUM_LEVEL_TYPES]; // [def,rep] const uint8_t* abs_lvl_start[NUM_LEVEL_TYPES]; // [def,rep] const uint8_t* abs_lvl_end[NUM_LEVEL_TYPES]; // [def,rep] @@ -970,8 +971,6 @@ static __device__ void gpuOutputGeneric( * @param[in] min_row Crop all rows below min_row * @param[in] num_rows Maximum number of rows to read * @param[in] is_decode_step If we are setting up for the decode step (instead of the preprocess) - * @param[in] level_decode_buf Buffer space to use for repetition and definition levels - * @param[in] level_decode_buf_size Size of the level decode buffers * @param[in] decoders rle_stream decoders which will be used for decoding levels. Optional. * Currently only used by gpuComputePageSizes step) */ @@ -981,8 +980,6 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, size_t min_row, size_t num_rows, bool is_decode_step, - uint32_t* level_decode_buf[level_type::NUM_LEVEL_TYPES], - int level_decode_buf_size, rle_stream* decoders = nullptr) { int t = threadIdx.x; @@ -1024,12 +1021,6 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, s->nesting_info = can_use_decode_cache ? s->nesting_decode_cache : s->page.nesting_decode; } - if (!t) { - s->rep = level_decode_buf[level_type::REPETITION]; - s->def = level_decode_buf[level_type::DEFINITION]; - s->level_decode_buf_size = level_decode_buf_size; - } - __syncthreads(); // zero counts @@ -1372,25 +1363,33 @@ static __device__ void store_validity(PageNestingDecodeInfo* nesting_info, * @param[out] d The definition level up to which added values are not-null. 
if t is out of bounds, * d will be -1 * @param[in] s Local page information + * @param[in] rep Repetition level buffer + * @param[in] def Definition level buffer * @param[in] input_value_count The current count of input level values we have processed * @param[in] target_input_value_count The desired # of input level values we want to process * @param[in] t Thread index */ +template inline __device__ void get_nesting_bounds(int& start_depth, int& end_depth, int& d, page_state_s* s, + uint32_t const* const rep, + uint32_t const* const def, int input_value_count, int32_t target_input_value_count, int t) { + start_depth = -1; + end_depth = -1; + d = -1; if (input_value_count + t < target_input_value_count) { - int index = rolling_lvl_index(input_value_count + t, s->level_decode_buf_size); - d = s->def[index]; + int index = rolling_lvl_index(input_value_count + t); + d = def[index]; // if we have repetition (there are list columns involved) we have to // bound what nesting levels we apply values to if (s->col.max_level[level_type::REPETITION] > 0) { - int r = s->rep[index]; + int r = rep[index]; start_depth = s->nesting_info[r].start_depth; end_depth = s->nesting_info[d].end_depth; } @@ -1400,10 +1399,6 @@ inline __device__ void get_nesting_bounds(int& start_depth, start_depth = 0; end_depth = s->col.max_nesting_depth - 1; } - } else { - start_depth = -1; - end_depth = -1; - d = -1; } } @@ -1414,11 +1409,16 @@ inline __device__ void get_nesting_bounds(int& start_depth, * @param[in] target_input_value_count The # of repetition/definition levels to process up to * @param[in] s Local page information * @param[out] sb Page state buffer output + * @param[in] rep Repetition level buffer + * @param[in] def Definition level buffer * @param[in] t Thread index */ +template static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, page_state_s* s, page_state_buffers_s* sb, + uint32_t const* const rep, + uint32_t const* const def, int t) { // max nesting depth of the column @@ -1436,8 +1436,8 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu // determine the nesting bounds for this thread (the range of nesting depths we // will generate new value indices and validity bits for) int start_depth, end_depth, d; - get_nesting_bounds( - start_depth, end_depth, d, s, input_value_count, target_input_value_count, t); + get_nesting_bounds( + start_depth, end_depth, d, s, rep, def, input_value_count, target_input_value_count, t); // 4 interesting things to track: // thread_value_count : # of output values from the view of this thread @@ -1588,11 +1588,16 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu * @param[in] s The local page state * @param[out] sb Page state buffer output * @param[in] target_leaf_count Target count of non-null leaf values to generate indices for + * @param[in] rep Repetition level buffer + * @param[in] def Definition level buffer * @param[in] t Thread index */ +template __device__ void gpuDecodeLevels(page_state_s* s, page_state_buffers_s* sb, int32_t target_leaf_count, + uint32_t* const rep, + uint32_t* const def, int t) { bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; @@ -1601,8 +1606,8 @@ __device__ void gpuDecodeLevels(page_state_s* s, int cur_leaf_count = target_leaf_count; while (!s->error && s->nz_count < target_leaf_count && s->input_value_count < s->num_input_values) { - if (has_repetition) { gpuDecodeStream(s->rep, s, cur_leaf_count, t, 
level_type::REPETITION); } - gpuDecodeStream(s->def, s, cur_leaf_count, t, level_type::DEFINITION); + if (has_repetition) { gpuDecodeStream(rep, s, cur_leaf_count, t, level_type::REPETITION); } + gpuDecodeStream(def, s, cur_leaf_count, t, level_type::DEFINITION); __syncwarp(); // because the rep and def streams are encoded separately, we cannot request an exact @@ -1613,7 +1618,7 @@ __device__ void gpuDecodeLevels(page_state_s* s, : s->lvl_count[level_type::DEFINITION]; // process what we got back - gpuUpdateValidityOffsetsAndRowIndices(actual_leaf_count, s, sb, t); + gpuUpdateValidityOffsetsAndRowIndices(actual_leaf_count, s, sb, rep, def, t); cur_leaf_count = actual_leaf_count + batch_size; __syncwarp(); } @@ -1656,11 +1661,16 @@ __device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) * * @param s The local page info * @param target_value_count The target value count to process up to + * @param rep Repetition level buffer + * @param def Definition level buffer * @param t Thread index * @param bounds_set A boolean indicating whether or not min/max row bounds have been set */ +template static __device__ void gpuUpdatePageSizes(page_state_s* s, int target_value_count, + uint32_t const* const rep, + uint32_t const* const def, int t, bool bounds_set) { @@ -1691,7 +1701,8 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, // start/end depth int start_depth, end_depth, d; - get_nesting_bounds(start_depth, end_depth, d, s, value_count, value_count + batch_size, t); + get_nesting_bounds( + start_depth, end_depth, d, s, rep, def, value_count, value_count + batch_size, t); // is this thread within row bounds? in the non skip_rows/num_rows case this will always // be true. @@ -1780,6 +1791,7 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, * @param compute_string_sizes Whether or not we should be computing string sizes * (PageInfo::str_bytes) as part of the pass */ +template __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageSizes(PageInfo* pages, device_span chunks, @@ -1804,34 +1816,24 @@ __global__ void __launch_bounds__(preprocess_block_size) rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; // setup page info - if (!setupLocalPageInfo(s, - pp, - chunks, - min_row, - num_rows, - false, - pp->lvl_decode_buf, - LEVEL_DECODE_BUF_SIZE, - decoders)) { - return; - } + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, false, decoders)) { return; } // initialize the stream decoders (requires values computed in setupLocalPageInfo) - int const max_batch_size = s->level_decode_buf_size; - uint32_t* def_decode = s->def; - uint32_t* rep_decode = s->rep; + int const max_batch_size = lvl_buf_size; + uint32_t* rep = pp->lvl_decode_buf[level_type::REPETITION]; + uint32_t* def = pp->lvl_decode_buf[level_type::DEFINITION]; decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], s->abs_lvl_start[level_type::DEFINITION], s->abs_lvl_end[level_type::DEFINITION], max_batch_size, - def_decode, + def, s->page.num_input_values); if (has_repetition) { decoders[level_type::REPETITION].init(s->col.level_bits[level_type::REPETITION], s->abs_lvl_start[level_type::REPETITION], s->abs_lvl_end[level_type::REPETITION], max_batch_size, - rep_decode, + rep, s->page.num_input_values); } __syncthreads(); @@ -1921,7 +1923,7 @@ __global__ void __launch_bounds__(preprocess_block_size) __syncthreads(); // update page sizes - gpuUpdatePageSizes(s, processed, t, !is_base_pass); + gpuUpdatePageSizes(s, processed, rep, 
def, t, !is_base_pass); __syncthreads(); } @@ -1991,6 +1993,7 @@ struct null_count_back_copier { * @param min_row Row index to start reading at * @param num_rows Maximum number of rows to read */ +template __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { @@ -2004,20 +2007,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( int out_thread0; [[maybe_unused]] null_count_back_copier _{s, t}; - __shared__ uint32_t def_buf[non_zero_buffer_size]; - __shared__ uint32_t rep_buf[non_zero_buffer_size]; - uint32_t* level_decode_buf[level_type::NUM_LEVEL_TYPES] = {def_buf, rep_buf}; - - if (!setupLocalPageInfo(s, - &pages[page_idx], - chunks, - min_row, - num_rows, - true, - level_decode_buf, - non_zero_buffer_size)) { - return; - } + if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; @@ -2045,6 +2035,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( PageNestingDecodeInfo* nesting_info_base = s->nesting_info; + __shared__ uint32_t rep[non_zero_buffer_size]; // circular buffer of repetition level values + __shared__ uint32_t def[non_zero_buffer_size]; // circular buffer of definition level values + // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t skipped_leaf_values = s->page.skipped_leaf_values; while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { @@ -2064,7 +2057,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( // - update validity vectors // - updates offsets (for nested columns) // - produces non-NULL value indices in s->nz_idx for subsequent decoding - gpuDecodeLevels(s, sb, target_pos, t); + gpuDecodeLevels(s, sb, target_pos, rep, def, t); } else if (t < out_thread0) { // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t src_target_pos = target_pos + skipped_leaf_values; @@ -2191,7 +2184,7 @@ void ComputePageSizes(hostdevice_vector& pages, // This computes the size for the entire page, not taking row bounds into account. // If uses_custom_row_bounds is set to true, we have to do a second pass later that "trims" // the starting and ending read values to account for these bounds. 
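  // [illustrative note, not part of the patch] the replacement launch binds the
  // buffer size at compile time, along the lines of
  //   gpuComputePageSizes<LEVEL_DECODE_BUF_SIZE>
  //     <<<dim_grid, dim_block, 0, stream.value()>>>(...);
  // so every rolling_lvl_index inside the kernel is a modulo by a literal.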
- gpuComputePageSizes<<>>( + gpuComputePageSizes<<>>( pages.device_ptr(), chunks, min_row, num_rows, compute_num_rows, compute_string_sizes); } @@ -2209,8 +2202,8 @@ void __host__ DecodePageData(hostdevice_vector& pages, dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - gpuDecodePageData<<>>( - pages.device_ptr(), chunks, min_row, num_rows); + gpuDecodePageData + <<>>(pages.device_ptr(), chunks, min_row, num_rows); } } // namespace gpu From 3320cde29bbf0269db09f129cf00fc0649fda327 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 1 May 2023 13:47:23 -0700 Subject: [PATCH 021/114] fixes after merging --- cpp/src/io/parquet/page_data.cu | 36 +++++++++++++-------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index f0ac5310ea3..f2441a225ab 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -1853,6 +1853,7 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, } } +template __device__ std::pair page_bounds(page_state_s* const s, size_t min_row, size_t num_rows, @@ -1882,9 +1883,9 @@ __device__ std::pair page_bounds(page_state_s* const s, auto const col = &s->col; // initialize the stream decoders (requires values computed in setupLocalPageInfo) - int const max_batch_size = s->level_decode_buf_size; - uint32_t* def_decode = s->def; - uint32_t* rep_decode = s->rep; + int const max_batch_size = lvl_buf_size; + uint32_t* def_decode = pp->lvl_decode_buf[level_type::DEFINITION]; + uint32_t* rep_decode = pp->lvl_decode_buf[level_type::REPETITION]; decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], s->abs_lvl_start[level_type::DEFINITION], s->abs_lvl_end[level_type::DEFINITION], @@ -1945,17 +1946,17 @@ __device__ std::pair page_bounds(page_state_s* const s, // do something with the level data while (start_val < processed) { int idx_t = start_val + t; - int idx = rolling_lvl_index(idx_t, s->level_decode_buf_size); + int idx = rolling_lvl_index(idx_t); // get absolute thread row index - int is_new_row = idx_t < processed && (!has_repetition || s->rep[idx] == 0); + int is_new_row = idx_t < processed && (!has_repetition || rep_decode[idx] == 0); int thread_row_count, block_row_count; block_scan(temp_storage.scan_storage) .InclusiveSum(is_new_row, thread_row_count, block_row_count); __syncthreads(); // get absolute thread leaf index - int const is_new_leaf = idx_t < processed && (s->def[idx] >= max_def); + int const is_new_leaf = idx_t < processed && (def_decode[idx] >= max_def); int thread_leaf_count, block_leaf_count; block_scan(temp_storage.scan_storage) .InclusiveSum(is_new_leaf, thread_leaf_count, block_leaf_count); @@ -2051,8 +2052,8 @@ __device__ std::pair page_bounds(page_state_s* const s, while (start_val < processed) { int idx_t = start_val + t; if (idx_t < processed) { - int idx = rolling_lvl_index(idx_t, s->level_decode_buf_size); - if (s->def[idx] < max_def) { num_nulls++; } + int idx = rolling_lvl_index(idx_t); + if (def_decode[idx] < max_def) { num_nulls++; } } start_val += preprocess_block_size; } @@ -2221,6 +2222,7 @@ countPlainEntries(uint8_t const* data, int data_size, int start_value, int end_v return total_len; } +template __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { @@ -2249,17 +2251,7 @@ __global__ void __launch_bounds__(preprocess_block_size) 
gpuComputePageStringSiz rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; // setup page info - if (!setupLocalPageInfo(s, - pp, - chunks, - min_row, - num_rows, - false, - pp->lvl_decode_buf, - LEVEL_DECODE_BUF_SIZE, - decoders)) { - return; - } + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, false, decoders)) { return; } if (!t) { s->page.num_nulls = 0; @@ -2274,7 +2266,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // find start/end value indices auto const [start_value, end_value] = - page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders, t); + page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders, t); // need to save num_nulls calculated in page_bounds in this page // FIXME: num_nulls is only correct for !is_bounds_pg...need to fix this @@ -2896,8 +2888,8 @@ void ComputePageStringSizes(hostdevice_vector& pages, { dim3 dim_block(preprocess_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - gpuComputePageStringSizes<<>>( - pages.device_ptr(), chunks, min_row, num_rows); + gpuComputePageStringSizes + <<>>(pages.device_ptr(), chunks, min_row, num_rows); } /** From 897db8c14fa8eeecec2ca8d7ef143a8d74b1ee4c Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 1 May 2023 15:07:31 -0700 Subject: [PATCH 022/114] split out separate decoder for string columns --- cpp/src/io/parquet/page_data.cu | 421 ++++++++++++++++++----------- cpp/src/io/parquet/parquet_gpu.hpp | 10 +- cpp/src/io/parquet/reader_impl.cpp | 1 + 3 files changed, 279 insertions(+), 153 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index f2441a225ab..83d5024c8cc 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -2556,6 +2556,180 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( { __shared__ __align__(16) page_state_s state_g; __shared__ __align__(16) page_state_buffers_s state_buffers; + + page_state_s* const s = &state_g; + page_state_buffers_s* const sb = &state_buffers; + int page_idx = blockIdx.x; + int t = threadIdx.x; + int out_thread0; + [[maybe_unused]] null_count_back_copier _{s, t}; + + if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } + + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + int const dtype = s->col.data_type & 7; + + // string cols handled elsewhere + if (dtype == BYTE_ARRAY && s->dtype_len != 4) { return; } + + // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. + // + // corner case: in the case of lists, we can have pages that contain "0" rows if the current row + // starts before this page and ends after this page: + // P0 P1 P2 + // |---------|---------|----------| + // ^------------------^ + // row start row end + // P1 will contain 0 rows + // + if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) || + is_page_contained(s, min_row, num_rows)))) { + return; + } + + if (s->dict_base) { + out_thread0 = (s->dict_bits > 0) ? 64 : 32; + } else { + out_thread0 = + ((s->col.data_type & 7) == BOOLEAN || (s->col.data_type & 7) == BYTE_ARRAY) ? 
64 : 32; + } + + PageNestingDecodeInfo* nesting_info_base = s->nesting_info; + + __shared__ uint32_t rep[non_zero_buffer_size]; // circular buffer of repetition level values + __shared__ uint32_t def[non_zero_buffer_size]; // circular buffer of definition level values + + // skipped_leaf_values will always be 0 for flat hierarchies. + uint32_t skipped_leaf_values = s->page.skipped_leaf_values; + while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + int target_pos; + int src_pos = s->src_pos; + + if (t < out_thread0) { + target_pos = min(src_pos + 2 * (decode_block_size - out_thread0), + s->nz_count + (decode_block_size - out_thread0)); + } else { + target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0); + if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); } + } + __syncthreads(); + if (t < 32) { + // decode repetition and definition levels. + // - update validity vectors + // - updates offsets (for nested columns) + // - produces non-NULL value indices in s->nz_idx for subsequent decoding + gpuDecodeLevels(s, sb, target_pos, rep, def, t); + } else if (t < out_thread0) { + // skipped_leaf_values will always be 0 for flat hierarchies. + uint32_t src_target_pos = target_pos + skipped_leaf_values; + + // WARP1: Decode dictionary indices, booleans or string positions + if (s->dict_base) { + src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, t & 0x1f).first; + } else if ((s->col.data_type & 7) == BOOLEAN) { + src_target_pos = gpuDecodeRleBooleans(s, sb, src_target_pos, t & 0x1f); + } else if ((s->col.data_type & 7) == BYTE_ARRAY) { + gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); + } + if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } + } else { + // WARP1..WARP3: Decode values + src_pos += t - out_thread0; + + // the position in the output column/buffer + int dst_pos = sb->nz_idx[rolling_index(src_pos)]; + + // for the flat hierarchy case we will be reading from the beginning of the value stream, + // regardless of the value of first_row. so adjust our destination offset accordingly. + // example: + // - user has passed skip_rows = 2, so our first_row to output is 2 + // - the row values we get from nz_idx will be + // 0, 1, 2, 3, 4 .... + // - by shifting these values by first_row, the sequence becomes + // -1, -2, 0, 1, 2 ... + // - so we will end up ignoring the first two input rows, and input rows 2..n will + // get written to the output starting at position 0. + // + if (!has_repetition) { dst_pos -= s->first_row; } + + // target_pos will always be properly bounded by num_rows, but dst_pos may be negative + // (values before first_row) in the flat hierarchy case. + if (src_pos < target_pos && dst_pos >= 0) { + // src_pos represents the logical row position we want to read from. But in the case of + // nested hierarchies, there is no 1:1 mapping of rows to values. So our true read + // position has to take into account the # of values we have to skip in the page to get to + // the desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. 
+ uint32_t val_src_pos = src_pos + skipped_leaf_values; + + // nesting level that is storing actual leaf values + int leaf_level_index = s->col.max_nesting_depth - 1; + + uint32_t dtype_len = s->dtype_len; + void* dst = + nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; + if (dtype == BYTE_ARRAY) { + if (s->col.converted_type == DECIMAL) { + auto const [ptr, len] = gpuGetStringData(s, sb, val_src_pos); + auto const decimal_precision = s->col.decimal_precision; + if (decimal_precision <= MAX_DECIMAL32_PRECISION) { + gpuOutputByteArrayAsInt(ptr, len, static_cast(dst)); + } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { + gpuOutputByteArrayAsInt(ptr, len, static_cast(dst)); + } else { + gpuOutputByteArrayAsInt(ptr, len, static_cast<__int128_t*>(dst)); + } + } else { + // test for string hashes + if (dtype_len == 4) { gpuOutputString(s, sb, val_src_pos, dst); } + } + } else if (dtype == BOOLEAN) { + gpuOutputBoolean(sb, val_src_pos, static_cast(dst)); + } else if (s->col.converted_type == DECIMAL) { + switch (dtype) { + case INT32: gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); break; + case INT64: gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); break; + default: + if (s->dtype_len_in <= sizeof(int32_t)) { + gpuOutputFixedLenByteArrayAsInt(s, sb, val_src_pos, static_cast(dst)); + } else if (s->dtype_len_in <= sizeof(int64_t)) { + gpuOutputFixedLenByteArrayAsInt(s, sb, val_src_pos, static_cast(dst)); + } else { + gpuOutputFixedLenByteArrayAsInt(s, sb, val_src_pos, static_cast<__int128_t*>(dst)); + } + break; + } + } else if (dtype == INT96) { + gpuOutputInt96Timestamp(s, sb, val_src_pos, static_cast(dst)); + } else if (dtype_len == 8) { + if (s->dtype_len_in == 4) { + // Reading INT32 TIME_MILLIS into 64-bit DURATION_MILLISECONDS + // TIME_MILLIS is the only duration type stored as int32: + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#deprecated-time-convertedtype + gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); + } else if (s->ts_scale) { + gpuOutputInt64Timestamp(s, sb, val_src_pos, static_cast(dst)); + } else { + gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); + } + } else if (dtype_len == 4) { + gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); + } else { + gpuOutputGeneric(s, sb, val_src_pos, static_cast(dst), dtype_len); + } + } + + if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; } + } + __syncthreads(); + } +} + +template +__global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) page_state_s state_g; + __shared__ __align__(16) page_state_buffers_s state_buffers; __shared__ __align__(4) size_type last_offset; page_state_s* const s = &state_g; @@ -2572,11 +2746,11 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( int const dtype = s->col.data_type & 7; bool const is_string_col = dtype == BYTE_ARRAY && s->dtype_len != 4; + if (!is_string_col) { return; } + // offsets is global...but the output is local, so account for that below - if (is_string_col) { - if (t == 0) { last_offset = s->page.str_offset; } - __syncthreads(); - } + if (t == 0) { last_offset = s->page.str_offset; } + __syncthreads(); // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. 
// @@ -2659,50 +2833,49 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( if (!has_repetition) { dst_pos -= s->first_row; } // need to do this before we branch on src_pos/dst_pos so we don't deadlock - if (is_string_col) { - // choose a character parallel string copy when the average string is longer than a warp - auto const use_char_ll = s->page.num_valids > 0 && - (s->page.str_bytes / s->page.num_valids) > cudf::detail::warp_size; - int const leaf_level_index = s->col.max_nesting_depth - 1; - int const me = t - out_thread0; - - if (me < 32) { - for (int i = 0; i < decode_block_size - out_thread0; i += 32) { - dst_pos = sb->nz_idx[rolling_index(src_pos + i)]; - if (!has_repetition) { dst_pos -= s->first_row; } - - auto [ptr, len] = src_pos + i < target_pos && dst_pos >= 0 - ? gpuGetStringData(s, sb, src_pos + skipped_leaf_values + i) - : cuda::std::pair{nullptr, 0}; - - __shared__ cub::WarpScan::TempStorage temp_storage; - size_type offset; - cub::WarpScan(temp_storage).ExclusiveSum(len, offset); - offset += last_offset; - - if (use_char_ll) { - // TODO: might want separate kernel for string page decoding so we don't waste all - // this shared memory on non-string columns. - __shared__ __align__(8) uint8_t const* pointers[32]; - __shared__ __align__(4) size_type offsets[32]; - __shared__ __align__(4) int dsts[32]; - __shared__ __align__(4) int lengths[32]; - - offsets[me] = offset; - pointers[me] = reinterpret_cast(ptr); - dsts[me] = dst_pos; - lengths[me] = len; - __syncwarp(); - - for (int ss = 0; ss < 32 && ss + i + s->src_pos < target_pos; ss++) { - if (dsts[ss] >= 0) { - auto offptr = - reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + - dsts[ss]; - *offptr = offsets[ss]; - auto str_ptr = nesting_info_base[leaf_level_index].string_out + offsets[ss] - - s->page.str_offset; - ll_strcpy(str_ptr, pointers[ss], lengths[ss], me); + // choose a character parallel string copy when the average string is longer than a warp + auto const use_char_ll = s->page.num_valids > 0 && + (s->page.str_bytes / s->page.num_valids) > cudf::detail::warp_size; + int const leaf_level_index = s->col.max_nesting_depth - 1; + int const me = t - out_thread0; + + if (me < 32) { + for (int i = 0; i < decode_block_size - out_thread0; i += 32) { + dst_pos = sb->nz_idx[rolling_index(src_pos + i)]; + if (!has_repetition) { dst_pos -= s->first_row; } + + auto [ptr, len] = src_pos + i < target_pos && dst_pos >= 0 + ? gpuGetStringData(s, sb, src_pos + skipped_leaf_values + i) + : cuda::std::pair{nullptr, 0}; + + __shared__ cub::WarpScan::TempStorage temp_storage; + size_type offset; + cub::WarpScan(temp_storage).ExclusiveSum(len, offset); + offset += last_offset; + + if (use_char_ll) { + // TODO: might want separate kernel for string page decoding so we don't waste all + // this shared memory on non-string columns. 
+ __shared__ __align__(8) uint8_t const* pointers[32]; + __shared__ __align__(4) size_type offsets[32]; + __shared__ __align__(4) int dsts[32]; + __shared__ __align__(4) int lengths[32]; + + offsets[me] = offset; + pointers[me] = reinterpret_cast(ptr); + dsts[me] = dst_pos; + lengths[me] = len; + __syncwarp(); + + for (int ss = 0; ss < 32 && ss + i + s->src_pos < target_pos; ss++) { + if (dsts[ss] >= 0) { + auto offptr = + reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + + dsts[ss]; + *offptr = offsets[ss]; + auto str_ptr = + nesting_info_base[leaf_level_index].string_out + offsets[ss] - s->page.str_offset; + ll_strcpy(str_ptr, pointers[ss], lengths[ss], me); #if 0 if (is_bounds_page(s, min_row, num_rows)) { if (me == 0) @@ -2715,18 +2888,17 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( offsets[ss]); } #endif - } } + } - } else { - if (src_pos + i < target_pos && dst_pos >= 0) { - auto offptr = - reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + - dst_pos; - *offptr = offset; - auto str_ptr = - nesting_info_base[leaf_level_index].string_out + offset - s->page.str_offset; - memcpy(str_ptr, ptr, len); + } else { + if (src_pos + i < target_pos && dst_pos >= 0) { + auto offptr = + reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dst_pos; + *offptr = offset; + auto str_ptr = + nesting_info_base[leaf_level_index].string_out + offset - s->page.str_offset; + memcpy(str_ptr, ptr, len); #if 0 if (is_bounds_page(s, min_row, num_rows)) { printf("%05d,%03d: src %d dst %d len %ld offset %d\n", @@ -2738,80 +2910,12 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( offset); } #endif - } - __syncwarp(); } - - if (me == 31) { last_offset = offset + len; } __syncwarp(); } - } - } else { - // target_pos will always be properly bounded by num_rows, but dst_pos may be negative - // (values before first_row) in the flat hierarchy case. - if (src_pos < target_pos && dst_pos >= 0) { - // src_pos represents the logical row position we want to read from. But in the case of - // nested hierarchies, there is no 1:1 mapping of rows to values. So our true read - // position has to take into account the # of values we have to skip in the page to get to - // the desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. 
- uint32_t val_src_pos = src_pos + skipped_leaf_values; - - // nesting level that is storing actual leaf values - int leaf_level_index = s->col.max_nesting_depth - 1; - - uint32_t dtype_len = s->dtype_len; - void* dst = - nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; - if (dtype == BYTE_ARRAY) { - if (s->col.converted_type == DECIMAL) { - auto const [ptr, len] = gpuGetStringData(s, sb, val_src_pos); - auto const decimal_precision = s->col.decimal_precision; - if (decimal_precision <= MAX_DECIMAL32_PRECISION) { - gpuOutputByteArrayAsInt(ptr, len, static_cast(dst)); - } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { - gpuOutputByteArrayAsInt(ptr, len, static_cast(dst)); - } else { - gpuOutputByteArrayAsInt(ptr, len, static_cast<__int128_t*>(dst)); - } - } else { - // test for string hashes - if (dtype_len == 4) { gpuOutputString(s, sb, val_src_pos, dst); } - } - } else if (dtype == BOOLEAN) { - gpuOutputBoolean(sb, val_src_pos, static_cast(dst)); - } else if (s->col.converted_type == DECIMAL) { - switch (dtype) { - case INT32: gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); break; - case INT64: gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); break; - default: - if (s->dtype_len_in <= sizeof(int32_t)) { - gpuOutputFixedLenByteArrayAsInt(s, sb, val_src_pos, static_cast(dst)); - } else if (s->dtype_len_in <= sizeof(int64_t)) { - gpuOutputFixedLenByteArrayAsInt(s, sb, val_src_pos, static_cast(dst)); - } else { - gpuOutputFixedLenByteArrayAsInt( - s, sb, val_src_pos, static_cast<__int128_t*>(dst)); - } - break; - } - } else if (dtype == INT96) { - gpuOutputInt96Timestamp(s, sb, val_src_pos, static_cast(dst)); - } else if (dtype_len == 8) { - if (s->dtype_len_in == 4) { - // Reading INT32 TIME_MILLIS into 64-bit DURATION_MILLISECONDS - // TIME_MILLIS is the only duration type stored as int32: - // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#deprecated-time-convertedtype - gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); - } else if (s->ts_scale) { - gpuOutputInt64Timestamp(s, sb, val_src_pos, static_cast(dst)); - } else { - gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); - } - } else if (dtype_len == 4) { - gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); - } else { - gpuOutputGeneric(s, sb, val_src_pos, static_cast(dst), dtype_len); - } + + if (me == 31) { last_offset = offset + len; } + __syncwarp(); } } @@ -2839,32 +2943,30 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( #endif if (s->page.num_nulls != 0) { - int dtype = s->col.data_type & 7; - if (dtype == BYTE_ARRAY && s->dtype_len != 4) { - int const value_count = s->page.num_valids + s->page.num_nulls; - int const leaf_level_index = s->col.max_nesting_depth - 1; + int const value_count = s->page.num_valids + s->page.num_nulls; + int const leaf_level_index = s->col.max_nesting_depth - 1; - auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); + auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); - if (nesting_info_base[leaf_level_index].null_count > 0) { - // if nz_count is 0, then it's all nulls. set all offsets to str_offset - if (s->nz_count == 0) { - for (int i = t; i < value_count; i += decode_block_size) { - offptr[i] = s->page.str_offset; - } + if (nesting_info_base[leaf_level_index].null_count > 0) { + // if nz_count is 0, then it's all nulls. 
set all offsets to str_offset + if (s->nz_count == 0) { + for (int i = t; i < value_count; i += decode_block_size) { + offptr[i] = s->page.str_offset; } - // just some nulls, do this serially for now - else if (t == 0) { - if (offptr[value_count - 1] == 0) { - offptr[value_count - 1] = s->page.str_offset + s->page.str_bytes; - } - for (int i = value_count - 2; i > 0; i--) { - if (offptr[i] == 0) { offptr[i] = offptr[i + 1]; } - } - offptr[0] = s->page.str_offset; + } + // just some nulls, do this serially for now + else if (t == 0) { + if (offptr[value_count - 1] == 0) { + offptr[value_count - 1] = s->page.str_offset + s->page.str_bytes; + } + for (int i = value_count - 2; i > 0; i--) { + if (offptr[i] == 0) { offptr[i] = offptr[i + 1]; } } + offptr[0] = s->page.str_offset; } - __syncthreads(); + } + __syncthreads(); #if 0 if (t == 0) printf("%05d: offptr %p/%p %d %d\n", @@ -2874,7 +2976,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( offptr[value_count - 2], offptr[value_count - 1]); #endif - } } } @@ -2933,6 +3034,24 @@ void __host__ DecodePageData(hostdevice_vector& pages, <<>>(pages.device_ptr(), chunks, min_row, num_rows); } +/** + * @copydoc cudf::io::parquet::gpu::DecodePageData + */ +void __host__ DecodeStringPageData(hostdevice_vector& pages, + hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); + + dim3 dim_block(decode_block_size, 1); + dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + + gpuDecodeStringPageData + <<>>(pages.device_ptr(), chunks, min_row, num_rows); +} + } // namespace gpu } // namespace parquet } // namespace io diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index b6b63ce473f..0aed47fdc6d 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -164,8 +164,8 @@ struct PageInfo { // - In the case of a nested schema, you have to decode the repetition and definition // levels to extract actual column values int32_t num_input_values; - int32_t chunk_row; // starting row of this page relative to the start of the chunk - int32_t num_rows; // number of rows in this page + int32_t chunk_row; // starting row of this page relative to the start of the chunk + int32_t num_rows; // number of rows in this page // the next two are calculated in gpuComputePageStringSizes int32_t num_nulls; // number of null values (V2 header), but recalculated for string cols int32_t num_valids; // number of non-null values, taking into account skip_rows/num_rows @@ -505,6 +505,12 @@ void DecodePageData(hostdevice_vector& pages, size_t min_row, rmm::cuda_stream_view stream); +void DecodeStringPageData(hostdevice_vector& pages, + hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for initializing encoder row group fragments * diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index ae5764add1d..b7603ca59ce 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -169,6 +169,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunk_nested_str_data.host_to_device(_stream); gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _stream); + if (has_strings) { gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, _stream); } pages.device_to_host(_stream); page_nesting.device_to_host(_stream); 
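The split in patch 022 works because both kernels are launched over the same page list, one thread block per page, and each block returns early when its page belongs to the other kernel, so the host needs no per-page bookkeeping. Below is a minimal standalone sketch of that dispatch pattern; the names (fake_page, page_kind, decode_fixed, decode_string) are illustrative stand-ins, not the real cudf symbols.

#include <cstdio>
#include <cuda_runtime.h>

enum class page_kind : int { fixed_width, string };
struct fake_page {
  page_kind kind;
};

// one thread block per page; blocks that land on the "wrong" page type exit immediately
__global__ void decode_fixed(fake_page const* pages, int* counts)
{
  if (pages[blockIdx.x].kind != page_kind::fixed_width) { return; }  // strings handled elsewhere
  if (threadIdx.x == 0) { atomicAdd(&counts[0], 1); }
}

__global__ void decode_string(fake_page const* pages, int* counts)
{
  if (pages[blockIdx.x].kind != page_kind::string) { return; }  // fixed-width handled elsewhere
  if (threadIdx.x == 0) { atomicAdd(&counts[1], 1); }
}

int main()
{
  fake_page h_pages[] = {{page_kind::fixed_width}, {page_kind::string}, {page_kind::string}};
  fake_page* d_pages;
  int* d_counts;
  cudaMalloc(&d_pages, sizeof(h_pages));
  cudaMalloc(&d_counts, 2 * sizeof(int));
  cudaMemcpy(d_pages, h_pages, sizeof(h_pages), cudaMemcpyHostToDevice);
  cudaMemset(d_counts, 0, 2 * sizeof(int));

  decode_fixed<<<3, 128>>>(d_pages, d_counts);  // 1 threadblock per page
  decode_string<<<3, 128>>>(d_pages, d_counts);

  int h_counts[2];
  cudaMemcpy(h_counts, d_counts, sizeof(h_counts), cudaMemcpyDeviceToHost);
  std::printf("fixed pages decoded: %d, string pages decoded: %d\n", h_counts[0], h_counts[1]);
  cudaFree(d_pages);
  cudaFree(d_counts);
  return 0;
}

Because each kernel filters by page type on its own, the two launches are independent; patch 027 below exploits exactly that property to run them concurrently on separate streams from a pool.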
From 15f4e1261df5722065b67473fa50346ba0281519 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 1 May 2023 15:33:31 -0700 Subject: [PATCH 023/114] remove test for string hash --- cpp/src/io/parquet/page_data.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 83d5024c8cc..89f9dd3fe3c 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -2679,8 +2679,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( gpuOutputByteArrayAsInt(ptr, len, static_cast<__int128_t*>(dst)); } } else { - // test for string hashes - if (dtype_len == 4) { gpuOutputString(s, sb, val_src_pos, dst); } + gpuOutputString(s, sb, val_src_pos, dst); } } else if (dtype == BOOLEAN) { gpuOutputBoolean(sb, val_src_pos, static_cast(dst)); From b9399c090f6fb74d956bf0008036c3d0dd652e53 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 1 May 2023 16:18:04 -0700 Subject: [PATCH 024/114] get rid of little used variables --- cpp/src/io/parquet/page_data.cu | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 89f9dd3fe3c..893813b87ba 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -2567,10 +2567,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; - int const dtype = s->col.data_type & 7; // string cols handled elsewhere - if (dtype == BYTE_ARRAY && s->dtype_len != 4) { return; } + if ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4) { return; } // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. 
// @@ -2634,6 +2633,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } } else { // WARP1..WARP3: Decode values + int const dtype = s->col.data_type & 7; src_pos += t - out_thread0; // the position in the output column/buffer @@ -2742,10 +2742,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; - int const dtype = s->col.data_type & 7; - bool const is_string_col = dtype == BYTE_ARRAY && s->dtype_len != 4; - - if (!is_string_col) { return; } + if ((s->col.data_type & 7) != BYTE_ARRAY || s->dtype_len == 4) { return; } // offsets is global...but the output is local, so account for that below if (t == 0) { last_offset = s->page.str_offset; } From 57d7aa8c9fd40ed0792962664d480f4dc110d355 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 2 May 2023 13:19:42 -0700 Subject: [PATCH 025/114] fix a few edge cases --- cpp/src/io/parquet/page_data.cu | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 893813b87ba..cc06713088e 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -1875,7 +1875,11 @@ __device__ std::pair page_bounds(page_state_s* const s, int const max_def = s->nesting_info[max_depth - 1].max_def_level; // can skip all this if we know there are no nulls - if (max_def == 0 && !is_bounds_pg) { return {0, s->num_input_values}; } + if (max_def == 0 && !is_bounds_pg) { + s->page.num_valids = s->num_input_values; + s->page.num_nulls = 0; + return {0, s->num_input_values}; + } int start_value = 0; int end_value = s->page.num_input_values; @@ -1976,8 +1980,9 @@ __device__ std::pair page_bounds(page_state_s* const s, // we found it if (global_count > 0) { - // this is the thread that represents the first row. - if (local_count == 1) { + // this is the thread that represents the first row. need to test in_row_bounds for + // the case where we only want one row and local_count == 1 for many threads. + if (local_count == 1 && in_row_bounds) { skipped_values = idx_t; skipped_leaf_values = leaf_count + (is_new_leaf ? 
thread_leaf_count - 1 : thread_leaf_count); From 72d301a888a4adb6e4710e8cea96a9b500013211 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 2 May 2023 16:41:33 -0700 Subject: [PATCH 026/114] use char parallel strcpy when avg string len is 32 or higher --- cpp/src/io/parquet/page_data.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index cc06713088e..f407d392f5b 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -2836,7 +2836,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( // need to do this before we branch on src_pos/dst_pos so we don't deadlock // choose a character parallel string copy when the average string is longer than a warp auto const use_char_ll = s->page.num_valids > 0 && - (s->page.str_bytes / s->page.num_valids) > cudf::detail::warp_size; + (s->page.str_bytes / s->page.num_valids) >= cudf::detail::warp_size; int const leaf_level_index = s->col.max_nesting_depth - 1; int const me = t - out_thread0; From 7768ae5c8736fcb9abb93cc59405b924fb7d9c63 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 2 May 2023 17:31:57 -0700 Subject: [PATCH 027/114] overlap decode kernels using stream pool --- cpp/src/io/parquet/reader_impl.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index b7603ca59ce..e5de7e4098f 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -17,6 +17,7 @@ #include "reader_impl.hpp" #include +#include #include @@ -168,8 +169,17 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunk_nested_data.host_to_device(_stream); chunk_nested_str_data.host_to_device(_stream); - gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _stream); - if (has_strings) { gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, _stream); } + { + rmm::cuda_stream_pool pool(2); + auto s1 = pool.get_stream(); + auto s2 = pool.get_stream(); + if (has_strings) { + gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, s1); + } + gpu::DecodePageData(pages, chunks, num_rows, skip_rows, s2); + s2.synchronize(); + s1.synchronize(); + } pages.device_to_host(_stream); page_nesting.device_to_host(_stream); From 2d42bf324f7ddc39f7f61d033ada413c86abb0e6 Mon Sep 17 00:00:00 2001 From: db Date: Wed, 3 May 2023 15:27:11 -0500 Subject: [PATCH 028/114] Squeeze level values into uint16_t instead of uint32_t, shrink decode buffer size from 4096 to 2048. Global scratch memory cost per page now 8k instead of 32k. This will likely need to be tuned further as this optimization gets applied to the decode kernel.
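For reference, the arithmetic behind the numbers in this message: each page carries two level-decode scratch buffers (repetition and definition), so the per-page cost drops from 4096 x 2 x sizeof(uint32_t) = 32 KiB to 2048 x 2 x sizeof(uint16_t) = 8 KiB. A standalone compile-time check of that claim follows (not part of the patch); the narrowing to uint16_t rests on the assumption that level values, bounded by the column's nesting depth, always fit in 16 bits.

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main()
{
  // two scratch buffers per page: repetition + definition levels
  constexpr std::size_t before = 4096 * 2 * sizeof(std::uint32_t);  // 32 KiB
  constexpr std::size_t after  = 2048 * 2 * sizeof(std::uint16_t);  //  8 KiB
  static_assert(before == 32 * 1024, "old per-page scratch should be 32 KiB");
  static_assert(after == 8 * 1024, "new per-page scratch should be 8 KiB");
  std::printf("per-page level scratch: %zu -> %zu bytes\n", before, after);
  return 0;
}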
--- cpp/src/io/parquet/page_data.cu | 32 ++++++++++---------- cpp/src/io/parquet/parquet_gpu.hpp | 6 ++-- cpp/src/io/parquet/reader_impl_preprocess.cu | 4 +-- cpp/src/io/parquet/rle_stream.cuh | 10 +++--- 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index ded49cd989c..bcdaba5cd9e 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -233,7 +233,7 @@ __device__ uint32_t InitLevelSection(page_state_s* s, * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION */ __device__ void gpuDecodeStream( - uint32_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl) + level_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl) { const uint8_t* cur_def = s->lvl_start[lvl]; const uint8_t* end = s->lvl_end; @@ -1374,8 +1374,8 @@ inline __device__ void get_nesting_bounds(int& start_depth, int& end_depth, int& d, page_state_s* s, - uint32_t const* const rep, - uint32_t const* const def, + level_t const* const rep, + level_t const* const def, int input_value_count, int32_t target_input_value_count, int t) @@ -1384,12 +1384,12 @@ inline __device__ void get_nesting_bounds(int& start_depth, end_depth = -1; d = -1; if (input_value_count + t < target_input_value_count) { - int index = rolling_lvl_index(input_value_count + t); - d = def[index]; + level_t index = rolling_lvl_index(input_value_count + t); + d = static_cast(def[index]); // if we have repetition (there are list columns involved) we have to // bound what nesting levels we apply values to if (s->col.max_level[level_type::REPETITION] > 0) { - int r = rep[index]; + level_t r = rep[index]; start_depth = s->nesting_info[r].start_depth; end_depth = s->nesting_info[d].end_depth; } @@ -1417,8 +1417,8 @@ template static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, page_state_s* s, page_state_buffers_s* sb, - uint32_t const* const rep, - uint32_t const* const def, + level_t const* const rep, + level_t const* const def, int t) { // max nesting depth of the column @@ -1596,8 +1596,8 @@ template __device__ void gpuDecodeLevels(page_state_s* s, page_state_buffers_s* sb, int32_t target_leaf_count, - uint32_t* const rep, - uint32_t* const def, + level_t* const rep, + level_t* const def, int t) { bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; @@ -1669,8 +1669,8 @@ __device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) template static __device__ void gpuUpdatePageSizes(page_state_s* s, int target_value_count, - uint32_t const* const rep, - uint32_t const* const def, + level_t const* const rep, + level_t const* const def, int t, bool bounds_set) { @@ -1820,8 +1820,8 @@ __global__ void __launch_bounds__(preprocess_block_size) // initialize the stream decoders (requires values computed in setupLocalPageInfo) int const max_batch_size = lvl_buf_size; - uint32_t* rep = pp->lvl_decode_buf[level_type::REPETITION]; - uint32_t* def = pp->lvl_decode_buf[level_type::DEFINITION]; + level_t* rep = pp->lvl_decode_buf[level_type::REPETITION]; + level_t* def = pp->lvl_decode_buf[level_type::DEFINITION]; decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], s->abs_lvl_start[level_type::DEFINITION], s->abs_lvl_end[level_type::DEFINITION], @@ -2035,8 +2035,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( PageNestingDecodeInfo* nesting_info_base = s->nesting_info; - __shared__ 
uint32_t rep[non_zero_buffer_size]; // circular buffer of repetition level values - __shared__ uint32_t def[non_zero_buffer_size]; // circular buffer of definition level values + __shared__ level_t rep[non_zero_buffer_size]; // circular buffer of repetition level values + __shared__ level_t def[non_zero_buffer_size]; // circular buffer of definition level values // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t skipped_leaf_values = s->page.skipped_leaf_values; diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 54119cc7e00..8deb7133c65 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -46,8 +46,8 @@ constexpr int MAX_DICT_BITS = 24; constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1; // level decode buffer size. -// at size 4096, each page requires 32kb of memory -constexpr int LEVEL_DECODE_BUF_SIZE = 4096; +typedef uint16_t level_t; +constexpr int LEVEL_DECODE_BUF_SIZE = 2048; /** * @brief Struct representing an input column in the file. @@ -199,7 +199,7 @@ struct PageInfo { PageNestingDecodeInfo* nesting_decode; // level decode buffers - uint32_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; + level_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; }; /** diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index be3e0d3ce33..8cfbfdc01b5 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -673,12 +673,12 @@ void reader::impl::allocate_level_decode_space() // TODO: this could be made smaller if we ignored dictionary pages and pages with no // repetition data. - size_t const per_page_decode_buf_size = LEVEL_DECODE_BUF_SIZE * 2 * sizeof(uint32_t); + size_t const per_page_decode_buf_size = LEVEL_DECODE_BUF_SIZE * 2 * sizeof(level_t); auto const decode_buf_size = per_page_decode_buf_size * pages.size(); _file_itm_data.level_decode_data = rmm::device_buffer(decode_buf_size, _stream, _mr); // distribute the buffers - uint32_t* buf = static_cast(_file_itm_data.level_decode_data.data()); + level_t* buf = static_cast(_file_itm_data.level_decode_data.data()); for (size_t idx = 0; idx < pages.size(); idx++) { auto& p = pages[idx]; diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index c32233b26c3..ecf2cc33e20 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#pragma once + #include "parquet_gpu.hpp" #include @@ -59,7 +61,7 @@ inline __device__ uint32_t get_vlq32(const uint8_t*& cur, const uint8_t* end) struct rle_batch { uint8_t const* run_start; // start of the run we are part of int run_offset; // value offset of this batch from the start of the run - uint32_t* output; + level_t* output; int level_run; int size; @@ -127,7 +129,7 @@ struct rle_run { int level_run; // level_run header value int remaining; - __device__ __inline__ rle_batch next_batch(uint32_t* const output, int max_size) + __device__ __inline__ rle_batch next_batch(level_t* const output, int max_size) { int batch_len = min(max_size, remaining); int const run_offset = size - remaining; @@ -147,7 +149,7 @@ struct rle_stream { int total_values; int cur_values; - uint32_t* output; + level_t* output; rle_run* runs; int run_index; @@ -164,7 +166,7 @@ struct rle_stream { uint8_t const* _start, uint8_t const* _end, int _max_output_values, - uint32_t* _output, + level_t* _output, int _total_values) { level_bits = _level_bits; From 51624c8b155c9a5e713cce8500d148ed933514b8 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 3 May 2023 13:45:39 -0700 Subject: [PATCH 029/114] refactor to remove string decoding code from page_data.cu --- cpp/CMakeLists.txt | 1 + cpp/src/io/parquet/page_data.cu | 2148 +--------------------- cpp/src/io/parquet/page_decode.cuh | 1197 ++++++++++++ cpp/src/io/parquet/page_string_decode.cu | 863 +++++++++ 4 files changed, 2125 insertions(+), 2084 deletions(-) create mode 100644 cpp/src/io/parquet/page_decode.cuh create mode 100644 cpp/src/io/parquet/page_string_decode.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6d9986178d1..1c8426fc80a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -377,6 +377,7 @@ add_library( src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu src/io/parquet/page_hdr.cu + src/io/parquet/page_string_decode.cu src/io/parquet/reader.cpp src/io/parquet/reader_impl.cpp src/io/parquet/reader_impl_helpers.cpp diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index f407d392f5b..cffce2aef1a 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -14,33 +14,11 @@ * limitations under the License. 
*/ -#include "parquet_gpu.hpp" -#include "rle_stream.cuh" -#include +#include "page_decode.cuh" + #include -#include -#include -#include #include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include namespace cudf { namespace io { @@ -49,458 +27,6 @@ namespace gpu { namespace { -constexpr int preprocess_block_size = num_rle_stream_decode_threads; // 512 -constexpr int decode_block_size = 128; -constexpr int non_zero_buffer_size = decode_block_size * 2; -constexpr int rolling_index(int index) { return index & (non_zero_buffer_size - 1); } -template -constexpr int rolling_lvl_index(int index) -{ - return index % lvl_buf_size; -} - -struct page_state_s { - const uint8_t* data_start; - const uint8_t* data_end; - const uint8_t* lvl_end; - const uint8_t* dict_base; // ptr to dictionary page data - int32_t dict_size; // size of dictionary data - int32_t first_row; // First row in page to output - int32_t num_rows; // Rows in page to decode (including rows to be skipped) - int32_t first_output_value; // First value in page to output - int32_t num_input_values; // total # of input/level values in the page - int32_t dtype_len; // Output data type length - int32_t dtype_len_in; // Can be larger than dtype_len if truncating 32-bit into 8-bit - int32_t dict_bits; // # of bits to store dictionary indices - uint32_t dict_run; - int32_t dict_val; - uint32_t initial_rle_run[NUM_LEVEL_TYPES]; // [def,rep] - int32_t initial_rle_value[NUM_LEVEL_TYPES]; // [def,rep] - int32_t error; - PageInfo page; - ColumnChunkDesc col; - - // (leaf) value decoding - int32_t nz_count; // number of valid entries in nz_idx (write position in circular buffer) - int32_t dict_pos; // write position of dictionary indices - int32_t src_pos; // input read position of final output value - int32_t ts_scale; // timestamp scale: <0: divide by -ts_scale, >0: multiply by ts_scale - - // repetition/definition level decoding - int32_t input_value_count; // how many values of the input we've processed - int32_t input_row_count; // how many rows of the input we've processed - int32_t input_leaf_count; // how many leaf values of the input we've processed - const uint8_t* lvl_start[NUM_LEVEL_TYPES]; // [def,rep] - const uint8_t* abs_lvl_start[NUM_LEVEL_TYPES]; // [def,rep] - const uint8_t* abs_lvl_end[NUM_LEVEL_TYPES]; // [def,rep] - int32_t lvl_count[NUM_LEVEL_TYPES]; // how many of each of the streams we've decoded - int32_t row_index_lower_bound; // lower bound of row indices we should process - - // a shared-memory cache of frequently used data when decoding. The source of this data is - // normally stored in global memory which can yield poor performance. So, when possible - // we copy that info here prior to decoding - PageNestingDecodeInfo nesting_decode_cache[max_cacheable_nesting_decode_info]; - // points to either nesting_decode_cache above when possible, or to the global source otherwise - PageNestingDecodeInfo* nesting_info; -}; - -// buffers only used in the decode kernel. separated from page_state_s to keep -// shared memory usage in other kernels (eg, gpuComputePageSizes) down. 
-struct page_state_buffers_s { - uint32_t nz_idx[non_zero_buffer_size]; // circular buffer of non-null value positions - uint32_t dict_idx[non_zero_buffer_size]; // Dictionary index, boolean, or string offset values - uint32_t str_len[non_zero_buffer_size]; // String length for plain encoding of strings -}; - -// stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. -// copies from src to dst in 16B chunks per thread. -__device__ void wideStrcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) -{ - using cudf::detail::warp_size; - using cudf::strings::detail::load_uint4; - - constexpr size_t out_datatype_size = sizeof(uint4); - constexpr size_t in_datatype_size = sizeof(uint); - - auto const alignment_offset = reinterpret_cast(dst) % out_datatype_size; - uint4* out_chars_aligned = reinterpret_cast(dst - alignment_offset); - auto const in_start = src; - - // Both `out_start_aligned` and `out_end_aligned` are indices into `dst`. - // `out_start_aligned` is the first 16B aligned memory location after `dst + 4`. - // `out_end_aligned` is the last 16B aligned memory location before `len - 4`. Characters - // between `[out_start_aligned, out_end_aligned)` will be copied using uint4. - // `dst + 4` and `len - 4` are used instead of `dst` and `len` to avoid - // `load_uint4` reading beyond string boundaries. - // use signed int since out_end_aligned can be negative. - int64_t out_start_aligned = (in_datatype_size + alignment_offset + out_datatype_size - 1) / - out_datatype_size * out_datatype_size - - alignment_offset; - int64_t out_end_aligned = - (len - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size - - alignment_offset; - - for (int64_t ichar = out_start_aligned + lane_id * out_datatype_size; ichar < out_end_aligned; - ichar += warp_size * out_datatype_size) { - *(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) = - load_uint4((const char*)in_start + ichar); - } - - // Tail logic: copy characters of the current string outside - // `[out_start_aligned, out_end_aligned)`. - if (out_end_aligned <= out_start_aligned) { - // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the - // entire string. - for (int64_t ichar = lane_id; ichar < len; ichar += warp_size) { - dst[ichar] = in_start[ichar]; - } - } else { - // Copy characters in range `[0, out_start_aligned)`. - if (lane_id < out_start_aligned) { dst[lane_id] = in_start[lane_id]; } - // Copy characters in range `[out_end_aligned, len)`. 
- int64_t ichar = out_end_aligned + lane_id; - if (ichar < len) { dst[ichar] = in_start[ichar]; } - } -} - -// data parallel strcpy -__device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) -{ - using cudf::detail::warp_size; - if (len > 64) { - wideStrcpy(dst, src, len, lane_id); - } else { - for (int i = lane_id; i < len; i += warp_size) { - dst[i] = src[i]; - } - } -} - -/** - * @brief Returns whether or not a page spans either the beginning or the end of the - * specified row bounds - * - * @param s The page to be checked - * @param start_row The starting row index - * @param num_rows The number of rows - * - * @return True if the page spans the beginning or the end of the row bounds - */ -inline __device__ bool is_bounds_page(page_state_s* const s, size_t start_row, size_t num_rows) -{ - size_t const page_begin = s->col.start_row + s->page.chunk_row; - size_t const page_end = page_begin + s->page.num_rows; - size_t const begin = start_row; - size_t const end = start_row + num_rows; - - return ((page_begin < begin && page_end > begin) || (page_begin < end && page_end > end)); -} - -/** - * @brief Returns whether or not a page is completely contained within the specified - * row bounds - * - * @param s The page to be checked - * @param start_row The starting row index - * @param num_rows The number of rows - * - * @return True if the page is completely contained within the row bounds - */ -inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row, size_t num_rows) -{ - size_t const page_begin = s->col.start_row + s->page.chunk_row; - size_t const page_end = page_begin + s->page.num_rows; - size_t const begin = start_row; - size_t const end = start_row + num_rows; - - return page_begin >= begin && page_end <= end; -} - -/** - * @brief Parse the beginning of the level section (definition or repetition), - * initializes the initial RLE run & value, and returns the section length - * - * @param[in,out] s The page state - * @param[in] cur The current data position - * @param[in] end The end of the data - * @param[in] level_bits The bits required - * - * @return The length of the section - */ -__device__ uint32_t InitLevelSection(page_state_s* s, - const uint8_t* cur, - const uint8_t* end, - level_type lvl, - bool is_decode_step, - rle_stream* decoders) -{ - int32_t len; - int level_bits = s->col.level_bits[lvl]; - Encoding encoding = lvl == level_type::DEFINITION ? s->page.definition_level_encoding - : s->page.repetition_level_encoding; - - auto start = cur; - if (level_bits == 0) { - len = 0; - s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value - s->initial_rle_value[lvl] = 0; - s->lvl_start[lvl] = cur; - s->abs_lvl_start[lvl] = cur; - } else if (encoding == Encoding::RLE) { - // V2 only uses RLE encoding, so only perform check here - if (s->page.def_lvl_bytes || s->page.rep_lvl_bytes) { - len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes; - } else if (cur + 4 < end) { - len = 4 + (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24); - cur += 4; - } else { - len = 0; - s->error = 2; - } - s->abs_lvl_start[lvl] = cur; - if (!s->error) { - uint32_t run = get_vlq32(cur, end); - s->initial_rle_run[lvl] = run; - if (!(run & 1)) { - int v = (cur < end) ? cur[0] : 0; - cur++; - if (level_bits > 8) { - v |= ((cur < end) ? 
cur[0] : 0) << 8; - cur++; - } - s->initial_rle_value[lvl] = v; - } - s->lvl_start[lvl] = cur; - } - - if (cur > end) { s->error = 2; } - } else if (encoding == Encoding::BIT_PACKED) { - len = (s->page.num_input_values * level_bits + 7) >> 3; - s->initial_rle_run[lvl] = ((s->page.num_input_values + 7) >> 3) * 2 + 1; // literal run - s->initial_rle_value[lvl] = 0; - s->lvl_start[lvl] = cur; - s->abs_lvl_start[lvl] = cur; - } else { - s->error = 3; - len = 0; - } - - s->abs_lvl_end[lvl] = start + len; - - return static_cast(len); -} - -/** - * @brief Decode values out of a definition or repetition stream - * - * @param[in,out] s Page state input/output - * @param[in] t target_count Target count of stream values on output - * @param[in] t Warp0 thread ID (0..31) - * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION - */ -__device__ void gpuDecodeStream( - uint32_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl) -{ - const uint8_t* cur_def = s->lvl_start[lvl]; - const uint8_t* end = s->lvl_end; - uint32_t level_run = s->initial_rle_run[lvl]; - int32_t level_val = s->initial_rle_value[lvl]; - int level_bits = s->col.level_bits[lvl]; - int32_t num_input_values = s->num_input_values; - int32_t value_count = s->lvl_count[lvl]; - int32_t batch_coded_count = 0; - - while (value_count < target_count && value_count < num_input_values) { - int batch_len; - if (level_run <= 1) { - // Get a new run symbol from the byte stream - int sym_len = 0; - if (!t) { - const uint8_t* cur = cur_def; - if (cur < end) { level_run = get_vlq32(cur, end); } - if (!(level_run & 1)) { - if (cur < end) level_val = cur[0]; - cur++; - if (level_bits > 8) { - if (cur < end) level_val |= cur[0] << 8; - cur++; - } - } - if (cur > end || level_run <= 1) { s->error = 0x10; } - sym_len = (int32_t)(cur - cur_def); - __threadfence_block(); - } - sym_len = shuffle(sym_len); - level_val = shuffle(level_val); - level_run = shuffle(level_run); - cur_def += sym_len; - } - if (s->error) { break; } - - batch_len = min(num_input_values - value_count, 32); - if (level_run & 1) { - // Literal run - int batch_len8; - batch_len = min(batch_len, (level_run >> 1) * 8); - batch_len8 = (batch_len + 7) >> 3; - if (t < batch_len) { - int bitpos = t * level_bits; - const uint8_t* cur = cur_def + (bitpos >> 3); - bitpos &= 7; - if (cur < end) level_val = cur[0]; - cur++; - if (level_bits > 8 - bitpos && cur < end) { - level_val |= cur[0] << 8; - cur++; - if (level_bits > 16 - bitpos && cur < end) level_val |= cur[0] << 16; - } - level_val = (level_val >> bitpos) & ((1 << level_bits) - 1); - } - level_run -= batch_len8 * 2; - cur_def += batch_len8 * level_bits; - } else { - // Repeated value - batch_len = min(batch_len, level_run >> 1); - level_run -= batch_len * 2; - } - if (t < batch_len) { - int idx = value_count + t; - output[rolling_index(idx)] = level_val; - } - batch_coded_count += batch_len; - value_count += batch_len; - } - - // update the stream info - if (!t) { - s->lvl_start[lvl] = cur_def; - s->initial_rle_run[lvl] = level_run; - s->initial_rle_value[lvl] = level_val; - s->lvl_count[lvl] = value_count; - } -} - -/** - * @brief Performs RLE decoding of dictionary indexes - * - * @param[in,out] s Page state input/output - * @param[out] sb Page state buffer output - * @param[in] target_pos Target index position in dict_idx buffer (may exceed this value by up to - * 31) - * @param[in] t Warp1 thread ID (0..31) - * - * @return A pair containing the new output position, and the total length of strings 
decoded (this - * will only be valid on thread 0 and if sizes_only is true). In the event that this function - * decodes strings beyond target_pos, the total length of strings returned will include these - * additional values. - */ -template -__device__ cuda::std::pair gpuDecodeDictionaryIndices( - volatile page_state_s* s, - [[maybe_unused]] volatile page_state_buffers_s* sb, - int target_pos, - int t) -{ - const uint8_t* end = s->data_end; - int dict_bits = s->dict_bits; - int pos = s->dict_pos; - int str_len = 0; - - while (pos < target_pos) { - int is_literal, batch_len; - if (!t) { - uint32_t run = s->dict_run; - const uint8_t* cur = s->data_start; - if (run <= 1) { - run = (cur < end) ? get_vlq32(cur, end) : 0; - if (!(run & 1)) { - // Repeated value - int bytecnt = (dict_bits + 7) >> 3; - if (cur + bytecnt <= end) { - int32_t run_val = cur[0]; - if (bytecnt > 1) { - run_val |= cur[1] << 8; - if (bytecnt > 2) { - run_val |= cur[2] << 16; - if (bytecnt > 3) { run_val |= cur[3] << 24; } - } - } - s->dict_val = run_val & ((1 << dict_bits) - 1); - } - cur += bytecnt; - } - } - if (run & 1) { - // Literal batch: must output a multiple of 8, except for the last batch - int batch_len_div8; - batch_len = max(min(32, (int)(run >> 1) * 8), 1); - batch_len_div8 = (batch_len + 7) >> 3; - run -= batch_len_div8 * 2; - cur += batch_len_div8 * dict_bits; - } else { - batch_len = max(min(32, (int)(run >> 1)), 1); - run -= batch_len * 2; - } - s->dict_run = run; - s->data_start = cur; - is_literal = run & 1; - __threadfence_block(); - } - __syncwarp(); - is_literal = shuffle(is_literal); - batch_len = shuffle(batch_len); - - // compute dictionary index. - int dict_idx = 0; - if (t < batch_len) { - dict_idx = s->dict_val; - if (is_literal) { - int32_t ofs = (t - ((batch_len + 7) & ~7)) * dict_bits; - const uint8_t* p = s->data_start + (ofs >> 3); - ofs &= 7; - if (p < end) { - uint32_t c = 8 - ofs; - dict_idx = (*p++) >> ofs; - if (c < dict_bits && p < end) { - dict_idx |= (*p++) << c; - c += 8; - if (c < dict_bits && p < end) { - dict_idx |= (*p++) << c; - c += 8; - if (c < dict_bits && p < end) { dict_idx |= (*p++) << c; } - } - } - dict_idx &= (1 << dict_bits) - 1; - } - } - - // if we're not computing sizes, store off the dictionary index - if constexpr (!sizes_only) { sb->dict_idx[rolling_index(pos + t)] = dict_idx; } - } - - // if we're computing sizes, add the length(s) - if constexpr (sizes_only) { - int const len = [&]() { - if (t >= batch_len || (pos + t >= target_pos)) { return 0; } - uint32_t const dict_pos = (s->dict_bits > 0) ? dict_idx * sizeof(string_index_pair) : 0; - if (dict_pos < (uint32_t)s->dict_size) { - const auto* src = reinterpret_cast(s->dict_base + dict_pos); - return src->second; - } - return 0; - }(); - - using WarpReduce = cub::WarpReduce; - __shared__ typename WarpReduce::TempStorage temp_storage; - // note: str_len will only be valid on thread 0. 
- str_len += WarpReduce(temp_storage).Sum(len); - } - - pos += batch_len; - } - return {pos, str_len}; -} - /** * @brief Performs RLE decoding of dictionary indexes, for when dict_size=1 * @@ -568,92 +94,6 @@ __device__ int gpuDecodeRleBooleans(volatile page_state_s* s, return pos; } -/** - * @brief Parses the length and position of strings and returns total length of all strings - * processed - * - * @param[in,out] s Page state input/output - * @param[out] sb Page state buffer output - * @param[in] target_pos Target output position - * @param[in] t Thread ID - * - * @return Total length of strings processed - */ -template -__device__ size_type gpuInitStringDescriptors(volatile page_state_s* s, - [[maybe_unused]] volatile page_state_buffers_s* sb, - int target_pos, - int t) -{ - int pos = s->dict_pos; - int total_len = 0; - - // This step is purely serial - if (!t) { - const uint8_t* cur = s->data_start; - int dict_size = s->dict_size; - int k = s->dict_val; - - while (pos < target_pos) { - int len; - if (k + 4 <= dict_size) { - len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); - k += 4; - if (k + len > dict_size) { len = 0; } - } else { - len = 0; - } - if constexpr (!sizes_only) { - sb->dict_idx[rolling_index(pos)] = k; - sb->str_len[rolling_index(pos)] = len; - } - k += len; - total_len += len; - pos++; - } - s->dict_val = k; - __threadfence_block(); - } - - return total_len; -} - -/** - * @brief Retrieves string information for a string at the specified source position - * - * @param[in] s Page state input - * @param[out] sb Page state buffer output - * @param[in] src_pos Source position - * - * @return A pair containing a pointer to the string and its length - */ -inline __device__ cuda::std::pair gpuGetStringData( - volatile page_state_s* s, volatile page_state_buffers_s* sb, int src_pos) -{ - const char* ptr = nullptr; - size_t len = 0; - - if (s->dict_base) { - // String dictionary - uint32_t dict_pos = - (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] * sizeof(string_index_pair) : 0; - if (dict_pos < (uint32_t)s->dict_size) { - const auto* src = reinterpret_cast(s->dict_base + dict_pos); - ptr = src->first; - len = src->second; - } - } else { - // Plain encoding - uint32_t dict_pos = sb->dict_idx[rolling_index(src_pos)]; - if (dict_pos <= (uint32_t)s->dict_size) { - ptr = reinterpret_cast(s->data_start + dict_pos); - len = sb->str_len[rolling_index(src_pos)]; - } - } - - return {ptr, len}; -} - /** * @brief Output a string descriptor * @@ -1029,749 +469,76 @@ static __device__ void gpuOutputGeneric( } /** - * @brief Sets up block-local page state information from the global pages. + * @brief Returns the total size in bytes of string char data in the page. + * + * This function expects the dictionary position to be at 0 and will traverse + * the entire thing. + * + * Operates on a single warp only. Expects t < 32 * - * @param[in, out] s The local page state to be filled in - * @param[in] p The global page to be copied from - * @param[in] chunks The global list of chunks - * @param[in] min_row Crop all rows below min_row - * @param[in] num_rows Maximum number of rows to read - * @param[in] is_decode_step If we are setting up for the decode step (instead of the preprocess) - * @param[in] decoders rle_stream decoders which will be used for decoding levels. Optional. 
- * Currently only used by gpuComputePageSizes step) + * @param s The local page info + * @param t Thread index */ -static __device__ bool setupLocalPageInfo(page_state_s* const s, - PageInfo const* p, - device_span chunks, - size_t min_row, - size_t num_rows, - bool is_decode_step, - rle_stream* decoders = nullptr) +__device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) { - int t = threadIdx.x; - int chunk_idx; - - // Fetch page info - if (!t) { - s->page = *p; - s->nesting_info = nullptr; + size_type target_pos = s->num_input_values; + size_type str_len = 0; + if (s->dict_base) { + auto const [new_target_pos, len] = gpuDecodeDictionaryIndices(s, nullptr, target_pos, t); + target_pos = new_target_pos; + str_len = len; + } else if ((s->col.data_type & 7) == BYTE_ARRAY) { + str_len = gpuInitStringDescriptors(s, nullptr, target_pos, t); } - __syncthreads(); + if (!t) { *(volatile int32_t*)&s->dict_pos = target_pos; } + return str_len; +} - if (s->page.flags & PAGEINFO_FLAGS_DICTIONARY) { return false; } - // Fetch column chunk info - chunk_idx = s->page.chunk_idx; - if (!t) { s->col = chunks[chunk_idx]; } +/** + * @brief Update output column sizes for every nesting level based on a batch + * of incoming decoded definition and repetition level values. + * + * If bounds_set is true, computes skipped_values and skipped_leaf_values for the + * page to indicate where we need to skip to based on min/max row. + * + * Operates at the block level. + * + * @param s The local page info + * @param target_value_count The target value count to process up to + * @param rep Repetition level buffer + * @param def Definition level buffer + * @param t Thread index + * @param bounds_set A boolean indicating whether or not min/max row bounds have been set + */ +template +static __device__ void gpuUpdatePageSizes(page_state_s* s, + int target_value_count, + uint32_t const* const rep, + uint32_t const* const def, + int t, + bool bounds_set) +{ + // max nesting depth of the column + int const max_depth = s->col.max_nesting_depth; - // if we can use the nesting decode cache, set it up now - auto const can_use_decode_cache = s->page.nesting_info_size <= max_cacheable_nesting_decode_info; - if (can_use_decode_cache) { - int depth = 0; - while (depth < s->page.nesting_info_size) { - int const thread_depth = depth + t; - if (thread_depth < s->page.nesting_info_size) { - // these values need to be copied over from global - s->nesting_decode_cache[thread_depth].max_def_level = - s->page.nesting_decode[thread_depth].max_def_level; - s->nesting_decode_cache[thread_depth].page_start_value = - s->page.nesting_decode[thread_depth].page_start_value; - s->nesting_decode_cache[thread_depth].start_depth = - s->page.nesting_decode[thread_depth].start_depth; - s->nesting_decode_cache[thread_depth].end_depth = - s->page.nesting_decode[thread_depth].end_depth; - } - depth += blockDim.x; - } - } - if (!t) { - s->nesting_info = can_use_decode_cache ? 
s->nesting_decode_cache : s->page.nesting_decode; - } + constexpr int num_warps = preprocess_block_size / 32; + constexpr int max_batch_size = num_warps * 32; - __syncthreads(); + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; - // zero counts - int depth = 0; - while (depth < s->page.num_output_nesting_levels) { - int const thread_depth = depth + t; - if (thread_depth < s->page.num_output_nesting_levels) { - s->nesting_info[thread_depth].valid_count = 0; - s->nesting_info[thread_depth].value_count = 0; - s->nesting_info[thread_depth].null_count = 0; - } - depth += blockDim.x; - } - __syncthreads(); - - if (!t) { - s->error = 0; - - // our starting row (absolute index) is - // col.start_row == absolute row index - // page.chunk-row == relative row index within the chunk - size_t page_start_row = s->col.start_row + s->page.chunk_row; - - // IMPORTANT : nested schemas can have 0 rows in a page but still have - // values. The case is: - // - On page N-1, the last row starts, with 2/6 values encoded - // - On page N, the remaining 4/6 values are encoded, but there are no new rows. - // if (s->page.num_input_values > 0 && s->page.num_rows > 0) { - if (s->page.num_input_values > 0) { - uint8_t* cur = s->page.page_data; - uint8_t* end = cur + s->page.uncompressed_page_size; - - uint32_t dtype_len_out = s->col.data_type >> 3; - s->ts_scale = 0; - // Validate data type - auto const data_type = s->col.data_type & 7; - switch (data_type) { - case BOOLEAN: - s->dtype_len = 1; // Boolean are stored as 1 byte on the output - break; - case INT32: [[fallthrough]]; - case FLOAT: s->dtype_len = 4; break; - case INT64: - if (s->col.ts_clock_rate) { - int32_t units = 0; - // Duration types are not included because no scaling is done when reading - if (s->col.converted_type == TIMESTAMP_MILLIS) { - units = cudf::timestamp_ms::period::den; - } else if (s->col.converted_type == TIMESTAMP_MICROS) { - units = cudf::timestamp_us::period::den; - } else if (s->col.logical_type.TIMESTAMP.unit.isset.NANOS) { - units = cudf::timestamp_ns::period::den; - } - if (units and units != s->col.ts_clock_rate) { - s->ts_scale = (s->col.ts_clock_rate < units) ? 
-(units / s->col.ts_clock_rate) - : (s->col.ts_clock_rate / units); - } - } - [[fallthrough]]; - case DOUBLE: s->dtype_len = 8; break; - case INT96: s->dtype_len = 12; break; - case BYTE_ARRAY: - if (s->col.converted_type == DECIMAL) { - auto const decimal_precision = s->col.decimal_precision; - s->dtype_len = [decimal_precision]() { - if (decimal_precision <= MAX_DECIMAL32_PRECISION) { - return sizeof(int32_t); - } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { - return sizeof(int64_t); - } else { - return sizeof(__int128_t); - } - }(); - } else { - s->dtype_len = sizeof(string_index_pair); - } - break; - default: // FIXED_LEN_BYTE_ARRAY: - s->dtype_len = dtype_len_out; - s->error |= (s->dtype_len <= 0); - break; - } - // Special check for downconversions - s->dtype_len_in = s->dtype_len; - if (s->col.converted_type == DECIMAL && data_type == FIXED_LEN_BYTE_ARRAY) { - s->dtype_len = [dtype_len = s->dtype_len]() { - if (dtype_len <= sizeof(int32_t)) { - return sizeof(int32_t); - } else if (dtype_len <= sizeof(int64_t)) { - return sizeof(int64_t); - } else { - return sizeof(__int128_t); - } - }(); - } else if (data_type == INT32) { - if (dtype_len_out == 1) { - // INT8 output - s->dtype_len = 1; - } else if (dtype_len_out == 2) { - // INT16 output - s->dtype_len = 2; - } else if (s->col.converted_type == TIME_MILLIS) { - // INT64 output - s->dtype_len = 8; - } - } else if (data_type == BYTE_ARRAY && dtype_len_out == 4) { - s->dtype_len = 4; // HASH32 output - } else if (data_type == INT96) { - s->dtype_len = 8; // Convert to 64-bit timestamp - } - - // NOTE: s->page.num_rows, s->col.chunk_row, s->first_row and s->num_rows will be - // invalid/bogus during first pass of the preprocess step for nested types. this is ok - // because we ignore these values in that stage. - { - auto const max_row = min_row + num_rows; - - // if we are totally outside the range of the input, do nothing - if ((page_start_row > max_row) || (page_start_row + s->page.num_rows < min_row)) { - s->first_row = 0; - s->num_rows = 0; - } - // otherwise - else { - s->first_row = page_start_row >= min_row ? 0 : min_row - page_start_row; - auto const max_page_rows = s->page.num_rows - s->first_row; - s->num_rows = (page_start_row + s->first_row) + max_page_rows <= max_row - ? max_page_rows - : max_row - (page_start_row + s->first_row); - } - } - - // during the decoding step we need to offset the global output buffers - // for each level of nesting so that we write to the section this page - // is responsible for. - // - for flat schemas, we can do this directly by using row counts - // - for nested schemas, these offsets are computed during the preprocess step - // - // NOTE: in a chunked read situation, s->col.column_data_base and s->col.valid_map_base - // will be aliased to memory that has been freed when we get here in the non-decode step, so - // we cannot check against nullptr. we'll just check a flag directly. - if (is_decode_step) { - int max_depth = s->col.max_nesting_depth; - for (int idx = 0; idx < max_depth; idx++) { - PageNestingDecodeInfo* nesting_info = &s->nesting_info[idx]; - - size_t output_offset; - // schemas without lists - if (s->col.max_level[level_type::REPETITION] == 0) { - output_offset = page_start_row >= min_row ? 
page_start_row - min_row : 0; - } - // for schemas with lists, we've already got the exact value precomputed - else { - output_offset = nesting_info->page_start_value; - } - - if (s->col.column_data_base != nullptr) { - nesting_info->data_out = static_cast(s->col.column_data_base[idx]); - nesting_info->string_out = static_cast(s->col.column_string_base[idx]); - - nesting_info->data_out = static_cast(s->col.column_data_base[idx]); - - if (nesting_info->data_out != nullptr) { - // anything below max depth with a valid data pointer must be a list, so the - // element size is the size of the offset type. - uint32_t len = idx < max_depth - 1 ? sizeof(cudf::size_type) : s->dtype_len; - // if this is a string column, then dtype_len is a lie. data will be offsets rather - // than (ptr,len) tuples. - if (data_type == BYTE_ARRAY && s->dtype_len != 4) { len = sizeof(cudf::size_type); } - nesting_info->data_out += (output_offset * len); - } - if (nesting_info->string_out != nullptr) { - nesting_info->string_out += s->page.str_offset; - } - nesting_info->valid_map = s->col.valid_map_base[idx]; - if (nesting_info->valid_map != nullptr) { - nesting_info->valid_map += output_offset >> 5; - nesting_info->valid_map_offset = (int32_t)(output_offset & 0x1f); - } - } - } - } - s->first_output_value = 0; - - // Find the compressed size of repetition levels - cur += InitLevelSection(s, cur, end, level_type::REPETITION, is_decode_step, decoders); - // Find the compressed size of definition levels - cur += InitLevelSection(s, cur, end, level_type::DEFINITION, is_decode_step, decoders); - - s->dict_bits = 0; - s->dict_base = nullptr; - s->dict_size = 0; - // NOTE: if additional encodings are supported in the future, modifications must - // be made to is_supported_encoding() in reader_impl_preprocess.cu - switch (s->page.encoding) { - case Encoding::PLAIN_DICTIONARY: - case Encoding::RLE_DICTIONARY: - // RLE-packed dictionary indices, first byte indicates index length in bits - if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) { - // String dictionary: use index - s->dict_base = reinterpret_cast(s->col.str_dict_index); - s->dict_size = s->col.page_info[0].num_input_values * sizeof(string_index_pair); - } else { - s->dict_base = - s->col.page_info[0].page_data; // dictionary is always stored in the first page - s->dict_size = s->col.page_info[0].uncompressed_page_size; - } - s->dict_run = 0; - s->dict_val = 0; - s->dict_bits = (cur < end) ? *cur++ : 0; - if (s->dict_bits > 32 || !s->dict_base) { s->error = (10 << 8) | s->dict_bits; } - break; - case Encoding::PLAIN: - s->dict_size = static_cast(end - cur); - s->dict_val = 0; - if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; } - break; - case Encoding::RLE: s->dict_run = 0; break; - default: - s->error = 1; // Unsupported encoding - break; - } - if (cur > end) { s->error = 1; } - s->lvl_end = cur; - s->data_start = cur; - s->data_end = end; - } else { - s->error = 1; - } - - s->lvl_count[level_type::REPETITION] = 0; - s->lvl_count[level_type::DEFINITION] = 0; - s->nz_count = 0; - s->num_input_values = s->page.num_input_values; - s->dict_pos = 0; - s->src_pos = 0; - - // for flat hierarchies, we can't know how many leaf values to skip unless we do a full - // preprocess of the definition levels (since nulls will have no actual decodable value, there - // is no direct correlation between # of rows and # of decodable values). 
so we will start - // processing at the beginning of the value stream and disregard any indices that start - // before the first row. - if (s->col.max_level[level_type::REPETITION] == 0) { - s->page.skipped_values = 0; - s->page.skipped_leaf_values = 0; - s->input_value_count = 0; - s->input_row_count = 0; - s->input_leaf_count = 0; - - s->row_index_lower_bound = -1; - } - // for nested hierarchies, we have run a preprocess that lets us skip directly to the values - // we need to start decoding at - else { - // input_row_count translates to "how many rows we have processed so far", so since we are - // skipping directly to where we want to start decoding, set it to first_row - s->input_row_count = s->first_row; - - // return the lower bound to compare (page-relative) thread row index against. Explanation: - // In the case of nested schemas, rows can span page boundaries. That is to say, - // we can encounter the first value for row X on page M, but the last value for page M - // might not be the last value for row X. page M+1 (or further) may contain the last value. - // - // This means that the first values we encounter for a given page (M+1) may not belong to the - // row indicated by chunk_row, but to the row before it that spanned page boundaries. If that - // previous row is within the overall row bounds, include the values by allowing relative row - // index -1 - int const max_row = (min_row + num_rows) - 1; - if (min_row < page_start_row && max_row >= page_start_row - 1) { - s->row_index_lower_bound = -1; - } else { - s->row_index_lower_bound = s->first_row; - } - - // if we're in the decoding step, jump directly to the first - // value we care about - if (is_decode_step) { - s->input_value_count = s->page.skipped_values > -1 ? s->page.skipped_values : 0; - } else { - s->input_value_count = 0; - s->input_leaf_count = 0; - s->page.skipped_values = - -1; // magic number to indicate it hasn't been set for use inside UpdatePageSizes - s->page.skipped_leaf_values = 0; - } - } - - __threadfence_block(); - } - __syncthreads(); - - return true; -} - -/** - * @brief Store a validity mask containing value_count bits into the output validity buffer of the - * page. - * - * @param[in,out] nesting_info The page/nesting information to store the mask in. The validity map - * offset is also updated - * @param[in] valid_mask The validity mask to be stored - * @param[in] value_count # of bits in the validity mask - */ -static __device__ void store_validity(PageNestingDecodeInfo* nesting_info, - uint32_t valid_mask, - int32_t value_count) -{ - int word_offset = nesting_info->valid_map_offset / 32; - int bit_offset = nesting_info->valid_map_offset % 32; - // if we fit entirely in the output word - if (bit_offset + value_count <= 32) { - auto relevant_mask = static_cast((static_cast(1) << value_count) - 1); - - if (relevant_mask == ~0) { - nesting_info->valid_map[word_offset] = valid_mask; - } else { - atomicAnd(nesting_info->valid_map + word_offset, ~(relevant_mask << bit_offset)); - atomicOr(nesting_info->valid_map + word_offset, (valid_mask & relevant_mask) << bit_offset); - } - } - // we're going to spill over into the next word. - // note : writing both values here is the lazy/slow way. we could be writing just - // the first word and rolling the remaining bits over into the next call. - // however, some basic performance tests shows almost no difference between these two - // methods. More detailed performance testing might be worthwhile here. 
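// Concretely (illustrative values): with valid_map_offset = 28 and
// value_count = 8, bit_offset = 28 and bits_left = 4. Word 0 receives the
// low 4 bits of valid_mask (relevant_mask = 0xF) at bit positions 28..31;
// word 1 receives bits 4..7 of valid_mask, shifted down by bits_left into
// bit positions 0..3 of the next word.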
- else { - uint32_t bits_left = 32 - bit_offset; - - // first word. strip bits_left bits off the beginning and store that - uint32_t relevant_mask = ((1 << bits_left) - 1); - uint32_t mask_word0 = valid_mask & relevant_mask; - atomicAnd(nesting_info->valid_map + word_offset, ~(relevant_mask << bit_offset)); - atomicOr(nesting_info->valid_map + word_offset, mask_word0 << bit_offset); - - // second word. strip the remainder of the bits off the end and store that - relevant_mask = ((1 << (value_count - bits_left)) - 1); - uint32_t mask_word1 = valid_mask & (relevant_mask << bits_left); - atomicAnd(nesting_info->valid_map + word_offset + 1, ~(relevant_mask)); - atomicOr(nesting_info->valid_map + word_offset + 1, mask_word1 >> bits_left); - } - - nesting_info->valid_map_offset += value_count; -} - -/** - * @brief Compute the nesting bounds within the hierarchy to add values to, and the definition level - * D to which we should considered them null or not. - * - * @param[out] start_depth The start nesting depth - * @param[out] end_depth The end nesting depth (inclusive) - * @param[out] d The definition level up to which added values are not-null. if t is out of bounds, - * d will be -1 - * @param[in] s Local page information - * @param[in] rep Repetition level buffer - * @param[in] def Definition level buffer - * @param[in] input_value_count The current count of input level values we have processed - * @param[in] target_input_value_count The desired # of input level values we want to process - * @param[in] t Thread index - */ -template -inline __device__ void get_nesting_bounds(int& start_depth, - int& end_depth, - int& d, - page_state_s* s, - uint32_t const* const rep, - uint32_t const* const def, - int input_value_count, - int32_t target_input_value_count, - int t) -{ - start_depth = -1; - end_depth = -1; - d = -1; - if (input_value_count + t < target_input_value_count) { - int index = rolling_lvl_index(input_value_count + t); - d = def[index]; - // if we have repetition (there are list columns involved) we have to - // bound what nesting levels we apply values to - if (s->col.max_level[level_type::REPETITION] > 0) { - int r = rep[index]; - start_depth = s->nesting_info[r].start_depth; - end_depth = s->nesting_info[d].end_depth; - } - // for columns without repetition (even ones involving structs) we always - // traverse the entire hierarchy. - else { - start_depth = 0; - end_depth = s->col.max_nesting_depth - 1; - } - } -} - -/** - * @brief Process a batch of incoming repetition/definition level values and generate - * validity, nested column offsets (where appropriate) and decoding indices. 
- * - * @param[in] target_input_value_count The # of repetition/definition levels to process up to - * @param[in] s Local page information - * @param[out] sb Page state buffer output - * @param[in] rep Repetition level buffer - * @param[in] def Definition level buffer - * @param[in] t Thread index - */ -template -static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, - page_state_s* s, - page_state_buffers_s* sb, - uint32_t const* const rep, - uint32_t const* const def, - int t) -{ - // max nesting depth of the column - int const max_depth = s->col.max_nesting_depth; - bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; - // how many (input) values we've processed in the page so far - int input_value_count = s->input_value_count; - // how many rows we've processed in the page so far - int input_row_count = s->input_row_count; - - PageNestingDecodeInfo* nesting_info_base = s->nesting_info; - - // process until we've reached the target - while (input_value_count < target_input_value_count) { - // determine the nesting bounds for this thread (the range of nesting depths we - // will generate new value indices and validity bits for) - int start_depth, end_depth, d; - get_nesting_bounds( - start_depth, end_depth, d, s, rep, def, input_value_count, target_input_value_count, t); - - // 4 interesting things to track: - // thread_value_count : # of output values from the view of this thread - // warp_value_count : # of output values for the whole warp - // - // thread_valid_count : # of valid values from the view of this thread - // warp_valid_count : # of valid values for the whole warp - uint32_t thread_value_count, warp_value_count; - uint32_t thread_valid_count, warp_valid_count; - - // track (page-relative) row index for the thread so we can compare against input bounds - // keep track of overall # of rows we've read. - int const is_new_row = start_depth == 0 ? 1 : 0; - uint32_t const warp_row_count_mask = ballot(is_new_row); - int32_t const thread_row_index = - input_row_count + ((__popc(warp_row_count_mask & ((1 << t) - 1)) + is_new_row) - 1); - input_row_count += __popc(warp_row_count_mask); - // is this thread within read row bounds? - int const in_row_bounds = thread_row_index >= s->row_index_lower_bound && - thread_row_index < (s->first_row + s->num_rows) - ? 1 - : 0; - - // compute warp and thread value counts - uint32_t const warp_count_mask = - ballot((0 >= start_depth && 0 <= end_depth) && in_row_bounds ? 1 : 0); - - warp_value_count = __popc(warp_count_mask); - // Note : ((1 << t) - 1) implies "for all threads before me" - thread_value_count = __popc(warp_count_mask & ((1 << t) - 1)); - - // walk from 0 to max_depth - uint32_t next_thread_value_count, next_warp_value_count; - for (int s_idx = 0; s_idx < max_depth; s_idx++) { - PageNestingDecodeInfo* nesting_info = &nesting_info_base[s_idx]; - - // if we are within the range of nesting levels we should be adding value indices for - int const in_nesting_bounds = - ((s_idx >= start_depth && s_idx <= end_depth) && in_row_bounds) ? 1 : 0; - - // everything up to the max_def_level is a non-null value - uint32_t const is_valid = d >= nesting_info->max_def_level && in_nesting_bounds ? 1 : 0; - - // compute warp and thread valid counts - uint32_t const warp_valid_mask = - // for flat schemas, a simple ballot_sync gives us the correct count and bit positions - // because every value in the input matches to a value in the output - !has_repetition - ? 
ballot(is_valid)
-          :
-          // for nested schemas, it's more complicated. This warp will visit 32 incoming values,
-          // however not all of them will necessarily represent a value at this nesting level. so
-          // the validity bit for thread t might actually represent output value t-6. the correct
-          // position for thread t's bit is cur_value_count. for cuda 11 we could use
-          // __reduce_or_sync(), but until then we have to do a warp reduce.
-          WarpReduceOr32(is_valid << thread_value_count);
-
-      thread_valid_count = __popc(warp_valid_mask & ((1 << thread_value_count) - 1));
-      warp_valid_count   = __popc(warp_valid_mask);
-
-      // if this is the value column emit an index for value decoding
-      if (is_valid && s_idx == max_depth - 1) {
-        int const src_pos = nesting_info->valid_count + thread_valid_count;
-        int const dst_pos = nesting_info->value_count + thread_value_count;
-        // nz_idx is a mapping of src buffer indices to destination buffer indices
-        sb->nz_idx[rolling_index(src_pos)] = dst_pos;
-      }
-
-      // compute warp and thread value counts for the -next- nesting level. we need to
-      // do this for nested schemas so that we can emit an offset for the -current- nesting
-      // level. more concretely : the offset for the current nesting level == current length of the
-      // next nesting level
-      if (s_idx < max_depth - 1) {
-        uint32_t const next_warp_count_mask =
-          ballot((s_idx + 1 >= start_depth && s_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0);
-        next_warp_value_count   = __popc(next_warp_count_mask);
-        next_thread_value_count = __popc(next_warp_count_mask & ((1 << t) - 1));
-
-        // if we're -not- at a leaf column and we're within nesting/row bounds
-        // and we have a valid data_out pointer, it implies this is a list column, so
-        // emit an offset.
-        if (in_nesting_bounds && nesting_info->data_out != nullptr) {
-          int const idx             = nesting_info->value_count + thread_value_count;
-          cudf::size_type const ofs = nesting_info_base[s_idx + 1].value_count +
-                                      next_thread_value_count +
-                                      nesting_info_base[s_idx + 1].page_start_value;
-          (reinterpret_cast<cudf::size_type*>(nesting_info->data_out))[idx] = ofs;
-        }
-      }
-
-      // nested schemas always read and write to the same bounds (that is, read and write positions
-      // are already pre-bounded by first_row/num_rows). flat schemas will start reading at the
-      // first value, even if that is before first_row, because we cannot trivially jump to
-      // the correct position to start reading. since we are about to write the validity vector here
-      // we need to adjust our computed mask to take into account the write row bounds.
-      int const in_write_row_bounds =
-        !has_repetition
-          ? thread_row_index >= s->first_row && thread_row_index < (s->first_row + s->num_rows)
-          : in_row_bounds;
-      int const first_thread_in_write_range =
-        !has_repetition ? __ffs(ballot(in_write_row_bounds)) - 1 : 0;
-
-      // # of bits of the validity mask to write out
-      int const warp_valid_mask_bit_count =
-        first_thread_in_write_range < 0 ?
0 : warp_value_count - first_thread_in_write_range; - - // increment count of valid values, count of total values, and update validity mask - if (!t) { - if (nesting_info->valid_map != nullptr && warp_valid_mask_bit_count > 0) { - uint32_t const warp_output_valid_mask = warp_valid_mask >> first_thread_in_write_range; - store_validity(nesting_info, warp_output_valid_mask, warp_valid_mask_bit_count); - - nesting_info->null_count += warp_valid_mask_bit_count - __popc(warp_output_valid_mask); - } - nesting_info->valid_count += warp_valid_count; - nesting_info->value_count += warp_value_count; - } - - // propagate value counts for the next level - warp_value_count = next_warp_value_count; - thread_value_count = next_thread_value_count; - } - - input_value_count += min(32, (target_input_value_count - input_value_count)); - __syncwarp(); - } - - // update - if (!t) { - // update valid value count for decoding and total # of values we've processed - s->nz_count = nesting_info_base[max_depth - 1].valid_count; - s->input_value_count = input_value_count; - s->input_row_count = input_row_count; - } -} - -/** - * @brief Process repetition and definition levels up to the target count of leaf values. - * - * In order to decode actual leaf values from the input stream, we need to generate the - * list of non-null value positions (page_state_s::nz_idx). We do this by processing - * the repetition and definition level streams. This process also generates validity information, - * and offset column values in the case of nested schemas. Because of the way the streams - * are encoded, this function may generate slightly more than target_leaf_count. - * - * Only runs on 1 warp. - * - * @param[in] s The local page state - * @param[out] sb Page state buffer output - * @param[in] target_leaf_count Target count of non-null leaf values to generate indices for - * @param[in] rep Repetition level buffer - * @param[in] def Definition level buffer - * @param[in] t Thread index - */ -template -__device__ void gpuDecodeLevels(page_state_s* s, - page_state_buffers_s* sb, - int32_t target_leaf_count, - uint32_t* const rep, - uint32_t* const def, - int t) -{ - bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; - - constexpr int batch_size = 32; - int cur_leaf_count = target_leaf_count; - while (!s->error && s->nz_count < target_leaf_count && - s->input_value_count < s->num_input_values) { - if (has_repetition) { gpuDecodeStream(rep, s, cur_leaf_count, t, level_type::REPETITION); } - gpuDecodeStream(def, s, cur_leaf_count, t, level_type::DEFINITION); - __syncwarp(); - - // because the rep and def streams are encoded separately, we cannot request an exact - // # of values to be decoded at once. we can only process the lowest # of decoded rep/def - // levels we get. - int actual_leaf_count = has_repetition ? min(s->lvl_count[level_type::REPETITION], - s->lvl_count[level_type::DEFINITION]) - : s->lvl_count[level_type::DEFINITION]; - - // process what we got back - gpuUpdateValidityOffsetsAndRowIndices(actual_leaf_count, s, sb, rep, def, t); - cur_leaf_count = actual_leaf_count + batch_size; - __syncwarp(); - } -} - -/** - * @brief Returns the total size in bytes of string char data in the page. - * - * This function expects the dictionary position to be at 0 and will traverse - * the entire thing. - * - * Operates on a single warp only. 
Expects t < 32 - * - * @param s The local page info - * @param t Thread index - */ -__device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) -{ - size_type target_pos = s->num_input_values; - size_type str_len = 0; - if (s->dict_base) { - auto const [new_target_pos, len] = gpuDecodeDictionaryIndices(s, nullptr, target_pos, t); - target_pos = new_target_pos; - str_len = len; - } else if ((s->col.data_type & 7) == BYTE_ARRAY) { - str_len = gpuInitStringDescriptors(s, nullptr, target_pos, t); - } - if (!t) { *(volatile int32_t*)&s->dict_pos = target_pos; } - return str_len; -} - -/** - * @brief Update output column sizes for every nesting level based on a batch - * of incoming decoded definition and repetition level values. - * - * If bounds_set is true, computes skipped_values and skipped_leaf_values for the - * page to indicate where we need to skip to based on min/max row. - * - * Operates at the block level. - * - * @param s The local page info - * @param target_value_count The target value count to process up to - * @param rep Repetition level buffer - * @param def Definition level buffer - * @param t Thread index - * @param bounds_set A boolean indicating whether or not min/max row bounds have been set - */ -template -static __device__ void gpuUpdatePageSizes(page_state_s* s, - int target_value_count, - uint32_t const* const rep, - uint32_t const* const def, - int t, - bool bounds_set) -{ - // max nesting depth of the column - int const max_depth = s->col.max_nesting_depth; - - constexpr int num_warps = preprocess_block_size / 32; - constexpr int max_batch_size = num_warps * 32; - - using block_reduce = cub::BlockReduce; - using block_scan = cub::BlockScan; - __shared__ union { - typename block_reduce::TempStorage reduce_storage; - typename block_scan::TempStorage scan_storage; - } temp_storage; - - // how many input level values we've processed in the page so far - int value_count = s->input_value_count; - // how many rows we've processed in the page so far - int row_count = s->input_row_count; - // how many leaf values we've processed in the page so far - int leaf_count = s->input_leaf_count; - // whether or not we need to continue checking for the first row - bool skipped_values_set = s->page.skipped_values >= 0; + // how many input level values we've processed in the page so far + int value_count = s->input_value_count; + // how many rows we've processed in the page so far + int row_count = s->input_row_count; + // how many leaf values we've processed in the page so far + int leaf_count = s->input_leaf_count; + // whether or not we need to continue checking for the first row + bool skipped_values_set = s->page.skipped_values >= 0; while (value_count < target_value_count) { int const batch_size = min(max_batch_size, target_value_count - value_count); @@ -1853,491 +620,6 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, } } -template -__device__ std::pair page_bounds(page_state_s* const s, - size_t min_row, - size_t num_rows, - bool is_bounds_pg, - bool has_repetition, - rle_stream* decoders, - int t) -{ - using block_reduce = cub::BlockReduce; - using block_scan = cub::BlockScan; - __shared__ union { - typename block_reduce::TempStorage reduce_storage; - typename block_scan::TempStorage scan_storage; - } temp_storage; - - // decode batches of level stream data using rle_stream objects and use the results to - // calculate start and end value positions in the encoded string data. 
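// In minimal form, the block-wide consumption pattern used here is the
// following sketch (only the rle_stream calls that appear in this patch --
// init() and decode_next() -- are assumed; `def_runs`, `def_out` and
// `total_values` are illustrative names):
//
//   __shared__ rle_run def_runs[run_buffer_size];
//   rle_stream def_stream{def_runs};
//   def_stream.init(level_bits, lvl_start, lvl_end, max_batch_size, def_out, total_values);
//   int processed = 0;
//   while (processed < total_values) {
//     processed += def_stream.decode_next(t);  // every thread in the block participates
//     __syncthreads();
//     // consume def_out[rolling_lvl_index(i)] for the values in this batch
//   }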
- int const max_depth = s->col.max_nesting_depth; - int const max_def = s->nesting_info[max_depth - 1].max_def_level; - - // can skip all this if we know there are no nulls - if (max_def == 0 && !is_bounds_pg) { - s->page.num_valids = s->num_input_values; - s->page.num_nulls = 0; - return {0, s->num_input_values}; - } - - int start_value = 0; - int end_value = s->page.num_input_values; - auto const pp = &s->page; - auto const col = &s->col; - - // initialize the stream decoders (requires values computed in setupLocalPageInfo) - int const max_batch_size = lvl_buf_size; - uint32_t* def_decode = pp->lvl_decode_buf[level_type::DEFINITION]; - uint32_t* rep_decode = pp->lvl_decode_buf[level_type::REPETITION]; - decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], - s->abs_lvl_start[level_type::DEFINITION], - s->abs_lvl_end[level_type::DEFINITION], - max_batch_size, - def_decode, - s->page.num_input_values); - // only need repetition if this is a bounds page. otherwise all we need is def level info - // to count the nulls. - if (has_repetition && is_bounds_pg) { - decoders[level_type::REPETITION].init(s->col.level_bits[level_type::REPETITION], - s->abs_lvl_start[level_type::REPETITION], - s->abs_lvl_end[level_type::REPETITION], - max_batch_size, - rep_decode, - s->page.num_input_values); - } - - int processed = 0; - - // if this is a bounds page, we need to do extra work to find the start and/or end value index - // TODO calculate num_nulls - if (is_bounds_pg) { - __shared__ int skipped_values; - __shared__ int skipped_leaf_values; - __shared__ int last_input_value; - __shared__ int end_val_idx; - - // need these for skip_rows case - auto const page_start_row = col->start_row + pp->chunk_row; - auto const max_row = min_row + num_rows; - auto const begin_row = page_start_row >= min_row ? 0 : min_row - page_start_row; - auto const max_page_rows = pp->num_rows - begin_row; - auto const page_rows = page_start_row + begin_row + max_page_rows <= max_row - ? 
max_page_rows - : max_row - (page_start_row + begin_row); - auto const end_row = begin_row + page_rows; - - // short circuit for no nulls - if (max_def == 0 && !has_repetition) { return {begin_row, end_row}; } - - int row_count = 0; - int leaf_count = 0; - bool skipped_values_set = false; - bool end_value_set = false; - - while (processed < s->page.num_input_values) { - int start_val = processed; - - if (has_repetition) { - decoders[level_type::REPETITION].decode_next(t); - __syncthreads(); - } - - // the # of rep/def levels will always be the same size - processed += decoders[level_type::DEFINITION].decode_next(t); - __syncthreads(); - - // do something with the level data - while (start_val < processed) { - int idx_t = start_val + t; - int idx = rolling_lvl_index(idx_t); - - // get absolute thread row index - int is_new_row = idx_t < processed && (!has_repetition || rep_decode[idx] == 0); - int thread_row_count, block_row_count; - block_scan(temp_storage.scan_storage) - .InclusiveSum(is_new_row, thread_row_count, block_row_count); - __syncthreads(); - - // get absolute thread leaf index - int const is_new_leaf = idx_t < processed && (def_decode[idx] >= max_def); - int thread_leaf_count, block_leaf_count; - block_scan(temp_storage.scan_storage) - .InclusiveSum(is_new_leaf, thread_leaf_count, block_leaf_count); - __syncthreads(); - - // if we have not set skipped values yet, see if we found the first in-bounds row - if (!skipped_values_set && row_count + block_row_count > begin_row) { - // if this thread is in row bounds - int const row_index = (thread_row_count + row_count) - 1; - int in_row_bounds = - idx_t < processed && (row_index >= begin_row) && (row_index < end_row); - - int local_count, global_count; - block_scan(temp_storage.scan_storage) - .InclusiveSum(in_row_bounds, local_count, global_count); - __syncthreads(); - - // we found it - if (global_count > 0) { - // this is the thread that represents the first row. need to test in_row_bounds for - // the case where we only want one row and local_count == 1 for many threads. - if (local_count == 1 && in_row_bounds) { - skipped_values = idx_t; - skipped_leaf_values = - leaf_count + (is_new_leaf ? thread_leaf_count - 1 : thread_leaf_count); - } - skipped_values_set = true; - } - } - - // test if row_count will exceed end_row in this batch - if (!end_value_set && row_count + block_row_count >= end_row) { - // if this thread exceeds row bounds - int const row_index = (thread_row_count + row_count) - 1; - int exceeds_row_bounds = row_index >= end_row; - - int local_count, global_count; - block_scan(temp_storage.scan_storage) - .InclusiveSum(exceeds_row_bounds, local_count, global_count); - __syncthreads(); - - // we found it - if (global_count > 0) { - // this is the thread that represents the end row. - if (local_count == 1) { - last_input_value = idx_t; - end_val_idx = leaf_count + (is_new_leaf ? thread_leaf_count - 1 : thread_leaf_count); - } - end_value_set = true; - } - } - - row_count += block_row_count; - leaf_count += block_leaf_count; - - start_val += preprocess_block_size; - } - __syncthreads(); - } - - start_value = skipped_values_set ? skipped_leaf_values : 0; - end_value = end_value_set ? end_val_idx : leaf_count; - - if (t == 0) { - int const v0 = skipped_values_set ? skipped_values : 0; - int const vn = end_value_set ? 
last_input_value : s->num_input_values; - int const total_values = vn - v0; - int const total_leaf_values = end_value - start_value; - int const num_nulls = total_values - total_leaf_values; - pp->num_nulls = num_nulls; - pp->num_valids = total_leaf_values; -#if 0 - printf("%05d: input vals in page %d,%d lc %d v0 %d vn %d %d nz %d nc %d\n", - blockIdx.x, - skipped_values_set, - end_value_set, - leaf_count, - v0, - vn, - total_values, - total_leaf_values, - num_nulls); -#endif - } - } - // already filtered out unwanted pages, so need to count all non-null values in this page - else { - int num_nulls = 0; - while (processed < s->page.num_input_values) { - int start_val = processed; - processed += decoders[level_type::DEFINITION].decode_next(t); - __syncthreads(); - - while (start_val < processed) { - int idx_t = start_val + t; - if (idx_t < processed) { - int idx = rolling_lvl_index(idx_t); - if (def_decode[idx] < max_def) { num_nulls++; } - } - start_val += preprocess_block_size; - } - __syncthreads(); - } - - int const null_count = block_reduce(temp_storage.reduce_storage).Sum(num_nulls); - - if (t == 0) { - pp->num_nulls = null_count; - pp->num_valids = pp->num_input_values - null_count; - } - __syncthreads(); - - end_value -= pp->num_nulls; - } - - return {start_value, end_value}; -} - -__device__ size_t countDictEntries(uint8_t const* data, - uint8_t const* dict_base, - int dict_bits, - int dict_size, - int data_size, - int start_value, - int end_value, - int t) -{ - uint8_t const* ptr = data; - uint8_t const* const end = data + data_size; - int const bytecnt = (dict_bits + 7) >> 3; - size_t l_str_len = 0; // partial sums across threads - int pos = 0; // current value index in the data stream - int t0 = 0; // thread 0 for this batch - - int dict_run = 0; - int dict_val = 0; - - while (pos < end_value && ptr <= end) { - if (dict_run <= 1) { - dict_run = (ptr < end) ? get_vlq32(ptr, end) : 0; - if (!(dict_run & 1)) { - // Repeated value - if (ptr + bytecnt <= end) { - int32_t run_val = ptr[0]; - if (bytecnt > 1) { - run_val |= ptr[1] << 8; - if (bytecnt > 2) { - run_val |= ptr[2] << 16; - if (bytecnt > 3) { run_val |= ptr[3] << 24; } - } - } - dict_val = run_val & ((1 << dict_bits) - 1); - } - ptr += bytecnt; - } - } - - int batch_len; - if (dict_run & 1) { - // Literal batch: must output a multiple of 8, except for the last batch - int batch_len_div8; - batch_len = max(min(preprocess_block_size, (int)(dict_run >> 1) * 8), 1); - batch_len_div8 = (batch_len + 7) >> 3; - dict_run -= batch_len_div8 * 2; - ptr += batch_len_div8 * dict_bits; - } else { - batch_len = dict_run >> 1; - dict_run = 0; - } - - int is_literal = dict_run & 1; - // if (t == 0 && blockIdx.x == 1) printf("batch_len %d is_lit %d\n", batch_len, is_literal); - - // calculate my thread id for this batch. way to round-robin the work. - int mytid = t - t0; - if (mytid < 0) mytid += preprocess_block_size; - - // compute dictionary index. 
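// A worked example of the offset arithmetic below (illustrative numbers):
// with dict_bits = 7 and batch_len = 16, `ptr` has already been advanced
// 14 bytes past the literal-run data (batch_len_div8 * dict_bits). For
// mytid = 3, ofs = (3 - 16) * 7 = -91 bits, so this thread starts reading
// at byte -12 (-91 >> 3, arithmetic shift) with a bit offset of 5
// (-91 & 7); that is, lanes index backwards from the end of the run, and
// mytid = 0 lands exactly at its start (-112 bits = -14 bytes).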
- if (is_literal) { - int dict_idx = 0; - if (mytid < batch_len) { - dict_idx = dict_val; - int32_t ofs = (mytid - ((batch_len + 7) & ~7)) * dict_bits; - const uint8_t* p = ptr + (ofs >> 3); - ofs &= 7; - if (p < end) { - uint32_t c = 8 - ofs; - dict_idx = (*p++) >> ofs; - if (c < dict_bits && p < end) { - dict_idx |= (*p++) << c; - c += 8; - if (c < dict_bits && p < end) { - dict_idx |= (*p++) << c; - c += 8; - if (c < dict_bits && p < end) { dict_idx |= (*p++) << c; } - } - } - dict_idx &= (1 << dict_bits) - 1; - } - - if (pos + mytid < end_value) { - uint32_t const dict_pos = (dict_bits > 0) ? dict_idx * sizeof(string_index_pair) : 0; - if (pos + mytid >= start_value && dict_pos < (uint32_t)dict_size) { - const auto* src = reinterpret_cast(dict_base + dict_pos); - l_str_len += src->second; - } - } - } - - t0 += batch_len; - } else { - int start_off = (pos < start_value && pos + batch_len > start_value) ? start_value - pos : 0; - batch_len = min(batch_len, end_value - pos); - if (mytid == 0) { - uint32_t const dict_pos = (dict_bits > 0) ? dict_val * sizeof(string_index_pair) : 0; - if (pos + batch_len > start_value && dict_pos < (uint32_t)dict_size) { - const auto* src = reinterpret_cast(dict_base + dict_pos); - l_str_len += (batch_len - start_off) * src->second; - } - } - - t0 += 1; - } - - t0 = t0 % preprocess_block_size; - pos += batch_len; - } - __syncthreads(); - - using block_reduce = cub::BlockReduce; - __shared__ typename block_reduce::TempStorage reduce_storage; - size_t sum_l = block_reduce(reduce_storage).Sum(l_str_len); - - return sum_l; -} - -__device__ size_t -countPlainEntries(uint8_t const* data, int data_size, int start_value, int end_value, int t) -{ - int pos = 0; - size_t total_len = 0; - - // This step is purely serial - if (!t) { - const uint8_t* cur = data; - int k = 0; - - while (pos < end_value && k < data_size) { - int len; - if (k + 4 <= data_size) { - len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); - k += 4; - if (k + len > data_size) { len = 0; } - } else { - len = 0; - } - - k += len; - if (pos >= start_value) { total_len += len; } - pos++; - } - } - - return total_len; -} - -template -__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( - PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) -{ - __shared__ __align__(16) page_state_s state_g; - - page_state_s* const s = &state_g; - int page_idx = blockIdx.x; - int t = threadIdx.x; - PageInfo* pp = &pages[page_idx]; - - // reset str_bytes to 0 in case it's already been calculated - if (t == 0) { pp->str_bytes = 0; } - - // only count if it's a string column - auto const col = &chunks[pp->chunk_idx]; - uint32_t dtype = col->data_type & 7; - uint32_t dtype_len_out = col->data_type >> 3; - if (dtype != BYTE_ARRAY || dtype_len_out == 4) { return; } - - // whether or not we have repetition levels (lists) - bool has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; - - // the level stream decoders - __shared__ rle_run def_runs[run_buffer_size]; - __shared__ rle_run rep_runs[run_buffer_size]; - rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; - - // setup page info - if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, false, decoders)) { return; } - - if (!t) { - s->page.num_nulls = 0; - s->page.str_bytes = 0; - } - __syncthreads(); - - bool is_bounds_pg = is_bounds_page(s, min_row, num_rows); - - // if we're skipping this page anyway, no need to count it - if (!is_bounds_pg && 
!is_page_contained(s, min_row, num_rows)) { return; } - - // find start/end value indices - auto const [start_value, end_value] = - page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders, t); - - // need to save num_nulls calculated in page_bounds in this page - // FIXME: num_nulls is only correct for !is_bounds_pg...need to fix this - if (t == 0) { - pp->num_nulls = s->page.num_nulls; - pp->num_valids = s->page.num_valids; - } -#if 0 - if (t == 0) - printf( - "%05d: col %d start_val %d end_val %d is_bounds %d is_contained %d (%ld,%ld] (%ld,%ld]\n", - blockIdx.x, - col->src_col_index, - start_value, - end_value, - is_bounds_pg, - is_page_contained(s, min_row, num_rows), - min_row, - min_row + num_rows, - col->start_row + pp->chunk_row, - col->start_row + pp->chunk_row + pp->num_rows); -#endif - - // now process string info in the range [start_value, end_value) - // set up for decoding strings...can be either plain or dictionary - uint8_t const* data = s->data_start; - uint8_t const* const end = s->data_end; - uint8_t const* dict_base = nullptr; - int dict_size = 0; - size_t str_bytes = 0; - - switch (pp->encoding) { - case Encoding::PLAIN_DICTIONARY: - case Encoding::RLE_DICTIONARY: - // RLE-packed dictionary indices, first byte indicates index length in bits - if (col->str_dict_index) { - // String dictionary: use index - dict_base = reinterpret_cast(col->str_dict_index); - dict_size = col->page_info[0].num_input_values * sizeof(string_index_pair); - } else { - dict_base = col->page_info[0].page_data; // dictionary is always stored in the first page - dict_size = col->page_info[0].uncompressed_page_size; - } - - if (s->dict_bits > 32 || !dict_base) { - printf("%03d: error %d %p\n", t, s->dict_bits, dict_base); - CUDF_UNREACHABLE("invalid dictionary bit size"); - } - - str_bytes = countDictEntries( - data, dict_base, s->dict_bits, dict_size, (end - data), start_value, end_value, t); - break; - case Encoding::PLAIN: - dict_size = static_cast(end - data); - str_bytes = is_bounds_pg ? countPlainEntries(data, dict_size, start_value, end_value, t) - : dict_size - sizeof(int) * (pp->num_input_values - pp->num_nulls); - break; - } - - if (t == 0) { - // TODO check for overflow - pp->str_bytes = str_bytes; - // printf("%05d: string size %ld %d\n", blockIdx.x, str_bytes, col->src_col_index); - } -} - /** * @brief Kernel for computing per-page column size information for all nesting levels. 
 *
@@ -2522,26 +804,6 @@ __global__ void __launch_bounds__(preprocess_block_size)
   }
 }
 
-// Copies null counts back to `nesting_decode` at the end of scope
-struct null_count_back_copier {
-  page_state_s* s;
-  int t;
-  __device__ ~null_count_back_copier()
-  {
-    if (s->nesting_info != nullptr and s->nesting_info == s->nesting_decode_cache) {
-      int depth = 0;
-      while (depth < s->page.num_output_nesting_levels) {
-        int const thread_depth = depth + t;
-        if (thread_depth < s->page.num_output_nesting_levels) {
-          s->page.nesting_decode[thread_depth].null_count =
-            s->nesting_decode_cache[thread_depth].null_count;
-        }
-        depth += blockDim.x;
-      }
-    }
-  }
-};
-
 /**
  * @brief Kernel for computing the column data stored in the pages
  *
@@ -2728,272 +990,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData(
   }
 }
 
-template <int lvl_buf_size>
-__global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData(
-  PageInfo* pages, device_span<ColumnChunkDesc const> chunks, size_t min_row, size_t num_rows)
-{
-  __shared__ __align__(16) page_state_s state_g;
-  __shared__ __align__(16) page_state_buffers_s state_buffers;
-  __shared__ __align__(4) size_type last_offset;
-
-  page_state_s* const s          = &state_g;
-  page_state_buffers_s* const sb = &state_buffers;
-  int page_idx                   = blockIdx.x;
-  int t                          = threadIdx.x;
-  int out_thread0;
-  [[maybe_unused]] null_count_back_copier _{s, t};
-
-  if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; }
-
-  bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0;
-
-  if ((s->col.data_type & 7) != BYTE_ARRAY || s->dtype_len == 4) { return; }
-
-  // offsets is global...but the output is local, so account for that below
-  if (t == 0) { last_offset = s->page.str_offset; }
-  __syncthreads();
-
-  // if we have no work to do (eg, in a skip_rows/num_rows case) in this page.
-  //
-  // corner case: in the case of lists, we can have pages that contain "0" rows if the current row
-  // starts before this page and ends after this page:
-  //       P0        P1        P2
-  //  |---------|---------|----------|
-  //        ^------------------^
-  //      row start           row end
-  //  P1 will contain 0 rows
-  //
-  if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) ||
-                                               is_page_contained(s, min_row, num_rows)))) {
-    return;
-  }
-
-  if (s->dict_base) {
-    out_thread0 = (s->dict_bits > 0) ? 64 : 32;
-  } else {
-    out_thread0 =
-      ((s->col.data_type & 7) == BOOLEAN || (s->col.data_type & 7) == BYTE_ARRAY) ? 64 : 32;
-  }
-
-  PageNestingDecodeInfo* nesting_info_base = s->nesting_info;
-
-  __shared__ uint32_t rep[non_zero_buffer_size];  // circular buffer of repetition level values
-  __shared__ uint32_t def[non_zero_buffer_size];  // circular buffer of definition level values
-
-  // skipped_leaf_values will always be 0 for flat hierarchies.
-  uint32_t skipped_leaf_values = s->page.skipped_leaf_values;
-  while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) {
-    int target_pos;
-    int src_pos = s->src_pos;
-
-    if (t < out_thread0) {
-      target_pos = min(src_pos + 2 * (decode_block_size - out_thread0),
-                       s->nz_count + (decode_block_size - out_thread0));
-    } else {
-      target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0);
-      if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); }
-    }
-    __syncthreads();
-    if (t < 32) {
-      // decode repetition and definition levels.
- // - update validity vectors - // - updates offsets (for nested columns) - // - produces non-NULL value indices in s->nz_idx for subsequent decoding - gpuDecodeLevels(s, sb, target_pos, rep, def, t); - } else if (t < out_thread0) { - // skipped_leaf_values will always be 0 for flat hierarchies. - uint32_t src_target_pos = target_pos + skipped_leaf_values; - - // WARP1: Decode dictionary indices, booleans or string positions - if (s->dict_base) { - src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, t & 0x1f).first; - } else if ((s->col.data_type & 7) == BOOLEAN) { - src_target_pos = gpuDecodeRleBooleans(s, sb, src_target_pos, t & 0x1f); - } else if ((s->col.data_type & 7) == BYTE_ARRAY) { - gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); - } - if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } - } else { - // WARP1..WARP3: Decode values - src_pos += t - out_thread0; - - // the position in the output column/buffer - int dst_pos = sb->nz_idx[rolling_index(src_pos)]; - - // for the flat hierarchy case we will be reading from the beginning of the value stream, - // regardless of the value of first_row. so adjust our destination offset accordingly. - // example: - // - user has passed skip_rows = 2, so our first_row to output is 2 - // - the row values we get from nz_idx will be - // 0, 1, 2, 3, 4 .... - // - by shifting these values by first_row, the sequence becomes - // -1, -2, 0, 1, 2 ... - // - so we will end up ignoring the first two input rows, and input rows 2..n will - // get written to the output starting at position 0. - // - if (!has_repetition) { dst_pos -= s->first_row; } - - // need to do this before we branch on src_pos/dst_pos so we don't deadlock - // choose a character parallel string copy when the average string is longer than a warp - auto const use_char_ll = s->page.num_valids > 0 && - (s->page.str_bytes / s->page.num_valids) >= cudf::detail::warp_size; - int const leaf_level_index = s->col.max_nesting_depth - 1; - int const me = t - out_thread0; - - if (me < 32) { - for (int i = 0; i < decode_block_size - out_thread0; i += 32) { - dst_pos = sb->nz_idx[rolling_index(src_pos + i)]; - if (!has_repetition) { dst_pos -= s->first_row; } - - auto [ptr, len] = src_pos + i < target_pos && dst_pos >= 0 - ? gpuGetStringData(s, sb, src_pos + skipped_leaf_values + i) - : cuda::std::pair{nullptr, 0}; - - __shared__ cub::WarpScan::TempStorage temp_storage; - size_type offset; - cub::WarpScan(temp_storage).ExclusiveSum(len, offset); - offset += last_offset; - - if (use_char_ll) { - // TODO: might want separate kernel for string page decoding so we don't waste all - // this shared memory on non-string columns. 
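// The staging arrays below let the warp cooperate on one string at a time:
// each lane publishes its (pointer, offset, destination, length) tuple to
// shared memory, then all 32 lanes walk the staged entries together and
// copy each string character-parallel. The inner copy is, in effect (a
// sketch assuming ll_strcpy from block_utils.cuh behaves as a lane-strided
// memcpy, with me = lane id):
//
//   for (int k = me; k < len; k += 32) { dst[k] = src[k]; }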
- __shared__ __align__(8) uint8_t const* pointers[32]; - __shared__ __align__(4) size_type offsets[32]; - __shared__ __align__(4) int dsts[32]; - __shared__ __align__(4) int lengths[32]; - - offsets[me] = offset; - pointers[me] = reinterpret_cast(ptr); - dsts[me] = dst_pos; - lengths[me] = len; - __syncwarp(); - - for (int ss = 0; ss < 32 && ss + i + s->src_pos < target_pos; ss++) { - if (dsts[ss] >= 0) { - auto offptr = - reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + - dsts[ss]; - *offptr = offsets[ss]; - auto str_ptr = - nesting_info_base[leaf_level_index].string_out + offsets[ss] - s->page.str_offset; - ll_strcpy(str_ptr, pointers[ss], lengths[ss], me); -#if 0 - if (is_bounds_page(s, min_row, num_rows)) { - if (me == 0) - printf("%05d,%03d: src %d dst %d len %d offset %d\n", - blockIdx.x, - me, - src_pos + i + ss, - dsts[ss], - lengths[ss], - offsets[ss]); - } -#endif - } - } - - } else { - if (src_pos + i < target_pos && dst_pos >= 0) { - auto offptr = - reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dst_pos; - *offptr = offset; - auto str_ptr = - nesting_info_base[leaf_level_index].string_out + offset - s->page.str_offset; - memcpy(str_ptr, ptr, len); -#if 0 - if (is_bounds_page(s, min_row, num_rows)) { - printf("%05d,%03d: src %d dst %d len %ld offset %d\n", - blockIdx.x, - t, - src_pos + i, - dst_pos, - len, - offset); - } -#endif - } - __syncwarp(); - } - - if (me == 31) { last_offset = offset + len; } - __syncwarp(); - } - } - - if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; } - } - __syncthreads(); - } - - // if there are nulls and this is a string column, clean up the offsets array. - // but if there's a list parent, then no need. -#if 0 - if ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4) { - int const leaf_level_index = s->col.max_nesting_depth - 1; - if (t == 0 && is_bounds_page(s, min_row, num_rows)) { - printf("%05d: nz %d nulls %d valids %d iv %d nival %d nivalid %d\n", - blockIdx.x, - s->nz_count, - s->page.num_nulls, - s->page.num_valids, - s->num_input_values, - nesting_info_base[leaf_level_index].value_count, - nesting_info_base[leaf_level_index].valid_count); - } - } -#endif - - if (s->page.num_nulls != 0) { - int const value_count = s->page.num_valids + s->page.num_nulls; - int const leaf_level_index = s->col.max_nesting_depth - 1; - - auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); - - if (nesting_info_base[leaf_level_index].null_count > 0) { - // if nz_count is 0, then it's all nulls. 
set all offsets to str_offset - if (s->nz_count == 0) { - for (int i = t; i < value_count; i += decode_block_size) { - offptr[i] = s->page.str_offset; - } - } - // just some nulls, do this serially for now - else if (t == 0) { - if (offptr[value_count - 1] == 0) { - offptr[value_count - 1] = s->page.str_offset + s->page.str_bytes; - } - for (int i = value_count - 2; i > 0; i--) { - if (offptr[i] == 0) { offptr[i] = offptr[i + 1]; } - } - offptr[0] = s->page.str_offset; - } - } - __syncthreads(); -#if 0 - if (t == 0) - printf("%05d: offptr %p/%p %d %d\n", - blockIdx.x, - offptr, - offptr + value_count, - offptr[value_count - 2], - offptr[value_count - 1]); -#endif - } -} - } // anonymous namespace -void ComputePageStringSizes(hostdevice_vector& pages, - hostdevice_vector const& chunks, - size_t min_row, - size_t num_rows, - rmm::cuda_stream_view stream) -{ - dim3 dim_block(preprocess_block_size, 1); - dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - gpuComputePageStringSizes - <<>>(pages.device_ptr(), chunks, min_row, num_rows); -} - /** * @copydoc cudf::io::parquet::gpu::ComputePageSizes */ @@ -3035,24 +1033,6 @@ void __host__ DecodePageData(hostdevice_vector& pages, <<>>(pages.device_ptr(), chunks, min_row, num_rows); } -/** - * @copydoc cudf::io::parquet::gpu::DecodePageData - */ -void __host__ DecodeStringPageData(hostdevice_vector& pages, - hostdevice_vector const& chunks, - size_t num_rows, - size_t min_row, - rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); - - dim3 dim_block(decode_block_size, 1); - dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - - gpuDecodeStringPageData - <<>>(pages.device_ptr(), chunks, min_row, num_rows); -} - } // namespace gpu } // namespace parquet } // namespace io diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh new file mode 100644 index 00000000000..c59fedd6577 --- /dev/null +++ b/cpp/src/io/parquet/page_decode.cuh @@ -0,0 +1,1197 @@ +/* + * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include "parquet_gpu.hpp"
+#include "rle_stream.cuh"
+
+#include
+#include
+
+namespace cudf::io::parquet::gpu {
+namespace {
+
+constexpr int preprocess_block_size = num_rle_stream_decode_threads;  // 512
+constexpr int decode_block_size     = 128;
+constexpr int non_zero_buffer_size  = decode_block_size * 2;
+
+constexpr int rolling_index(int index) { return index & (non_zero_buffer_size - 1); }
+template <int lvl_buf_size>
+constexpr int rolling_lvl_index(int index)
+{
+  return index % lvl_buf_size;
+}
+
+struct page_state_s {
+  const uint8_t* data_start;
+  const uint8_t* data_end;
+  const uint8_t* lvl_end;
+  const uint8_t* dict_base;    // ptr to dictionary page data
+  int32_t dict_size;           // size of dictionary data
+  int32_t first_row;           // First row in page to output
+  int32_t num_rows;            // Rows in page to decode (including rows to be skipped)
+  int32_t first_output_value;  // First value in page to output
+  int32_t num_input_values;    // total # of input/level values in the page
+  int32_t dtype_len;           // Output data type length
+  int32_t dtype_len_in;        // Can be larger than dtype_len if truncating 32-bit into 8-bit
+  int32_t dict_bits;           // # of bits to store dictionary indices
+  uint32_t dict_run;
+  int32_t dict_val;
+  uint32_t initial_rle_run[NUM_LEVEL_TYPES];   // [def,rep]
+  int32_t initial_rle_value[NUM_LEVEL_TYPES];  // [def,rep]
+  int32_t error;
+  PageInfo page;
+  ColumnChunkDesc col;
+
+  // (leaf) value decoding
+  int32_t nz_count;  // number of valid entries in nz_idx (write position in circular buffer)
+  int32_t dict_pos;  // write position of dictionary indices
+  int32_t src_pos;   // input read position of final output value
+  int32_t ts_scale;  // timestamp scale: <0: divide by -ts_scale, >0: multiply by ts_scale
+
+  // repetition/definition level decoding
+  int32_t input_value_count;                  // how many values of the input we've processed
+  int32_t input_row_count;                    // how many rows of the input we've processed
+  int32_t input_leaf_count;                   // how many leaf values of the input we've processed
+  const uint8_t* lvl_start[NUM_LEVEL_TYPES];      // [def,rep]
+  const uint8_t* abs_lvl_start[NUM_LEVEL_TYPES];  // [def,rep]
+  const uint8_t* abs_lvl_end[NUM_LEVEL_TYPES];    // [def,rep]
+  int32_t lvl_count[NUM_LEVEL_TYPES];  // how many of each of the streams we've decoded
+  int32_t row_index_lower_bound;       // lower bound of row indices we should process
+
+  // a shared-memory cache of frequently used data when decoding. The source of this data is
+  // normally stored in global memory which can yield poor performance. So, when possible
+  // we copy that info here prior to decoding
+  PageNestingDecodeInfo nesting_decode_cache[max_cacheable_nesting_decode_info];
+  // points to either nesting_decode_cache above when possible, or to the global source otherwise
+  PageNestingDecodeInfo* nesting_info;
+};
+
+// buffers only used in the decode kernel. separated from page_state_s to keep
+// shared memory usage in other kernels (eg, gpuComputePageSizes) down.
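+//
+// Illustrative note: positions are wrapped into these circular buffers with
+// rolling_index()/rolling_lvl_index(). Since non_zero_buffer_size is a power
+// of two (2 * decode_block_size == 256), the masked form simply wraps:
+//
+//   rolling_index(5)   == 5
+//   rolling_index(256) == 0   // a producer laps the buffer here
+//   rolling_index(300) == 44
+//
+// so a producer must stay within one buffer length of its consumer (the
+// decode kernels bound their target positions accordingly).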
+struct page_state_buffers_s { + uint32_t nz_idx[non_zero_buffer_size]; // circular buffer of non-null value positions + uint32_t dict_idx[non_zero_buffer_size]; // Dictionary index, boolean, or string offset values + uint32_t str_len[non_zero_buffer_size]; // String length for plain encoding of strings +}; + +// Copies null counts back to `nesting_decode` at the end of scope +struct null_count_back_copier { + page_state_s* s; + int t; + __device__ ~null_count_back_copier() + { + if (s->nesting_info != nullptr and s->nesting_info == s->nesting_decode_cache) { + int depth = 0; + while (depth < s->page.num_output_nesting_levels) { + int const thread_depth = depth + t; + if (thread_depth < s->page.num_output_nesting_levels) { + s->page.nesting_decode[thread_depth].null_count = + s->nesting_decode_cache[thread_depth].null_count; + } + depth += blockDim.x; + } + } + } +}; + +/** + * @brief Returns whether or not a page spans either the beginning or the end of the + * specified row bounds + * + * @param s The page to be checked + * @param start_row The starting row index + * @param num_rows The number of rows + * + * @return True if the page spans the beginning or the end of the row bounds + */ +inline __device__ bool is_bounds_page(page_state_s* const s, size_t start_row, size_t num_rows) +{ + size_t const page_begin = s->col.start_row + s->page.chunk_row; + size_t const page_end = page_begin + s->page.num_rows; + size_t const begin = start_row; + size_t const end = start_row + num_rows; + + return ((page_begin < begin && page_end > begin) || (page_begin < end && page_end > end)); +} + +/** + * @brief Returns whether or not a page is completely contained within the specified + * row bounds + * + * @param s The page to be checked + * @param start_row The starting row index + * @param num_rows The number of rows + * + * @return True if the page is completely contained within the row bounds + */ +inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row, size_t num_rows) +{ + size_t const page_begin = s->col.start_row + s->page.chunk_row; + size_t const page_end = page_begin + s->page.num_rows; + size_t const begin = start_row; + size_t const end = start_row + num_rows; + + return page_begin >= begin && page_end <= end; +} + +/** + * @brief Retrieves string information for a string at the specified source position + * + * @param[in] s Page state input + * @param[out] sb Page state buffer output + * @param[in] src_pos Source position + * + * @return A pair containing a pointer to the string and its length + */ +inline __device__ cuda::std::pair gpuGetStringData( + volatile page_state_s* s, volatile page_state_buffers_s* sb, int src_pos) +{ + const char* ptr = nullptr; + size_t len = 0; + + if (s->dict_base) { + // String dictionary + uint32_t dict_pos = + (s->dict_bits > 0) ? 
sb->dict_idx[rolling_index(src_pos)] * sizeof(string_index_pair) : 0; + if (dict_pos < (uint32_t)s->dict_size) { + const auto* src = reinterpret_cast(s->dict_base + dict_pos); + ptr = src->first; + len = src->second; + } + } else { + // Plain encoding + uint32_t dict_pos = sb->dict_idx[rolling_index(src_pos)]; + if (dict_pos <= (uint32_t)s->dict_size) { + ptr = reinterpret_cast(s->data_start + dict_pos); + len = sb->str_len[rolling_index(src_pos)]; + } + } + + return {ptr, len}; +} + +/** + * @brief Performs RLE decoding of dictionary indexes + * + * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output + * @param[in] target_pos Target index position in dict_idx buffer (may exceed this value by up to + * 31) + * @param[in] t Warp1 thread ID (0..31) + * + * @return A pair containing the new output position, and the total length of strings decoded (this + * will only be valid on thread 0 and if sizes_only is true). In the event that this function + * decodes strings beyond target_pos, the total length of strings returned will include these + * additional values. + */ +template +__device__ cuda::std::pair gpuDecodeDictionaryIndices( + volatile page_state_s* s, + [[maybe_unused]] volatile page_state_buffers_s* sb, + int target_pos, + int t) +{ + const uint8_t* end = s->data_end; + int dict_bits = s->dict_bits; + int pos = s->dict_pos; + int str_len = 0; + + while (pos < target_pos) { + int is_literal, batch_len; + if (!t) { + uint32_t run = s->dict_run; + const uint8_t* cur = s->data_start; + if (run <= 1) { + run = (cur < end) ? get_vlq32(cur, end) : 0; + if (!(run & 1)) { + // Repeated value + int bytecnt = (dict_bits + 7) >> 3; + if (cur + bytecnt <= end) { + int32_t run_val = cur[0]; + if (bytecnt > 1) { + run_val |= cur[1] << 8; + if (bytecnt > 2) { + run_val |= cur[2] << 16; + if (bytecnt > 3) { run_val |= cur[3] << 24; } + } + } + s->dict_val = run_val & ((1 << dict_bits) - 1); + } + cur += bytecnt; + } + } + if (run & 1) { + // Literal batch: must output a multiple of 8, except for the last batch + int batch_len_div8; + batch_len = max(min(32, (int)(run >> 1) * 8), 1); + batch_len_div8 = (batch_len + 7) >> 3; + run -= batch_len_div8 * 2; + cur += batch_len_div8 * dict_bits; + } else { + batch_len = max(min(32, (int)(run >> 1)), 1); + run -= batch_len * 2; + } + s->dict_run = run; + s->data_start = cur; + is_literal = run & 1; + __threadfence_block(); + } + __syncwarp(); + is_literal = shuffle(is_literal); + batch_len = shuffle(batch_len); + + // compute dictionary index. + int dict_idx = 0; + if (t < batch_len) { + dict_idx = s->dict_val; + if (is_literal) { + int32_t ofs = (t - ((batch_len + 7) & ~7)) * dict_bits; + const uint8_t* p = s->data_start + (ofs >> 3); + ofs &= 7; + if (p < end) { + uint32_t c = 8 - ofs; + dict_idx = (*p++) >> ofs; + if (c < dict_bits && p < end) { + dict_idx |= (*p++) << c; + c += 8; + if (c < dict_bits && p < end) { + dict_idx |= (*p++) << c; + c += 8; + if (c < dict_bits && p < end) { dict_idx |= (*p++) << c; } + } + } + dict_idx &= (1 << dict_bits) - 1; + } + } + + // if we're not computing sizes, store off the dictionary index + if constexpr (!sizes_only) { sb->dict_idx[rolling_index(pos + t)] = dict_idx; } + } + + // if we're computing sizes, add the length(s) + if constexpr (sizes_only) { + int const len = [&]() { + if (t >= batch_len || (pos + t >= target_pos)) { return 0; } + uint32_t const dict_pos = (s->dict_bits > 0) ? 
dict_idx * sizeof(string_index_pair) : 0;
+      if (dict_pos < (uint32_t)s->dict_size) {
+        const auto* src = reinterpret_cast(s->dict_base + dict_pos);
+        return src->second;
+      }
+      return 0;
+    }();
+
+    using WarpReduce = cub::WarpReduce;
+    __shared__ typename WarpReduce::TempStorage temp_storage;
+    // note: str_len will only be valid on thread 0.
+    str_len += WarpReduce(temp_storage).Sum(len);
+  }
+
+    pos += batch_len;
+  }
+  return {pos, str_len};
+}
+
+/**
+ * @brief Parses the length and position of strings and returns total length of all strings
+ * processed
+ *
+ * @param[in,out] s Page state input/output
+ * @param[out] sb Page state buffer output
+ * @param[in] target_pos Target output position
+ * @param[in] t Thread ID
+ *
+ * @return Total length of strings processed
+ */
+template
+__device__ size_type gpuInitStringDescriptors(volatile page_state_s* s,
+                                              [[maybe_unused]] volatile page_state_buffers_s* sb,
+                                              int target_pos,
+                                              int t)
+{
+  int pos       = s->dict_pos;
+  int total_len = 0;
+
+  // This step is purely serial
+  if (!t) {
+    const uint8_t* cur = s->data_start;
+    int dict_size      = s->dict_size;
+    int k              = s->dict_val;
+
+    while (pos < target_pos) {
+      int len;
+      if (k + 4 <= dict_size) {
+        len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24);
+        k += 4;
+        if (k + len > dict_size) { len = 0; }
+      } else {
+        len = 0;
+      }
+      if constexpr (!sizes_only) {
+        sb->dict_idx[rolling_index(pos)] = k;
+        sb->str_len[rolling_index(pos)]  = len;
+      }
+      k += len;
+      total_len += len;
+      pos++;
+    }
+    s->dict_val = k;
+    __threadfence_block();
+  }
+
+  return total_len;
+}
+
+/**
+ * @brief Decode values out of a definition or repetition stream
+ *
+ * @param[out] output The buffer which receives the decoded level values
+ * @param[in,out] s Page state input/output
+ * @param[in] target_count Target count of stream values on output
+ * @param[in] t Warp0 thread ID (0..31)
+ * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION
+ */
+__device__ void gpuDecodeStream(
+  uint32_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl)
+{
+  const uint8_t* cur_def    = s->lvl_start[lvl];
+  const uint8_t* end        = s->lvl_end;
+  uint32_t level_run        = s->initial_rle_run[lvl];
+  int32_t level_val         = s->initial_rle_value[lvl];
+  int level_bits            = s->col.level_bits[lvl];
+  int32_t num_input_values  = s->num_input_values;
+  int32_t value_count       = s->lvl_count[lvl];
+  int32_t batch_coded_count = 0;
+
+  while (value_count < target_count && value_count < num_input_values) {
+    int batch_len;
+    if (level_run <= 1) {
+      // Get a new run symbol from the byte stream
+      int sym_len = 0;
+      if (!t) {
+        const uint8_t* cur = cur_def;
+        if (cur < end) { level_run = get_vlq32(cur, end); }
+        if (!(level_run & 1)) {
+          if (cur < end) level_val = cur[0];
+          cur++;
+          if (level_bits > 8) {
+            if (cur < end) level_val |= cur[0] << 8;
+            cur++;
+          }
+        }
+        if (cur > end || level_run <= 1) { s->error = 0x10; }
+        sym_len = (int32_t)(cur - cur_def);
+        __threadfence_block();
+      }
+      sym_len   = shuffle(sym_len);
+      level_val = shuffle(level_val);
+      level_run = shuffle(level_run);
+      cur_def += sym_len;
+    }
+    if (s->error) { break; }
+
+    batch_len = min(num_input_values - value_count, 32);
+    if (level_run & 1) {
+      // Literal run
+      int batch_len8;
+      batch_len  = min(batch_len, (level_run >> 1) * 8);
+      batch_len8 = (batch_len + 7) >> 3;
+      if (t < batch_len) {
+        int bitpos         = t * level_bits;
+        const uint8_t* cur = cur_def + (bitpos >> 3);
+        bitpos &= 7;
+        if (cur < end) level_val = cur[0];
+        cur++;
+        if (level_bits > 8 - bitpos && cur < end) {
+          level_val |= cur[0] << 8;
+          cur++;
+          if
(level_bits > 16 - bitpos && cur < end) level_val |= cur[0] << 16;
+        }
+        level_val = (level_val >> bitpos) & ((1 << level_bits) - 1);
+      }
+      level_run -= batch_len8 * 2;
+      cur_def += batch_len8 * level_bits;
+    } else {
+      // Repeated value
+      batch_len = min(batch_len, level_run >> 1);
+      level_run -= batch_len * 2;
+    }
+    if (t < batch_len) {
+      int idx                    = value_count + t;
+      output[rolling_index(idx)] = level_val;
+    }
+    batch_coded_count += batch_len;
+    value_count += batch_len;
+  }
+
+  // update the stream info
+  if (!t) {
+    s->lvl_start[lvl]         = cur_def;
+    s->initial_rle_run[lvl]   = level_run;
+    s->initial_rle_value[lvl] = level_val;
+    s->lvl_count[lvl]         = value_count;
+  }
+}
+
+/**
+ * @brief Store a validity mask containing value_count bits into the output validity buffer of the
+ * page.
+ *
+ * @param[in,out] nesting_info The page/nesting information to store the mask in. The validity map
+ * offset is also updated
+ * @param[in] valid_mask The validity mask to be stored
+ * @param[in] value_count # of bits in the validity mask
+ */
+__device__ void store_validity(PageNestingDecodeInfo* nesting_info,
+                               uint32_t valid_mask,
+                               int32_t value_count)
+{
+  int word_offset = nesting_info->valid_map_offset / 32;
+  int bit_offset  = nesting_info->valid_map_offset % 32;
+  // if we fit entirely in the output word
+  if (bit_offset + value_count <= 32) {
+    auto relevant_mask = static_cast((static_cast(1) << value_count) - 1);
+
+    if (relevant_mask == ~0) {
+      nesting_info->valid_map[word_offset] = valid_mask;
+    } else {
+      atomicAnd(nesting_info->valid_map + word_offset, ~(relevant_mask << bit_offset));
+      atomicOr(nesting_info->valid_map + word_offset, (valid_mask & relevant_mask) << bit_offset);
+    }
+  }
+  // we're going to spill over into the next word.
+  // note : writing both values here is the lazy/slow way. we could be writing just
+  // the first word and rolling the remaining bits over into the next call.
+  // however, some basic performance tests show almost no difference between these two
+  // methods. More detailed performance testing might be worthwhile here.
+  else {
+    uint32_t bits_left = 32 - bit_offset;
+
+    // first word. strip bits_left bits off the beginning and store that
+    uint32_t relevant_mask = ((1 << bits_left) - 1);
+    uint32_t mask_word0    = valid_mask & relevant_mask;
+    atomicAnd(nesting_info->valid_map + word_offset, ~(relevant_mask << bit_offset));
+    atomicOr(nesting_info->valid_map + word_offset, mask_word0 << bit_offset);
+
+    // second word. strip the remainder of the bits off the end and store that
+    relevant_mask       = ((1 << (value_count - bits_left)) - 1);
+    uint32_t mask_word1 = valid_mask & (relevant_mask << bits_left);
+    atomicAnd(nesting_info->valid_map + word_offset + 1, ~(relevant_mask));
+    atomicOr(nesting_info->valid_map + word_offset + 1, mask_word1 >> bits_left);
+  }
+
+  nesting_info->valid_map_offset += value_count;
+}
+
+/**
+ * @brief Compute the nesting bounds within the hierarchy to add values to, and the definition level
+ * D to which we should consider them null or not.
+ *
+ * @param[out] start_depth The start nesting depth
+ * @param[out] end_depth The end nesting depth (inclusive)
+ * @param[out] d The definition level up to which added values are not-null.
if t is out of bounds, + * d will be -1 + * @param[in] s Local page information + * @param[in] rep Repetition level buffer + * @param[in] def Definition level buffer + * @param[in] input_value_count The current count of input level values we have processed + * @param[in] target_input_value_count The desired # of input level values we want to process + * @param[in] t Thread index + */ +template +inline __device__ void get_nesting_bounds(int& start_depth, + int& end_depth, + int& d, + page_state_s* s, + uint32_t const* const rep, + uint32_t const* const def, + int input_value_count, + int32_t target_input_value_count, + int t) +{ + start_depth = -1; + end_depth = -1; + d = -1; + if (input_value_count + t < target_input_value_count) { + int index = rolling_lvl_index(input_value_count + t); + d = def[index]; + // if we have repetition (there are list columns involved) we have to + // bound what nesting levels we apply values to + if (s->col.max_level[level_type::REPETITION] > 0) { + int r = rep[index]; + start_depth = s->nesting_info[r].start_depth; + end_depth = s->nesting_info[d].end_depth; + } + // for columns without repetition (even ones involving structs) we always + // traverse the entire hierarchy. + else { + start_depth = 0; + end_depth = s->col.max_nesting_depth - 1; + } + } +} + +/** + * @brief Process a batch of incoming repetition/definition level values and generate + * validity, nested column offsets (where appropriate) and decoding indices. + * + * @param[in] target_input_value_count The # of repetition/definition levels to process up to + * @param[in] s Local page information + * @param[out] sb Page state buffer output + * @param[in] rep Repetition level buffer + * @param[in] def Definition level buffer + * @param[in] t Thread index + */ +template +__device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, + page_state_s* s, + page_state_buffers_s* sb, + uint32_t const* const rep, + uint32_t const* const def, + int t) +{ + // max nesting depth of the column + int const max_depth = s->col.max_nesting_depth; + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + // how many (input) values we've processed in the page so far + int input_value_count = s->input_value_count; + // how many rows we've processed in the page so far + int input_row_count = s->input_row_count; + + PageNestingDecodeInfo* nesting_info_base = s->nesting_info; + + // process until we've reached the target + while (input_value_count < target_input_value_count) { + // determine the nesting bounds for this thread (the range of nesting depths we + // will generate new value indices and validity bits for) + int start_depth, end_depth, d; + get_nesting_bounds( + start_depth, end_depth, d, s, rep, def, input_value_count, target_input_value_count, t); + + // 4 interesting things to track: + // thread_value_count : # of output values from the view of this thread + // warp_value_count : # of output values for the whole warp + // + // thread_valid_count : # of valid values from the view of this thread + // warp_valid_count : # of valid values for the whole warp + uint32_t thread_value_count, warp_value_count; + uint32_t thread_valid_count, warp_valid_count; + + // track (page-relative) row index for the thread so we can compare against input bounds + // keep track of overall # of rows we've read. + int const is_new_row = start_depth == 0 ? 
1 : 0; + uint32_t const warp_row_count_mask = ballot(is_new_row); + int32_t const thread_row_index = + input_row_count + ((__popc(warp_row_count_mask & ((1 << t) - 1)) + is_new_row) - 1); + input_row_count += __popc(warp_row_count_mask); + // is this thread within read row bounds? + int const in_row_bounds = thread_row_index >= s->row_index_lower_bound && + thread_row_index < (s->first_row + s->num_rows) + ? 1 + : 0; + + // compute warp and thread value counts + uint32_t const warp_count_mask = + ballot((0 >= start_depth && 0 <= end_depth) && in_row_bounds ? 1 : 0); + + warp_value_count = __popc(warp_count_mask); + // Note : ((1 << t) - 1) implies "for all threads before me" + thread_value_count = __popc(warp_count_mask & ((1 << t) - 1)); + + // walk from 0 to max_depth + uint32_t next_thread_value_count, next_warp_value_count; + for (int s_idx = 0; s_idx < max_depth; s_idx++) { + PageNestingDecodeInfo* nesting_info = &nesting_info_base[s_idx]; + + // if we are within the range of nesting levels we should be adding value indices for + int const in_nesting_bounds = + ((s_idx >= start_depth && s_idx <= end_depth) && in_row_bounds) ? 1 : 0; + + // everything up to the max_def_level is a non-null value + uint32_t const is_valid = d >= nesting_info->max_def_level && in_nesting_bounds ? 1 : 0; + + // compute warp and thread valid counts + uint32_t const warp_valid_mask = + // for flat schemas, a simple ballot_sync gives us the correct count and bit positions + // because every value in the input matches to a value in the output + !has_repetition + ? ballot(is_valid) + : + // for nested schemas, it's more complicated. This warp will visit 32 incoming values, + // however not all of them will necessarily represent a value at this nesting level. so + // the validity bit for thread t might actually represent output value t-6. the correct + // position for thread t's bit is cur_value_count. for cuda 11 we could use + // __reduce_or_sync(), but until then we have to do a warp reduce. + WarpReduceOr32(is_valid << thread_value_count); + + thread_valid_count = __popc(warp_valid_mask & ((1 << thread_value_count) - 1)); + warp_valid_count = __popc(warp_valid_mask); + + // if this is the value column emit an index for value decoding + if (is_valid && s_idx == max_depth - 1) { + int const src_pos = nesting_info->valid_count + thread_valid_count; + int const dst_pos = nesting_info->value_count + thread_value_count; + // nz_idx is a mapping of src buffer indices to destination buffer indices + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + + // compute warp and thread value counts for the -next- nesting level. we need to + // do this for nested schemas so that we can emit an offset for the -current- nesting + // level. more concretely : the offset for the current nesting level == current length of the + // next nesting level + if (s_idx < max_depth - 1) { + uint32_t const next_warp_count_mask = + ballot((s_idx + 1 >= start_depth && s_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0); + next_warp_value_count = __popc(next_warp_count_mask); + next_thread_value_count = __popc(next_warp_count_mask & ((1 << t) - 1)); + + // if we're -not- at a leaf column and we're within nesting/row bounds + // and we have a valid data_out pointer, it implies this is a list column, so + // emit an offset. 
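+        // worked example (editor's illustration, ignoring page_start_value for
+        // a single-page column): for a list<int32_t> column holding the rows
+        // [[7, 8], [9]], depth 0 is the offsets column and depth 1 the leaf
+        // values. the offset emitted for each depth-0 entry is the running
+        // value count of depth 1: row 0 emits 0 (no leaves seen yet) and row 1
+        // emits 2, which together with the final leaf count (3) yields the
+        // offsets column {0, 2, 3}.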
+        if (in_nesting_bounds && nesting_info->data_out != nullptr) {
+          int const idx             = nesting_info->value_count + thread_value_count;
+          cudf::size_type const ofs = nesting_info_base[s_idx + 1].value_count +
+                                      next_thread_value_count +
+                                      nesting_info_base[s_idx + 1].page_start_value;
+          (reinterpret_cast(nesting_info->data_out))[idx] = ofs;
+        }
+      }
+
+      // nested schemas always read and write to the same bounds (that is, read and write positions
+      // are already pre-bounded by first_row/num_rows). flat schemas will start reading at the
+      // first value, even if that is before first_row, because we cannot trivially jump to
+      // the correct position to start reading. since we are about to write the validity vector here
+      // we need to adjust our computed mask to take into account the write row bounds.
+      int const in_write_row_bounds =
+        !has_repetition
+          ? thread_row_index >= s->first_row && thread_row_index < (s->first_row + s->num_rows)
+          : in_row_bounds;
+      int const first_thread_in_write_range =
+        !has_repetition ? __ffs(ballot(in_write_row_bounds)) - 1 : 0;
+
+      // # of bits of the validity mask to write out
+      int const warp_valid_mask_bit_count =
+        first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range;
+
+      // increment count of valid values, count of total values, and update validity mask
+      if (!t) {
+        if (nesting_info->valid_map != nullptr && warp_valid_mask_bit_count > 0) {
+          uint32_t const warp_output_valid_mask = warp_valid_mask >> first_thread_in_write_range;
+          store_validity(nesting_info, warp_output_valid_mask, warp_valid_mask_bit_count);
+
+          nesting_info->null_count += warp_valid_mask_bit_count - __popc(warp_output_valid_mask);
+        }
+        nesting_info->valid_count += warp_valid_count;
+        nesting_info->value_count += warp_value_count;
+      }
+
+      // propagate value counts for the next level
+      warp_value_count   = next_warp_value_count;
+      thread_value_count = next_thread_value_count;
+    }
+
+    input_value_count += min(32, (target_input_value_count - input_value_count));
+    __syncwarp();
+  }
+
+  // update
+  if (!t) {
+    // update valid value count for decoding and total # of values we've processed
+    s->nz_count          = nesting_info_base[max_depth - 1].valid_count;
+    s->input_value_count = input_value_count;
+    s->input_row_count   = input_row_count;
+  }
+}
+
+/**
+ * @brief Process repetition and definition levels up to the target count of leaf values.
+ *
+ * In order to decode actual leaf values from the input stream, we need to generate the
+ * list of non-null value positions (page_state_s::nz_idx). We do this by processing
+ * the repetition and definition level streams. This process also generates validity information,
+ * and offset column values in the case of nested schemas. Because of the way the streams
+ * are encoded, this function may generate slightly more than target_leaf_count.
+ *
+ * Only runs on 1 warp.
+ *
+ * @param[in] s The local page state
+ * @param[out] sb Page state buffer output
+ * @param[in] target_leaf_count Target count of non-null leaf values to generate indices for
+ * @param[in] rep Repetition level buffer
+ * @param[in] def Definition level buffer
+ * @param[in] t Thread index
+ */
+template
+__device__ void gpuDecodeLevels(page_state_s* s,
+                                page_state_buffers_s* sb,
+                                int32_t target_leaf_count,
+                                uint32_t* const rep,
+                                uint32_t* const def,
+                                int t)
+{
+  bool has_repetition = s->col.max_level[level_type::REPETITION] > 0;
+
+  constexpr int batch_size = 32;
+  int cur_leaf_count       = target_leaf_count;
+  while (!s->error && s->nz_count < target_leaf_count &&
+         s->input_value_count < s->num_input_values) {
+    if (has_repetition) { gpuDecodeStream(rep, s, cur_leaf_count, t, level_type::REPETITION); }
+    gpuDecodeStream(def, s, cur_leaf_count, t, level_type::DEFINITION);
+    __syncwarp();
+
+    // because the rep and def streams are encoded separately, we cannot request an exact
+    // # of values to be decoded at once. we can only process the lowest # of decoded rep/def
+    // levels we get.
+    int actual_leaf_count = has_repetition ? min(s->lvl_count[level_type::REPETITION],
+                                                 s->lvl_count[level_type::DEFINITION])
+                                           : s->lvl_count[level_type::DEFINITION];
+
+    // process what we got back
+    gpuUpdateValidityOffsetsAndRowIndices(actual_leaf_count, s, sb, rep, def, t);
+    cur_leaf_count = actual_leaf_count + batch_size;
+    __syncwarp();
+  }
+}
+
+/**
+ * @brief Parse the beginning of the level section (definition or repetition),
+ * initializes the initial RLE run & value, and returns the section length
+ *
+ * @param[in,out] s The page state
+ * @param[in] cur The current data position
+ * @param[in] end The end of the data
+ * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION
+ * @param[in] is_decode_step If we are setting up for the decode step (instead of the preprocess
+ * step)
+ * @param[in] decoders rle_stream decoders which will be used for decoding levels
+ *
+ * @return The length of the section
+ */
+__device__ uint32_t InitLevelSection(page_state_s* s,
+                                     const uint8_t* cur,
+                                     const uint8_t* end,
+                                     level_type lvl,
+                                     bool is_decode_step,
+                                     rle_stream* decoders)
+{
+  int32_t len;
+  int level_bits    = s->col.level_bits[lvl];
+  Encoding encoding = lvl == level_type::DEFINITION ? s->page.definition_level_encoding
+                                                    : s->page.repetition_level_encoding;
+
+  auto start = cur;
+  if (level_bits == 0) {
+    len                       = 0;
+    s->initial_rle_run[lvl]   = s->page.num_input_values * 2;  // repeated value
+    s->initial_rle_value[lvl] = 0;
+    s->lvl_start[lvl]         = cur;
+    s->abs_lvl_start[lvl]     = cur;
+  } else if (encoding == Encoding::RLE) {
+    // V2 only uses RLE encoding, so only perform check here
+    if (s->page.def_lvl_bytes || s->page.rep_lvl_bytes) {
+      len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes;
+    } else if (cur + 4 < end) {
+      len = 4 + (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24);
+      cur += 4;
+    } else {
+      len      = 0;
+      s->error = 2;
+    }
+    s->abs_lvl_start[lvl] = cur;
+    if (!s->error) {
+      uint32_t run            = get_vlq32(cur, end);
+      s->initial_rle_run[lvl] = run;
+      if (!(run & 1)) {
+        int v = (cur < end) ? cur[0] : 0;
+        cur++;
+        if (level_bits > 8) {
+          v |= ((cur < end) ?
cur[0] : 0) << 8; + cur++; + } + s->initial_rle_value[lvl] = v; + } + s->lvl_start[lvl] = cur; + } + + if (cur > end) { s->error = 2; } + } else if (encoding == Encoding::BIT_PACKED) { + len = (s->page.num_input_values * level_bits + 7) >> 3; + s->initial_rle_run[lvl] = ((s->page.num_input_values + 7) >> 3) * 2 + 1; // literal run + s->initial_rle_value[lvl] = 0; + s->lvl_start[lvl] = cur; + s->abs_lvl_start[lvl] = cur; + } else { + s->error = 3; + len = 0; + } + + s->abs_lvl_end[lvl] = start + len; + + return static_cast(len); +} + +/** + * @brief Sets up block-local page state information from the global pages. + * + * @param[in, out] s The local page state to be filled in + * @param[in] p The global page to be copied from + * @param[in] chunks The global list of chunks + * @param[in] min_row Crop all rows below min_row + * @param[in] num_rows Maximum number of rows to read + * @param[in] is_decode_step If we are setting up for the decode step (instead of the preprocess) + * @param[in] decoders rle_stream decoders which will be used for decoding levels. Optional. + * Currently only used by gpuComputePageSizes step) + */ +__device__ bool setupLocalPageInfo(page_state_s* const s, + PageInfo const* p, + device_span chunks, + size_t min_row, + size_t num_rows, + bool is_decode_step, + rle_stream* decoders = nullptr) +{ + int t = threadIdx.x; + int chunk_idx; + + // Fetch page info + if (!t) { + s->page = *p; + s->nesting_info = nullptr; + } + __syncthreads(); + + if (s->page.flags & PAGEINFO_FLAGS_DICTIONARY) { return false; } + // Fetch column chunk info + chunk_idx = s->page.chunk_idx; + if (!t) { s->col = chunks[chunk_idx]; } + + // if we can use the nesting decode cache, set it up now + auto const can_use_decode_cache = s->page.nesting_info_size <= max_cacheable_nesting_decode_info; + if (can_use_decode_cache) { + int depth = 0; + while (depth < s->page.nesting_info_size) { + int const thread_depth = depth + t; + if (thread_depth < s->page.nesting_info_size) { + // these values need to be copied over from global + s->nesting_decode_cache[thread_depth].max_def_level = + s->page.nesting_decode[thread_depth].max_def_level; + s->nesting_decode_cache[thread_depth].page_start_value = + s->page.nesting_decode[thread_depth].page_start_value; + s->nesting_decode_cache[thread_depth].start_depth = + s->page.nesting_decode[thread_depth].start_depth; + s->nesting_decode_cache[thread_depth].end_depth = + s->page.nesting_decode[thread_depth].end_depth; + } + depth += blockDim.x; + } + } + if (!t) { + s->nesting_info = can_use_decode_cache ? s->nesting_decode_cache : s->page.nesting_decode; + } + + __syncthreads(); + + // zero counts + int depth = 0; + while (depth < s->page.num_output_nesting_levels) { + int const thread_depth = depth + t; + if (thread_depth < s->page.num_output_nesting_levels) { + s->nesting_info[thread_depth].valid_count = 0; + s->nesting_info[thread_depth].value_count = 0; + s->nesting_info[thread_depth].null_count = 0; + } + depth += blockDim.x; + } + __syncthreads(); + + if (!t) { + s->error = 0; + + // our starting row (absolute index) is + // col.start_row == absolute row index + // page.chunk-row == relative row index within the chunk + size_t page_start_row = s->col.start_row + s->page.chunk_row; + + // IMPORTANT : nested schemas can have 0 rows in a page but still have + // values. The case is: + // - On page N-1, the last row starts, with 2/6 values encoded + // - On page N, the remaining 4/6 values are encoded, but there are no new rows. 
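+    // concretely (editor's note): such a page reports num_rows == 0 while
+    // num_input_values is nonzero, which is why the check below gates on
+    // num_input_values alone; the old num_rows condition is kept commented out
+    // directly underneath.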
+ // if (s->page.num_input_values > 0 && s->page.num_rows > 0) { + if (s->page.num_input_values > 0) { + uint8_t* cur = s->page.page_data; + uint8_t* end = cur + s->page.uncompressed_page_size; + + uint32_t dtype_len_out = s->col.data_type >> 3; + s->ts_scale = 0; + // Validate data type + auto const data_type = s->col.data_type & 7; + switch (data_type) { + case BOOLEAN: + s->dtype_len = 1; // Boolean are stored as 1 byte on the output + break; + case INT32: [[fallthrough]]; + case FLOAT: s->dtype_len = 4; break; + case INT64: + if (s->col.ts_clock_rate) { + int32_t units = 0; + // Duration types are not included because no scaling is done when reading + if (s->col.converted_type == TIMESTAMP_MILLIS) { + units = cudf::timestamp_ms::period::den; + } else if (s->col.converted_type == TIMESTAMP_MICROS) { + units = cudf::timestamp_us::period::den; + } else if (s->col.logical_type.TIMESTAMP.unit.isset.NANOS) { + units = cudf::timestamp_ns::period::den; + } + if (units and units != s->col.ts_clock_rate) { + s->ts_scale = (s->col.ts_clock_rate < units) ? -(units / s->col.ts_clock_rate) + : (s->col.ts_clock_rate / units); + } + } + [[fallthrough]]; + case DOUBLE: s->dtype_len = 8; break; + case INT96: s->dtype_len = 12; break; + case BYTE_ARRAY: + if (s->col.converted_type == DECIMAL) { + auto const decimal_precision = s->col.decimal_precision; + s->dtype_len = [decimal_precision]() { + if (decimal_precision <= MAX_DECIMAL32_PRECISION) { + return sizeof(int32_t); + } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { + return sizeof(int64_t); + } else { + return sizeof(__int128_t); + } + }(); + } else { + s->dtype_len = sizeof(string_index_pair); + } + break; + default: // FIXED_LEN_BYTE_ARRAY: + s->dtype_len = dtype_len_out; + s->error |= (s->dtype_len <= 0); + break; + } + // Special check for downconversions + s->dtype_len_in = s->dtype_len; + if (s->col.converted_type == DECIMAL && data_type == FIXED_LEN_BYTE_ARRAY) { + s->dtype_len = [dtype_len = s->dtype_len]() { + if (dtype_len <= sizeof(int32_t)) { + return sizeof(int32_t); + } else if (dtype_len <= sizeof(int64_t)) { + return sizeof(int64_t); + } else { + return sizeof(__int128_t); + } + }(); + } else if (data_type == INT32) { + if (dtype_len_out == 1) { + // INT8 output + s->dtype_len = 1; + } else if (dtype_len_out == 2) { + // INT16 output + s->dtype_len = 2; + } else if (s->col.converted_type == TIME_MILLIS) { + // INT64 output + s->dtype_len = 8; + } + } else if (data_type == BYTE_ARRAY && dtype_len_out == 4) { + s->dtype_len = 4; // HASH32 output + } else if (data_type == INT96) { + s->dtype_len = 8; // Convert to 64-bit timestamp + } + + // NOTE: s->page.num_rows, s->col.chunk_row, s->first_row and s->num_rows will be + // invalid/bogus during first pass of the preprocess step for nested types. this is ok + // because we ignore these values in that stage. + { + auto const max_row = min_row + num_rows; + + // if we are totally outside the range of the input, do nothing + if ((page_start_row > max_row) || (page_start_row + s->page.num_rows < min_row)) { + s->first_row = 0; + s->num_rows = 0; + } + // otherwise + else { + s->first_row = page_start_row >= min_row ? 0 : min_row - page_start_row; + auto const max_page_rows = s->page.num_rows - s->first_row; + s->num_rows = (page_start_row + s->first_row) + max_page_rows <= max_row + ? 
max_page_rows
+                        : max_row - (page_start_row + s->first_row);
+      }
+    }
+
+    // during the decoding step we need to offset the global output buffers
+    // for each level of nesting so that we write to the section this page
+    // is responsible for.
+    // - for flat schemas, we can do this directly by using row counts
+    // - for nested schemas, these offsets are computed during the preprocess step
+    //
+    // NOTE: in a chunked read situation, s->col.column_data_base and s->col.valid_map_base
+    // will be aliased to memory that has been freed when we get here in the non-decode step, so
+    // we cannot check against nullptr. we'll just check a flag directly.
+    if (is_decode_step) {
+      int max_depth = s->col.max_nesting_depth;
+      for (int idx = 0; idx < max_depth; idx++) {
+        PageNestingDecodeInfo* nesting_info = &s->nesting_info[idx];
+
+        size_t output_offset;
+        // schemas without lists
+        if (s->col.max_level[level_type::REPETITION] == 0) {
+          output_offset = page_start_row >= min_row ? page_start_row - min_row : 0;
+        }
+        // for schemas with lists, we've already got the exact value precomputed
+        else {
+          output_offset = nesting_info->page_start_value;
+        }
+
+        if (s->col.column_data_base != nullptr) {
+          nesting_info->data_out   = static_cast(s->col.column_data_base[idx]);
+          nesting_info->string_out = static_cast(s->col.column_string_base[idx]);
+
+          if (nesting_info->data_out != nullptr) {
+            // anything below max depth with a valid data pointer must be a list, so the
+            // element size is the size of the offset type.
+            uint32_t len = idx < max_depth - 1 ? sizeof(cudf::size_type) : s->dtype_len;
+            // if this is a string column, then dtype_len is a lie. data will be offsets rather
+            // than (ptr,len) tuples.
+            if (data_type == BYTE_ARRAY && s->dtype_len != 4) { len = sizeof(cudf::size_type); }
+            nesting_info->data_out += (output_offset * len);
+          }
+          if (nesting_info->string_out != nullptr) {
+            nesting_info->string_out += s->page.str_offset;
+          }
+          nesting_info->valid_map = s->col.valid_map_base[idx];
+          if (nesting_info->valid_map != nullptr) {
+            nesting_info->valid_map += output_offset >> 5;
+            nesting_info->valid_map_offset = (int32_t)(output_offset & 0x1f);
+          }
+        }
+      }
+    }
+    s->first_output_value = 0;
+
+    // Find the compressed size of repetition levels
+    cur += InitLevelSection(s, cur, end, level_type::REPETITION, is_decode_step, decoders);
+    // Find the compressed size of definition levels
+    cur += InitLevelSection(s, cur, end, level_type::DEFINITION, is_decode_step, decoders);
+
+    s->dict_bits = 0;
+    s->dict_base = nullptr;
+    s->dict_size = 0;
+    // NOTE: if additional encodings are supported in the future, modifications must
+    // be made to is_supported_encoding() in reader_impl_preprocess.cu
+    switch (s->page.encoding) {
+      case Encoding::PLAIN_DICTIONARY:
+      case Encoding::RLE_DICTIONARY:
+        // RLE-packed dictionary indices, first byte indicates index length in bits
+        if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) {
+          // String dictionary: use index
+          s->dict_base = reinterpret_cast(s->col.str_dict_index);
+          s->dict_size = s->col.page_info[0].num_input_values * sizeof(string_index_pair);
+        } else {
+          s->dict_base =
+            s->col.page_info[0].page_data;  // dictionary is always stored in the first page
+          s->dict_size = s->col.page_info[0].uncompressed_page_size;
+        }
+        s->dict_run  = 0;
+        s->dict_val  = 0;
+        s->dict_bits = (cur < end) ?
*cur++ : 0; + if (s->dict_bits > 32 || !s->dict_base) { s->error = (10 << 8) | s->dict_bits; } + break; + case Encoding::PLAIN: + s->dict_size = static_cast(end - cur); + s->dict_val = 0; + if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; } + break; + case Encoding::RLE: s->dict_run = 0; break; + default: + s->error = 1; // Unsupported encoding + break; + } + if (cur > end) { s->error = 1; } + s->lvl_end = cur; + s->data_start = cur; + s->data_end = end; + } else { + s->error = 1; + } + + s->lvl_count[level_type::REPETITION] = 0; + s->lvl_count[level_type::DEFINITION] = 0; + s->nz_count = 0; + s->num_input_values = s->page.num_input_values; + s->dict_pos = 0; + s->src_pos = 0; + + // for flat hierarchies, we can't know how many leaf values to skip unless we do a full + // preprocess of the definition levels (since nulls will have no actual decodable value, there + // is no direct correlation between # of rows and # of decodable values). so we will start + // processing at the beginning of the value stream and disregard any indices that start + // before the first row. + if (s->col.max_level[level_type::REPETITION] == 0) { + s->page.skipped_values = 0; + s->page.skipped_leaf_values = 0; + s->input_value_count = 0; + s->input_row_count = 0; + s->input_leaf_count = 0; + + s->row_index_lower_bound = -1; + } + // for nested hierarchies, we have run a preprocess that lets us skip directly to the values + // we need to start decoding at + else { + // input_row_count translates to "how many rows we have processed so far", so since we are + // skipping directly to where we want to start decoding, set it to first_row + s->input_row_count = s->first_row; + + // return the lower bound to compare (page-relative) thread row index against. Explanation: + // In the case of nested schemas, rows can span page boundaries. That is to say, + // we can encounter the first value for row X on page M, but the last value for page M + // might not be the last value for row X. page M+1 (or further) may contain the last value. + // + // This means that the first values we encounter for a given page (M+1) may not belong to the + // row indicated by chunk_row, but to the row before it that spanned page boundaries. If that + // previous row is within the overall row bounds, include the values by allowing relative row + // index -1 + int const max_row = (min_row + num_rows) - 1; + if (min_row < page_start_row && max_row >= page_start_row - 1) { + s->row_index_lower_bound = -1; + } else { + s->row_index_lower_bound = s->first_row; + } + + // if we're in the decoding step, jump directly to the first + // value we care about + if (is_decode_step) { + s->input_value_count = s->page.skipped_values > -1 ? s->page.skipped_values : 0; + } else { + s->input_value_count = 0; + s->input_leaf_count = 0; + s->page.skipped_values = + -1; // magic number to indicate it hasn't been set for use inside UpdatePageSizes + s->page.skipped_leaf_values = 0; + } + } + + __threadfence_block(); + } + __syncthreads(); + + return true; +} + +} // namespace +} // namespace cudf::io::parquet::gpu diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu new file mode 100644 index 00000000000..091a0673f76 --- /dev/null +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -0,0 +1,863 @@ +/* + * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "page_decode.cuh" + +#include +#include + +namespace cudf { +namespace io { +namespace parquet { +namespace gpu { + +namespace { + +// stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. +// copies from src to dst in 16B chunks per thread. +__device__ void wideStrcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) +{ + using cudf::detail::warp_size; + using cudf::strings::detail::load_uint4; + + constexpr size_t out_datatype_size = sizeof(uint4); + constexpr size_t in_datatype_size = sizeof(uint); + + auto const alignment_offset = reinterpret_cast(dst) % out_datatype_size; + uint4* out_chars_aligned = reinterpret_cast(dst - alignment_offset); + auto const in_start = src; + + // Both `out_start_aligned` and `out_end_aligned` are indices into `dst`. + // `out_start_aligned` is the first 16B aligned memory location after `dst + 4`. + // `out_end_aligned` is the last 16B aligned memory location before `len - 4`. Characters + // between `[out_start_aligned, out_end_aligned)` will be copied using uint4. + // `dst + 4` and `len - 4` are used instead of `dst` and `len` to avoid + // `load_uint4` reading beyond string boundaries. + // use signed int since out_end_aligned can be negative. + int64_t out_start_aligned = (in_datatype_size + alignment_offset + out_datatype_size - 1) / + out_datatype_size * out_datatype_size - + alignment_offset; + int64_t out_end_aligned = + (len - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size - + alignment_offset; + + for (int64_t ichar = out_start_aligned + lane_id * out_datatype_size; ichar < out_end_aligned; + ichar += warp_size * out_datatype_size) { + *(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) = + load_uint4((const char*)in_start + ichar); + } + + // Tail logic: copy characters of the current string outside + // `[out_start_aligned, out_end_aligned)`. + if (out_end_aligned <= out_start_aligned) { + // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the + // entire string. + for (int64_t ichar = lane_id; ichar < len; ichar += warp_size) { + dst[ichar] = in_start[ichar]; + } + } else { + // Copy characters in range `[0, out_start_aligned)`. + if (lane_id < out_start_aligned) { dst[lane_id] = in_start[lane_id]; } + // Copy characters in range `[out_end_aligned, len)`. 
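+    // by construction out_end_aligned is the last 16B-aligned position before
+    // len - 4, so this tail is under 20 bytes (editor's note); one byte per
+    // lane is therefore enough to cover it in a single step.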
+ int64_t ichar = out_end_aligned + lane_id; + if (ichar < len) { dst[ichar] = in_start[ichar]; } + } +} + +// data parallel strcpy +__device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) +{ + using cudf::detail::warp_size; + if (len > 64) { + wideStrcpy(dst, src, len, lane_id); + } else { + for (int i = lane_id; i < len; i += warp_size) { + dst[i] = src[i]; + } + } +} + +template +__device__ std::pair page_bounds(page_state_s* const s, + size_t min_row, + size_t num_rows, + bool is_bounds_pg, + bool has_repetition, + rle_stream* decoders, + int t) +{ + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + // decode batches of level stream data using rle_stream objects and use the results to + // calculate start and end value positions in the encoded string data. + int const max_depth = s->col.max_nesting_depth; + int const max_def = s->nesting_info[max_depth - 1].max_def_level; + + // can skip all this if we know there are no nulls + if (max_def == 0 && !is_bounds_pg) { + s->page.num_valids = s->num_input_values; + s->page.num_nulls = 0; + return {0, s->num_input_values}; + } + + int start_value = 0; + int end_value = s->page.num_input_values; + auto const pp = &s->page; + auto const col = &s->col; + + // initialize the stream decoders (requires values computed in setupLocalPageInfo) + int const max_batch_size = lvl_buf_size; + uint32_t* def_decode = pp->lvl_decode_buf[level_type::DEFINITION]; + uint32_t* rep_decode = pp->lvl_decode_buf[level_type::REPETITION]; + decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], + s->abs_lvl_start[level_type::DEFINITION], + s->abs_lvl_end[level_type::DEFINITION], + max_batch_size, + def_decode, + s->page.num_input_values); + // only need repetition if this is a bounds page. otherwise all we need is def level info + // to count the nulls. + if (has_repetition && is_bounds_pg) { + decoders[level_type::REPETITION].init(s->col.level_bits[level_type::REPETITION], + s->abs_lvl_start[level_type::REPETITION], + s->abs_lvl_end[level_type::REPETITION], + max_batch_size, + rep_decode, + s->page.num_input_values); + } + + int processed = 0; + + // if this is a bounds page, we need to do extra work to find the start and/or end value index + // TODO calculate num_nulls + if (is_bounds_pg) { + __shared__ int skipped_values; + __shared__ int skipped_leaf_values; + __shared__ int last_input_value; + __shared__ int end_val_idx; + + // need these for skip_rows case + auto const page_start_row = col->start_row + pp->chunk_row; + auto const max_row = min_row + num_rows; + auto const begin_row = page_start_row >= min_row ? 0 : min_row - page_start_row; + auto const max_page_rows = pp->num_rows - begin_row; + auto const page_rows = page_start_row + begin_row + max_page_rows <= max_row + ? 
max_page_rows + : max_row - (page_start_row + begin_row); + auto const end_row = begin_row + page_rows; + + // short circuit for no nulls + if (max_def == 0 && !has_repetition) { return {begin_row, end_row}; } + + int row_count = 0; + int leaf_count = 0; + bool skipped_values_set = false; + bool end_value_set = false; + + while (processed < s->page.num_input_values) { + int start_val = processed; + + if (has_repetition) { + decoders[level_type::REPETITION].decode_next(t); + __syncthreads(); + } + + // the # of rep/def levels will always be the same size + processed += decoders[level_type::DEFINITION].decode_next(t); + __syncthreads(); + + // do something with the level data + while (start_val < processed) { + int idx_t = start_val + t; + int idx = rolling_lvl_index(idx_t); + + // get absolute thread row index + int is_new_row = idx_t < processed && (!has_repetition || rep_decode[idx] == 0); + int thread_row_count, block_row_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(is_new_row, thread_row_count, block_row_count); + __syncthreads(); + + // get absolute thread leaf index + int const is_new_leaf = idx_t < processed && (def_decode[idx] >= max_def); + int thread_leaf_count, block_leaf_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(is_new_leaf, thread_leaf_count, block_leaf_count); + __syncthreads(); + + // if we have not set skipped values yet, see if we found the first in-bounds row + if (!skipped_values_set && row_count + block_row_count > begin_row) { + // if this thread is in row bounds + int const row_index = (thread_row_count + row_count) - 1; + int in_row_bounds = + idx_t < processed && (row_index >= begin_row) && (row_index < end_row); + + int local_count, global_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(in_row_bounds, local_count, global_count); + __syncthreads(); + + // we found it + if (global_count > 0) { + // this is the thread that represents the first row. need to test in_row_bounds for + // the case where we only want one row and local_count == 1 for many threads. + if (local_count == 1 && in_row_bounds) { + skipped_values = idx_t; + skipped_leaf_values = + leaf_count + (is_new_leaf ? thread_leaf_count - 1 : thread_leaf_count); + } + skipped_values_set = true; + } + } + + // test if row_count will exceed end_row in this batch + if (!end_value_set && row_count + block_row_count >= end_row) { + // if this thread exceeds row bounds + int const row_index = (thread_row_count + row_count) - 1; + int exceeds_row_bounds = row_index >= end_row; + + int local_count, global_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(exceeds_row_bounds, local_count, global_count); + __syncthreads(); + + // we found it + if (global_count > 0) { + // this is the thread that represents the end row. + if (local_count == 1) { + last_input_value = idx_t; + end_val_idx = leaf_count + (is_new_leaf ? thread_leaf_count - 1 : thread_leaf_count); + } + end_value_set = true; + } + } + + row_count += block_row_count; + leaf_count += block_leaf_count; + + start_val += preprocess_block_size; + } + __syncthreads(); + } + + start_value = skipped_values_set ? skipped_leaf_values : 0; + end_value = end_value_set ? end_val_idx : leaf_count; + + if (t == 0) { + int const v0 = skipped_values_set ? skipped_values : 0; + int const vn = end_value_set ? 
last_input_value : s->num_input_values; + int const total_values = vn - v0; + int const total_leaf_values = end_value - start_value; + int const num_nulls = total_values - total_leaf_values; + pp->num_nulls = num_nulls; + pp->num_valids = total_leaf_values; +#if 0 + printf("%05d: input vals in page %d,%d lc %d v0 %d vn %d %d nz %d nc %d\n", + blockIdx.x, + skipped_values_set, + end_value_set, + leaf_count, + v0, + vn, + total_values, + total_leaf_values, + num_nulls); +#endif + } + } + // already filtered out unwanted pages, so need to count all non-null values in this page + else { + int num_nulls = 0; + while (processed < s->page.num_input_values) { + int start_val = processed; + processed += decoders[level_type::DEFINITION].decode_next(t); + __syncthreads(); + + while (start_val < processed) { + int idx_t = start_val + t; + if (idx_t < processed) { + int idx = rolling_lvl_index(idx_t); + if (def_decode[idx] < max_def) { num_nulls++; } + } + start_val += preprocess_block_size; + } + __syncthreads(); + } + + int const null_count = block_reduce(temp_storage.reduce_storage).Sum(num_nulls); + + if (t == 0) { + pp->num_nulls = null_count; + pp->num_valids = pp->num_input_values - null_count; + } + __syncthreads(); + + end_value -= pp->num_nulls; + } + + return {start_value, end_value}; +} + +__device__ size_t countDictEntries(uint8_t const* data, + uint8_t const* dict_base, + int dict_bits, + int dict_size, + int data_size, + int start_value, + int end_value, + int t) +{ + uint8_t const* ptr = data; + uint8_t const* const end = data + data_size; + int const bytecnt = (dict_bits + 7) >> 3; + size_t l_str_len = 0; // partial sums across threads + int pos = 0; // current value index in the data stream + int t0 = 0; // thread 0 for this batch + + int dict_run = 0; + int dict_val = 0; + + while (pos < end_value && ptr <= end) { + if (dict_run <= 1) { + dict_run = (ptr < end) ? get_vlq32(ptr, end) : 0; + if (!(dict_run & 1)) { + // Repeated value + if (ptr + bytecnt <= end) { + int32_t run_val = ptr[0]; + if (bytecnt > 1) { + run_val |= ptr[1] << 8; + if (bytecnt > 2) { + run_val |= ptr[2] << 16; + if (bytecnt > 3) { run_val |= ptr[3] << 24; } + } + } + dict_val = run_val & ((1 << dict_bits) - 1); + } + ptr += bytecnt; + } + } + + int batch_len; + if (dict_run & 1) { + // Literal batch: must output a multiple of 8, except for the last batch + int batch_len_div8; + batch_len = max(min(preprocess_block_size, (int)(dict_run >> 1) * 8), 1); + batch_len_div8 = (batch_len + 7) >> 3; + dict_run -= batch_len_div8 * 2; + ptr += batch_len_div8 * dict_bits; + } else { + batch_len = dict_run >> 1; + dict_run = 0; + } + + int is_literal = dict_run & 1; + // if (t == 0 && blockIdx.x == 1) printf("batch_len %d is_lit %d\n", batch_len, is_literal); + + // calculate my thread id for this batch. way to round-robin the work. + int mytid = t - t0; + if (mytid < 0) mytid += preprocess_block_size; + + // compute dictionary index. 
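+    // (editor's sketch) each lane below extracts its own dict_bits-wide,
+    // LSB-first bit field from the RLE literal run. a scalar equivalent of the
+    // extraction, assuming the buffer is fully in bounds, would be:
+    //
+    //   uint32_t unpack_bits(uint8_t const* buf, int64_t bit_ofs, int nbits) {
+    //     uint32_t v = 0;
+    //     for (int i = 0; i < nbits; i++) {
+    //       v |= uint32_t{(buf[(bit_ofs + i) >> 3] >> ((bit_ofs + i) & 7)) & 1u} << i;
+    //     }
+    //     return v;
+    //   }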
+ if (is_literal) { + int dict_idx = 0; + if (mytid < batch_len) { + dict_idx = dict_val; + int32_t ofs = (mytid - ((batch_len + 7) & ~7)) * dict_bits; + const uint8_t* p = ptr + (ofs >> 3); + ofs &= 7; + if (p < end) { + uint32_t c = 8 - ofs; + dict_idx = (*p++) >> ofs; + if (c < dict_bits && p < end) { + dict_idx |= (*p++) << c; + c += 8; + if (c < dict_bits && p < end) { + dict_idx |= (*p++) << c; + c += 8; + if (c < dict_bits && p < end) { dict_idx |= (*p++) << c; } + } + } + dict_idx &= (1 << dict_bits) - 1; + } + + if (pos + mytid < end_value) { + uint32_t const dict_pos = (dict_bits > 0) ? dict_idx * sizeof(string_index_pair) : 0; + if (pos + mytid >= start_value && dict_pos < (uint32_t)dict_size) { + const auto* src = reinterpret_cast(dict_base + dict_pos); + l_str_len += src->second; + } + } + } + + t0 += batch_len; + } else { + int start_off = (pos < start_value && pos + batch_len > start_value) ? start_value - pos : 0; + batch_len = min(batch_len, end_value - pos); + if (mytid == 0) { + uint32_t const dict_pos = (dict_bits > 0) ? dict_val * sizeof(string_index_pair) : 0; + if (pos + batch_len > start_value && dict_pos < (uint32_t)dict_size) { + const auto* src = reinterpret_cast(dict_base + dict_pos); + l_str_len += (batch_len - start_off) * src->second; + } + } + + t0 += 1; + } + + t0 = t0 % preprocess_block_size; + pos += batch_len; + } + __syncthreads(); + + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage reduce_storage; + size_t sum_l = block_reduce(reduce_storage).Sum(l_str_len); + + return sum_l; +} + +__device__ size_t +countPlainEntries(uint8_t const* data, int data_size, int start_value, int end_value, int t) +{ + int pos = 0; + size_t total_len = 0; + + // This step is purely serial + if (!t) { + const uint8_t* cur = data; + int k = 0; + + while (pos < end_value && k < data_size) { + int len; + if (k + 4 <= data_size) { + len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); + k += 4; + if (k + len > data_size) { len = 0; } + } else { + len = 0; + } + + k += len; + if (pos >= start_value) { total_len += len; } + pos++; + } + } + + return total_len; +} + +template +__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) page_state_s state_g; + + page_state_s* const s = &state_g; + int page_idx = blockIdx.x; + int t = threadIdx.x; + PageInfo* pp = &pages[page_idx]; + + // reset str_bytes to 0 in case it's already been calculated + if (t == 0) { pp->str_bytes = 0; } + + // only count if it's a string column + auto const col = &chunks[pp->chunk_idx]; + uint32_t dtype = col->data_type & 7; + uint32_t dtype_len_out = col->data_type >> 3; + if (dtype != BYTE_ARRAY || dtype_len_out == 4) { return; } + + // whether or not we have repetition levels (lists) + bool has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + + // the level stream decoders + __shared__ rle_run def_runs[run_buffer_size]; + __shared__ rle_run rep_runs[run_buffer_size]; + rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; + + // setup page info + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, false, decoders)) { return; } + + if (!t) { + s->page.num_nulls = 0; + s->page.str_bytes = 0; + } + __syncthreads(); + + bool is_bounds_pg = is_bounds_page(s, min_row, num_rows); + + // if we're skipping this page anyway, no need to count it + if (!is_bounds_pg && 
!is_page_contained(s, min_row, num_rows)) { return; } + + // find start/end value indices + auto const [start_value, end_value] = + page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders, t); + + // need to save num_nulls calculated in page_bounds in this page + // FIXME: num_nulls is only correct for !is_bounds_pg...need to fix this + if (t == 0) { + pp->num_nulls = s->page.num_nulls; + pp->num_valids = s->page.num_valids; + } +#if 0 + if (t == 0) + printf( + "%05d: col %d start_val %d end_val %d is_bounds %d is_contained %d (%ld,%ld] (%ld,%ld]\n", + blockIdx.x, + col->src_col_index, + start_value, + end_value, + is_bounds_pg, + is_page_contained(s, min_row, num_rows), + min_row, + min_row + num_rows, + col->start_row + pp->chunk_row, + col->start_row + pp->chunk_row + pp->num_rows); +#endif + + // now process string info in the range [start_value, end_value) + // set up for decoding strings...can be either plain or dictionary + uint8_t const* data = s->data_start; + uint8_t const* const end = s->data_end; + uint8_t const* dict_base = nullptr; + int dict_size = 0; + size_t str_bytes = 0; + + switch (pp->encoding) { + case Encoding::PLAIN_DICTIONARY: + case Encoding::RLE_DICTIONARY: + // RLE-packed dictionary indices, first byte indicates index length in bits + if (col->str_dict_index) { + // String dictionary: use index + dict_base = reinterpret_cast(col->str_dict_index); + dict_size = col->page_info[0].num_input_values * sizeof(string_index_pair); + } else { + dict_base = col->page_info[0].page_data; // dictionary is always stored in the first page + dict_size = col->page_info[0].uncompressed_page_size; + } + + if (s->dict_bits > 32 || !dict_base) { + printf("%03d: error %d %p\n", t, s->dict_bits, dict_base); + CUDF_UNREACHABLE("invalid dictionary bit size"); + } + + str_bytes = countDictEntries( + data, dict_base, s->dict_bits, dict_size, (end - data), start_value, end_value, t); + break; + case Encoding::PLAIN: + dict_size = static_cast(end - data); + str_bytes = is_bounds_pg ? countPlainEntries(data, dict_size, start_value, end_value, t) + : dict_size - sizeof(int) * (pp->num_input_values - pp->num_nulls); + break; + } + + if (t == 0) { + // TODO check for overflow + pp->str_bytes = str_bytes; + // printf("%05d: string size %ld %d\n", blockIdx.x, str_bytes, col->src_col_index); + } +} + +template +__global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) page_state_s state_g; + __shared__ __align__(16) page_state_buffers_s state_buffers; + __shared__ __align__(4) size_type last_offset; + + page_state_s* const s = &state_g; + page_state_buffers_s* const sb = &state_buffers; + int page_idx = blockIdx.x; + int t = threadIdx.x; + int out_thread0; + [[maybe_unused]] null_count_back_copier _{s, t}; + + if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } + + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + + if ((s->col.data_type & 7) != BYTE_ARRAY || s->dtype_len == 4) { return; } + + // offsets is global...but the output is local, so account for that below + if (t == 0) { last_offset = s->page.str_offset; } + __syncthreads(); + + // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. 
+ // + // corner case: in the case of lists, we can have pages that contain "0" rows if the current row + // starts before this page and ends after this page: + // P0 P1 P2 + // |---------|---------|----------| + // ^------------------^ + // row start row end + // P1 will contain 0 rows + // + if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) || + is_page_contained(s, min_row, num_rows)))) { + return; + } + + if (s->dict_base) { + out_thread0 = (s->dict_bits > 0) ? 64 : 32; + } else { + out_thread0 = + ((s->col.data_type & 7) == BOOLEAN || (s->col.data_type & 7) == BYTE_ARRAY) ? 64 : 32; + } + + PageNestingDecodeInfo* nesting_info_base = s->nesting_info; + + __shared__ uint32_t rep[non_zero_buffer_size]; // circular buffer of repetition level values + __shared__ uint32_t def[non_zero_buffer_size]; // circular buffer of definition level values + + // skipped_leaf_values will always be 0 for flat hierarchies. + uint32_t skipped_leaf_values = s->page.skipped_leaf_values; + while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + int target_pos; + int src_pos = s->src_pos; + + if (t < out_thread0) { + target_pos = min(src_pos + 2 * (decode_block_size - out_thread0), + s->nz_count + (decode_block_size - out_thread0)); + } else { + target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0); + if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); } + } + __syncthreads(); + if (t < 32) { + // decode repetition and definition levels. + // - update validity vectors + // - updates offsets (for nested columns) + // - produces non-NULL value indices in s->nz_idx for subsequent decoding + gpuDecodeLevels(s, sb, target_pos, rep, def, t); + } else if (t < out_thread0) { + // skipped_leaf_values will always be 0 for flat hierarchies. + uint32_t src_target_pos = target_pos + skipped_leaf_values; + + // WARP1: Decode dictionary indices, booleans or string positions + if (s->dict_base) { + src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, t & 0x1f).first; + } else { + gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); + } + if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } + } else { + // WARP1..WARP3: Decode values + src_pos += t - out_thread0; + + // the position in the output column/buffer + int dst_pos = sb->nz_idx[rolling_index(src_pos)]; + + // for the flat hierarchy case we will be reading from the beginning of the value stream, + // regardless of the value of first_row. so adjust our destination offset accordingly. + // example: + // - user has passed skip_rows = 2, so our first_row to output is 2 + // - the row values we get from nz_idx will be + // 0, 1, 2, 3, 4 .... + // - by shifting these values by first_row, the sequence becomes + // -1, -2, 0, 1, 2 ... + // - so we will end up ignoring the first two input rows, and input rows 2..n will + // get written to the output starting at position 0. 
+ // + if (!has_repetition) { dst_pos -= s->first_row; } + + // need to do this before we branch on src_pos/dst_pos so we don't deadlock + // choose a character parallel string copy when the average string is longer than a warp + auto const use_char_ll = s->page.num_valids > 0 && + (s->page.str_bytes / s->page.num_valids) >= cudf::detail::warp_size; + int const leaf_level_index = s->col.max_nesting_depth - 1; + int const me = t - out_thread0; + + if (me < 32) { + for (int i = 0; i < decode_block_size - out_thread0; i += 32) { + dst_pos = sb->nz_idx[rolling_index(src_pos + i)]; + if (!has_repetition) { dst_pos -= s->first_row; } + + auto [ptr, len] = src_pos + i < target_pos && dst_pos >= 0 + ? gpuGetStringData(s, sb, src_pos + skipped_leaf_values + i) + : cuda::std::pair{nullptr, 0}; + + __shared__ cub::WarpScan::TempStorage temp_storage; + size_type offset; + cub::WarpScan(temp_storage).ExclusiveSum(len, offset); + offset += last_offset; + + if (use_char_ll) { + // TODO: might want separate kernel for string page decoding so we don't waste all + // this shared memory on non-string columns. + __shared__ __align__(8) uint8_t const* pointers[32]; + __shared__ __align__(4) size_type offsets[32]; + __shared__ __align__(4) int dsts[32]; + __shared__ __align__(4) int lengths[32]; + + offsets[me] = offset; + pointers[me] = reinterpret_cast(ptr); + dsts[me] = dst_pos; + lengths[me] = len; + __syncwarp(); + + for (int ss = 0; ss < 32 && ss + i + s->src_pos < target_pos; ss++) { + if (dsts[ss] >= 0) { + auto offptr = + reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + + dsts[ss]; + *offptr = offsets[ss]; + auto str_ptr = + nesting_info_base[leaf_level_index].string_out + offsets[ss] - s->page.str_offset; + ll_strcpy(str_ptr, pointers[ss], lengths[ss], me); +#if 0 + if (is_bounds_page(s, min_row, num_rows)) { + if (me == 0) + printf("%05d,%03d: src %d dst %d len %d offset %d\n", + blockIdx.x, + me, + src_pos + i + ss, + dsts[ss], + lengths[ss], + offsets[ss]); + } +#endif + } + } + + } else { + if (src_pos + i < target_pos && dst_pos >= 0) { + auto offptr = + reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dst_pos; + *offptr = offset; + auto str_ptr = + nesting_info_base[leaf_level_index].string_out + offset - s->page.str_offset; + memcpy(str_ptr, ptr, len); +#if 0 + if (is_bounds_page(s, min_row, num_rows)) { + printf("%05d,%03d: src %d dst %d len %ld offset %d\n", + blockIdx.x, + t, + src_pos + i, + dst_pos, + len, + offset); + } +#endif + } + __syncwarp(); + } + + if (me == 31) { last_offset = offset + len; } + __syncwarp(); + } + } + + if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; } + } + __syncthreads(); + } + + // if there are nulls and this is a string column, clean up the offsets array. + // but if there's a list parent, then no need. 
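+  // Worked example (illustrative values, not part of this change): with value_count = 6,
+  // str_offset = 100, str_bytes = 6, and decoded offsets [100, 0, 104, 0, 0, 0]
+  // (zeros mark nulls), the backward pass below yields [100, 104, 104, 106, 106, 106]:
+  // a trailing zero becomes str_offset + str_bytes, every other zero takes the offset
+  // to its right, and slot 0 is pinned to str_offset.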
+#if 0 + if ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4) { + int const leaf_level_index = s->col.max_nesting_depth - 1; + if (t == 0 && is_bounds_page(s, min_row, num_rows)) { + printf("%05d: nz %d nulls %d valids %d iv %d nival %d nivalid %d\n", + blockIdx.x, + s->nz_count, + s->page.num_nulls, + s->page.num_valids, + s->num_input_values, + nesting_info_base[leaf_level_index].value_count, + nesting_info_base[leaf_level_index].valid_count); + } + } +#endif + + if (s->page.num_nulls != 0) { + int const value_count = s->page.num_valids + s->page.num_nulls; + int const leaf_level_index = s->col.max_nesting_depth - 1; + + auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); + + if (nesting_info_base[leaf_level_index].null_count > 0) { + // if nz_count is 0, then it's all nulls. set all offsets to str_offset + if (s->nz_count == 0) { + for (int i = t; i < value_count; i += decode_block_size) { + offptr[i] = s->page.str_offset; + } + } + // just some nulls, do this serially for now + else if (t == 0) { + if (offptr[value_count - 1] == 0) { + offptr[value_count - 1] = s->page.str_offset + s->page.str_bytes; + } + for (int i = value_count - 2; i > 0; i--) { + if (offptr[i] == 0) { offptr[i] = offptr[i + 1]; } + } + offptr[0] = s->page.str_offset; + } + } + __syncthreads(); +#if 0 + if (t == 0) + printf("%05d: offptr %p/%p %d %d\n", + blockIdx.x, + offptr, + offptr + value_count, + offptr[value_count - 2], + offptr[value_count - 1]); +#endif + } +} + +} // anonymous namespace + +void ComputePageStringSizes(hostdevice_vector& pages, + hostdevice_vector const& chunks, + size_t min_row, + size_t num_rows, + rmm::cuda_stream_view stream) +{ + dim3 dim_block(preprocess_block_size, 1); + dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + gpuComputePageStringSizes + <<>>(pages.device_ptr(), chunks, min_row, num_rows); +} + +/** + * @copydoc cudf::io::parquet::gpu::DecodePageData + */ +void __host__ DecodeStringPageData(hostdevice_vector& pages, + hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); + + dim3 dim_block(decode_block_size, 1); + dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + + gpuDecodeStringPageData + <<>>(pages.device_ptr(), chunks, min_row, num_rows); +} + +} // namespace gpu +} // namespace parquet +} // namespace io +} // namespace cudf From 59bd2d6410735b72e348a0ba217e4b8f011f4047 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 3 May 2023 14:11:06 -0700 Subject: [PATCH 030/114] finish merge --- cpp/src/io/parquet/page_decode.cuh | 18 +++++++++--------- cpp/src/io/parquet/page_string_decode.cu | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index c59fedd6577..c49d70ea72c 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -365,7 +365,7 @@ __device__ size_type gpuInitStringDescriptors(volatile page_state_s* s, * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION */ __device__ void gpuDecodeStream( - uint32_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl) + level_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl) { const uint8_t* cur_def = s->lvl_start[lvl]; const uint8_t* end = s->lvl_end; @@ -516,8 +516,8 @@ inline __device__ void get_nesting_bounds(int& start_depth, int& end_depth, int& d, page_state_s* s, 
- uint32_t const* const rep, - uint32_t const* const def, + level_t const* const rep, + level_t const* const def, int input_value_count, int32_t target_input_value_count, int t) @@ -526,8 +526,8 @@ inline __device__ void get_nesting_bounds(int& start_depth, end_depth = -1; d = -1; if (input_value_count + t < target_input_value_count) { - int index = rolling_lvl_index(input_value_count + t); - d = def[index]; + level_t index = rolling_lvl_index(input_value_count + t); + d = def[index]; // if we have repetition (there are list columns involved) we have to // bound what nesting levels we apply values to if (s->col.max_level[level_type::REPETITION] > 0) { @@ -559,8 +559,8 @@ template __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, page_state_s* s, page_state_buffers_s* sb, - uint32_t const* const rep, - uint32_t const* const def, + level_t const* const rep, + level_t const* const def, int t) { // max nesting depth of the column @@ -738,8 +738,8 @@ template __device__ void gpuDecodeLevels(page_state_s* s, page_state_buffers_s* sb, int32_t target_leaf_count, - uint32_t* const rep, - uint32_t* const def, + level_t* const rep, + level_t* const def, int t) { bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 091a0673f76..9007761e227 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -125,8 +125,8 @@ __device__ std::pair page_bounds(page_state_s* const s, // initialize the stream decoders (requires values computed in setupLocalPageInfo) int const max_batch_size = lvl_buf_size; - uint32_t* def_decode = pp->lvl_decode_buf[level_type::DEFINITION]; - uint32_t* rep_decode = pp->lvl_decode_buf[level_type::REPETITION]; + level_t* def_decode = pp->lvl_decode_buf[level_type::DEFINITION]; + level_t* rep_decode = pp->lvl_decode_buf[level_type::REPETITION]; decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], s->abs_lvl_start[level_type::DEFINITION], s->abs_lvl_end[level_type::DEFINITION], @@ -624,8 +624,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( PageNestingDecodeInfo* nesting_info_base = s->nesting_info; - __shared__ uint32_t rep[non_zero_buffer_size]; // circular buffer of repetition level values - __shared__ uint32_t def[non_zero_buffer_size]; // circular buffer of definition level values + __shared__ level_t rep[non_zero_buffer_size]; // circular buffer of repetition level values + __shared__ level_t def[non_zero_buffer_size]; // circular buffer of definition level values // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t skipped_leaf_values = s->page.skipped_leaf_values; From 8986afb3336412f17e0a10d81014926332ab2f5f Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 4 May 2023 13:08:25 -0700 Subject: [PATCH 031/114] clean up --- cpp/src/io/parquet/reader_impl.cpp | 35 +++++++++--------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index e5de7e4098f..6bf9cc708b4 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -17,7 +17,6 @@ #include "reader_impl.hpp" #include -#include #include @@ -38,13 +37,13 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); - // Here's the plan. 
Compute string sizes in case it hasn't already been done. can work out - // if this is redundant later. Then allocate buffers for the string data, and offsets to the - // first string. pass this to decode, where string data will be written to the buffer rather - // than to _strings. also need to allocate a size_type buffer to hold strings offsets, which - // will be calculated as we're writing the data. once done, we'll have for each string column - // a char array with the contiguous string data, and a size_type array of offsets. use these - // as child columns and create string column. no need to call create_strings_column now. + // Check to see if there are any string columns present. If so, then we need to get size info + // for each string page. This size info will be used to pre-allocate memory for the column, + // allowing the page decoder to write string data directly to the column buffer, rather than + // doing a gather operation later on. + // TODO: The current implementation does a round trip for the page info. Need to explore doing + // this step on device. This call is also somewhat redundant if size info has already been + // calculated (nested schema, chunked reader). auto const has_strings = std::any_of(pages.begin(), pages.end(), [&chunks](auto const& page) { auto const& chunk = chunks[page.chunk_idx]; return (chunk.data_type & 7) == BYTE_ARRAY && (chunk.data_type >> 3) != 4; @@ -68,8 +67,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) } } } - // for (size_t i=0; i < col_sizes.size(); i++) - // printf("col %ld size %d\n", i, col_sizes[i]); + pages.host_to_device(_stream); } // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector @@ -161,25 +159,13 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) page_count += chunks[c].max_num_pages; } - if (has_strings) { - pages.host_to_device(_stream); // FIXME: get rid of this eventually - } chunks.host_to_device(_stream); chunk_nested_valids.host_to_device(_stream); chunk_nested_data.host_to_device(_stream); chunk_nested_str_data.host_to_device(_stream); - { - rmm::cuda_stream_pool pool(2); - auto s1 = pool.get_stream(); - auto s2 = pool.get_stream(); - if (has_strings) { - gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, s1); - } - gpu::DecodePageData(pages, chunks, num_rows, skip_rows, s2); - s2.synchronize(); - s1.synchronize(); - } + gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _stream); + gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, _stream); pages.device_to_host(_stream); page_nesting.device_to_host(_stream); @@ -222,7 +208,6 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) sizeof(size_type), cudaMemcpyDefault, _stream.value()); - // printf("col %ld sz %d colsize %d\n", idx, out_buf.size, sz); } } } From 6b352a576b4025c39803b82c79262f6865aa7a86 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 4 May 2023 13:21:47 -0700 Subject: [PATCH 032/114] clean up --- cpp/src/io/parquet/page_string_decode.cu | 304 ++++++++++++++++------- 1 file changed, 209 insertions(+), 95 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 9007761e227..50c205d9828 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -269,18 +269,6 @@ __device__ std::pair page_bounds(page_state_s* const s, int const num_nulls = total_values - total_leaf_values; pp->num_nulls = num_nulls; pp->num_valids 
= total_leaf_values; -#if 0 - printf("%05d: input vals in page %d,%d lc %d v0 %d vn %d %d nz %d nc %d\n", - blockIdx.x, - skipped_values_set, - end_value_set, - leaf_count, - v0, - vn, - total_values, - total_leaf_values, - num_nulls); -#endif } } // already filtered out unwanted pages, so need to count all non-null values in this page @@ -369,7 +357,6 @@ __device__ size_t countDictEntries(uint8_t const* data, } int is_literal = dict_run & 1; - // if (t == 0 && blockIdx.x == 1) printf("batch_len %d is_lit %d\n", batch_len, is_literal); // calculate my thread id for this batch. way to round-robin the work. int mytid = t - t0; @@ -510,27 +497,11 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz auto const [start_value, end_value] = page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders, t); - // need to save num_nulls calculated in page_bounds in this page - // FIXME: num_nulls is only correct for !is_bounds_pg...need to fix this + // need to save num_nulls and num_valids calculated in page_bounds in this page if (t == 0) { pp->num_nulls = s->page.num_nulls; pp->num_valids = s->page.num_valids; } -#if 0 - if (t == 0) - printf( - "%05d: col %d start_val %d end_val %d is_bounds %d is_contained %d (%ld,%ld] (%ld,%ld]\n", - blockIdx.x, - col->src_col_index, - start_value, - end_value, - is_bounds_pg, - is_page_contained(s, min_row, num_rows), - min_row, - min_row + num_rows, - col->start_row + pp->chunk_row, - col->start_row + pp->chunk_row + pp->num_rows); -#endif // now process string info in the range [start_value, end_value) // set up for decoding strings...can be either plain or dictionary @@ -553,10 +524,8 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz dict_size = col->page_info[0].uncompressed_page_size; } - if (s->dict_bits > 32 || !dict_base) { - printf("%03d: error %d %p\n", t, s->dict_bits, dict_base); - CUDF_UNREACHABLE("invalid dictionary bit size"); - } + // FIXME: need to return an error condition...this won't actually do anything + if (s->dict_bits > 32 || !dict_base) { CUDF_UNREACHABLE("invalid dictionary bit size"); } str_bytes = countDictEntries( data, dict_base, s->dict_bits, dict_size, (end - data), start_value, end_value, t); @@ -571,7 +540,6 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz if (t == 0) { // TODO check for overflow pp->str_bytes = str_bytes; - // printf("%05d: string size %ld %d\n", blockIdx.x, str_bytes, col->src_col_index); } } @@ -594,6 +562,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + // return if not a string column if ((s->col.data_type & 7) != BYTE_ARRAY || s->dtype_len == 4) { return; } // offsets is global...but the output is local, so account for that below @@ -680,13 +649,14 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( // need to do this before we branch on src_pos/dst_pos so we don't deadlock // choose a character parallel string copy when the average string is longer than a warp - auto const use_char_ll = s->page.num_valids > 0 && - (s->page.str_bytes / s->page.num_valids) >= cudf::detail::warp_size; + using cudf::detail::warp_size; + auto const use_char_ll = + s->page.num_valids > 0 && (s->page.str_bytes / s->page.num_valids) >= warp_size; int const leaf_level_index = s->col.max_nesting_depth - 1; int const me = t - out_thread0; - if (me < 32) { - for (int i = 0; i < 
decode_block_size - out_thread0; i += 32) { + if (me < warp_size) { + for (int i = 0; i < decode_block_size - out_thread0; i += warp_size) { dst_pos = sb->nz_idx[rolling_index(src_pos + i)]; if (!has_repetition) { dst_pos -= s->first_row; } @@ -700,12 +670,10 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( offset += last_offset; if (use_char_ll) { - // TODO: might want separate kernel for string page decoding so we don't waste all - // this shared memory on non-string columns. - __shared__ __align__(8) uint8_t const* pointers[32]; - __shared__ __align__(4) size_type offsets[32]; - __shared__ __align__(4) int dsts[32]; - __shared__ __align__(4) int lengths[32]; + __shared__ __align__(8) uint8_t const* pointers[warp_size]; + __shared__ __align__(4) size_type offsets[warp_size]; + __shared__ __align__(4) int dsts[warp_size]; + __shared__ __align__(4) int lengths[warp_size]; offsets[me] = offset; pointers[me] = reinterpret_cast(ptr); @@ -713,7 +681,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( lengths[me] = len; __syncwarp(); - for (int ss = 0; ss < 32 && ss + i + s->src_pos < target_pos; ss++) { + for (int ss = 0; ss < warp_size && ss + i + s->src_pos < target_pos; ss++) { if (dsts[ss] >= 0) { auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + @@ -722,18 +690,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( auto str_ptr = nesting_info_base[leaf_level_index].string_out + offsets[ss] - s->page.str_offset; ll_strcpy(str_ptr, pointers[ss], lengths[ss], me); -#if 0 - if (is_bounds_page(s, min_row, num_rows)) { - if (me == 0) - printf("%05d,%03d: src %d dst %d len %d offset %d\n", - blockIdx.x, - me, - src_pos + i + ss, - dsts[ss], - lengths[ss], - offsets[ss]); - } -#endif } } @@ -745,22 +701,12 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( auto str_ptr = nesting_info_base[leaf_level_index].string_out + offset - s->page.str_offset; memcpy(str_ptr, ptr, len); -#if 0 - if (is_bounds_page(s, min_row, num_rows)) { - printf("%05d,%03d: src %d dst %d len %ld offset %d\n", - blockIdx.x, - t, - src_pos + i, - dst_pos, - len, - offset); - } -#endif } __syncwarp(); } - if (me == 31) { last_offset = offset + len; } + // last thread in warp updates last_offset + if (me == warp_size - 1) { last_offset = offset + len; } __syncwarp(); } } @@ -770,24 +716,196 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( __syncthreads(); } - // if there are nulls and this is a string column, clean up the offsets array. - // but if there's a list parent, then no need. -#if 0 - if ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4) { + // if there are nulls clean up the offsets array. + if (s->page.num_nulls != 0) { + int const value_count = s->page.num_valids + s->page.num_nulls; int const leaf_level_index = s->col.max_nesting_depth - 1; - if (t == 0 && is_bounds_page(s, min_row, num_rows)) { - printf("%05d: nz %d nulls %d valids %d iv %d nival %d nivalid %d\n", - blockIdx.x, - s->nz_count, - s->page.num_nulls, - s->page.num_valids, - s->num_input_values, - nesting_info_base[leaf_level_index].value_count, - nesting_info_base[leaf_level_index].valid_count); + + auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); + + if (nesting_info_base[leaf_level_index].null_count > 0) { + // if nz_count is 0, then it's all nulls. 
set all offsets to str_offset + if (s->nz_count == 0) { + for (int i = t; i < value_count; i += decode_block_size) { + offptr[i] = s->page.str_offset; + } + } + // just some nulls, do this serially for now + else if (t == 0) { + if (offptr[value_count - 1] == 0) { + offptr[value_count - 1] = s->page.str_offset + s->page.str_bytes; + } + for (int i = value_count - 2; i > 0; i--) { + if (offptr[i] == 0) { offptr[i] = offptr[i + 1]; } + } + offptr[0] = s->page.str_offset; + } } + __syncthreads(); } -#endif +} +template +__global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) page_state_s state_g; + __shared__ __align__(16) page_state_buffers_s state_buffers; + __shared__ __align__(4) size_type last_offset; + + page_state_s* const s = &state_g; + page_state_buffers_s* const sb = &state_buffers; + int page_idx = blockIdx.x; + int t = threadIdx.x; + int out_thread0; + [[maybe_unused]] null_count_back_copier _{s, t}; + + if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } + + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + + // return if not a string column + if ((s->col.data_type & 7) != BYTE_ARRAY || s->dtype_len == 4) { return; } + + // offsets is global...but the output is local, so account for that below + if (t == 0) { last_offset = s->page.str_offset; } + __syncthreads(); + + // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. + // + // corner case: in the case of lists, we can have pages that contain "0" rows if the current row + // starts before this page and ends after this page: + // P0 P1 P2 + // |---------|---------|----------| + // ^------------------^ + // row start row end + // P1 will contain 0 rows + // + if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) || + is_page_contained(s, min_row, num_rows)))) { + return; + } + + out_thread0 = s->dict_base && s->dict_bits == 0 ? 32 : 64; + + PageNestingDecodeInfo* nesting_info_base = s->nesting_info; + + __shared__ level_t rep[non_zero_buffer_size]; // circular buffer of repetition level values + __shared__ level_t def[non_zero_buffer_size]; // circular buffer of definition level values + + // skipped_leaf_values will always be 0 for flat hierarchies. + uint32_t skipped_leaf_values = s->page.skipped_leaf_values; + while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + int src_pos = s->src_pos; + // target pos for level decoding + int target_pos = min(src_pos + decode_block_size, s->nz_count + decode_block_size); + + if (t < 32) { + // decode repetition and definition levels. + // - update validity vectors + // - updates offsets (for nested columns) + // - produces non-NULL value indices in s->nz_idx for subsequent decoding + gpuDecodeLevels(s, sb, target_pos, rep, def, t); + } else if (t < out_thread0) { + // skipped_leaf_values will always be 0 for flat hierarchies. 
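+      // (illustrative note) for nested pages, skip_rows can begin mid-page, so the
+      // source position is advanced past the skipped_leaf_values leaf values that
+      // precede the first output row; flat columns instead read from the start of
+      // the value stream and shift output positions by first_row, as described in
+      // the comment below.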
+      uint32_t src_target_pos = target_pos + skipped_leaf_values;
+
+      // WARP1: Decode dictionary indices, booleans or string positions
+      if (s->dict_base) {
+        src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, t & 0x1f).first;
+      } else {
+        gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f);
+      }
+      if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; }
+    }
+    __syncthreads();
+
+    // target_pos for value decoding
+    target_pos = min(s->nz_count, target_pos);
+
+    // Decode values
+    src_pos += t;
+
+    // the position in the output column/buffer
+    int dst_pos = sb->nz_idx[rolling_index(src_pos)];
+
+    // for the flat hierarchy case we will be reading from the beginning of the value stream,
+    // regardless of the value of first_row. so adjust our destination offset accordingly.
+    // example:
+    // - user has passed skip_rows = 2, so our first_row to output is 2
+    // - the row values we get from nz_idx will be
+    //   0, 1, 2, 3, 4 ....
+    // - by shifting these values by first_row, the sequence becomes
+    //   -1, -2, 0, 1, 2 ...
+    // - so we will end up ignoring the first two input rows, and input rows 2..n will
+    //   get written to the output starting at position 0.
+    //
+    if (!has_repetition) { dst_pos -= s->first_row; }
+
+    // need to do this before we branch on src_pos/dst_pos so we don't deadlock
+    // choose a character parallel string copy when the average string is longer than a warp
+    using cudf::detail::warp_size;
+    auto const use_char_ll =
+      s->page.num_valids > 0 && (s->page.str_bytes / s->page.num_valids) >= warp_size;
+    int const leaf_level_index = s->col.max_nesting_depth - 1;
+
+    auto [ptr, len] = src_pos < target_pos && dst_pos >= 0
+                        ? gpuGetStringData(s, sb, src_pos + skipped_leaf_values)
+                        : cuda::std::pair{nullptr, 0};
+
+    using block_scan = cub::BlockScan;
+    __shared__ typename block_scan::TempStorage scan_storage;
+    size_type offset;
+    block_scan(scan_storage).ExclusiveSum(len, offset);
+
+    offset += last_offset;
+
+    if (use_char_ll) {
+      __shared__ __align__(8) uint8_t const* pointers[decode_block_size];
+      __shared__ __align__(4) size_type offsets[decode_block_size];
+      __shared__ __align__(4) int dsts[decode_block_size];
+      __shared__ __align__(4) int lengths[decode_block_size];
+
+      offsets[t]  = offset;
+      pointers[t] = reinterpret_cast(ptr);
+      dsts[t]     = dst_pos;
+      lengths[t]  = len;
+      __syncthreads();
+
+      using cudf::detail::warp_size;
+      constexpr int nwarp = decode_block_size / warp_size;
+      int const warpid    = t / warp_size;
+      int const lane_id   = t % warp_size;
+      for (int ss = warpid; ss < decode_block_size && ss + s->src_pos < target_pos; ss += nwarp) {
+        if (dsts[ss] >= 0) {
+          auto offptr =
+            reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dsts[ss];
+          *offptr = offsets[ss];
+          auto str_ptr =
+            nesting_info_base[leaf_level_index].string_out + offsets[ss] - s->page.str_offset;
+          ll_strcpy(str_ptr, pointers[ss], lengths[ss], lane_id);
+        }
+      }
+    } else {
+      if (src_pos < target_pos && dst_pos >= 0) {
+        auto offptr =
+          reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dst_pos;
+        *offptr = offset;
+        auto str_ptr = nesting_info_base[leaf_level_index].string_out + offset - s->page.str_offset;
+        memcpy(str_ptr, ptr, len);
+      }
+    }
+    __syncthreads();
+
+    // last thread in block updates last_offset.
+    if (t == decode_block_size - 1) {
+      last_offset = offset + len;
+      *(volatile int32_t*)&s->src_pos += decode_block_size;
+    }
+    __syncthreads();
+  }
+
+  // if there are nulls clean up the offsets array.
if (s->page.num_nulls != 0) { int const value_count = s->page.num_valids + s->page.num_nulls; int const leaf_level_index = s->col.max_nesting_depth - 1; @@ -813,15 +931,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( } } __syncthreads(); -#if 0 - if (t == 0) - printf("%05d: offptr %p/%p %d %d\n", - blockIdx.x, - offptr, - offptr + value_count, - offptr[value_count - 2], - offptr[value_count - 1]); -#endif } } @@ -853,8 +962,13 @@ void __host__ DecodeStringPageData(hostdevice_vector& pages, dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - gpuDecodeStringPageData - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + if constexpr (false) { + gpuDecodeStringPageData + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } else { + gpuDecodeStringPageDataV2 + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } } } // namespace gpu From 6aa11202daecd44f95462b48a213386678a9049b Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 4 May 2023 13:29:53 -0700 Subject: [PATCH 033/114] add docstrings --- cpp/src/io/parquet/parquet_gpu.hpp | 48 +++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 62d4ddfc56b..3bcc19623c9 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -281,7 +281,7 @@ struct ColumnChunkDesc { int8_t converted_type; // converted type enum LogicalType logical_type; // logical type int8_t decimal_precision; // Decimal precision - int32_t ts_clock_rate; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) + int32_t ts_clock_rate; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index; // my input column index int32_t src_col_schema; // my schema index in the file @@ -370,16 +370,16 @@ struct slot_type; struct EncColumnChunk { parquet_column_device_view const* col_desc; //!< Column description size_type col_desc_id; - PageFragment* fragments; //!< First fragment in chunk - uint8_t* uncompressed_bfr; //!< Uncompressed page data - uint8_t* compressed_bfr; //!< Compressed page data - statistics_chunk const* stats; //!< Fragment statistics - uint32_t bfr_size; //!< Uncompressed buffer size - uint32_t compressed_size; //!< Compressed buffer size - uint32_t max_page_data_size; //!< Max data size (excluding header) of any page in this chunk - uint32_t page_headers_size; //!< Sum of size of all page headers - size_type start_row; //!< First row of chunk - uint32_t num_rows; //!< Number of rows in chunk + PageFragment* fragments; //!< First fragment in chunk + uint8_t* uncompressed_bfr; //!< Uncompressed page data + uint8_t* compressed_bfr; //!< Compressed page data + statistics_chunk const* stats; //!< Fragment statistics + uint32_t bfr_size; //!< Uncompressed buffer size + uint32_t compressed_size; //!< Compressed buffer size + uint32_t max_page_data_size; //!< Max data size (excluding header) of any page in this chunk + uint32_t page_headers_size; //!< Sum of size of all page headers + size_type start_row; //!< First row of chunk + uint32_t num_rows; //!< Number of rows in chunk size_type num_values; //!< Number of values in chunk. 
Different from num_rows for nested types
   uint32_t first_fragment;  //!< First fragment of chunk
   EncPage* pages;           //!< Ptr to pages that belong to this chunk
@@ -481,6 +481,20 @@ void ComputePageSizes(hostdevice_vector& pages,
                       bool compute_string_sizes,
                       rmm::cuda_stream_view stream);
 
+/**
+ * @brief Compute string page output size information.
+ *
+ * String columns need accurate data size information to preallocate memory in the column buffer to
+ * store the char data. This calls a kernel to calculate information needed by the string decoding
+ * kernel. On exit, the `str_bytes`, `num_nulls`, and `num_valids` fields of the PageInfo struct
+ * are updated. This call ignores non-string columns.
+ *
+ * @param pages All pages to be decoded
+ * @param chunks All chunks to be decoded
+ * @param min_row Crop all rows below min_row
+ * @param num_rows Maximum number of rows to read
+ * @param stream CUDA stream to use, default 0
+ */
 void ComputePageStringSizes(hostdevice_vector& pages,
                             hostdevice_vector const& chunks,
                             size_t min_row,
@@ -505,6 +519,18 @@ void DecodePageData(hostdevice_vector& pages,
                     size_t min_row,
                     rmm::cuda_stream_view stream);
 
+/**
+ * @brief Launches kernel for reading the string column data stored in the pages
+ *
+ * The page data will be written to the output pointed to in the page's
+ * associated column chunk.
+ *
+ * @param[in,out] pages All pages to be decoded
+ * @param[in] chunks All chunks to be decoded
+ * @param[in] num_rows Total number of rows to read
+ * @param[in] min_row Row index to start reading at
+ * @param[in] stream CUDA stream to use, default 0
+ */
 void DecodeStringPageData(hostdevice_vector& pages,
                           hostdevice_vector const& chunks,
                           size_t num_rows,
From fbd9fc66ac10fca2371f11e9d95110c42053244a Mon Sep 17 00:00:00 2001
From: seidl 
Date: Thu, 4 May 2023 14:03:18 -0700
Subject: [PATCH 034/114] more docstrings and clean up

---
 cpp/src/io/parquet/page_data.cu          |  2 +-
 cpp/src/io/parquet/page_string_decode.cu | 98 +++++++++++++++++++++---
 cpp/src/io/parquet/parquet_gpu.hpp       |  6 --
 3 files changed, 90 insertions(+), 16 deletions(-)

diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
index 8c66833b168..e835d2eddee 100644
--- a/cpp/src/io/parquet/page_data.cu
+++ b/cpp/src/io/parquet/page_data.cu
@@ -805,7 +805,7 @@ __global__ void __launch_bounds__(preprocess_block_size)
 }
 
 /**
- * @brief Kernel for co the column data stored in the pages
+ * @brief Kernel for computing the column data stored in the pages
  *
  * This function will write the page data and the page data's validity to the
  * output specified in the page's column chunk. If necessary, additional
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index 50c205d9828..4b645932ce9 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -77,7 +77,9 @@ __device__ void wideStrcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_
   }
 }
 
-// data parallel strcpy
+/**
+ * @brief char-parallel string copy.
+ */
 __device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id)
 {
   using cudf::detail::warp_size;
@@ -90,14 +92,29 @@ __device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t
   }
 }
 
+/**
+ * @brief Compute the start and end page value bounds for this page
+ *
+ * This uses definition and repetition level info to determine the number of valid and null
+ * values for the page, taking into account skip_rows/num_rows (if set).
+ *
+ * @param s The local page info
+ * @param min_row Row index to start reading at
+ * @param num_rows Maximum number of rows to read
+ * @param is_bounds_pg True if this page is clipped
+ * @param has_repetition True if the schema is nested
+ * @param decoders Definition and repetition level decoders
+ * @param t Thread index
+ * @return pair containing start and end value indexes
+ */
 template
-__device__ std::pair page_bounds(page_state_s* const s,
-                                 size_t min_row,
-                                 size_t num_rows,
-                                 bool is_bounds_pg,
-                                 bool has_repetition,
-                                 rle_stream* decoders,
-                                 int t)
+__device__ thrust::pair page_bounds(page_state_s* const s,
+                                    size_t min_row,
+                                    size_t num_rows,
+                                    bool is_bounds_pg,
+                                    bool has_repetition,
+                                    rle_stream* decoders,
+                                    int t)
 {
   using block_reduce = cub::BlockReduce;
   using block_scan   = cub::BlockScan;
@@ -304,6 +321,18 @@ __device__ std::pair page_bounds(page_state_s* const s,
   return {start_value, end_value};
 }
 
+/**
+ * @brief Compute string size information for dictionary encoded strings.
+ *
+ * @param data Pointer to the start of the page data stream
+ * @param dict_base Pointer to the start of the dictionary
+ * @param dict_bits The number of bits used in the dictionary bit packing
+ * @param dict_size Size of the dictionary in bytes
+ * @param data_size Size of the page data in bytes
+ * @param start_value Do not count values that occur before this index
+ * @param end_value Do not count values that occur after this index
+ * @param t Thread index
+ */
 __device__ size_t countDictEntries(uint8_t const* data,
                                    uint8_t const* dict_base,
                                    int dict_bits,
@@ -421,6 +450,15 @@ __device__ size_t countDictEntries(uint8_t const* data,
   return sum_l;
 }
 
+/**
+ * @brief Compute string size information for plain encoded strings.
+ *
+ * @param data Pointer to the start of the page data stream
+ * @param data_size Length of data
+ * @param start_value Do not count values that occur before this index
+ * @param end_value Do not count values that occur after this index
+ * @param t Thread index
+ */
 __device__ size_t
 countPlainEntries(uint8_t const* data, int data_size, int start_value, int end_value, int t)
 {
@@ -451,6 +489,19 @@ countPlainEntries(uint8_t const* data, int data_size, int start_value, int end_v
   return total_len;
 }
 
+/**
+ * @brief Kernel for computing string page output size information.
+ *
+ * String columns need accurate data size information to preallocate memory in the column buffer to
+ * store the char data. This kernel calculates information needed by the string decoding
+ * kernel. On exit, the `str_bytes`, `num_nulls`, and `num_valids` fields of the PageInfo struct
+ * are updated. Non-string columns are ignored.
+ *
+ * @param pages All pages to be decoded
+ * @param chunks All chunks to be decoded
+ * @param min_row Crop all rows below min_row
+ * @param num_rows Maximum number of rows to read
+ */
 template
 __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes(
   PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows)
@@ -543,6 +594,19 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz
 }
 
+/**
+ * @brief Kernel for computing the string column data stored in the pages
+ *
+ * This function will write the page data and the page data's validity to the
+ * output specified in the page's column chunk.
+ *
+ * This version uses a single warp to do the string copies.
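+ * A block-wide variant of this kernel, gpuDecodeStringPageDataV2, appears below.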
+ * + * @param pages List of pages + * @param chunks List of column chunks + * @param min_row Row index to start reading at + * @param num_rows Maximum number of rows to read + */ template __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) @@ -745,6 +809,19 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( } } +/** + * @brief Kernel for computing the string column data stored in the pages + * + * This function will write the page data and the page data's validity to the + * output specified in the page's column chunk. + * + * This version uses all threads in the block to do the string copies. + * + * @param pages List of pages + * @param chunks List of column chunks + * @param min_row Row index to start reading at + * @param num_rows Maximum number of rows to read + */ template __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) @@ -936,6 +1013,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( } // anonymous namespace +/** + * @copydoc cudf::io::parquet::gpu::ComputePageStringSizes + */ void ComputePageStringSizes(hostdevice_vector& pages, hostdevice_vector const& chunks, size_t min_row, @@ -949,7 +1029,7 @@ void ComputePageStringSizes(hostdevice_vector& pages, } /** - * @copydoc cudf::io::parquet::gpu::DecodePageData + * @copydoc cudf::io::parquet::gpu::DecodeStringPageData */ void __host__ DecodeStringPageData(hostdevice_vector& pages, hostdevice_vector const& chunks, diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 3bcc19623c9..4c8979faf6a 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -444,12 +444,6 @@ void BuildStringDictionaryIndex(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream); -void ComputePageStringSizes(hostdevice_vector& pages, - hostdevice_vector const& chunks, - size_t min_row, - size_t num_rows, - rmm::cuda_stream_view stream); - /** * @brief Compute page output size information. * From f4cf521c205eb43eab47b510d42dac9c159b6562 Mon Sep 17 00:00:00 2001 From: db Date: Thu, 4 May 2023 16:08:08 -0500 Subject: [PATCH 035/114] PR review fixes. Removed unused shuffle_ptr() function. Corrected an incorrectly sized (benign) run_buffer_size constexpr. --- cpp/src/io/parquet/reader_impl.hpp | 2 +- cpp/src/io/parquet/rle_stream.cuh | 2 +- cpp/src/io/utilities/block_utils.cuh | 7 ------- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 7e22179a421..4d627c41433 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -182,7 +182,7 @@ class reader::impl { void allocate_nesting_info(); /** - * @brief Allocate space for use when decoding definition/repetition levels/ + * @brief Allocate space for use when decoding definition/repetition levels. * * One large contiguous buffer of data allocated and * distributed among the PageInfo structs. 
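(Aside, not part of the patch: a minimal sketch of the ring-buffer indexing the
corrected constant feeds, using only the constants visible in the rle_stream.cuh
diff below. The "+2" being removed merely over-allocated two slots, which is why
the commit message calls the old value benign.)

    constexpr int num_rle_stream_decode_threads = 512;
    constexpr int num_rle_stream_decode_warps   = (num_rle_stream_decode_threads / 32) - 1;  // 15
    constexpr int run_buffer_size               = num_rle_stream_decode_warps * 2;           // 30
    constexpr int rolling_run_index(int index) { return index % run_buffer_size; }
    // run i occupies slot i % 30, so at most run_buffer_size runs are in flight at once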
diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index ecf2cc33e20..707792ce636 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -27,7 +27,7 @@ namespace gpu { // TODO: consider if these should be template parameters to rle_stream constexpr int num_rle_stream_decode_threads = 512; constexpr int num_rle_stream_decode_warps = (num_rle_stream_decode_threads / 32) - 1; -constexpr int run_buffer_size = (num_rle_stream_decode_warps * 2) + 2; +constexpr int run_buffer_size = (num_rle_stream_decode_warps * 2); constexpr int rolling_run_index(int index) { return index % run_buffer_size; } /** diff --git a/cpp/src/io/utilities/block_utils.cuh b/cpp/src/io/utilities/block_utils.cuh index 830523a288e..7c923503528 100644 --- a/cpp/src/io/utilities/block_utils.cuh +++ b/cpp/src/io/utilities/block_utils.cuh @@ -26,13 +26,6 @@ inline __device__ T shuffle(T var, int lane = 0) return __shfl_sync(~0, var, lane); } -template -inline __device__ T shuffle_ptr(T var, int lane = 0) -{ - uintptr_t ptr_val = reinterpret_cast(var); - return reinterpret_cast(shuffle(ptr_val)); -} - template inline __device__ T shuffle_xor(T var, uint32_t delta) { From 305bf0912eae247bfbaad783cfc82dc8c12c7707 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 4 May 2023 15:23:35 -0700 Subject: [PATCH 036/114] test for string col earlier --- cpp/src/io/parquet/page_data.cu | 6 +++--- cpp/src/io/parquet/page_decode.cuh | 10 ++++++++++ cpp/src/io/parquet/page_string_decode.cu | 14 +++++++------- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index e835d2eddee..10bc6e19a25 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -831,13 +831,13 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( int out_thread0; [[maybe_unused]] null_count_back_copier _{s, t}; + // string cols handled elsewhere + if (is_string_col(pages[blockIdx.x], chunks)) { return; } + if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; - // string cols handled elsewhere - if ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4) { return; } - // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. 
// // corner case: in the case of lists, we can have pages that contain "0" rows if the current row diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index c49d70ea72c..b445903979b 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -109,6 +109,16 @@ struct null_count_back_copier { } }; +/** + * @brief Test if the given page is in a string column + */ +constexpr bool is_string_col(PageInfo const& page, device_span chunks) +{ + if (page.flags & PAGEINFO_FLAGS_DICTIONARY != 0) { return false; } + auto const& col = chunks[page.chunk_idx]; + return (col.data_type & 7) == BYTE_ARRAY and (col.data_type >> 3) != 4; +} + /** * @brief Returns whether or not a page spans either the beginning or the end of the * specified row bounds diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 4b645932ce9..8658816d9c3 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -615,6 +615,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( __shared__ __align__(16) page_state_buffers_s state_buffers; __shared__ __align__(4) size_type last_offset; + // return if not a string column + if (not is_string_col(pages[blockIdx.x], chunks)) { return; } + page_state_s* const s = &state_g; page_state_buffers_s* const sb = &state_buffers; int page_idx = blockIdx.x; @@ -626,9 +629,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; - // return if not a string column - if ((s->col.data_type & 7) != BYTE_ARRAY || s->dtype_len == 4) { return; } - // offsets is global...but the output is local, so account for that below if (t == 0) { last_offset = s->page.str_offset; } __syncthreads(); @@ -830,6 +830,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( __shared__ __align__(16) page_state_buffers_s state_buffers; __shared__ __align__(4) size_type last_offset; + // return if not a string column + if (not is_string_col(pages[blockIdx.x], chunks)) { return; } + page_state_s* const s = &state_g; page_state_buffers_s* const sb = &state_buffers; int page_idx = blockIdx.x; @@ -841,9 +844,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; - // return if not a string column - if ((s->col.data_type & 7) != BYTE_ARRAY || s->dtype_len == 4) { return; } - // offsets is global...but the output is local, so account for that below if (t == 0) { last_offset = s->page.str_offset; } __syncthreads(); @@ -1042,7 +1042,7 @@ void __host__ DecodeStringPageData(hostdevice_vector& pages, dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - if constexpr (false) { + if constexpr (true) { gpuDecodeStringPageData <<>>(pages.device_ptr(), chunks, min_row, num_rows); } else { From ad231f8ab92faf4a0be86eba8e4a4387cef6f354 Mon Sep 17 00:00:00 2001 From: db Date: Thu, 4 May 2023 17:53:20 -0500 Subject: [PATCH 037/114] Change the level_decode_buf (temp space) to use rmm::mr::get_current_device_resource. 
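
The level decode buffer is internal scratch that is never handed back to the
caller, so (following the usual RMM convention for temporaries) it should come
from the default resource rather than the user-supplied output `_mr`. The
change below amounts to:

    _file_itm_data.level_decode_data =
      rmm::device_buffer(decode_buf_size, _stream, rmm::mr::get_current_device_resource());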
--- cpp/src/io/parquet/reader_impl_preprocess.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 8cfbfdc01b5..3d6a213a6eb 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -675,7 +675,8 @@ void reader::impl::allocate_level_decode_space() // repetition data. size_t const per_page_decode_buf_size = LEVEL_DECODE_BUF_SIZE * 2 * sizeof(level_t); auto const decode_buf_size = per_page_decode_buf_size * pages.size(); - _file_itm_data.level_decode_data = rmm::device_buffer(decode_buf_size, _stream, _mr); + _file_itm_data.level_decode_data = + rmm::device_buffer(decode_buf_size, _stream, rmm::mr::get_current_device_resource()); // distribute the buffers level_t* buf = static_cast(_file_itm_data.level_decode_data.data()); From 301cce8f4bb7f94cb0e586b1b87e5b4bb9445c8f Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 4 May 2023 16:03:57 -0700 Subject: [PATCH 038/114] need to call setupLocalPageInfo or bad things happen --- cpp/src/io/parquet/page_data.cu | 6 +++--- cpp/src/io/parquet/page_decode.cuh | 4 +--- cpp/src/io/parquet/page_string_decode.cu | 12 ++++++------ 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 10bc6e19a25..0e49e0cc3d5 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -831,11 +831,11 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( int out_thread0; [[maybe_unused]] null_count_back_copier _{s, t}; - // string cols handled elsewhere - if (is_string_col(pages[blockIdx.x], chunks)) { return; } - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } + // string cols handled elsewhere + if (is_string_col(s->col)) { return; } + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. 
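(Aside: the early-out has to move after setup because the simplified check reads
s->col, which only setupLocalPageInfo() populates. The resulting ordering in
gpuDecodePageData, sketched from the hunk above:

    if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; }
    // s->col is valid from here on
    if (is_string_col(s->col)) { return; }  // string columns are decoded elsewhere
)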
diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index b445903979b..58408b53475 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -112,10 +112,8 @@ struct null_count_back_copier { /** * @brief Test if the given page is in a string column */ -constexpr bool is_string_col(PageInfo const& page, device_span chunks) +constexpr bool is_string_col(ColumnChunkDesc const& col) { - if (page.flags & PAGEINFO_FLAGS_DICTIONARY != 0) { return false; } - auto const& col = chunks[page.chunk_idx]; return (col.data_type & 7) == BYTE_ARRAY and (col.data_type >> 3) != 4; } diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 8658816d9c3..c026b97825b 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -615,9 +615,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( __shared__ __align__(16) page_state_buffers_s state_buffers; __shared__ __align__(4) size_type last_offset; - // return if not a string column - if (not is_string_col(pages[blockIdx.x], chunks)) { return; } - page_state_s* const s = &state_g; page_state_buffers_s* const sb = &state_buffers; int page_idx = blockIdx.x; @@ -627,6 +624,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } + // return if not a string column + if (not is_string_col(s->col)) { return; } + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; // offsets is global...but the output is local, so account for that below @@ -830,9 +830,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( __shared__ __align__(16) page_state_buffers_s state_buffers; __shared__ __align__(4) size_type last_offset; - // return if not a string column - if (not is_string_col(pages[blockIdx.x], chunks)) { return; } - page_state_s* const s = &state_g; page_state_buffers_s* const sb = &state_buffers; int page_idx = blockIdx.x; @@ -842,6 +839,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } + // return if not a string column + if (not is_string_col(s->col)) { return; } + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; // offsets is global...but the output is local, so account for that below From 6db20c10206aab39a9c59886065a27f8053c7c8b Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 4 May 2023 16:46:53 -0700 Subject: [PATCH 039/114] add todo --- cpp/src/io/parquet/reader_impl.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 6bf9cc708b4..2117006a2c8 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -164,6 +164,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunk_nested_data.host_to_device(_stream); chunk_nested_str_data.host_to_device(_stream); + // TODO: explore launching these concurrently with a stream pool gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _stream); gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, _stream); From 07b0d73ae54c5db7de277265ea1456106319af56 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 4 May 2023 17:38:27 -0700 Subject: [PATCH 040/114] final fix for restoring decode cache. 
add some consts. --- cpp/src/io/parquet/page_data.cu | 9 ++-- cpp/src/io/parquet/page_decode.cuh | 33 ++++++------ cpp/src/io/parquet/page_string_decode.cu | 68 +++++++++++++----------- 3 files changed, 57 insertions(+), 53 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 0e49e0cc3d5..e128ec61aff 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -824,18 +824,17 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( __shared__ __align__(16) page_state_s state_g; __shared__ __align__(16) page_state_buffers_s state_buffers; + // string cols handled elsewhere + if (is_string_col(pages[blockIdx.x], chunks)) { return; } + page_state_s* const s = &state_g; page_state_buffers_s* const sb = &state_buffers; int page_idx = blockIdx.x; int t = threadIdx.x; int out_thread0; - [[maybe_unused]] null_count_back_copier _{s, t}; if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } - // string cols handled elsewhere - if (is_string_col(s->col)) { return; } - bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. @@ -850,6 +849,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( // if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) || is_page_contained(s, min_row, num_rows)))) { + restore_decode_cache(s); return; } @@ -988,6 +988,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( } __syncthreads(); } + restore_decode_cache(s); } } // anonymous namespace diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 58408b53475..dbeb0a4ff34 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -89,31 +89,30 @@ struct page_state_buffers_s { uint32_t str_len[non_zero_buffer_size]; // String length for plain encoding of strings }; -// Copies null counts back to `nesting_decode` at the end of scope -struct null_count_back_copier { - page_state_s* s; - int t; - __device__ ~null_count_back_copier() - { - if (s->nesting_info != nullptr and s->nesting_info == s->nesting_decode_cache) { - int depth = 0; - while (depth < s->page.num_output_nesting_levels) { - int const thread_depth = depth + t; - if (thread_depth < s->page.num_output_nesting_levels) { - s->page.nesting_decode[thread_depth].null_count = - s->nesting_decode_cache[thread_depth].null_count; - } - depth += blockDim.x; +// if we are using the nesting decode cache, copy null count back. call this if +// setupLocalPageInfo returns true. 
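+// (note) with the RAII null_count_back_copier removed above, every early return
+// taken after a successful setupLocalPageInfo() must call restore_decode_cache(s)
+// explicitly, as the kernel changes in this commit now do.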
+__device__ void restore_decode_cache(page_state_s* s) +{ + if (s->nesting_info == s->nesting_decode_cache) { + int depth = 0; + while (depth < s->page.num_output_nesting_levels) { + int const thread_depth = depth + threadIdx.x; + if (thread_depth < s->page.num_output_nesting_levels) { + s->page.nesting_decode[thread_depth].null_count = + s->nesting_decode_cache[thread_depth].null_count; } + depth += blockDim.x; } } -}; +} /** * @brief Test if the given page is in a string column */ -constexpr bool is_string_col(ColumnChunkDesc const& col) +constexpr bool is_string_col(PageInfo const& page, device_span chunks) { + if (page.flags & PAGEINFO_FLAGS_DICTIONARY != 0) { return false; } + auto const& col = chunks[page.chunk_idx]; return (col.data_type & 7) == BYTE_ARRAY and (col.data_type >> 3) != 4; } diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index c026b97825b..1f96e18a567 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -142,8 +142,8 @@ __device__ thrust::pair page_bounds(page_state_s* const s, // initialize the stream decoders (requires values computed in setupLocalPageInfo) int const max_batch_size = lvl_buf_size; - level_t* def_decode = pp->lvl_decode_buf[level_type::DEFINITION]; - level_t* rep_decode = pp->lvl_decode_buf[level_type::REPETITION]; + auto const def_decode = pp->lvl_decode_buf[level_type::DEFINITION]; + auto const rep_decode = pp->lvl_decode_buf[level_type::REPETITION]; decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], s->abs_lvl_start[level_type::DEFINITION], s->abs_lvl_end[level_type::DEFINITION], @@ -164,7 +164,6 @@ __device__ thrust::pair page_bounds(page_state_s* const s, int processed = 0; // if this is a bounds page, we need to do extra work to find the start and/or end value index - // TODO calculate num_nulls if (is_bounds_pg) { __shared__ int skipped_values; __shared__ int skipped_leaf_values; @@ -224,7 +223,7 @@ __device__ thrust::pair page_bounds(page_state_s* const s, if (!skipped_values_set && row_count + block_row_count > begin_row) { // if this thread is in row bounds int const row_index = (thread_row_count + row_count) - 1; - int in_row_bounds = + int const in_row_bounds = idx_t < processed && (row_index >= begin_row) && (row_index < end_row); int local_count, global_count; @@ -248,8 +247,8 @@ __device__ thrust::pair page_bounds(page_state_s* const s, // test if row_count will exceed end_row in this batch if (!end_value_set && row_count + block_row_count >= end_row) { // if this thread exceeds row bounds - int const row_index = (thread_row_count + row_count) - 1; - int exceeds_row_bounds = row_index >= end_row; + int const row_index = (thread_row_count + row_count) - 1; + int const exceeds_row_bounds = row_index >= end_row; int local_count, global_count; block_scan(temp_storage.scan_storage) @@ -385,7 +384,7 @@ __device__ size_t countDictEntries(uint8_t const* data, dict_run = 0; } - int is_literal = dict_run & 1; + int const is_literal = dict_run & 1; // calculate my thread id for this batch. way to round-robin the work. int mytid = t - t0; @@ -425,8 +424,9 @@ __device__ size_t countDictEntries(uint8_t const* data, t0 += batch_len; } else { - int start_off = (pos < start_value && pos + batch_len > start_value) ? start_value - pos : 0; - batch_len = min(batch_len, end_value - pos); + int const start_off = + (pos < start_value && pos + batch_len > start_value) ? 
start_value - pos : 0; + batch_len = min(batch_len, end_value - pos); if (mytid == 0) { uint32_t const dict_pos = (dict_bits > 0) ? dict_val * sizeof(string_index_pair) : 0; if (pos + batch_len > start_value && dict_pos < (uint32_t)dict_size) { @@ -508,20 +508,17 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz { __shared__ __align__(16) page_state_s state_g; + // only count if it's a string column + if (not is_string_col(pages[blockIdx.x], chunks)) { return; } + page_state_s* const s = &state_g; - int page_idx = blockIdx.x; - int t = threadIdx.x; - PageInfo* pp = &pages[page_idx]; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + PageInfo* const pp = &pages[page_idx]; // reset str_bytes to 0 in case it's already been calculated if (t == 0) { pp->str_bytes = 0; } - // only count if it's a string column - auto const col = &chunks[pp->chunk_idx]; - uint32_t dtype = col->data_type & 7; - uint32_t dtype_len_out = col->data_type >> 3; - if (dtype != BYTE_ARRAY || dtype_len_out == 4) { return; } - // whether or not we have repetition levels (lists) bool has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; @@ -542,7 +539,10 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz bool is_bounds_pg = is_bounds_page(s, min_row, num_rows); // if we're skipping this page anyway, no need to count it - if (!is_bounds_pg && !is_page_contained(s, min_row, num_rows)) { return; } + if (!is_bounds_pg && !is_page_contained(s, min_row, num_rows)) { + restore_decode_cache(s); + return; + } // find start/end value indices auto const [start_value, end_value] = @@ -556,6 +556,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // now process string info in the range [start_value, end_value) // set up for decoding strings...can be either plain or dictionary + auto const& col = s->col; uint8_t const* data = s->data_start; uint8_t const* const end = s->data_end; uint8_t const* dict_base = nullptr; @@ -566,13 +567,13 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz case Encoding::PLAIN_DICTIONARY: case Encoding::RLE_DICTIONARY: // RLE-packed dictionary indices, first byte indicates index length in bits - if (col->str_dict_index) { + if (col.str_dict_index) { // String dictionary: use index - dict_base = reinterpret_cast(col->str_dict_index); - dict_size = col->page_info[0].num_input_values * sizeof(string_index_pair); + dict_base = reinterpret_cast(col.str_dict_index); + dict_size = col.page_info[0].num_input_values * sizeof(string_index_pair); } else { - dict_base = col->page_info[0].page_data; // dictionary is always stored in the first page - dict_size = col->page_info[0].uncompressed_page_size; + dict_base = col.page_info[0].page_data; // dictionary is always stored in the first page + dict_size = col.page_info[0].uncompressed_page_size; } // FIXME: need to return an error condition...this won't actually do anything @@ -592,6 +593,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // TODO check for overflow pp->str_bytes = str_bytes; } + restore_decode_cache(s); } /** @@ -615,18 +617,17 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( __shared__ __align__(16) page_state_buffers_s state_buffers; __shared__ __align__(4) size_type last_offset; + // return if not a string column + if (not is_string_col(pages[blockIdx.x], chunks)) { return; } + page_state_s* const s = &state_g; 
page_state_buffers_s* const sb = &state_buffers; int page_idx = blockIdx.x; int t = threadIdx.x; int out_thread0; - [[maybe_unused]] null_count_back_copier _{s, t}; if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } - // return if not a string column - if (not is_string_col(s->col)) { return; } - bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; // offsets is global...but the output is local, so account for that below @@ -645,6 +646,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( // if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) || is_page_contained(s, min_row, num_rows)))) { + restore_decode_cache(s); return; } @@ -807,6 +809,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( } __syncthreads(); } + restore_decode_cache(s); } /** @@ -830,18 +833,17 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( __shared__ __align__(16) page_state_buffers_s state_buffers; __shared__ __align__(4) size_type last_offset; + // return if not a string column + if (not is_string_col(pages[blockIdx.x], chunks)) { return; } + page_state_s* const s = &state_g; page_state_buffers_s* const sb = &state_buffers; int page_idx = blockIdx.x; int t = threadIdx.x; int out_thread0; - [[maybe_unused]] null_count_back_copier _{s, t}; if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } - // return if not a string column - if (not is_string_col(s->col)) { return; } - bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; // offsets is global...but the output is local, so account for that below @@ -860,6 +862,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( // if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) || is_page_contained(s, min_row, num_rows)))) { + restore_decode_cache(s); return; } @@ -1009,6 +1012,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( } __syncthreads(); } + restore_decode_cache(s); } } // anonymous namespace From 2c8dbb46805f6bc7ce348ed4368d383bdb231d52 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 4 May 2023 17:42:52 -0700 Subject: [PATCH 041/114] more consts --- cpp/src/io/parquet/page_string_decode.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 1f96e18a567..3e8c39716c5 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -520,7 +520,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz if (t == 0) { pp->str_bytes = 0; } // whether or not we have repetition levels (lists) - bool has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; // the level stream decoders __shared__ rle_run def_runs[run_buffer_size]; @@ -536,7 +536,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz } __syncthreads(); - bool is_bounds_pg = is_bounds_page(s, min_row, num_rows); + bool const is_bounds_pg = is_bounds_page(s, min_row, num_rows); // if we're skipping this page anyway, no need to count it if (!is_bounds_pg && !is_page_contained(s, min_row, num_rows)) { @@ -622,8 +622,8 @@ __global__ void 
__launch_bounds__(decode_block_size) gpuDecodeStringPageData( page_state_s* const s = &state_g; page_state_buffers_s* const sb = &state_buffers; - int page_idx = blockIdx.x; - int t = threadIdx.x; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; int out_thread0; if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } @@ -657,7 +657,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( ((s->col.data_type & 7) == BOOLEAN || (s->col.data_type & 7) == BYTE_ARRAY) ? 64 : 32; } - PageNestingDecodeInfo* nesting_info_base = s->nesting_info; + PageNestingDecodeInfo* const nesting_info_base = s->nesting_info; __shared__ level_t rep[non_zero_buffer_size]; // circular buffer of repetition level values __shared__ level_t def[non_zero_buffer_size]; // circular buffer of definition level values @@ -838,8 +838,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( page_state_s* const s = &state_g; page_state_buffers_s* const sb = &state_buffers; - int page_idx = blockIdx.x; - int t = threadIdx.x; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; int out_thread0; if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } @@ -868,7 +868,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( out_thread0 = s->dict_base && s->dict_bits == 0 ? 32 : 64; - PageNestingDecodeInfo* nesting_info_base = s->nesting_info; + PageNestingDecodeInfo* const nesting_info_base = s->nesting_info; __shared__ level_t rep[non_zero_buffer_size]; // circular buffer of repetition level values __shared__ level_t def[non_zero_buffer_size]; // circular buffer of definition level values From 7b392f60c346e2bd1e9079a98d17c66454a74395 Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 5 May 2023 08:45:53 -0700 Subject: [PATCH 042/114] simplify string col detection --- cpp/src/io/parquet/reader_impl.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 2117006a2c8..ccd55381193 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -44,10 +44,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // TODO: The current implementation does a round trip for the page info. Need to explore doing // this step on device. This call is also somewhat redundant if size info has already been // calculated (nested schema, chunked reader). 
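A standalone sketch (hypothetical name, host-side) of the predicate the hunk below factors out. Per the code being removed, the reader packs the physical Parquet type into the low 3 bits of data_type and what it treats as the output length into the remaining bits; byte arrays decoded to fixed 4-byte values are not treated as strings:

  #include <cstdint>

  constexpr uint32_t BYTE_ARRAY = 6;  // Parquet physical type enum value

  constexpr bool chunk_is_string(uint32_t data_type)
  {
    return (data_type & 7) == BYTE_ARRAY && (data_type >> 3) != 4;
  }

  static_assert(chunk_is_string(BYTE_ARRAY));              // variable-length string data
  static_assert(!chunk_is_string((4 << 3) | BYTE_ARRAY));  // 4-byte output: not a string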
- auto const has_strings = std::any_of(pages.begin(), pages.end(), [&chunks](auto const& page) {
- auto const& chunk = chunks[page.chunk_idx];
+ auto is_string_col = [](gpu::ColumnChunkDesc const& chunk) {
 return (chunk.data_type & 7) == BYTE_ARRAY && (chunk.data_type >> 3) != 4;
- });
+ };
+ auto const has_strings = std::any_of(chunks.begin(), chunks.end(), is_string_col);

 std::vector<size_t> col_sizes(_input_columns.size(), 0L);
 if (has_strings) {
@@ -57,10 +57,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
 pages.device_to_host(_stream, true);
 for (auto& page : pages) {
 if ((page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) == 0) {
- auto const& col = chunks[page.chunk_idx];
- uint32_t dtype = col.data_type & 7;
- uint32_t dtype_len_out = col.data_type >> 3;
- if (dtype == BYTE_ARRAY && dtype_len_out != 4) {
+ auto const& col = chunks[page.chunk_idx];
+ if (is_string_col(col)) {
 size_type const offset = col_sizes[col.src_col_index];
 page.str_offset = offset;
 col_sizes[col.src_col_index] = offset + page.str_bytes;

From 6156e66b3c9dc180649b65769e6ac7742c91e8c4 Mon Sep 17 00:00:00 2001
From: seidl
Date: Fri, 5 May 2023 08:55:15 -0700
Subject: [PATCH 043/114] more consts

---
 cpp/src/io/parquet/reader_impl.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index ccd55381193..ec2af67de73 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -189,10 +189,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
 if (out_buf.type.id() == type_id::LIST &&
 (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED) == 0) {
 CUDF_EXPECTS(l_idx < input_col.nesting_depth() - 1, "Encountered a leaf list column");
- auto& child = (*cols)[input_col.nesting[l_idx + 1]];
+ auto const& child = (*cols)[input_col.nesting[l_idx + 1]];

 // the final offset for a list at level N is the size of its child
- int offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size;
+ int const offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size;
 CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1),
 &offset,
 sizeof(offset),

From fa0cdfc3edba323691d173e69a5a5e29bcfc2bca Mon Sep 17 00:00:00 2001
From: seidl
Date: Fri, 5 May 2023 09:16:51 -0700
Subject: [PATCH 044/114] cleanup

---
 cpp/src/io/utilities/column_buffer.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index f0b18d3fc54..2f6ae37a674 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -60,11 +60,11 @@ void column_buffer::create(size_type _size,
 switch (type.id()) {
 case type_id::STRING:
- if constexpr (!contains_strings) { this->create_strings(size, stream); }
-
 if constexpr (contains_strings) {
 // size + 1 for final offset. _string_data will be initialized later.
_data = create_data(data_type{type_id::INT32}, size + 1, stream, mr); + } else { + this->create_strings(size, stream); } break; @@ -321,8 +321,8 @@ std::unique_ptr make_column(utilities::column_buffer& buffer if (buffer.type.id() == type_id::STRING) { auto make_string_col = [stream](auto& buffer) { // no need for copies, just transfer ownership of the data_buffers to the columns - auto mr = buffer._string_data.memory_resource(); - auto state = mask_state::UNALLOCATED; + auto const& mr = buffer._string_data.memory_resource(); + auto const state = mask_state::UNALLOCATED; auto str_col = buffer._string_data.size() == 0 ? make_empty_column(data_type{type_id::INT8}) From 62b61a20aeff38d6ea9477dcf5ea31b2cd05731a Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 5 May 2023 09:23:52 -0700 Subject: [PATCH 045/114] add some TODOs --- cpp/src/io/parquet/page_string_decode.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 3e8c39716c5..8a8e86b99c2 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -540,7 +540,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // if we're skipping this page anyway, no need to count it if (!is_bounds_pg && !is_page_contained(s, min_row, num_rows)) { - restore_decode_cache(s); + restore_decode_cache(s); // TODO is this necessary? return; } @@ -593,6 +593,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // TODO check for overflow pp->str_bytes = str_bytes; } + // TODO: is this necessary? restore_decode_cache(s); } @@ -1046,6 +1047,8 @@ void __host__ DecodeStringPageData(hostdevice_vector& pages, dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + // TODO figure out when one version is better than the other. waiting on further changes to + // rle_stream to simplify the decode step. if constexpr (true) { gpuDecodeStringPageData <<>>(pages.device_ptr(), chunks, min_row, num_rows); From 3d5c1c80376e856b78f63c5897685af23d6e54c5 Mon Sep 17 00:00:00 2001 From: db Date: Mon, 8 May 2023 10:38:10 -0500 Subject: [PATCH 046/114] Use a dynamically sized type for level/repetition data. In almost all cases, we only need 1 byte to store level information since size of the values is proportional to nesting depth. --- cpp/src/io/parquet/page_data.cu | 69 +++++++++++++------- cpp/src/io/parquet/parquet_gpu.hpp | 9 ++- cpp/src/io/parquet/reader_impl.cpp | 2 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 40 +++++++++--- cpp/src/io/parquet/rle_stream.cuh | 11 ++-- 5 files changed, 91 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index bcdaba5cd9e..089a6d4c925 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -158,15 +158,18 @@ inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row * @param[in] cur The current data position * @param[in] end The end of the data * @param[in] level_bits The bits required + * @param[in] is_decode_step True if we are performing the decode step. 
+ * @param[in,out] decoders The repetition and definition level stream decoders * * @return The length of the section */ +template __device__ uint32_t InitLevelSection(page_state_s* s, const uint8_t* cur, const uint8_t* end, level_type lvl, bool is_decode_step, - rle_stream* decoders) + rle_stream* decoders) { int32_t len; int level_bits = s->col.level_bits[lvl]; @@ -232,6 +235,7 @@ __device__ uint32_t InitLevelSection(page_state_s* s, * @param[in] t Warp0 thread ID (0..31) * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION */ +template __device__ void gpuDecodeStream( level_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl) { @@ -974,13 +978,14 @@ static __device__ void gpuOutputGeneric( * @param[in] decoders rle_stream decoders which will be used for decoding levels. Optional. * Currently only used by gpuComputePageSizes step) */ +template static __device__ bool setupLocalPageInfo(page_state_s* const s, PageInfo const* p, device_span chunks, size_t min_row, size_t num_rows, bool is_decode_step, - rle_stream* decoders = nullptr) + rle_stream* decoders = nullptr) { int t = threadIdx.x; int chunk_idx; @@ -1369,7 +1374,7 @@ static __device__ void store_validity(PageNestingDecodeInfo* nesting_info, * @param[in] target_input_value_count The desired # of input level values we want to process * @param[in] t Thread index */ -template +template inline __device__ void get_nesting_bounds(int& start_depth, int& end_depth, int& d, @@ -1384,8 +1389,8 @@ inline __device__ void get_nesting_bounds(int& start_depth, end_depth = -1; d = -1; if (input_value_count + t < target_input_value_count) { - level_t index = rolling_lvl_index(input_value_count + t); - d = static_cast(def[index]); + int index = rolling_lvl_index(input_value_count + t); + d = static_cast(def[index]); // if we have repetition (there are list columns involved) we have to // bound what nesting levels we apply values to if (s->col.max_level[level_type::REPETITION] > 0) { @@ -1413,7 +1418,7 @@ inline __device__ void get_nesting_bounds(int& start_depth, * @param[in] def Definition level buffer * @param[in] t Thread index */ -template +template static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, page_state_s* s, page_state_buffers_s* sb, @@ -1436,7 +1441,7 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu // determine the nesting bounds for this thread (the range of nesting depths we // will generate new value indices and validity bits for) int start_depth, end_depth, d; - get_nesting_bounds( + get_nesting_bounds( start_depth, end_depth, d, s, rep, def, input_value_count, target_input_value_count, t); // 4 interesting things to track: @@ -1592,7 +1597,7 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu * @param[in] def Definition level buffer * @param[in] t Thread index */ -template +template __device__ void gpuDecodeLevels(page_state_s* s, page_state_buffers_s* sb, int32_t target_leaf_count, @@ -1618,7 +1623,8 @@ __device__ void gpuDecodeLevels(page_state_s* s, : s->lvl_count[level_type::DEFINITION]; // process what we got back - gpuUpdateValidityOffsetsAndRowIndices(actual_leaf_count, s, sb, rep, def, t); + gpuUpdateValidityOffsetsAndRowIndices( + actual_leaf_count, s, sb, rep, def, t); cur_leaf_count = actual_leaf_count + batch_size; __syncwarp(); } @@ -1666,7 +1672,7 @@ __device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) * @param t Thread index * @param bounds_set 
A boolean indicating whether or not min/max row bounds have been set */ -template +template static __device__ void gpuUpdatePageSizes(page_state_s* s, int target_value_count, level_t const* const rep, @@ -1701,7 +1707,7 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, // start/end depth int start_depth, end_depth, d; - get_nesting_bounds( + get_nesting_bounds( start_depth, end_depth, d, s, rep, def, value_count, value_count + batch_size, t); // is this thread within row bounds? in the non skip_rows/num_rows case this will always @@ -1791,7 +1797,7 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, * @param compute_string_sizes Whether or not we should be computing string sizes * (PageInfo::str_bytes) as part of the pass */ -template +template __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageSizes(PageInfo* pages, device_span chunks, @@ -1811,17 +1817,17 @@ __global__ void __launch_bounds__(preprocess_block_size) bool has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; // the level stream decoders - __shared__ rle_run def_runs[run_buffer_size]; - __shared__ rle_run rep_runs[run_buffer_size]; - rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; + __shared__ rle_run def_runs[run_buffer_size]; + __shared__ rle_run rep_runs[run_buffer_size]; + rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; // setup page info if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, false, decoders)) { return; } // initialize the stream decoders (requires values computed in setupLocalPageInfo) int const max_batch_size = lvl_buf_size; - level_t* rep = pp->lvl_decode_buf[level_type::REPETITION]; - level_t* def = pp->lvl_decode_buf[level_type::DEFINITION]; + level_t* rep = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); + level_t* def = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], s->abs_lvl_start[level_type::DEFINITION], s->abs_lvl_end[level_type::DEFINITION], @@ -1993,7 +1999,7 @@ struct null_count_back_copier { * @param min_row Row index to start reading at * @param num_rows Maximum number of rows to read */ -template +template __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { @@ -2007,7 +2013,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( int out_thread0; [[maybe_unused]] null_count_back_copier _{s, t}; - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } + if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { + return; + } bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; @@ -2057,7 +2065,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( // - update validity vectors // - updates offsets (for nested columns) // - produces non-NULL value indices in s->nz_idx for subsequent decoding - gpuDecodeLevels(s, sb, target_pos, rep, def, t); + gpuDecodeLevels(s, sb, target_pos, rep, def, t); } else if (t < out_thread0) { // skipped_leaf_values will always be 0 for flat hierarchies. 
uint32_t src_target_pos = target_pos + skipped_leaf_values; @@ -2174,6 +2182,7 @@ void ComputePageSizes(hostdevice_vector& pages, size_t num_rows, bool compute_num_rows, bool compute_string_sizes, + int level_type_size, rmm::cuda_stream_view stream) { dim3 dim_block(preprocess_block_size, 1); @@ -2184,8 +2193,14 @@ void ComputePageSizes(hostdevice_vector& pages, // This computes the size for the entire page, not taking row bounds into account. // If uses_custom_row_bounds is set to true, we have to do a second pass later that "trims" // the starting and ending read values to account for these bounds. - gpuComputePageSizes<<>>( - pages.device_ptr(), chunks, min_row, num_rows, compute_num_rows, compute_string_sizes); + if (level_type_size == 1) { + gpuComputePageSizes<<>>( + pages.device_ptr(), chunks, min_row, num_rows, compute_num_rows, compute_string_sizes); + } else { + gpuComputePageSizes + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, compute_num_rows, compute_string_sizes); + } } /** @@ -2195,6 +2210,7 @@ void __host__ DecodePageData(hostdevice_vector& pages, hostdevice_vector const& chunks, size_t num_rows, size_t min_row, + int level_type_size, rmm::cuda_stream_view stream) { CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); @@ -2202,8 +2218,13 @@ void __host__ DecodePageData(hostdevice_vector& pages, dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - gpuDecodePageData - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + if (level_type_size == 1) { + gpuDecodePageData + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } else { + gpuDecodePageData + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } } } // namespace gpu diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 8deb7133c65..187e5b47fd7 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -46,7 +46,6 @@ constexpr int MAX_DICT_BITS = 24; constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1; // level decode buffer size. 
-typedef uint16_t level_t; constexpr int LEVEL_DECODE_BUF_SIZE = 2048; /** @@ -199,7 +198,7 @@ struct PageInfo { PageNestingDecodeInfo* nesting_decode; // level decode buffers - level_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; + uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; }; /** @@ -291,7 +290,9 @@ struct file_intermediate_data { hostdevice_vector pages_info{}; hostdevice_vector page_nesting_info{}; hostdevice_vector page_nesting_decode_info{}; + rmm::device_buffer level_decode_data; + int level_type_size; }; /** @@ -459,6 +460,7 @@ void BuildStringDictionaryIndex(ColumnChunkDesc* chunks, * computed * @param compute_string_sizes If set to true, the str_bytes field in PageInfo will * be computed + * @param level_type_size Size in bytes of the type for level decoding * @param stream CUDA stream to use, default 0 */ void ComputePageSizes(hostdevice_vector& pages, @@ -467,6 +469,7 @@ void ComputePageSizes(hostdevice_vector& pages, size_t num_rows, bool compute_num_rows, bool compute_string_sizes, + int level_type_size, rmm::cuda_stream_view stream); /** @@ -479,12 +482,14 @@ void ComputePageSizes(hostdevice_vector& pages, * @param[in] chunks All chunks to be decoded * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read + * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] stream CUDA stream to use, default 0 */ void DecodePageData(hostdevice_vector& pages, hostdevice_vector const& chunks, size_t num_rows, size_t min_row, + int level_type_size, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 9f1644dfd45..a3e07f9f255 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -121,7 +121,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunk_nested_valids.host_to_device(_stream); chunk_nested_data.host_to_device(_stream); - gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _stream); + gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); pages.device_to_host(_stream); page_nesting.device_to_host(_stream); diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 3d6a213a6eb..26073b0ff3d 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -325,10 +325,11 @@ constexpr bool is_supported_encoding(Encoding enc) * @param chunks List of column chunk descriptors * @param pages List of page information * @param stream CUDA stream used for device memory operations and kernel launches + * @returns The size in bytes of level type data required */ -void decode_page_headers(hostdevice_vector& chunks, - hostdevice_vector& pages, - rmm::cuda_stream_view stream) +int decode_page_headers(hostdevice_vector& chunks, + hostdevice_vector& pages, + rmm::cuda_stream_view stream) { // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), // please update preprocess_nested_columns to reflect this. 
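A host-side analogue (sketch) of the device reduction the hunk below adds: the file-wide level width is the largest level_bits over all chunks, rounded up to whole bytes and clamped to at least one byte so that level-free columns still get valid buffers:

  #include <algorithm>
  #include <vector>

  int compute_level_type_size(std::vector<int> const& chunk_level_bits)
  {
    int max_bits = 0;
    for (int bits : chunk_level_bits) { max_bits = std::max(max_bits, bits); }
    return std::max(1, (max_bits + 7) / 8);  // e.g. 0 bits -> 1 byte, 9 bits -> 2 bytes
  }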
@@ -340,6 +341,22 @@ void decode_page_headers(hostdevice_vector& chunks, chunks.host_to_device(stream); gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + + // compute max bytes needed for level data + auto level_bit_size = + cudf::detail::make_counting_transform_iterator(0, [chunks = chunks.begin()] __device__(int i) { + auto c = chunks[i]; + return static_cast(std::max(c.level_bits[gpu::level_type::REPETITION], + c.level_bits[gpu::level_type::DEFINITION])); + }); + // max level data bit size. + int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), + level_bit_size, + level_bit_size + chunks.size(), + 0, + thrust::maximum()); + auto const level_type_size = max(1, cudf::util::round_up_safe(max_level_bits, 8) / 8); + pages.device_to_host(stream, true); // validate page encodings @@ -347,6 +364,8 @@ void decode_page_headers(hostdevice_vector& chunks, pages.end(), [](auto const& page) { return is_supported_encoding(page.encoding); }), "Unsupported page encoding detected"); + + return level_type_size; } /** @@ -673,20 +692,21 @@ void reader::impl::allocate_level_decode_space() // TODO: this could be made smaller if we ignored dictionary pages and pages with no // repetition data. - size_t const per_page_decode_buf_size = LEVEL_DECODE_BUF_SIZE * 2 * sizeof(level_t); - auto const decode_buf_size = per_page_decode_buf_size * pages.size(); + size_t const per_page_decode_buf_size = + LEVEL_DECODE_BUF_SIZE * 2 * _file_itm_data.level_type_size; + auto const decode_buf_size = per_page_decode_buf_size * pages.size(); _file_itm_data.level_decode_data = rmm::device_buffer(decode_buf_size, _stream, rmm::mr::get_current_device_resource()); // distribute the buffers - level_t* buf = static_cast(_file_itm_data.level_decode_data.data()); + uint8_t* buf = static_cast(_file_itm_data.level_decode_data.data()); for (size_t idx = 0; idx < pages.size(); idx++) { auto& p = pages[idx]; p.lvl_decode_buf[gpu::level_type::DEFINITION] = buf; - buf += LEVEL_DECODE_BUF_SIZE; + buf += (LEVEL_DECODE_BUF_SIZE * _file_itm_data.level_type_size); p.lvl_decode_buf[gpu::level_type::REPETITION] = buf; - buf += LEVEL_DECODE_BUF_SIZE; + buf += (LEVEL_DECODE_BUF_SIZE * _file_itm_data.level_type_size); } } @@ -808,7 +828,7 @@ void reader::impl::load_and_decompress_data( if (total_pages > 0) { // decoding of column/page information - decode_page_headers(chunks, pages, _stream); + _file_itm_data.level_type_size = decode_page_headers(chunks, pages, _stream); if (has_compressed_data) { decomp_page_data = decompress_page_data(chunks, pages, _stream); // Free compressed data @@ -1600,6 +1620,7 @@ void reader::impl::preprocess_pages(size_t skip_rows, std::numeric_limits::max(), true, // compute num_rows chunk_read_limit > 0, // compute string sizes + _file_itm_data.level_type_size, _stream); // computes: @@ -1651,6 +1672,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses num_rows, false, // num_rows is already computed false, // no need to compute string sizes + _file_itm_data.level_type_size, _stream); // print_pages(pages, _stream); diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 707792ce636..07de76d0980 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -58,6 +58,7 @@ inline __device__ uint32_t get_vlq32(const uint8_t*& cur, const uint8_t* end) // an individual batch. processed by a warp. // batches should be in shared memory. 
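For reference, a minimal host-side sketch of the ULEB128 decoding that get_vlq32 performs for the run headers here: 7 payload bits per byte, with the high bit set on every byte except the last:

  #include <cstdint>

  uint32_t decode_uleb128(uint8_t const*& cur, uint8_t const* end)
  {
    uint32_t v = 0;
    for (int shift = 0; shift < 32 && cur < end; shift += 7) {
      uint8_t const b = *cur++;
      v |= static_cast<uint32_t>(b & 0x7f) << shift;
      if ((b & 0x80) == 0) { break; }  // high bit clear: last byte of the varint
    }
    return v;
  }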
+template struct rle_batch { uint8_t const* run_start; // start of the run we are part of int run_offset; // value offset of this batch from the start of the run @@ -122,6 +123,7 @@ struct rle_batch { }; // a single rle run. may be broken up into multiple rle_batches +template struct rle_run { int size; // total size of the run int output_pos; @@ -129,16 +131,17 @@ struct rle_run { int level_run; // level_run header value int remaining; - __device__ __inline__ rle_batch next_batch(level_t* const output, int max_size) + __device__ __inline__ rle_batch next_batch(level_t* const output, int max_size) { int batch_len = min(max_size, remaining); int const run_offset = size - remaining; remaining -= batch_len; - return rle_batch{start, run_offset, output, level_run, batch_len}; + return rle_batch{start, run_offset, output, level_run, batch_len}; } }; // a stream of rle_runs +template struct rle_stream { int level_bits; uint8_t const* start; @@ -151,7 +154,7 @@ struct rle_stream { level_t* output; - rle_run* runs; + rle_run* runs; int run_index; int run_count; int output_pos; @@ -160,7 +163,7 @@ struct rle_stream { int next_batch_run_start; int next_batch_run_count; - __device__ rle_stream(rle_run* _runs) : runs(_runs) {} + __device__ rle_stream(rle_run* _runs) : runs(_runs) {} __device__ void init(int _level_bits, uint8_t const* _start, From ecb336e86a19ff8f11a6ac29a420fb22eaf9c559 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 8 May 2023 10:50:43 -0700 Subject: [PATCH 047/114] finish merge --- cpp/src/io/parquet/page_decode.cuh | 24 ++++++---- cpp/src/io/parquet/page_string_decode.cu | 59 ++++++++++++++++-------- cpp/src/io/parquet/parquet_gpu.hpp | 2 + cpp/src/io/parquet/reader_impl.cpp | 6 ++- 4 files changed, 61 insertions(+), 30 deletions(-) diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index dbeb0a4ff34..6642fa72acd 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -371,6 +371,7 @@ __device__ size_type gpuInitStringDescriptors(volatile page_state_s* s, * @param[in] t Warp0 thread ID (0..31) * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION */ +template __device__ void gpuDecodeStream( level_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl) { @@ -518,7 +519,7 @@ __device__ void store_validity(PageNestingDecodeInfo* nesting_info, * @param[in] target_input_value_count The desired # of input level values we want to process * @param[in] t Thread index */ -template +template inline __device__ void get_nesting_bounds(int& start_depth, int& end_depth, int& d, @@ -533,8 +534,8 @@ inline __device__ void get_nesting_bounds(int& start_depth, end_depth = -1; d = -1; if (input_value_count + t < target_input_value_count) { - level_t index = rolling_lvl_index(input_value_count + t); - d = def[index]; + int index = rolling_lvl_index(input_value_count + t); + d = static_cast(def[index]); // if we have repetition (there are list columns involved) we have to // bound what nesting levels we apply values to if (s->col.max_level[level_type::REPETITION] > 0) { @@ -562,7 +563,7 @@ inline __device__ void get_nesting_bounds(int& start_depth, * @param[in] def Definition level buffer * @param[in] t Thread index */ -template +template __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, page_state_s* s, page_state_buffers_s* sb, @@ -585,7 +586,7 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // determine the nesting 
bounds for this thread (the range of nesting depths we // will generate new value indices and validity bits for) int start_depth, end_depth, d; - get_nesting_bounds( + get_nesting_bounds( start_depth, end_depth, d, s, rep, def, input_value_count, target_input_value_count, t); // 4 interesting things to track: @@ -741,7 +742,7 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value * @param[in] def Definition level buffer * @param[in] t Thread index */ -template +template __device__ void gpuDecodeLevels(page_state_s* s, page_state_buffers_s* sb, int32_t target_leaf_count, @@ -767,7 +768,8 @@ __device__ void gpuDecodeLevels(page_state_s* s, : s->lvl_count[level_type::DEFINITION]; // process what we got back - gpuUpdateValidityOffsetsAndRowIndices(actual_leaf_count, s, sb, rep, def, t); + gpuUpdateValidityOffsetsAndRowIndices( + actual_leaf_count, s, sb, rep, def, t); cur_leaf_count = actual_leaf_count + batch_size; __syncwarp(); } @@ -781,15 +783,18 @@ __device__ void gpuDecodeLevels(page_state_s* s, * @param[in] cur The current data position * @param[in] end The end of the data * @param[in] level_bits The bits required + * @param[in] is_decode_step True if we are performing the decode step. + * @param[in,out] decoders The repetition and definition level stream decoders * * @return The length of the section */ +template __device__ uint32_t InitLevelSection(page_state_s* s, const uint8_t* cur, const uint8_t* end, level_type lvl, bool is_decode_step, - rle_stream* decoders) + rle_stream* decoders) { int32_t len; int level_bits = s->col.level_bits[lvl]; @@ -859,13 +864,14 @@ __device__ uint32_t InitLevelSection(page_state_s* s, * @param[in] decoders rle_stream decoders which will be used for decoding levels. Optional. * Currently only used by gpuComputePageSizes step) */ +template __device__ bool setupLocalPageInfo(page_state_s* const s, PageInfo const* p, device_span chunks, size_t min_row, size_t num_rows, bool is_decode_step, - rle_stream* decoders = nullptr) + rle_stream* decoders = nullptr) { int t = threadIdx.x; int chunk_idx; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 8a8e86b99c2..1414328dbcf 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -107,13 +107,13 @@ __device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t * @param t Thread index * @return pair containg start and end value indexes */ -template +template __device__ thrust::pair page_bounds(page_state_s* const s, size_t min_row, size_t num_rows, bool is_bounds_pg, bool has_repetition, - rle_stream* decoders, + rle_stream* decoders, int t) { using block_reduce = cub::BlockReduce; @@ -142,8 +142,8 @@ __device__ thrust::pair page_bounds(page_state_s* const s, // initialize the stream decoders (requires values computed in setupLocalPageInfo) int const max_batch_size = lvl_buf_size; - auto const def_decode = pp->lvl_decode_buf[level_type::DEFINITION]; - auto const rep_decode = pp->lvl_decode_buf[level_type::REPETITION]; + auto const def_decode = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); + auto const rep_decode = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], s->abs_lvl_start[level_type::DEFINITION], s->abs_lvl_end[level_type::DEFINITION], @@ -502,7 +502,7 @@ countPlainEntries(uint8_t const* data, int data_size, int start_value, int end_v * @param min_rows 
crop all rows below min_row * @param num_rows Maximum number of rows to read */ -template +template __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { @@ -523,9 +523,9 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; // the level stream decoders - __shared__ rle_run def_runs[run_buffer_size]; - __shared__ rle_run rep_runs[run_buffer_size]; - rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; + __shared__ rle_run def_runs[run_buffer_size]; + __shared__ rle_run rep_runs[run_buffer_size]; + rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; // setup page info if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, false, decoders)) { return; } @@ -610,7 +610,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz * @param min_row Row index to start reading at * @param num_rows Maximum number of rows to read */ -template +template __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { @@ -627,7 +627,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( int const t = threadIdx.x; int out_thread0; - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } + if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { + return; + } bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; @@ -682,7 +684,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( // - update validity vectors // - updates offsets (for nested columns) // - produces non-NULL value indices in s->nz_idx for subsequent decoding - gpuDecodeLevels(s, sb, target_pos, rep, def, t); + gpuDecodeLevels(s, sb, target_pos, rep, def, t); } else if (t < out_thread0) { // skipped_leaf_values will always be 0 for flat hierarchies. 
uint32_t src_target_pos = target_pos + skipped_leaf_values; @@ -826,7 +828,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( * @param min_row Row index to start reading at * @param num_rows Maximum number of rows to read */ -template +template __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { @@ -843,7 +845,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( int const t = threadIdx.x; int out_thread0; - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } + if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { + return; + } bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; @@ -1025,12 +1029,18 @@ void ComputePageStringSizes(hostdevice_vector& pages, hostdevice_vector const& chunks, size_t min_row, size_t num_rows, + int level_type_size, rmm::cuda_stream_view stream) { dim3 dim_block(preprocess_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - gpuComputePageStringSizes - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + if (level_type_size == 1) { + gpuComputePageStringSizes + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } else { + gpuComputePageStringSizes + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } } /** @@ -1040,6 +1050,7 @@ void __host__ DecodeStringPageData(hostdevice_vector& pages, hostdevice_vector const& chunks, size_t num_rows, size_t min_row, + int level_type_size, rmm::cuda_stream_view stream) { CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); @@ -1050,11 +1061,21 @@ void __host__ DecodeStringPageData(hostdevice_vector& pages, // TODO figure out when one version is better than the other. waiting on further changes to // rle_stream to simplify the decode step. 
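The level_type_size handling in all of these launchers follows a single dispatch idiom; a self-contained sketch with placeholder kernel and argument names:

  #include <cstdint>
  #include <cuda_runtime.h>

  template <typename level_t>
  __global__ void gpuExample(level_t const* levels, int count)
  {
    // decode levels as level_t (uint8_t or uint16_t) ...
  }

  void launch_example(
    void const* levels, int count, int level_type_size, dim3 grid, dim3 block, cudaStream_t stream)
  {
    // the level width is only known at runtime, so both instantiations are
    // compiled and the appropriate one is selected per launch
    if (level_type_size == 1) {
      gpuExample<<<grid, block, 0, stream>>>(static_cast<uint8_t const*>(levels), count);
    } else {
      gpuExample<<<grid, block, 0, stream>>>(static_cast<uint16_t const*>(levels), count);
    }
  }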
if constexpr (true) { - gpuDecodeStringPageData - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + if (level_type_size == 1) { + gpuDecodeStringPageData + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } else { + gpuDecodeStringPageData + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } } else { - gpuDecodeStringPageDataV2 - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + if (level_type_size == 1) { + gpuDecodeStringPageDataV2 + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } else { + gpuDecodeStringPageDataV2 + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } } } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index b6e9129d236..49adfefa63c 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -496,6 +496,7 @@ void ComputePageStringSizes(hostdevice_vector& pages, hostdevice_vector const& chunks, size_t min_row, size_t num_rows, + int level_type_size, rmm::cuda_stream_view stream); /** @@ -534,6 +535,7 @@ void DecodeStringPageData(hostdevice_vector& pages, hostdevice_vector const& chunks, size_t num_rows, size_t min_row, + int level_type_size, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 94786636b99..768616ce68b 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -51,7 +51,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { - gpu::ComputePageStringSizes(pages, chunks, skip_rows, num_rows, _stream); + gpu::ComputePageStringSizes( + pages, chunks, skip_rows, num_rows, _file_itm_data.level_type_size, _stream); // TODO do the following on device with thrust/kernel to avoid the pages round trip pages.device_to_host(_stream, true); @@ -164,7 +165,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // TODO: explore launching these concurrently with a stream pool gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); - gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, _stream); + gpu::DecodeStringPageData( + pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); pages.device_to_host(_stream); page_nesting.device_to_host(_stream); From c1aebf3f05b6be8c375b434cc46ff550ff494ff2 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 8 May 2023 15:39:14 -0700 Subject: [PATCH 048/114] fix string buffer length --- cpp/src/io/utilities/column_buffer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 2f6ae37a674..ed2e15cef0d 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -327,7 +327,7 @@ std::unique_ptr make_column(utilities::column_buffer& buffer buffer._string_data.size() == 0 ? 
make_empty_column(data_type{type_id::INT8}) : std::make_unique(data_type{type_id::INT8}, - buffer.size, + buffer.string_size(), std::move(buffer._string_data), cudf::detail::create_null_mask(buffer.size, state, stream, mr), state_null_count(state, buffer.size), From 217e12f568782332330903005015e0ebecd6691a Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 8 May 2023 16:45:36 -0700 Subject: [PATCH 049/114] fix for columns that start with null values --- cpp/src/io/parquet/page_string_decode.cu | 43 +++++++++++++++++++++--- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 1414328dbcf..f602460cdb1 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -627,6 +627,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( int const t = threadIdx.x; int out_thread0; + // set during string copy by lane 0 + int first_non_null = -1; + if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } @@ -697,6 +700,22 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( } if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } } else { + int const me = t - out_thread0; + + // if this is the first page, then the first non-null entry will have an offset of 0. + // pages that start with a run of nulls will have repeated 0 values, so for the fixing + // of null offsets done at the end, we need to know the last index that should be 0. + if (me == 0 && s->page.str_offset == 0 && first_non_null == -1) { + for (int i = src_pos; i < target_pos; i++) { + int dst_pos = sb->nz_idx[rolling_index(i)]; + if (!has_repetition) { dst_pos -= s->first_row; } + if (dst_pos >= 0) { + first_non_null = dst_pos; + break; + } + } + } + // WARP1..WARP3: Decode values src_pos += t - out_thread0; @@ -722,7 +741,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( auto const use_char_ll = s->page.num_valids > 0 && (s->page.str_bytes / s->page.num_valids) >= warp_size; int const leaf_level_index = s->col.max_nesting_depth - 1; - int const me = t - out_thread0; if (me < warp_size) { for (int i = 0; i < decode_block_size - out_thread0; i += warp_size) { @@ -800,11 +818,11 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( } } // just some nulls, do this serially for now - else if (t == 0) { + else if (t == out_thread0) { if (offptr[value_count - 1] == 0) { offptr[value_count - 1] = s->page.str_offset + s->page.str_bytes; } - for (int i = value_count - 2; i > 0; i--) { + for (int i = value_count - 2; i > first_non_null; i--) { if (offptr[i] == 0) { offptr[i] = offptr[i + 1]; } } offptr[0] = s->page.str_offset; @@ -845,6 +863,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( int const t = threadIdx.x; int out_thread0; + // set during string copy by lane 0 + int first_non_null = -1; + if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } @@ -908,6 +929,20 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( // target_pos for value deconding target_pos = min(s->nz_count, target_pos); + // if this is the first page, then the first non-null entry will have an offset of 0. 
+ // pages that start with a run of nulls will have repeated 0 values, so for the fixing
+ // of null offsets done at the end, we need to know the last index that should be 0.
+ if (t == 0 && s->page.str_offset == 0 && first_non_null == -1) {
+ for (int i = src_pos; i < target_pos; i++) {
+ int dst_pos = sb->nz_idx[rolling_index(i)];
+ if (!has_repetition) { dst_pos -= s->first_row; }
+ if (dst_pos >= 0) {
+ first_non_null = dst_pos;
+ break;
+ }
+ }
+ }
+
 // Decode values
 src_pos += t;

@@ -1009,7 +1044,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2(
 if (offptr[value_count - 1] == 0) {
 offptr[value_count - 1] = s->page.str_offset + s->page.str_bytes;
 }
- for (int i = value_count - 2; i > 0; i--) {
+ for (int i = value_count - 2; i > first_non_null; i--) {
 if (offptr[i] == 0) { offptr[i] = offptr[i + 1]; }
 }
 offptr[0] = s->page.str_offset;

From b49ff95629828c41f84dffe892958b55b825520c Mon Sep 17 00:00:00 2001
From: seidl
Date: Mon, 8 May 2023 16:53:11 -0700
Subject: [PATCH 050/114] fix for decimal columns

---
 cpp/src/io/parquet/page_decode.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index 6642fa72acd..07f8567322f 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -113,7 +113,8 @@ constexpr bool is_string_col(PageInfo const& page, device_span<ColumnChunkDesc const> chunks)
- return (col.data_type & 7) == BYTE_ARRAY and (col.data_type >> 3) != 4;
+ return (col.data_type & 7) == BYTE_ARRAY and (col.data_type >> 3) != 4 and
+ col.converted_type != DECIMAL;
 }

 /**

From 8054f10c6a164120245c7f965ecf7cdf7d4e11af Mon Sep 17 00:00:00 2001
From: seidl
Date: Mon, 8 May 2023 17:09:21 -0700
Subject: [PATCH 051/114] another fix for null handling

---
 cpp/src/io/parquet/page_string_decode.cu | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index f602460cdb1..3be0181fe4d 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -819,7 +819,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData(
 }
 // just some nulls, do this serially for now
 else if (t == out_thread0) {
- if (offptr[value_count - 1] == 0) {
+ if (first_non_null == -1) { first_non_null = 0; }
+
+ if (offptr[value_count - 1] == 0 && value_count - 1 != first_non_null) {
 offptr[value_count - 1] = s->page.str_offset + s->page.str_bytes;
 }
 for (int i = value_count - 2; i > first_non_null; i--) {

From 84762b9be9a84b4f29e5e2ec42dd5dc1b5630781 Mon Sep 17 00:00:00 2001
From: seidl
Date: Tue, 9 May 2023 08:41:55 -0700
Subject: [PATCH 052/114] one more bug cleaning up nulls

---
 cpp/src/io/parquet/page_string_decode.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index 3be0181fe4d..031c41a3bc4 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -805,8 +805,8 @@ __global__ void
__launch_bounds__(decode_block_size) gpuDecodeStringPageData( // if there are nulls clean up the offsets array. if (s->page.num_nulls != 0) { - int const value_count = s->page.num_valids + s->page.num_nulls; int const leaf_level_index = s->col.max_nesting_depth - 1; + int const value_count = nesting_info_base[leaf_level_index].value_count; auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); @@ -1029,8 +1029,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( // if there are nulls clean up the offsets array. if (s->page.num_nulls != 0) { - int const value_count = s->page.num_valids + s->page.num_nulls; int const leaf_level_index = s->col.max_nesting_depth - 1; + int const value_count = nesting_info_base[leaf_level_index].value_count; auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); From 24fb8f2ae3f7ef11e296e28b57694d7eeb2138a9 Mon Sep 17 00:00:00 2001 From: db Date: Tue, 9 May 2023 10:56:28 -0500 Subject: [PATCH 053/114] PR review feedback. --- cpp/src/io/parquet/reader_impl_preprocess.cu | 2 +- cpp/src/io/parquet/rle_stream.cuh | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 26073b0ff3d..28c18a563a8 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -355,7 +355,7 @@ int decode_page_headers(hostdevice_vector& chunks, level_bit_size + chunks.size(), 0, thrust::maximum()); - auto const level_type_size = max(1, cudf::util::round_up_safe(max_level_bits, 8) / 8); + auto const level_type_size = max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); pages.device_to_host(stream, true); diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 07de76d0980..3bcfb0e6b29 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -26,8 +26,12 @@ namespace gpu { // TODO: consider if these should be template parameters to rle_stream constexpr int num_rle_stream_decode_threads = 512; -constexpr int num_rle_stream_decode_warps = (num_rle_stream_decode_threads / 32) - 1; -constexpr int run_buffer_size = (num_rle_stream_decode_warps * 2); +// the -1 here is for the look-ahead warp that fills in the list of runs to be decoded +// in an overlapped manner. so if we had 16 total warps: +// - warp 0 would be filling in batches of runs to be processed +// - warps 1-15 would be decoding the previous batch of runs generated +constexpr int num_rle_stream_decode_warps = (num_rle_stream_decode_threads / 32) - 1; +constexpr int run_buffer_size = (num_rle_stream_decode_warps * 2); constexpr int rolling_run_index(int index) { return index % run_buffer_size; } /** @@ -86,14 +90,14 @@ struct rle_batch { int _level_val; if (!(level_run & 1)) { _level_val = run_start[0]; - if (level_bits > 8) { _level_val |= run_start[0] << 8; } + if (level_bits > 8) { _level_val |= run_start[1] << 8; } } // process while (remain > 0) { int batch_len = min(32, remain); - // if this is a literal run. each thread computes it's own level_val + // if this is a literal run. 
each thread computes its own level_val if (level_run & 1) { int const batch_len8 = (batch_len + 7) >> 3; if (lane < batch_len) { From 32ce89b7c74b48f1d7aa59d2939be924bf3a164a Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 9 May 2023 13:58:47 -0700 Subject: [PATCH 054/114] minor cleanup --- cpp/src/io/parquet/page_string_decode.cu | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 031c41a3bc4..aaff0ae08ec 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -113,8 +113,7 @@ __device__ thrust::pair page_bounds(page_state_s* const s, size_t num_rows, bool is_bounds_pg, bool has_repetition, - rle_stream* decoders, - int t) + rle_stream* decoders) { using block_reduce = cub::BlockReduce; using block_scan = cub::BlockScan; @@ -123,6 +122,8 @@ __device__ thrust::pair page_bounds(page_state_s* const s, typename block_scan::TempStorage scan_storage; } temp_storage; + int const t = threadIdx.x; + // decode batches of level stream data using rle_stream objects and use the results to // calculate start and end value positions in the encoded string data. int const max_depth = s->col.max_nesting_depth; @@ -338,9 +339,9 @@ __device__ size_t countDictEntries(uint8_t const* data, int dict_size, int data_size, int start_value, - int end_value, - int t) + int end_value) { + int const t = threadIdx.x; uint8_t const* ptr = data; uint8_t const* const end = data + data_size; int const bytecnt = (dict_bits + 7) >> 3; @@ -459,9 +460,12 @@ __device__ size_t countDictEntries(uint8_t const* data, * @param end_value Do not count values that occur after this index * @param t Thread index */ -__device__ size_t -countPlainEntries(uint8_t const* data, int data_size, int start_value, int end_value, int t) +__device__ size_t countPlainEntries(uint8_t const* data, + int data_size, + int start_value, + int end_value) { + int const t = threadIdx.x; int pos = 0; size_t total_len = 0; @@ -546,7 +550,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // find start/end value indices auto const [start_value, end_value] = - page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders, t); + page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders); // need to save num_nulls and num_valids calculated in page_bounds in this page if (t == 0) { @@ -580,11 +584,11 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz if (s->dict_bits > 32 || !dict_base) { CUDF_UNREACHABLE("invalid dictionary bit size"); } str_bytes = countDictEntries( - data, dict_base, s->dict_bits, dict_size, (end - data), start_value, end_value, t); + data, dict_base, s->dict_bits, dict_size, (end - data), start_value, end_value); break; case Encoding::PLAIN: dict_size = static_cast(end - data); - str_bytes = is_bounds_pg ? countPlainEntries(data, dict_size, start_value, end_value, t) + str_bytes = is_bounds_pg ? countPlainEntries(data, dict_size, start_value, end_value) : dict_size - sizeof(int) * (pp->num_input_values - pp->num_nulls); break; } From 859eb431b39c96ad937da98bf6675a7b50758fb7 Mon Sep 17 00:00:00 2001 From: db Date: Wed, 10 May 2023 13:04:16 -0500 Subject: [PATCH 055/114] PR review feedback. 
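
Changes applied from review:
- mark locals in get_nesting_bounds() and rle_stream const where possible
- use cudf::detail::warp_size in place of the hard-coded 32s, and std::max
  on the host-side level_type_size computation
- drop the leading underscore from level_val

For reference, level_type_size is the number of bytes needed to hold one
decoded level value. A quick worked example of the arithmetic (bit widths
made up purely for illustration):

    // std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8))
    //   max_level_bits == 3 -> div_rounding_up_safe(3, 8) == 1 byte
    //   max_level_bits == 9 -> div_rounding_up_safe(9, 8) == 2 bytes
    //   max_level_bits == 0 -> std::max(1, 0)             == 1 byte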
--- cpp/src/io/parquet/page_data.cu | 10 ++--- cpp/src/io/parquet/reader_impl_preprocess.cu | 6 +-- cpp/src/io/parquet/rle_stream.cuh | 40 ++++++++++---------- cpp/src/io/utilities/block_utils.cuh | 2 +- 4 files changed, 30 insertions(+), 28 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 089a6d4c925..03b31d77122 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -1389,14 +1389,14 @@ inline __device__ void get_nesting_bounds(int& start_depth, end_depth = -1; d = -1; if (input_value_count + t < target_input_value_count) { - int index = rolling_lvl_index(input_value_count + t); - d = static_cast(def[index]); + int const index = rolling_lvl_index(input_value_count + t); + d = static_cast(def[index]); // if we have repetition (there are list columns involved) we have to // bound what nesting levels we apply values to if (s->col.max_level[level_type::REPETITION] > 0) { - level_t r = rep[index]; - start_depth = s->nesting_info[r].start_depth; - end_depth = s->nesting_info[d].end_depth; + level_t const r = rep[index]; + start_depth = s->nesting_info[r].start_depth; + end_depth = s->nesting_info[d].end_depth; } // for columns without repetition (even ones involving structs) we always // traverse the entire hierarchy. diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 828259e8dcd..4433561ff1b 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -346,8 +346,8 @@ int decode_page_headers(hostdevice_vector& chunks, auto level_bit_size = cudf::detail::make_counting_transform_iterator(0, [chunks = chunks.begin()] __device__(int i) { auto c = chunks[i]; - return static_cast(std::max(c.level_bits[gpu::level_type::REPETITION], - c.level_bits[gpu::level_type::DEFINITION])); + return static_cast( + max(c.level_bits[gpu::level_type::REPETITION], c.level_bits[gpu::level_type::DEFINITION])); }); // max level data bit size. int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), @@ -355,7 +355,7 @@ int decode_page_headers(hostdevice_vector& chunks, level_bit_size + chunks.size(), 0, thrust::maximum()); - auto const level_type_size = max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); + auto const level_type_size = std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); pages.device_to_host(stream, true); diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 3bcfb0e6b29..93191dd07e0 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -17,6 +17,7 @@ #pragma once #include "parquet_gpu.hpp" +#include #include namespace cudf { @@ -30,8 +31,9 @@ constexpr int num_rle_stream_decode_threads = 512; // in an overlapped manner. 
so if we had 16 total warps: // - warp 0 would be filling in batches of runs to be processed // - warps 1-15 would be decoding the previous batch of runs generated -constexpr int num_rle_stream_decode_warps = (num_rle_stream_decode_threads / 32) - 1; -constexpr int run_buffer_size = (num_rle_stream_decode_warps * 2); +constexpr int num_rle_stream_decode_warps = + (num_rle_stream_decode_threads / cudf::detail::warp_size) - 1; +constexpr int run_buffer_size = (num_rle_stream_decode_warps * 2); constexpr int rolling_run_index(int index) { return index % run_buffer_size; } /** @@ -42,7 +44,7 @@ constexpr int rolling_run_index(int index) { return index % run_buffer_size; } * * @return The 32-bit value read */ -inline __device__ uint32_t get_vlq32(const uint8_t*& cur, const uint8_t* end) +inline __device__ uint32_t get_vlq32(uint8_t const*& cur, uint8_t const* end) { uint32_t v = *cur++; if (v >= 0x80 && cur < end) { @@ -87,15 +89,15 @@ struct rle_batch { } // if this is a repeated run, compute the repeated value - int _level_val; + int level_val; if (!(level_run & 1)) { - _level_val = run_start[0]; - if (level_bits > 8) { _level_val |= run_start[1] << 8; } + level_val = run_start[0]; + if (level_bits > 8) { level_val |= run_start[1] << 8; } } // process while (remain > 0) { - int batch_len = min(32, remain); + int const batch_len = min(32, remain); // if this is a literal run. each thread computes its own level_val if (level_run & 1) { @@ -104,22 +106,22 @@ struct rle_batch { int bitpos = lane * level_bits; uint8_t const* cur_thread = cur + (bitpos >> 3); bitpos &= 7; - _level_val = 0; - if (cur_thread < end) { _level_val = cur_thread[0]; } + level_val = 0; + if (cur_thread < end) { level_val = cur_thread[0]; } cur_thread++; if (level_bits > 8 - bitpos && cur_thread < end) { - _level_val |= cur_thread[0] << 8; + level_val |= cur_thread[0] << 8; cur_thread++; - if (level_bits > 16 - bitpos && cur_thread < end) { _level_val |= cur_thread[0] << 16; } + if (level_bits > 16 - bitpos && cur_thread < end) { level_val |= cur_thread[0] << 16; } } - _level_val = (_level_val >> bitpos) & ((1 << level_bits) - 1); + level_val = (level_val >> bitpos) & ((1 << level_bits) - 1); } cur += batch_len8 * level_bits; } // store level_val - if (lane < batch_len && (lane + output_pos) >= 0) { output[lane + output_pos] = _level_val; } + if (lane < batch_len && (lane + output_pos) >= 0) { output[lane + output_pos] = level_val; } remain -= batch_len; output_pos += batch_len; } @@ -137,7 +139,7 @@ struct rle_run { __device__ __inline__ rle_batch next_batch(level_t* const output, int max_size) { - int batch_len = min(max_size, remaining); + int const batch_len = min(max_size, remaining); int const run_offset = size - remaining; remaining -= batch_len; return rle_batch{start, run_offset, output, level_run, batch_len}; @@ -251,8 +253,8 @@ struct rle_stream { // if we've reached the output limit on the last run if (output_pos >= max_count) { // first, see if we've spilled over - auto& src = runs[rolling_run_index(run_index - 1)]; - int spill_count = output_pos - max_count; + auto const& src = runs[rolling_run_index(run_index - 1)]; + int const spill_count = output_pos - max_count; // a spill has occurred in the current run. spill the extra values over into the beginning of // the next run. 
@@ -279,7 +281,7 @@ struct rle_stream { } } - __device__ __inline__ int decode_next(int t) + __device__ inline int decode_next(int t) { int const output_count = min(max_output_values, (total_values - cur_values)); @@ -297,9 +299,9 @@ struct rle_stream { } // otherwise, full decode. - int const warp_id = t / 32; + int const warp_id = t / cudf::detail::warp_size; int const warp_decode_id = warp_id - 1; - int const warp_lane = t % 32; + int const warp_lane = t % cudf::detail::warp_size; __shared__ int run_start; __shared__ int num_runs; diff --git a/cpp/src/io/utilities/block_utils.cuh b/cpp/src/io/utilities/block_utils.cuh index 7c923503528..d73f0ebc9b7 100644 --- a/cpp/src/io/utilities/block_utils.cuh +++ b/cpp/src/io/utilities/block_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From d994b0c8479df0ee2f1aa361726dde28b3976675 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 10 May 2023 11:20:45 -0700 Subject: [PATCH 056/114] finish merge --- cpp/src/io/parquet/page_decode.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 07f8567322f..eab9533614b 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -535,8 +535,8 @@ inline __device__ void get_nesting_bounds(int& start_depth, end_depth = -1; d = -1; if (input_value_count + t < target_input_value_count) { - int index = rolling_lvl_index(input_value_count + t); - d = static_cast(def[index]); + int const index = rolling_lvl_index(input_value_count + t); + d = static_cast(def[index]); // if we have repetition (there are list columns involved) we have to // bound what nesting levels we apply values to if (s->col.max_level[level_type::REPETITION] > 0) { From f12bcc9143a15dc464eb6be76a85b6c51662d1c4 Mon Sep 17 00:00:00 2001 From: db Date: Wed, 10 May 2023 15:00:41 -0500 Subject: [PATCH 057/114] Fix a bug where specific usage of skip_rows/num_rows could cause a race condition when computing the number of skipped values during the preprocess step. --- cpp/src/io/parquet/page_data.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 03b31d77122..c00595b7dd6 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -1745,7 +1745,7 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, // we found it if (global_count > 0) { // this is the thread that represents the first row. - if (local_count == 1) { + if (local_count == 1 && in_row_bounds) { s->page.skipped_values = value_count + t; s->page.skipped_leaf_values = leaf_count + (is_new_leaf ? thread_leaf_count - 1 : thread_leaf_count); From 880400769f6ee8f53bfbde9a8bc2d94959c83e61 Mon Sep 17 00:00:00 2001 From: db Date: Mon, 15 May 2023 09:31:22 -0500 Subject: [PATCH 058/114] PR review feedback. 
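
Comment clarifications requested in review, plus collapsing the nested
namespace blocks down to namespace cudf::io::parquet::gpu. The new
comments spell out the run-batch bookkeeping: the fill loop produces a
batch of runs, the count is saved in next_batch_run_count (run_count is
reset for the next batch), and batches are handed out via
get_next_batch().

The spill case, with made-up numbers purely for illustration:

    // max_count == 512, last run carries output_pos to 520
    // spill_count = output_pos - max_count = 8
    // those 8 values are carried into the beginning of the next run
    // rather than being emitted by this decode_next() call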
--- cpp/src/io/parquet/rle_stream.cuh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 93191dd07e0..473db660238 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -20,10 +20,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::gpu { // TODO: consider if these should be template parameters to rle_stream constexpr int num_rle_stream_decode_threads = 512; @@ -248,9 +245,15 @@ struct rle_stream { run_index++; } + // the above loop computes a batch of runs to be processed. mark down + // the number of runs because the code after this point resets run_count + // for the next batch. each batch is returned via get_next_batch(). next_batch_run_count = run_count; - // if we've reached the output limit on the last run + // ------------------------------------- + // prepare for the next run: + + // if we've reached the value output limit on the last run if (output_pos >= max_count) { // first, see if we've spilled over auto const& src = runs[rolling_run_index(run_index - 1)]; @@ -353,7 +356,4 @@ struct rle_stream { } }; -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::gpu From 9d09842cb1fcf5334cb0789274a48e6e3a103305 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 15 May 2023 19:28:53 -0700 Subject: [PATCH 059/114] spelling --- cpp/src/io/parquet/page_string_decode.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index aaff0ae08ec..f99e6a573fa 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -105,7 +105,7 @@ __device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t * @param has_repetition True if the schema is nested * @param decoders Definition and repetition level decoders * @param t Thread index - * @return pair containg start and end value indexes + * @return pair containing start and end value indexes */ template __device__ thrust::pair page_bounds(page_state_s* const s, @@ -932,7 +932,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( } __syncthreads(); - // target_pos for value deconding + // target_pos for value decoding target_pos = min(s->nz_count, target_pos); // if this is the first page, then the first non-null entry will have an offset of 0. 
From f23f9cf43f66e0c2fbcf2712acb96a43fe3fe53a Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 16 May 2023 10:04:48 -0700 Subject: [PATCH 060/114] simplify out_thread0 calc --- cpp/src/io/parquet/page_string_decode.cu | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index aaff0ae08ec..1f4135ab264 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -629,7 +629,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( page_state_buffers_s* const sb = &state_buffers; int const page_idx = blockIdx.x; int const t = threadIdx.x; - int out_thread0; // set during string copy by lane 0 int first_non_null = -1; @@ -660,12 +659,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( return; } - if (s->dict_base) { - out_thread0 = (s->dict_bits > 0) ? 64 : 32; - } else { - out_thread0 = - ((s->col.data_type & 7) == BOOLEAN || (s->col.data_type & 7) == BYTE_ARRAY) ? 64 : 32; - } + int out_thread0 = s->dict_base && s->dict_bits == 0 ? 32 : 64; PageNestingDecodeInfo* const nesting_info_base = s->nesting_info; @@ -867,7 +861,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( page_state_buffers_s* const sb = &state_buffers; int const page_idx = blockIdx.x; int const t = threadIdx.x; - int out_thread0; // set during string copy by lane 0 int first_non_null = -1; @@ -898,7 +891,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( return; } - out_thread0 = s->dict_base && s->dict_bits == 0 ? 32 : 64; + int out_thread0 = s->dict_base && s->dict_bits == 0 ? 32 : 64; PageNestingDecodeInfo* const nesting_info_base = s->nesting_info; From e80c07b636af6884431de5ad1b33b7aa34166450 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 16 May 2023 10:51:51 -0700 Subject: [PATCH 061/114] fix for string col detection --- cpp/src/io/parquet/reader_impl.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 768616ce68b..4f01f1f18db 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -45,7 +45,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // this step on device. This call is also somewhat redundant if size info has already been // calculated (nested schema, chunked reader). 
auto is_string_col = [](gpu::ColumnChunkDesc const& chunk) { - return (chunk.data_type & 7) == BYTE_ARRAY && (chunk.data_type >> 3) != 4; + return (chunk.data_type & 7) == BYTE_ARRAY && (chunk.data_type >> 3) != 4 && + chunk.converted_type != DECIMAL; }; auto const has_strings = std::any_of(chunks.begin(), chunks.end(), is_string_col); From 6e89596c97fd9df09cef594bcf5fc7239dd3808b Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 17 May 2023 16:00:18 -0700 Subject: [PATCH 062/114] finish merge --- cpp/src/io/parquet/page_decode.cuh | 13 ++++--------- cpp/src/io/parquet/page_string_decode.cu | 10 +++------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index eab9533614b..88d4e0bac68 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -789,13 +789,10 @@ __device__ void gpuDecodeLevels(page_state_s* s, * * @return The length of the section */ -template __device__ uint32_t InitLevelSection(page_state_s* s, const uint8_t* cur, const uint8_t* end, - level_type lvl, - bool is_decode_step, - rle_stream* decoders) + level_type lvl) { int32_t len; int level_bits = s->col.level_bits[lvl]; @@ -865,14 +862,12 @@ __device__ uint32_t InitLevelSection(page_state_s* s, * @param[in] decoders rle_stream decoders which will be used for decoding levels. Optional. * Currently only used by gpuComputePageSizes step) */ -template __device__ bool setupLocalPageInfo(page_state_s* const s, PageInfo const* p, device_span chunks, size_t min_row, size_t num_rows, - bool is_decode_step, - rle_stream* decoders = nullptr) + bool is_decode_step) { int t = threadIdx.x; int chunk_idx; @@ -1098,9 +1093,9 @@ __device__ bool setupLocalPageInfo(page_state_s* const s, s->first_output_value = 0; // Find the compressed size of repetition levels - cur += InitLevelSection(s, cur, end, level_type::REPETITION, is_decode_step, decoders); + cur += InitLevelSection(s, cur, end, level_type::REPETITION); // Find the compressed size of definition levels - cur += InitLevelSection(s, cur, end, level_type::DEFINITION, is_decode_step, decoders); + cur += InitLevelSection(s, cur, end, level_type::DEFINITION); s->dict_bits = 0; s->dict_base = nullptr; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 2cf143309f8..965b6ddb0ee 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -532,7 +532,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; // setup page info - if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, false, decoders)) { return; } + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, false)) { return; } if (!t) { s->page.num_nulls = 0; @@ -633,9 +633,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( // set during string copy by lane 0 int first_non_null = -1; - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { - return; - } + if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; @@ -865,9 +863,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( // set during string copy by lane 0 int first_non_null = -1; - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { - 
return; - } + if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; From f1669bc5be267398fb80144d45db8c60c6b04206 Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 19 May 2023 16:20:08 -0700 Subject: [PATCH 063/114] alternate way to do column_buffer --- cpp/src/io/parquet/reader_impl_helpers.hpp | 3 +- cpp/src/io/utilities/column_buffer.cpp | 154 +++++++++--------- cpp/src/io/utilities/column_buffer.hpp | 173 +++++++++++---------- 3 files changed, 178 insertions(+), 152 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index cc4a64fad94..0e3b1a20d96 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -29,7 +29,8 @@ namespace cudf::io::detail::parquet { using namespace cudf::io::parquet; -using column_buffer = cudf::io::detail::utilities::column_buffer; +using string_type = cudf::io::detail::utilities::column_buffer_with_strings; +using column_buffer = cudf::io::detail::utilities::column_buffer; /** * @brief Function that translates Parquet datatype to cuDF type enum diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 2196edf0ef2..1a50227e747 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -26,46 +26,19 @@ #include -namespace cudf { -namespace io { -namespace detail { +namespace cudf::io::detail { namespace utilities { -void column_buffer_with_pointers::create_strings(size_type size, rmm::cuda_stream_view stream) -{ - // The contents of _strings will never be directly returned to the user. - // Due to the fact that make_strings_column copies the input data to - // produce its outputs, _strings is actually a temporary. As a result, we - // do not pass the provided mr to the call to - // make_zeroed_device_uvector_async here and instead let it use the - // default rmm memory resource. - _strings = std::make_unique>( - cudf::detail::make_zeroed_device_uvector_async( - size, stream, rmm::mr::get_current_device_resource())); -} - -void column_buffer_with_strings::create_strings(size_type num_bytes, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - _string_data = rmm::device_buffer(num_bytes, stream, mr); -} - -template -void column_buffer::create(size_type _size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +void column_buffer_base::create(size_type _size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* _mr) { size = _size; + mr = _mr; switch (type.id()) { case type_id::STRING: - if constexpr (contains_strings) { - // size + 1 for final offset. _string_data will be initialized later. - _data = create_data(data_type{type_id::INT32}, size + 1, stream, mr); - } else { - this->create_strings(size, stream); - } + // will be handled by children break; // list columns store a buffer of int32's as offsets to represent @@ -81,7 +54,42 @@ void column_buffer::create(size_type _size, _null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); } - this->mr = mr; +} + +void column_buffer_with_pointers::create(size_type _size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* _mr) +{ + column_buffer_base::create(_size, stream, _mr); + if (type.id() == type_id::STRING) { + // The contents of _strings will never be directly returned to the user. 
+ // Due to the fact that make_strings_column copies the input data to + // produce its outputs, _strings is actually a temporary. As a result, we + // do not pass the provided mr to the call to + // make_zeroed_device_uvector_async here and instead let it use the + // default rmm memory resource. + _strings = std::make_unique>( + cudf::detail::make_zeroed_device_uvector_async( + size, stream, rmm::mr::get_current_device_resource())); + } +} + +void column_buffer_with_strings::create(size_type _size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* _mr) +{ + column_buffer_base::create(_size, stream, _mr); + if (type.id() == type_id::STRING) { + // size + 1 for final offset. _string_data will be initialized later. + _data = create_data(data_type{type_id::INT32}, size + 1, stream, mr); + } +} + +void column_buffer_with_strings::create_strings(size_type num_bytes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* _mr) +{ + _string_data = rmm::device_buffer(num_bytes, stream, _mr); } namespace { @@ -92,24 +100,24 @@ namespace { * @param buff The old output buffer * @param new_buff The new output buffer */ -template -void copy_buffer_data(column_buffer const& buff, - column_buffer& new_buff) +template +void copy_buffer_data(column_buffer const& buff, + column_buffer& new_buff) { new_buff.name = buff.name; new_buff.user_data = buff.user_data; for (auto const& child : buff.children) { auto& new_child = new_buff.children.emplace_back( - column_buffer(child.type, child.is_nullable)); + column_buffer(child.type, child.is_nullable)); copy_buffer_data(child, new_child); } } } // namespace -template -column_buffer column_buffer::empty_like( - column_buffer const& input) +template +column_buffer column_buffer::empty_like( + column_buffer const& input) { auto new_buff = column_buffer(input.type, input.is_nullable); copy_buffer_data(input, new_buff); @@ -117,11 +125,11 @@ column_buffer column_buffer::empty_like( } // force instantiation of both column_buffers -template class column_buffer; -template class column_buffer; +template class column_buffer; +template class column_buffer; -template -std::unique_ptr make_column(column_buffer& buffer, +template +std::unique_ptr make_column(column_buffer& buffer, column_name_info* schema_info, std::optional const& schema, rmm::cuda_stream_view stream) @@ -149,7 +157,7 @@ std::unique_ptr make_column(column_buffer& buffer, // make child column CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); - auto child = cudf::io::detail::make_column( + auto child = cudf::io::detail::make_column( buffer.children[0], child_info, child_schema, stream); // make the final list column (note : size is the # of offsets, so our actual # of rows is 1 @@ -179,7 +187,7 @@ std::unique_ptr make_column(column_buffer& buffer, ? 
std::make_optional(schema->child(i)) : std::nullopt; - output_children.emplace_back(cudf::io::detail::make_column( + output_children.emplace_back(cudf::io::detail::make_column( buffer.children[i], child_info, child_schema, stream)); } @@ -201,8 +209,8 @@ std::unique_ptr make_column(column_buffer& buffer, } } -template -std::unique_ptr empty_like(column_buffer& buffer, +template +std::unique_ptr empty_like(column_buffer& buffer, column_name_info* schema_info, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -223,8 +231,8 @@ std::unique_ptr empty_like(column_buffer& buffer, // make child column CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); - auto child = - cudf::io::detail::empty_like(buffer.children[0], child_info, stream, mr); + auto child = cudf::io::detail::empty_like( + buffer.children[0], child_info, stream, mr); // make the final list column return make_lists_column( @@ -243,7 +251,7 @@ std::unique_ptr empty_like(column_buffer& buffer, schema_info->children.push_back(column_name_info{""}); child_info = &schema_info->children.back(); } - return cudf::io::detail::empty_like( + return cudf::io::detail::empty_like( col, child_info, stream, mr); }); @@ -257,11 +265,17 @@ std::unique_ptr empty_like(column_buffer& buffer, } // namespace utilities +using pointer_type = utilities::column_buffer_with_pointers; +using string_type = utilities::column_buffer_with_strings; + +using pointer_column_buffer = utilities::column_buffer; +using string_column_buffer = utilities::column_buffer; + template <> -std::unique_ptr make_column(utilities::column_buffer& buffer, - column_name_info* schema_info, - std::optional const& schema, - rmm::cuda_stream_view stream) +std::unique_ptr make_column(pointer_column_buffer& buffer, + column_name_info* schema_info, + std::optional const& schema, + rmm::cuda_stream_view stream) { if (schema_info != nullptr) { schema_info->name = buffer.name; } @@ -311,10 +325,10 @@ std::unique_ptr make_column(utilities::column_buffer& buff } template <> -std::unique_ptr make_column(utilities::column_buffer& buffer, - column_name_info* schema_info, - std::optional const& schema, - rmm::cuda_stream_view stream) +std::unique_ptr make_column(string_column_buffer& buffer, + column_name_info* schema_info, + std::optional const& schema, + rmm::cuda_stream_view stream) { if (schema_info != nullptr) { schema_info->name = buffer.name; } @@ -391,23 +405,21 @@ std::unique_ptr make_column(utilities::column_buffer& buffer * @copydoc cudf::io::detail::empty_like */ template <> -std::unique_ptr empty_like(utilities::column_buffer& buffer, - column_name_info* schema_info, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr empty_like(utilities::column_buffer& buffer, + column_name_info* schema_info, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return utilities::empty_like(buffer, schema_info, stream, mr); } template <> -std::unique_ptr empty_like(utilities::column_buffer& buffer, - column_name_info* schema_info, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr empty_like(utilities::column_buffer& buffer, + column_name_info* schema_info, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return utilities::empty_like(buffer, schema_info, stream, mr); } -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/column_buffer.hpp 
b/cpp/src/io/utilities/column_buffer.hpp index a186fde3ac6..80a8d507844 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -64,29 +64,97 @@ using string_index_pair = thrust::pair; namespace utilities { -struct column_buffer_with_pointers { - void create_strings(size_type _size, rmm::cuda_stream_view stream); +struct column_buffer_base { + column_buffer_base() = default; - std::optional str_data() + // construct without a known size. call create() later to actually + // allocate memory + column_buffer_base(data_type _type, bool _is_nullable) : type(_type), is_nullable(_is_nullable) {} + + // construct with a known size. allocates memory + column_buffer_base(data_type _type, + size_type _size, + bool _is_nullable, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : column_buffer_base(_type, _is_nullable) { - return _strings ? std::optional(_strings->data()) : std::nullopt; + create(_size, stream, mr); } - std::optional str_data_size() const + + // instantiate a column of known type with a specified size. Allows deferred creation for + // preprocessing steps such as in the Parquet reader + virtual void create(size_type _size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* _mr); + + template + auto null_mask() { - return _strings ? std::optional(_strings->size()) : std::nullopt; + return static_cast(_null_mask.data()); } + auto null_mask_size() { return _null_mask.size(); } + + auto& null_count() { return _null_count; } + + rmm::device_buffer _data{}; + rmm::device_buffer _null_mask{}; + size_type _null_count{0}; + + data_type type{type_id::EMPTY}; + bool is_nullable{false}; + size_type size{0}; + uint32_t user_data{0}; // arbitrary user data + std::string name; + + rmm::mr::device_memory_resource* mr; +}; + +struct column_buffer_with_pointers : public column_buffer_base { + column_buffer_with_pointers() = default; + + // construct without a known size. call create() later to actually + // allocate memory + column_buffer_with_pointers(data_type _type, bool _is_nullable) + : column_buffer_base(_type, _is_nullable) + { + } + + void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* _mr); + + void* data() { return _strings ? _strings->data() : _data.data(); } + size_t data_size() const { return _strings ? _strings->size() : _data.size(); } std::unique_ptr> _strings; }; -struct column_buffer_with_strings { +struct column_buffer_with_strings : public column_buffer_base { + column_buffer_with_strings() = default; + + // construct without a known size. call create() later to actually + // allocate memory + column_buffer_with_strings(data_type _type, bool _is_nullable) + : column_buffer_base(_type, _is_nullable) + { + } + + void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* _mr); + void create_strings(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) + { + this->create_strings(num_bytes, stream, mr); + } + void* string_data() { return _string_data.data(); } size_t string_size() const { return _string_data.size(); } + void* data() { return _data.data(); } + size_t data_size() const { return _data.size(); } + rmm::device_buffer _string_data{}; }; @@ -94,15 +162,13 @@ struct column_buffer_with_strings { * @brief Class for holding device memory buffers to column data that eventually * will be used to create a column. 
*/ -template -struct column_buffer : std::conditional::type { +template +struct column_buffer : column_buffer_type { column_buffer() = default; // construct without a known size. call create() later to actually // allocate memory - column_buffer(data_type _type, bool _is_nullable) : type(_type), is_nullable(_is_nullable) {} + column_buffer(data_type _type, bool _is_nullable) : column_buffer_type(_type, _is_nullable) {} // construct with a known size. allocates memory column_buffer(data_type _type, @@ -110,84 +176,31 @@ struct column_buffer : std::conditional&& col) = default; - column_buffer& operator=(column_buffer&& col) = default; + column_buffer(column_buffer&& col) = default; + column_buffer& operator=(column_buffer&& col) = default; // copy constructor - column_buffer(column_buffer const& col) = delete; - column_buffer& operator=(column_buffer const& col) = delete; - - // instantiate a column of known type with a specified size. Allows deferred creation for - // preprocessing steps such as in the Parquet reader - void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); - - template - typename std::enable_if::type create_string_data(size_t num_bytes, - rmm::cuda_stream_view stream) - { - this->create_strings(num_bytes, stream, mr); - } - - template - typename std::enable_if::type data() - { - return _data.data(); - } - - template - typename std::enable_if::type data() - { - return this->str_data().value_or(_data.data()); - } - - template - typename std::enable_if::type data_size() const - { - return _data.size(); - } - - template - typename std::enable_if::type data_size() const - { - return this->str_data_size().value_or(_data.size()); - } - - template - auto null_mask() - { - return static_cast(_null_mask.data()); - } - auto null_mask_size() { return _null_mask.size(); } - - auto& null_count() { return _null_count; } + column_buffer(column_buffer const& col) = delete; + column_buffer& operator=(column_buffer const& col) = + delete; // Create a new column_buffer that has empty data but with the same basic information as the // input column, including same type, nullability, name, and user_data. - static column_buffer empty_like(column_buffer const& input); + static column_buffer empty_like( + column_buffer const& input); - rmm::device_buffer _data{}; - rmm::device_buffer _null_mask{}; - size_type _null_count{0}; - - data_type type{type_id::EMPTY}; - bool is_nullable{false}; - size_type size{0}; - std::vector> children; - uint32_t user_data{0}; // arbitrary user data - std::string name; - - rmm::mr::device_memory_resource* mr; + std::vector> children; }; } // namespace utilities -using column_buffer = utilities::column_buffer; +using column_buffer = utilities::column_buffer; /** * @brief Creates a column from an existing set of device memory buffers. 
@@ -200,8 +213,8 @@ using column_buffer = utilities::column_buffer; * * @return `std::unique_ptr` Column from the existing device data */ -template -std::unique_ptr make_column(utilities::column_buffer& buffer, +template +std::unique_ptr make_column(utilities::column_buffer& buffer, column_name_info* schema_info, std::optional const& schema, rmm::cuda_stream_view stream); @@ -221,8 +234,8 @@ std::unique_ptr make_column(utilities::column_buffer& * * @return `std::unique_ptr` Column from the existing device data */ -template -std::unique_ptr empty_like(utilities::column_buffer& buffer, +template +std::unique_ptr empty_like(utilities::column_buffer& buffer, column_name_info* schema_info, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From 2b5a5e06835ecee87b2066121c7cd9eb550146fc Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 19 May 2023 16:38:12 -0700 Subject: [PATCH 064/114] remove unused constructor --- cpp/src/io/utilities/column_buffer.hpp | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 80a8d507844..8064b6f9e6a 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -67,21 +67,9 @@ namespace utilities { struct column_buffer_base { column_buffer_base() = default; - // construct without a known size. call create() later to actually - // allocate memory + // construct without a known size. call create() later to actually allocate memory column_buffer_base(data_type _type, bool _is_nullable) : type(_type), is_nullable(_is_nullable) {} - // construct with a known size. allocates memory - column_buffer_base(data_type _type, - size_type _size, - bool _is_nullable, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : column_buffer_base(_type, _is_nullable) - { - create(_size, stream, mr); - } - // instantiate a column of known type with a specified size. Allows deferred creation for // preprocessing steps such as in the Parquet reader virtual void create(size_type _size, @@ -113,8 +101,7 @@ struct column_buffer_base { struct column_buffer_with_pointers : public column_buffer_base { column_buffer_with_pointers() = default; - // construct without a known size. call create() later to actually - // allocate memory + // construct without a known size. call create() later to actually allocate memory column_buffer_with_pointers(data_type _type, bool _is_nullable) : column_buffer_base(_type, _is_nullable) { @@ -131,8 +118,7 @@ struct column_buffer_with_pointers : public column_buffer_base { struct column_buffer_with_strings : public column_buffer_base { column_buffer_with_strings() = default; - // construct without a known size. call create() later to actually - // allocate memory + // construct without a known size. call create() later to actually allocate memory column_buffer_with_strings(data_type _type, bool _is_nullable) : column_buffer_base(_type, _is_nullable) { @@ -166,8 +152,7 @@ template struct column_buffer : column_buffer_type { column_buffer() = default; - // construct without a known size. call create() later to actually - // allocate memory + // construct without a known size. call create() later to actually allocate memory column_buffer(data_type _type, bool _is_nullable) : column_buffer_type(_type, _is_nullable) {} // construct with a known size. 
allocates memory @@ -182,7 +167,7 @@ struct column_buffer : column_buffer_type { } // move constructor - column_buffer(column_buffer&& col) = default; + column_buffer(column_buffer&& col) = default; column_buffer& operator=(column_buffer&& col) = default; // copy constructor From 30bfe9f39d4b40960581c832c28c9986766d0b52 Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 19 May 2023 16:54:29 -0700 Subject: [PATCH 065/114] get rid of another unnecessary function --- cpp/src/io/utilities/column_buffer.cpp | 8 +++----- cpp/src/io/utilities/column_buffer.hpp | 9 +-------- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 1a50227e747..a36f5abadaa 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -85,11 +85,9 @@ void column_buffer_with_strings::create(size_type _size, } } -void column_buffer_with_strings::create_strings(size_type num_bytes, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* _mr) +void column_buffer_with_strings::create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) { - _string_data = rmm::device_buffer(num_bytes, stream, _mr); + _string_data = rmm::device_buffer(num_bytes, stream, mr); } namespace { @@ -269,7 +267,7 @@ using pointer_type = utilities::column_buffer_with_pointers; using string_type = utilities::column_buffer_with_strings; using pointer_column_buffer = utilities::column_buffer; -using string_column_buffer = utilities::column_buffer; +using string_column_buffer = utilities::column_buffer; template <> std::unique_ptr make_column(pointer_column_buffer& buffer, diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 8064b6f9e6a..53de9284cef 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -126,14 +126,7 @@ struct column_buffer_with_strings : public column_buffer_base { void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* _mr); - void create_strings(size_type _size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) - { - this->create_strings(num_bytes, stream, mr); - } + void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream); void* string_data() { return _string_data.data(); } size_t string_size() const { return _string_data.size(); } From dea407cf3859dfaa6ef363fb35017a6a7a20a65c Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 19 May 2023 16:58:05 -0700 Subject: [PATCH 066/114] rearrange some --- cpp/src/io/utilities/column_buffer.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 53de9284cef..4d9d5057fb4 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -126,14 +126,15 @@ struct column_buffer_with_strings : public column_buffer_base { void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* _mr); + void* data() { return _data.data(); } + size_t data_size() const { return _data.size(); } + + // additional methods for string data void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream); void* string_data() { return _string_data.data(); } size_t string_size() const { return _string_data.size(); } - void* data() { return _data.data(); } - size_t 
data_size() const { return _data.size(); } - rmm::device_buffer _string_data{}; }; From 10d00d091583b79d572a0acccbb92f40eefcfa81 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 22 May 2023 10:21:12 -0700 Subject: [PATCH 067/114] move make_column into policy object --- cpp/src/io/utilities/column_buffer.cpp | 266 ++++++++++--------------- cpp/src/io/utilities/column_buffer.hpp | 4 + 2 files changed, 112 insertions(+), 158 deletions(-) diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index a36f5abadaa..e8ecdd727f3 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -74,6 +74,15 @@ void column_buffer_with_pointers::create(size_type _size, } } +std::unique_ptr column_buffer_with_pointers::make_column(rmm::cuda_stream_view stream) +{ + // make_strings_column allocates new memory, it does not simply move + // from the inputs, so we need to pass it the memory resource given to + // the buffer on construction so that the memory is allocated using the + // resource that the calling code expected. + return make_strings_column(*_strings, stream, mr); +} + void column_buffer_with_strings::create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* _mr) @@ -90,6 +99,32 @@ void column_buffer_with_strings::create_string_data(size_t num_bytes, rmm::cuda_ _string_data = rmm::device_buffer(num_bytes, stream, mr); } +std::unique_ptr column_buffer_with_strings::make_column(rmm::cuda_stream_view stream) +{ + // no need for copies, just transfer ownership of the data_buffers to the columns + auto const& mr = _string_data.memory_resource(); + auto const state = mask_state::UNALLOCATED; + auto str_col = + _string_data.size() == 0 + ? make_empty_column(data_type{type_id::INT8}) + : std::make_unique(data_type{type_id::INT8}, + string_size(), + std::move(_string_data), + cudf::detail::create_null_mask(size, state, stream, mr), + state_null_count(state, size), + std::vector>{}); + auto offsets_col = + std::make_unique(data_type{type_to_id()}, + size + 1, + std::move(_data), + cudf::detail::create_null_mask(size + 1, state, stream, mr), + state_null_count(state, size + 1), + std::vector>{}); + + return make_strings_column( + size, std::move(offsets_col), std::move(str_col), null_count(), std::move(_null_mask)); +} + namespace { /** @@ -126,8 +161,10 @@ column_buffer column_buffer::empty_like( template class column_buffer; template class column_buffer; +} // namespace utilities + template -std::unique_ptr make_column(column_buffer& buffer, +std::unique_ptr make_column(utilities::column_buffer& buffer, column_name_info* schema_info, std::optional const& schema, rmm::cuda_stream_view stream) @@ -135,6 +172,47 @@ std::unique_ptr make_column(column_buffer& buffer, if (schema_info != nullptr) { schema_info->name = buffer.name; } switch (buffer.type.id()) { + case type_id::STRING: { + if (schema.value_or(reader_column_schema{}).is_enabled_convert_binary_to_strings()) { + if (schema_info != nullptr) { + schema_info->children.push_back(column_name_info{"offsets"}); + schema_info->children.push_back(column_name_info{"chars"}); + } + + // make_strings_column allocates new memory, it does not simply move + // from the inputs, so we need to pass it the memory resource given to + // the buffer on construction so that the memory is allocated using the + // resource that the calling code expected. 
+ return buffer.make_column(stream); + } else { + // convert to binary + auto const string_col = buffer.make_column(stream); + auto const num_rows = string_col->size(); + auto const null_count = string_col->null_count(); + auto col_content = string_col->release(); + + // convert to uint8 column, strings are currently stored as int8 + auto contents = + col_content.children[strings_column_view::chars_column_index].release()->release(); + auto data = contents.data.release(); + + auto uint8_col = std::make_unique( + data_type{type_id::UINT8}, data->size(), std::move(*data), rmm::device_buffer{}, 0); + + if (schema_info != nullptr) { + schema_info->children.push_back(column_name_info{"offsets"}); + schema_info->children.push_back(column_name_info{"binary"}); + } + + return make_lists_column( + num_rows, + std::move(col_content.children[strings_column_view::offsets_column_index]), + std::move(uint8_col), + null_count, + std::move(*col_content.null_mask)); + } + } break; + case type_id::LIST: { // make offsets column auto offsets = std::make_unique( @@ -155,8 +233,8 @@ std::unique_ptr make_column(column_buffer& buffer, // make child column CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); - auto child = cudf::io::detail::make_column( - buffer.children[0], child_info, child_schema, stream); + auto child = + make_column(buffer.children[0], child_info, child_schema, stream); // make the final list column (note : size is the # of offsets, so our actual # of rows is 1 // less) @@ -185,8 +263,8 @@ std::unique_ptr make_column(column_buffer& buffer, ? std::make_optional(schema->child(i)) : std::nullopt; - output_children.emplace_back(cudf::io::detail::make_column( - buffer.children[i], child_info, child_schema, stream)); + output_children.emplace_back( + make_column(buffer.children[i], child_info, child_schema, stream)); } return make_structs_column(buffer.size, @@ -207,8 +285,11 @@ std::unique_ptr make_column(column_buffer& buffer, } } +/** + * @copydoc cudf::io::detail::empty_like + */ template -std::unique_ptr empty_like(column_buffer& buffer, +std::unique_ptr empty_like(utilities::column_buffer& buffer, column_name_info* schema_info, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -261,163 +342,32 @@ std::unique_ptr empty_like(column_buffer& buffer, } } -} // namespace utilities - using pointer_type = utilities::column_buffer_with_pointers; using string_type = utilities::column_buffer_with_strings; using pointer_column_buffer = utilities::column_buffer; using string_column_buffer = utilities::column_buffer; -template <> -std::unique_ptr make_column(pointer_column_buffer& buffer, - column_name_info* schema_info, - std::optional const& schema, - rmm::cuda_stream_view stream) -{ - if (schema_info != nullptr) { schema_info->name = buffer.name; } - - if (buffer.type.id() == type_id::STRING) { - if (schema.value_or(reader_column_schema{}).is_enabled_convert_binary_to_strings()) { - if (schema_info != nullptr) { - schema_info->children.push_back(column_name_info{"offsets"}); - schema_info->children.push_back(column_name_info{"chars"}); - } - - // make_strings_column allocates new memory, it does not simply move - // from the inputs, so we need to pass it the memory resource given to - // the buffer on construction so that the memory is allocated using the - // resource that the calling code expected. 
- return make_strings_column(*buffer._strings, stream, buffer.mr); - } else { - // convert to binary - auto const string_col = make_strings_column(*buffer._strings, stream, buffer.mr); - auto const num_rows = string_col->size(); - auto const null_count = string_col->null_count(); - auto col_content = string_col->release(); - - // convert to uint8 column, strings are currently stored as int8 - auto contents = - col_content.children[strings_column_view::chars_column_index].release()->release(); - auto data = contents.data.release(); - - auto uint8_col = std::make_unique( - data_type{type_id::UINT8}, data->size(), std::move(*data), rmm::device_buffer{}, 0); - - if (schema_info != nullptr) { - schema_info->children.push_back(column_name_info{"offsets"}); - schema_info->children.push_back(column_name_info{"binary"}); - } - - return make_lists_column( - num_rows, - std::move(col_content.children[strings_column_view::offsets_column_index]), - std::move(uint8_col), - null_count, - std::move(*col_content.null_mask)); - } - } - - // not a string - return utilities::make_column(buffer, schema_info, schema, stream); -} - -template <> -std::unique_ptr make_column(string_column_buffer& buffer, - column_name_info* schema_info, - std::optional const& schema, - rmm::cuda_stream_view stream) -{ - if (schema_info != nullptr) { schema_info->name = buffer.name; } - - if (buffer.type.id() == type_id::STRING) { - auto make_string_col = [stream](auto& buffer) { - // no need for copies, just transfer ownership of the data_buffers to the columns - auto const& mr = buffer._string_data.memory_resource(); - auto const state = mask_state::UNALLOCATED; - auto str_col = - buffer._string_data.size() == 0 - ? make_empty_column(data_type{type_id::INT8}) - : std::make_unique(data_type{type_id::INT8}, - buffer.string_size(), - std::move(buffer._string_data), - cudf::detail::create_null_mask(buffer.size, state, stream, mr), - state_null_count(state, buffer.size), - std::vector>{}); - auto offsets_col = - std::make_unique(data_type{type_to_id()}, - buffer.size + 1, - std::move(buffer._data), - cudf::detail::create_null_mask(buffer.size + 1, state, stream, mr), - state_null_count(state, buffer.size + 1), - std::vector>{}); - - return make_strings_column(buffer.size, - std::move(offsets_col), - std::move(str_col), - buffer.null_count(), - std::move(buffer._null_mask)); - }; - - if (schema.value_or(reader_column_schema{}).is_enabled_convert_binary_to_strings()) { - if (schema_info != nullptr) { - schema_info->children.push_back(column_name_info{"offsets"}); - schema_info->children.push_back(column_name_info{"chars"}); - } - - return make_string_col(buffer); - } else { - // convert to binary - auto const string_col = make_string_col(buffer); - auto const num_rows = string_col->size(); - auto const null_count = string_col->null_count(); - auto col_content = string_col->release(); - - // convert to uint8 column, strings are currently stored as int8 - auto contents = - col_content.children[strings_column_view::chars_column_index].release()->release(); - auto data = contents.data.release(); - - auto uint8_col = std::make_unique( - data_type{type_id::UINT8}, data->size(), std::move(*data), rmm::device_buffer{}, 0); - - if (schema_info != nullptr) { - schema_info->children.push_back(column_name_info{"offsets"}); - schema_info->children.push_back(column_name_info{"binary"}); - } - - return make_lists_column( - num_rows, - std::move(col_content.children[strings_column_view::offsets_column_index]), - std::move(uint8_col), - null_count, - 
std::move(*col_content.null_mask)); - } - } - - // not a string - return utilities::make_column(buffer, schema_info, schema, stream); -} - -/** - * @copydoc cudf::io::detail::empty_like - */ -template <> -std::unique_ptr empty_like(utilities::column_buffer& buffer, - column_name_info* schema_info, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return utilities::empty_like(buffer, schema_info, stream, mr); -} - -template <> -std::unique_ptr empty_like(utilities::column_buffer& buffer, - column_name_info* schema_info, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return utilities::empty_like(buffer, schema_info, stream, mr); -} +template std::unique_ptr make_column( + string_column_buffer& buffer, + column_name_info* schema_info, + std::optional const& schema, + rmm::cuda_stream_view stream); + +template std::unique_ptr make_column( + pointer_column_buffer& buffer, + column_name_info* schema_info, + std::optional const& schema, + rmm::cuda_stream_view stream); + +template std::unique_ptr empty_like(string_column_buffer& buffer, + column_name_info* schema_info, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +template std::unique_ptr empty_like(pointer_column_buffer& buffer, + column_name_info* schema_info, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 4d9d5057fb4..1da55e4eaa8 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -112,6 +112,8 @@ struct column_buffer_with_pointers : public column_buffer_base { void* data() { return _strings ? _strings->data() : _data.data(); } size_t data_size() const { return _strings ? 
_strings->size() : _data.size(); } + std::unique_ptr make_column(rmm::cuda_stream_view stream); + std::unique_ptr> _strings; }; @@ -135,6 +137,8 @@ struct column_buffer_with_strings : public column_buffer_base { void* string_data() { return _string_data.data(); } size_t string_size() const { return _string_data.size(); } + std::unique_ptr make_column(rmm::cuda_stream_view stream); + rmm::device_buffer _string_data{}; }; From 22b3d55459b7b1c2205374391a19335c8bac3b9f Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 22 May 2023 10:24:56 -0700 Subject: [PATCH 068/114] reduce diff --- cpp/src/io/utilities/column_buffer.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index e8ecdd727f3..e26e25c9def 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -172,7 +172,7 @@ std::unique_ptr make_column(utilities::column_buffer if (schema_info != nullptr) { schema_info->name = buffer.name; } switch (buffer.type.id()) { - case type_id::STRING: { + case type_id::STRING: if (schema.value_or(reader_column_schema{}).is_enabled_convert_binary_to_strings()) { if (schema_info != nullptr) { schema_info->children.push_back(column_name_info{"offsets"}); @@ -211,7 +211,6 @@ std::unique_ptr make_column(utilities::column_buffer null_count, std::move(*col_content.null_mask)); } - } break; case type_id::LIST: { // make offsets column From b907a1af8522bcf6b77588661da19f30bbc591ee Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 22 May 2023 10:30:06 -0700 Subject: [PATCH 069/114] unify interfaces for policy objects --- cpp/src/io/utilities/column_buffer.hpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 1da55e4eaa8..c125e8c4807 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -112,6 +112,14 @@ struct column_buffer_with_pointers : public column_buffer_base { void* data() { return _strings ? _strings->data() : _data.data(); } size_t data_size() const { return _strings ? 
_strings->size() : _data.size(); } + void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) + { + CUDF_FAIL("method not implemented for type"); + } + + void* string_data() { CUDF_FAIL("method not implemented for type"); } + size_t string_size() const { CUDF_FAIL("method not implemented for type"); } + std::unique_ptr make_column(rmm::cuda_stream_view stream); std::unique_ptr> _strings; @@ -131,7 +139,6 @@ struct column_buffer_with_strings : public column_buffer_base { void* data() { return _data.data(); } size_t data_size() const { return _data.size(); } - // additional methods for string data void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream); void* string_data() { return _string_data.data(); } From 73b822992f6c7e083afd7991338a411c3dd73936 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 22 May 2023 10:37:12 -0700 Subject: [PATCH 070/114] change template param name to string_policy --- cpp/src/io/utilities/column_buffer.cpp | 35 +++++++++++++------------- cpp/src/io/utilities/column_buffer.hpp | 32 +++++++++++------------ 2 files changed, 32 insertions(+), 35 deletions(-) diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index e26e25c9def..a57ed798586 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -133,24 +133,24 @@ namespace { * @param buff The old output buffer * @param new_buff The new output buffer */ -template -void copy_buffer_data(column_buffer const& buff, - column_buffer& new_buff) +template +void copy_buffer_data(column_buffer const& buff, + column_buffer& new_buff) { new_buff.name = buff.name; new_buff.user_data = buff.user_data; for (auto const& child : buff.children) { - auto& new_child = new_buff.children.emplace_back( - column_buffer(child.type, child.is_nullable)); + auto& new_child = + new_buff.children.emplace_back(column_buffer(child.type, child.is_nullable)); copy_buffer_data(child, new_child); } } } // namespace -template -column_buffer column_buffer::empty_like( - column_buffer const& input) +template +column_buffer column_buffer::empty_like( + column_buffer const& input) { auto new_buff = column_buffer(input.type, input.is_nullable); copy_buffer_data(input, new_buff); @@ -163,8 +163,8 @@ template class column_buffer; } // namespace utilities -template -std::unique_ptr make_column(utilities::column_buffer& buffer, +template +std::unique_ptr make_column(utilities::column_buffer& buffer, column_name_info* schema_info, std::optional const& schema, rmm::cuda_stream_view stream) @@ -232,8 +232,7 @@ std::unique_ptr make_column(utilities::column_buffer // make child column CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); - auto child = - make_column(buffer.children[0], child_info, child_schema, stream); + auto child = make_column(buffer.children[0], child_info, child_schema, stream); // make the final list column (note : size is the # of offsets, so our actual # of rows is 1 // less) @@ -263,7 +262,7 @@ std::unique_ptr make_column(utilities::column_buffer : std::nullopt; output_children.emplace_back( - make_column(buffer.children[i], child_info, child_schema, stream)); + make_column(buffer.children[i], child_info, child_schema, stream)); } return make_structs_column(buffer.size, @@ -287,8 +286,8 @@ std::unique_ptr make_column(utilities::column_buffer /** * @copydoc cudf::io::detail::empty_like */ -template -std::unique_ptr empty_like(utilities::column_buffer& buffer, +template +std::unique_ptr 
empty_like(utilities::column_buffer& buffer, column_name_info* schema_info, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -309,8 +308,8 @@ std::unique_ptr empty_like(utilities::column_buffer& // make child column CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); - auto child = cudf::io::detail::empty_like( - buffer.children[0], child_info, stream, mr); + auto child = + cudf::io::detail::empty_like(buffer.children[0], child_info, stream, mr); // make the final list column return make_lists_column( @@ -329,7 +328,7 @@ std::unique_ptr empty_like(utilities::column_buffer& schema_info->children.push_back(column_name_info{""}); child_info = &schema_info->children.back(); } - return cudf::io::detail::empty_like( + return cudf::io::detail::empty_like( col, child_info, stream, mr); }); diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index c125e8c4807..41695ef02a1 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -153,12 +153,12 @@ struct column_buffer_with_strings : public column_buffer_base { * @brief Class for holding device memory buffers to column data that eventually * will be used to create a column. */ -template -struct column_buffer : column_buffer_type { +template +struct column_buffer : string_policy { column_buffer() = default; // construct without a known size. call create() later to actually allocate memory - column_buffer(data_type _type, bool _is_nullable) : column_buffer_type(_type, _is_nullable) {} + column_buffer(data_type _type, bool _is_nullable) : string_policy(_type, _is_nullable) {} // construct with a known size. allocates memory column_buffer(data_type _type, @@ -166,26 +166,24 @@ struct column_buffer : column_buffer_type { bool _is_nullable, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : column_buffer_type(_type, _is_nullable) + : string_policy(_type, _is_nullable) { - column_buffer_type::create(_size, stream, mr); + string_policy::create(_size, stream, mr); } // move constructor - column_buffer(column_buffer&& col) = default; - column_buffer& operator=(column_buffer&& col) = default; + column_buffer(column_buffer&& col) = default; + column_buffer& operator=(column_buffer&& col) = default; // copy constructor - column_buffer(column_buffer const& col) = delete; - column_buffer& operator=(column_buffer const& col) = - delete; + column_buffer(column_buffer const& col) = delete; + column_buffer& operator=(column_buffer const& col) = delete; // Create a new column_buffer that has empty data but with the same basic information as the // input column, including same type, nullability, name, and user_data. 
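// [editor's sketch] make_column and empty_like are function templates defined in a
// .cpp file, so column_buffer.cpp must explicitly instantiate them for each string
// policy (the "template std::unique_ptr ..." lines seen earlier in this series) or
// callers in other translation units fail to link. A minimal illustration of that
// pattern, with purely illustrative names not taken from the patch:
template <typename policy>
int probe(policy const& p) { return p.value; }
struct gather_policy { int value; };
struct inline_policy { int value; };
template int probe<gather_policy>(gather_policy const&);  // symbol emitted into this TU
template int probe<inline_policy>(inline_policy const&);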
- static column_buffer empty_like( - column_buffer const& input); + static column_buffer empty_like(column_buffer const& input); - std::vector> children; + std::vector> children; }; } // namespace utilities @@ -203,8 +201,8 @@ using column_buffer = utilities::column_buffer` Column from the existing device data */ -template -std::unique_ptr make_column(utilities::column_buffer& buffer, +template +std::unique_ptr make_column(utilities::column_buffer& buffer, column_name_info* schema_info, std::optional const& schema, rmm::cuda_stream_view stream); @@ -224,8 +222,8 @@ std::unique_ptr make_column(utilities::column_buffer * * @return `std::unique_ptr` Column from the existing device data */ -template -std::unique_ptr empty_like(utilities::column_buffer& buffer, +template +std::unique_ptr empty_like(utilities::column_buffer& buffer, column_name_info* schema_info, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From af1e407d514b96cd39ef22e19feb74426e1f5a86 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 22 May 2023 11:23:37 -0700 Subject: [PATCH 071/114] restore null_count_back_copier --- cpp/src/io/parquet/page_data.cu | 3 +-- cpp/src/io/parquet/page_decode.cuh | 29 +++++++++++++----------- cpp/src/io/parquet/page_string_decode.cu | 13 +++-------- 3 files changed, 20 insertions(+), 25 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index fdd79449874..a70e6c891f4 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -831,6 +831,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( int page_idx = blockIdx.x; int t = threadIdx.x; int out_thread0; + [[maybe_unused]] null_count_back_copier _{s, t}; if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } @@ -848,7 +849,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( // if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) || is_page_contained(s, min_row, num_rows)))) { - restore_decode_cache(s); return; } @@ -987,7 +987,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( } __syncthreads(); } - restore_decode_cache(s); } } // anonymous namespace diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 88d4e0bac68..bb1c28752df 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -89,22 +89,25 @@ struct page_state_buffers_s { uint32_t str_len[non_zero_buffer_size]; // String length for plain encoding of strings }; -// if we are using the nesting decode cache, copy null count back. call this if -// setupLocalPageInfo returns true. 
-__device__ void restore_decode_cache(page_state_s* s) -{ - if (s->nesting_info == s->nesting_decode_cache) { - int depth = 0; - while (depth < s->page.num_output_nesting_levels) { - int const thread_depth = depth + threadIdx.x; - if (thread_depth < s->page.num_output_nesting_levels) { - s->page.nesting_decode[thread_depth].null_count = - s->nesting_decode_cache[thread_depth].null_count; +// Copies null counts back to `nesting_decode` at the end of scope +struct null_count_back_copier { + page_state_s* s; + int t; + __device__ ~null_count_back_copier() + { + if (s->nesting_info != nullptr and s->nesting_info == s->nesting_decode_cache) { + int depth = 0; + while (depth < s->page.num_output_nesting_levels) { + int const thread_depth = depth + t; + if (thread_depth < s->page.num_output_nesting_levels) { + s->page.nesting_decode[thread_depth].null_count = + s->nesting_decode_cache[thread_depth].null_count; + } + depth += blockDim.x; } - depth += blockDim.x; } } -} +}; /** * @brief Test if the given page is in a string column diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 965b6ddb0ee..d7e77e3336e 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -543,10 +543,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz bool const is_bounds_pg = is_bounds_page(s, min_row, num_rows); // if we're skipping this page anyway, no need to count it - if (!is_bounds_pg && !is_page_contained(s, min_row, num_rows)) { - restore_decode_cache(s); // TODO is this necessary? - return; - } + if (!is_bounds_pg && !is_page_contained(s, min_row, num_rows)) { return; } // find start/end value indices auto const [start_value, end_value] = @@ -597,8 +594,6 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // TODO check for overflow pp->str_bytes = str_bytes; } - // TODO: is this necessary? 
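// [editor's sketch] null_count_back_copier leans on C++ RAII: the destructor runs on
// every exit path, so the early returns added to these kernels still flush the cached
// null counts without an explicit restore_decode_cache() call before each return (the
// real copier also checks nesting_info != nullptr in case setupLocalPageInfo failed).
// A host-side reduction of the idea, with illustrative names:
struct flush_on_exit {
  int* dst;
  int const* src;
  ~flush_on_exit() { *dst = *src; }  // runs for early and normal returns alike
};
void decode(bool early_out, int& global_null_count)
{
  int cached      = 1;
  flush_on_exit _ = {&global_null_count, &cached};
  if (early_out) { return; }  // destructor still writes 1 to global_null_count
  cached = 42;                // normal path writes 42 on scope exit
}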
- restore_decode_cache(s); } /** @@ -629,6 +624,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( page_state_buffers_s* const sb = &state_buffers; int const page_idx = blockIdx.x; int const t = threadIdx.x; + [[maybe_unused]] null_count_back_copier _{s, t}; // set during string copy by lane 0 int first_non_null = -1; @@ -653,7 +649,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( // if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) || is_page_contained(s, min_row, num_rows)))) { - restore_decode_cache(s); return; } @@ -828,7 +823,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( } __syncthreads(); } - restore_decode_cache(s); } /** @@ -859,6 +853,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( page_state_buffers_s* const sb = &state_buffers; int const page_idx = blockIdx.x; int const t = threadIdx.x; + [[maybe_unused]] null_count_back_copier _{s, t}; // set during string copy by lane 0 int first_non_null = -1; @@ -883,7 +878,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( // if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) || is_page_contained(s, min_row, num_rows)))) { - restore_decode_cache(s); return; } @@ -1049,7 +1043,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( } __syncthreads(); } - restore_decode_cache(s); } } // anonymous namespace From 1e18f1cd90a1455d264f23604ec566ac8c7c310f Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 22 May 2023 14:30:56 -0700 Subject: [PATCH 072/114] fix for page spanning rows --- cpp/src/io/parquet/page_data.cu | 7 +++-- cpp/src/io/parquet/page_decode.cuh | 11 ++++++-- cpp/src/io/parquet/page_string_decode.cu | 35 ++++++++++++++++++------ 3 files changed, 39 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index a70e6c891f4..219a55bcb4a 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -722,7 +722,7 @@ __global__ void __launch_bounds__(preprocess_block_size) // in the trim pass, for anything with lists, we only need to fully process bounding pages // (those at the beginning or the end of the row bounds) - if (!is_base_pass && !is_bounds_page(s, min_row, num_rows)) { + if (!is_base_pass && !is_bounds_page(s, min_row, num_rows, has_repetition)) { int depth = 0; while (depth < s->page.num_output_nesting_levels) { auto const thread_depth = depth + t; @@ -847,8 +847,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( // row start row end // P1 will contain 0 rows // - if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) || - is_page_contained(s, min_row, num_rows)))) { + if (s->num_rows == 0 && + !(has_repetition && (is_bounds_page(s, min_row, num_rows, has_repetition) || + is_page_contained(s, min_row, num_rows)))) { return; } diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index bb1c28752df..ada76e24210 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -127,17 +127,24 @@ constexpr bool is_string_col(PageInfo const& page, device_spancol.start_row + s->page.chunk_row; size_t const page_end = page_begin + s->page.num_rows; size_t const begin = start_row; size_t const end = start_row + num_rows; - return ((page_begin < begin && page_end > begin) || (page_begin < 
end && page_end > end)); + // for non-nested schemas, rows cannot span pages, so use a more restrictive test + return has_repetition + ? ((page_begin <= begin && page_end >= begin) || (page_begin <= end && page_end >= end)) + : ((page_begin < begin && page_end > begin) || (page_begin < end && page_end > end)); } /** diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d7e77e3336e..08a3399bc81 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -179,7 +179,8 @@ __device__ thrust::pair page_bounds(page_state_s* const s, auto const page_rows = page_start_row + begin_row + max_page_rows <= max_row ? max_page_rows : max_row - (page_start_row + begin_row); - auto const end_row = begin_row + page_rows; + auto end_row = begin_row + page_rows; + int row_fudge = -1; // short circuit for no nulls if (max_def == 0 && !has_repetition) { return {begin_row, end_row}; } @@ -195,6 +196,17 @@ __device__ thrust::pair page_bounds(page_state_s* const s, if (has_repetition) { decoders[level_type::REPETITION].decode_next(t); __syncthreads(); + + // special case where page does not begin at a row boundary + if (processed == 0 && rep_decode[0] != 0) { + if (t == 0) { + skipped_values = 0; + skipped_leaf_values = 0; + } + skipped_values_set = true; + end_row++; // need to finish off the previous row + row_fudge = 0; + } } // the # of rep/def levels will always be the same size @@ -223,7 +235,7 @@ __device__ thrust::pair page_bounds(page_state_s* const s, // if we have not set skipped values yet, see if we found the first in-bounds row if (!skipped_values_set && row_count + block_row_count > begin_row) { // if this thread is in row bounds - int const row_index = (thread_row_count + row_count) - 1; + int const row_index = thread_row_count + row_count - 1; int const in_row_bounds = idx_t < processed && (row_index >= begin_row) && (row_index < end_row); @@ -247,8 +259,9 @@ __device__ thrust::pair page_bounds(page_state_s* const s, // test if row_count will exceed end_row in this batch if (!end_value_set && row_count + block_row_count >= end_row) { - // if this thread exceeds row bounds - int const row_index = (thread_row_count + row_count) - 1; + // if this thread exceeds row bounds. row_fudge change depending on whether we've faked + // the end row to account for starting a page in the middle of a row. + int const row_index = thread_row_count + row_count + row_fudge; int const exceeds_row_bounds = row_index >= end_row; int local_count, global_count; @@ -264,6 +277,7 @@ __device__ thrust::pair page_bounds(page_state_s* const s, end_val_idx = leaf_count + (is_new_leaf ? thread_leaf_count - 1 : thread_leaf_count); } end_value_set = true; + break; } } @@ -273,6 +287,7 @@ __device__ thrust::pair page_bounds(page_state_s* const s, start_val += preprocess_block_size; } __syncthreads(); + if (end_value_set) { break; } } start_value = skipped_values_set ? 
skipped_leaf_values : 0; @@ -540,7 +555,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz } __syncthreads(); - bool const is_bounds_pg = is_bounds_page(s, min_row, num_rows); + bool const is_bounds_pg = is_bounds_page(s, min_row, num_rows, has_repetition); // if we're skipping this page anyway, no need to count it if (!is_bounds_pg && !is_page_contained(s, min_row, num_rows)) { return; } @@ -647,8 +662,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( // row start row end // P1 will contain 0 rows // - if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) || - is_page_contained(s, min_row, num_rows)))) { + if (s->num_rows == 0 && + !(has_repetition && (is_bounds_page(s, min_row, num_rows, has_repetition) || + is_page_contained(s, min_row, num_rows)))) { return; } @@ -876,8 +892,9 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( // row start row end // P1 will contain 0 rows // - if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) || - is_page_contained(s, min_row, num_rows)))) { + if (s->num_rows == 0 && + !(has_repetition && (is_bounds_page(s, min_row, num_rows, has_repetition) || + is_page_contained(s, min_row, num_rows)))) { return; } From 681d57df9c886ba1d5beb3fadb3371007f654079 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 22 May 2023 17:20:26 -0700 Subject: [PATCH 073/114] undo some reformatting of comments --- cpp/src/io/parquet/page_data.cu | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 219a55bcb4a..1dcf523b458 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -704,9 +704,9 @@ __global__ void __launch_bounds__(preprocess_block_size) // early out optimizations: - // - if this is a flat hierarchy (no lists) and is not a string column. in this case we don't - // need to do the expensive work of traversing the level data to determine sizes. we can just - // compute it directly. + // - if this is a flat hierarchy (no lists) and is not a string column. in this case we don't need + // to do the expensive work of traversing the level data to determine sizes. we can just compute + // it directly. if (!has_repetition && !compute_string_sizes) { int depth = 0; while (depth < s->page.num_output_nesting_levels) { @@ -720,8 +720,8 @@ __global__ void __launch_bounds__(preprocess_block_size) return; } - // in the trim pass, for anything with lists, we only need to fully process bounding pages - // (those at the beginning or the end of the row bounds) + // in the trim pass, for anything with lists, we only need to fully process bounding pages (those + // at the beginning or the end of the row bounds) if (!is_base_pass && !is_bounds_page(s, min_row, num_rows, has_repetition)) { int depth = 0; while (depth < s->page.num_output_nesting_levels) { @@ -919,13 +919,13 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( // if (!has_repetition) { dst_pos -= s->first_row; } - // target_pos will always be properly bounded by num_rows, but dst_pos may be negative - // (values before first_row) in the flat hierarchy case. + // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values + // before first_row) in the flat hierarchy case. if (src_pos < target_pos && dst_pos >= 0) { // src_pos represents the logical row position we want to read from. 
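// [editor's sketch, re: the page_bounds change in the previous commit] in Dremel
// encoding a repetition level of 0 starts a new top-level row, so a page whose first
// repetition level is nonzero is continuing a row begun on the prior page; that is
// the case the skipped_values / end_row / row_fudge adjustment above handles.
// Hypothetical helpers, not from the patch:
#include <vector>
bool starts_mid_row(std::vector<int> const& rep) { return not rep.empty() and rep[0] != 0; }
int rows_begun(std::vector<int> const& rep)
{
  int n = 0;
  for (int r : rep) { n += (r == 0); }  // each level-0 entry opens a new row
  return n;
}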
But in the case of - // nested hierarchies, there is no 1:1 mapping of rows to values. So our true read - // position has to take into account the # of values we have to skip in the page to get to - // the desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. + // nested hierarchies, there is no 1:1 mapping of rows to values. So our true read position + // has to take into account the # of values we have to skip in the page to get to the + // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. uint32_t val_src_pos = src_pos + skipped_leaf_values; // nesting level that is storing actual leaf values From d8bb072969df1eb5f3ea46e27396c2d9d897d33f Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 22 May 2023 18:03:44 -0700 Subject: [PATCH 074/114] change make_column to make_string_column --- cpp/src/io/utilities/column_buffer.cpp | 10 +++++----- cpp/src/io/utilities/column_buffer.hpp | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index a57ed798586..91fbb75422a 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -74,7 +74,8 @@ void column_buffer_with_pointers::create(size_type _size, } } -std::unique_ptr column_buffer_with_pointers::make_column(rmm::cuda_stream_view stream) +std::unique_ptr column_buffer_with_pointers::make_string_column( + rmm::cuda_stream_view stream) { // make_strings_column allocates new memory, it does not simply move // from the inputs, so we need to pass it the memory resource given to @@ -99,10 +100,9 @@ void column_buffer_with_strings::create_string_data(size_t num_bytes, rmm::cuda_ _string_data = rmm::device_buffer(num_bytes, stream, mr); } -std::unique_ptr column_buffer_with_strings::make_column(rmm::cuda_stream_view stream) +std::unique_ptr column_buffer_with_strings::make_string_column(rmm::cuda_stream_view stream) { // no need for copies, just transfer ownership of the data_buffers to the columns - auto const& mr = _string_data.memory_resource(); auto const state = mask_state::UNALLOCATED; auto str_col = _string_data.size() == 0 @@ -183,10 +183,10 @@ std::unique_ptr make_column(utilities::column_buffer& buf // from the inputs, so we need to pass it the memory resource given to // the buffer on construction so that the memory is allocated using the // resource that the calling code expected. 
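// [editor's note] the rename to make_string_column in this commit highlights the two
// materialization strategies: the pointer-pair policy must gather and copy, while the
// with-strings policy can move its buffers straight into the column, because a strings
// column is just one chars buffer plus N + 1 offsets, row i spanning
// [offsets[i], offsets[i+1]). Worked layout for {"cat", "", "dogs"}:
char const chars[]  = {'c', 'a', 't', 'd', 'o', 'g', 's'};
int const offsets[] = {0, 3, 3, 7};  // empty row 1 has offsets[1] == offsets[2]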
- return buffer.make_column(stream); + return buffer.make_string_column(stream); } else { // convert to binary - auto const string_col = buffer.make_column(stream); + auto const string_col = buffer.make_string_column(stream); auto const num_rows = string_col->size(); auto const null_count = string_col->null_count(); auto col_content = string_col->release(); diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 41695ef02a1..c23553c9c56 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -120,7 +120,7 @@ struct column_buffer_with_pointers : public column_buffer_base { void* string_data() { CUDF_FAIL("method not implemented for type"); } size_t string_size() const { CUDF_FAIL("method not implemented for type"); } - std::unique_ptr make_column(rmm::cuda_stream_view stream); + std::unique_ptr make_string_column(rmm::cuda_stream_view stream); std::unique_ptr> _strings; }; @@ -144,7 +144,7 @@ struct column_buffer_with_strings : public column_buffer_base { void* string_data() { return _string_data.data(); } size_t string_size() const { return _string_data.size(); } - std::unique_ptr make_column(rmm::cuda_stream_view stream); + std::unique_ptr make_string_column(rmm::cuda_stream_view stream); rmm::device_buffer _string_data{}; }; From 2c596f91cca4747427faadf13d81cd114151d713 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 23 May 2023 09:25:59 -0700 Subject: [PATCH 075/114] move gpuDecodeRleBooleans --- cpp/src/io/parquet/page_data.cu | 67 ------------------------------ cpp/src/io/parquet/page_decode.cuh | 67 ++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 67 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 1dcf523b458..595c56cc446 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -27,73 +27,6 @@ namespace gpu { namespace { -/** - * @brief Performs RLE decoding of dictionary indexes, for when dict_size=1 - * - * @param[in,out] s Page state input/output - * @param[out] sb Page state buffer output - * @param[in] target_pos Target write position - * @param[in] t Thread ID - * - * @return The new output position - */ -__device__ int gpuDecodeRleBooleans(volatile page_state_s* s, - volatile page_state_buffers_s* sb, - int target_pos, - int t) -{ - const uint8_t* end = s->data_end; - int pos = s->dict_pos; - - while (pos < target_pos) { - int is_literal, batch_len; - if (!t) { - uint32_t run = s->dict_run; - const uint8_t* cur = s->data_start; - if (run <= 1) { - run = (cur < end) ? get_vlq32(cur, end) : 0; - if (!(run & 1)) { - // Repeated value - s->dict_val = (cur < end) ? cur[0] & 1 : 0; - cur++; - } - } - if (run & 1) { - // Literal batch: must output a multiple of 8, except for the last batch - int batch_len_div8; - batch_len = max(min(32, (int)(run >> 1) * 8), 1); - if (batch_len >= 8) { batch_len &= ~7; } - batch_len_div8 = (batch_len + 7) >> 3; - run -= batch_len_div8 * 2; - cur += batch_len_div8; - } else { - batch_len = max(min(32, (int)(run >> 1)), 1); - run -= batch_len * 2; - } - s->dict_run = run; - s->data_start = cur; - is_literal = run & 1; - __threadfence_block(); - } - __syncwarp(); - is_literal = shuffle(is_literal); - batch_len = shuffle(batch_len); - if (t < batch_len) { - int dict_idx; - if (is_literal) { - int32_t ofs = t - ((batch_len + 7) & ~7); - const uint8_t* p = s->data_start + (ofs >> 3); - dict_idx = (p < end) ? 
(p[0] >> (ofs & 7u)) & 1 : 0; - } else { - dict_idx = s->dict_val; - } - sb->dict_idx[rolling_index(pos + t)] = dict_idx; - } - pos += batch_len; - } - return pos; -} - /** * @brief Output a string descriptor * diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index ada76e24210..9b0e7a8b643 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -324,6 +324,73 @@ __device__ cuda::std::pair gpuDecodeDictionaryIndices( return {pos, str_len}; } +/** + * @brief Performs RLE decoding of dictionary indexes, for when dict_size=1 + * + * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output + * @param[in] target_pos Target write position + * @param[in] t Thread ID + * + * @return The new output position + */ +__device__ int gpuDecodeRleBooleans(volatile page_state_s* s, + volatile page_state_buffers_s* sb, + int target_pos, + int t) +{ + const uint8_t* end = s->data_end; + int pos = s->dict_pos; + + while (pos < target_pos) { + int is_literal, batch_len; + if (!t) { + uint32_t run = s->dict_run; + const uint8_t* cur = s->data_start; + if (run <= 1) { + run = (cur < end) ? get_vlq32(cur, end) : 0; + if (!(run & 1)) { + // Repeated value + s->dict_val = (cur < end) ? cur[0] & 1 : 0; + cur++; + } + } + if (run & 1) { + // Literal batch: must output a multiple of 8, except for the last batch + int batch_len_div8; + batch_len = max(min(32, (int)(run >> 1) * 8), 1); + if (batch_len >= 8) { batch_len &= ~7; } + batch_len_div8 = (batch_len + 7) >> 3; + run -= batch_len_div8 * 2; + cur += batch_len_div8; + } else { + batch_len = max(min(32, (int)(run >> 1)), 1); + run -= batch_len * 2; + } + s->dict_run = run; + s->data_start = cur; + is_literal = run & 1; + __threadfence_block(); + } + __syncwarp(); + is_literal = shuffle(is_literal); + batch_len = shuffle(batch_len); + if (t < batch_len) { + int dict_idx; + if (is_literal) { + int32_t ofs = t - ((batch_len + 7) & ~7); + const uint8_t* p = s->data_start + (ofs >> 3); + dict_idx = (p < end) ? 
(p[0] >> (ofs & 7u)) & 1 : 0; + } else { + dict_idx = s->dict_val; + } + sb->dict_idx[rolling_index(pos + t)] = dict_idx; + } + pos += batch_len; + } + return pos; +} + /** * @brief Parses the length and position of strings and returns total length of all strings * processed From 06521bd188c80174fe5efe69feeeb4b95261e6d7 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 23 May 2023 12:09:55 -0700 Subject: [PATCH 076/114] remove t from docs --- cpp/src/io/parquet/page_string_decode.cu | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 08a3399bc81..f4d4029f2da 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -104,7 +104,6 @@ __device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t * @param is_bounds_pg True if this page is clipped * @param has_repetition True if the schema is nested * @param decoders Definition and repetition level decoders - * @param t Thread index * @return pair containing start and end value indexes */ template @@ -346,7 +345,6 @@ __device__ thrust::pair page_bounds(page_state_s* const s, * @param data_size Size of the page data in bytes * @param start_value Do not count values that occur before this index * @param end_value Do not count values that occur after this index - * @param t Thread index */ __device__ size_t countDictEntries(uint8_t const* data, uint8_t const* dict_base, @@ -473,7 +471,6 @@ __device__ size_t countDictEntries(uint8_t const* data, * @param data_size Length of data * @param start_value Do not count values that occur before this index * @param end_value Do not count values that occur after this index - * @param t Thread index */ __device__ size_t countPlainEntries(uint8_t const* data, int data_size, From 6625690d011d848d3c49cbfaf6330e8d5de48176 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 24 May 2023 18:53:18 -0700 Subject: [PATCH 077/114] CRTP, I think --- cpp/src/io/parquet/reader_impl.cpp | 4 +- cpp/src/io/parquet/reader_impl.hpp | 5 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 15 +- cpp/src/io/parquet/reader_impl_helpers.hpp | 15 +- cpp/src/io/utilities/column_buffer.cpp | 74 +++------- cpp/src/io/utilities/column_buffer.hpp | 156 ++++++++++++--------- 6 files changed, 125 insertions(+), 144 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 4f01f1f18db..6ca9b4806cd 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -284,7 +284,7 @@ reader::impl::impl(std::size_t chunk_read_limit, // Don't need to do it if we read the file all at once. 
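// [editor's sketch] the chunked-read path below keeps a data-less template of the
// output buffer tree and stamps out fresh buffers for each chunk. empty_like copies
// only the metadata (type, nullability, name, user_data) and recurses into children,
// never the device allocations. Reduced illustration with a stand-in struct, not the
// real column_buffer:
#include <string>
#include <vector>
struct buf {
  int type{};
  bool nullable{};
  std::string name;
  std::vector<buf> children;
};
buf empty_like(buf const& in)
{
  buf out{in.type, in.nullable, in.name, {}};
  for (auto const& c : in.children) { out.children.push_back(empty_like(c)); }
  return out;
}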
if (_chunk_read_limit > 0) { for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(column_buffer::empty_like(buff)); + _output_buffers_template.emplace_back(column_buffer_with_strings::empty_like(buff)); } } } @@ -402,7 +402,7 @@ table_with_metadata reader::impl::read_chunk() if (_chunk_read_limit > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { - _output_buffers.emplace_back(column_buffer::empty_like(buff)); + _output_buffers.emplace_back(column_buffer_with_strings::empty_like(buff)); } } diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 8386026b85c..9f00a5ce78b 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -36,6 +36,7 @@ #include namespace cudf::io::detail::parquet { + /** * @brief Implementation for Parquet reader */ @@ -238,10 +239,10 @@ class reader::impl { std::vector _input_columns; // Buffers for generating output columns - std::vector _output_buffers; + std::vector _output_buffers; // Buffers copied from `_output_buffers` after construction for reuse - std::vector _output_buffers_template; + std::vector _output_buffers_template; // _output_buffers associated schema indices std::vector _output_column_schemas; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 1f70a7afdfe..00e762056ff 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -383,7 +383,9 @@ aggregate_reader_metadata::select_row_groups( return {rows_to_skip, rows_to_read, std::move(selection)}; } -std::tuple, std::vector, std::vector> +std::tuple, + std::vector, + std::vector> aggregate_reader_metadata::select_columns(std::optional> const& use_names, bool include_index, bool strings_to_categorical, @@ -400,17 +402,17 @@ aggregate_reader_metadata::select_columns(std::optional : -1; }; - std::vector output_columns; + std::vector output_columns; std::vector input_columns; std::vector nesting; // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is // valid if "struct1.child1" exists in this file's schema. 
If "struct1" exists but "child1" is // not a child of "struct1" then the function will return false for "struct1" - std::function&, bool)> + std::function&, bool)> build_column = [&](column_name_info const* col_name_info, int schema_idx, - std::vector& out_col_array, + std::vector& out_col_array, bool has_list_parent) { if (schema_idx < 0) { return false; } auto const& schema_elem = get_schema(schema_idx); @@ -431,7 +433,7 @@ aggregate_reader_metadata::select_columns(std::optional : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const dtype = to_data_type(col_type, schema_elem); - column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + column_buffer_with_strings output_col(dtype, schema_elem.repetition_type == OPTIONAL); if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } // store the index of this element if inserted in out_col_array nesting.push_back(static_cast(out_col_array.size())); @@ -471,7 +473,8 @@ aggregate_reader_metadata::select_columns(std::optional to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const element_dtype = to_data_type(element_type, schema_elem); - column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); + column_buffer_with_strings element_col(element_dtype, + schema_elem.repetition_type == OPTIONAL); if (has_list_parent || col_type == type_id::LIST) { element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 0e3b1a20d96..d98b83c7c77 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -29,8 +29,6 @@ namespace cudf::io::detail::parquet { using namespace cudf::io::parquet; -using string_type = cudf::io::detail::utilities::column_buffer_with_strings; -using column_buffer = cudf::io::detail::utilities::column_buffer; /** * @brief Function that translates Parquet datatype to cuDF type enum @@ -189,12 +187,13 @@ class aggregate_reader_metadata { * @return input column information, output column information, list of output column schema * indices */ - [[nodiscard]] std:: - tuple, std::vector, std::vector> - select_columns(std::optional> const& use_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) const; + [[nodiscard]] std::tuple, + std::vector, + std::vector> + select_columns(std::optional> const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const; }; } // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 91fbb75422a..c5762e836cb 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -27,40 +27,9 @@ #include namespace cudf::io::detail { -namespace utilities { -void column_buffer_base::create(size_type _size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* _mr) +void column_buffer_with_pointers::allocate_strings_data(rmm::cuda_stream_view stream) { - size = _size; - mr = _mr; - - switch (type.id()) { - case type_id::STRING: - // will be handled by children - break; - - // list columns store a buffer of int32's as offsets to represent - // their individual rows - case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, mr); break; - - // struct columns store no data themselves. just validity and children. 
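// [editor's sketch] the create() logic moving out of this file allocates the data
// buffer by type id: LIST gets an INT32 offsets buffer, STRUCT gets no data buffer at
// all (validity and children only), STRING defers to the string policy, and everything
// else is fixed width; a null mask is allocated separately when the column is
// nullable. Hypothetical summary with an illustrative enum, not patch code:
#include <cstddef>
#include <cstdint>
enum class kind { list, strukt, string, fixed };
std::size_t data_bytes(kind k, std::size_t size, std::size_t elem_size)
{
  switch (k) {
    case kind::list:   return size * sizeof(int32_t);  // row offsets
    case kind::strukt: return 0;                       // children carry the data
    case kind::string: return 0;                       // policy decides later
    default:           return size * elem_size;        // fixed-width elements
  }
}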
- case type_id::STRUCT: break; - - default: _data = create_data(type, size, stream, mr); break; - } - if (is_nullable) { - _null_mask = - cudf::detail::create_null_mask(size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); - } -} - -void column_buffer_with_pointers::create(size_type _size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* _mr) -{ - column_buffer_base::create(_size, stream, _mr); if (type.id() == type_id::STRING) { // The contents of _strings will never be directly returned to the user. // Due to the fact that make_strings_column copies the input data to @@ -74,7 +43,7 @@ void column_buffer_with_pointers::create(size_type _size, } } -std::unique_ptr column_buffer_with_pointers::make_string_column( +std::unique_ptr column_buffer_with_pointers::make_string_column_impl( rmm::cuda_stream_view stream) { // make_strings_column allocates new memory, it does not simply move @@ -84,11 +53,8 @@ std::unique_ptr column_buffer_with_pointers::make_string_column( return make_strings_column(*_strings, stream, mr); } -void column_buffer_with_strings::create(size_type _size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* _mr) +void column_buffer_with_strings::allocate_strings_data(rmm::cuda_stream_view stream) { - column_buffer_base::create(_size, stream, _mr); if (type.id() == type_id::STRING) { // size + 1 for final offset. _string_data will be initialized later. _data = create_data(data_type{type_id::INT32}, size + 1, stream, mr); @@ -100,7 +66,8 @@ void column_buffer_with_strings::create_string_data(size_t num_bytes, rmm::cuda_ _string_data = rmm::device_buffer(num_bytes, stream, mr); } -std::unique_ptr column_buffer_with_strings::make_string_column(rmm::cuda_stream_view stream) +std::unique_ptr column_buffer_with_strings::make_string_column_impl( + rmm::cuda_stream_view stream) { // no need for copies, just transfer ownership of the data_buffers to the columns auto const state = mask_state::UNALLOCATED; @@ -134,14 +101,12 @@ namespace { * @param new_buff The new output buffer */ template -void copy_buffer_data(column_buffer const& buff, - column_buffer& new_buff) +void copy_buffer_data(string_policy const& buff, string_policy& new_buff) { new_buff.name = buff.name; new_buff.user_data = buff.user_data; for (auto const& child : buff.children) { - auto& new_child = - new_buff.children.emplace_back(column_buffer(child.type, child.is_nullable)); + auto& new_child = new_buff.children.emplace_back(string_policy(child.type, child.is_nullable)); copy_buffer_data(child, new_child); } } @@ -149,22 +114,15 @@ void copy_buffer_data(column_buffer const& buff, } // namespace template -column_buffer column_buffer::empty_like( - column_buffer const& input) +string_policy column_buffer_base::empty_like(string_policy const& input) { - auto new_buff = column_buffer(input.type, input.is_nullable); + auto new_buff = string_policy(input.type, input.is_nullable); copy_buffer_data(input, new_buff); return new_buff; } -// force instantiation of both column_buffers -template class column_buffer; -template class column_buffer; - -} // namespace utilities - template -std::unique_ptr make_column(utilities::column_buffer& buffer, +std::unique_ptr make_column(column_buffer_base& buffer, column_name_info* schema_info, std::optional const& schema, rmm::cuda_stream_view stream) @@ -287,7 +245,7 @@ std::unique_ptr make_column(utilities::column_buffer& buf * @copydoc cudf::io::detail::empty_like */ template -std::unique_ptr empty_like(utilities::column_buffer& buffer, 
+std::unique_ptr empty_like(column_buffer_base& buffer, column_name_info* schema_info, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -340,11 +298,11 @@ std::unique_ptr empty_like(utilities::column_buffer& buff } } -using pointer_type = utilities::column_buffer_with_pointers; -using string_type = utilities::column_buffer_with_strings; +using pointer_type = column_buffer_with_pointers; +using string_type = column_buffer_with_strings; -using pointer_column_buffer = utilities::column_buffer; -using string_column_buffer = utilities::column_buffer; +using pointer_column_buffer = column_buffer_base; +using string_column_buffer = column_buffer_base; template std::unique_ptr make_column( string_column_buffer& buffer, @@ -368,4 +326,6 @@ template std::unique_ptr empty_like(pointer_column_buffer& rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +template class column_buffer_base; +template class column_buffer_base; } // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index c23553c9c56..108006b0a53 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -62,19 +62,54 @@ inline rmm::device_buffer create_data(data_type type, using string_index_pair = thrust::pair; -namespace utilities { - -struct column_buffer_base { +template +class column_buffer_base { + public: column_buffer_base() = default; // construct without a known size. call create() later to actually allocate memory column_buffer_base(data_type _type, bool _is_nullable) : type(_type), is_nullable(_is_nullable) {} + column_buffer_base(data_type _type, + size_type _size, + bool _is_nullable, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : column_buffer_base(_type, _is_nullable) + { + create(_size, stream, mr); + } + // instantiate a column of known type with a specified size. Allows deferred creation for // preprocessing steps such as in the Parquet reader - virtual void create(size_type _size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* _mr); + void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* _mr) + { + size = _size; + mr = _mr; + + switch (type.id()) { + case type_id::STRING: + return static_cast(this)->allocate_strings_data(stream); + break; + + // list columns store a buffer of int32's as offsets to represent + // their individual rows + case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, mr); break; + + // struct columns store no data themselves. just validity and children. + case type_id::STRUCT: break; + + default: _data = create_data(type, size, stream, mr); break; + } + if (is_nullable) { + _null_mask = cudf::detail::create_null_mask( + size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); + } + } + + // Create a new column_buffer that has empty data but with the same basic information as the + // input column, including same type, nullability, name, and user_data. 
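// [editor's sketch] this is the CRTP named in the commit message at work:
// column_buffer_base<string_policy> downcasts `this` to the derived policy, so the
// string-specific hook resolves at compile time with no virtual dispatch. Reduced
// illustration with illustrative names, not the real classes:
template <typename policy>
struct buffer_base {
  void create() { static_cast<policy*>(this)->allocate_strings_data(); }
};
struct inline_policy : buffer_base<inline_policy> {
  void allocate_strings_data() { /* allocate offsets + chars directly */ }
};
struct gather_policy : buffer_base<gather_policy> {
  void allocate_strings_data() { /* allocate (pointer, length) pairs */ }
};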
+ static string_policy empty_like(string_policy const& input); template auto null_mask() @@ -82,9 +117,17 @@ struct column_buffer_base { return static_cast(_null_mask.data()); } auto null_mask_size() { return _null_mask.size(); } - auto& null_count() { return _null_count; } + auto data() { return static_cast(this)->data_impl(); } + auto data_size() { return static_cast(this)->data_size_impl(); } + + std::unique_ptr make_string_column(rmm::cuda_stream_view stream) + { + return static_cast(this)->make_string_column_impl(stream); + } + + public: rmm::device_buffer _data{}; rmm::device_buffer _null_mask{}; size_type _null_count{0}; @@ -96,99 +139,74 @@ struct column_buffer_base { std::string name; rmm::mr::device_memory_resource* mr; + + std::vector children; }; -struct column_buffer_with_pointers : public column_buffer_base { +class column_buffer_with_pointers : public column_buffer_base { + public: column_buffer_with_pointers() = default; // construct without a known size. call create() later to actually allocate memory column_buffer_with_pointers(data_type _type, bool _is_nullable) - : column_buffer_base(_type, _is_nullable) + : column_buffer_base(_type, _is_nullable) { } - void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* _mr); - - void* data() { return _strings ? _strings->data() : _data.data(); } - size_t data_size() const { return _strings ? _strings->size() : _data.size(); } - - void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) + column_buffer_with_pointers(data_type _type, + size_type _size, + bool _is_nullable, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : column_buffer_base(_type, _size, _is_nullable, stream, mr) { - CUDF_FAIL("method not implemented for type"); } - void* string_data() { CUDF_FAIL("method not implemented for type"); } - size_t string_size() const { CUDF_FAIL("method not implemented for type"); } + void allocate_strings_data(rmm::cuda_stream_view stream); - std::unique_ptr make_string_column(rmm::cuda_stream_view stream); + void* data_impl() { return _strings ? _strings->data() : _data.data(); } + size_t data_size_impl() const { return _strings ? _strings->size() : _data.size(); } + std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream); + + public: std::unique_ptr> _strings; }; -struct column_buffer_with_strings : public column_buffer_base { +class column_buffer_with_strings : public column_buffer_base { + public: column_buffer_with_strings() = default; // construct without a known size. 
call create() later to actually allocate memory column_buffer_with_strings(data_type _type, bool _is_nullable) - : column_buffer_base(_type, _is_nullable) + : column_buffer_base(_type, _is_nullable) { } - void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* _mr); + column_buffer_with_strings(data_type _type, + size_type _size, + bool _is_nullable, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : column_buffer_base(_type, _size, _is_nullable, stream, mr) + { + } - void* data() { return _data.data(); } - size_t data_size() const { return _data.size(); } + void allocate_strings_data(rmm::cuda_stream_view stream); - void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream); + void* data_impl() { return _data.data(); } + size_t data_size_impl() const { return _data.size(); } + std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream); + void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream); void* string_data() { return _string_data.data(); } size_t string_size() const { return _string_data.size(); } - std::unique_ptr make_string_column(rmm::cuda_stream_view stream); - + private: rmm::device_buffer _string_data{}; }; -/** - * @brief Class for holding device memory buffers to column data that eventually - * will be used to create a column. - */ -template -struct column_buffer : string_policy { - column_buffer() = default; - - // construct without a known size. call create() later to actually allocate memory - column_buffer(data_type _type, bool _is_nullable) : string_policy(_type, _is_nullable) {} - - // construct with a known size. allocates memory - column_buffer(data_type _type, - size_type _size, - bool _is_nullable, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : string_policy(_type, _is_nullable) - { - string_policy::create(_size, stream, mr); - } - - // move constructor - column_buffer(column_buffer&& col) = default; - column_buffer& operator=(column_buffer&& col) = default; - - // copy constructor - column_buffer(column_buffer const& col) = delete; - column_buffer& operator=(column_buffer const& col) = delete; - - // Create a new column_buffer that has empty data but with the same basic information as the - // input column, including same type, nullability, name, and user_data. - static column_buffer empty_like(column_buffer const& input); - - std::vector> children; -}; - -} // namespace utilities - -using column_buffer = utilities::column_buffer; +using column_buffer = column_buffer_with_pointers; /** * @brief Creates a column from an existing set of device memory buffers. 
@@ -202,7 +220,7 @@ using column_buffer = utilities::column_buffer` Column from the existing device data */ template -std::unique_ptr make_column(utilities::column_buffer& buffer, +std::unique_ptr make_column(column_buffer_base& buffer, column_name_info* schema_info, std::optional const& schema, rmm::cuda_stream_view stream); @@ -223,7 +241,7 @@ std::unique_ptr make_column(utilities::column_buffer& buf * @return `std::unique_ptr` Column from the existing device data */ template -std::unique_ptr empty_like(utilities::column_buffer& buffer, +std::unique_ptr empty_like(column_buffer_base& buffer, column_name_info* schema_info, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From d742498541d05550dfc05eb052f2cbbed9c6b97d Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 25 May 2023 13:40:17 -0700 Subject: [PATCH 078/114] checkpoint CRTP changes --- cpp/src/io/orc/reader_impl.cu | 6 +- cpp/src/io/parquet/reader_impl.cpp | 4 +- cpp/src/io/parquet/reader_impl.hpp | 4 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 13 +-- cpp/src/io/parquet/reader_impl_helpers.hpp | 13 +-- cpp/src/io/utilities/column_buffer.cpp | 30 +++-- cpp/src/io/utilities/column_buffer.hpp | 121 +++++++++++++-------- 7 files changed, 106 insertions(+), 85 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 1561737da48..07e1bc2dd7f 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -521,13 +521,13 @@ void update_null_mask(cudf::detail::hostdevice_2dvector& chunks }; }); - out_buffers[col_idx]._null_mask = std::move(merged_null_mask); + out_buffers[col_idx].set_null_mask(std::move(merged_null_mask)); } else { // Since child column doesn't have a mask, copy parent null mask auto mask_size = bitmask_allocation_size_bytes(parent_mask_len); - out_buffers[col_idx]._null_mask = - rmm::device_buffer(static_cast(parent_valid_map_base), mask_size, stream, mr); + out_buffers[col_idx].set_null_mask( + rmm::device_buffer(static_cast(parent_valid_map_base), mask_size, stream, mr)); } } } diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 6ca9b4806cd..73932baf958 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -284,7 +284,7 @@ reader::impl::impl(std::size_t chunk_read_limit, // Don't need to do it if we read the file all at once. 
if (_chunk_read_limit > 0) { for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(column_buffer_with_strings::empty_like(buff)); + _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); } } } @@ -402,7 +402,7 @@ table_with_metadata reader::impl::read_chunk() if (_chunk_read_limit > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { - _output_buffers.emplace_back(column_buffer_with_strings::empty_like(buff)); + _output_buffers.emplace_back(inline_column_buffer::empty_like(buff)); } } diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 9f00a5ce78b..3d8c71c6c63 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -239,10 +239,10 @@ class reader::impl { std::vector _input_columns; // Buffers for generating output columns - std::vector _output_buffers; + std::vector _output_buffers; // Buffers copied from `_output_buffers` after construction for reuse - std::vector _output_buffers_template; + std::vector _output_buffers_template; // _output_buffers associated schema indices std::vector _output_column_schemas; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 00e762056ff..e7122b503ea 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -384,7 +384,7 @@ aggregate_reader_metadata::select_row_groups( } std::tuple, - std::vector, + std::vector, std::vector> aggregate_reader_metadata::select_columns(std::optional> const& use_names, bool include_index, @@ -402,17 +402,17 @@ aggregate_reader_metadata::select_columns(std::optional : -1; }; - std::vector output_columns; + std::vector output_columns; std::vector input_columns; std::vector nesting; // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is // valid if "struct1.child1" exists in this file's schema. 
If "struct1" exists but "child1" is // not a child of "struct1" then the function will return false for "struct1" - std::function&, bool)> + std::function&, bool)> build_column = [&](column_name_info const* col_name_info, int schema_idx, - std::vector& out_col_array, + std::vector& out_col_array, bool has_list_parent) { if (schema_idx < 0) { return false; } auto const& schema_elem = get_schema(schema_idx); @@ -433,7 +433,7 @@ aggregate_reader_metadata::select_columns(std::optional : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const dtype = to_data_type(col_type, schema_elem); - column_buffer_with_strings output_col(dtype, schema_elem.repetition_type == OPTIONAL); + inline_column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } // store the index of this element if inserted in out_col_array nesting.push_back(static_cast(out_col_array.size())); @@ -473,8 +473,7 @@ aggregate_reader_metadata::select_columns(std::optional to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const element_dtype = to_data_type(element_type, schema_elem); - column_buffer_with_strings element_col(element_dtype, - schema_elem.repetition_type == OPTIONAL); + inline_column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); if (has_list_parent || col_type == type_id::LIST) { element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index d98b83c7c77..0192dcd373b 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -187,13 +187,12 @@ class aggregate_reader_metadata { * @return input column information, output column information, list of output column schema * indices */ - [[nodiscard]] std::tuple, - std::vector, - std::vector> - select_columns(std::optional> const& use_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) const; + [[nodiscard]] std:: + tuple, std::vector, std::vector> + select_columns(std::optional> const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const; }; } // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index c5762e836cb..ac476256238 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -28,7 +28,7 @@ namespace cudf::io::detail { -void column_buffer_with_pointers::allocate_strings_data(rmm::cuda_stream_view stream) +void gather_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) { if (type.id() == type_id::STRING) { // The contents of _strings will never be directly returned to the user. @@ -43,31 +43,29 @@ void column_buffer_with_pointers::allocate_strings_data(rmm::cuda_stream_view st } } -std::unique_ptr column_buffer_with_pointers::make_string_column_impl( - rmm::cuda_stream_view stream) +std::unique_ptr gather_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) { // make_strings_column allocates new memory, it does not simply move // from the inputs, so we need to pass it the memory resource given to // the buffer on construction so that the memory is allocated using the // resource that the calling code expected. 
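// [editor's sketch] the two renamed policies finalize strings differently: the gather
// policy below hands make_strings_column a list of device (pointer, length) pairs,
// which sizes and then copies every row into fresh offsets and chars allocations,
// while inline_column_buffer moves its already-decoded _data/_string_data buffers into
// the column with no copy. Host-side reduction of the two-pass gather (illustrative,
// not cudf code):
#include <cstddef>
#include <string>
#include <utility>
#include <vector>
std::string gather(std::vector<std::pair<char const*, std::size_t>> const& views)
{
  std::size_t total = 0;
  for (auto const& v : views) { total += v.second; }  // pass 1: measure
  std::string chars;
  chars.reserve(total);
  for (auto const& v : views) { chars.append(v.first, v.second); }  // pass 2: copy
  return chars;
}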
- return make_strings_column(*_strings, stream, mr); + return make_strings_column(*_strings, stream, _mr); } -void column_buffer_with_strings::allocate_strings_data(rmm::cuda_stream_view stream) +void inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) { if (type.id() == type_id::STRING) { // size + 1 for final offset. _string_data will be initialized later. - _data = create_data(data_type{type_id::INT32}, size + 1, stream, mr); + _data = create_data(data_type{type_id::INT32}, size + 1, stream, _mr); } } -void column_buffer_with_strings::create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) +void inline_column_buffer::create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) { - _string_data = rmm::device_buffer(num_bytes, stream, mr); + _string_data = rmm::device_buffer(num_bytes, stream, _mr); } -std::unique_ptr column_buffer_with_strings::make_string_column_impl( - rmm::cuda_stream_view stream) +std::unique_ptr inline_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) { // no need for copies, just transfer ownership of the data_buffers to the columns auto const state = mask_state::UNALLOCATED; @@ -77,14 +75,14 @@ std::unique_ptr column_buffer_with_strings::make_string_column_impl( : std::make_unique(data_type{type_id::INT8}, string_size(), std::move(_string_data), - cudf::detail::create_null_mask(size, state, stream, mr), + cudf::detail::create_null_mask(size, state, stream, _mr), state_null_count(state, size), std::vector>{}); auto offsets_col = std::make_unique(data_type{type_to_id()}, size + 1, std::move(_data), - cudf::detail::create_null_mask(size + 1, state, stream, mr), + cudf::detail::create_null_mask(size + 1, state, stream, _mr), state_null_count(state, size + 1), std::vector>{}); @@ -200,7 +198,7 @@ std::unique_ptr make_column(column_buffer_base& buffer, buffer._null_count, std::move(buffer._null_mask), stream, - buffer.mr); + buffer._mr); } break; case type_id::STRUCT: { @@ -228,7 +226,7 @@ std::unique_ptr make_column(column_buffer_base& buffer, buffer._null_count, std::move(buffer._null_mask), stream, - buffer.mr); + buffer._mr); } break; default: { @@ -298,8 +296,8 @@ std::unique_ptr empty_like(column_buffer_base& buffer, } } -using pointer_type = column_buffer_with_pointers; -using string_type = column_buffer_with_strings; +using pointer_type = gather_column_buffer; +using string_type = inline_column_buffer; using pointer_column_buffer = column_buffer_base; using string_column_buffer = column_buffer_base; diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 108006b0a53..93327e18549 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -62,6 +62,27 @@ inline rmm::device_buffer create_data(data_type type, using string_index_pair = thrust::pair; +// forward declare friend functions +template +class column_buffer_base; + +/** + * @brief Creates a column from an existing set of device memory buffers. + * + * @throws std::bad_alloc if device memory allocation fails + * + * @param buffer Column buffer descriptors + * @param schema_info Schema information for the column to write optionally. + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * + * @return `std::unique_ptr` Column from the existing device data + */ +template +std::unique_ptr make_column(column_buffer_base& buffer, + column_name_info* schema_info, + std::optional const& schema, + rmm::cuda_stream_view stream); + template class column_buffer_base { public: @@ -77,33 +98,38 @@ class column_buffer_base { rmm::mr::device_memory_resource* mr) : column_buffer_base(_type, _is_nullable) { - create(_size, stream, mr); + create(_size, stream, mr, true); } // instantiate a column of known type with a specified size. Allows deferred creation for // preprocessing steps such as in the Parquet reader - void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* _mr) + void create(size_type _size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + bool is_ctor = false) { size = _size; - mr = _mr; + _mr = mr; switch (type.id()) { case type_id::STRING: - return static_cast(this)->allocate_strings_data(stream); + // if calling from constructor, then some members may not yet exist, so defer this to + // child constructor + if (not is_ctor) { static_cast(this)->allocate_strings_data(stream); } break; // list columns store a buffer of int32's as offsets to represent // their individual rows - case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, mr); break; + case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, _mr); break; // struct columns store no data themselves. just validity and children. case type_id::STRUCT: break; - default: _data = create_data(type, size, stream, mr); break; + default: _data = create_data(type, size, stream, _mr); break; } if (is_nullable) { _null_mask = cudf::detail::create_null_mask( - size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); + size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), _mr); } } @@ -111,6 +137,8 @@ class column_buffer_base { // input column, including same type, nullability, name, and user_data. static string_policy empty_like(string_policy const& input); + void set_null_mask(rmm::device_buffer&& mask) { _null_mask = std::move(mask); } + template auto null_mask() { @@ -127,39 +155,49 @@ class column_buffer_base { return static_cast(this)->make_string_column_impl(stream); } - public: + protected: rmm::device_buffer _data{}; rmm::device_buffer _null_mask{}; size_type _null_count{0}; + rmm::mr::device_memory_resource* _mr; + public: data_type type{type_id::EMPTY}; bool is_nullable{false}; size_type size{0}; uint32_t user_data{0}; // arbitrary user data std::string name; - rmm::mr::device_memory_resource* mr; - std::vector children; + + friend std::unique_ptr make_column( + column_buffer_base& buffer, + column_name_info* schema_info, + std::optional const& schema, + rmm::cuda_stream_view stream); }; -class column_buffer_with_pointers : public column_buffer_base { +// column buffer that uses a string_index_pair for strings data, requiring a gather step when +// creating a string column +class gather_column_buffer : public column_buffer_base { public: - column_buffer_with_pointers() = default; + gather_column_buffer() = default; // construct without a known size. 
call create() later to actually allocate memory - column_buffer_with_pointers(data_type _type, bool _is_nullable) - : column_buffer_base(_type, _is_nullable) + gather_column_buffer(data_type _type, bool _is_nullable) + : column_buffer_base(_type, _is_nullable) { } - column_buffer_with_pointers(data_type _type, - size_type _size, - bool _is_nullable, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : column_buffer_base(_type, _size, _is_nullable, stream, mr) + gather_column_buffer(data_type _type, + size_type _size, + bool _is_nullable, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : column_buffer_base(_type, _size, _is_nullable, stream, mr) { + // allocate strings data now + if (type.id() == type_id::STRING) { allocate_strings_data(stream); } } void allocate_strings_data(rmm::cuda_stream_view stream); @@ -173,23 +211,27 @@ class column_buffer_with_pointers : public column_buffer_base> _strings; }; -class column_buffer_with_strings : public column_buffer_base { +// column buffer that stores string data internally which can be passed directly when +// creating a string column +class inline_column_buffer : public column_buffer_base { public: - column_buffer_with_strings() = default; + inline_column_buffer() = default; // construct without a known size. call create() later to actually allocate memory - column_buffer_with_strings(data_type _type, bool _is_nullable) - : column_buffer_base(_type, _is_nullable) + inline_column_buffer(data_type _type, bool _is_nullable) + : column_buffer_base(_type, _is_nullable) { } - column_buffer_with_strings(data_type _type, - size_type _size, - bool _is_nullable, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : column_buffer_base(_type, _size, _is_nullable, stream, mr) + inline_column_buffer(data_type _type, + size_type _size, + bool _is_nullable, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : column_buffer_base(_type, _size, _is_nullable, stream, mr) { + // allocate strings data now + if (type.id() == type_id::STRING) { allocate_strings_data(stream); } } void allocate_strings_data(rmm::cuda_stream_view stream); @@ -206,24 +248,7 @@ class column_buffer_with_strings : public column_buffer_base` Column from the existing device data - */ -template -std::unique_ptr make_column(column_buffer_base& buffer, - column_name_info* schema_info, - std::optional const& schema, - rmm::cuda_stream_view stream); +using column_buffer = gather_column_buffer; /** * @brief Creates an equivalent empty column from an existing set of device memory buffers. From 3e708219505dd1cbcf0df03a0f8dc1ffe0d911a9 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 25 May 2023 14:15:45 -0700 Subject: [PATCH 079/114] better fix for initializing _strings --- cpp/src/io/utilities/column_buffer.hpp | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 93327e18549..521098d5083 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -73,6 +73,7 @@ class column_buffer_base; * * @param buffer Column buffer descriptors * @param schema_info Schema information for the column to write optionally. + * @param schema Optional schema used to control string to binary conversions. * @param stream CUDA stream used for device memory operations and kernel launches. 
 *
 * @return `std::unique_ptr<column>` Column from the existing device data
 */
template <class string_policy>
std::unique_ptr<column> make_column(column_buffer_base<string_policy>& buffer,
                                    column_name_info* schema_info,
                                    std::optional<reader_column_schema> const& schema,
                                    rmm::cuda_stream_view stream);
@@ -98,25 +99,17 @@ class column_buffer_base {
                      rmm::mr::device_memory_resource* mr)
     : column_buffer_base(_type, _is_nullable)
   {
-    create(_size, stream, mr, true);
   }
 
   // instantiate a column of known type with a specified size. Allows deferred creation for
   // preprocessing steps such as in the Parquet reader
-  void create(size_type _size,
-              rmm::cuda_stream_view stream,
-              rmm::mr::device_memory_resource* mr,
-              bool is_ctor = false)
+  void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
   {
     size = _size;
     _mr  = mr;
 
     switch (type.id()) {
-      case type_id::STRING:
-        // if calling from constructor, then some members may not yet exist, so defer this to
-        // child constructor
-        if (not is_ctor) { static_cast<string_policy*>(this)->allocate_strings_data(stream); }
-        break;
+      case type_id::STRING: static_cast<string_policy*>(this)->allocate_strings_data(stream); break;
 
       // list columns store a buffer of int32's as offsets to represent
       // their individual rows
@@ -196,8 +189,7 @@ class gather_column_buffer : public column_buffer_base<gather_column_buffer> {
                        rmm::mr::device_memory_resource* mr)
     : column_buffer_base<gather_column_buffer>(_type, _size, _is_nullable, stream, mr)
   {
-    // allocate strings data now
-    if (type.id() == type_id::STRING) { allocate_strings_data(stream); }
+    create(_size, stream, mr);
   }
 
   void allocate_strings_data(rmm::cuda_stream_view stream);
@@ -230,8 +222,7 @@ class inline_column_buffer : public column_buffer_base<inline_column_buffer> {
                       rmm::mr::device_memory_resource* mr)
     : column_buffer_base<inline_column_buffer>(_type, _size, _is_nullable, stream, mr)
   {
-    // allocate strings data now
-    if (type.id() == type_id::STRING) { allocate_strings_data(stream); }
+    create(_size, stream, mr);
   }
 
   void allocate_strings_data(rmm::cuda_stream_view stream);

From a120a4a82bd60b1b98d4e2f5f2dd50bd171a5044 Mon Sep 17 00:00:00 2001
From: seidl
Date: Thu, 25 May 2023 14:26:15 -0700
Subject: [PATCH 080/114] json no longer needs to fully qualify make_column

---
 cpp/src/io/json/reader_impl.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
index 228f99c0d9f..baec3cb1439 100644
--- a/cpp/src/io/json/reader_impl.cu
+++ b/cpp/src/io/json/reader_impl.cu
@@ -556,7 +556,7 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts,
 
   for (size_t i = 0; i < num_columns; ++i) {
     out_buffers[i].null_count() = num_records - h_valid_counts[i];
-    auto out_column = io::detail::make_column(out_buffers[i], nullptr, std::nullopt, stream);
+    auto out_column = make_column(out_buffers[i], nullptr, std::nullopt, stream);
     if (out_column->type().id() == type_id::STRING) {
       // Need to remove escape character in case of '\"' and '\\'
       out_columns.emplace_back(cudf::strings::detail::replace(

From 755918b89a7b6ae74045e46d733662cae6d26318 Mon Sep 17 00:00:00 2001
From: seidl
Date: Thu, 25 May 2023 17:39:51 -0700
Subject: [PATCH 081/114] calculate col_sizes on device to save a round trip
 for the PageInfo data. also fixes another bug in null handling.
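The per-column totals and per-page `str_offset` values are now produced by a
block-wide exclusive scan on the device (see gpuComputePageOffsets below). As a
minimal sketch of that pattern, assuming illustrative `page_like` and
`compute_offsets` names in place of the real PageInfo/ColumnChunkDesc plumbing,
and with error handling omitted:

    #include <cub/block/block_scan.cuh>
    #include <cstdint>

    constexpr int scan_block_size = 512;

    // Illustrative stand-ins for the PageInfo fields this pass touches.
    struct page_like {
      int32_t str_bytes;   // total string bytes in the page (input)
      int32_t str_offset;  // where this page's strings start in the column (output)
    };

    // One thread block per column: exclusive-scan the per-page sizes so each
    // page learns its starting offset, then thread 0 records the column total.
    __global__ void compute_offsets(page_like* pages, int num_pages, int32_t* col_size)
    {
      using block_scan = cub::BlockScan<int32_t, scan_block_size>;
      __shared__ typename block_scan::TempStorage scan_storage;

      int32_t running_total = 0;
      for (int pos = 0; pos < num_pages; pos += scan_block_size) {
        int const idx     = pos + threadIdx.x;
        int32_t const len = idx < num_pages ? pages[idx].str_bytes : 0;

        int32_t offset, block_total;
        block_scan(scan_storage).ExclusiveSum(len, offset, block_total);
        if (idx < num_pages) { pages[idx].str_offset = running_total + offset; }

        running_total += block_total;
        __syncthreads();  // scan_storage is reused on the next iteration
      }
      if (threadIdx.x == 0) { *col_size = running_total; }
    }

ExclusiveSum also hands back the block aggregate, which is what lets a fixed-size
thread block walk an arbitrarily long page list while carrying a running total.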
---
 cpp/src/io/parquet/page_decode.cuh       | 12 ++++-
 cpp/src/io/parquet/page_string_decode.cu | 64 ++++++++++++++++++++++--
 cpp/src/io/parquet/parquet_gpu.hpp       | 16 +++---
 cpp/src/io/parquet/reader_impl.cpp       | 16 +----
 4 files changed, 80 insertions(+), 28 deletions(-)

diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index 9b0e7a8b643..b6b5b684380 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -109,6 +109,15 @@ struct null_count_back_copier {
   }
 };
 
+/**
+ * @brief Test if the given column chunk is in a string column
+ */
+constexpr bool is_string_col(ColumnChunkDesc const& chunk)
+{
+  return (chunk.data_type & 7) == BYTE_ARRAY and (chunk.data_type >> 3) != 4 and
+         chunk.converted_type != DECIMAL;
+}
+
 /**
  * @brief Test if the given page is in a string column
 */
@@ -116,8 +125,7 @@ constexpr bool is_string_col(PageInfo const& page, device_span<ColumnChunkDesc const> chunks)
 {
   if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return false; }
   auto const& col = chunks[page.chunk_idx];
-  return (col.data_type & 7) == BYTE_ARRAY and (col.data_type >> 3) != 4 and
-         col.converted_type != DECIMAL;
+  return is_string_col(col);
 }
 
 /**
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index f4d4029f2da..10b373aea72 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -547,8 +547,9 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes(
   if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, false)) { return; }
 
   if (!t) {
-    s->page.num_nulls = 0;
-    s->page.str_bytes = 0;
+    s->page.num_nulls  = 0;
+    s->page.num_valids = 0;
+    s->page.str_bytes  = 0;
   }
   __syncthreads();
 
@@ -810,7 +811,11 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData(
   // if there are nulls clean up the offsets array.
   if (s->page.num_nulls != 0) {
     int const leaf_level_index = s->col.max_nesting_depth - 1;
-    int const value_count      = nesting_info_base[leaf_level_index].value_count;
+    int value_count            = nesting_info_base[leaf_level_index].value_count;
+
+    // if no repetition we haven't calculated start/end bounds and instead just skipped
+    // values until we reach first_row. account for that here.
+    if (!has_repetition) { value_count -= s->first_row; }
 
     auto offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
@@ -1031,7 +1036,11 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2(
   // if there are nulls clean up the offsets array.
   if (s->page.num_nulls != 0) {
     int const leaf_level_index = s->col.max_nesting_depth - 1;
-    int const value_count      = nesting_info_base[leaf_level_index].value_count;
+    int value_count            = nesting_info_base[leaf_level_index].value_count;
+
+    // if no repetition we haven't calculated start/end bounds and instead just skipped
+    // values until we reach first_row. account for that here.
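+    // (value_count still includes those skipped leading values, while the
+    // offsets array only covers rows at or after first_row)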
+    if (!has_repetition) { value_count -= s->first_row; }
 
     auto offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
@@ -1059,6 +1068,43 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2(
   }
 }
 
+__global__ void __launch_bounds__(preprocess_block_size)
+  gpuComputePageOffsets(device_span<ColumnChunkDesc const> chunks, device_span<size_type> col_sizes)
+{
+  using block_scan = cub::BlockScan<size_type, preprocess_block_size>;
+  __shared__ typename block_scan::TempStorage scan_storage;
+
+  auto const t         = threadIdx.x;
+  auto const col_index = blockIdx.x;
+  col_sizes[col_index] = 0;
+
+  for (auto const& chunk : chunks) {
+    if (chunk.src_col_index == col_index) {
+      // short circuit return if this is not a string column
+      if (not is_string_col(chunk)) { return; }
+
+      size_type cumulative_offset = col_sizes[col_index];
+
+      for (int i = 0; i < chunk.max_num_pages; i += preprocess_block_size) {
+        int idx       = i + t;
+        size_type len = idx < chunk.max_num_pages and
+                            (chunk.page_info[idx].flags & gpu::PAGEINFO_FLAGS_DICTIONARY) == 0
+                          ? chunk.page_info[idx].str_bytes
+                          : 0;
+
+        size_type offset, block_total;
+        block_scan(scan_storage).ExclusiveSum(len, offset, block_total);
+        if (idx < chunk.max_num_pages) {
+          chunk.page_info[idx].str_offset = offset + cumulative_offset;
+        }
+        cumulative_offset += block_total;
+      }
+      if (t == 0) { col_sizes[col_index] = cumulative_offset; }
+      __syncthreads();
+    }
+  }
+}
+
 }  // anonymous namespace
 
 /**
@@ -1066,6 +1112,7 @@
  */
 void ComputePageStringSizes(hostdevice_vector<PageInfo>& pages,
                             hostdevice_vector<ColumnChunkDesc> const& chunks,
+                            std::vector<size_type>& col_sizes,
                             size_t min_row,
                             size_t num_rows,
                             int level_type_size,
@@ -1080,6 +1127,15 @@ void ComputePageStringSizes(hostdevice_vector<PageInfo>& pages,
     gpuComputePageStringSizes<uint16_t>
       <<<dim_grid, dim_block, 0, stream.value()>>>(pages.device_ptr(), chunks, min_row, num_rows);
   }
+
+  rmm::device_uvector<size_type> d_col_sizes(col_sizes.size(), stream);
+  gpuComputePageOffsets<<<col_sizes.size(), preprocess_block_size, 0, stream.value()>>>(
+    chunks, d_col_sizes);
+  cudaMemcpyAsync(col_sizes.data(),
+                  d_col_sizes.data(),
+                  sizeof(size_type) * col_sizes.size(),
+                  cudaMemcpyDeviceToHost,
+                  stream);
+  stream.synchronize();
 }
 
 /**
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 49adfefa63c..2d3c6f655e7 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -483,17 +483,19 @@ void ComputePageSizes(hostdevice_vector<PageInfo>& pages,
  *
  * String columns need accurate data size information to preallocate memory in the column buffer to
 * store the char data. This calls a kernel to calculate information needed by the string decoding
- * kernel. On exit, the `str_bytes`, `num_nulls`, and `num_valids` fields of the PageInfo struct
- * are updated. This call ignores non-string columns.
+ * kernel. On exit, the `str_bytes`, `num_nulls`, `num_valids`, and `str_offset` fields of the
+ * PageInfo struct are updated. This call ignores non-string columns.
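+ * The `str_offset` values are column-relative: a device-side exclusive scan over each
+ * column's per-page `str_bytes` produces both the page offsets and the per-column
+ * totals returned in `col_sizes` in a single pass.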
* - * @param pages All pages to be decoded - * @param chunks All chunks to be decoded - * @param min_rows crop all rows below min_row - * @param num_rows Maximum number of rows to read - * @param stream CUDA stream to use, default 0 + * @param[in,out] pages All pages to be decoded + * @param[in] chunks All chunks to be decoded + * @param[out] col_sizes On output, contains total size of string data for each column + * @param[in] min_rows crop all rows below min_row + * @param[in] num_rows Maximum number of rows to read + * @param[in] stream CUDA stream to use, default 0 */ void ComputePageStringSizes(hostdevice_vector& pages, hostdevice_vector const& chunks, + std::vector& col_sizes, size_t min_row, size_t num_rows, int level_type_size, diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 73932baf958..338fd3986ec 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -53,21 +53,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { gpu::ComputePageStringSizes( - pages, chunks, skip_rows, num_rows, _file_itm_data.level_type_size, _stream); - - // TODO do the following on device with thrust/kernel to avoid the pages round trip - pages.device_to_host(_stream, true); - for (auto& page : pages) { - if ((page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) == 0) { - auto const& col = chunks[page.chunk_idx]; - if (is_string_col(col)) { - size_type const offset = col_sizes[col.src_col_index]; - page.str_offset = offset; - col_sizes[col.src_col_index] = offset + page.str_bytes; - } - } - } - pages.host_to_device(_stream); + pages, chunks, col_sizes, skip_rows, num_rows, _file_itm_data.level_type_size, _stream); } // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector From e1fd10311154ee8d9e297353502241149a382220 Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 26 May 2023 13:59:07 -0700 Subject: [PATCH 082/114] calculate offsets with exclusive scan --- cpp/src/io/parquet/page_string_decode.cu | 84 +++++++++--------------- 1 file changed, 31 insertions(+), 53 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 10b373aea72..0a683217201 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -92,6 +92,26 @@ __device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t } } +/** + * @brief Perform exclusive scan for offsets array. Called for each page. + */ +__device__ void block_excl_sum(size_type* arr, size_type length, size_type initial_value) +{ + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + int const t = threadIdx.x; + + // do a series of block sums, storing results in arr as we go + size_type block_sum; + for (int pos = 0; pos < length; pos += decode_block_size) { + int tidx = pos + t; + size_type tval = tidx < length ? 
arr[tidx] : 0; + block_scan(scan_storage).ExclusiveScan(tval, tval, initial_value, cub::Sum(), block_sum); + if (tidx < length) { arr[tidx] = tval; } + initial_value += block_sum; + } +} + /** * @brief Compute the start and end page value bounds for this page * @@ -639,9 +659,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( int const t = threadIdx.x; [[maybe_unused]] null_count_back_copier _{s, t}; - // set during string copy by lane 0 - int first_non_null = -1; - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; @@ -666,8 +683,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( return; } - int out_thread0 = s->dict_base && s->dict_bits == 0 ? 32 : 64; - + int const out_thread0 = s->dict_base && s->dict_bits == 0 ? 32 : 64; + int const leaf_level_index = s->col.max_nesting_depth - 1; PageNestingDecodeInfo* const nesting_info_base = s->nesting_info; __shared__ level_t rep[non_zero_buffer_size]; // circular buffer of repetition level values @@ -707,20 +724,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( } else { int const me = t - out_thread0; - // if this is the first page, then the first non-null entry will have an offset of 0. - // pages that start with a run of nulls will have repeated 0 values, so for the fixing - // of null offsets done at the end, we need to know the last index that should be 0. - if (me == 0 && s->page.str_offset == 0 && first_non_null == -1) { - for (int i = src_pos; i < target_pos; i++) { - int dst_pos = sb->nz_idx[rolling_index(i)]; - if (!has_repetition) { dst_pos -= s->first_row; } - if (dst_pos >= 0) { - first_non_null = dst_pos; - break; - } - } - } - // WARP1..WARP3: Decode values src_pos += t - out_thread0; @@ -745,7 +748,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( using cudf::detail::warp_size; auto const use_char_ll = s->page.num_valids > 0 && (s->page.str_bytes / s->page.num_valids) >= warp_size; - int const leaf_level_index = s->col.max_nesting_depth - 1; if (me < warp_size) { for (int i = 0; i < decode_block_size - out_thread0; i += warp_size) { @@ -778,7 +780,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dsts[ss]; - *offptr = offsets[ss]; + *offptr = lengths[ss]; auto str_ptr = nesting_info_base[leaf_level_index].string_out + offsets[ss] - s->page.str_offset; ll_strcpy(str_ptr, pointers[ss], lengths[ss], me); @@ -789,7 +791,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( if (src_pos + i < target_pos && dst_pos >= 0) { auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dst_pos; - *offptr = offset; + *offptr = len; auto str_ptr = nesting_info_base[leaf_level_index].string_out + offset - s->page.str_offset; memcpy(str_ptr, ptr, len); @@ -808,39 +810,15 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( __syncthreads(); } - // if there are nulls clean up the offsets array. - if (s->page.num_nulls != 0) { - int const leaf_level_index = s->col.max_nesting_depth - 1; - int value_count = nesting_info_base[leaf_level_index].value_count; - - // if no repetition we haven't calculated start/end bounds and instead just skipped - // values until we reach first_row. account for that here. 
- if (!has_repetition) { value_count -= s->first_row; } + // now turn array of lengths into offsets + int value_count = nesting_info_base[leaf_level_index].value_count; - auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); + // if no repetition we haven't calculated start/end bounds and instead just skipped + // values until we reach first_row. account for that here. + if (!has_repetition) { value_count -= s->first_row; } - if (nesting_info_base[leaf_level_index].null_count > 0) { - // if nz_count is 0, then it's all nulls. set all offsets to str_offset - if (s->nz_count == 0) { - for (int i = t; i < value_count; i += decode_block_size) { - offptr[i] = s->page.str_offset; - } - } - // just some nulls, do this serially for now - else if (t == out_thread0) { - if (first_non_null == -1) { first_non_null = 0; } - - if (offptr[value_count - 1] == 0 && value_count - 1 != first_non_null) { - offptr[value_count - 1] = s->page.str_offset + s->page.str_bytes; - } - for (int i = value_count - 2; i > first_non_null; i--) { - if (offptr[i] == 0) { offptr[i] = offptr[i + 1]; } - } - offptr[0] = s->page.str_offset; - } - } - __syncthreads(); - } + auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); + block_excl_sum(offptr, value_count, s->page.str_offset); } /** From 539ef1f3e1816452ebd15298be519cfa19338571 Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 26 May 2023 15:36:40 -0700 Subject: [PATCH 083/114] cleanups --- cpp/src/io/parquet/page_string_decode.cu | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 0a683217201..0a87c4e0690 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -95,17 +95,18 @@ __device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t /** * @brief Perform exclusive scan for offsets array. Called for each page. */ +template __device__ void block_excl_sum(size_type* arr, size_type length, size_type initial_value) { - using block_scan = cub::BlockScan; + using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; int const t = threadIdx.x; // do a series of block sums, storing results in arr as we go - size_type block_sum; - for (int pos = 0; pos < length; pos += decode_block_size) { - int tidx = pos + t; + for (int pos = 0; pos < length; pos += block_size) { + int const tidx = pos + t; size_type tval = tidx < length ? arr[tidx] : 0; + size_type block_sum; block_scan(scan_storage).ExclusiveScan(tval, tval, initial_value, cub::Sum(), block_sum); if (tidx < length) { arr[tidx] = tval; } initial_value += block_sum; @@ -817,8 +818,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( // values until we reach first_row. account for that here. 
if (!has_repetition) { value_count -= s->first_row; } - auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); - block_excl_sum(offptr, value_count, s->page.str_offset); + auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); + block_excl_sum(offptr, value_count, s->page.str_offset); } /** From 723e21dab31605c098322cafbee57a28cb51b38d Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 26 May 2023 16:30:43 -0700 Subject: [PATCH 084/114] only call string decode kernel if there are string columns --- cpp/src/io/parquet/reader_impl.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 338fd3986ec..633539b6148 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -41,9 +41,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // for each string page. This size info will be used to pre-allocate memory for the column, // allowing the page decoder to write string data directly to the column buffer, rather than // doing a gather operation later on. - // TODO: The current implementation does a round trip for the page info. Need to explore doing - // this step on device. This call is also somewhat redundant if size info has already been - // calculated (nested schema, chunked reader). + // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, + // chunked reader). auto is_string_col = [](gpu::ColumnChunkDesc const& chunk) { return (chunk.data_type & 7) == BYTE_ARRAY && (chunk.data_type >> 3) != 4 && chunk.converted_type != DECIMAL; @@ -152,8 +151,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // TODO: explore launching these concurrently with a stream pool gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); - gpu::DecodeStringPageData( - pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); + if (has_strings) { + gpu::DecodeStringPageData( + pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); + } pages.device_to_host(_stream); page_nesting.device_to_host(_stream); From 2bf8c19e69e2456d42f9c7f16a13320bb5eb0682 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 30 May 2023 08:49:38 -0700 Subject: [PATCH 085/114] offsets can be page local now --- cpp/src/io/parquet/page_string_decode.cu | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 0a87c4e0690..012d0840fe1 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -664,8 +664,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; - // offsets is global...but the output is local, so account for that below - if (t == 0) { last_offset = s->page.str_offset; } + // offsets are local to the page + if (t == 0) { last_offset = 0; } __syncthreads(); // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. 
@@ -781,9 +781,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dsts[ss]; - *offptr = lengths[ss]; - auto str_ptr = - nesting_info_base[leaf_level_index].string_out + offsets[ss] - s->page.str_offset; + *offptr = lengths[ss]; + auto str_ptr = nesting_info_base[leaf_level_index].string_out + offsets[ss]; ll_strcpy(str_ptr, pointers[ss], lengths[ss], me); } } @@ -792,9 +791,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( if (src_pos + i < target_pos && dst_pos >= 0) { auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dst_pos; - *offptr = len; - auto str_ptr = - nesting_info_base[leaf_level_index].string_out + offset - s->page.str_offset; + *offptr = len; + auto str_ptr = nesting_info_base[leaf_level_index].string_out + offset; memcpy(str_ptr, ptr, len); } __syncwarp(); From 15986a962e53a5a40802fcbbb65e7c053e93bc73 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 30 May 2023 12:32:20 -0700 Subject: [PATCH 086/114] move create() back to cpp file --- cpp/src/io/utilities/column_buffer.cpp | 26 ++++++++++++++++++++++++++ cpp/src/io/utilities/column_buffer.hpp | 23 +---------------------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index ac476256238..6fe564d2d76 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -111,6 +111,32 @@ void copy_buffer_data(string_policy const& buff, string_policy& new_buff) } // namespace +template +void column_buffer_base::create(size_type _size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + size = _size; + _mr = mr; + + switch (type.id()) { + case type_id::STRING: static_cast(this)->allocate_strings_data(stream); break; + + // list columns store a buffer of int32's as offsets to represent + // their individual rows + case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, _mr); break; + + // struct columns store no data themselves. just validity and children. + case type_id::STRUCT: break; + + default: _data = create_data(type, size, stream, _mr); break; + } + if (is_nullable) { + _null_mask = cudf::detail::create_null_mask( + size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), _mr); + } +} + template string_policy column_buffer_base::empty_like(string_policy const& input) { diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 521098d5083..eedfd763a64 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -103,28 +103,7 @@ class column_buffer_base { // instantiate a column of known type with a specified size. Allows deferred creation for // preprocessing steps such as in the Parquet reader - void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - { - size = _size; - _mr = mr; - - switch (type.id()) { - case type_id::STRING: static_cast(this)->allocate_strings_data(stream); break; - - // list columns store a buffer of int32's as offsets to represent - // their individual rows - case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, _mr); break; - - // struct columns store no data themselves. just validity and children. 
- case type_id::STRUCT: break; - - default: _data = create_data(type, size, stream, _mr); break; - } - if (is_nullable) { - _null_mask = cudf::detail::create_null_mask( - size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), _mr); - } - } + void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); // Create a new column_buffer that has empty data but with the same basic information as the // input column, including same type, nullability, name, and user_data. From 614c4603fad47e63fabe2b9fd6887310b37986b5 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 30 May 2023 13:25:16 -0700 Subject: [PATCH 087/114] remove memory resource from column buffer. instead pass it in when necessary. --- cpp/src/io/avro/reader_impl.cu | 2 +- cpp/src/io/csv/reader_impl.cu | 2 +- cpp/src/io/json/reader_impl.cu | 2 +- cpp/src/io/orc/reader_impl.cu | 2 +- cpp/src/io/parquet/reader_impl.cpp | 6 +-- cpp/src/io/utilities/column_buffer.cpp | 61 +++++++++++++++----------- cpp/src/io/utilities/column_buffer.hpp | 26 ++++++----- 7 files changed, 59 insertions(+), 42 deletions(-) diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index a1be00dff9b..1f931bb2670 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -591,7 +591,7 @@ table_with_metadata read_avro(std::unique_ptr&& source, mr); for (size_t i = 0; i < column_types.size(); ++i) { - out_columns.emplace_back(make_column(out_buffers[i], nullptr, std::nullopt, stream)); + out_columns.emplace_back(make_column(out_buffers[i], nullptr, std::nullopt, stream, mr)); } } else { // Create empty columns diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index cb6d746047c..ea8b0680e37 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -868,7 +868,7 @@ table_with_metadata read_csv(cudf::io::datasource* source, out_columns.emplace_back( cudf::strings::detail::replace(col->view(), dblquotechar, quotechar, -1, stream, mr)); } else { - out_columns.emplace_back(make_column(out_buffers[i], nullptr, std::nullopt, stream)); + out_columns.emplace_back(make_column(out_buffers[i], nullptr, std::nullopt, stream, mr)); } } } else { diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index baec3cb1439..4add19bf07c 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -556,7 +556,7 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, for (size_t i = 0; i < num_columns; ++i) { out_buffers[i].null_count() = num_records - h_valid_counts[i]; - auto out_column = make_column(out_buffers[i], nullptr, std::nullopt, stream); + auto out_column = make_column(out_buffers[i], nullptr, std::nullopt, stream, mr); if (out_column->type().id() == type_id::STRING) { // Need to remove escape character in case of '\"' and '\\' out_columns.emplace_back(cudf::strings::detail::replace( diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 07e1bc2dd7f..8058d7c3e2b 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -877,7 +877,7 @@ void reader::impl::create_columns(std::vector>&& col_ [&](auto const col_meta) { schema_info.emplace_back(""); auto col_buffer = assemble_buffer(col_meta.id, col_buffers, 0, stream); - return make_column(col_buffer, &schema_info.back(), std::nullopt, stream); + return make_column(col_buffer, &schema_info.back(), std::nullopt, stream, _mr); }); } diff --git a/cpp/src/io/parquet/reader_impl.cpp 
b/cpp/src/io/parquet/reader_impl.cpp index 633539b6148..514509de421 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -129,7 +129,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) data[idx] = out_buf.data(); // only do string buffer for leaf if (out_buf.string_size() == 0 && col_sizes[chunks[c].src_col_index] > 0) { - out_buf.create_string_data(col_sizes[chunks[c].src_col_index], _stream); + out_buf.create_string_data(col_sizes[chunks[c].src_col_index], _stream, _mr); } str_data[idx] = out_buf.string_data(); out_buf.user_data |= @@ -329,9 +329,9 @@ table_with_metadata reader::impl::read_chunk_internal(bool uses_custom_row_bound // Only construct `out_metadata` if `_output_metadata` has not been cached. if (!_output_metadata) { column_name_info& col_name = out_metadata.schema_info.emplace_back(""); - out_columns.emplace_back(make_column(_output_buffers[i], &col_name, metadata, _stream)); + out_columns.emplace_back(make_column(_output_buffers[i], &col_name, metadata, _stream, _mr)); } else { - out_columns.emplace_back(make_column(_output_buffers[i], nullptr, metadata, _stream)); + out_columns.emplace_back(make_column(_output_buffers[i], nullptr, metadata, _stream, _mr)); } } diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 6fe564d2d76..4681ba4a74c 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -28,7 +28,8 @@ namespace cudf::io::detail { -void gather_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) +void gather_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (type.id() == type_id::STRING) { // The contents of _strings will never be directly returned to the user. @@ -43,29 +44,34 @@ void gather_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) } } -std::unique_ptr gather_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) +std::unique_ptr gather_column_buffer::make_string_column_impl( + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // make_strings_column allocates new memory, it does not simply move // from the inputs, so we need to pass it the memory resource given to // the buffer on construction so that the memory is allocated using the // resource that the calling code expected. - return make_strings_column(*_strings, stream, _mr); + return make_strings_column(*_strings, stream, mr); } -void inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) +void inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (type.id() == type_id::STRING) { // size + 1 for final offset. _string_data will be initialized later. 
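  // (this buffer holds only the offsets; the character data itself is allocated
  // later via create_string_data())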
- _data = create_data(data_type{type_id::INT32}, size + 1, stream, _mr); + _data = create_data(data_type{type_id::INT32}, size + 1, stream, mr); } } -void inline_column_buffer::create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) +void inline_column_buffer::create_string_data(size_t num_bytes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - _string_data = rmm::device_buffer(num_bytes, stream, _mr); + _string_data = rmm::device_buffer(num_bytes, stream, mr); } -std::unique_ptr inline_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) +std::unique_ptr inline_column_buffer::make_string_column_impl( + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // no need for copies, just transfer ownership of the data_buffers to the columns auto const state = mask_state::UNALLOCATED; @@ -75,14 +81,14 @@ std::unique_ptr inline_column_buffer::make_string_column_impl(rmm::cuda_ : std::make_unique(data_type{type_id::INT8}, string_size(), std::move(_string_data), - cudf::detail::create_null_mask(size, state, stream, _mr), + cudf::detail::create_null_mask(size, state, stream, mr), state_null_count(state, size), std::vector>{}); auto offsets_col = std::make_unique(data_type{type_to_id()}, size + 1, std::move(_data), - cudf::detail::create_null_mask(size + 1, state, stream, _mr), + cudf::detail::create_null_mask(size + 1, state, stream, mr), state_null_count(state, size + 1), std::vector>{}); @@ -117,23 +123,24 @@ void column_buffer_base::create(size_type _size, rmm::mr::device_memory_resource* mr) { size = _size; - _mr = mr; switch (type.id()) { - case type_id::STRING: static_cast(this)->allocate_strings_data(stream); break; + case type_id::STRING: + static_cast(this)->allocate_strings_data(stream, mr); + break; // list columns store a buffer of int32's as offsets to represent // their individual rows - case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, _mr); break; + case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, mr); break; // struct columns store no data themselves. just validity and children. case type_id::STRUCT: break; - default: _data = create_data(type, size, stream, _mr); break; + default: _data = create_data(type, size, stream, mr); break; } if (is_nullable) { - _null_mask = cudf::detail::create_null_mask( - size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), _mr); + _null_mask = + cudf::detail::create_null_mask(size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); } } @@ -149,7 +156,8 @@ template std::unique_ptr make_column(column_buffer_base& buffer, column_name_info* schema_info, std::optional const& schema, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (schema_info != nullptr) { schema_info->name = buffer.name; } @@ -165,10 +173,10 @@ std::unique_ptr make_column(column_buffer_base& buffer, // from the inputs, so we need to pass it the memory resource given to // the buffer on construction so that the memory is allocated using the // resource that the calling code expected. 
- return buffer.make_string_column(stream); + return buffer.make_string_column(stream, mr); } else { // convert to binary - auto const string_col = buffer.make_string_column(stream); + auto const string_col = buffer.make_string_column(stream, mr); auto const num_rows = string_col->size(); auto const null_count = string_col->null_count(); auto col_content = string_col->release(); @@ -214,7 +222,8 @@ std::unique_ptr make_column(column_buffer_base& buffer, // make child column CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); - auto child = make_column(buffer.children[0], child_info, child_schema, stream); + auto child = + make_column(buffer.children[0], child_info, child_schema, stream, mr); // make the final list column (note : size is the # of offsets, so our actual # of rows is 1 // less) @@ -224,7 +233,7 @@ std::unique_ptr make_column(column_buffer_base& buffer, buffer._null_count, std::move(buffer._null_mask), stream, - buffer._mr); + mr); } break; case type_id::STRUCT: { @@ -244,7 +253,7 @@ std::unique_ptr make_column(column_buffer_base& buffer, : std::nullopt; output_children.emplace_back( - make_column(buffer.children[i], child_info, child_schema, stream)); + make_column(buffer.children[i], child_info, child_schema, stream, mr)); } return make_structs_column(buffer.size, @@ -252,7 +261,7 @@ std::unique_ptr make_column(column_buffer_base& buffer, buffer._null_count, std::move(buffer._null_mask), stream, - buffer._mr); + mr); } break; default: { @@ -332,13 +341,15 @@ template std::unique_ptr make_column( string_column_buffer& buffer, column_name_info* schema_info, std::optional const& schema, - rmm::cuda_stream_view stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); template std::unique_ptr make_column( pointer_column_buffer& buffer, column_name_info* schema_info, std::optional const& schema, - rmm::cuda_stream_view stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); template std::unique_ptr empty_like(string_column_buffer& buffer, column_name_info* schema_info, diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index eedfd763a64..062b1a9c67f 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -82,7 +82,8 @@ template std::unique_ptr make_column(column_buffer_base& buffer, column_name_info* schema_info, std::optional const& schema, - rmm::cuda_stream_view stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); template class column_buffer_base { @@ -122,16 +123,16 @@ class column_buffer_base { auto data() { return static_cast(this)->data_impl(); } auto data_size() { return static_cast(this)->data_size_impl(); } - std::unique_ptr make_string_column(rmm::cuda_stream_view stream) + std::unique_ptr make_string_column(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return static_cast(this)->make_string_column_impl(stream); + return static_cast(this)->make_string_column_impl(stream, mr); } protected: rmm::device_buffer _data{}; rmm::device_buffer _null_mask{}; size_type _null_count{0}; - rmm::mr::device_memory_resource* _mr; public: data_type type{type_id::EMPTY}; @@ -146,7 +147,8 @@ class column_buffer_base { column_buffer_base& buffer, column_name_info* schema_info, std::optional const& schema, - rmm::cuda_stream_view stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); }; // column buffer that uses a string_index_pair for strings data, 
requiring a gather step when @@ -171,12 +173,13 @@ class gather_column_buffer : public column_buffer_base { create(_size, stream, mr); } - void allocate_strings_data(rmm::cuda_stream_view stream); + void allocate_strings_data(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); void* data_impl() { return _strings ? _strings->data() : _data.data(); } size_t data_size_impl() const { return _strings ? _strings->size() : _data.size(); } - std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream); + std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); public: std::unique_ptr> _strings; @@ -204,13 +207,16 @@ class inline_column_buffer : public column_buffer_base { create(_size, stream, mr); } - void allocate_strings_data(rmm::cuda_stream_view stream); + void allocate_strings_data(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); void* data_impl() { return _data.data(); } size_t data_size_impl() const { return _data.size(); } - std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream); + std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); - void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream); + void create_string_data(size_t num_bytes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); void* string_data() { return _string_data.data(); } size_t string_size() const { return _string_data.size(); } From 7f1d245dc3c04a27b1115c7a2fb9eafa7b2932fa Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 30 May 2023 13:37:41 -0700 Subject: [PATCH 088/114] remove if and add CUDF_EXPECTS to allocate_strings_data() --- cpp/src/io/utilities/column_buffer.cpp | 28 ++++++++++++-------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 4681ba4a74c..de425ac5b5b 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -31,17 +31,16 @@ namespace cudf::io::detail { void gather_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (type.id() == type_id::STRING) { - // The contents of _strings will never be directly returned to the user. - // Due to the fact that make_strings_column copies the input data to - // produce its outputs, _strings is actually a temporary. As a result, we - // do not pass the provided mr to the call to - // make_zeroed_device_uvector_async here and instead let it use the - // default rmm memory resource. - _strings = std::make_unique>( - cudf::detail::make_zeroed_device_uvector_async( - size, stream, rmm::mr::get_current_device_resource())); - } + CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); + // The contents of _strings will never be directly returned to the user. + // Due to the fact that make_strings_column copies the input data to + // produce its outputs, _strings is actually a temporary. As a result, we + // do not pass the provided mr to the call to + // make_zeroed_device_uvector_async here and instead let it use the + // default rmm memory resource. 
+ _strings = std::make_unique>( + cudf::detail::make_zeroed_device_uvector_async( + size, stream, rmm::mr::get_current_device_resource())); } std::unique_ptr gather_column_buffer::make_string_column_impl( @@ -57,10 +56,9 @@ std::unique_ptr gather_column_buffer::make_string_column_impl( void inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (type.id() == type_id::STRING) { - // size + 1 for final offset. _string_data will be initialized later. - _data = create_data(data_type{type_id::INT32}, size + 1, stream, mr); - } + CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); + // size + 1 for final offset. _string_data will be initialized later. + _data = create_data(data_type{type_id::INT32}, size + 1, stream, mr); } void inline_column_buffer::create_string_data(size_t num_bytes, From a0fb80e7d53dd0ac80b014aa83339788d6c30b7f Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 30 May 2023 13:55:21 -0700 Subject: [PATCH 089/114] get rid of anonymous namespace --- cpp/src/io/parquet/page_decode.cuh | 36 ++++++++++++++---------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index b6b5b684380..2c59b732dc1 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -23,7 +23,6 @@ #include namespace cudf::io::parquet::gpu { -namespace { constexpr int preprocess_block_size = num_rle_stream_decode_threads; // 512 constexpr int decode_block_size = 128; @@ -342,10 +341,10 @@ __device__ cuda::std::pair gpuDecodeDictionaryIndices( * * @return The new output position */ -__device__ int gpuDecodeRleBooleans(volatile page_state_s* s, - volatile page_state_buffers_s* sb, - int target_pos, - int t) +inline __device__ int gpuDecodeRleBooleans(volatile page_state_s* s, + volatile page_state_buffers_s* sb, + int target_pos, + int t) { const uint8_t* end = s->data_end; int pos = s->dict_pos; @@ -549,9 +548,9 @@ __device__ void gpuDecodeStream( * @param[in] valid_mask The validity mask to be stored * @param[in] value_count # of bits in the validity mask */ -__device__ void store_validity(PageNestingDecodeInfo* nesting_info, - uint32_t valid_mask, - int32_t value_count) +inline __device__ void store_validity(PageNestingDecodeInfo* nesting_info, + uint32_t valid_mask, + int32_t value_count) { int word_offset = nesting_info->valid_map_offset / 32; int bit_offset = nesting_info->valid_map_offset % 32; @@ -874,10 +873,10 @@ __device__ void gpuDecodeLevels(page_state_s* s, * * @return The length of the section */ -__device__ uint32_t InitLevelSection(page_state_s* s, - const uint8_t* cur, - const uint8_t* end, - level_type lvl) +inline __device__ uint32_t InitLevelSection(page_state_s* s, + const uint8_t* cur, + const uint8_t* end, + level_type lvl) { int32_t len; int level_bits = s->col.level_bits[lvl]; @@ -947,12 +946,12 @@ __device__ uint32_t InitLevelSection(page_state_s* s, * @param[in] decoders rle_stream decoders which will be used for decoding levels. Optional. 
* Currently only used by gpuComputePageSizes step) */ -__device__ bool setupLocalPageInfo(page_state_s* const s, - PageInfo const* p, - device_span chunks, - size_t min_row, - size_t num_rows, - bool is_decode_step) +inline __device__ bool setupLocalPageInfo(page_state_s* const s, + PageInfo const* p, + device_span chunks, + size_t min_row, + size_t num_rows, + bool is_decode_step) { int t = threadIdx.x; int chunk_idx; @@ -1287,5 +1286,4 @@ __device__ bool setupLocalPageInfo(page_state_s* const s, return true; } -} // namespace } // namespace cudf::io::parquet::gpu From fac2f3f114db084b191bb8de81e19e6d5e413254 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 30 May 2023 15:02:17 -0700 Subject: [PATCH 090/114] delete copy constructor --- cpp/src/io/utilities/column_buffer.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 062b1a9c67f..6610259473d 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -102,6 +102,14 @@ class column_buffer_base { { } + // move constructor + column_buffer_base(column_buffer_base&& col) = default; + column_buffer_base& operator=(column_buffer_base&& col) = default; + + // copy constructor + column_buffer_base(column_buffer_base const& col) = delete; + column_buffer_base& operator=(column_buffer_base const& col) = delete; + // instantiate a column of known type with a specified size. Allows deferred creation for // preprocessing steps such as in the Parquet reader void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From 53b38b8caec67c050e71bbc362a34cb5cc71080c Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 30 May 2023 17:02:21 -0700 Subject: [PATCH 091/114] revert removal of memory resource --- cpp/src/io/avro/reader_impl.cu | 2 +- cpp/src/io/csv/reader_impl.cu | 2 +- cpp/src/io/json/reader_impl.cu | 2 +- cpp/src/io/orc/reader_impl.cu | 2 +- cpp/src/io/parquet/reader_impl.cpp | 6 +-- cpp/src/io/utilities/column_buffer.cpp | 61 +++++++++++--------------- cpp/src/io/utilities/column_buffer.hpp | 26 +++++------ 7 files changed, 42 insertions(+), 59 deletions(-) diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 1f931bb2670..a1be00dff9b 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -591,7 +591,7 @@ table_with_metadata read_avro(std::unique_ptr&& source, mr); for (size_t i = 0; i < column_types.size(); ++i) { - out_columns.emplace_back(make_column(out_buffers[i], nullptr, std::nullopt, stream, mr)); + out_columns.emplace_back(make_column(out_buffers[i], nullptr, std::nullopt, stream)); } } else { // Create empty columns diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index ea8b0680e37..cb6d746047c 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -868,7 +868,7 @@ table_with_metadata read_csv(cudf::io::datasource* source, out_columns.emplace_back( cudf::strings::detail::replace(col->view(), dblquotechar, quotechar, -1, stream, mr)); } else { - out_columns.emplace_back(make_column(out_buffers[i], nullptr, std::nullopt, stream, mr)); + out_columns.emplace_back(make_column(out_buffers[i], nullptr, std::nullopt, stream)); } } } else { diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 4add19bf07c..baec3cb1439 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -556,7 +556,7 @@ table_with_metadata 
convert_data_to_table(parse_options_view const& parse_opts, for (size_t i = 0; i < num_columns; ++i) { out_buffers[i].null_count() = num_records - h_valid_counts[i]; - auto out_column = make_column(out_buffers[i], nullptr, std::nullopt, stream, mr); + auto out_column = make_column(out_buffers[i], nullptr, std::nullopt, stream); if (out_column->type().id() == type_id::STRING) { // Need to remove escape character in case of '\"' and '\\' out_columns.emplace_back(cudf::strings::detail::replace( diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 8058d7c3e2b..07e1bc2dd7f 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -877,7 +877,7 @@ void reader::impl::create_columns(std::vector>&& col_ [&](auto const col_meta) { schema_info.emplace_back(""); auto col_buffer = assemble_buffer(col_meta.id, col_buffers, 0, stream); - return make_column(col_buffer, &schema_info.back(), std::nullopt, stream, _mr); + return make_column(col_buffer, &schema_info.back(), std::nullopt, stream); }); } diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 514509de421..633539b6148 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -129,7 +129,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) data[idx] = out_buf.data(); // only do string buffer for leaf if (out_buf.string_size() == 0 && col_sizes[chunks[c].src_col_index] > 0) { - out_buf.create_string_data(col_sizes[chunks[c].src_col_index], _stream, _mr); + out_buf.create_string_data(col_sizes[chunks[c].src_col_index], _stream); } str_data[idx] = out_buf.string_data(); out_buf.user_data |= @@ -329,9 +329,9 @@ table_with_metadata reader::impl::read_chunk_internal(bool uses_custom_row_bound // Only construct `out_metadata` if `_output_metadata` has not been cached. if (!_output_metadata) { column_name_info& col_name = out_metadata.schema_info.emplace_back(""); - out_columns.emplace_back(make_column(_output_buffers[i], &col_name, metadata, _stream, _mr)); + out_columns.emplace_back(make_column(_output_buffers[i], &col_name, metadata, _stream)); } else { - out_columns.emplace_back(make_column(_output_buffers[i], nullptr, metadata, _stream, _mr)); + out_columns.emplace_back(make_column(_output_buffers[i], nullptr, metadata, _stream)); } } diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index de425ac5b5b..9b8754d6318 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -28,8 +28,7 @@ namespace cudf::io::detail { -void gather_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +void gather_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) { CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); // The contents of _strings will never be directly returned to the user. 
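The through-line of this patch is visible in the hunks on either side: create() is the one place that still receives a device_memory_resource*, it stashes the pointer in _mr, and every later allocation (string data, offsets, null masks) reuses it, so the stream-only signatures keep honoring the caller's resource. A minimal sketch of that capture pattern, independent of the reader; the class and member names here are hypothetical, not the cudf implementation:

#include <cstddef>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

struct mr_capturing_buffer {
  rmm::mr::device_memory_resource* _mr = nullptr;
  rmm::device_buffer _data;
  rmm::device_buffer _string_data;

  // the only entry point that takes a resource; it is remembered for later use
  void create(std::size_t num_bytes,
              rmm::cuda_stream_view stream,
              rmm::mr::device_memory_resource* mr)
  {
    _mr   = mr;
    _data = rmm::device_buffer(num_bytes, stream, _mr);
  }

  // later allocations need only a stream; they reuse the captured resource
  void create_string_data(std::size_t num_bytes, rmm::cuda_stream_view stream)
  {
    _string_data = rmm::device_buffer(num_bytes, stream, _mr);
  }
};
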
@@ -43,33 +42,28 @@ void gather_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream, size, stream, rmm::mr::get_current_device_resource())); } -std::unique_ptr gather_column_buffer::make_string_column_impl( - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) +std::unique_ptr gather_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) { // make_strings_column allocates new memory, it does not simply move // from the inputs, so we need to pass it the memory resource given to // the buffer on construction so that the memory is allocated using the // resource that the calling code expected. - return make_strings_column(*_strings, stream, mr); + return make_strings_column(*_strings, stream, _mr); } -void inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +void inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) { CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); // size + 1 for final offset. _string_data will be initialized later. - _data = create_data(data_type{type_id::INT32}, size + 1, stream, mr); + _data = create_data(data_type{type_id::INT32}, size + 1, stream, _mr); } -void inline_column_buffer::create_string_data(size_t num_bytes, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +void inline_column_buffer::create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) { - _string_data = rmm::device_buffer(num_bytes, stream, mr); + _string_data = rmm::device_buffer(num_bytes, stream, _mr); } -std::unique_ptr inline_column_buffer::make_string_column_impl( - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) +std::unique_ptr inline_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) { // no need for copies, just transfer ownership of the data_buffers to the columns auto const state = mask_state::UNALLOCATED; @@ -79,14 +73,14 @@ std::unique_ptr inline_column_buffer::make_string_column_impl( : std::make_unique(data_type{type_id::INT8}, string_size(), std::move(_string_data), - cudf::detail::create_null_mask(size, state, stream, mr), + cudf::detail::create_null_mask(size, state, stream, _mr), state_null_count(state, size), std::vector>{}); auto offsets_col = std::make_unique(data_type{type_to_id()}, size + 1, std::move(_data), - cudf::detail::create_null_mask(size + 1, state, stream, mr), + cudf::detail::create_null_mask(size + 1, state, stream, _mr), state_null_count(state, size + 1), std::vector>{}); @@ -121,24 +115,23 @@ void column_buffer_base::create(size_type _size, rmm::mr::device_memory_resource* mr) { size = _size; + _mr = mr; switch (type.id()) { - case type_id::STRING: - static_cast(this)->allocate_strings_data(stream, mr); - break; + case type_id::STRING: static_cast(this)->allocate_strings_data(stream); break; // list columns store a buffer of int32's as offsets to represent // their individual rows - case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, mr); break; + case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, _mr); break; // struct columns store no data themselves. just validity and children. 
case type_id::STRUCT: break; - default: _data = create_data(type, size, stream, mr); break; + default: _data = create_data(type, size, stream, _mr); break; } if (is_nullable) { - _null_mask = - cudf::detail::create_null_mask(size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); + _null_mask = cudf::detail::create_null_mask( + size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), _mr); } } @@ -154,8 +147,7 @@ template std::unique_ptr make_column(column_buffer_base& buffer, column_name_info* schema_info, std::optional const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::cuda_stream_view stream) { if (schema_info != nullptr) { schema_info->name = buffer.name; } @@ -171,10 +163,10 @@ std::unique_ptr make_column(column_buffer_base& buffer, // from the inputs, so we need to pass it the memory resource given to // the buffer on construction so that the memory is allocated using the // resource that the calling code expected. - return buffer.make_string_column(stream, mr); + return buffer.make_string_column(stream); } else { // convert to binary - auto const string_col = buffer.make_string_column(stream, mr); + auto const string_col = buffer.make_string_column(stream); auto const num_rows = string_col->size(); auto const null_count = string_col->null_count(); auto col_content = string_col->release(); @@ -220,8 +212,7 @@ std::unique_ptr make_column(column_buffer_base& buffer, // make child column CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); - auto child = - make_column(buffer.children[0], child_info, child_schema, stream, mr); + auto child = make_column(buffer.children[0], child_info, child_schema, stream); // make the final list column (note : size is the # of offsets, so our actual # of rows is 1 // less) @@ -231,7 +222,7 @@ std::unique_ptr make_column(column_buffer_base& buffer, buffer._null_count, std::move(buffer._null_mask), stream, - mr); + buffer._mr); } break; case type_id::STRUCT: { @@ -251,7 +242,7 @@ std::unique_ptr make_column(column_buffer_base& buffer, : std::nullopt; output_children.emplace_back( - make_column(buffer.children[i], child_info, child_schema, stream, mr)); + make_column(buffer.children[i], child_info, child_schema, stream)); } return make_structs_column(buffer.size, @@ -259,7 +250,7 @@ std::unique_ptr make_column(column_buffer_base& buffer, buffer._null_count, std::move(buffer._null_mask), stream, - mr); + buffer._mr); } break; default: { @@ -339,15 +330,13 @@ template std::unique_ptr make_column( string_column_buffer& buffer, column_name_info* schema_info, std::optional const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::cuda_stream_view stream); template std::unique_ptr make_column( pointer_column_buffer& buffer, column_name_info* schema_info, std::optional const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::cuda_stream_view stream); template std::unique_ptr empty_like(string_column_buffer& buffer, column_name_info* schema_info, diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 6610259473d..a701d811b1d 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -82,8 +82,7 @@ template std::unique_ptr make_column(column_buffer_base& buffer, column_name_info* schema_info, std::optional const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::cuda_stream_view stream); 
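The class beginning just below is the CRTP base: column_buffer_base resolves data_impl()/data_size_impl() on its derived type at compile time, so gather_column_buffer and inline_column_buffer can differ in storage without virtual dispatch. A stripped-down sketch of the pattern (buffer_base and small_buffer are illustrative names, not the cudf types):

#include <cstddef>

template <typename Derived>
struct buffer_base {
  // the static_cast to the derived type binds the implementation at compile
  // time; there is no vtable and the call can inline
  std::size_t data_size() const
  {
    return static_cast<Derived const*>(this)->data_size_impl();
  }
};

struct small_buffer : buffer_base<small_buffer> {
  std::size_t bytes = 0;
  std::size_t data_size_impl() const { return bytes; }
};

// usage: small_buffer b; b.bytes = 128; b.data_size() returns 128
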
template class column_buffer_base { @@ -131,16 +130,16 @@ class column_buffer_base { auto data() { return static_cast(this)->data_impl(); } auto data_size() { return static_cast(this)->data_size_impl(); } - std::unique_ptr make_string_column(rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + std::unique_ptr make_string_column(rmm::cuda_stream_view stream) { - return static_cast(this)->make_string_column_impl(stream, mr); + return static_cast(this)->make_string_column_impl(stream); } protected: rmm::device_buffer _data{}; rmm::device_buffer _null_mask{}; size_type _null_count{0}; + rmm::mr::device_memory_resource* _mr; public: data_type type{type_id::EMPTY}; @@ -155,8 +154,7 @@ class column_buffer_base { column_buffer_base& buffer, column_name_info* schema_info, std::optional const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::cuda_stream_view stream); }; // column buffer that uses a string_index_pair for strings data, requiring a gather step when @@ -181,13 +179,12 @@ class gather_column_buffer : public column_buffer_base { create(_size, stream, mr); } - void allocate_strings_data(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + void allocate_strings_data(rmm::cuda_stream_view stream); void* data_impl() { return _strings ? _strings->data() : _data.data(); } size_t data_size_impl() const { return _strings ? _strings->size() : _data.size(); } - std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream); public: std::unique_ptr> _strings; @@ -215,16 +212,13 @@ class inline_column_buffer : public column_buffer_base { create(_size, stream, mr); } - void allocate_strings_data(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + void allocate_strings_data(rmm::cuda_stream_view stream); void* data_impl() { return _data.data(); } size_t data_size_impl() const { return _data.size(); } - std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream); - void create_string_data(size_t num_bytes, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream); void* string_data() { return _string_data.data(); } size_t string_size() const { return _string_data.size(); } From 6408f5b8e99ace43bc5894c65cbd86a53d41eb92 Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 2 Jun 2023 12:18:32 -0700 Subject: [PATCH 092/114] fix some bitrot and add explanation for presence of gpuDecodeStringPageDataV2 --- cpp/src/io/parquet/page_string_decode.cu | 80 ++++++------------------ 1 file changed, 20 insertions(+), 60 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 012d0840fe1..336a055a7cb 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -850,15 +850,12 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( int const t = threadIdx.x; [[maybe_unused]] null_count_back_copier _{s, t}; - // set during string copy by lane 0 - int first_non_null = -1; - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; - // offsets is global...but the output is local, 
so account for that below - if (t == 0) { last_offset = s->page.str_offset; } + // offsets are local to the page + if (t == 0) { last_offset = 0; } __syncthreads(); // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. @@ -877,8 +874,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( return; } - int out_thread0 = s->dict_base && s->dict_bits == 0 ? 32 : 64; - + int out_thread0 = s->dict_base && s->dict_bits == 0 ? 32 : 64; + int const leaf_level_index = s->col.max_nesting_depth - 1; PageNestingDecodeInfo* const nesting_info_base = s->nesting_info; __shared__ level_t rep[non_zero_buffer_size]; // circular buffer of repetition level values @@ -914,20 +911,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( // target_pos for value decoding target_pos = min(s->nz_count, target_pos); - // if this is the first page, then the first non-null entry will have an offset of 0. - // pages that start with a run of nulls will have repeated 0 values, so for the fixing - // of null offsets done at the end, we need to know the last index that should be 0. - if (t == 0 && s->page.str_offset == 0 && first_non_null == -1) { - for (int i = src_pos; i < target_pos; i++) { - int dst_pos = sb->nz_idx[rolling_index(i)]; - if (!has_repetition) { dst_pos -= s->first_row; } - if (dst_pos >= 0) { - first_non_null = dst_pos; - break; - } - } - } - // Decode values src_pos += t; @@ -952,7 +935,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( using cudf::detail::warp_size; auto const use_char_ll = s->page.num_valids > 0 && (s->page.str_bytes / s->page.num_valids) >= warp_size; - int const leaf_level_index = s->col.max_nesting_depth - 1; auto [ptr, len] = src_pos < target_pos && dst_pos >= 0 ? gpuGetStringData(s, sb, src_pos + skipped_leaf_values) @@ -985,9 +967,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( if (dsts[ss] >= 0) { auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dsts[ss]; - *offptr = offsets[ss]; - auto str_ptr = - nesting_info_base[leaf_level_index].string_out + offsets[ss] - s->page.str_offset; + *offptr = lengths[ss]; + auto str_ptr = nesting_info_base[leaf_level_index].string_out + offsets[ss]; ll_strcpy(str_ptr, pointers[ss], lengths[ss], lane_id); } } @@ -995,8 +976,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( if (src_pos < target_pos && dst_pos >= 0) { auto offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dst_pos; - *offptr = offset; - auto str_ptr = nesting_info_base[leaf_level_index].string_out + offset - s->page.str_offset; + *offptr = len; + auto str_ptr = nesting_info_base[leaf_level_index].string_out + offset; memcpy(str_ptr, ptr, len); } } @@ -1010,39 +991,15 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( __syncthreads(); } - // if there are nulls clean up the offsets array. - if (s->page.num_nulls != 0) { - int const leaf_level_index = s->col.max_nesting_depth - 1; - int value_count = nesting_info_base[leaf_level_index].value_count; - - // if no repetition we haven't calculated start/end bounds and instead just skipped - // values until we reach first_row. account for that here. 
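The serial null fix-up removed in this hunk is superseded by a single block-wide exclusive scan: the decode loop now stores per-value string lengths, and the block_excl_sum call just below turns them into offsets in one pass. A simplified sketch of that helper's core (lengths_to_offsets is an illustrative name, and int lengths stand in for size_type):

#include <cub/block/block_scan.cuh>

// convert an array of lengths into exclusive offsets using one thread block,
// looping over tiles when length exceeds the block size
template <int block_size>
__device__ void lengths_to_offsets(int* arr, int length, int initial_value)
{
  using block_scan = cub::BlockScan<int, block_size>;
  __shared__ typename block_scan::TempStorage scan_storage;

  int const t       = threadIdx.x;
  int running_total = initial_value;
  for (int pos = 0; pos < length; pos += block_size) {
    int const idx = pos + t;
    int const len = idx < length ? arr[idx] : 0;

    int offset, tile_total;
    block_scan(scan_storage).ExclusiveSum(len, offset, tile_total);

    if (idx < length) { arr[idx] = offset + running_total; }
    running_total += tile_total;
    __syncthreads();  // TempStorage is reused on the next iteration
  }
}
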
-    if (!has_repetition) { value_count -= s->first_row; }
-
-    auto offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
+  // now turn array of lengths into offsets
+  int value_count = nesting_info_base[leaf_level_index].value_count;

-    if (nesting_info_base[leaf_level_index].null_count > 0) {
-      // if nz_count is 0, then it's all nulls. set all offsets to str_offset
-      if (s->nz_count == 0) {
-        for (int i = t; i < value_count; i += decode_block_size) {
-          offptr[i] = s->page.str_offset;
-        }
-      }
-      // just some nulls, do this serially for now
-      else if (t == 0) {
-        if (first_non_null == -1) { first_non_null = 0; }
+  // if no repetition we haven't calculated start/end bounds and instead just skipped
+  // values until we reach first_row. account for that here.
+  if (!has_repetition) { value_count -= s->first_row; }

-        if (offptr[value_count - 1] == 0 && value_count - 1 != first_non_null) {
-          offptr[value_count - 1] = s->page.str_offset + s->page.str_bytes;
-        }
-        for (int i = value_count - 2; i > first_non_null; i--) {
-          if (offptr[i] == 0) { offptr[i] = offptr[i + 1]; }
-        }
-        offptr[0] = s->page.str_offset;
-      }
-    }
-    __syncthreads();
-  }
+  auto const offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
+  block_excl_sum<decode_block_size>(offptr, value_count, s->page.str_offset);
 }

 __global__ void __launch_bounds__(preprocess_block_size)
@@ -1130,8 +1087,11 @@ void __host__ DecodeStringPageData(hostdevice_vector<PageInfo>& pages,
   dim3 dim_block(decode_block_size, 1);
   dim3 dim_grid(pages.size(), 1);  // 1 threadblock per page

-  // TODO figure out when one version is better than the other. waiting on further changes to
-  // rle_stream to simplify the decode step.
+  // TODO gpuDecodeStringPageDataV2 (needs a better name) is an alternative approach that uses
+  // all threads in the thread block to do the string copies (rather than the original approach,
+  // which uses a single warp). It is faster in some cases and slower in others. It is being left
+  // in, but unused, because it will likely become the only implementation once the dictionary
+  // decoding is modified to use more than a single warp.
   if constexpr (true) {
     if (level_type_size == 1) {
       gpuDecodeStringPageData

From 204d2a24639b65696a2d202b7198fa551ac222c7 Mon Sep 17 00:00:00 2001
From: seidl
Date: Fri, 2 Jun 2023 12:57:40 -0700
Subject: [PATCH 093/114] add const versions of pointer accessors

---
 cpp/src/io/utilities/column_buffer.hpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp
index a701d811b1d..b8d610f8828 100644
--- a/cpp/src/io/utilities/column_buffer.hpp
+++ b/cpp/src/io/utilities/column_buffer.hpp
@@ -128,7 +128,8 @@ class column_buffer_base {
   auto& null_count() { return _null_count; }

   auto data() { return static_cast<string_policy*>(this)->data_impl(); }
-  auto data_size() { return static_cast<string_policy*>(this)->data_size_impl(); }
+  auto data() const { return static_cast<string_policy const*>(this)->data_impl(); }
+  auto data_size() const { return static_cast<string_policy const*>(this)->data_size_impl(); }

   std::unique_ptr<column> make_string_column(rmm::cuda_stream_view stream)
   {
@@ -182,6 +183,7 @@ class gather_column_buffer : public column_buffer_base<gather_column_buffer> {
   void allocate_strings_data(rmm::cuda_stream_view stream);

   void* data_impl() { return _strings ? _strings->data() : _data.data(); }
+  void const* data_impl() const { return _strings ? _strings->data() : _data.data(); }
   size_t data_size_impl() const { return _strings ? 
_strings->size() : _data.size(); } std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream); @@ -215,11 +217,13 @@ class inline_column_buffer : public column_buffer_base { void allocate_strings_data(rmm::cuda_stream_view stream); void* data_impl() { return _data.data(); } + void const* data_impl() const { return _data.data(); } size_t data_size_impl() const { return _data.size(); } std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream); void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream); void* string_data() { return _string_data.data(); } + void const* string_data() const { return _string_data.data(); } size_t string_size() const { return _string_data.size(); } private: From f2028d2fcda6f35a6db8629870581756545fd51f Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 2 Jun 2023 13:23:17 -0700 Subject: [PATCH 094/114] document template params --- cpp/src/io/parquet/page_string_decode.cu | 44 ++++++++++++++---------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 336a055a7cb..0a34bd0fb3d 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -93,7 +93,7 @@ __device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t } /** - * @brief Perform exclusive scan for offsets array. Called for each page. + * @brief Perform exclusive scan on an array of any length using a single block of threads. */ template __device__ void block_excl_sum(size_type* arr, size_type length, size_type initial_value) @@ -126,6 +126,8 @@ __device__ void block_excl_sum(size_type* arr, size_type length, size_type initi * @param has_repetition True if the schema is nested * @param decoders Definition and repetition level decoders * @return pair containing start and end value indexes + * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition levels + * @tparam level_t Type used to store decoded repetition and definition levels */ template __device__ thrust::pair page_bounds(page_state_s* const s, @@ -367,13 +369,13 @@ __device__ thrust::pair page_bounds(page_state_s* const s, * @param start_value Do not count values that occur before this index * @param end_value Do not count values that occur after this index */ -__device__ size_t countDictEntries(uint8_t const* data, - uint8_t const* dict_base, - int dict_bits, - int dict_size, - int data_size, - int start_value, - int end_value) +__device__ size_t totalDictEntriesSize(uint8_t const* data, + uint8_t const* dict_base, + int dict_bits, + int dict_size, + int data_size, + int start_value, + int end_value) { int const t = threadIdx.x; uint8_t const* ptr = data; @@ -493,10 +495,10 @@ __device__ size_t countDictEntries(uint8_t const* data, * @param start_value Do not count values that occur before this index * @param end_value Do not count values that occur after this index */ -__device__ size_t countPlainEntries(uint8_t const* data, - int data_size, - int start_value, - int end_value) +__device__ size_t totalPlainEntriesSize(uint8_t const* data, + int data_size, + int start_value, + int end_value) { int const t = threadIdx.x; int pos = 0; @@ -538,6 +540,8 @@ __device__ size_t countPlainEntries(uint8_t const* data, * @param chunks All chunks to be decoded * @param min_rows crop all rows below min_row * @param num_rows Maximum number of rows to read + * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition 
levels + * @tparam level_t Type used to store decoded repetition and definition levels */ template __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( @@ -614,12 +618,12 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // FIXME: need to return an error condition...this won't actually do anything if (s->dict_bits > 32 || !dict_base) { CUDF_UNREACHABLE("invalid dictionary bit size"); } - str_bytes = countDictEntries( + str_bytes = totalDictEntriesSize( data, dict_base, s->dict_bits, dict_size, (end - data), start_value, end_value); break; case Encoding::PLAIN: dict_size = static_cast(end - data); - str_bytes = is_bounds_pg ? countPlainEntries(data, dict_size, start_value, end_value) + str_bytes = is_bounds_pg ? totalPlainEntriesSize(data, dict_size, start_value, end_value) : dict_size - sizeof(int) * (pp->num_input_values - pp->num_nulls); break; } @@ -642,6 +646,8 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz * @param chunks List of column chunks * @param min_row Row index to start reading at * @param num_rows Maximum number of rows to read + * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition levels + * @tparam level_t Type used to store decoded repetition and definition levels */ template __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( @@ -688,8 +694,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( int const leaf_level_index = s->col.max_nesting_depth - 1; PageNestingDecodeInfo* const nesting_info_base = s->nesting_info; - __shared__ level_t rep[non_zero_buffer_size]; // circular buffer of repetition level values - __shared__ level_t def[non_zero_buffer_size]; // circular buffer of definition level values + __shared__ level_t rep[lvl_buf_size]; // circular buffer of repetition level values + __shared__ level_t def[lvl_buf_size]; // circular buffer of definition level values // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t skipped_leaf_values = s->page.skipped_leaf_values; @@ -832,6 +838,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( * @param chunks List of column chunks * @param min_row Row index to start reading at * @param num_rows Maximum number of rows to read + * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition levels + * @tparam level_t Type used to store decoded repetition and definition levels */ template __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( @@ -878,8 +886,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( int const leaf_level_index = s->col.max_nesting_depth - 1; PageNestingDecodeInfo* const nesting_info_base = s->nesting_info; - __shared__ level_t rep[non_zero_buffer_size]; // circular buffer of repetition level values - __shared__ level_t def[non_zero_buffer_size]; // circular buffer of definition level values + __shared__ level_t rep[lvl_buf_size]; // circular buffer of repetition level values + __shared__ level_t def[lvl_buf_size]; // circular buffer of definition level values // skipped_leaf_values will always be 0 for flat hierarchies. 
uint32_t skipped_leaf_values = s->page.skipped_leaf_values; From 2a9b0ffb7d455d657da4be8d63755629d0ec1e0d Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 6 Jun 2023 11:38:06 -0700 Subject: [PATCH 095/114] only need one version of is_string_col --- cpp/src/io/parquet/page_decode.cuh | 9 --------- cpp/src/io/parquet/parquet_gpu.hpp | 9 +++++++++ cpp/src/io/parquet/reader_impl.cpp | 6 +----- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 2c59b732dc1..2b738c33bda 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -108,15 +108,6 @@ struct null_count_back_copier { } }; -/** - * @brief Test if the given page is in a string column - */ -constexpr bool is_string_col(ColumnChunkDesc const& chunk) -{ - return (chunk.data_type & 7) == BYTE_ARRAY and (chunk.data_type >> 3) != 4 and - chunk.converted_type != DECIMAL; -} - /** * @brief Test if the given page is in a string column */ diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 2d3c6f655e7..5f4e7977d7b 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -424,6 +424,15 @@ struct EncPage { compression_result* comp_res; //!< Ptr to compression result }; +/** + * @brief Test if the given column chunk is in a string column + */ +constexpr bool is_string_col(ColumnChunkDesc const& chunk) +{ + return (chunk.data_type & 7) == BYTE_ARRAY and (chunk.data_type >> 3) != 4 and + chunk.converted_type != DECIMAL; +} + /** * @brief Launches kernel for parsing the page headers in the column chunks * diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 633539b6148..bc7fcd71651 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -43,11 +43,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // doing a gather operation later on. // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). 
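The lambda removed just below duplicated this predicate at its only call site; with is_string_col hoisted into parquet_gpu.hpp as a constexpr free function, std::any_of on the host and device kernels can share one definition (a plain constexpr function is also callable from device code under nvcc's --expt-relaxed-constexpr, which cuDF's build enables). A simplified sketch of the idea with a stand-in descriptor rather than the real bit-packed data_type field:

#include <algorithm>
#include <vector>

// simplified stand-in: the numeric ids below are the Parquet enum values for
// Type::BYTE_ARRAY and ConvertedType::DECIMAL
struct chunk_desc {
  int physical_type;   // parquet::Type
  int converted_type;  // parquet::ConvertedType
};

constexpr bool is_string_chunk(chunk_desc const& c)
{
  constexpr int BYTE_ARRAY = 6;
  constexpr int DECIMAL    = 5;
  return c.physical_type == BYTE_ARRAY and c.converted_type != DECIMAL;
}

bool any_strings(std::vector<chunk_desc> const& chunks)
{
  return std::any_of(chunks.begin(), chunks.end(), is_string_chunk);
}
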
- auto is_string_col = [](gpu::ColumnChunkDesc const& chunk) { - return (chunk.data_type & 7) == BYTE_ARRAY && (chunk.data_type >> 3) != 4 && - chunk.converted_type != DECIMAL; - }; - auto const has_strings = std::any_of(chunks.begin(), chunks.end(), is_string_col); + auto const has_strings = std::any_of(chunks.begin(), chunks.end(), gpu::is_string_col); std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { From 798913516602ca17254b0747888f6191a9747870 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 7 Jun 2023 09:24:31 -0700 Subject: [PATCH 096/114] check for string overflow --- cpp/src/io/parquet/page_string_decode.cu | 8 ++++---- cpp/src/io/parquet/parquet_gpu.hpp | 2 +- cpp/src/io/parquet/reader_impl.cpp | 10 ++++++++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 0a34bd0fb3d..90a63a630ac 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -1011,7 +1011,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( } __global__ void __launch_bounds__(preprocess_block_size) - gpuComputePageOffsets(device_span chunks, device_span col_sizes) + gpuComputePageOffsets(device_span chunks, device_span col_sizes) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; @@ -1054,7 +1054,7 @@ __global__ void __launch_bounds__(preprocess_block_size) */ void ComputePageStringSizes(hostdevice_vector& pages, hostdevice_vector const& chunks, - std::vector& col_sizes, + std::vector& col_sizes, size_t min_row, size_t num_rows, int level_type_size, @@ -1070,11 +1070,11 @@ void ComputePageStringSizes(hostdevice_vector& pages, <<>>(pages.device_ptr(), chunks, min_row, num_rows); } - rmm::device_uvector d_col_sizes(col_sizes.size(), stream); + rmm::device_uvector d_col_sizes(col_sizes.size(), stream); gpuComputePageOffsets<<>>(chunks, d_col_sizes); cudaMemcpyAsync(col_sizes.data(), d_col_sizes.data(), - sizeof(size_type) * col_sizes.size(), + sizeof(size_t) * col_sizes.size(), cudaMemcpyDeviceToHost, stream); stream.synchronize(); diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 5f4e7977d7b..d9f713810ee 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -504,7 +504,7 @@ void ComputePageSizes(hostdevice_vector& pages, */ void ComputePageStringSizes(hostdevice_vector& pages, hostdevice_vector const& chunks, - std::vector& col_sizes, + std::vector& col_sizes, size_t min_row, size_t num_rows, int level_type_size, diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index bc7fcd71651..2aed6f56d66 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -45,10 +45,16 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // chunked reader). 
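The check added in the hunk below exists because cudf string columns address their character data with 32-bit size_type offsets, while the per-column totals now accumulate in size_t; any total past numeric_limits<size_type>::max() is simply unrepresentable. A standalone version of that guard (require_addressable is a hypothetical name):

#include <cstdint>
#include <limits>
#include <stdexcept>
#include <vector>

// throw if any column's total character bytes exceed what a 32-bit offset
// column can address (~2GB)
void require_addressable(std::vector<std::size_t> const& col_sizes)
{
  auto const limit = static_cast<std::size_t>(std::numeric_limits<std::int32_t>::max());
  for (auto const sz : col_sizes) {
    if (sz > limit) { throw std::overflow_error("string column exceeds the 2GB offset limit"); }
  }
}
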
auto const has_strings = std::any_of(chunks.begin(), chunks.end(), gpu::is_string_col); - std::vector col_sizes(_input_columns.size(), 0L); + std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { gpu::ComputePageStringSizes( pages, chunks, col_sizes, skip_rows, num_rows, _file_itm_data.level_type_size, _stream); + // check for overflow + if (std::any_of(col_sizes.begin(), col_sizes.end(), [](size_t sz) { + return sz > std::numeric_limits::max(); + })) { + CUDF_FAIL("String column exceeds 2GB limit"); + } } // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector @@ -187,7 +193,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { // need to cap off the string offsets column - size_type const sz = col_sizes[idx]; + size_type const sz = static_cast(col_sizes[idx]); cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, &sz, sizeof(size_type), From 1bfece45972a18b2e923e6133f21ba83af5d0565 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 7 Jun 2023 09:58:19 -0700 Subject: [PATCH 097/114] more size_type -> size_t --- cpp/src/io/parquet/page_string_decode.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 90a63a630ac..b7580f98260 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -1013,7 +1013,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageOffsets(device_span chunks, device_span col_sizes) { - using block_scan = cub::BlockScan; + using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; auto const t = threadIdx.x; @@ -1025,16 +1025,16 @@ __global__ void __launch_bounds__(preprocess_block_size) // short circuit return if this is not a string column if (not is_string_col(chunk)) { return; } - size_type cumulative_offset = col_sizes[col_index]; + size_t cumulative_offset = col_sizes[col_index]; for (int i = 0; i < chunk.max_num_pages; i += preprocess_block_size) { - int idx = i + t; - size_type len = idx < chunk.max_num_pages and - (chunk.page_info[idx].flags & gpu::PAGEINFO_FLAGS_DICTIONARY) == 0 - ? chunk.page_info[idx].str_bytes - : 0; + int const idx = i + t; + size_t const len = idx < chunk.max_num_pages and + (chunk.page_info[idx].flags & gpu::PAGEINFO_FLAGS_DICTIONARY) == 0 + ? 
chunk.page_info[idx].str_bytes + : 0; - size_type offset, block_total; + size_t offset, block_total; block_scan(scan_storage).ExclusiveSum(len, offset, block_total); if (idx < chunk.max_num_pages) { chunk.page_info[idx].str_offset = offset + cumulative_offset; From 6a3400f7a2256ee9a4dfa4e01c980304f45369f9 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 7 Jun 2023 12:40:33 -0700 Subject: [PATCH 098/114] implement suggestion from review Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- cpp/src/io/parquet/page_decode.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 2b738c33bda..a3b2bccbc53 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -19,9 +19,10 @@ #include "parquet_gpu.hpp" #include "rle_stream.cuh" -#include #include +#include + namespace cudf::io::parquet::gpu { constexpr int preprocess_block_size = num_rle_stream_decode_threads; // 512 From 59bfd1be94bba846599c8989d0a91f338ee558b8 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 8 Jun 2023 17:07:45 -0700 Subject: [PATCH 099/114] use thrust to calculate page string offsets use src_col_index rather than src_col_schema to generate page_keys (src_col_index honors column ordering) --- cpp/src/io/parquet/page_string_decode.cu | 47 ---------- cpp/src/io/parquet/parquet_gpu.hpp | 2 - cpp/src/io/parquet/reader_impl.cpp | 5 +- cpp/src/io/parquet/reader_impl.hpp | 7 ++ cpp/src/io/parquet/reader_impl_preprocess.cu | 94 ++++++++++++++++++-- 5 files changed, 98 insertions(+), 57 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index b7580f98260..e9d1418482f 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -1010,43 +1010,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( block_excl_sum(offptr, value_count, s->page.str_offset); } -__global__ void __launch_bounds__(preprocess_block_size) - gpuComputePageOffsets(device_span chunks, device_span col_sizes) -{ - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - - auto const t = threadIdx.x; - auto const col_index = blockIdx.x; - col_sizes[col_index] = 0; - - for (auto const& chunk : chunks) { - if (chunk.src_col_index == col_index) { - // short circuit return if this is not a string column - if (not is_string_col(chunk)) { return; } - - size_t cumulative_offset = col_sizes[col_index]; - - for (int i = 0; i < chunk.max_num_pages; i += preprocess_block_size) { - int const idx = i + t; - size_t const len = idx < chunk.max_num_pages and - (chunk.page_info[idx].flags & gpu::PAGEINFO_FLAGS_DICTIONARY) == 0 - ? 
chunk.page_info[idx].str_bytes - : 0; - - size_t offset, block_total; - block_scan(scan_storage).ExclusiveSum(len, offset, block_total); - if (idx < chunk.max_num_pages) { - chunk.page_info[idx].str_offset = offset + cumulative_offset; - } - cumulative_offset += block_total; - } - if (t == 0) { col_sizes[col_index] = cumulative_offset; } - __syncthreads(); - } - } -} - } // anonymous namespace /** @@ -1054,7 +1017,6 @@ __global__ void __launch_bounds__(preprocess_block_size) */ void ComputePageStringSizes(hostdevice_vector& pages, hostdevice_vector const& chunks, - std::vector& col_sizes, size_t min_row, size_t num_rows, int level_type_size, @@ -1069,15 +1031,6 @@ void ComputePageStringSizes(hostdevice_vector& pages, gpuComputePageStringSizes <<>>(pages.device_ptr(), chunks, min_row, num_rows); } - - rmm::device_uvector d_col_sizes(col_sizes.size(), stream); - gpuComputePageOffsets<<>>(chunks, d_col_sizes); - cudaMemcpyAsync(col_sizes.data(), - d_col_sizes.data(), - sizeof(size_t) * col_sizes.size(), - cudaMemcpyDeviceToHost, - stream); - stream.synchronize(); } /** diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index d9f713810ee..66b588b79aa 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -497,14 +497,12 @@ void ComputePageSizes(hostdevice_vector& pages, * * @param[in,out] pages All pages to be decoded * @param[in] chunks All chunks to be decoded - * @param[out] col_sizes On output, contains total size of string data for each column * @param[in] min_rows crop all rows below min_row * @param[in] num_rows Maximum number of rows to read * @param[in] stream CUDA stream to use, default 0 */ void ComputePageStringSizes(hostdevice_vector& pages, hostdevice_vector const& chunks, - std::vector& col_sizes, size_t min_row, size_t num_rows, int level_type_size, diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 2aed6f56d66..fb3723157dc 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -48,7 +48,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { gpu::ComputePageStringSizes( - pages, chunks, col_sizes, skip_rows, num_rows, _file_itm_data.level_type_size, _stream); + pages, chunks, skip_rows, num_rows, _file_itm_data.level_type_size, _stream); + + col_sizes = calculate_page_string_offsets(); + // check for overflow if (std::any_of(col_sizes.begin(), col_sizes.end(), [](size_t sz) { return sz > std::numeric_limits::max(); diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 3d8c71c6c63..d25bf1e4c1f 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -220,6 +220,13 @@ class reader::impl { */ void allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds); + /** + * @brief Calculate per-page offsets for string data + * + * @return Vector of total string data sizes for each column + */ + std::vector calculate_page_string_offsets(); + /** * @brief Converts the page data and outputs to columns. 
* diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 4433561ff1b..00259ef04b4 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1256,8 +1256,12 @@ struct get_page_num_rows { __device__ size_type operator()(gpu::PageInfo const& page) { return page.num_rows; } }; -struct get_page_schema { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.src_col_schema; } +struct get_page_column_index { + gpu::ColumnChunkDesc const* chunks; + __device__ size_type operator()(gpu::PageInfo const& page) + { + return chunks[page.chunk_idx].src_col_index; + } }; struct input_col_info { @@ -1480,6 +1484,43 @@ void detect_malformed_pages(hostdevice_vector& pages, } } +struct page_to_string_size { + gpu::PageInfo* pages; + gpu::ColumnChunkDesc const* chunks; + + __device__ size_t operator()(size_type page_idx) const + { + auto const page = pages[page_idx]; + auto const chunk = chunks[page.chunk_idx]; + + if (not is_string_col(chunk) || (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) != 0) { + return 0; + } + return pages[page_idx].str_bytes; + } +}; + +struct page_offset_output_iter { + gpu::PageInfo* p; + size_type const* index; + + using value_type = size_type; + using difference_type = size_type; + using pointer = size_type*; + using reference = size_type&; + using iterator_category = thrust::output_device_iterator_tag; + + __host__ __device__ page_offset_output_iter operator+(int i) + { + return page_offset_output_iter{p, index + i}; + } + + __host__ __device__ void operator++() { index++; } + + __device__ reference operator[](int i) { return p[index[i]].str_offset; } + __device__ reference operator*() { return p[*index].str_offset; } +}; + } // anonymous namespace void reader::impl::preprocess_pages(size_t skip_rows, @@ -1516,7 +1557,7 @@ void reader::impl::preprocess_pages(size_t skip_rows, pages.device_ptr(), pages.device_ptr() + pages.size(), page_keys.begin(), - get_page_schema{}); + get_page_column_index{chunks.device_ptr()}); thrust::sequence(rmm::exec_policy(_stream), page_index.begin(), page_index.end()); thrust::stable_sort_by_key(rmm::exec_policy(_stream), @@ -1635,16 +1676,16 @@ void reader::impl::preprocess_pages(size_t skip_rows, page_input, chunk_row_output_iter{pages.device_ptr()}); - // preserve page ordering data - _chunk_itm_data.page_keys = std::move(page_keys); - _chunk_itm_data.page_index = std::move(page_index); - // retrieve pages back pages.device_to_host(_stream, true); // print_pages(pages, _stream); } + // preserve page ordering data for string decoder + _chunk_itm_data.page_keys = std::move(page_keys); + _chunk_itm_data.page_index = std::move(page_index); + // compute splits if necessary. otherwise return a single split representing // the whole file. 
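Before the split computation that follows, it helps to pin down what the scan in calculate_page_string_offsets (below) produces: page_keys holds the source column index for each page, so exclusive_scan_by_key restarts the running byte total at every column boundary, giving each page its starting offset within its column's character buffer. A host-side toy run of the same primitive:

#include <thrust/execution_policy.h>
#include <thrust/scan.h>

int main()
{
  int keys[5]  = {0, 0, 0, 1, 1};  // source column index for each page
  int sizes[5] = {5, 3, 2, 7, 4};  // per-page string byte counts
  int offs[5]  = {};
  thrust::exclusive_scan_by_key(thrust::host, keys, keys + 5, sizes, offs);
  // offs == {0, 5, 8, 0, 7}: the running total restarts at each new key
}
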
_chunk_read_info = chunk_read_limit > 0
@@ -1787,4 +1828,43 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds)
   }
 }

+std::vector<size_t> reader::impl::calculate_page_string_offsets()
+{
+  auto& chunks           = _file_itm_data.chunks;
+  auto& pages            = _file_itm_data.pages_info;
+  auto const& page_keys  = _chunk_itm_data.page_keys;
+  auto const& page_index = _chunk_itm_data.page_index;
+
+  std::vector<size_t> col_sizes(_input_columns.size(), 0L);
+  rmm::device_uvector<size_t> d_col_sizes(col_sizes.size(), _stream);
+  // page key is column index, but need to sort by key to make all pages for a column contiguous
+  auto val_iter = thrust::make_transform_iterator(
+    page_index.begin(), page_to_string_size{pages.device_ptr(), chunks.device_ptr()});
+
+  rmm::device_uvector<size_type> page_offsets(pages.size(), _stream);
+  thrust::exclusive_scan_by_key(rmm::exec_policy(_stream),
+                                page_keys.begin(),
+                                page_keys.end(),
+                                val_iter,
+                                page_offset_output_iter{pages.device_ptr(), page_index.data()});
+
+  // now sum up page sizes
+  rmm::device_uvector<int32_t> red_keys(col_sizes.size(), _stream);
+  thrust::reduce_by_key(rmm::exec_policy(_stream),
+                        page_keys.begin(),
+                        page_keys.end(),
+                        val_iter,
+                        red_keys.begin(),
+                        d_col_sizes.begin());
+
+  cudaMemcpyAsync(col_sizes.data(),
+                  d_col_sizes.data(),
+                  sizeof(size_t) * col_sizes.size(),
+                  cudaMemcpyDeviceToHost,
+                  _stream);
+  _stream.synchronize();
+
+  return col_sizes;
+}
+
 } // namespace cudf::io::detail::parquet

From 5f3e5affd3457652e58deba0001dad9f54000b75 Mon Sep 17 00:00:00 2001
From: seidl
Date: Thu, 8 Jun 2023 17:27:10 -0700
Subject: [PATCH 100/114] some cleanup

---
 cpp/src/io/parquet/reader_impl_preprocess.cu | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 00259ef04b4..05ea164361c 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -1837,11 +1837,12 @@ std::vector<size_t> reader::impl::calculate_page_string_offsets()

   std::vector<size_t> col_sizes(_input_columns.size(), 0L);
   rmm::device_uvector<size_t> d_col_sizes(col_sizes.size(), _stream);
-  // page key is column index, but need to sort by key to make all pages for a column contiguous
+
+  // use page_index to fetch page string sizes in the proper order
   auto val_iter = thrust::make_transform_iterator(
     page_index.begin(), page_to_string_size{pages.device_ptr(), chunks.device_ptr()});

-  rmm::device_uvector<size_type> page_offsets(pages.size(), _stream);
+  // do scan by key to calculate string offsets for each page
   thrust::exclusive_scan_by_key(rmm::exec_policy(_stream),
                                 page_keys.begin(),
                                 page_keys.end(),
                                 val_iter,
@@ -1849,12 +1850,12 @@ std::vector<size_t> reader::impl::calculate_page_string_offsets()
                                 page_offset_output_iter{pages.device_ptr(), page_index.data()});

   // now sum up page sizes
-  rmm::device_uvector<int32_t> red_keys(col_sizes.size(), _stream);
+  rmm::device_uvector<int32_t> reduce_keys(col_sizes.size(), _stream);
   thrust::reduce_by_key(rmm::exec_policy(_stream),
                         page_keys.begin(),
                         page_keys.end(),
                         val_iter,
-                        red_keys.begin(),
+                        reduce_keys.begin(),
                         d_col_sizes.begin());

   cudaMemcpyAsync(col_sizes.data(),

From cbd742fa37a6f13524925022ac3bb51a7f04e15b Mon Sep 17 00:00:00 2001
From: seidl
Date: Fri, 9 Jun 2023 08:28:30 -0700
Subject: [PATCH 101/114] throw std::overflow_error if string column gets too big

---
 cpp/src/io/parquet/parquet_gpu.hpp | 2 ++
 cpp/src/io/parquet/reader_impl.cpp | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/parquet/parquet_gpu.hpp 
b/cpp/src/io/parquet/parquet_gpu.hpp index 9079b797d31..c50fbdcd3b3 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -499,6 +499,7 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, * @param[in] chunks All chunks to be decoded * @param[in] min_rows crop all rows below min_row * @param[in] num_rows Maximum number of rows to read + * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] stream CUDA stream to use, default 0 */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, @@ -538,6 +539,7 @@ void DecodePageData(cudf::detail::hostdevice_vector& pages, * @param[in] chunks All chunks to be decoded * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read + * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] stream CUDA stream to use, default 0 */ void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index da37d80f27b..dfa08fcdf05 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -56,7 +56,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) if (std::any_of(col_sizes.begin(), col_sizes.end(), [](size_t sz) { return sz > std::numeric_limits::max(); })) { - CUDF_FAIL("String column exceeds 2GB limit"); + CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); } } From 967ecf66e4c79711425c7a24de6ec1557cf46bb9 Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 9 Jun 2023 11:39:15 -0700 Subject: [PATCH 102/114] only allocate memory for string nesting data if there are string columns --- cpp/src/io/parquet/page_decode.cuh | 6 ++++-- cpp/src/io/parquet/reader_impl.cpp | 16 +++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index a3b2bccbc53..85cd099e0c5 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -1141,8 +1141,10 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } if (s->col.column_data_base != nullptr) { - nesting_info->data_out = static_cast(s->col.column_data_base[idx]); - nesting_info->string_out = static_cast(s->col.column_string_base[idx]); + nesting_info->data_out = static_cast(s->col.column_data_base[idx]); + if (s->col.column_string_base != nullptr) { + nesting_info->string_out = static_cast(s->col.column_string_base[idx]); + } nesting_info->data_out = static_cast(s->col.column_data_base[idx]); diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index dfa08fcdf05..6d1930e9c60 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -65,9 +65,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // offset into `chunk_nested_data`/`chunk_nested_valids` for the array of pointers for chunk `i` auto chunk_nested_valids = cudf::detail::hostdevice_vector(sum_max_depths, _stream); - auto chunk_nested_data = cudf::detail::hostdevice_vector(sum_max_depths, _stream); - auto chunk_nested_str_data = cudf::detail::hostdevice_vector(sum_max_depths, _stream); - auto chunk_offsets = std::vector(); + auto chunk_nested_data = cudf::detail::hostdevice_vector(sum_max_depths, _stream); + auto chunk_offsets = std::vector(); + auto chunk_nested_str_data = + cudf::detail::hostdevice_vector(has_strings 
? sum_max_depths : 0, _stream); // Update chunks with pointers to column data. for (size_t c = 0, page_count = 0, chunk_off = 0; c < chunks.size(); c++) { @@ -88,8 +89,9 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) auto data = chunk_nested_data.host_ptr(chunk_off); chunks[c].column_data_base = chunk_nested_data.device_ptr(chunk_off); - auto str_data = chunk_nested_str_data.host_ptr(chunk_off); - chunks[c].column_string_base = chunk_nested_str_data.device_ptr(chunk_off); + auto str_data = has_strings ? chunk_nested_str_data.host_ptr(chunk_off) : nullptr; + chunks[c].column_string_base = + has_strings ? chunk_nested_str_data.device_ptr(chunk_off) : nullptr; chunk_off += max_depth; @@ -137,7 +139,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) if (out_buf.string_size() == 0 && col_sizes[chunks[c].src_col_index] > 0) { out_buf.create_string_data(col_sizes[chunks[c].src_col_index], _stream); } - str_data[idx] = out_buf.string_data(); + if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= static_cast(input_col.schema_idx) & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; } else { @@ -153,11 +155,11 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunks.host_to_device_async(_stream); chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); - chunk_nested_str_data.host_to_device_async(_stream); // TODO: explore launching these concurrently with a stream pool gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); if (has_strings) { + chunk_nested_str_data.host_to_device_async(_stream); gpu::DecodeStringPageData( pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); } From 0d1ac33bf064eb2050f0bb8ab0d8250e562a4092 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 12 Jun 2023 09:16:32 -0700 Subject: [PATCH 103/114] east const for new files --- cpp/src/io/parquet/page_decode.cuh | 64 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 85cd099e0c5..f4d9cdd2df7 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -37,10 +37,10 @@ constexpr int rolling_lvl_index(int index) } struct page_state_s { - const uint8_t* data_start; - const uint8_t* data_end; - const uint8_t* lvl_end; - const uint8_t* dict_base; // ptr to dictionary page data + uint8_t const* data_start; + uint8_t const* data_end; + uint8_t const* lvl_end; + uint8_t const* dict_base; // ptr to dictionary page data int32_t dict_size; // size of dictionary data int32_t first_row; // First row in page to output int32_t num_rows; // Rows in page to decode (including rows to be skipped) @@ -67,9 +67,9 @@ struct page_state_s { int32_t input_value_count; // how many values of the input we've processed int32_t input_row_count; // how many rows of the input we've processed int32_t input_leaf_count; // how many leaf values of the input we've processed - const uint8_t* lvl_start[NUM_LEVEL_TYPES]; // [def,rep] - const uint8_t* abs_lvl_start[NUM_LEVEL_TYPES]; // [def,rep] - const uint8_t* abs_lvl_end[NUM_LEVEL_TYPES]; // [def,rep] + uint8_t const* lvl_start[NUM_LEVEL_TYPES]; // [def,rep] + uint8_t const* abs_lvl_start[NUM_LEVEL_TYPES]; // [def,rep] + uint8_t const* abs_lvl_end[NUM_LEVEL_TYPES]; // [def,rep] int32_t lvl_count[NUM_LEVEL_TYPES]; // how many of each of the streams we've decoded int32_t 
row_index_lower_bound; // lower bound of row indices we should process @@ -175,10 +175,10 @@ inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row * * @return A pair containing a pointer to the string and its length */ -inline __device__ cuda::std::pair gpuGetStringData( - volatile page_state_s* s, volatile page_state_buffers_s* sb, int src_pos) +inline __device__ cuda::std::pair gpuGetStringData( + page_state_s volatile* s, page_state_buffers_s volatile* sb, int src_pos) { - const char* ptr = nullptr; + char const* ptr = nullptr; size_t len = 0; if (s->dict_base) { @@ -186,7 +186,7 @@ inline __device__ cuda::std::pair gpuGetStringData( uint32_t dict_pos = (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] * sizeof(string_index_pair) : 0; if (dict_pos < (uint32_t)s->dict_size) { - const auto* src = reinterpret_cast(s->dict_base + dict_pos); + auto const* src = reinterpret_cast(s->dict_base + dict_pos); ptr = src->first; len = src->second; } @@ -194,7 +194,7 @@ inline __device__ cuda::std::pair gpuGetStringData( // Plain encoding uint32_t dict_pos = sb->dict_idx[rolling_index(src_pos)]; if (dict_pos <= (uint32_t)s->dict_size) { - ptr = reinterpret_cast(s->data_start + dict_pos); + ptr = reinterpret_cast(s->data_start + dict_pos); len = sb->str_len[rolling_index(src_pos)]; } } @@ -218,12 +218,12 @@ inline __device__ cuda::std::pair gpuGetStringData( */ template __device__ cuda::std::pair gpuDecodeDictionaryIndices( - volatile page_state_s* s, - [[maybe_unused]] volatile page_state_buffers_s* sb, + page_state_s volatile* s, + [[maybe_unused]] page_state_buffers_s volatile* sb, int target_pos, int t) { - const uint8_t* end = s->data_end; + uint8_t const* end = s->data_end; int dict_bits = s->dict_bits; int pos = s->dict_pos; int str_len = 0; @@ -232,7 +232,7 @@ __device__ cuda::std::pair gpuDecodeDictionaryIndices( int is_literal, batch_len; if (!t) { uint32_t run = s->dict_run; - const uint8_t* cur = s->data_start; + uint8_t const* cur = s->data_start; if (run <= 1) { run = (cur < end) ? get_vlq32(cur, end) : 0; if (!(run & 1)) { @@ -278,7 +278,7 @@ __device__ cuda::std::pair gpuDecodeDictionaryIndices( dict_idx = s->dict_val; if (is_literal) { int32_t ofs = (t - ((batch_len + 7) & ~7)) * dict_bits; - const uint8_t* p = s->data_start + (ofs >> 3); + uint8_t const* p = s->data_start + (ofs >> 3); ofs &= 7; if (p < end) { uint32_t c = 8 - ofs; @@ -333,19 +333,19 @@ __device__ cuda::std::pair gpuDecodeDictionaryIndices( * * @return The new output position */ -inline __device__ int gpuDecodeRleBooleans(volatile page_state_s* s, - volatile page_state_buffers_s* sb, +inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s, + page_state_buffers_s volatile* sb, int target_pos, int t) { - const uint8_t* end = s->data_end; + uint8_t const* end = s->data_end; int pos = s->dict_pos; while (pos < target_pos) { int is_literal, batch_len; if (!t) { uint32_t run = s->dict_run; - const uint8_t* cur = s->data_start; + uint8_t const* cur = s->data_start; if (run <= 1) { run = (cur < end) ? get_vlq32(cur, end) : 0; if (!(run & 1)) { @@ -378,7 +378,7 @@ inline __device__ int gpuDecodeRleBooleans(volatile page_state_s* s, int dict_idx; if (is_literal) { int32_t ofs = t - ((batch_len + 7) & ~7); - const uint8_t* p = s->data_start + (ofs >> 3); + uint8_t const* p = s->data_start + (ofs >> 3); dict_idx = (p < end) ? 
(p[0] >> (ofs & 7u)) & 1 : 0; } else { dict_idx = s->dict_val; @@ -402,8 +402,8 @@ inline __device__ int gpuDecodeRleBooleans(volatile page_state_s* s, * @return Total length of strings processed */ template -__device__ size_type gpuInitStringDescriptors(volatile page_state_s* s, - [[maybe_unused]] volatile page_state_buffers_s* sb, +__device__ size_type gpuInitStringDescriptors(page_state_s volatile* s, + [[maybe_unused]] page_state_buffers_s volatile* sb, int target_pos, int t) { @@ -412,7 +412,7 @@ __device__ size_type gpuInitStringDescriptors(volatile page_state_s* s, // This step is purely serial if (!t) { - const uint8_t* cur = s->data_start; + uint8_t const* cur = s->data_start; int dict_size = s->dict_size; int k = s->dict_val; @@ -452,8 +452,8 @@ template __device__ void gpuDecodeStream( level_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl) { - const uint8_t* cur_def = s->lvl_start[lvl]; - const uint8_t* end = s->lvl_end; + uint8_t const* cur_def = s->lvl_start[lvl]; + uint8_t const* end = s->lvl_end; uint32_t level_run = s->initial_rle_run[lvl]; int32_t level_val = s->initial_rle_value[lvl]; int level_bits = s->col.level_bits[lvl]; @@ -467,7 +467,7 @@ __device__ void gpuDecodeStream( // Get a new run symbol from the byte stream int sym_len = 0; if (!t) { - const uint8_t* cur = cur_def; + uint8_t const* cur = cur_def; if (cur < end) { level_run = get_vlq32(cur, end); } if (!(level_run & 1)) { if (cur < end) level_val = cur[0]; @@ -496,7 +496,7 @@ __device__ void gpuDecodeStream( batch_len8 = (batch_len + 7) >> 3; if (t < batch_len) { int bitpos = t * level_bits; - const uint8_t* cur = cur_def + (bitpos >> 3); + uint8_t const* cur = cur_def + (bitpos >> 3); bitpos &= 7; if (cur < end) level_val = cur[0]; cur++; @@ -866,8 +866,8 @@ __device__ void gpuDecodeLevels(page_state_s* s, * @return The length of the section */ inline __device__ uint32_t InitLevelSection(page_state_s* s, - const uint8_t* cur, - const uint8_t* end, + uint8_t const* cur, + uint8_t const* end, level_type lvl) { int32_t len; @@ -1186,7 +1186,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, // RLE-packed dictionary indices, first byte indicates index length in bits if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) { // String dictionary: use index - s->dict_base = reinterpret_cast(s->col.str_dict_index); + s->dict_base = reinterpret_cast(s->col.str_dict_index); s->dict_size = s->col.page_info[0].num_input_values * sizeof(string_index_pair); } else { s->dict_base = From 98b345f6798b647f3004d6c467da83b2e449109e Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 21 Jun 2023 13:31:17 -0700 Subject: [PATCH 104/114] add new worst-case benchmark for strings --- cpp/benchmarks/common/generate_input.cu | 19 +++++++++ cpp/benchmarks/common/generate_input.hpp | 15 +++++++ .../io/parquet/parquet_reader_input.cpp | 41 +++++++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index c2901dc61ee..28710f2a745 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -785,6 +785,25 @@ std::vector cycle_dtypes(std::vector const& dtype_ return out_dtypes; } +/** + * @brief Repeat the given two data types with a given ratio of a:b. + * + * The first dtype will have 'first_num' columns and the second will have 'num_cols - first_num' + * columns. 
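+ *
+ * Illustrative example (an addition for clarity, not from the original patch):
+ * with dtype_ids = {type_id::STRING, type_id::INT32}, num_cols = 4 and
+ * first_num = 1, the result is {STRING, INT32, INT32, INT32}.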
+ */ +std::vector mix_dtypes(std::pair const& dtype_ids, + cudf::size_type num_cols, + int first_num) +{ + std::vector out_dtypes; + out_dtypes.reserve(num_cols); + for (cudf::size_type col = 0; col < first_num; ++col) + out_dtypes.push_back(dtype_ids.first); + for (cudf::size_type col = first_num; col < num_cols; ++col) + out_dtypes.push_back(dtype_ids.second); + return out_dtypes; +} + std::unique_ptr create_random_table(std::vector const& dtype_ids, table_size_bytes table_bytes, data_profile const& profile, diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index 8a5811218d0..a2efdb819bf 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -666,6 +666,21 @@ std::unique_ptr create_sequence_table( */ std::vector cycle_dtypes(std::vector const& dtype_ids, cudf::size_type num_cols); + +/** + * @brief Repeat the given two data types with a given ratio of a:b. + * + * The first dtype will have 'first_num' columns and the second will have 'num_cols - first_num' + * columns. + * + * @param dtype_ids Pair of requested column types + * @param num_cols Total number of columns in the output vector + * @param first_num Total number of columns of type `dtype_ids.first` + * @return A vector of type_ids + */ +std::vector mix_dtypes(std::pair const& dtype_ids, + cudf::size_type num_cols, + int first_num); /** * @brief Create a random null mask object * diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index a8d40492890..9d1026f80fc 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -114,6 +114,38 @@ void BM_parquet_read_io_compression( parquet_read_common(write_opts, source_sink, state); } +template +void BM_parquet_read_io_small_mixed(nvbench::state& state, + nvbench::type_list>) +{ + auto const d_type = + std::pair{cudf::type_id::STRING, cudf::type_id::INT32}; + + cudf::size_type const cardinality = state.get_int64("cardinality"); + cudf::size_type const run_length = state.get_int64("run_length"); + cudf::size_type const num_strings = state.get_int64("num_string_cols"); + auto const source_type = IOType; + + // want 80 pages total, across 4 columns, so 20 pages per column + cudf::size_type constexpr n_col = 4; + cudf::size_type constexpr page_size_rows = 10'000; + cudf::size_type constexpr num_rows = page_size_rows * (80 / n_col); + + auto const tbl = + create_random_table(mix_dtypes(d_type, n_col, num_strings), + row_count{num_rows}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cuio_source_sink_pair source_sink(source_type); + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .max_page_size_rows(10'000) + .compression(cudf::io::compression_type::NONE); + + parquet_read_common(write_opts, source_sink, state); +} + template void BM_parquet_read_chunks( nvbench::state& state, @@ -203,3 +235,12 @@ NVBENCH_BENCH_TYPES(BM_parquet_read_chunks, .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}) .add_int64_axis("byte_limit", {0, 500'000}); + +NVBENCH_BENCH_TYPES(BM_parquet_read_io_small_mixed, + NVBENCH_TYPE_AXES(nvbench::enum_type_list)) + .set_name("parquet_read_io_small_mixed") + .set_type_axes_names({"io"}) + .set_min_samples(4) + .add_int64_axis("cardinality", {0, 1000}) + 
.add_int64_axis("run_length", {1, 32}) + .add_int64_axis("num_string_cols", {1, 2, 3}); From 3de35544671e1989da721b3e3310ac7924fbdcea Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 21 Jun 2023 13:32:01 -0700 Subject: [PATCH 105/114] use stream pool for decode kernels --- cpp/src/io/parquet/reader_impl.cpp | 38 ++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 192b506a2f9..c110ec0cb48 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -17,11 +17,24 @@ #include "reader_impl.hpp" #include +#include #include namespace cudf::io::detail::parquet { +namespace { + +auto& get_stream_pool() +{ + // don't really need 16 here, but it's a reasonable limit we might see if we use a mechanism + // like this more generally. + static auto pool = rmm::cuda_stream_pool(1); + return pool; +} + +} // namespace + void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { auto& chunks = _file_itm_data.chunks; @@ -156,12 +169,27 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); - // TODO: explore launching these concurrently with a stream pool - gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); - if (has_strings) { - chunk_nested_str_data.host_to_device_async(_stream); - gpu::DecodeStringPageData( + // FIXME: leaving in single-stream version for testing. remove before merge. + if constexpr (true) { + auto stream1 = get_stream_pool().get_stream(); + gpu::DecodePageData( + pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream1); + if (has_strings) { + auto stream2 = get_stream_pool().get_stream(); + chunk_nested_str_data.host_to_device_async(stream2); + gpu::DecodeStringPageData( + pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream2); + stream2.synchronize(); + } + stream1.synchronize(); + } else { + gpu::DecodePageData( pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); + if (has_strings) { + chunk_nested_str_data.host_to_device_async(_stream); + gpu::DecodeStringPageData( + pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); + } } pages.device_to_host_async(_stream); From a4548e747b3a3624c44dfea33f9b019a928167df Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 21 Jun 2023 16:10:11 -0700 Subject: [PATCH 106/114] move stream pool to impl object --- cpp/src/io/parquet/reader_impl.cpp | 50 ++++++++++-------------------- cpp/src/io/parquet/reader_impl.hpp | 4 +++ 2 files changed, 20 insertions(+), 34 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index c110ec0cb48..b8cabf0b296 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -17,23 +17,13 @@ #include "reader_impl.hpp" #include -#include #include namespace cudf::io::detail::parquet { -namespace { - -auto& get_stream_pool() -{ - // don't really need 16 here, but it's a reasonable limit we might see if we use a mechanism - // like this more generally. 
- static auto pool = rmm::cuda_stream_pool(1); - return pool; -} - -} // namespace +// how many page decode kernels are there +int constexpr NUM_DECODERS = 2; void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { @@ -169,28 +159,16 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); - // FIXME: leaving in single-stream version for testing. remove before merge. - if constexpr (true) { - auto stream1 = get_stream_pool().get_stream(); - gpu::DecodePageData( - pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream1); - if (has_strings) { - auto stream2 = get_stream_pool().get_stream(); - chunk_nested_str_data.host_to_device_async(stream2); - gpu::DecodeStringPageData( - pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream2); - stream2.synchronize(); - } - stream1.synchronize(); - } else { - gpu::DecodePageData( - pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); - if (has_strings) { - chunk_nested_str_data.host_to_device_async(_stream); - gpu::DecodeStringPageData( - pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); - } + auto stream1 = _stream_pool.get_stream(); + gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream1); + if (has_strings) { + auto stream2 = _stream_pool.get_stream(); + chunk_nested_str_data.host_to_device_async(stream2); + gpu::DecodeStringPageData( + pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream2); + stream2.synchronize(); } + stream1.synchronize(); pages.device_to_host_async(_stream); page_nesting.device_to_host_async(_stream); @@ -280,7 +258,11 @@ reader::impl::impl(std::size_t chunk_read_limit, parquet_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : _stream{stream}, _mr{mr}, _sources{std::move(sources)}, _chunk_read_limit{chunk_read_limit} + : _stream{stream}, + _mr{mr}, + _sources{std::move(sources)}, + _chunk_read_limit{chunk_read_limit}, + _stream_pool(NUM_DECODERS) { // Open and parse the source dataset metadata _metadata = std::make_unique(_sources); diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index d25bf1e4c1f..c92a142a631 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -268,6 +269,9 @@ class reader::impl { std::size_t _chunk_read_limit{0}; std::size_t _current_read_chunk{0}; bool _file_preprocessed{false}; + + // stream pool for page decoding + rmm::cuda_stream_pool _stream_pool; }; } // namespace cudf::io::detail::parquet From 47cd9e143a2f1a5279d12e6d5e6f98c698f7b7b1 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 22 Jun 2023 08:56:14 -0700 Subject: [PATCH 107/114] filter on data types in setupLocalPageInfo --- cpp/src/io/parquet/page_data.cu | 15 ++++++---- cpp/src/io/parquet/page_decode.cuh | 35 ++++++++++++++++++++++-- cpp/src/io/parquet/page_string_decode.cu | 28 +++++++++++-------- 3 files changed, 59 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 8f5f403afaa..af76b064cd0 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -592,7 +592,7 @@ __global__ void __launch_bounds__(preprocess_block_size) rle_stream decoders[level_type::NUM_LEVEL_TYPES] = 
{{def_runs}, {rep_runs}}; // setup page info - if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, false)) { return; } + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, all_types_filter{}, false)) { return; } // initialize the stream decoders (requires values computed in setupLocalPageInfo) int const max_batch_size = lvl_buf_size; @@ -756,20 +756,23 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( __shared__ __align__(16) page_state_s state_g; __shared__ __align__(16) page_state_buffers_s state_buffers; - // string cols handled elsewhere - if (is_string_col(pages[blockIdx.x], chunks)) { return; } - page_state_s* const s = &state_g; page_state_buffers_s* const sb = &state_buffers; int page_idx = blockIdx.x; int t = threadIdx.x; int out_thread0; - [[maybe_unused]] null_count_back_copier _{s, t}; - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } + if (!setupLocalPageInfo( + s, &pages[page_idx], chunks, min_row, num_rows, non_string_filter{chunks}, true)) { + return; + } + // this needs to be declared after we've decided to process this page + [[maybe_unused]] null_count_back_copier _{s, t}; bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + // FIXME do this in setupLocalPageInfo + // // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. // // corner case: in the case of lists, we can have pages that contain "0" rows if the current row diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index f4d9cdd2df7..5533a8d2393 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -926,6 +926,31 @@ inline __device__ uint32_t InitLevelSection(page_state_s* s, return static_cast(len); } +/** + * @brief Functor for setupLocalPageInfo that always returns true. + */ +struct all_types_filter { + __device__ inline bool operator()(PageInfo const& page) { return true; } +}; + +/** + * @brief Functor for setupLocalPageInfo that returns true if this is not a string column. + */ +struct non_string_filter { + device_span chunks; + + __device__ inline bool operator()(PageInfo const& page) { return !is_string_col(page, chunks); } +}; + +/** + * @brief Functor for setupLocalPageInfo that returns true if this is a string column. + */ +struct string_filter { + device_span chunks; + + __device__ inline bool operator()(PageInfo const& page) { return is_string_col(page, chunks); } +}; + /** * @brief Sets up block-local page state information from the global pages. * @@ -934,15 +959,19 @@ inline __device__ uint32_t InitLevelSection(page_state_s* s, * @param[in] chunks The global list of chunks * @param[in] min_row Crop all rows below min_row * @param[in] num_rows Maximum number of rows to read + * @param[in] filter Filtering function used to decide which pages to operate on * @param[in] is_decode_step If we are setting up for the decode step (instead of the preprocess) * @param[in] decoders rle_stream decoders which will be used for decoding levels. Optional. 
- * Currently only used by gpuComputePageSizes step) + * @tparam Filter Function that takes a PageInfo reference and returns true if the given page should + * be operated on Currently only used by gpuComputePageSizes step) */ +template inline __device__ bool setupLocalPageInfo(page_state_s* const s, PageInfo const* p, device_span chunks, size_t min_row, size_t num_rows, + Filter filter, bool is_decode_step) { int t = threadIdx.x; @@ -955,7 +984,9 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } __syncthreads(); - if (s->page.flags & PAGEINFO_FLAGS_DICTIONARY) { return false; } + // return false if this is a dictionary page or it does not pass the filter condition + if ((s->page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0 || (!filter(s->page))) { return false; } + // Fetch column chunk info chunk_idx = s->page.chunk_idx; if (!t) { s->col = chunks[chunk_idx]; } diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 18038095c47..c8827cb3388 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -569,7 +569,9 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; // setup page info - if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, false)) { return; } + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, string_filter{chunks}, false)) { + return; + } if (!t) { s->page.num_nulls = 0; @@ -657,23 +659,26 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( __shared__ __align__(16) page_state_buffers_s state_buffers; __shared__ __align__(4) size_type last_offset; - // return if not a string column - if (not is_string_col(pages[blockIdx.x], chunks)) { return; } - page_state_s* const s = &state_g; page_state_buffers_s* const sb = &state_buffers; int const page_idx = blockIdx.x; int const t = threadIdx.x; - [[maybe_unused]] null_count_back_copier _{s, t}; - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } + if (!setupLocalPageInfo( + s, &pages[page_idx], chunks, min_row, num_rows, string_filter{chunks}, true)) { + return; + } + // this needs to be declared after we've decided to process this page + [[maybe_unused]] null_count_back_copier _{s, t}; bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; // offsets are local to the page if (t == 0) { last_offset = 0; } __syncthreads(); + // FIXME do this in setupLocalPageInfo + // // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. 
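The null_count_back_copier guard that these hunks reposition is an RAII write-back: however the kernel exits, the destructor flushes the block-local null count to the page. A simplified sketch of the idea, with stand-in types rather than the real cudf structs:

struct page_state {
  int null_count;          // accumulated by the thread block while decoding
  int* global_null_count;  // destination in the global PageInfo
};

// on scope exit, thread 0 writes the accumulated count back to global memory;
// declaring this before vs. after setupLocalPageInfo decides whether pages
// that exit early still get their counts flushed
struct null_count_back_copier {
  page_state* s;
  int t;
  __device__ ~null_count_back_copier()
  {
    if (t == 0 && s->global_null_count != nullptr) {
      *s->global_null_count = s->null_count;
    }
  }
};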
// // corner case: in the case of lists, we can have pages that contain "0" rows if the current row @@ -849,17 +854,18 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( __shared__ __align__(16) page_state_buffers_s state_buffers; __shared__ __align__(4) size_type last_offset; - // return if not a string column - if (not is_string_col(pages[blockIdx.x], chunks)) { return; } - page_state_s* const s = &state_g; page_state_buffers_s* const sb = &state_buffers; int const page_idx = blockIdx.x; int const t = threadIdx.x; - [[maybe_unused]] null_count_back_copier _{s, t}; - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } + if (!setupLocalPageInfo( + s, &pages[page_idx], chunks, min_row, num_rows, string_filter{chunks}, true)) { + return; + } + // this needs to be declared after we've decided to process this page + [[maybe_unused]] null_count_back_copier _{s, t}; bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; // offsets are local to the page From a1304c2762978fc41f71c8d0e4e4cf9e3d30224f Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 22 Jun 2023 09:14:44 -0700 Subject: [PATCH 108/114] remove experimental decode kernel --- cpp/src/io/parquet/page_string_decode.cu | 210 +---------------------- 1 file changed, 5 insertions(+), 205 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index c8827cb3388..6989b3dba6a 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -831,191 +831,6 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( block_excl_sum(offptr, value_count, s->page.str_offset); } -/** - * @brief Kernel for computing the string column data stored in the pages - * - * This function will write the page data and the page data's validity to the - * output specified in the page's column chunk. - * - * This version uses all threads in the block to do the string copies. - * - * @param pages List of pages - * @param chunks List of column chunks - * @param min_row Row index to start reading at - * @param num_rows Maximum number of rows to read - * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition levels - * @tparam level_t Type used to store decoded repetition and definition levels - */ -template -__global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageDataV2( - PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) -{ - __shared__ __align__(16) page_state_s state_g; - __shared__ __align__(16) page_state_buffers_s state_buffers; - __shared__ __align__(4) size_type last_offset; - - page_state_s* const s = &state_g; - page_state_buffers_s* const sb = &state_buffers; - int const page_idx = blockIdx.x; - int const t = threadIdx.x; - - if (!setupLocalPageInfo( - s, &pages[page_idx], chunks, min_row, num_rows, string_filter{chunks}, true)) { - return; - } - - // this needs to be declared after we've decided to process this page - [[maybe_unused]] null_count_back_copier _{s, t}; - bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; - - // offsets are local to the page - if (t == 0) { last_offset = 0; } - __syncthreads(); - - // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. 
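The filter functors threaded through these call sites let one templated setupLocalPageInfo serve every kernel: each thread block bails out early on pages that are not its business, replacing the per-kernel is_string_col checks at kernel entry. A minimal sketch of that dispatch shape, using a simplified PageInfo rather than the real type:

struct PageInfo { bool is_string; };

struct all_types_filter {
  __device__ bool operator()(PageInfo const&) const { return true; }
};

struct string_filter {
  __device__ bool operator()(PageInfo const& p) const { return p.is_string; }
};

// each decode kernel instantiates this with the filter matching its job
template <typename Filter>
__device__ bool setup_page(PageInfo const& page, Filter filter)
{
  if (!filter(page)) { return false; }  // whole thread block returns early
  // ... per-page state setup would follow ...
  return true;
}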
- // - // corner case: in the case of lists, we can have pages that contain "0" rows if the current row - // starts before this page and ends after this page: - // P0 P1 P2 - // |---------|---------|----------| - // ^------------------^ - // row start row end - // P1 will contain 0 rows - // - if (s->num_rows == 0 && - !(has_repetition && (is_bounds_page(s, min_row, num_rows, has_repetition) || - is_page_contained(s, min_row, num_rows)))) { - return; - } - - int out_thread0 = s->dict_base && s->dict_bits == 0 ? 32 : 64; - int const leaf_level_index = s->col.max_nesting_depth - 1; - PageNestingDecodeInfo* const nesting_info_base = s->nesting_info; - - __shared__ level_t rep[lvl_buf_size]; // circular buffer of repetition level values - __shared__ level_t def[lvl_buf_size]; // circular buffer of definition level values - - // skipped_leaf_values will always be 0 for flat hierarchies. - uint32_t skipped_leaf_values = s->page.skipped_leaf_values; - while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { - int src_pos = s->src_pos; - // target pos for level decoding - int target_pos = min(src_pos + decode_block_size, s->nz_count + decode_block_size); - - if (t < 32) { - // decode repetition and definition levels. - // - update validity vectors - // - updates offsets (for nested columns) - // - produces non-NULL value indices in s->nz_idx for subsequent decoding - gpuDecodeLevels(s, sb, target_pos, rep, def, t); - } else if (t < out_thread0) { - // skipped_leaf_values will always be 0 for flat hierarchies. - uint32_t src_target_pos = target_pos + skipped_leaf_values; - - // WARP1: Decode dictionary indices, booleans or string positions - if (s->dict_base) { - src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, t & 0x1f).first; - } else { - gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); - } - if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } - } - __syncthreads(); - - // target_pos for value decoding - target_pos = min(s->nz_count, target_pos); - - // Decode values - src_pos += t; - - // the position in the output column/buffer - int dst_pos = sb->nz_idx[rolling_index(src_pos)]; - - // for the flat hierarchy case we will be reading from the beginning of the value stream, - // regardless of the value of first_row. so adjust our destination offset accordingly. - // example: - // - user has passed skip_rows = 2, so our first_row to output is 2 - // - the row values we get from nz_idx will be - // 0, 1, 2, 3, 4 .... - // - by shifting these values by first_row, the sequence becomes - // -1, -2, 0, 1, 2 ... - // - so we will end up ignoring the first two input rows, and input rows 2..n will - // get written to the output starting at position 0. - // - if (!has_repetition) { dst_pos -= s->first_row; } - - // need to do this before we branch on src_pos/dst_pos so we don't deadlock - // choose a character parallel string copy when the average string is longer than a warp - using cudf::detail::warp_size; - auto const use_char_ll = - s->page.num_valids > 0 && (s->page.str_bytes / s->page.num_valids) >= warp_size; - - auto [ptr, len] = src_pos < target_pos && dst_pos >= 0 - ? 
gpuGetStringData(s, sb, src_pos + skipped_leaf_values) - : cuda::std::pair{nullptr, 0}; - - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - size_type offset; - block_scan(scan_storage).ExclusiveSum(len, offset); - - offset += last_offset; - - if (use_char_ll) { - __shared__ __align__(8) uint8_t const* pointers[decode_block_size]; - __shared__ __align__(4) size_type offsets[decode_block_size]; - __shared__ __align__(4) int dsts[decode_block_size]; - __shared__ __align__(4) int lengths[decode_block_size]; - - offsets[t] = offset; - pointers[t] = reinterpret_cast(ptr); - dsts[t] = dst_pos; - lengths[t] = len; - __syncthreads(); - - using cudf::detail::warp_size; - constexpr int nwarp = decode_block_size / warp_size; - int const warpid = t / warp_size; - int const lane_id = t % warp_size; - for (int ss = warpid; ss < decode_block_size && ss + s->src_pos < target_pos; ss += nwarp) { - if (dsts[ss] >= 0) { - auto offptr = - reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dsts[ss]; - *offptr = lengths[ss]; - auto str_ptr = nesting_info_base[leaf_level_index].string_out + offsets[ss]; - ll_strcpy(str_ptr, pointers[ss], lengths[ss], lane_id); - } - } - } else { - if (src_pos < target_pos && dst_pos >= 0) { - auto offptr = - reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dst_pos; - *offptr = len; - auto str_ptr = nesting_info_base[leaf_level_index].string_out + offset; - memcpy(str_ptr, ptr, len); - } - } - __syncthreads(); - - // last thread in block updates last_offset. - if (t == decode_block_size - 1) { - last_offset = offset + len; - *(volatile int32_t*)&s->src_pos += decode_block_size; - } - __syncthreads(); - } - - // now turn array of lengths into offsets - int value_count = nesting_info_base[leaf_level_index].value_count; - - // if no repetition we haven't calculated start/end bounds and instead just skipped - // values until we reach first_row. account for that here. - if (!has_repetition) { value_count -= s->first_row; } - - auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); - block_excl_sum(offptr, value_count, s->page.str_offset); -} - } // anonymous namespace /** @@ -1054,27 +869,12 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pa dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - // TODO gpuDecodeStringPageDataV2 (needs a better name) is an alternative approach that uses - // all threads in the thread block to do the string copies (rather than the original approach - // which uses a single warp). It is faster in some cases, and slower in others. It's being left - // in but unused because it will be the likely only implementation once the dictionary decoding - // is modified to use more than a single warp. 
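The block-wide copy strategy that comment describes boils down to a cub exclusive scan over per-thread string lengths: once every thread knows its output offset, all threads own disjoint slices of the output buffer. A rough sketch of the core step, under the assumptions of a single block and at most block_size strings (the removed kernel loops over batches and adds a warp-cooperative copy for long strings):

#include <cub/cub.cuh>

constexpr int block_size = 128;

__global__ void scatter_strings(char const* const* src,
                                int const* len,
                                char* out_chars,
                                int* out_offsets,
                                int num_strings)
{
  using block_scan = cub::BlockScan<int, block_size>;
  __shared__ typename block_scan::TempStorage scan_storage;

  int const t      = threadIdx.x;
  int const my_len = t < num_strings ? len[t] : 0;

  // block-wide exclusive sum: each thread learns where its string starts
  int offset;
  block_scan(scan_storage).ExclusiveSum(my_len, offset);

  if (t < num_strings) {
    out_offsets[t] = offset;
    // naive per-thread copy; the removed kernel switched to a
    // character-parallel copy when the average string exceeded a warp
    for (int i = 0; i < my_len; ++i) { out_chars[offset + i] = src[t][i]; }
  }
}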
- if constexpr (true) { - if (level_type_size == 1) { - gpuDecodeStringPageData - <<>>(pages.device_ptr(), chunks, min_row, num_rows); - } else { - gpuDecodeStringPageData - <<>>(pages.device_ptr(), chunks, min_row, num_rows); - } + if (level_type_size == 1) { + gpuDecodeStringPageData + <<>>(pages.device_ptr(), chunks, min_row, num_rows); } else { - if (level_type_size == 1) { - gpuDecodeStringPageDataV2 - <<>>(pages.device_ptr(), chunks, min_row, num_rows); - } else { - gpuDecodeStringPageDataV2 - <<>>(pages.device_ptr(), chunks, min_row, num_rows); - } + gpuDecodeStringPageData + <<>>(pages.device_ptr(), chunks, min_row, num_rows); } } From ce2acbe1c1d7b61327c2dbcbb4f263562ff760a7 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 22 Jun 2023 09:33:14 -0700 Subject: [PATCH 109/114] Revert "move stream pool to impl object" This reverts commit a4548e747b3a3624c44dfea33f9b019a928167df. --- cpp/src/io/parquet/reader_impl.cpp | 50 ++++++++++++++++++++---------- cpp/src/io/parquet/reader_impl.hpp | 4 --- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index b8cabf0b296..c110ec0cb48 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -17,13 +17,23 @@ #include "reader_impl.hpp" #include +#include #include namespace cudf::io::detail::parquet { -// how many page decode kernels are there -int constexpr NUM_DECODERS = 2; +namespace { + +auto& get_stream_pool() +{ + // don't really need 16 here, but it's a reasonable limit we might see if we use a mechanism + // like this more generally. + static auto pool = rmm::cuda_stream_pool(1); + return pool; +} + +} // namespace void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { @@ -159,16 +169,28 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); - auto stream1 = _stream_pool.get_stream(); - gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream1); - if (has_strings) { - auto stream2 = _stream_pool.get_stream(); - chunk_nested_str_data.host_to_device_async(stream2); - gpu::DecodeStringPageData( - pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream2); - stream2.synchronize(); + // FIXME: leaving in single-stream version for testing. remove before merge. 
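The if constexpr (true) kept below is a compile-time switch: only the taken branch is instantiated, so the single-stream fallback costs nothing at runtime and can be revived by flipping one token. A trivial sketch of the idiom:

#include <cstdio>

constexpr bool use_stream_pool = true;

void decode()
{
  if constexpr (use_stream_pool) {
    std::puts("multi-stream path");   // only this branch is compiled in
  } else {
    std::puts("single-stream path");  // discarded at compile time
  }
}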
+ if constexpr (true) { + auto stream1 = get_stream_pool().get_stream(); + gpu::DecodePageData( + pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream1); + if (has_strings) { + auto stream2 = get_stream_pool().get_stream(); + chunk_nested_str_data.host_to_device_async(stream2); + gpu::DecodeStringPageData( + pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream2); + stream2.synchronize(); + } + stream1.synchronize(); + } else { + gpu::DecodePageData( + pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); + if (has_strings) { + chunk_nested_str_data.host_to_device_async(_stream); + gpu::DecodeStringPageData( + pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); + } } - stream1.synchronize(); pages.device_to_host_async(_stream); page_nesting.device_to_host_async(_stream); @@ -258,11 +280,7 @@ reader::impl::impl(std::size_t chunk_read_limit, parquet_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : _stream{stream}, - _mr{mr}, - _sources{std::move(sources)}, - _chunk_read_limit{chunk_read_limit}, - _stream_pool(NUM_DECODERS) + : _stream{stream}, _mr{mr}, _sources{std::move(sources)}, _chunk_read_limit{chunk_read_limit} { // Open and parse the source dataset metadata _metadata = std::make_unique(_sources); diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index c92a142a631..d25bf1e4c1f 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -28,7 +28,6 @@ #include #include -#include #include #include @@ -269,9 +268,6 @@ class reader::impl { std::size_t _chunk_read_limit{0}; std::size_t _current_read_chunk{0}; bool _file_preprocessed{false}; - - // stream pool for page decoding - rmm::cuda_stream_pool _stream_pool; }; } // namespace cudf::io::detail::parquet From 8653b933243addf12a5cdec840e208a35689a0ee Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 22 Jun 2023 09:39:14 -0700 Subject: [PATCH 110/114] finish moving back to static stream pool --- cpp/src/io/parquet/reader_impl.cpp | 38 +++++++++++------------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index c110ec0cb48..c239f6cfcf3 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -25,11 +25,13 @@ namespace cudf::io::detail::parquet { namespace { +int constexpr NUM_DECODERS = 2; +int constexpr APPROX_NUM_THREADS = 4; // guestimate from DaveB +int constexpr STREAM_POOL_SIZE = NUM_DECODERS * APPROX_NUM_THREADS; + auto& get_stream_pool() { - // don't really need 16 here, but it's a reasonable limit we might see if we use a mechanism - // like this more generally. - static auto pool = rmm::cuda_stream_pool(1); + static auto pool = rmm::cuda_stream_pool(STREAM_POOL_SIZE); return pool; } @@ -169,28 +171,16 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); - // FIXME: leaving in single-stream version for testing. remove before merge. 
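Patches 110 through 113 converge, in spirit, on the accessor sketched below: a function-local pool sized for the number of decode kernels times an estimate of concurrent host threads, heap-allocated on purpose so its destructor never runs during CUDA teardown. This is a condensed restatement of where the series lands, not new behavior:

#include <rmm/cuda_stream_pool.hpp>

int constexpr NUM_DECODERS       = 2;  // fixed-width and string decode kernels
int constexpr APPROX_NUM_THREADS = 4;  // rough estimate of concurrent host threads
int constexpr STREAM_POOL_SIZE   = NUM_DECODERS * APPROX_NUM_THREADS;

rmm::cuda_stream_pool& get_stream_pool()
{
  // deliberately leaked (patch 113): destroying CUDA streams during static
  // teardown, after the runtime has shut down, was observed to segfault
  // under nvbench
  static auto* pool = new rmm::cuda_stream_pool{STREAM_POOL_SIZE};
  return *pool;
}

Oversubscribing the pool is benign: rmm hands streams out round-robin, so requests beyond STREAM_POOL_SIZE simply reuse existing streams.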
- if constexpr (true) { - auto stream1 = get_stream_pool().get_stream(); - gpu::DecodePageData( - pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream1); - if (has_strings) { - auto stream2 = get_stream_pool().get_stream(); - chunk_nested_str_data.host_to_device_async(stream2); - gpu::DecodeStringPageData( - pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream2); - stream2.synchronize(); - } - stream1.synchronize(); - } else { - gpu::DecodePageData( - pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); - if (has_strings) { - chunk_nested_str_data.host_to_device_async(_stream); - gpu::DecodeStringPageData( - pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); - } + auto stream1 = get_stream_pool().get_stream(); + gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream1); + if (has_strings) { + auto stream2 = get_stream_pool().get_stream(); + chunk_nested_str_data.host_to_device_async(stream2); + gpu::DecodeStringPageData( + pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream2); + stream2.synchronize(); } + stream1.synchronize(); pages.device_to_host_async(_stream); page_nesting.device_to_host_async(_stream); From 6ee7b2905b368ea72f317ff742f20fc943225f64 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 22 Jun 2023 09:40:58 -0700 Subject: [PATCH 111/114] add comment for NUM_DECODERS --- cpp/src/io/parquet/reader_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index c239f6cfcf3..55238e189d2 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -25,7 +25,7 @@ namespace cudf::io::detail::parquet { namespace { -int constexpr NUM_DECODERS = 2; +int constexpr NUM_DECODERS = 2; // how many decode kernels are there to run int constexpr APPROX_NUM_THREADS = 4; // guestimate from DaveB int constexpr STREAM_POOL_SIZE = NUM_DECODERS * APPROX_NUM_THREADS; From a0db39c8191ffd3f031b7cfbcf31bd45401472e3 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 22 Jun 2023 10:39:23 -0700 Subject: [PATCH 112/114] call synch on _stream before launching decode kernels --- cpp/src/io/parquet/reader_impl.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 55238e189d2..e53bb7edb25 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -170,6 +170,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunks.host_to_device_async(_stream); chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); + _stream.synchronize(); auto stream1 = get_stream_pool().get_stream(); gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream1); From b3ebab5e1a9128ef8aa273ee3fcf2ad2145e4a79 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 22 Jun 2023 16:00:22 -0700 Subject: [PATCH 113/114] workaround for nvbench shutdown error --- cpp/src/io/parquet/reader_impl.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index e53bb7edb25..0237bf820b0 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -31,8 +31,12 @@ int constexpr STREAM_POOL_SIZE = NUM_DECODERS * APPROX_NUM_THREADS; auto& get_stream_pool() { - static 
auto pool = rmm::cuda_stream_pool(STREAM_POOL_SIZE); - return pool; + // TODO: creating this on the heap because there were issues with trying to call the + // stream pool destructor during cuda shutdown that lead to a segmentation fault in + // nvbench. this allocation is being deliberately leaked to avoid the above, but still + // results in non-fatal warnings when running nvbench in cuda-gdb. + static auto pool = new rmm::cuda_stream_pool{STREAM_POOL_SIZE}; + return *pool; } } // namespace From 5b3d070beaa4d61f406db8c0bca5a5c392c71ab9 Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 23 Jun 2023 10:10:00 -0700 Subject: [PATCH 114/114] move page bounds check into setupLocalPageInfo --- cpp/src/io/parquet/page_data.cu | 21 +------ cpp/src/io/parquet/page_decode.cuh | 75 ++++++++++++++---------- cpp/src/io/parquet/page_string_decode.cu | 21 +------ 3 files changed, 46 insertions(+), 71 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index af76b064cd0..e49378485fc 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -761,34 +761,15 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( int page_idx = blockIdx.x; int t = threadIdx.x; int out_thread0; + [[maybe_unused]] null_count_back_copier _{s, t}; if (!setupLocalPageInfo( s, &pages[page_idx], chunks, min_row, num_rows, non_string_filter{chunks}, true)) { return; } - // this needs to be declared after we've decided to process this page - [[maybe_unused]] null_count_back_copier _{s, t}; bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; - // FIXME do this in setupLocalPageInfo - // - // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. - // - // corner case: in the case of lists, we can have pages that contain "0" rows if the current row - // starts before this page and ends after this page: - // P0 P1 P2 - // |---------|---------|----------| - // ^------------------^ - // row start row end - // P1 will contain 0 rows - // - if (s->num_rows == 0 && - !(has_repetition && (is_bounds_page(s, min_row, num_rows, has_repetition) || - is_page_contained(s, min_row, num_rows)))) { - return; - } - if (s->dict_base) { out_thread0 = (s->dict_bits > 0) ? 
64 : 32; } else { diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 5533a8d2393..4469ec59b7a 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -975,21 +975,22 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, bool is_decode_step) { int t = threadIdx.x; - int chunk_idx; // Fetch page info if (!t) { s->page = *p; s->nesting_info = nullptr; + s->col = chunks[s->page.chunk_idx]; } __syncthreads(); // return false if this is a dictionary page or it does not pass the filter condition - if ((s->page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0 || (!filter(s->page))) { return false; } + if ((s->page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0 || !filter(s->page)) { return false; } - // Fetch column chunk info - chunk_idx = s->page.chunk_idx; - if (!t) { s->col = chunks[chunk_idx]; } + // our starting row (absolute index) is + // col.start_row == absolute row index + // page.chunk-row == relative row index within the chunk + size_t const page_start_row = s->col.start_row + s->page.chunk_row; // if we can use the nesting decode cache, set it up now auto const can_use_decode_cache = s->page.nesting_info_size <= max_cacheable_nesting_decode_info; @@ -1011,8 +1012,28 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, depth += blockDim.x; } } + if (!t) { s->nesting_info = can_use_decode_cache ? s->nesting_decode_cache : s->page.nesting_decode; + + // NOTE: s->page.num_rows, s->col.chunk_row, s->first_row and s->num_rows will be + // invalid/bogus during first pass of the preprocess step for nested types. this is ok + // because we ignore these values in that stage. + auto const max_row = min_row + num_rows; + + // if we are totally outside the range of the input, do nothing + if ((page_start_row > max_row) || (page_start_row + s->page.num_rows < min_row)) { + s->first_row = 0; + s->num_rows = 0; + } + // otherwise + else { + s->first_row = page_start_row >= min_row ? 0 : min_row - page_start_row; + auto const max_page_rows = s->page.num_rows - s->first_row; + s->num_rows = (page_start_row + s->first_row) + max_page_rows <= max_row + ? max_page_rows + : max_row - (page_start_row + s->first_row); + } } __syncthreads(); @@ -1030,14 +1051,27 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } __syncthreads(); + // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. + // + // corner case: in the case of lists, we can have pages that contain "0" rows if the current row + // starts before this page and ends after this page: + // P0 P1 P2 + // |---------|---------|----------| + // ^------------------^ + // row start row end + // P1 will contain 0 rows + // + // NOTE: this check needs to be done after the null counts have been zeroed out + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + if (is_decode_step && s->num_rows == 0 && + !(has_repetition && (is_bounds_page(s, min_row, num_rows, has_repetition) || + is_page_contained(s, min_row, num_rows)))) { + return false; + } + if (!t) { s->error = 0; - // our starting row (absolute index) is - // col.start_row == absolute row index - // page.chunk-row == relative row index within the chunk - size_t page_start_row = s->col.start_row + s->page.chunk_row; - // IMPORTANT : nested schemas can have 0 rows in a page but still have // values. 
The case is: // - On page N-1, the last row starts, with 2/6 values encoded @@ -1126,27 +1160,6 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->dtype_len = 8; // Convert to 64-bit timestamp } - // NOTE: s->page.num_rows, s->col.chunk_row, s->first_row and s->num_rows will be - // invalid/bogus during first pass of the preprocess step for nested types. this is ok - // because we ignore these values in that stage. - { - auto const max_row = min_row + num_rows; - - // if we are totally outside the range of the input, do nothing - if ((page_start_row > max_row) || (page_start_row + s->page.num_rows < min_row)) { - s->first_row = 0; - s->num_rows = 0; - } - // otherwise - else { - s->first_row = page_start_row >= min_row ? 0 : min_row - page_start_row; - auto const max_page_rows = s->page.num_rows - s->first_row; - s->num_rows = (page_start_row + s->first_row) + max_page_rows <= max_row - ? max_page_rows - : max_row - (page_start_row + s->first_row); - } - } - // during the decoding step we need to offset the global output buffers // for each level of nesting so that we write to the section this page // is responsible for. diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 6989b3dba6a..9173d408192 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -663,38 +663,19 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( page_state_buffers_s* const sb = &state_buffers; int const page_idx = blockIdx.x; int const t = threadIdx.x; + [[maybe_unused]] null_count_back_copier _{s, t}; if (!setupLocalPageInfo( s, &pages[page_idx], chunks, min_row, num_rows, string_filter{chunks}, true)) { return; } - // this needs to be declared after we've decided to process this page - [[maybe_unused]] null_count_back_copier _{s, t}; bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; // offsets are local to the page if (t == 0) { last_offset = 0; } __syncthreads(); - // FIXME do this in setupLocalPageInfo - // - // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. - // - // corner case: in the case of lists, we can have pages that contain "0" rows if the current row - // starts before this page and ends after this page: - // P0 P1 P2 - // |---------|---------|----------| - // ^------------------^ - // row start row end - // P1 will contain 0 rows - // - if (s->num_rows == 0 && - !(has_repetition && (is_bounds_page(s, min_row, num_rows, has_repetition) || - is_page_contained(s, min_row, num_rows)))) { - return; - } - int const out_thread0 = s->dict_base && s->dict_bits == 0 ? 32 : 64; int const leaf_level_index = s->col.max_nesting_depth - 1; PageNestingDecodeInfo* const nesting_info_base = s->nesting_info;
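To summarize the arithmetic that patch 114 hoists into setupLocalPageInfo: the requested absolute window [min_row, min_row + num_rows) is intersected with this page's rows to produce first_row and a clamped row count. A host-side restatement of that logic with simplified names, omitting the list/bounds-page handling:

#include <cstddef>

struct trim_result { std::size_t first_row; std::size_t num_rows; };

trim_result trim_page_rows(std::size_t page_start_row, std::size_t page_num_rows,
                           std::size_t min_row, std::size_t num_rows)
{
  std::size_t const max_row = min_row + num_rows;
  // page entirely outside the requested window: nothing to decode
  if (page_start_row > max_row || page_start_row + page_num_rows < min_row) {
    return {0, 0};
  }
  // rows to skip at the front of the page
  std::size_t const first = page_start_row >= min_row ? 0 : min_row - page_start_row;
  std::size_t const max_page_rows = page_num_rows - first;
  // clamp the tail against the end of the window
  std::size_t const out_rows = (page_start_row + first) + max_page_rows <= max_row
                                 ? max_page_rows
                                 : max_row - (page_start_row + first);
  return {first, out_rows};
}

For example, a page holding rows 100..149 intersected with the window [120, 130) yields {first_row = 20, num_rows = 10}.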