Adding support for list<int8> columns to be written as byte arrays in parquet #11328

Merged
Changes from 63 commits

64 commits
fcc3939
adding byte array view structure
hyperbolic2346 Jul 21, 2022
72f50c5
updating from review comments on another PR
hyperbolic2346 Jul 21, 2022
98bc6cf
adding some nodiscards
hyperbolic2346 Jul 21, 2022
5b6dbee
Adding byte array statistics support for parquet
hyperbolic2346 Jul 18, 2022
b99fa32
byte_array_view added and plumbed through
hyperbolic2346 Jul 19, 2022
11d9b48
updating from review comments
hyperbolic2346 Jul 21, 2022
11e8a26
Adding binary read/write as options for parquet
hyperbolic2346 Jun 28, 2022
5c4a61f
fixing java API
hyperbolic2346 Jun 28, 2022
2702803
Co-authored-by: MithunR <[email protected]>
hyperbolic2346 Jul 13, 2022
ce5bdef
change wording to indicate unicode vs ascii strings
hyperbolic2346 Jul 13, 2022
95dfdeb
Adding byte array statistics support for parquet
hyperbolic2346 Jul 18, 2022
68d0727
byte_array_view added and plumbed through
hyperbolic2346 Jul 19, 2022
702b71f
list<int8> first steps
hyperbolic2346 Jul 20, 2022
7fddbe0
list<int> changes
hyperbolic2346 Jul 21, 2022
d038b91
Fixing issue with list of strings
hyperbolic2346 Jul 21, 2022
c9cc6f2
fixing writing of list<int8> cols. No dictionary support yet. Only su…
hyperbolic2346 Jul 21, 2022
7e8d254
allow nested lists of bytes fixing statistics
hyperbolic2346 Jul 22, 2022
2038eb0
adding min and max and >= along with <= comparisons
hyperbolic2346 Jul 22, 2022
0608d7b
updating string writing code
hyperbolic2346 Jul 23, 2022
ae2a07f
updating from comments
hyperbolic2346 Jul 23, 2022
abe1bb3
updates from changes and comments
hyperbolic2346 Jul 23, 2022
e33a830
Merge branch 'mwilson/byte_array_view' into mwilson/parquet_binary_st…
hyperbolic2346 Jul 23, 2022
ddfdeb8
reverting back to uint8_t
hyperbolic2346 Jul 23, 2022
36e1bdd
Merge branch 'mwilson/byte_array_view' into mwilson/parquet_binary_st…
hyperbolic2346 Jul 23, 2022
f90a769
fixing some merge issues and some cleanup
hyperbolic2346 Jul 23, 2022
6b18f1b
Merge remote-tracking branch 'upstream/branch-22.08' into mwilson/byt…
hyperbolic2346 Jul 26, 2022
0b7ce44
Merge remote-tracking branch 'upstream/branch-22.08' into mwilson/par…
hyperbolic2346 Jul 26, 2022
fd21b6e
Merge branch 'mwilson/parquet_binary_statistics' into mwilson/parquet…
hyperbolic2346 Jul 26, 2022
43f8c75
Merge branch 'mwilson/parquet_writer_binary' into mwilson/parquet_lis…
hyperbolic2346 Jul 26, 2022
bea2d5a
fixing merge issue
hyperbolic2346 Jul 26, 2022
12855bf
Fixing orc(which passes list_views to statistics) and a merge issue
hyperbolic2346 Jul 27, 2022
88f6f38
moving to statistics namespace
hyperbolic2346 Jul 27, 2022
e94f487
Merge branch 'mwilson/byte_array_view' into mwilson/parquet_binary_st…
hyperbolic2346 Jul 27, 2022
7e4b849
updating namespace for byte_array_view
hyperbolic2346 Jul 27, 2022
85d9a4e
Merge branch 'mwilson/parquet_binary_statistics' into mwilson/parquet…
hyperbolic2346 Jul 27, 2022
bafd3a4
updating from review comments
hyperbolic2346 Jul 27, 2022
8131616
removing call for now. Raza is working on this side
hyperbolic2346 Jul 28, 2022
01818d6
removing until Raza can work on the plugin side
hyperbolic2346 Jul 28, 2022
25ef1e2
Merge branch 'mwilson/parquet_binary_statistics' into mwilson/parquet…
hyperbolic2346 Jul 28, 2022
0024aea
small change requested in review
hyperbolic2346 Jul 28, 2022
34f98db
Merge branch 'mwilson/parquet_writer_binary' into mwilson/parquet_lis…
hyperbolic2346 Jul 28, 2022
532091c
Merge remote-tracking branch 'upstream/branch-22.08' into mwilson/par…
hyperbolic2346 Jul 28, 2022
3adf5b9
Merge branch 'mwilson/parquet_writer_binary' into mwilson/parquet_lis…
hyperbolic2346 Jul 28, 2022
0fd0157
fixing merge issues and updating from review comments
hyperbolic2346 Jul 28, 2022
41b443f
merge fixes and some style changes
hyperbolic2346 Jul 28, 2022
49eee2b
some cleanup
hyperbolic2346 Jul 28, 2022
776342e
Update cpp/include/cudf/io/parquet.hpp
hyperbolic2346 Jul 28, 2022
352078c
updates from review comments
hyperbolic2346 Jul 28, 2022
51dc48c
updating from review comments
hyperbolic2346 Jul 28, 2022
9bef2c5
Merge branch 'mwilson/parquet_writer_binary' into mwilson/parquet_lis…
hyperbolic2346 Jul 28, 2022
70918c0
Delete row_conversion.hpp
hyperbolic2346 Jul 28, 2022
fe52548
Delete row_conversion.hpp
hyperbolic2346 Jul 28, 2022
449d75f
adding unicode test
hyperbolic2346 Jul 28, 2022
c10f68f
Update cpp/src/io/parquet/page_enc.cu
hyperbolic2346 Jul 28, 2022
c45b454
Merge branch 'mwilson/parquet_writer_binary' of github.com:hyperbolic…
hyperbolic2346 Jul 28, 2022
e12537f
Merge branch 'mwilson/parquet_writer_binary' into mwilson/parquet_lis…
hyperbolic2346 Jul 28, 2022
f008c6b
Update cpp/src/io/parquet/writer_impl.cu
hyperbolic2346 Jul 28, 2022
39b4d7b
Update cpp/src/io/parquet/writer_impl.cu
hyperbolic2346 Jul 28, 2022
78bd88d
Update cpp/src/io/parquet/chunk_dict.cu
hyperbolic2346 Jul 28, 2022
884a846
Updating from review comments. I think we may have snuck a bug in her…
hyperbolic2346 Jul 28, 2022
23c67f9
Apply suggestions from code review
hyperbolic2346 Jul 29, 2022
e3f95bf
Merge remote-tracking branch 'upstream/branch-22.08' into mwilson/par…
hyperbolic2346 Jul 29, 2022
d8c1599
fixing table view
hyperbolic2346 Jul 29, 2022
1f0d0f8
fixing merge issue that hid from me for far too long...
hyperbolic2346 Jul 29, 2022
6 changes: 5 additions & 1 deletion cpp/src/io/parquet/chunk_dict.cu
@@ -155,9 +155,13 @@ __global__ void __launch_bounds__(block_size)
if (col_type == type_id::STRING) {
// Strings are stored as 4 byte length + string bytes
return 4 + data_col.element<string_view>(val_idx).size_bytes();
} else if (col_type == type_id::LIST) {
// Binary is stored as 4 byte length + bytes
return 4 + get_element<statistics::byte_array_view>(data_col, val_idx).size_bytes();
}
CUDF_UNREACHABLE(
"Byte array only supports string column types for dictionary encoding!");
"Byte array only supports string and list<byte> column types for dictionary "
"encoding!");
}
case Type::FIXED_LEN_BYTE_ARRAY:
if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); }
49 changes: 41 additions & 8 deletions cpp/src/io/parquet/page_enc.cu
Expand Up @@ -18,6 +18,7 @@
#include <io/utilities/block_utils.cuh>

#include <cudf/detail/iterator.cuh>
#include <cudf/detail/utilities/assert.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>

@@ -166,10 +167,18 @@ __global__ void __launch_bounds__(block_size)
uint32_t len;
if (is_valid) {
len = dtype_len;
if (physical_type != BOOLEAN) {
if (physical_type == BYTE_ARRAY) {
auto str = s->col.leaf_column->element<string_view>(val_idx);
len += str.size_bytes();
if (physical_type == BYTE_ARRAY) {
switch (leaf_type) {
case type_id::STRING: {
auto str = s->col.leaf_column->element<string_view>(val_idx);
len += str.size_bytes();
} break;
case type_id::LIST: {
auto list_element =
get_element<statistics::byte_array_view>(*s->col.leaf_column, val_idx);
len += list_element.size_bytes();
} break;
default: CUDF_UNREACHABLE("Unsupported data type for leaf column");
}
}
} else {
@@ -973,7 +982,12 @@ __global__ void __launch_bounds__(128, 8)
if (is_valid) {
len = dtype_len_out;
if (physical_type == BYTE_ARRAY) {
len += s->col.leaf_column->element<string_view>(val_idx).size_bytes();
if (type_id == type_id::STRING) {
len += s->col.leaf_column->element<string_view>(val_idx).size_bytes();
} else if (s->col.output_as_byte_array && type_id == type_id::LIST) {
len +=
get_element<statistics::byte_array_view>(*s->col.leaf_column, val_idx).size_bytes();
}
}
} else {
len = 0;
@@ -1064,13 +1078,25 @@ __global__ void __launch_bounds__(128, 8)
memcpy(dst + pos, &v, 8);
} break;
case BYTE_ARRAY: {
auto str = s->col.leaf_column->element<string_view>(val_idx);
auto const bytes = [](cudf::type_id const type_id,
column_device_view const* leaf_column,
uint32_t const val_idx) -> void const* {
switch (type_id) {
case type_id::STRING:
return reinterpret_cast<void const*>(
leaf_column->element<string_view>(val_idx).data());
case type_id::LIST:
return reinterpret_cast<void const*>(
get_element<statistics::byte_array_view>(*(leaf_column), val_idx).data());
default: CUDF_UNREACHABLE("invalid type id for byte array writing!");
}
}(type_id, s->col.leaf_column, val_idx);
uint32_t v = len - 4; // string length
dst[pos + 0] = v;
dst[pos + 1] = v >> 8;
dst[pos + 2] = v >> 16;
dst[pos + 3] = v >> 24;
if (v != 0) memcpy(dst + pos + 4, str.data(), v);
if (v != 0) memcpy(dst + pos + 4, bytes, v);
} break;
case FIXED_LEN_BYTE_ARRAY: {
if (type_id == type_id::DECIMAL128) {
@@ -1822,6 +1848,7 @@ dremel_data get_dremel_data(column_view h_col,
// TODO(cp): use device_span once it is converted to a single hd_vec
rmm::device_uvector<uint8_t> const& d_nullability,
std::vector<uint8_t> const& nullability,
bool output_as_byte_array,
rmm::cuda_stream_view stream)
{
auto get_list_level = [](column_view col) {
@@ -1929,7 +1956,13 @@ dremel_data get_dremel_data(column_view h_col,
curr_col = curr_col.child(0);
}
if (curr_col.type().id() == type_id::LIST) {
curr_col = curr_col.child(lists_column_view::child_column_index);
auto child = curr_col.child(lists_column_view::child_column_index);
if ((child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8) &&
output_as_byte_array) {
// consider this the bottom
break;
}
curr_col = child;
if (not is_nested(curr_col.type())) {
// Special case: when the leaf data column is the immediate child of the list col then we
// want it to be included right away. Otherwise the struct containing it will be included in
24 changes: 14 additions & 10 deletions cpp/src/io/parquet/parquet_gpu.hpp
@@ -303,17 +303,20 @@ inline size_type __device__ row_to_value_idx(size_type idx,
{
// with a byte array, we can't go all the way down to the leaf node, but instead we want to leave
// the size at the parent level because we are writing out parent row byte arrays.
if (!parquet_col.output_as_byte_array) {
auto col = *parquet_col.parent_column;
while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) {
if (col.type().id() == type_id::STRUCT) {
idx += col.offset();
col = col.child(0);
} else {
auto list_col = cudf::detail::lists_column_device_view(col);
idx = list_col.offset_at(idx);
col = list_col.child();
auto col = *parquet_col.parent_column;
while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) {
if (col.type().id() == type_id::STRUCT) {
idx += col.offset();
col = col.child(0);
} else {
auto list_col = cudf::detail::lists_column_device_view(col);
auto child = list_col.child();
if (parquet_col.output_as_byte_array &&
(child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8)) {
break;
}
idx = list_col.offset_at(idx);
col = child;
}
}
return idx;
@@ -494,6 +497,7 @@ struct dremel_data {
dremel_data get_dremel_data(column_view h_col,
rmm::device_uvector<uint8_t> const& d_nullability,
std::vector<uint8_t> const& nullability,
bool output_as_byte_array,
rmm::cuda_stream_view stream);

/**
71 changes: 57 additions & 14 deletions cpp/src/io/parquet/writer_impl.cu
@@ -536,7 +536,41 @@ std::vector<schema_tree_node> construct_schema_tree(
}
};

if (col->type().id() == type_id::STRUCT) {
auto is_last_list_child = [](cudf::detail::LinkedColPtr col) {
if (col->type().id() != type_id::LIST) { return false; }
auto const child_col_type =
col->children[lists_column_view::child_column_index]->type().id();
return child_col_type == type_id::INT8 or child_col_type == type_id::UINT8;
};

// There is a special case for a list<int8> column with one byte column child. This column can
// have a special flag that indicates we write this out as binary instead of a list. This is a
// more efficient storage mechanism for a single-depth list of bytes, but is a departure from
// original cuIO behavior so it is locked behind the option. If the option is selected on a
// column that isn't a single-depth list<int8> the code will throw.
if (col_meta.is_enabled_output_as_binary() && is_last_list_child(col)) {
CUDF_EXPECTS(col_meta.num_children() == 2 or col_meta.num_children() == 0,
"Binary column's corresponding metadata should have zero or two children!");
if (col_meta.num_children() > 0) {
auto const data_col_type =
col->children[lists_column_view::child_column_index]->type().id();

CUDF_EXPECTS(col->children[lists_column_view::child_column_index]->children.size() == 0,
"Binary column must not be nested!");
}

schema_tree_node col_schema{};
col_schema.type = Type::BYTE_ARRAY;
col_schema.converted_type = ConvertedType::UNKNOWN;
col_schema.stats_dtype = statistics_dtype::dtype_byte_array;
col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED;
col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name();
col_schema.parent_idx = parent_idx;
col_schema.leaf_column = col;
set_field_id(col_schema, col_meta);
col_schema.output_as_byte_array = col_meta.is_enabled_output_as_binary();
schema.push_back(col_schema);
} else if (col->type().id() == type_id::STRUCT) {
// if struct, add current and recursively call for all children
schema_tree_node struct_schema{};
struct_schema.repetition_type =
@@ -814,11 +848,12 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node,
// size of the leaf column
// Calculate row offset into dremel data (repetition/definition values) and the respective
// definition and repetition levels
gpu::dremel_data dremel = gpu::get_dremel_data(cudf_col, _d_nullability, _nullability, stream);
_dremel_offsets = std::move(dremel.dremel_offsets);
_rep_level = std::move(dremel.rep_level);
_def_level = std::move(dremel.def_level);
_data_count = dremel.leaf_data_size; // Needed for knowing what size dictionary to allocate
gpu::dremel_data dremel = gpu::get_dremel_data(
cudf_col, _d_nullability, _nullability, schema_node.output_as_byte_array, stream);
_dremel_offsets = std::move(dremel.dremel_offsets);
_rep_level = std::move(dremel.rep_level);
_def_level = std::move(dremel.def_level);
_data_count = dremel.leaf_data_size; // Needed for knowing what size dictionary to allocate

stream.synchronize();
} else {
@@ -829,15 +864,21 @@

column_view parquet_column_view::leaf_column_view() const
{
auto col = cudf_col;
while (cudf::is_nested(col.type())) {
if (col.type().id() == type_id::LIST) {
col = col.child(lists_column_view::child_column_index);
} else if (col.type().id() == type_id::STRUCT) {
col = col.child(0); // Stored cudf_col has only one child if struct
if (!schema_node.output_as_byte_array) {
auto col = cudf_col;
while (cudf::is_nested(col.type())) {
if (col.type().id() == type_id::LIST) {
col = col.child(lists_column_view::child_column_index);
} else if (col.type().id() == type_id::STRUCT) {
col = col.child(0); // Stored cudf_col has only one child if struct
}
}
return col;
} else {
// TODO: investigate why the leaf node is computed twice instead of using the schema leaf node
// for everything
return *schema_node.leaf_column;
}
return col;
}

gpu::parquet_column_device_view parquet_column_view::get_device_view(
@@ -986,7 +1027,9 @@ auto build_chunk_dictionaries(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
std::vector<rmm::device_uvector<gpu::slot_type>> hash_maps_storage;
hash_maps_storage.reserve(h_chunks.size());
for (auto& chunk : h_chunks) {
if (col_desc[chunk.col_desc_id].physical_type == Type::BOOLEAN) {
if (col_desc[chunk.col_desc_id].physical_type == Type::BOOLEAN ||
(col_desc[chunk.col_desc_id].output_as_byte_array &&
col_desc[chunk.col_desc_id].physical_type == Type::BYTE_ARRAY)) {
chunk.use_dictionary = false;
} else {
chunk.use_dictionary = true;
15 changes: 10 additions & 5 deletions cpp/src/io/utilities/column_utils.cuh
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@

#pragma once

#include <io/statistics/statistics.cuh>

#include <cudf/column/column_device_view.cuh>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/table/table_device_view.cuh>
@@ -65,10 +67,13 @@ rmm::device_uvector<column_device_view> create_leaf_column_device_views(
size_type index) mutable {
col_desc[index].parent_column = parent_col_view.begin() + index;
column_device_view col = parent_col_view.column(index);
// traverse till leaf column
while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) {
col = (col.type().id() == type_id::LIST) ? col.child(lists_column_view::child_column_index)
: col.child(0);
if (col_desc[index].stats_dtype != dtype_byte_array) {
// traverse till leaf column
while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) {
col = (col.type().id() == type_id::LIST)
? col.child(lists_column_view::child_column_index)
: col.child(0);
}
}
// Store leaf_column to device storage
column_device_view* leaf_col_ptr = leaf_columns.begin() + index;