Removing int8 column option from parquet byte_array writing (rapidsai…

…#11539) As suggested in rapidsai#11526 and captured in issue rapidsai#11536 the usage of both INT8 and UINT8 as supported types for byte_arrays is unnecessary and adds complexity to the code. This change removes INT8 as an option and only allows UINT8 columns to be written out as byte_arrays. ~~This matches with cudf string columns which contain an INT8 column for data.~~ closes rapidsai#11536 Authors: - Mike Wilson (https://github.com/hyperbolic2346) Approvers: - Tobias Ribizel (https://github.com/upsj) - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - MithunR (https://github.com/mythrocks) - Bradley Dice (https://github.com/bdice) URL: rapidsai#11539
abellina · Oct 18, 2022 · 1effe19 · 1effe19
1 parent cea10ca
commit 1effe19
Show file tree

Hide file tree

Showing 10 changed files with 57 additions and 38 deletions.
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -321,10 +321,7 @@ inline size_type __device__ row_to_value_idx(size_type idx,
     } else {
       auto list_col = cudf::detail::lists_column_device_view(col);
       auto child    = list_col.child();
-      if (parquet_col.output_as_byte_array &&
-          (child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8)) {
-        break;
-      }
+      if (parquet_col.output_as_byte_array && child.type().id() == type_id::UINT8) { break; }
       idx = list_col.offset_at(idx);
       col = child;
     }

diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
@@ -511,7 +511,7 @@ std::vector<schema_tree_node> construct_schema_tree(
         if (col->type().id() != type_id::LIST) { return false; }
         auto const child_col_type =
           col->children[lists_column_view::child_column_index]->type().id();
-        return child_col_type == type_id::INT8 or child_col_type == type_id::UINT8;
+        return child_col_type == type_id::UINT8;
       };
 
       // There is a special case for a list<int8> column with one byte column child. This column can

diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
@@ -22,6 +22,7 @@
 #include "column_buffer.hpp"
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/types.hpp>
 
 namespace cudf {
 namespace io {
@@ -78,7 +79,19 @@ std::unique_ptr<column> make_column(column_buffer& buffer,
         // convert to binary
         auto const string_col = make_strings_column(*buffer._strings, stream, mr);
         auto const num_rows   = string_col->size();
-        auto col_contest      = string_col->release();
+        auto col_content      = string_col->release();
+
+        // convert to uint8 column, strings are currently stores as int8
+        auto contents =
+          col_content.children[strings_column_view::chars_column_index].release()->release();
+        auto data      = contents.data.release();
+        auto null_mask = contents.null_mask.release();
+
+        auto uint8_col = std::make_unique<column>(data_type{type_id::UINT8},
+                                                  data->size(),
+                                                  std::move(*data),
+                                                  std::move(*null_mask),
+                                                  UNKNOWN_NULL_COUNT);
 
         if (schema_info != nullptr) {
           schema_info->children.push_back(column_name_info{"offsets"});
@@ -87,10 +100,10 @@ std::unique_ptr<column> make_column(column_buffer& buffer,
 
         return make_lists_column(
           num_rows,
-          std::move(col_contest.children[strings_column_view::offsets_column_index]),
-          std::move(col_contest.children[strings_column_view::chars_column_index]),
+          std::move(col_content.children[strings_column_view::offsets_column_index]),
+          std::move(uint8_col),
           UNKNOWN_NULL_COUNT,
-          std::move(*col_contest.null_mask));
+          std::move(*col_content.null_mask));
       }
 
     case type_id::LIST: {

diff --git a/cpp/src/io/utilities/column_utils.cuh b/cpp/src/io/utilities/column_utils.cuh
@@ -64,7 +64,7 @@ rmm::device_uvector<column_device_view> create_leaf_column_device_views(
     iter,
     iter + parent_table_device_view.num_columns(),
     [col_desc, parent_col_view = parent_table_device_view, leaf_columns] __device__(
-      size_type index) mutable {
+      size_type index) {
       col_desc[index].parent_column = parent_col_view.begin() + index;
       column_device_view col        = parent_col_view.column(index);
       // traverse till leaf column
@@ -74,7 +74,7 @@ rmm::device_uvector<column_device_view> create_leaf_column_device_views(
                              : col.child(0);
         // stop early if writing a byte array
         if (col_desc[index].stats_dtype == dtype_byte_array &&
-            (child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8)) {
+            child.type().id() == type_id::UINT8) {
           break;
         }
         col = child;

diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu
@@ -192,8 +192,7 @@ dremel_data get_dremel_data(column_view h_col,
     }
     if (curr_col.type().id() == type_id::LIST) {
       auto child = curr_col.child(lists_column_view::child_column_index);
-      if ((child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8) &&
-          output_as_byte_array) {
+      if (output_as_byte_array && child.type().id() == type_id::UINT8) {
         // consider this the bottom
         break;
       }

diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu
@@ -101,11 +101,21 @@ std::unique_ptr<cudf::column> byte_list_conversion::operator()<string_view>(
   auto strings_count = input_strings.size();
   if (strings_count == 0) return cudf::empty_like(input_column);
 
-  auto contents = std::make_unique<column>(input_column, stream, mr)->release();
+  auto col_content = std::make_unique<column>(input_column, stream, mr)->release();
+  auto contents =
+    col_content.children[strings_column_view::chars_column_index].release()->release();
+  auto data      = contents.data.release();
+  auto null_mask = contents.null_mask.release();
+  auto uint8_col = std::make_unique<column>(data_type{type_id::UINT8},
+                                            data->size(),
+                                            std::move(*data),
+                                            std::move(*null_mask),
+                                            UNKNOWN_NULL_COUNT);
+
   return make_lists_column(
     input_column.size(),
-    std::move(contents.children[cudf::strings_column_view::offsets_column_index]),
-    std::move(contents.children[cudf::strings_column_view::chars_column_index]),
+    std::move(col_content.children[cudf::strings_column_view::offsets_column_index]),
+    std::move(uint8_col),
     input_column.null_count(),
     detail::copy_bitmask(input_column, stream, mr),
     stream,

diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
@@ -718,17 +718,17 @@ TEST_F(ParquetWriterTest, StringsAsBinary)
   column_wrapper<cudf::string_view> col0{ascii_strings.begin(), ascii_strings.end()};
   column_wrapper<cudf::string_view> col1{unicode_strings.begin(), unicode_strings.end()};
   column_wrapper<cudf::string_view> col2{ascii_strings.begin(), ascii_strings.end()};
-  cudf::test::lists_column_wrapper<int8_t> col3{{'M', 'o', 'n', 'd', 'a', 'y'},
-                                                {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'},
-                                                {'F', 'r', 'i', 'd', 'a', 'y'},
-                                                {'M', 'o', 'n', 'd', 'a', 'y'},
-                                                {'F', 'r', 'i', 'd', 'a', 'y'},
-                                                {'F', 'r', 'i', 'd', 'a', 'y'},
-                                                {'F', 'r', 'i', 'd', 'a', 'y'},
-                                                {'F', 'u', 'n', 'd', 'a', 'y'}};
-  cudf::test::lists_column_wrapper<int8_t> col4{
+  cudf::test::lists_column_wrapper<uint8_t> col3{{'M', 'o', 'n', 'd', 'a', 'y'},
+                                                 {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'},
+                                                 {'F', 'r', 'i', 'd', 'a', 'y'},
+                                                 {'M', 'o', 'n', 'd', 'a', 'y'},
+                                                 {'F', 'r', 'i', 'd', 'a', 'y'},
+                                                 {'F', 'r', 'i', 'd', 'a', 'y'},
+                                                 {'F', 'r', 'i', 'd', 'a', 'y'},
+                                                 {'F', 'u', 'n', 'd', 'a', 'y'}};
+  cudf::test::lists_column_wrapper<uint8_t> col4{
     {'M', 'o', 'n', 'd', 'a', 'y'},
-    {'W', -56, -123, 'd', 'n', -56, -123, 's', 'd', 'a', 'y'},
+    {'W', 200, 133, 'd', 'n', 200, 133, 's', 'd', 'a', 'y'},
     {'F', 'r', 'i', 'd', 'a', 'y'},
     {'M', 'o', 'n', 'd', 'a', 'y'},
     {'F', 'r', 'i', 'd', 'a', 'y'},
@@ -4459,13 +4459,13 @@ TEST_F(ParquetReaderTest, BinaryAsStrings)
 
   auto seq_col0 = random_values<int>(num_rows);
   auto seq_col2 = random_values<float>(num_rows);
-  auto seq_col3 = random_values<int8_t>(num_rows);
+  auto seq_col3 = random_values<uint8_t>(num_rows);
   auto validity = cudf::test::iterators::no_nulls();
 
   column_wrapper<int> int_col{seq_col0.begin(), seq_col0.end(), validity};
   column_wrapper<cudf::string_view> string_col{strings.begin(), strings.end()};
   column_wrapper<float> float_col{seq_col2.begin(), seq_col2.end(), validity};
-  cudf::test::lists_column_wrapper<int8_t> list_int_col{
+  cudf::test::lists_column_wrapper<uint8_t> list_int_col{
     {'M', 'o', 'n', 'd', 'a', 'y'},
     {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'},
     {'F', 'r', 'i', 'd', 'a', 'y'},
@@ -4526,12 +4526,12 @@ TEST_F(ParquetReaderTest, NestedByteArray)
 
   auto seq_col0       = random_values<int>(num_rows);
   auto seq_col2       = random_values<float>(num_rows);
-  auto seq_col3       = random_values<int8_t>(num_rows);
+  auto seq_col3       = random_values<uint8_t>(num_rows);
   auto const validity = cudf::test::iterators::no_nulls();
 
   column_wrapper<int> int_col{seq_col0.begin(), seq_col0.end(), validity};
   column_wrapper<float> float_col{seq_col2.begin(), seq_col2.end(), validity};
-  cudf::test::lists_column_wrapper<int8_t> list_list_int_col{
+  cudf::test::lists_column_wrapper<uint8_t> list_list_int_col{
     {{'M', 'o', 'n', 'd', 'a', 'y'},
      {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'},
      {'F', 'r', 'i', 'd', 'a', 'y'}},
@@ -4637,12 +4637,12 @@ TEST_F(ParquetReaderTest, StructByteArray)
 {
   constexpr auto num_rows = 100;
 
-  auto seq_col0       = random_values<int8_t>(num_rows);
+  auto seq_col0       = random_values<uint8_t>(num_rows);
   auto const validity = cudf::test::iterators::no_nulls();
 
-  column_wrapper<int8_t> int_col{seq_col0.begin(), seq_col0.end(), validity};
-  cudf::test::lists_column_wrapper<int8_t> list_of_int{{seq_col0.begin(), seq_col0.begin() + 50},
-                                                       {seq_col0.begin() + 50, seq_col0.end()}};
+  column_wrapper<uint8_t> int_col{seq_col0.begin(), seq_col0.end(), validity};
+  cudf::test::lists_column_wrapper<uint8_t> list_of_int{{seq_col0.begin(), seq_col0.begin() + 50},
+                                                        {seq_col0.begin() + 50, seq_col0.end()}};
   auto struct_col = cudf::test::structs_column_wrapper{{list_of_int}, validity};
 
   auto const expected = table_view{{struct_col}};

diff --git a/cpp/tests/reshape/byte_cast_tests.cpp b/cpp/tests/reshape/byte_cast_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -327,7 +327,7 @@ TEST_F(ByteCastTest, StringValues)
 {
   strings_column_wrapper const strings_col(
     {"", "The quick", " brown fox...", "!\"#$%&\'()*+,-./", "0123456789:;<=>?@", "[\\]^_`{|}~"});
-  lists_column_wrapper<int8_t> const strings_expected(
+  lists_column_wrapper<uint8_t> const strings_expected(
     {{},
      {0x54, 0x68, 0x65, 0x20, 0x71, 0x75, 0x69, 0x63, 0x6b},
      {0x20, 0x62, 0x72, 0x6f, 0x77, 0x6e, 0x20, 0x66, 0x6f, 0x78, 0x2e, 0x2e, 0x2e},

diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -3942,7 +3942,7 @@ void testCastStringToByteList() {
     "\\THE\t8\ud720", "tést strings", "", "éé");
         ColumnVector res = cv.asByteList(true);
         ColumnVector expected = ColumnVector.fromLists(new HostColumnVector.ListType(true,
-          new HostColumnVector.BasicType(true, DType.INT8)), list1, list2, list3, list4, list5,
+          new HostColumnVector.BasicType(true, DType.UINT8)), list1, list2, list3, list4, list5,
           list6, list7, list8)) {
       assertColumnsAreEqual(expected, res);
     }

diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -621,7 +621,7 @@ void testParquetWriteToBufferChunkedBinary() {
     List<Byte> bin2 = asList(string2);
 
     try (Table binTable = new Table.TestBuilder()
-        .column(new ListType(true, new BasicType(false, DType.INT8)),
+        .column(new ListType(true, new BasicType(false, DType.UINT8)),
             bin1, bin2)
         .build();
          Table stringTable = new Table.TestBuilder()