Skip to content

Commit

Permalink
Removing int8 column option from parquet byte_array writing (rapidsai…
Browse files Browse the repository at this point in the history
…#11539)

As suggested in rapidsai#11526 and captured in issue rapidsai#11536 the usage of both INT8 and UINT8 as supported types for byte_arrays is unnecessary and adds complexity to the code. This change removes INT8 as an option and only allows UINT8 columns to be written out as byte_arrays. ~~This matches with cudf string columns which contain an INT8 column for data.~~

closes rapidsai#11536

Authors:
  - Mike Wilson (https://github.com/hyperbolic2346)

Approvers:
  - Tobias Ribizel (https://github.com/upsj)
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)
  - MithunR (https://github.com/mythrocks)
  - Bradley Dice (https://github.com/bdice)

URL: rapidsai#11539
  • Loading branch information
hyperbolic2346 authored Oct 18, 2022
1 parent cea10ca commit 1effe19
Show file tree
Hide file tree
Showing 10 changed files with 57 additions and 38 deletions.
5 changes: 1 addition & 4 deletions cpp/src/io/parquet/parquet_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,10 +321,7 @@ inline size_type __device__ row_to_value_idx(size_type idx,
} else {
auto list_col = cudf::detail::lists_column_device_view(col);
auto child = list_col.child();
if (parquet_col.output_as_byte_array &&
(child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8)) {
break;
}
if (parquet_col.output_as_byte_array && child.type().id() == type_id::UINT8) { break; }
idx = list_col.offset_at(idx);
col = child;
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ std::vector<schema_tree_node> construct_schema_tree(
if (col->type().id() != type_id::LIST) { return false; }
auto const child_col_type =
col->children[lists_column_view::child_column_index]->type().id();
return child_col_type == type_id::INT8 or child_col_type == type_id::UINT8;
return child_col_type == type_id::UINT8;
};

// There is a special case for a list<int8> column with one byte column child. This column can
Expand Down
21 changes: 17 additions & 4 deletions cpp/src/io/utilities/column_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "column_buffer.hpp"
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/types.hpp>

namespace cudf {
namespace io {
Expand Down Expand Up @@ -78,7 +79,19 @@ std::unique_ptr<column> make_column(column_buffer& buffer,
// convert to binary
auto const string_col = make_strings_column(*buffer._strings, stream, mr);
auto const num_rows = string_col->size();
auto col_contest = string_col->release();
auto col_content = string_col->release();

// convert to uint8 column, strings are currently stores as int8
auto contents =
col_content.children[strings_column_view::chars_column_index].release()->release();
auto data = contents.data.release();
auto null_mask = contents.null_mask.release();

auto uint8_col = std::make_unique<column>(data_type{type_id::UINT8},
data->size(),
std::move(*data),
std::move(*null_mask),
UNKNOWN_NULL_COUNT);

if (schema_info != nullptr) {
schema_info->children.push_back(column_name_info{"offsets"});
Expand All @@ -87,10 +100,10 @@ std::unique_ptr<column> make_column(column_buffer& buffer,

return make_lists_column(
num_rows,
std::move(col_contest.children[strings_column_view::offsets_column_index]),
std::move(col_contest.children[strings_column_view::chars_column_index]),
std::move(col_content.children[strings_column_view::offsets_column_index]),
std::move(uint8_col),
UNKNOWN_NULL_COUNT,
std::move(*col_contest.null_mask));
std::move(*col_content.null_mask));
}

case type_id::LIST: {
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/utilities/column_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ rmm::device_uvector<column_device_view> create_leaf_column_device_views(
iter,
iter + parent_table_device_view.num_columns(),
[col_desc, parent_col_view = parent_table_device_view, leaf_columns] __device__(
size_type index) mutable {
size_type index) {
col_desc[index].parent_column = parent_col_view.begin() + index;
column_device_view col = parent_col_view.column(index);
// traverse till leaf column
Expand All @@ -74,7 +74,7 @@ rmm::device_uvector<column_device_view> create_leaf_column_device_views(
: col.child(0);
// stop early if writing a byte array
if (col_desc[index].stats_dtype == dtype_byte_array &&
(child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8)) {
child.type().id() == type_id::UINT8) {
break;
}
col = child;
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/lists/dremel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,7 @@ dremel_data get_dremel_data(column_view h_col,
}
if (curr_col.type().id() == type_id::LIST) {
auto child = curr_col.child(lists_column_view::child_column_index);
if ((child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8) &&
output_as_byte_array) {
if (output_as_byte_array && child.type().id() == type_id::UINT8) {
// consider this the bottom
break;
}
Expand Down
16 changes: 13 additions & 3 deletions cpp/src/reshape/byte_cast.cu
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,21 @@ std::unique_ptr<cudf::column> byte_list_conversion::operator()<string_view>(
auto strings_count = input_strings.size();
if (strings_count == 0) return cudf::empty_like(input_column);

auto contents = std::make_unique<column>(input_column, stream, mr)->release();
auto col_content = std::make_unique<column>(input_column, stream, mr)->release();
auto contents =
col_content.children[strings_column_view::chars_column_index].release()->release();
auto data = contents.data.release();
auto null_mask = contents.null_mask.release();
auto uint8_col = std::make_unique<column>(data_type{type_id::UINT8},
data->size(),
std::move(*data),
std::move(*null_mask),
UNKNOWN_NULL_COUNT);

return make_lists_column(
input_column.size(),
std::move(contents.children[cudf::strings_column_view::offsets_column_index]),
std::move(contents.children[cudf::strings_column_view::chars_column_index]),
std::move(col_content.children[cudf::strings_column_view::offsets_column_index]),
std::move(uint8_col),
input_column.null_count(),
detail::copy_bitmask(input_column, stream, mr),
stream,
Expand Down
36 changes: 18 additions & 18 deletions cpp/tests/io/parquet_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -718,17 +718,17 @@ TEST_F(ParquetWriterTest, StringsAsBinary)
column_wrapper<cudf::string_view> col0{ascii_strings.begin(), ascii_strings.end()};
column_wrapper<cudf::string_view> col1{unicode_strings.begin(), unicode_strings.end()};
column_wrapper<cudf::string_view> col2{ascii_strings.begin(), ascii_strings.end()};
cudf::test::lists_column_wrapper<int8_t> col3{{'M', 'o', 'n', 'd', 'a', 'y'},
{'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'},
{'F', 'r', 'i', 'd', 'a', 'y'},
{'M', 'o', 'n', 'd', 'a', 'y'},
{'F', 'r', 'i', 'd', 'a', 'y'},
{'F', 'r', 'i', 'd', 'a', 'y'},
{'F', 'r', 'i', 'd', 'a', 'y'},
{'F', 'u', 'n', 'd', 'a', 'y'}};
cudf::test::lists_column_wrapper<int8_t> col4{
cudf::test::lists_column_wrapper<uint8_t> col3{{'M', 'o', 'n', 'd', 'a', 'y'},
{'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'},
{'F', 'r', 'i', 'd', 'a', 'y'},
{'M', 'o', 'n', 'd', 'a', 'y'},
{'F', 'r', 'i', 'd', 'a', 'y'},
{'F', 'r', 'i', 'd', 'a', 'y'},
{'F', 'r', 'i', 'd', 'a', 'y'},
{'F', 'u', 'n', 'd', 'a', 'y'}};
cudf::test::lists_column_wrapper<uint8_t> col4{
{'M', 'o', 'n', 'd', 'a', 'y'},
{'W', -56, -123, 'd', 'n', -56, -123, 's', 'd', 'a', 'y'},
{'W', 200, 133, 'd', 'n', 200, 133, 's', 'd', 'a', 'y'},
{'F', 'r', 'i', 'd', 'a', 'y'},
{'M', 'o', 'n', 'd', 'a', 'y'},
{'F', 'r', 'i', 'd', 'a', 'y'},
Expand Down Expand Up @@ -4459,13 +4459,13 @@ TEST_F(ParquetReaderTest, BinaryAsStrings)

auto seq_col0 = random_values<int>(num_rows);
auto seq_col2 = random_values<float>(num_rows);
auto seq_col3 = random_values<int8_t>(num_rows);
auto seq_col3 = random_values<uint8_t>(num_rows);
auto validity = cudf::test::iterators::no_nulls();

column_wrapper<int> int_col{seq_col0.begin(), seq_col0.end(), validity};
column_wrapper<cudf::string_view> string_col{strings.begin(), strings.end()};
column_wrapper<float> float_col{seq_col2.begin(), seq_col2.end(), validity};
cudf::test::lists_column_wrapper<int8_t> list_int_col{
cudf::test::lists_column_wrapper<uint8_t> list_int_col{
{'M', 'o', 'n', 'd', 'a', 'y'},
{'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'},
{'F', 'r', 'i', 'd', 'a', 'y'},
Expand Down Expand Up @@ -4526,12 +4526,12 @@ TEST_F(ParquetReaderTest, NestedByteArray)

auto seq_col0 = random_values<int>(num_rows);
auto seq_col2 = random_values<float>(num_rows);
auto seq_col3 = random_values<int8_t>(num_rows);
auto seq_col3 = random_values<uint8_t>(num_rows);
auto const validity = cudf::test::iterators::no_nulls();

column_wrapper<int> int_col{seq_col0.begin(), seq_col0.end(), validity};
column_wrapper<float> float_col{seq_col2.begin(), seq_col2.end(), validity};
cudf::test::lists_column_wrapper<int8_t> list_list_int_col{
cudf::test::lists_column_wrapper<uint8_t> list_list_int_col{
{{'M', 'o', 'n', 'd', 'a', 'y'},
{'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'},
{'F', 'r', 'i', 'd', 'a', 'y'}},
Expand Down Expand Up @@ -4637,12 +4637,12 @@ TEST_F(ParquetReaderTest, StructByteArray)
{
constexpr auto num_rows = 100;

auto seq_col0 = random_values<int8_t>(num_rows);
auto seq_col0 = random_values<uint8_t>(num_rows);
auto const validity = cudf::test::iterators::no_nulls();

column_wrapper<int8_t> int_col{seq_col0.begin(), seq_col0.end(), validity};
cudf::test::lists_column_wrapper<int8_t> list_of_int{{seq_col0.begin(), seq_col0.begin() + 50},
{seq_col0.begin() + 50, seq_col0.end()}};
column_wrapper<uint8_t> int_col{seq_col0.begin(), seq_col0.end(), validity};
cudf::test::lists_column_wrapper<uint8_t> list_of_int{{seq_col0.begin(), seq_col0.begin() + 50},
{seq_col0.begin() + 50, seq_col0.end()}};
auto struct_col = cudf::test::structs_column_wrapper{{list_of_int}, validity};

auto const expected = table_view{{struct_col}};
Expand Down
4 changes: 2 additions & 2 deletions cpp/tests/reshape/byte_cast_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -327,7 +327,7 @@ TEST_F(ByteCastTest, StringValues)
{
strings_column_wrapper const strings_col(
{"", "The quick", " brown fox...", "!\"#$%&\'()*+,-./", "0123456789:;<=>?@", "[\\]^_`{|}~"});
lists_column_wrapper<int8_t> const strings_expected(
lists_column_wrapper<uint8_t> const strings_expected(
{{},
{0x54, 0x68, 0x65, 0x20, 0x71, 0x75, 0x69, 0x63, 0x6b},
{0x20, 0x62, 0x72, 0x6f, 0x77, 0x6e, 0x20, 0x66, 0x6f, 0x78, 0x2e, 0x2e, 0x2e},
Expand Down
2 changes: 1 addition & 1 deletion java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3942,7 +3942,7 @@ void testCastStringToByteList() {
"\\THE\t8\ud720", "tést strings", "", "éé");
ColumnVector res = cv.asByteList(true);
ColumnVector expected = ColumnVector.fromLists(new HostColumnVector.ListType(true,
new HostColumnVector.BasicType(true, DType.INT8)), list1, list2, list3, list4, list5,
new HostColumnVector.BasicType(true, DType.UINT8)), list1, list2, list3, list4, list5,
list6, list7, list8)) {
assertColumnsAreEqual(expected, res);
}
Expand Down
2 changes: 1 addition & 1 deletion java/src/test/java/ai/rapids/cudf/TableTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -621,7 +621,7 @@ void testParquetWriteToBufferChunkedBinary() {
List<Byte> bin2 = asList(string2);

try (Table binTable = new Table.TestBuilder()
.column(new ListType(true, new BasicType(false, DType.INT8)),
.column(new ListType(true, new BasicType(false, DType.UINT8)),
bin1, bin2)
.build();
Table stringTable = new Table.TestBuilder()
Expand Down

0 comments on commit 1effe19

Please sign in to comment.