Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Removing int8 column option from parquet byte_array writing #11539

Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions cpp/src/io/parquet/parquet_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -298,10 +298,7 @@ inline size_type __device__ row_to_value_idx(size_type idx,
} else {
auto list_col = cudf::detail::lists_column_device_view(col);
auto child = list_col.child();
if (parquet_col.output_as_byte_array &&
(child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8)) {
break;
}
if (parquet_col.output_as_byte_array && child.type().id() == type_id::INT8) { break; }
idx = list_col.offset_at(idx);
col = child;
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -542,7 +542,7 @@ std::vector<schema_tree_node> construct_schema_tree(
if (col->type().id() != type_id::LIST) { return false; }
auto const child_col_type =
col->children[lists_column_view::child_column_index]->type().id();
return child_col_type == type_id::INT8 or child_col_type == type_id::UINT8;
return child_col_type == type_id::INT8;
};

// There is a special case for a list<int8> column with one byte column child. This column can
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/io/utilities/column_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,7 @@ rmm::device_uvector<column_device_view> create_leaf_column_device_views(
? col.child(lists_column_view::child_column_index)
: col.child(0);
// stop early if writing a byte array
if (col_desc[index].stats_dtype == dtype_byte_array &&
(child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8)) {
if (col_desc[index].stats_dtype == dtype_byte_array && child.type().id() == type_id::INT8) {
break;
}
col = child;
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/lists/dremel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,7 @@ dremel_data get_dremel_data(column_view h_col,
}
if (curr_col.type().id() == type_id::LIST) {
auto child = curr_col.child(lists_column_view::child_column_index);
if ((child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8) &&
output_as_byte_array) {
if (child.type().id() == type_id::INT8 && output_as_byte_array) {
hyperbolic2346 marked this conversation as resolved.
Show resolved Hide resolved
// consider this the bottom
break;
}
Expand Down
18 changes: 9 additions & 9 deletions cpp/tests/io/parquet_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4215,15 +4215,15 @@ TEST_F(ParquetWriterTest, ByteArrayStats)
// Check that byte array min and max statistics are written as expected. If a byte array were
// written as a string, the maximum UTF-8 value is 0xf7bfbfbf, so the minimum statistic would be
// set to that value rather than to an actual minimum that is higher than it.
std::vector<uint8_t> expected_col0_min{0xf0};
std::vector<uint8_t> expected_col0_max{0xf0, 0xf5, 0xf5};
std::vector<uint8_t> expected_col1_min{0xfe, 0xfe, 0xfe};
std::vector<uint8_t> expected_col1_max{0xfe, 0xfe, 0xfe};

cudf::test::lists_column_wrapper<uint8_t> list_int_col0{
{0xf0}, {0xf0, 0xf5, 0xf3}, {0xf0, 0xf5, 0xf5}};
cudf::test::lists_column_wrapper<uint8_t> list_int_col1{
{0xfe, 0xfe, 0xfe}, {0xfe, 0xfe, 0xfe}, {0xfe, 0xfe, 0xfe}};
std::vector<uint8_t> expected_col0_min{0x70};
hyperbolic2346 marked this conversation as resolved.
Show resolved Hide resolved
std::vector<uint8_t> expected_col0_max{0x70, 0x75, 0x75};
std::vector<uint8_t> expected_col1_min{0x7e, 0x7e, 0x7e};
std::vector<uint8_t> expected_col1_max{0x7e, 0x7e, 0x7e};

cudf::test::lists_column_wrapper<int8_t> list_int_col0{
{0x70}, {0x70, 0x75, 0x73}, {0x70, 0x75, 0x75}};
cudf::test::lists_column_wrapper<int8_t> list_int_col1{
{0x7e, 0x7e, 0x7e}, {0x7e, 0x7e, 0x7e}, {0x7e, 0x7e, 0x7e}};

std::vector<std::unique_ptr<column>> cols;
cols.push_back(list_int_col0.release());
Expand Down