Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix an issue reading struct-of-list types in Parquet. #11910

Merged
merged 1 commit into from
Oct 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions cpp/src/io/parquet/page_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1860,11 +1860,8 @@ void PreprocessColumnData(hostdevice_vector<PageInfo>& pages,
out_buf.create(size, stream, mr);
}

// for nested hierarchies, compute per-page start offset.
// it would be better/safer to be checking (schema.max_repetition_level > 0) here, but there's
// no easy way to get at that info here. we'd have to move this function into reader_impl.cu
if ((out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) ||
out_buf.type.id() == type_id::LIST) {
// for nested hierarchies, compute per-page start offset
if (input_col.has_repetition) {
thrust::exclusive_scan_by_key(rmm::exec_policy(stream),
page_keys.begin(),
page_keys.end(),
Expand Down
7 changes: 7 additions & 0 deletions cpp/src/io/parquet/parquet_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,16 @@ constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1;
struct input_column_info {
int schema_idx;
std::string name;
bool has_repetition;
// size == nesting depth. the associated real output
// buffer index in the dest column for each level of nesting.
std::vector<int> nesting;

input_column_info(int _schema_idx, std::string _name, bool _has_repetition)
: schema_idx(_schema_idx), name(_name), has_repetition(_has_repetition)
{
}
nvdbaranec marked this conversation as resolved.
Show resolved Hide resolved

auto nesting_depth() const { return nesting.size(); }
};

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/parquet/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -684,8 +684,8 @@ class aggregate_reader_metadata {
// if I have no children, we're at a leaf and I'm an input column (that is, one with actual
// data stored) so add me to the list.
if (schema_elem.num_children == 0) {
input_column_info& input_col =
input_columns.emplace_back(input_column_info{schema_idx, schema_elem.name});
input_column_info& input_col = input_columns.emplace_back(
input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0});

// set up child output column for one-level encoding list
if (schema_elem.is_one_level_list()) {
Expand Down
7 changes: 6 additions & 1 deletion cpp/tests/io/parquet_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2633,6 +2633,11 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes)
0, [string_per_row](cudf::size_type idx) { return idx * string_per_row; });
cudf::test::fixed_width_column_wrapper<cudf::offset_type> offsets(offset_iter,
offset_iter + num_rows + 1);

auto _c3_valids =
cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 200; });
std::vector<bool> c3_valids(num_rows);
std::copy(_c3_valids, _c3_valids + num_rows, c3_valids.begin());
nvdbaranec marked this conversation as resolved.
Show resolved Hide resolved
auto _c3_list =
cudf::make_lists_column(num_rows,
offsets.release(),
Expand All @@ -2646,7 +2651,7 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes)
c3_children.push_back(std::move(c3_list));
c3_children.push_back(c3_ints.release());
c3_children.push_back(c3_floats.release());
cudf::test::structs_column_wrapper _c3(std::move(c3_children));
cudf::test::structs_column_wrapper _c3(std::move(c3_children), c3_valids);
auto c3 = cudf::purge_nonempty_nulls(static_cast<cudf::structs_column_view>(_c3));

// write it out
Expand Down