Fix an issue reading struct-of-list types in Parquet. (#11910)
Fixes NVIDIA/spark-rapids#6718

A bug recently introduced in #11752 meant that an insufficient check for whether an input column contained repetition information could produce incorrect results for column hierarchies with a struct at the root.
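
For context, a minimal sketch of the affected shape, loosely modeled on the regression test added below; the helper name, file path, and literal values are illustrative, and the wrappers come from cudf's test utilities (cudf_test headers). A struct whose child is a list puts a STRUCT at the root of the output hierarchy, while the leaf under the list still carries repetition levels:

```cpp
#include <cudf_test/column_wrapper.hpp>

#include <cudf/io/parquet.hpp>
#include <cudf/table/table_view.hpp>

#include <string>

void struct_of_list_roundtrip()
{
  // struct<list<int32>, int32>: the root output column is a STRUCT, but the list
  // child means the leaf beneath it has max_repetition_level > 0
  cudf::test::lists_column_wrapper<int32_t> lists{{1, 2}, {3}, {4, 5, 6}};
  cudf::test::fixed_width_column_wrapper<int32_t> ints{7, 8, 9};
  cudf::test::structs_column_wrapper structs({lists, ints});
  cudf::table_view tbl({structs});

  std::string const filepath{"StructOfList.parquet"};
  cudf::io::parquet_writer_options write_opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl);
  cudf::io::write_parquet(write_opts);

  // before this fix, struct-rooted hierarchies like this could read back incorrectly
  cudf::io::parquet_reader_options read_opts =
    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
  auto result = cudf::io::read_parquet(read_opts);
}
```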

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Jim Brennan (https://github.com/jbrennan333)
  - Nghia Truong (https://github.com/ttnghia)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: #11910
nvdbaranec authored Oct 13, 2022
1 parent 678946b commit fb0922f
Showing 4 changed files with 17 additions and 8 deletions.
7 changes: 2 additions & 5 deletions cpp/src/io/parquet/page_data.cu
@@ -1860,11 +1860,8 @@ void PreprocessColumnData(hostdevice_vector<PageInfo>& pages,
      out_buf.create(size, stream, mr);
    }

-   // for nested hierarchies, compute per-page start offset.
-   // it would be better/safer to be checking (schema.max_repetition_level > 0) here, but there's
-   // no easy way to get at that info here. we'd have to move this function into reader_impl.cu
-   if ((out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) ||
-       out_buf.type.id() == type_id::LIST) {
+   // for nested hierarchies, compute per-page start offset
+   if (input_col.has_repetition) {
      thrust::exclusive_scan_by_key(rmm::exec_policy(stream),
                                    page_keys.begin(),
                                    page_keys.end(),
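
The per-page start offset computation this condition guards is a segmented exclusive scan keyed by column: within each column's run of pages, a page's start offset is the sum of the row counts of the pages before it. A host-side sketch of the same idea, with hypothetical page keys and row counts (the real code runs on the device over PageInfo structs):

```cpp
#include <thrust/execution_policy.h>
#include <thrust/scan.h>

#include <vector>

int main()
{
  // hypothetical inputs: which column owns each page, and each page's row count
  std::vector<int> page_keys{0, 0, 0, 1, 1};
  std::vector<int> page_rows{100, 80, 120, 50, 50};
  std::vector<int> page_start(page_rows.size());

  // exclusive scan restarted at every key change: per-page start offsets per column
  thrust::exclusive_scan_by_key(thrust::host,
                                page_keys.begin(),
                                page_keys.end(),
                                page_rows.begin(),
                                page_start.begin());
  // page_start == {0, 100, 180, 0, 50}
  return 0;
}
```
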
7 changes: 7 additions & 0 deletions cpp/src/io/parquet/parquet_gpu.hpp
@@ -57,9 +57,16 @@ constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1;
 struct input_column_info {
   int schema_idx;
   std::string name;
+  bool has_repetition;
   // size == nesting depth. the associated real output
   // buffer index in the dest column for each level of nesting.
   std::vector<int> nesting;

+  input_column_info(int _schema_idx, std::string _name, bool _has_repetition)
+    : schema_idx(_schema_idx), name(_name), has_repetition(_has_repetition)
+  {
+  }

   auto nesting_depth() const { return nesting.size(); }
 };

4 changes: 2 additions & 2 deletions cpp/src/io/parquet/reader_impl.cu
@@ -684,8 +684,8 @@ class aggregate_reader_metadata {
    // if I have no children, we're at a leaf and I'm an input column (that is, one with actual
    // data stored) so add me to the list.
    if (schema_elem.num_children == 0) {
-     input_column_info& input_col =
-       input_columns.emplace_back(input_column_info{schema_idx, schema_elem.name});
+     input_column_info& input_col = input_columns.emplace_back(
+       input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0});

      // set up child output column for one-level encoding list
      if (schema_elem.is_one_level_list()) {
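
For reference, a sketch of why `max_repetition_level > 0` is the right signal here: repetition levels come from repeated fields (lists), so a list anywhere in the hierarchy gives its leaves a nonzero max repetition level even when the root output column is a struct. The struct below is a simplified stand-in for `input_column_info`, and the schema index and column name are made up:

```cpp
#include <string>
#include <vector>

// simplified stand-in for input_column_info (see parquet_gpu.hpp above)
struct input_column_info {
  int schema_idx;
  std::string name;
  bool has_repetition;
  std::vector<int> nesting;
};

int main()
{
  // hypothetical leaf columns and the flag their Parquet schema implies:
  //   int32                -> max_repetition_level 0        -> has_repetition == false
  //   struct<int32, float> -> 0 for both leaves             -> false
  //   list<int32>          -> 1 for the leaf element        -> true
  //   struct<list<int32>>  -> 1 for the list's leaf element -> true  (the case fixed here)
  input_column_info col{/*schema_idx=*/4, "s.l.element", /*has_repetition=*/true, {}};
  return col.has_repetition ? 0 : 1;
}
```
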
7 changes: 6 additions & 1 deletion cpp/tests/io/parquet_test.cpp
@@ -2633,6 +2633,11 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes)
      0, [string_per_row](cudf::size_type idx) { return idx * string_per_row; });
    cudf::test::fixed_width_column_wrapper<cudf::offset_type> offsets(offset_iter,
                                                                      offset_iter + num_rows + 1);

+   auto _c3_valids =
+     cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 200; });
+   std::vector<bool> c3_valids(num_rows);
+   std::copy(_c3_valids, _c3_valids + num_rows, c3_valids.begin());
    auto _c3_list =
      cudf::make_lists_column(num_rows,
                              offsets.release(),
@@ -2646,7 +2651,7 @@
    c3_children.push_back(std::move(c3_list));
    c3_children.push_back(c3_ints.release());
    c3_children.push_back(c3_floats.release());
-   cudf::test::structs_column_wrapper _c3(std::move(c3_children));
+   cudf::test::structs_column_wrapper _c3(std::move(c3_children), c3_valids);
    auto c3 = cudf::purge_nonempty_nulls(static_cast<cudf::structs_column_view>(_c3));

    // write it out
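
One note on the test change: `_c3_valids` returns `index % 200`, which converts to `bool`, so every 200th struct row becomes null and the struct-at-root path is exercised with a null mask. A standalone sketch of the same pattern, using a made-up row count:

```cpp
#include <iostream>
#include <vector>

int main()
{
  int const num_rows = 600;  // hypothetical; the test uses its own row count
  std::vector<bool> c3_valids(num_rows);
  for (int index = 0; index < num_rows; ++index) {
    // same predicate as the test's lambda once converted to bool:
    // nonzero -> valid, zero -> null, so rows 0, 200, 400 are null
    c3_valids[index] = (index % 200) != 0;
  }

  std::cout << "null struct rows:";
  for (int i = 0; i < num_rows; ++i) {
    if (!c3_valids[i]) { std::cout << ' ' << i; }
  }
  std::cout << '\n';  // prints: null struct rows: 0 200 400
  return 0;
}
```
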
