From 489237defc840c635771c879b57bbd652c7f13e3 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Tue, 11 Oct 2022 16:35:10 -0500
Subject: [PATCH] Fix an issue where a weak test for whether a column contained
 repetition information was causing a failure.

---
 cpp/src/io/parquet/page_data.cu    | 7 ++-----
 cpp/src/io/parquet/parquet_gpu.hpp | 7 +++++++
 cpp/src/io/parquet/reader_impl.cu  | 4 ++--
 cpp/tests/io/parquet_test.cpp      | 7 ++++++-
 4 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
index a5f6d737637..57d55be6145 100644
--- a/cpp/src/io/parquet/page_data.cu
+++ b/cpp/src/io/parquet/page_data.cu
@@ -1860,11 +1860,8 @@ void PreprocessColumnData(hostdevice_vector<PageInfo>& pages,
         out_buf.create(size, stream, mr);
       }
 
-      // for nested hierarchies, compute per-page start offset.
-      // it would be better/safer to be checking (schema.max_repetition_level > 0) here, but there's
-      // no easy way to get at that info here. we'd have to move this function into reader_impl.cu
-      if ((out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) ||
-          out_buf.type.id() == type_id::LIST) {
+      // for nested hierarchies, compute per-page start offset
+      if (input_col.has_repetition) {
         thrust::exclusive_scan_by_key(rmm::exec_policy(stream),
                                       page_keys.begin(),
                                       page_keys.end(),
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 8f4cd5c6f3b..1a8c0f4cd9e 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -57,9 +57,16 @@ constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1;
 struct input_column_info {
   int schema_idx;
   std::string name;
+  bool has_repetition;
   // size == nesting depth. the associated real output
   // buffer index in the dest column for each level of nesting.
   std::vector<int> nesting;
+
+  input_column_info(int _schema_idx, std::string _name, bool _has_repetition)
+    : schema_idx(_schema_idx), name(_name), has_repetition(_has_repetition)
+  {
+  }
+
   auto nesting_depth() const { return nesting.size(); }
 };
 
diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
index 07869189089..0997d2a968d 100644
--- a/cpp/src/io/parquet/reader_impl.cu
+++ b/cpp/src/io/parquet/reader_impl.cu
@@ -684,8 +684,8 @@ class aggregate_reader_metadata {
       // if I have no children, we're at a leaf and I'm an input column (that is, one with actual
       // data stored) so add me to the list.
       if (schema_elem.num_children == 0) {
-        input_column_info& input_col =
-          input_columns.emplace_back(input_column_info{schema_idx, schema_elem.name});
+        input_column_info& input_col = input_columns.emplace_back(
+          input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0});
 
         // set up child output column for one-level encoding list
         if (schema_elem.is_one_level_list()) {
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 134eff54144..6f1c5ef7eb1 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -2633,6 +2633,11 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes)
     0, [string_per_row](cudf::size_type idx) { return idx * string_per_row; });
   cudf::test::fixed_width_column_wrapper<cudf::offset_type> offsets(offset_iter,
                                                                     offset_iter + num_rows + 1);
+
+  auto _c3_valids =
+    cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 200; });
+  std::vector<bool> c3_valids(num_rows);
+  std::copy(_c3_valids, _c3_valids + num_rows, c3_valids.begin());
   auto _c3_list =
     cudf::make_lists_column(num_rows,
                             offsets.release(),
@@ -2646,7 +2651,7 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes)
   c3_children.push_back(std::move(c3_list));
   c3_children.push_back(c3_ints.release());
   c3_children.push_back(c3_floats.release());
-  cudf::test::structs_column_wrapper _c3(std::move(c3_children));
+  cudf::test::structs_column_wrapper _c3(std::move(c3_children), c3_valids);
   auto c3 = cudf::purge_nonempty_nulls(static_cast<cudf::structs_column_view>(_c3));
 
   // write it out
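
Illustrative sketch only, not part of the patch: a self-contained model of the predicate this change
standardizes on (schema.max_repetition_level > 0) for deciding whether an input column participates
in list/repetition handling. The schema_element_stub and input_column_info_stub types below are
hypothetical stand-ins for cudf's SchemaElement and input_column_info, reduced to what the check needs.

// Standalone C++ sketch of how a has_repetition flag could be derived when input
// columns are built. Names ending in _stub are hypothetical, not cudf types.
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct schema_element_stub {
  std::string name;
  int max_repetition_level;  // > 0 means the leaf sits under at least one repeated (LIST) level
};

struct input_column_info_stub {
  int schema_idx;
  std::string name;
  bool has_repetition;  // replaces the old LIST-flag heuristic used in PreprocessColumnData

  input_column_info_stub(int _schema_idx, std::string _name, bool _has_repetition)
    : schema_idx(_schema_idx), name(std::move(_name)), has_repetition(_has_repetition)
  {
  }
};

int main()
{
  std::vector<schema_element_stub> const leaves = {
    {"flat_int", 0},          // plain column, no repetition above it
    {"list_element", 1},      // element of a LIST column
    {"list_of_list_leaf", 2}  // leaf nested under two repeated levels
  };

  std::vector<input_column_info_stub> input_columns;
  for (int idx = 0; idx < static_cast<int>(leaves.size()); ++idx) {
    auto const& elem = leaves[idx];
    // Same predicate the patch applies in reader_impl.cu when building input columns:
    // any repetition level above zero means per-page start offsets must be computed.
    input_columns.emplace_back(idx, elem.name, elem.max_repetition_level > 0);
  }

  for (auto const& col : input_columns) {
    std::cout << col.name << ": has_repetition=" << std::boolalpha << col.has_repetition << "\n";
  }
  return 0;
}

By storing the flag on input_column_info at schema-traversal time, the preprocessing step in
page_data.cu can test input_col.has_repetition directly instead of approximating repetition from
output-buffer flags, which is the weakness the removed comment in the patch describes.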