diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index b5f36ca713f..48c7e2fca57 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -180,11 +180,14 @@ struct SchemaElement { // https://github.com/apache/parquet-cpp/blob/642da05/src/parquet/schema.h#L49-L50 // One-level LIST encoding: Only allows required lists with required cells: // repeated value_type name - [[nodiscard]] bool is_one_level_list() const + [[nodiscard]] bool is_one_level_list(SchemaElement const& parent) const { - return repetition_type == REPEATED and num_children == 0; + return repetition_type == REPEATED and num_children == 0 and not parent.is_list(); } + // returns true if the element is a list + [[nodiscard]] bool is_list() const { return converted_type == LIST; } + // in parquet terms, a group is a level of nesting in the schema. a group // can be a struct or a list [[nodiscard]] bool is_struct() const diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index a3c89c3beda..1f70a7afdfe 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -426,7 +426,7 @@ aggregate_reader_metadata::select_columns(std::optional } // if we're at the root, this is a new output column - auto const col_type = schema_elem.is_one_level_list() + auto const col_type = schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx)) ? 
type_id::LIST : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const dtype = to_data_type(col_type, schema_elem); @@ -465,7 +465,7 @@ aggregate_reader_metadata::select_columns(std::optional input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0}); // set up child output column for one-level encoding list - if (schema_elem.is_one_level_list()) { + if (schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx))) { // determine the element data type auto const element_type = to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); @@ -486,7 +486,9 @@ aggregate_reader_metadata::select_columns(std::optional std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting)); // pop off the extra nesting element. - if (schema_elem.is_one_level_list()) { nesting.pop_back(); } + if (schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx))) { + nesting.pop_back(); + } path_is_valid = true; // If we're able to reach leaf then path is valid } diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index cdc5896803c..748f0164244 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -129,11 +129,12 @@ class aggregate_reader_metadata { // walk upwards, skipping repeated fields while (schema_index > 0) { - if (!pfm.schema[schema_index].is_stub()) { depth++; } + auto const& elm = pfm.schema[schema_index]; + if (!elm.is_stub()) { depth++; } // schema of one-level encoding list doesn't contain nesting information, so we need to // manually add an extra nesting level - if (pfm.schema[schema_index].is_one_level_list()) { depth++; } - schema_index = pfm.schema[schema_index].parent_idx; + if (elm.is_one_level_list(pfm.schema[elm.parent_idx])) { depth++; } + schema_index = elm.parent_idx; } return depth; } diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu 
b/cpp/src/io/parquet/reader_impl_preprocess.cu index bcb02466a68..14aaec48b2b 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -119,7 +119,7 @@ void generate_depth_remappings(std::map, std::ve shallowest = cur_schema.is_stub() ? cur_depth + 1 : cur_depth; } // if it's one-level encoding list - else if (cur_schema.is_one_level_list()) { + else if (cur_schema.is_one_level_list(md.get_schema(cur_schema.parent_idx))) { shallowest = cur_depth - 1; } if (!cur_schema.is_stub()) { cur_depth--; } diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 406471917ec..8662e1d5bd0 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -5376,4 +5376,64 @@ TEST_F(ParquetWriterTest, UserNullabilityInvalid) EXPECT_THROW(cudf::io::write_parquet(write_opts), cudf::logic_error); } +TEST_F(ParquetReaderTest, SingleLevelLists) +{ + unsigned char list_bytes[] = { + 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x28, 0x15, 0x28, 0x15, 0xa7, 0xce, 0x91, 0x8c, 0x06, + 0x1c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, + 0x02, 0x19, 0x3c, 0x48, 0x0c, 0x73, 0x70, 0x61, 0x72, 0x6b, 0x5f, 0x73, 0x63, 0x68, 0x65, 0x6d, + 0x61, 0x15, 0x02, 0x00, 0x35, 0x00, 0x18, 0x01, 0x66, 0x15, 0x02, 0x15, 0x06, 0x4c, 0x3c, 0x00, + 0x00, 0x00, 0x15, 0x02, 0x25, 0x04, 0x18, 0x05, 0x61, 0x72, 0x72, 0x61, 0x79, 0x00, 0x16, 0x02, + 0x19, 0x1c, 0x19, 0x1c, 0x26, 0x08, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x06, 0x19, 0x28, 0x01, + 0x66, 0x05, 0x61, 0x72, 0x72, 0x61, 0x79, 0x15, 0x00, 0x16, 0x04, 0x16, 0x56, 0x16, 0x56, 0x26, + 0x08, 0x3c, 0x18, 0x04, 0x01, 0x00, 0x00, 0x00, 0x18, 0x04, 0x00, 0x00, 0x00, 0x00, 0x16, 0x00, + 0x28, 0x04, 0x01, 0x00, 0x00, 0x00, 0x18, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x19, 0x1c, 0x15, + 0x00, 0x15, 0x00, 0x15, 0x02, 0x00, 0x00, 0x00, 
0x16, 0x56, 0x16, 0x02, 0x26, 0x08, 0x16, 0x56, + 0x14, 0x00, 0x00, 0x28, 0x13, 0x52, 0x41, 0x50, 0x49, 0x44, 0x53, 0x20, 0x53, 0x70, 0x61, 0x72, + 0x6b, 0x20, 0x50, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x19, 0x1c, 0x1c, 0x00, 0x00, 0x00, 0x9f, 0x00, + 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; + + // read single level list reproducing parquet file + cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info{reinterpret_cast<char const*>(list_bytes), sizeof(list_bytes)}); + auto table = cudf::io::read_parquet(read_opts); + + auto const c0 = table.tbl->get_column(0); + EXPECT_TRUE(c0.type().id() == cudf::type_id::LIST); + + auto const lc = cudf::lists_column_view(c0); + auto const child = lc.child(); + EXPECT_TRUE(child.type().id() == cudf::type_id::INT32); +} + +TEST_F(ParquetReaderTest, ChunkedSingleLevelLists) +{ + unsigned char list_bytes[] = { + 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x28, 0x15, 0x28, 0x15, 0xa7, 0xce, 0x91, 0x8c, 0x06, + 0x1c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, + 0x02, 0x19, 0x3c, 0x48, 0x0c, 0x73, 0x70, 0x61, 0x72, 0x6b, 0x5f, 0x73, 0x63, 0x68, 0x65, 0x6d, + 0x61, 0x15, 0x02, 0x00, 0x35, 0x00, 0x18, 0x01, 0x66, 0x15, 0x02, 0x15, 0x06, 0x4c, 0x3c, 0x00, + 0x00, 0x00, 0x15, 0x02, 0x25, 0x04, 0x18, 0x05, 0x61, 0x72, 0x72, 0x61, 0x79, 0x00, 0x16, 0x02, + 0x19, 0x1c, 0x19, 0x1c, 0x26, 0x08, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x06, 0x19, 0x28, 0x01, + 0x66, 0x05, 0x61, 0x72, 0x72, 0x61, 0x79, 0x15, 0x00, 0x16, 0x04, 0x16, 0x56, 0x16, 0x56, 0x26, + 0x08, 0x3c, 0x18, 0x04, 0x01, 0x00, 0x00, 0x00, 0x18, 0x04, 0x00, 0x00, 0x00, 0x00, 0x16, 0x00, + 0x28, 0x04, 0x01, 0x00, 0x00, 0x00, 0x18, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x19, 0x1c, 0x15, + 0x00, 0x15, 0x00, 0x15, 0x02, 0x00, 0x00, 0x00, 0x16, 0x56, 0x16, 0x02, 0x26, 0x08, 0x16, 0x56, + 0x14, 0x00, 0x00, 0x28, 0x13, 
0x52, 0x41, 0x50, 0x49, 0x44, 0x53, 0x20, 0x53, 0x70, 0x61, 0x72, + 0x6b, 0x20, 0x50, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x19, 0x1c, 0x1c, 0x00, 0x00, 0x00, 0x9f, 0x00, + 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; + + auto reader = cudf::io::chunked_parquet_reader( + 1L << 31, + cudf::io::parquet_reader_options::builder( + cudf::io::source_info{reinterpret_cast<char const*>(list_bytes), sizeof(list_bytes)})); + int iterations = 0; + while (reader.has_next() && iterations < 10) { + auto chunk = reader.read_chunk(); iterations++; + } + EXPECT_TRUE(iterations < 10); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/tests/data/parquet/one_level_list3.parquet b/python/cudf/cudf/tests/data/parquet/one_level_list3.parquet new file mode 100644 index 00000000000..788e2c05743 Binary files /dev/null and b/python/cudf/cudf/tests/data/parquet/one_level_list3.parquet differ diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index c24ff080033..0ab5d35f9f8 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2573,6 +2573,20 @@ def postprocess(val): assert_eq(expect, got, check_dtype=False) +# testing a specific bug-fix/edge case. +# specifically: in a parquet file containing a particular way of representing +# a list column in a schema, the cudf reader was confusing +# nesting information and building a list of list of int instead +# of a list of int +def test_parquet_reader_one_level_list3(datadir): + fname = datadir / "one_level_list3.parquet" + + expect = pd.read_parquet(fname) + got = cudf.read_parquet(fname) + + assert_eq(expect, got, check_dtype=True) + + @pytest.mark.parametrize("size_bytes", [4_000_000, 1_000_000, 600_000]) @pytest.mark.parametrize("size_rows", [1_000_000, 100_000, 10_000]) def test_to_parquet_row_group_size(