diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
index a40993ee2dd..f165bd5ec3b 100644
--- a/cpp/src/io/parquet/reader_impl.cu
+++ b/cpp/src/io/parquet/reader_impl.cu
@@ -679,6 +679,10 @@ class aggregate_reader_metadata {
       }
 
       std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting));
+
+      // pop off the extra nesting element added for one-level lists
+      if (schema_elem.is_one_level_list()) { nesting.pop_back(); }
+
       path_is_valid = true;  // If we're able to reach leaf then path is valid
     }
 
diff --git a/python/cudf/cudf/tests/data/parquet/one_level_list2.parquet b/python/cudf/cudf/tests/data/parquet/one_level_list2.parquet
new file mode 100644
index 00000000000..cd5acd04594
Binary files /dev/null and b/python/cudf/cudf/tests/data/parquet/one_level_list2.parquet differ
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 727200293f7..3a07ce6234c 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -2386,6 +2386,32 @@ def test_parquet_reader_one_level_list(datadir):
     assert_eq(expect, got)
 
 
+# Tests a specific bug fix / edge case.
+# Specifically: in a parquet file containing a particular way of representing
+# a list column in a schema, the cudf reader was confusing
+# nesting information between the list column and a subsequent
+# string column, ultimately causing a crash.
+def test_parquet_reader_one_level_list2(datadir):
+    # We are reading in a file containing binary types, but cudf returns
+    # those as strings, so we have to massage the pandas data to get
+    # them to compare correctly.
+    def postprocess(val):
+        if isinstance(val, bytes):
+            return val.decode()
+        elif isinstance(val, np.ndarray):
+            return np.array([v.decode() for v in val])
+        else:
+            return val
+
+    fname = datadir / "one_level_list2.parquet"
+
+    expect = pd.read_parquet(fname)
+    expect = expect.applymap(postprocess)
+    got = cudf.read_parquet(fname)
+
+    assert_eq(expect, got, check_dtype=False)
+
+
 @pytest.mark.parametrize("size_bytes", [4_000_000, 1_000_000, 600_000])
 @pytest.mark.parametrize("size_rows", [1_000_000, 100_000, 10_000])
 def test_parquet_writer_row_group_size(
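
For context on the schema shape this patch targets: a one-level list is the legacy Parquet encoding in which the list is a single repeated leaf field, with no LIST-annotated wrapper group around it, which is why the reader tracks an extra nesting element that must be popped before the next column. Below is a minimal sketch of inspecting the new test file with pyarrow; the column names and types inside one_level_list2.parquet are assumptions for illustration, and only the repeated-leaf-followed-by-string shape matters.

    import pyarrow.parquet as pq

    # Print the raw Parquet schema of the new test file. A one-level list
    # shows up as a single repeated leaf field (no LIST-annotated group);
    # the bug was that the reader's nesting bookkeeping for such a column
    # leaked into the string/binary column that follows it.
    # Path is relative to the repo root; adjust as needed.
    schema = pq.ParquetFile(
        "python/cudf/cudf/tests/data/parquet/one_level_list2.parquet"
    ).schema
    print(schema)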