From 005949b79aa7581557d0545fd9eed1b60616ea95 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Wed, 27 Apr 2022 16:02:13 -0500 Subject: [PATCH] Fix an issue with one_level_list schemas which were causing nesting information to propagate between columns, causing crashes. --- cpp/src/io/parquet/reader_impl.cu | 4 +++ .../data/parquet/one_level_list2.parquet | Bin 0 -> 656 bytes python/cudf/cudf/tests/test_parquet.py | 26 ++++++++++++++++++ 3 files changed, 30 insertions(+) create mode 100644 python/cudf/cudf/tests/data/parquet/one_level_list2.parquet diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 46b3206f731..bbafcc40aa9 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -679,6 +679,10 @@ class aggregate_reader_metadata { } std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting)); + + // pop off the extra nesting element. + if (schema_elem.is_one_level_list()) { nesting.pop_back(); } + path_is_valid = true; // If we're able to reach leaf then path is valid } diff --git a/python/cudf/cudf/tests/data/parquet/one_level_list2.parquet b/python/cudf/cudf/tests/data/parquet/one_level_list2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..cd5acd045949218a7a419c66a11305eb7ddeb0d7 GIT binary patch literal 656 zcmZuvL2JS=6i%B^iwHXwA!>?8yPF5xZaqSsiaIT1sQ5O>YkL@x};4+TOE@dABdVdZclNi z^MvC(D~||Etrm&O5F?K3Jo&auTh72jwgX%WStgnE?a76?e(plE6K-4K=4|fglcPB` zNBx6QgYJHy2LrGdcKUl$x9?cUsU`%l&GrT?s7(H`)BnQfc~5^*b(pak)L`mgX0cDg z25C*Ars=KdXCht5;W&zuER5r^ Uj7Cz#awr$_8WnI=rf?U(0hF3wRR910 literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 727200293f7..3a07ce6234c 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2386,6 +2386,32 @@ def test_parquet_reader_one_level_list(datadir): assert_eq(expect, got) +# testing a specific bug-fix/edge case. 
+# specifically: in a parquet file containing a particular way of representing +# a list column in a schema, the cudf reader was confusing +# nesting information between a list column and a subsequent +# string column, ultimately causing a crash. +def test_parquet_reader_one_level_list2(datadir): + # we are reading in a file containing binary types, but cudf returns + # those as strings. so we have to massage the pandas data to get + # them to compare correctly. + def postprocess(val): + if isinstance(val, bytes): + return val.decode() + elif isinstance(val, np.ndarray): + return np.array([v.decode() for v in val]) + else: + return val + + fname = datadir / "one_level_list2.parquet" + + expect = pd.read_parquet(fname) + expect = expect.applymap(postprocess) + got = cudf.read_parquet(fname) + + assert_eq(expect, got, check_dtype=False) + + @pytest.mark.parametrize("size_bytes", [4_000_000, 1_000_000, 600_000]) @pytest.mark.parametrize("size_rows", [1_000_000, 100_000, 10_000]) def test_parquet_writer_row_group_size(