From 9b8d26f8bf98424bf740627a1b226233861f961e Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Fri, 29 Apr 2022 13:04:18 -0500 Subject: [PATCH] Fix an issue with one_level_list schemas in parquet reader. (#10750) Partially addresses: https://github.com/rapidsai/cudf/issues/10733 For a particular way of encoding list schemas (an old way that Spark seems to use sometimes), the parquet reader was accidentally propagating incorrect nesting information between columns. Just a simple bug of not popping an extra value off a stack. Note: this is simply a fix so that the files read correctly, however the internal data in the file is actually of binary type and cudf converts these to string columns. This PR does not add support for binary as a real type in cudf. Authors: - https://github.com/nvdbaranec Approvers: - Yunsong Wang (https://github.com/PointKernel) - MithunR (https://github.com/mythrocks) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/10750 --- cpp/src/io/parquet/reader_impl.cu | 4 +++ .../data/parquet/one_level_list2.parquet | Bin 0 -> 656 bytes python/cudf/cudf/tests/test_parquet.py | 26 ++++++++++++++++++ 3 files changed, 30 insertions(+) create mode 100644 python/cudf/cudf/tests/data/parquet/one_level_list2.parquet diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index a40993ee2dd..f165bd5ec3b 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -679,6 +679,10 @@ class aggregate_reader_metadata { } std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting)); + + // pop off the extra nesting element. 
+ if (schema_elem.is_one_level_list()) { nesting.pop_back(); } + path_is_valid = true; // If we're able to reach leaf then path is valid } diff --git a/python/cudf/cudf/tests/data/parquet/one_level_list2.parquet b/python/cudf/cudf/tests/data/parquet/one_level_list2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..cd5acd045949218a7a419c66a11305eb7ddeb0d7 GIT binary patch literal 656 zcmZuvL2JS=6i%B^iwHXwA!>?8yPF5xZaqSsiaIT1sQ5O>YkL@x};4+TOE@dABdVdZclNi z^MvC(D~||Etrm&O5F?K3Jo&auTh72jwgX%WStgnE?a76?e(plE6K-4K=4|fglcPB` zNBx6QgYJHy2LrGdcKUl$x9?cUsU`%l&GrT?s7(H`)BnQfc~5^*b(pak)L`mgX0cDg z25C*Ars=KdXCht5;W&zuER5r^ Uj7Cz#awr$_8WnI=rf?U(0hF3wRR910 literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 727200293f7..3a07ce6234c 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2386,6 +2386,32 @@ def test_parquet_reader_one_level_list(datadir): assert_eq(expect, got) +# testing a specific bug-fix/edge case. +# specifically: in a parquet file containing a particular way of representing +# a list column in a schema, the cudf reader was confusing +# nesting information between a list column and a subsequent +# string column, ultimately causing a crash. +def test_parquet_reader_one_level_list2(datadir): + # we are reading in a file containing binary types, but cudf returns + # those as strings. so we have to massage the pandas data to get + # them to compare correctly. 
+ def postprocess(val): + if isinstance(val, bytes): + return val.decode() + elif isinstance(val, np.ndarray): + return np.array([v.decode() for v in val]) + else: + return val + + fname = datadir / "one_level_list2.parquet" + + expect = pd.read_parquet(fname) + expect = expect.applymap(postprocess) + got = cudf.read_parquet(fname) + + assert_eq(expect, got, check_dtype=False) + + @pytest.mark.parametrize("size_bytes", [4_000_000, 1_000_000, 600_000]) @pytest.mark.parametrize("size_rows", [1_000_000, 100_000, 10_000]) def test_parquet_writer_row_group_size(