From 9b8d26f8bf98424bf740627a1b226233861f961e Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Fri, 29 Apr 2022 13:04:18 -0500
Subject: [PATCH] Fix an issue with one_level_list schemas in parquet reader.
 (#10750)

Partially addresses: https://github.com/rapidsai/cudf/issues/10733

For a particular way of encoding list schemas (an old way that Spark seems to use sometimes), the parquet reader was accidentally propagating incorrect nesting information between columns.  The cause was a simple bug: an extra value was not being popped off a stack.

Note:  this is simply a fix so that the files read correctly; however, the internal data in the file is actually of binary type, and cudf converts it to string columns.  This PR does not add support for binary as a real type in cudf.

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - MithunR (https://github.com/mythrocks)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/10750
---
 cpp/src/io/parquet/reader_impl.cu             |   4 +++
 .../data/parquet/one_level_list2.parquet      | Bin 0 -> 656 bytes
 python/cudf/cudf/tests/test_parquet.py        |  26 ++++++++++++++++++
 3 files changed, 30 insertions(+)
 create mode 100644 python/cudf/cudf/tests/data/parquet/one_level_list2.parquet

diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
index a40993ee2dd..f165bd5ec3b 100644
--- a/cpp/src/io/parquet/reader_impl.cu
+++ b/cpp/src/io/parquet/reader_impl.cu
@@ -679,6 +679,10 @@ class aggregate_reader_metadata {
           }
 
           std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting));
+
+          // pop off the extra nesting element.
+          if (schema_elem.is_one_level_list()) { nesting.pop_back(); }
+
           path_is_valid = true;  // If we're able to reach leaf then path is valid
         }
 
diff --git a/python/cudf/cudf/tests/data/parquet/one_level_list2.parquet b/python/cudf/cudf/tests/data/parquet/one_level_list2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..cd5acd045949218a7a419c66a11305eb7ddeb0d7
GIT binary patch
literal 656
zcmZuvL2JS=6i%B^iwHX<q|!q#76-1Zt)haL-L{+T<Y5S{X+X&8R_iwXJv;eZJnjeW
zrONEwN}89x_kG`c{mAt0F{FrgX}3oS94u<P9KV+($Egqu!DGUUF@_z|IzR!CE_G`Z
zqEVa}u-8cXQy2ciwGOXHHF(wO2p3fJs9g<Yxx6t5240#7j-wzt6Pgv_N^3#hh7bp_
zCpStp;C$i`T@8Z>wA!>?8yPF5xZaqSsiaIT1sQ5O>YkL@x};4+TOE@dABdVdZclNi
z^MvC(D~||Etrm&O5F?K3Jo&auTh72jwgX%WStgnE?a76?e(plE6K-4K=4|fglcPB`
zNBx6QgYJHy2LrGdcKUl$x9?cUsU`%l&GrT?s7(H`)BnQfc~5^*b(pak)L`mgX0cDg
z2<kq(B*p6{D{r0)?=36V%T?}$!6*p5Zn{}2;q{X=>5C*Ars=KdXCht5;W&zuER5r^
Uj7Cz#awr$_8WnI=rf?U(0hF3wRR910

literal 0
HcmV?d00001

diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 727200293f7..3a07ce6234c 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -2386,6 +2386,32 @@ def test_parquet_reader_one_level_list(datadir):
     assert_eq(expect, got)
 
 
+# Testing a specific bug-fix/edge case.
+# Specifically:  in a parquet file containing a particular way of representing
+#                a list column in a schema, the cudf reader was confusing
+#                nesting information between a list column and a subsequent
+#                string column, ultimately causing a crash.
+def test_parquet_reader_one_level_list2(datadir):
+    # The file contains binary values, but cudf returns those as strings,
+    # so decode the bytes (scalars and array elements) in the pandas frame
+    # before comparing it against the cudf result.
+    def postprocess(val):
+        if isinstance(val, bytes):
+            return val.decode()
+        elif isinstance(val, np.ndarray):
+            return np.array([v.decode() for v in val])
+        else:
+            return val
+
+    fname = datadir / "one_level_list2.parquet"
+
+    expect = pd.read_parquet(fname)
+    expect = expect.applymap(postprocess)
+    got = cudf.read_parquet(fname)
+
+    assert_eq(expect, got, check_dtype=False)
+
+
 @pytest.mark.parametrize("size_bytes", [4_000_000, 1_000_000, 600_000])
 @pytest.mark.parametrize("size_rows", [1_000_000, 100_000, 10_000])
 def test_parquet_writer_row_group_size(