Commit

Fix an issue with one_level_list schemas which were causing nesting information to propagate between columns, causing crashes.
nvdbaranec committed Apr 27, 2022
1 parent 75f3873 commit 005949b
Showing 3 changed files with 30 additions and 0 deletions.
4 changes: 4 additions & 0 deletions cpp/src/io/parquet/reader_impl.cu
@@ -679,6 +679,10 @@ class aggregate_reader_metadata {
}

std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting));

// pop off the extra nesting element.
if (schema_elem.is_one_level_list()) { nesting.pop_back(); }

path_is_valid = true; // If we're able to reach leaf then path is valid
}
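
To see why the pop matters, here is a minimal Python sketch of the traversal bug, not the actual cuDF code and with made-up column names: the reader reuses one nesting list while walking sibling columns, so the extra level pushed for a one-level list must be popped before the walk moves on, or the next column inherits a phantom nesting level.

# A sketch of the shared-state bug this hunk fixes (hypothetical names).
def walk_columns(columns):
    nesting = []  # shared across columns, like the reader's nesting vector
    depths = {}
    for name, is_one_level_list in columns:
        nesting.append(name)  # level for the column itself
        if is_one_level_list:
            nesting.append("element")  # extra level for the implicit list
        depths[name] = len(nesting)
        if is_one_level_list:
            nesting.pop()  # the fix: pop off the extra nesting element
        nesting.pop()
    return depths

# Without the inner pop(), "string_col" would report depth 2 instead of 1,
# i.e. the list column's nesting would leak into the string column.
print(walk_columns([("list_col", True), ("string_col", False)]))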

Binary file not shown.
26 changes: 26 additions & 0 deletions python/cudf/cudf/tests/test_parquet.py
@@ -2386,6 +2386,32 @@ def test_parquet_reader_one_level_list(datadir):
assert_eq(expect, got)


# testing a specific bug-fix/edge case.
# specifically: in a parquet file containing a particular way of representing
# a list column in a schema, the cudf reader was confusing
# nesting information between a list column and a subsequent
# string column, ultimately causing a crash.
def test_parquet_reader_one_level_list2(datadir):
# we are reading in a file containing binary types, but cudf returns
# those as strings. so we have to massage the pandas data to get
# them to compare correctly.
def postprocess(val):
if isinstance(val, bytes):
return val.decode()
elif isinstance(val, np.ndarray):
return np.array([v.decode() for v in val])
else:
return val

fname = datadir / "one_level_list2.parquet"

expect = pd.read_parquet(fname)
expect = expect.applymap(postprocess)
got = cudf.read_parquet(fname)

assert_eq(expect, got, check_dtype=False)
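
As an aside, a one-level list in Parquet is a bare repeated field with no intermediate LIST/element wrapper, in contrast to the standard three-level encoding. A hedged pyarrow sketch for inspecting the fixture's physical schema (the exact field layout of the file is an assumption here):

# Print the Parquet-level schema; a one-level list appears as something
# like `repeated binary a`, with no LIST group, followed here by an
# ordinary optional binary (string) column. The path is illustrative.
import pyarrow.parquet as pq

print(pq.ParquetFile("one_level_list2.parquet").schema)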


@pytest.mark.parametrize("size_bytes", [4_000_000, 1_000_000, 600_000])
@pytest.mark.parametrize("size_rows", [1_000_000, 100_000, 10_000])
def test_parquet_writer_row_group_size(
