Commit

Fix an issue with one_level_list schemas which were causing nesting information to propagate between columns, causing crashes.
nvdbaranec committed Apr 27, 2022
1 parent 75f3873 commit 005949b
Showing 3 changed files with 30 additions and 0 deletions.
4 changes: 4 additions & 0 deletions cpp/src/io/parquet/reader_impl.cu
@@ -679,6 +679,10 @@ class aggregate_reader_metadata {
}

std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting));

// pop off the extra nesting element.
if (schema_elem.is_one_level_list()) { nesting.pop_back(); }

path_is_valid = true; // If we're able to reach leaf then path is valid
}
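
To see why the pop matters, here is a minimal Python sketch of the traversal bug, not the actual cuDF code and with made-up column names: the reader reuses one nesting list while walking sibling columns, so the extra level pushed for a one-level list must be popped before the walk moves on, or the next column inherits a phantom nesting level.

# A sketch of the shared-state bug this hunk fixes (hypothetical names).
def walk_columns(columns):
    nesting = []  # shared across columns, like the reader's nesting vector
    depths = {}
    for name, is_one_level_list in columns:
        nesting.append(name)  # level for the column itself
        if is_one_level_list:
            nesting.append("element")  # extra level for the implicit list
        depths[name] = len(nesting)
        if is_one_level_list:
            nesting.pop()  # the fix: pop off the extra nesting element
        nesting.pop()
    return depths

# Without the inner pop(), "string_col" would report depth 2 instead of 1,
# i.e. the list column's nesting would leak into the string column.
print(walk_columns([("list_col", True), ("string_col", False)]))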

Binary file not shown.
26 changes: 26 additions & 0 deletions python/cudf/cudf/tests/test_parquet.py
@@ -2386,6 +2386,32 @@ def test_parquet_reader_one_level_list(datadir):
assert_eq(expect, got)


# testing a specific bug-fix/edge case.
# specifically: in a parquet file containing a particular way of representing
# a list column in a schema, the cudf reader was confusing
# nesting information between a list column and a subsequent
# string column, ultimately causing a crash.
def test_parquet_reader_one_level_list2(datadir):
# we are reading in a file containing binary types, but cudf returns
# those as strings. so we have to massage the pandas data to get
# them to compare correctly.
def postprocess(val):
if isinstance(val, bytes):
return val.decode()
elif isinstance(val, np.ndarray):
return np.array([v.decode() for v in val])
else:
return val

fname = datadir / "one_level_list2.parquet"

expect = pd.read_parquet(fname)
expect = expect.applymap(postprocess)
got = cudf.read_parquet(fname)

assert_eq(expect, got, check_dtype=False)
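
As an aside, a one-level list in Parquet is a bare repeated field with no intermediate LIST/element wrapper, in contrast to the standard three-level encoding. A hedged pyarrow sketch for inspecting the fixture's physical schema (the exact field layout of the file is an assumption here):

# Print the Parquet-level schema; a one-level list appears as something
# like `repeated binary a`, with no LIST group, followed here by an
# ordinary optional binary (string) column. The path is illustrative.
import pyarrow.parquet as pq

print(pq.ParquetFile("one_level_list2.parquet").schema)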


@pytest.mark.parametrize("size_bytes", [4_000_000, 1_000_000, 600_000])
@pytest.mark.parametrize("size_rows", [1_000_000, 100_000, 10_000])
def test_parquet_writer_row_group_size(
