Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Deprecate skiprows & num_rows in parquet reader #11218

Merged
merged 9 commits into from
Jul 21, 2022
22 changes: 20 additions & 2 deletions python/cudf/cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,11 +198,17 @@ def read_parquet_metadata(path):

pq_file = pq.ParquetFile(path)
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved

num_rows = pq_file.metadata.num_rows
metadata = pq_file.metadata
num_rows = metadata.num_rows
num_row_groups = pq_file.num_row_groups
col_names = pq_file.schema.names

return num_rows, num_row_groups, col_names
return (
num_rows,
num_row_groups,
col_names,
[metadata.row_group(i) for i in range(num_row_groups)],
)


@_cudf_nvtx_annotate
Expand Down Expand Up @@ -371,6 +377,18 @@ def read_parquet(
):
"""{docstring}"""

if skiprows is not None:
warnings.warn(
"skiprows is deprecated and will be removed.",
FutureWarning,
)

if num_rows is not None:
warnings.warn(
"num_rows is deprecated and will be removed.",
FutureWarning,
)

# Do not allow the user to set file-opening options
# when `use_python_file_object=False` is specified
if use_python_file_object is False:
Expand Down
43 changes: 36 additions & 7 deletions python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ def num_row_groups(rows, group_size):
row_group_size = 5
pdf.to_parquet(fname, compression="snappy", row_group_size=row_group_size)

num_rows, row_groups, col_names = cudf.io.read_parquet_metadata(fname)
num_rows, row_groups, col_names, _ = cudf.io.read_parquet_metadata(fname)

assert num_rows == len(pdf.index)
assert row_groups == num_row_groups(num_rows, row_group_size)
Expand Down Expand Up @@ -573,7 +573,7 @@ def test_parquet_read_row_groups(tmpdir, pdf, row_group_size):
fname = tmpdir.join("row_group.parquet")
pdf.to_parquet(fname, compression="gzip", row_group_size=row_group_size)

num_rows, row_groups, col_names = cudf.io.read_parquet_metadata(fname)
num_rows, row_groups, col_names, _ = cudf.io.read_parquet_metadata(fname)

gdf = [cudf.read_parquet(fname, row_groups=[i]) for i in range(row_groups)]
gdf = cudf.concat(gdf)
Expand All @@ -597,8 +597,7 @@ def test_parquet_read_row_groups_non_contiguous(tmpdir, pdf, row_group_size):

fname = tmpdir.join("row_group.parquet")
pdf.to_parquet(fname, compression="gzip", row_group_size=row_group_size)

num_rows, row_groups, col_names = cudf.io.read_parquet_metadata(fname)
num_rows, row_groups, col_names, _ = cudf.io.read_parquet_metadata(fname)

# alternate rows between the two sources
gdf = cudf.read_parquet(
Expand All @@ -618,6 +617,12 @@ def test_parquet_read_row_groups_non_contiguous(tmpdir, pdf, row_group_size):
assert_eq(ref_df, gdf)


@pytest.mark.filterwarnings(
"ignore:skiprows is deprecated and will be removed."
)
@pytest.mark.filterwarnings(
"ignore:num_rows is deprecated and will be removed."
)
@pytest.mark.parametrize("row_group_size", [1, 4, 33])
def test_parquet_read_rows(tmpdir, pdf, row_group_size):
if len(pdf) > 100:
Expand All @@ -626,7 +631,7 @@ def test_parquet_read_rows(tmpdir, pdf, row_group_size):
fname = tmpdir.join("row_group.parquet")
pdf.to_parquet(fname, compression="None", row_group_size=row_group_size)

total_rows, row_groups, col_names = cudf.io.read_parquet_metadata(fname)
total_rows, row_groups, col_names, _ = cudf.io.read_parquet_metadata(fname)

num_rows = total_rows // 4
skiprows = (total_rows - num_rows) // 2
Expand Down Expand Up @@ -702,6 +707,12 @@ def test_parquet_reader_invalids(tmpdir):
assert_eq(expect, got)


@pytest.mark.filterwarnings(
"ignore:skiprows is deprecated and will be removed."
)
@pytest.mark.filterwarnings(
"ignore:num_rows is deprecated and will be removed."
)
def test_parquet_chunked_skiprows(tmpdir):
processed = 0
batch = 10000
Expand Down Expand Up @@ -1120,6 +1131,9 @@ def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir):
assert_eq(expect, got)


@pytest.mark.filterwarnings(
"ignore:skiprows is deprecated and will be removed."
)
@pytest.mark.parametrize("skip", [0, 1, 5, 10])
def test_parquet_reader_list_skiprows(skip, tmpdir):
num_rows = 10
Expand All @@ -1142,6 +1156,12 @@ def test_parquet_reader_list_skiprows(skip, tmpdir):
assert pa.Table.from_pandas(expect).equals(got.to_arrow())


@pytest.mark.filterwarnings(
"ignore:skiprows is deprecated and will be removed."
)
@pytest.mark.filterwarnings(
"ignore:num_rows is deprecated and will be removed."
)
@pytest.mark.parametrize("skip", [0, 1, 5, 10])
def test_parquet_reader_list_num_rows(skip, tmpdir):
num_rows = 20
Expand Down Expand Up @@ -1638,7 +1658,9 @@ def test_parquet_writer_row_group_size(tmpdir, row_group_size_kwargs):
writer.write_table(gdf)

# Simple check for multiple row-groups
nrows, nrow_groups, columns = cudf.io.parquet.read_parquet_metadata(fname)
nrows, nrow_groups, columns, _ = cudf.io.parquet.read_parquet_metadata(
fname
)
assert nrows == size
assert nrow_groups > 1
assert columns == ["a", "b"]
Expand Down Expand Up @@ -2516,7 +2538,14 @@ def test_to_parquet_row_group_size(
fname, row_group_size_bytes=size_bytes, row_group_size_rows=size_rows
)

num_rows, row_groups, col_names = cudf.io.read_parquet_metadata(fname)
(
num_rows,
row_groups,
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
col_names,
row_group_metadata,
) = cudf.io.read_parquet_metadata(fname)
assert len(row_group_metadata) == row_groups
assert num_rows == sum(meta.num_rows for meta in row_group_metadata)
# 8 bytes per row, as the column is int64
expected_num_rows = max(
math.ceil(num_rows / size_rows), math.ceil(8 * num_rows / size_bytes)
Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,12 @@
Total number of rows
Number of row groups
List of column names
List of metadata of each row-group
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved

Examples
--------
>>> import cudf
>>> num_rows, num_row_groups, names = cudf.io.read_parquet_metadata(filename)
>>> num_rows, num_row_groups, names, row_group_metadatas = cudf.io.read_parquet_metadata(filename)
>>> df = [cudf.read_parquet(filename, row_groups=[i]) for i in range(num_row_groups)]
>>> df = cudf.concat(df)
>>> df
Expand All @@ -111,7 +112,7 @@
See Also
--------
cudf.read_parquet
"""
""" # noqa: E501
doc_read_parquet_metadata = docfmt_partial(
docstring=_docstring_read_parquet_metadata
)
Expand Down