Skip to content

Commit

Permalink
Fix some warnings in test_parquet.py (#10416)
Browse files Browse the repository at this point in the history
Resolves part of #10363. Some warnings still remain; while trying to resolve them I went down a rabbit hole of a bug inside pyarrow<->pandas conversions, so I will take those up later.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #10416
  • Loading branch information
galipremsagar authored Mar 11, 2022
1 parent c0f7fe6 commit f29c8d9
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 7 deletions.
2 changes: 1 addition & 1 deletion python/cudf/cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@ def read_parquet(
# (There is a good chance this was not the intention)
if engine != "cudf":
warnings.warn(
"Using CPU via PyArrow to read Parquet dataset."
"Using CPU via PyArrow to read Parquet dataset. "
"This option is both inefficient and unstable!"
)
if filters is not None:
Expand Down
31 changes: 25 additions & 6 deletions python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import pathlib
import random
from contextlib import contextmanager
from io import BytesIO
from string import ascii_letters

Expand Down Expand Up @@ -32,6 +33,19 @@
)


@contextmanager
def _hide_pyarrow_parquet_cpu_warnings(engine):
if engine == "pyarrow":
with pytest.warns(
UserWarning,
match="Using CPU via PyArrow to read Parquet dataset. This option "
"is both inefficient and unstable!",
):
yield
else:
yield


@pytest.fixture(scope="module")
def datadir(datadir):
    # Narrow the outer ``datadir`` fixture to the "parquet" subdirectory so
    # every test in this module resolves its data files relative to it.
    return datadir / "parquet"
Expand Down Expand Up @@ -891,7 +905,7 @@ def test_parquet_reader_list_table(tmpdir):
expect.to_parquet(fname)
assert os.path.exists(fname)
got = cudf.read_parquet(fname)
assert_eq(expect, got, check_dtype=False)
assert pa.Table.from_pandas(expect).equals(got.to_arrow())


def int_gen(first_val, i):
Expand Down Expand Up @@ -1051,7 +1065,7 @@ def test_parquet_reader_list_large_mixed(tmpdir):
expect.to_parquet(fname)
assert os.path.exists(fname)
got = cudf.read_parquet(fname)
assert_eq(expect, got, check_dtype=False)
assert pa.Table.from_pandas(expect).equals(got.to_arrow())


def test_parquet_reader_list_large_multi_rowgroup(tmpdir):
Expand Down Expand Up @@ -1121,7 +1135,10 @@ def test_parquet_reader_list_skiprows(skip, tmpdir):

expect = src.iloc[skip:]
got = cudf.read_parquet(fname, skiprows=skip)
assert_eq(expect, got, check_dtype=False)
if expect.empty:
assert_eq(expect, got)
else:
assert pa.Table.from_pandas(expect).equals(got.to_arrow())


@pytest.mark.parametrize("skip", [0, 1, 5, 10])
Expand All @@ -1145,7 +1162,7 @@ def test_parquet_reader_list_num_rows(skip, tmpdir):
rows_to_read = min(3, (num_rows - skip) - 5)
expect = src.iloc[skip:].head(rows_to_read)
got = cudf.read_parquet(fname, skiprows=skip, num_rows=rows_to_read)
assert_eq(expect, got, check_dtype=False)
assert pa.Table.from_pandas(expect).equals(got.to_arrow())


def struct_gen(gen, skip_rows, num_rows, include_validity=False):
Expand Down Expand Up @@ -2005,7 +2022,8 @@ def test_parquet_nullable_boolean(tmpdir, engine):
expected_gdf = cudf.DataFrame({"a": [True, False, None, True, False]})

pdf.to_parquet(pandas_path)
actual_gdf = cudf.read_parquet(pandas_path, engine=engine)
with _hide_pyarrow_parquet_cpu_warnings(engine):
actual_gdf = cudf.read_parquet(pandas_path, engine=engine)

assert_eq(actual_gdf, expected_gdf)

Expand Down Expand Up @@ -2079,7 +2097,8 @@ def test_parquet_allnull_str(tmpdir, engine):
)

pdf.to_parquet(pandas_path)
actual_gdf = cudf.read_parquet(pandas_path, engine=engine)
with _hide_pyarrow_parquet_cpu_warnings(engine):
actual_gdf = cudf.read_parquet(pandas_path, engine=engine)

assert_eq(actual_gdf, expected_gdf)

Expand Down

0 comments on commit f29c8d9

Please sign in to comment.