diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 253d7950c54..52203d0194b 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -415,7 +415,7 @@ def read_parquet(
     # (There is a good chance this was not the intention)
     if engine != "cudf":
         warnings.warn(
-            "Using CPU via PyArrow to read Parquet dataset."
+            "Using CPU via PyArrow to read Parquet dataset. "
             "This option is both inefficient and unstable!"
         )
         if filters is not None:
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 7feaa400446..91b4009995b 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -5,6 +5,7 @@
 import os
 import pathlib
 import random
+from contextlib import contextmanager
 from io import BytesIO
 from string import ascii_letters
 
@@ -32,6 +33,19 @@
 )
 
 
+@contextmanager
+def _hide_pyarrow_parquet_cpu_warnings(engine):
+    if engine == "pyarrow":
+        with pytest.warns(
+            UserWarning,
+            match="Using CPU via PyArrow to read Parquet dataset. This option "
+            "is both inefficient and unstable!",
+        ):
+            yield
+    else:
+        yield
+
+
 @pytest.fixture(scope="module")
 def datadir(datadir):
     return datadir / "parquet"
@@ -891,7 +905,7 @@ def test_parquet_reader_list_table(tmpdir):
     expect.to_parquet(fname)
     assert os.path.exists(fname)
     got = cudf.read_parquet(fname)
-    assert_eq(expect, got, check_dtype=False)
+    assert pa.Table.from_pandas(expect).equals(got.to_arrow())
 
 
 def int_gen(first_val, i):
@@ -1051,7 +1065,7 @@ def test_parquet_reader_list_large_mixed(tmpdir):
     expect.to_parquet(fname)
     assert os.path.exists(fname)
     got = cudf.read_parquet(fname)
-    assert_eq(expect, got, check_dtype=False)
+    assert pa.Table.from_pandas(expect).equals(got.to_arrow())
 
 
 def test_parquet_reader_list_large_multi_rowgroup(tmpdir):
@@ -1121,7 +1135,10 @@ def test_parquet_reader_list_skiprows(skip, tmpdir):
 
     expect = src.iloc[skip:]
     got = cudf.read_parquet(fname, skiprows=skip)
-    assert_eq(expect, got, check_dtype=False)
+    if expect.empty:
+        assert_eq(expect, got)
+    else:
+        assert pa.Table.from_pandas(expect).equals(got.to_arrow())
 
 
 @pytest.mark.parametrize("skip", [0, 1, 5, 10])
@@ -1145,7 +1162,7 @@ def test_parquet_reader_list_num_rows(skip, tmpdir):
     rows_to_read = min(3, (num_rows - skip) - 5)
     expect = src.iloc[skip:].head(rows_to_read)
     got = cudf.read_parquet(fname, skiprows=skip, num_rows=rows_to_read)
-    assert_eq(expect, got, check_dtype=False)
+    assert pa.Table.from_pandas(expect).equals(got.to_arrow())
 
 
 def struct_gen(gen, skip_rows, num_rows, include_validity=False):
@@ -2005,7 +2022,8 @@ def test_parquet_nullable_boolean(tmpdir, engine):
     expected_gdf = cudf.DataFrame({"a": [True, False, None, True, False]})
 
     pdf.to_parquet(pandas_path)
-    actual_gdf = cudf.read_parquet(pandas_path, engine=engine)
+    with _hide_pyarrow_parquet_cpu_warnings(engine):
+        actual_gdf = cudf.read_parquet(pandas_path, engine=engine)
 
     assert_eq(actual_gdf, expected_gdf)
 
@@ -2079,7 +2097,8 @@ def test_parquet_allnull_str(tmpdir, engine):
     )
 
     pdf.to_parquet(pandas_path)
-    actual_gdf = cudf.read_parquet(pandas_path, engine=engine)
+    with _hide_pyarrow_parquet_cpu_warnings(engine):
+        actual_gdf = cudf.read_parquet(pandas_path, engine=engine)
 
     assert_eq(actual_gdf, expected_gdf)
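
For context on the testing pattern the diff introduces: _hide_pyarrow_parquet_cpu_warnings wraps the read in pytest.warns only when the pyarrow engine is selected, so engine-parametrized tests assert the CPU-fallback warning on that path and stay quiet on the cudf path. Below is a minimal standalone sketch of that conditional warning-assertion pattern; the helper name _expect_warning_if, the test name, and the warning text are illustrative only and are not part of this PR.

import warnings
from contextlib import contextmanager

import pytest


@contextmanager
def _expect_warning_if(condition, match):
    # Assert that a UserWarning matching `match` is raised only when
    # `condition` holds; otherwise run the block with no assertion.
    if condition:
        with pytest.warns(UserWarning, match=match):
            yield
    else:
        yield


@pytest.mark.parametrize("engine", ["cudf", "pyarrow"])
def test_warns_only_for_pyarrow_engine(engine):
    # Stand-in for the cudf.read_parquet call: only the "pyarrow" path warns.
    with _expect_warning_if(engine == "pyarrow", "CPU fallback"):
        if engine == "pyarrow":
            warnings.warn("CPU fallback in use", UserWarning)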