From f29c8d970ba3f6927f35c68b2702aad9a9a45db8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 11 Mar 2022 14:27:26 -0600 Subject: [PATCH] Fix some warnings in `test_parquet.py` (#10416) Resolves part of https://github.com/rapidsai/cudf/issues/10363. Some warnings still remain: resolving them led down a rabbit hole of a bug inside pyarrow<->pandas conversions, so they will be taken up later. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10416 --- python/cudf/cudf/io/parquet.py | 2 +- python/cudf/cudf/tests/test_parquet.py | 31 +++++++++++++++++++++----- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 253d7950c54..52203d0194b 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -415,7 +415,7 @@ def read_parquet( # (There is a good chance this was not the intention) if engine != "cudf": warnings.warn( - "Using CPU via PyArrow to read Parquet dataset." + "Using CPU via PyArrow to read Parquet dataset. " "This option is both inefficient and unstable!" ) if filters is not None: diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 7feaa400446..91b4009995b 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -5,6 +5,7 @@ import os import pathlib import random +from contextlib import contextmanager from io import BytesIO from string import ascii_letters @@ -32,6 +33,19 @@ ) +@contextmanager +def _hide_pyarrow_parquet_cpu_warnings(engine): + if engine == "pyarrow": + with pytest.warns( + UserWarning, + match="Using CPU via PyArrow to read Parquet dataset. 
This option " + "is both inefficient and unstable!", + ): + yield + else: + yield + + @pytest.fixture(scope="module") def datadir(datadir): return datadir / "parquet" @@ -891,7 +905,7 @@ def test_parquet_reader_list_table(tmpdir): expect.to_parquet(fname) assert os.path.exists(fname) got = cudf.read_parquet(fname) - assert_eq(expect, got, check_dtype=False) + assert pa.Table.from_pandas(expect).equals(got.to_arrow()) def int_gen(first_val, i): @@ -1051,7 +1065,7 @@ def test_parquet_reader_list_large_mixed(tmpdir): expect.to_parquet(fname) assert os.path.exists(fname) got = cudf.read_parquet(fname) - assert_eq(expect, got, check_dtype=False) + assert pa.Table.from_pandas(expect).equals(got.to_arrow()) def test_parquet_reader_list_large_multi_rowgroup(tmpdir): @@ -1121,7 +1135,10 @@ def test_parquet_reader_list_skiprows(skip, tmpdir): expect = src.iloc[skip:] got = cudf.read_parquet(fname, skiprows=skip) - assert_eq(expect, got, check_dtype=False) + if expect.empty: + assert_eq(expect, got) + else: + assert pa.Table.from_pandas(expect).equals(got.to_arrow()) @pytest.mark.parametrize("skip", [0, 1, 5, 10]) @@ -1145,7 +1162,7 @@ def test_parquet_reader_list_num_rows(skip, tmpdir): rows_to_read = min(3, (num_rows - skip) - 5) expect = src.iloc[skip:].head(rows_to_read) got = cudf.read_parquet(fname, skiprows=skip, num_rows=rows_to_read) - assert_eq(expect, got, check_dtype=False) + assert pa.Table.from_pandas(expect).equals(got.to_arrow()) def struct_gen(gen, skip_rows, num_rows, include_validity=False): @@ -2005,7 +2022,8 @@ def test_parquet_nullable_boolean(tmpdir, engine): expected_gdf = cudf.DataFrame({"a": [True, False, None, True, False]}) pdf.to_parquet(pandas_path) - actual_gdf = cudf.read_parquet(pandas_path, engine=engine) + with _hide_pyarrow_parquet_cpu_warnings(engine): + actual_gdf = cudf.read_parquet(pandas_path, engine=engine) assert_eq(actual_gdf, expected_gdf) @@ -2079,7 +2097,8 @@ def test_parquet_allnull_str(tmpdir, engine): ) 
pdf.to_parquet(pandas_path) - actual_gdf = cudf.read_parquet(pandas_path, engine=engine) + with _hide_pyarrow_parquet_cpu_warnings(engine): + actual_gdf = cudf.read_parquet(pandas_path, engine=engine) assert_eq(actual_gdf, expected_gdf)