Skip to content

Commit

Permalink
Fix some warnings in test_parquet.py (#10416)
Browse files Browse the repository at this point in the history
Resolves part of #10363. Some warnings still remain; while trying to resolve them I went down a rabbit hole of a bug inside pyarrow<->pandas conversions, so I will take those up later.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #10416
  • Loading branch information
galipremsagar authored Mar 11, 2022
1 parent c0f7fe6 commit f29c8d9
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 7 deletions.
2 changes: 1 addition & 1 deletion python/cudf/cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@ def read_parquet(
# (There is a good chance this was not the intention)
if engine != "cudf":
warnings.warn(
"Using CPU via PyArrow to read Parquet dataset."
"Using CPU via PyArrow to read Parquet dataset. "
"This option is both inefficient and unstable!"
)
if filters is not None:
Expand Down
31 changes: 25 additions & 6 deletions python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import pathlib
import random
from contextlib import contextmanager
from io import BytesIO
from string import ascii_letters

Expand Down Expand Up @@ -32,6 +33,19 @@
)


@contextmanager
def _hide_pyarrow_parquet_cpu_warnings(engine):
if engine == "pyarrow":
with pytest.warns(
UserWarning,
match="Using CPU via PyArrow to read Parquet dataset. This option "
"is both inefficient and unstable!",
):
yield
else:
yield


@pytest.fixture(scope="module")
def datadir(datadir):
    # Narrow the outer ``datadir`` fixture to the "parquet" subdirectory so
    # every test in this module resolves its data files relative to it.
    return datadir / "parquet"
Expand Down Expand Up @@ -891,7 +905,7 @@ def test_parquet_reader_list_table(tmpdir):
expect.to_parquet(fname)
assert os.path.exists(fname)
got = cudf.read_parquet(fname)
assert_eq(expect, got, check_dtype=False)
assert pa.Table.from_pandas(expect).equals(got.to_arrow())


def int_gen(first_val, i):
Expand Down Expand Up @@ -1051,7 +1065,7 @@ def test_parquet_reader_list_large_mixed(tmpdir):
expect.to_parquet(fname)
assert os.path.exists(fname)
got = cudf.read_parquet(fname)
assert_eq(expect, got, check_dtype=False)
assert pa.Table.from_pandas(expect).equals(got.to_arrow())


def test_parquet_reader_list_large_multi_rowgroup(tmpdir):
Expand Down Expand Up @@ -1121,7 +1135,10 @@ def test_parquet_reader_list_skiprows(skip, tmpdir):

expect = src.iloc[skip:]
got = cudf.read_parquet(fname, skiprows=skip)
assert_eq(expect, got, check_dtype=False)
if expect.empty:
assert_eq(expect, got)
else:
assert pa.Table.from_pandas(expect).equals(got.to_arrow())


@pytest.mark.parametrize("skip", [0, 1, 5, 10])
Expand All @@ -1145,7 +1162,7 @@ def test_parquet_reader_list_num_rows(skip, tmpdir):
rows_to_read = min(3, (num_rows - skip) - 5)
expect = src.iloc[skip:].head(rows_to_read)
got = cudf.read_parquet(fname, skiprows=skip, num_rows=rows_to_read)
assert_eq(expect, got, check_dtype=False)
assert pa.Table.from_pandas(expect).equals(got.to_arrow())


def struct_gen(gen, skip_rows, num_rows, include_validity=False):
Expand Down Expand Up @@ -2005,7 +2022,8 @@ def test_parquet_nullable_boolean(tmpdir, engine):
expected_gdf = cudf.DataFrame({"a": [True, False, None, True, False]})

pdf.to_parquet(pandas_path)
actual_gdf = cudf.read_parquet(pandas_path, engine=engine)
with _hide_pyarrow_parquet_cpu_warnings(engine):
actual_gdf = cudf.read_parquet(pandas_path, engine=engine)

assert_eq(actual_gdf, expected_gdf)

Expand Down Expand Up @@ -2079,7 +2097,8 @@ def test_parquet_allnull_str(tmpdir, engine):
)

pdf.to_parquet(pandas_path)
actual_gdf = cudf.read_parquet(pandas_path, engine=engine)
with _hide_pyarrow_parquet_cpu_warnings(engine):
actual_gdf = cudf.read_parquet(pandas_path, engine=engine)

assert_eq(actual_gdf, expected_gdf)

Expand Down

0 comments on commit f29c8d9

Please sign in to comment.