From f29c8d970ba3f6927f35c68b2702aad9a9a45db8 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 11 Mar 2022 14:27:26 -0600
Subject: [PATCH] Fix some warnings in `test_parquet.py` (#10416)

Resolves part of https://github.com/rapidsai/cudf/issues/10363. Some warnings still remain: I tried to resolve them but went down a rabbit hole of a bug inside pyarrow<->pandas conversions, so I will take them up later.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/10416
---
 python/cudf/cudf/io/parquet.py         |  2 +-
 python/cudf/cudf/tests/test_parquet.py | 31 +++++++++++++++++++++-----
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 253d7950c54..52203d0194b 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -415,7 +415,7 @@ def read_parquet(
     # (There is a good chance this was not the intention)
     if engine != "cudf":
         warnings.warn(
-            "Using CPU via PyArrow to read Parquet dataset."
+            "Using CPU via PyArrow to read Parquet dataset. "
             "This option is both inefficient and unstable!"
         )
         if filters is not None:
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 7feaa400446..91b4009995b 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -5,6 +5,7 @@
 import os
 import pathlib
 import random
+from contextlib import contextmanager
 from io import BytesIO
 from string import ascii_letters
 
@@ -32,6 +33,19 @@
 )
 
 
+@contextmanager
+def _hide_pyarrow_parquet_cpu_warnings(engine):
+    if engine == "pyarrow":
+        with pytest.warns(
+            UserWarning,
+            match="Using CPU via PyArrow to read Parquet dataset. This option "
+            "is both inefficient and unstable!",
+        ):
+            yield
+    else:
+        yield
+
+
 @pytest.fixture(scope="module")
 def datadir(datadir):
     return datadir / "parquet"
@@ -891,7 +905,7 @@ def test_parquet_reader_list_table(tmpdir):
     expect.to_parquet(fname)
     assert os.path.exists(fname)
     got = cudf.read_parquet(fname)
-    assert_eq(expect, got, check_dtype=False)
+    assert pa.Table.from_pandas(expect).equals(got.to_arrow())
 
 
 def int_gen(first_val, i):
@@ -1051,7 +1065,7 @@ def test_parquet_reader_list_large_mixed(tmpdir):
     expect.to_parquet(fname)
     assert os.path.exists(fname)
     got = cudf.read_parquet(fname)
-    assert_eq(expect, got, check_dtype=False)
+    assert pa.Table.from_pandas(expect).equals(got.to_arrow())
 
 
 def test_parquet_reader_list_large_multi_rowgroup(tmpdir):
@@ -1121,7 +1135,10 @@ def test_parquet_reader_list_skiprows(skip, tmpdir):
 
     expect = src.iloc[skip:]
     got = cudf.read_parquet(fname, skiprows=skip)
-    assert_eq(expect, got, check_dtype=False)
+    if expect.empty:
+        assert_eq(expect, got)
+    else:
+        assert pa.Table.from_pandas(expect).equals(got.to_arrow())
 
 
 @pytest.mark.parametrize("skip", [0, 1, 5, 10])
@@ -1145,7 +1162,7 @@ def test_parquet_reader_list_num_rows(skip, tmpdir):
     rows_to_read = min(3, (num_rows - skip) - 5)
     expect = src.iloc[skip:].head(rows_to_read)
     got = cudf.read_parquet(fname, skiprows=skip, num_rows=rows_to_read)
-    assert_eq(expect, got, check_dtype=False)
+    assert pa.Table.from_pandas(expect).equals(got.to_arrow())
 
 
 def struct_gen(gen, skip_rows, num_rows, include_validity=False):
@@ -2005,7 +2022,8 @@ def test_parquet_nullable_boolean(tmpdir, engine):
     expected_gdf = cudf.DataFrame({"a": [True, False, None, True, False]})
 
     pdf.to_parquet(pandas_path)
-    actual_gdf = cudf.read_parquet(pandas_path, engine=engine)
+    with _hide_pyarrow_parquet_cpu_warnings(engine):
+        actual_gdf = cudf.read_parquet(pandas_path, engine=engine)
 
     assert_eq(actual_gdf, expected_gdf)
 
@@ -2079,7 +2097,8 @@ def test_parquet_allnull_str(tmpdir, engine):
     )
 
     pdf.to_parquet(pandas_path)
-    actual_gdf = cudf.read_parquet(pandas_path, engine=engine)
+    with _hide_pyarrow_parquet_cpu_warnings(engine):
+        actual_gdf = cudf.read_parquet(pandas_path, engine=engine)
 
     assert_eq(actual_gdf, expected_gdf)