From c5f05205931408acafb5823449a5d724200f229a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 14 Feb 2024 17:34:02 -0800
Subject: [PATCH 1/2] Fix ORC and JSON test failures for pandas 2.2

---
 python/cudf/cudf/tests/test_json.py | 19 +++++++------------
 python/cudf/cudf/tests/test_orc.py  | 22 ++++++++++++++--------
 2 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index ec980adc334..6c45680c1de 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -1177,20 +1177,15 @@ def test_chunked_nested_json_reader(self, tag, data, chunk_size):
         df = cudf.concat(chunks, ignore_index=True)
         assert expected.to_arrow().equals(df.to_arrow())
 
-    def test_order_nested_json_reader(self, tag, data):
+    def test_order_nested_json_reader(self, request, tag, data):
         expected = pd.read_json(StringIO(data), lines=True)
         target = cudf.read_json(StringIO(data), lines=True)
-        if tag == "dtype_mismatch":
-            with pytest.raises(AssertionError):
-                # pandas parses integer values in float representation
-                # as integer
-                assert pa.Table.from_pandas(expected).equals(target.to_arrow())
-        elif tag == "missing":
-            with pytest.raises(AssertionError):
-                # pandas inferences integer with nulls as float64
-                assert pa.Table.from_pandas(expected).equals(target.to_arrow())
-        else:
-            assert pa.Table.from_pandas(expected).equals(target.to_arrow())
+        request.applymarker(
+            pytest.mark.xfail(
+                tag == "dtype_mismatch", reason="int vs float mismatch"
+            )
+        )
+        assert_eq(expected, target)
 
 
 def test_json_round_trip_gzip():
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index cf2fd29d41e..80fc815dd76 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -13,6 +13,7 @@
 import pytest
 
 import cudf
+from cudf.core._compat import PANDAS_GE_220
 from cudf.io.orc import ORCWriter
 from cudf.testing import assert_frame_equal
 from cudf.testing._utils import (
@@ -130,16 +131,21 @@ def test_orc_reader_filepath_or_buffer(path_or_buf, src):
 
 def test_orc_reader_trailing_nulls(datadir):
     path = datadir / "TestOrcFile.nulls-at-end-snappy.orc"
+    expect = pd.read_orc(path)
+    got = cudf.read_orc(path)
+    if PANDAS_GE_220:
+        check_categorical = True
+    else:
+        check_categorical = False
+        expect = expect.fillna(0)
+        got = got.fillna(0)
 
-    expect = pd.read_orc(path).fillna(0)
-    got = cudf.read_orc(path).fillna(0)
-
-    # PANDAS uses NaN to represent invalid data, which forces float dtype
-    # For comparison, we can replace NaN with 0 and cast to the cuDF dtype
-    for col in expect.columns:
-        expect[col] = expect[col].astype(got[col].dtype)
+        # PANDAS uses NaN to represent invalid data, which forces float dtype
+        # For comparison, we can replace NaN with 0 and cast to the cuDF dtype
+        for col in expect.columns:
+            expect[col] = expect[col].astype(got[col].dtype)
 
-    assert_eq(expect, got, check_categorical=False)
+    assert_eq(expect, got, check_categorical=check_categorical)
 
 
 @pytest.mark.parametrize("use_index", [False, True])

From 3be6e1f608c23a363a37b5cba63dc9b2bb892545 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 20 Feb 2024 13:24:48 -0800
Subject: [PATCH 2/2] Use Prem's fix, restore pyarrow comparison

---
 python/cudf/cudf/tests/test_json.py | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index 6c45680c1de..12ea74bd7a7 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -13,7 +13,7 @@
 import pytest
 
 import cudf
-from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210
+from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210, PANDAS_GE_220
 from cudf.testing._utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
@@ -1177,15 +1177,26 @@ def test_chunked_nested_json_reader(self, tag, data, chunk_size):
         df = cudf.concat(chunks, ignore_index=True)
         assert expected.to_arrow().equals(df.to_arrow())
 
-    def test_order_nested_json_reader(self, request, tag, data):
+    def test_order_nested_json_reader(self, tag, data):
         expected = pd.read_json(StringIO(data), lines=True)
+        if PANDAS_GE_220:
+            # TODO: Remove after https://github.com/pandas-dev/pandas/issues/57429
+            # is fixed
+            expected = expected.reset_index(drop=True)
         target = cudf.read_json(StringIO(data), lines=True)
-        request.applymarker(
-            pytest.mark.xfail(
-                tag == "dtype_mismatch", reason="int vs float mismatch"
-            )
-        )
-        assert_eq(expected, target)
+        # Using pyarrow instead of assert_eq because pandas
+        # doesn't handle nested value comparisons correctly
+        if tag == "dtype_mismatch":
+            with pytest.raises(AssertionError):
+                # pandas parses integer values in float representation
+                # as integer
+                assert pa.Table.from_pandas(expected).equals(target.to_arrow())
+        elif tag == "missing":
+            with pytest.raises(AssertionError):
+                # pandas infers integers with nulls as float64
+                assert pa.Table.from_pandas(expected).equals(target.to_arrow())
+        else:
+            assert pa.Table.from_pandas(expected).equals(target.to_arrow())
 
 
 def test_json_round_trip_gzip():