From c5f05205931408acafb5823449a5d724200f229a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 14 Feb 2024 17:34:02 -0800 Subject: [PATCH 1/2] Fix ORC and JSON tests failures for pandas 2.2 --- python/cudf/cudf/tests/test_json.py | 19 +++++++------------ python/cudf/cudf/tests/test_orc.py | 22 ++++++++++++++-------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index ec980adc334..6c45680c1de 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1177,20 +1177,15 @@ def test_chunked_nested_json_reader(self, tag, data, chunk_size): df = cudf.concat(chunks, ignore_index=True) assert expected.to_arrow().equals(df.to_arrow()) - def test_order_nested_json_reader(self, tag, data): + def test_order_nested_json_reader(self, request, tag, data): expected = pd.read_json(StringIO(data), lines=True) target = cudf.read_json(StringIO(data), lines=True) - if tag == "dtype_mismatch": - with pytest.raises(AssertionError): - # pandas parses integer values in float representation - # as integer - assert pa.Table.from_pandas(expected).equals(target.to_arrow()) - elif tag == "missing": - with pytest.raises(AssertionError): - # pandas inferences integer with nulls as float64 - assert pa.Table.from_pandas(expected).equals(target.to_arrow()) - else: - assert pa.Table.from_pandas(expected).equals(target.to_arrow()) + request.applymarker( + pytest.mark.xfail( + tag == "dtype_mismatch", reason="int vs float mismatch" + ) + ) + assert_eq(expected, target) def test_json_round_trip_gzip(): diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index cf2fd29d41e..80fc815dd76 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -13,6 +13,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_220 from cudf.io.orc import ORCWriter from cudf.testing import assert_frame_equal from cudf.testing._utils import ( @@ -130,16 +131,21 @@ def test_orc_reader_filepath_or_buffer(path_or_buf, src): def test_orc_reader_trailing_nulls(datadir): path = datadir / "TestOrcFile.nulls-at-end-snappy.orc" + expect = pd.read_orc(path) + got = cudf.read_orc(path) + if PANDAS_GE_220: + check_categorical = True + else: + check_categorical = False + expect = expect.fillna(0) + got = got.fillna(0) - expect = pd.read_orc(path).fillna(0) - got = cudf.read_orc(path).fillna(0) - - # PANDAS uses NaN to represent invalid data, which forces float dtype - # For comparison, we can replace NaN with 0 and cast to the cuDF dtype - for col in expect.columns: - expect[col] = expect[col].astype(got[col].dtype) + # PANDAS uses NaN to represent invalid data, which forces float dtype + # For comparison, we can replace NaN with 0 and cast to the cuDF dtype + for col in expect.columns: + expect[col] = expect[col].astype(got[col].dtype) - assert_eq(expect, got, check_categorical=False) + assert_eq(expect, got, check_categorical=check_categorical) @pytest.mark.parametrize("use_index", [False, True]) From 3be6e1f608c23a363a37b5cba63dc9b2bb892545 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 20 Feb 2024 13:24:48 -0800 Subject: [PATCH 2/2] Use Prems fix, undo pyarrow comparison --- python/cudf/cudf/tests/test_json.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 6c45680c1de..12ea74bd7a7 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -13,7 +13,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210 +from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210, PANDAS_GE_220 from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, @@ -1177,15 +1177,26 @@ def test_chunked_nested_json_reader(self, tag, data, chunk_size): df = cudf.concat(chunks, ignore_index=True) assert expected.to_arrow().equals(df.to_arrow()) - def test_order_nested_json_reader(self, request, tag, data): + def test_order_nested_json_reader(self, tag, data): expected = pd.read_json(StringIO(data), lines=True) + if PANDAS_GE_220: + # TODO: Remove after https://github.com/pandas-dev/pandas/issues/57429 + # is fixed + expected = expected.reset_index(drop=True) target = cudf.read_json(StringIO(data), lines=True) - request.applymarker( - pytest.mark.xfail( - tag == "dtype_mismatch", reason="int vs float mismatch" - ) - ) - assert_eq(expected, target) + # Using pyarrow instead of assert_eq because pandas + # doesn't handle nested values comparisons correctly + if tag == "dtype_mismatch": + with pytest.raises(AssertionError): + # pandas parses integer values in float representation + # as integer + assert pa.Table.from_pandas(expected).equals(target.to_arrow()) + elif tag == "missing": + with pytest.raises(AssertionError): + # pandas inferences integer with nulls as float64 + assert pa.Table.from_pandas(expected).equals(target.to_arrow()) + else: + assert pa.Table.from_pandas(expected).equals(target.to_arrow()) def test_json_round_trip_gzip():