Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix ORC and JSON tests failures for pandas 2.2 #15062

Merged
merged 3 commits into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 7 additions & 12 deletions python/cudf/cudf/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -1177,20 +1177,15 @@ def test_chunked_nested_json_reader(self, tag, data, chunk_size):
df = cudf.concat(chunks, ignore_index=True)
assert expected.to_arrow().equals(df.to_arrow())

def test_order_nested_json_reader(self, tag, data):
def test_order_nested_json_reader(self, request, tag, data):
expected = pd.read_json(StringIO(data), lines=True)
target = cudf.read_json(StringIO(data), lines=True)
if tag == "dtype_mismatch":
with pytest.raises(AssertionError):
# pandas parses integer values in float representation
# as integer
assert pa.Table.from_pandas(expected).equals(target.to_arrow())
elif tag == "missing":
with pytest.raises(AssertionError):
# pandas inferences integer with nulls as float64
assert pa.Table.from_pandas(expected).equals(target.to_arrow())
else:
assert pa.Table.from_pandas(expected).equals(target.to_arrow())
request.applymarker(
pytest.mark.xfail(
tag == "dtype_mismatch", reason="int vs float mismatch"
)
)
assert_eq(expected, target)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can pandas now handle comparing the equality of nested types/values? The last time I checked, pandas wasn't able to do that properly, and thus we resorted to using pyarrow for comparisons in the case of nested types.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was just able to triage the issue here. It looks like pd.read_json has a regression in pandas-2.2: the dataframe is expected to have a RangeIndex, but a materialized Index of int64 dtype is returned instead. We should just change this test to the following:

diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index ec980adc33..5a459e98d1 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -1179,6 +1179,9 @@ class TestNestedJsonReaderCommon:
 
     def test_order_nested_json_reader(self, tag, data):
         expected = pd.read_json(StringIO(data), lines=True)
+        if PANDAS_GE_200:
+            # TODO: Remove after bug fix: <Pandas-bug-URL>
+            expected = expected.reset_index(drop=True)
         target = cudf.read_json(StringIO(data), lines=True)
         if tag == "dtype_mismatch":
             with pytest.raises(AssertionError):

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see. Yeah, nested data isn't well tested in pandas, so it's probably better to use pyarrow to compare here. Will incorporate your change.



def test_json_round_trip_gzip():
Expand Down
22 changes: 14 additions & 8 deletions python/cudf/cudf/tests/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import pytest

import cudf
from cudf.core._compat import PANDAS_GE_220
from cudf.io.orc import ORCWriter
from cudf.testing import assert_frame_equal
from cudf.testing._utils import (
Expand Down Expand Up @@ -130,16 +131,21 @@ def test_orc_reader_filepath_or_buffer(path_or_buf, src):

def test_orc_reader_trailing_nulls(datadir):
path = datadir / "TestOrcFile.nulls-at-end-snappy.orc"
expect = pd.read_orc(path)
got = cudf.read_orc(path)
if PANDAS_GE_220:
check_categorical = True
else:
check_categorical = False
expect = expect.fillna(0)
got = got.fillna(0)

expect = pd.read_orc(path).fillna(0)
got = cudf.read_orc(path).fillna(0)

# PANDAS uses NaN to represent invalid data, which forces float dtype
# For comparison, we can replace NaN with 0 and cast to the cuDF dtype
for col in expect.columns:
expect[col] = expect[col].astype(got[col].dtype)
# PANDAS uses NaN to represent invalid data, which forces float dtype
# For comparison, we can replace NaN with 0 and cast to the cuDF dtype
for col in expect.columns:
expect[col] = expect[col].astype(got[col].dtype)

assert_eq(expect, got, check_categorical=False)
assert_eq(expect, got, check_categorical=check_categorical)


@pytest.mark.parametrize("use_index", [False, True])
Expand Down
Loading