Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix ORC and JSON test failures for pandas 2.2 #15062

Merged
merged 3 commits into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion python/cudf/cudf/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import pytest

import cudf
from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210
from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210, PANDAS_GE_220
from cudf.testing._utils import (
DATETIME_TYPES,
NUMERIC_TYPES,
Expand Down Expand Up @@ -1179,7 +1179,13 @@ def test_chunked_nested_json_reader(self, tag, data, chunk_size):

def test_order_nested_json_reader(self, tag, data):
expected = pd.read_json(StringIO(data), lines=True)
if PANDAS_GE_220:
# TODO: Remove after https://github.com/pandas-dev/pandas/issues/57429
# is fixed
expected = expected.reset_index(drop=True)
target = cudf.read_json(StringIO(data), lines=True)
# Using pyarrow instead of assert_eq because pandas
# doesn't handle comparisons of nested values correctly
if tag == "dtype_mismatch":
with pytest.raises(AssertionError):
# pandas parses integer values in float representation
Expand Down
22 changes: 14 additions & 8 deletions python/cudf/cudf/tests/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import pytest

import cudf
from cudf.core._compat import PANDAS_GE_220
from cudf.io.orc import ORCWriter
from cudf.testing import assert_frame_equal
from cudf.testing._utils import (
Expand Down Expand Up @@ -130,16 +131,21 @@ def test_orc_reader_filepath_or_buffer(path_or_buf, src):

def test_orc_reader_trailing_nulls(datadir):
path = datadir / "TestOrcFile.nulls-at-end-snappy.orc"
expect = pd.read_orc(path)
got = cudf.read_orc(path)
if PANDAS_GE_220:
check_categorical = True
else:
check_categorical = False
expect = expect.fillna(0)
got = got.fillna(0)

expect = pd.read_orc(path).fillna(0)
got = cudf.read_orc(path).fillna(0)

# PANDAS uses NaN to represent invalid data, which forces float dtype
# For comparison, we can replace NaN with 0 and cast to the cuDF dtype
for col in expect.columns:
expect[col] = expect[col].astype(got[col].dtype)
# PANDAS uses NaN to represent invalid data, which forces float dtype
# For comparison, we can replace NaN with 0 and cast to the cuDF dtype
for col in expect.columns:
expect[col] = expect[col].astype(got[col].dtype)

assert_eq(expect, got, check_categorical=False)
assert_eq(expect, got, check_categorical=check_categorical)


@pytest.mark.parametrize("use_index", [False, True])
Expand Down
Loading