Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] First pass of pd.read_orc changes in tests #12103

Merged
merged 4 commits into from
Nov 10, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions python/cudf/cudf/testing/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,14 @@
import cupy
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.orc
import pytest
from pandas import testing as tm

import cudf
from cudf._lib.null_mask import bitmask_allocation_size_bytes
from cudf.core._compat import PANDAS_GE_100
from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
from cudf.utils import dtypes as dtypeutils

Expand Down Expand Up @@ -379,6 +382,13 @@ def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs):
return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs)


def _pandas_read_orc(fname, columns=None):
    """Load an ORC source into a pandas DataFrame for use as a test reference.

    Parameters
    ----------
    fname
        Passed through unchanged to ``pd.read_orc`` or
        ``pa.orc.ORCFile``; callers in the test suite hand in both
        paths and already-open file objects.
    columns : optional
        Column selection forwarded to the underlying reader. ``None``
        (the default) is forwarded as-is.

    Returns
    -------
    The object produced by ``pd.read_orc`` or, on the fallback path,
    by ``Table.to_pandas()``.
    """
    # NOTE(review): PANDAS_GE_100 presumably means "pandas >= 1.0.0" -- it is
    # imported from cudf.core._compat; confirm there. When it is falsy,
    # go through pyarrow's ORC reader directly instead of pd.read_orc.
    if not PANDAS_GE_100:
        orc_table = pa.orc.ORCFile(fname).read(columns=columns)
        return orc_table.to_pandas()

    return pd.read_orc(fname, columns=columns)


parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize(
"left_dtype,right_dtype",
list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),
Expand Down
6 changes: 2 additions & 4 deletions python/cudf/cudf/tests/test_gcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.orc
import pytest

import cudf
from cudf.testing._utils import assert_eq
from cudf.testing._utils import _pandas_read_orc, assert_eq

gcsfs = pytest.importorskip("gcsfs")

Expand Down Expand Up @@ -71,5 +69,5 @@ def mock_open(*args, **kwargs):
monkeypatch.setattr(gcsfs.core.GCSFileSystem, "open", mock_open)
gdf.to_orc(f"gcs://{gcs_fname}")

got = pa.orc.ORCFile(local_filepath).read().to_pandas()
got = _pandas_read_orc(local_filepath)
assert_eq(pdf, got)
7 changes: 3 additions & 4 deletions python/cudf/cudf/tests/test_hdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@
import pandas as pd
import pyarrow as pa
import pytest
from pyarrow import orc

import cudf
from cudf.testing._utils import assert_eq
from cudf.testing._utils import _pandas_read_orc, assert_eq

if not os.environ.get("RUN_HDFS_TESTS"):
pytestmark = pytest.mark.skip("Env not configured to run HDFS tests")
Expand Down Expand Up @@ -212,7 +211,7 @@ def test_read_orc(datadir, hdfs, test_url):
hd_fpath = f"hdfs://{basedir}/file.orc"

got = cudf.read_orc(hd_fpath)
expect = orc.ORCFile(buffer).read().to_pandas()
expect = _pandas_read_orc(buffer)
assert_eq(expect, got)


Expand All @@ -232,7 +231,7 @@ def test_write_orc(pdf, hdfs, test_url):

assert hdfs.exists(f"{basedir}/test_orc_writer.orc")
with hdfs.open(f"{basedir}/test_orc_writer.orc", mode="rb") as f:
got = orc.ORCFile(f).read().to_pandas()
got = _pandas_read_orc(f)

assert_eq(pdf, got)

Expand Down
Loading