Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use more public pandas APIs #14929

Merged
merged 4 commits into from
Feb 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/groupby.pyx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
from functools import singledispatch

from pandas.core.groupby.groupby import DataError
from pandas.errors import DataError

from cudf.api.types import is_string_dtype
from cudf.core.buffer import acquire_spill_lock
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -2189,7 +2189,7 @@ def as_column(
elif (
arbitrary.size != 0
and arb_dtype.kind in ("O")
and isinstance(arbitrary[0], pd._libs.interval.Interval)
and isinstance(arbitrary[0], pd.Interval)
):
# changing from pd array to series, possible arrow bug
interval_series = pd.Series(arbitrary)
Expand Down
18 changes: 8 additions & 10 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@
import pandas as pd
import pyarrow as pa
from nvtx import annotate
from pandas._config import get_option
from pandas.core.dtypes.common import is_float, is_integer
from pandas.io.formats import console
from pandas.io.formats.printing import pprint_thing
from typing_extensions import Self, assert_never
Expand Down Expand Up @@ -1817,12 +1815,12 @@ def _clean_renderable_dataframe(self, output):
dimensions (rows x columns)
"""

max_rows = get_option("display.max_rows")
min_rows = get_option("display.min_rows")
max_cols = get_option("display.max_columns")
max_colwidth = get_option("display.max_colwidth")
show_dimensions = get_option("display.show_dimensions")
if get_option("display.expand_frame_repr"):
max_rows = pd.get_option("display.max_rows")
min_rows = pd.get_option("display.min_rows")
max_cols = pd.get_option("display.max_columns")
max_colwidth = pd.get_option("display.max_colwidth")
show_dimensions = pd.get_option("display.show_dimensions")
if pd.get_option("display.expand_frame_repr"):
width, _ = console.get_console_size()
else:
width = None
Expand Down Expand Up @@ -3318,8 +3316,8 @@ def diff(self, periods=1, axis=0):

Diff currently only supports numeric dtype columns.
"""
if not is_integer(periods):
if not (is_float(periods) and periods.is_integer()):
if not isinstance(periods, int):
if not (isinstance(periods, float) and periods.is_integer()):
raise ValueError("periods must be an integer")
periods = int(periods)

Expand Down
12 changes: 4 additions & 8 deletions python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@
from pandas.api import types as pd_types
from pandas.api.extensions import ExtensionDtype
from pandas.core.arrays.arrow.extension_types import ArrowIntervalType
from pandas.core.dtypes.dtypes import (
CategoricalDtype as pd_CategoricalDtype,
CategoricalDtypeType as pd_CategoricalDtypeType,
)

import cudf
from cudf._typing import Dtype
Expand Down Expand Up @@ -971,7 +967,7 @@ def _is_categorical_dtype(obj):
if isinstance(
obj,
(
pd_CategoricalDtype,
pd.CategoricalDtype,
cudf.CategoricalDtype,
cudf.core.index.CategoricalIndex,
cudf.core.column.CategoricalColumn,
Expand All @@ -988,8 +984,8 @@ def _is_categorical_dtype(obj):
obj is t
for t in (
cudf.CategoricalDtype,
pd_CategoricalDtype,
pd_CategoricalDtypeType,
pd.CategoricalDtype,
pd.CategoricalDtype.type,
)
):
return True
Expand All @@ -1010,7 +1006,7 @@ def _is_categorical_dtype(obj):
):
return _is_categorical_dtype(obj.dtype)
if hasattr(obj, "type"):
if obj.type is pd_CategoricalDtypeType:
if obj.type is pd.CategoricalDtype.type:
return True
# TODO: A lot of the above checks are probably redundant and should be
# farmed out to this function here instead.
Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import cupy
import numpy as np
import pandas as pd
from pandas._config import get_option
from typing_extensions import Self

import cudf
Expand Down Expand Up @@ -1306,7 +1305,7 @@ def get_loc(self, key):

@_cudf_nvtx_annotate
def __repr__(self):
max_seq_items = get_option("max_seq_items") or len(self)
max_seq_items = pd.get_option("max_seq_items") or len(self)
mr = 0
if 2 * max_seq_items < len(self):
mr = max_seq_items + 1
Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import cupy as cp
import numpy as np
import pandas as pd
from pandas._config import get_option

import cudf
import cudf._lib as libcudf
Expand Down Expand Up @@ -428,7 +427,7 @@ def copy(

@_cudf_nvtx_annotate
def __repr__(self):
max_seq_items = get_option("display.max_seq_items") or len(self)
max_seq_items = pd.get_option("display.max_seq_items") or len(self)

if len(self) > max_seq_items:
n = int(max_seq_items / 2) + 1
Expand Down
14 changes: 6 additions & 8 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
import cupy
import numpy as np
import pandas as pd
from pandas._config import get_option
from pandas.core.dtypes.common import is_float
from typing_extensions import Self, assert_never

import cudf
Expand Down Expand Up @@ -1405,8 +1403,8 @@ def __repr__(self):
_, height = get_terminal_size()
max_rows = (
height
if get_option("display.max_rows") == 0
else get_option("display.max_rows")
if pd.get_option("display.max_rows") == 0
else pd.get_option("display.max_rows")
)
if max_rows not in (0, None) and len(self) > max_rows:
top = self.head(int(max_rows / 2 + 1))
Expand Down Expand Up @@ -1451,10 +1449,10 @@ def __repr__(self):
):
min_rows = (
height
if get_option("display.min_rows") == 0
else get_option("display.min_rows")
if pd.get_option("display.min_rows") == 0
else pd.get_option("display.min_rows")
)
show_dimensions = get_option("display.show_dimensions")
show_dimensions = pd.get_option("display.show_dimensions")
if preprocess._column.categories.dtype.kind == "f":
pd_series = (
preprocess.astype("str")
Expand Down Expand Up @@ -3392,7 +3390,7 @@ def diff(self, periods=1):
dtype: int64
"""
if not is_integer(periods):
if not (is_float(periods) and periods.is_integer()):
if not (isinstance(periods, float) and periods.is_integer()):
raise ValueError("periods must be an integer")
periods = int(periods)

Expand Down
26 changes: 25 additions & 1 deletion python/cudf/cudf/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import numpy as np
import pandas as pd
import pandas.tseries.offsets as pd_offset
from pandas.core.tools.datetimes import _unit_map
from typing_extensions import Self

import cudf
Expand All @@ -21,6 +20,31 @@
from cudf.core import column
from cudf.core.index import as_index

# https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112
_unit_map = {
"year": "year",
"years": "year",
"month": "month",
"months": "month",
"day": "day",
"days": "day",
"hour": "h",
"hours": "h",
"minute": "m",
"minutes": "m",
"second": "s",
"seconds": "s",
"ms": "ms",
"millisecond": "ms",
"milliseconds": "ms",
"us": "us",
"microsecond": "us",
"microseconds": "us",
"ns": "ns",
"nanosecond": "ns",
"nanoseconds": "ns",
}

_unit_dtype_map = {
"ns": "datetime64[ns]",
"us": "datetime64[us]",
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/io/hdf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.

import warnings

Expand Down Expand Up @@ -27,4 +27,4 @@ def to_hdf(path_or_buf, key, value, *args, **kwargs):
"be GPU accelerated in the future"
)
pd_value = value.to_pandas()
pd.io.pytables.to_hdf(path_or_buf, key, pd_value, *args, **kwargs)
pd_value.to_hdf(path_or_buf, key, *args, **kwargs)
3 changes: 1 addition & 2 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,9 +247,8 @@ def to_json(
pd_value = pd.DataFrame(pd_data)
else:
pd_value = maybe_return_nullable_pd_obj(cudf_val)
return pd.io.json.to_json(
return pd_value.to_json(
path_or_buf,
pd_value,
orient=orient,
storage_options=storage_options,
*args,
Expand Down
5 changes: 1 addition & 4 deletions python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

Expand Down Expand Up @@ -50,9 +50,6 @@ xfail_strict = true
filterwarnings = [
"error:Sparse:FutureWarning",
"error:The SparseArray:FutureWarning",
# Deprecation gives warning on import during pytest collection
"ignore:pandas.core.index is deprecated:FutureWarning:importlib",
"ignore:pandas.util.testing is deprecated:FutureWarning:importlib",
# Will be fixed in numba 0.56: https://github.com/numba/numba/issues/7758
"ignore:`np.MachAr` is deprecated:DeprecationWarning:numba",
]
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1705,14 +1705,14 @@ def test_date_range_raise_overflow():
start = np.datetime64(np.iinfo("int64").max, "ns")
periods = 2
freq = cudf.DateOffset(nanoseconds=1)
with pytest.raises(pd._libs.tslibs.np_datetime.OutOfBoundsDatetime):
with pytest.raises(pd.errors.OutOfBoundsDatetime):
cudf.date_range(start=start, periods=periods, freq=freq)

# Non-fixed offset
start = np.datetime64(np.iinfo("int64").max, "ns")
periods = 2
freq = cudf.DateOffset(months=1)
with pytest.raises(pd._libs.tslibs.np_datetime.OutOfBoundsDatetime):
with pytest.raises(pd.errors.OutOfBoundsDatetime):
# Extending beyond the max value will trigger a warning when pandas
# does an internal conversion to a Python built-in datetime.datetime
# object, which only supports down to microsecond resolution.
Expand Down
4 changes: 2 additions & 2 deletions python/dask_cudf/dask_cudf/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
from dask.utils import Dispatch, is_arraylike

import cudf
from cudf.api.types import _is_datetime64tz_dtype, is_string_dtype
from cudf.api.types import is_string_dtype
from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate

from .core import DataFrame, Index, Series
Expand Down Expand Up @@ -126,7 +126,7 @@ def _get_non_empty_data(s):
data = cudf.core.column.as_column(data, dtype=s.dtype)
elif is_string_dtype(s.dtype):
data = pa.array(["cat", "dog"])
elif _is_datetime64tz_dtype(s.dtype):
elif isinstance(s.dtype, pd.DatetimeTZDtype):
from cudf.utils.dtypes import get_time_unit

data = cudf.date_range("2001-01-01", periods=2, freq=get_time_unit(s))
Expand Down
Loading