From 767dde16e413f34cac16cb0b96b7eca18d71b7e9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 31 Jan 2024 19:47:07 -1000 Subject: [PATCH] Use more public pandas APIs (#14929) Replace imports of private pandas internals with the public APIs listed in the pandas API reference: https://pandas.pydata.org/docs/reference/index.html Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/14929 --- python/cudf/cudf/_lib/groupby.pyx | 2 +- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/dataframe.py | 18 ++++++------- python/cudf/cudf/core/dtypes.py | 12 +++------ python/cudf/cudf/core/index.py | 3 +-- python/cudf/cudf/core/multiindex.py | 3 +-- python/cudf/cudf/core/series.py | 14 +++++----- python/cudf/cudf/core/tools/datetimes.py | 26 ++++++++++++++++++- python/cudf/cudf/io/hdf.py | 4 +-- python/cudf/cudf/io/json.py | 3 +-- .../cudf/pandas/scripts/run-pandas-tests.sh | 5 +--- python/cudf/cudf/tests/test_datetime.py | 4 +-- python/dask_cudf/dask_cudf/backends.py | 4 +-- 13 files changed, 55 insertions(+), 45 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 8848649736b..db4c5e6173a 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,7 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from functools import singledispatch -from pandas.core.groupby.groupby import DataError +from pandas.errors import DataError from cudf.api.types import is_string_dtype from cudf.core.buffer import acquire_spill_lock diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ad56cabb48e..9143c7f5e9e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2189,7 +2189,7 @@ def as_column( elif ( arbitrary.size != 0 and arb_dtype.kind in ("O") - and isinstance(arbitrary[0], pd._libs.interval.Interval) + and isinstance(arbitrary[0], pd.Interval) ): # changing from pd array to series,possible arrow bug interval_series = pd.Series(arbitrary) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1b0f83c5d70..727d5135297 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -34,8 +34,6 @@ import pandas as pd import pyarrow as pa from nvtx import annotate -from pandas._config import get_option -from pandas.core.dtypes.common import is_float, is_integer from pandas.io.formats import console from pandas.io.formats.printing import pprint_thing from typing_extensions import Self, assert_never @@ -1817,12 +1815,12 @@ def _clean_renderable_dataframe(self, output): dimensions (rows x columns) """ - max_rows = get_option("display.max_rows") - min_rows = get_option("display.min_rows") - max_cols = get_option("display.max_columns") - max_colwidth = get_option("display.max_colwidth") - show_dimensions = get_option("display.show_dimensions") - if get_option("display.expand_frame_repr"): + max_rows = pd.get_option("display.max_rows") + min_rows = pd.get_option("display.min_rows") + max_cols = pd.get_option("display.max_columns") + max_colwidth = pd.get_option("display.max_colwidth") + show_dimensions = pd.get_option("display.show_dimensions") + if pd.get_option("display.expand_frame_repr"): width, _ = console.get_console_size() 
else: width = None @@ -3318,8 +3316,8 @@ def diff(self, periods=1, axis=0): Diff currently only supports numeric dtype columns. """ - if not is_integer(periods): - if not (is_float(periods) and periods.is_integer()): + if not isinstance(periods, int): + if not (isinstance(periods, float) and periods.is_integer()): raise ValueError("periods must be an integer") periods = int(periods) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 17d6d42618a..7892f8065d0 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -14,10 +14,6 @@ from pandas.api import types as pd_types from pandas.api.extensions import ExtensionDtype from pandas.core.arrays.arrow.extension_types import ArrowIntervalType -from pandas.core.dtypes.dtypes import ( - CategoricalDtype as pd_CategoricalDtype, - CategoricalDtypeType as pd_CategoricalDtypeType, -) import cudf from cudf._typing import Dtype @@ -971,7 +967,7 @@ def _is_categorical_dtype(obj): if isinstance( obj, ( - pd_CategoricalDtype, + pd.CategoricalDtype, cudf.CategoricalDtype, cudf.core.index.CategoricalIndex, cudf.core.column.CategoricalColumn, @@ -988,8 +984,8 @@ def _is_categorical_dtype(obj): obj is t for t in ( cudf.CategoricalDtype, - pd_CategoricalDtype, - pd_CategoricalDtypeType, + pd.CategoricalDtype, + pd.CategoricalDtype.type, ) ): return True @@ -1010,7 +1006,7 @@ def _is_categorical_dtype(obj): ): return _is_categorical_dtype(obj.dtype) if hasattr(obj, "type"): - if obj.type is pd_CategoricalDtypeType: + if obj.type is pd.CategoricalDtype.type: return True # TODO: A lot of the above checks are probably redundant and should be # farmed out to this function here instead. 
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index fa9e49baaa2..c8eedae200b 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -21,7 +21,6 @@ import cupy import numpy as np import pandas as pd -from pandas._config import get_option from typing_extensions import Self import cudf @@ -1306,7 +1305,7 @@ def get_loc(self, key): @_cudf_nvtx_annotate def __repr__(self): - max_seq_items = get_option("max_seq_items") or len(self) + max_seq_items = pd.get_option("max_seq_items") or len(self) mr = 0 if 2 * max_seq_items < len(self): mr = max_seq_items + 1 diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index a747ca8eea0..a3f7be7b266 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -15,7 +15,6 @@ import cupy as cp import numpy as np import pandas as pd -from pandas._config import get_option import cudf import cudf._lib as libcudf @@ -428,7 +427,7 @@ def copy( @_cudf_nvtx_annotate def __repr__(self): - max_seq_items = get_option("display.max_seq_items") or len(self) + max_seq_items = pd.get_option("display.max_seq_items") or len(self) if len(self) > max_seq_items: n = int(max_seq_items / 2) + 1 diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 77ed7644f69..3f51ecdf7dc 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -23,8 +23,6 @@ import cupy import numpy as np import pandas as pd -from pandas._config import get_option -from pandas.core.dtypes.common import is_float from typing_extensions import Self, assert_never import cudf @@ -1405,8 +1403,8 @@ def __repr__(self): _, height = get_terminal_size() max_rows = ( height - if get_option("display.max_rows") == 0 - else get_option("display.max_rows") + if pd.get_option("display.max_rows") == 0 + else pd.get_option("display.max_rows") ) if max_rows not in (0, None) and len(self) > max_rows: top = 
self.head(int(max_rows / 2 + 1)) @@ -1451,10 +1449,10 @@ def __repr__(self): ): min_rows = ( height - if get_option("display.min_rows") == 0 - else get_option("display.min_rows") + if pd.get_option("display.min_rows") == 0 + else pd.get_option("display.min_rows") ) - show_dimensions = get_option("display.show_dimensions") + show_dimensions = pd.get_option("display.show_dimensions") if preprocess._column.categories.dtype.kind == "f": pd_series = ( preprocess.astype("str") @@ -3392,7 +3390,7 @@ def diff(self, periods=1): dtype: int64 """ if not is_integer(periods): - if not (is_float(periods) and periods.is_integer()): + if not (isinstance(periods, float) and periods.is_integer()): raise ValueError("periods must be an integer") periods = int(periods) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 80a79e60ea9..faa7407daaf 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -9,7 +9,6 @@ import numpy as np import pandas as pd import pandas.tseries.offsets as pd_offset -from pandas.core.tools.datetimes import _unit_map from typing_extensions import Self import cudf @@ -21,6 +20,31 @@ from cudf.core import column from cudf.core.index import as_index +# https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112 +_unit_map = { + "year": "year", + "years": "year", + "month": "month", + "months": "month", + "day": "day", + "days": "day", + "hour": "h", + "hours": "h", + "minute": "m", + "minutes": "m", + "second": "s", + "seconds": "s", + "ms": "ms", + "millisecond": "ms", + "milliseconds": "ms", + "us": "us", + "microsecond": "us", + "microseconds": "us", + "ns": "ns", + "nanosecond": "ns", + "nanoseconds": "ns", +} + _unit_dtype_map = { "ns": "datetime64[ns]", "us": "datetime64[us]", diff --git a/python/cudf/cudf/io/hdf.py b/python/cudf/cudf/io/hdf.py index 8cf8c01c1df..78e7df649cb 100644 --- a/python/cudf/cudf/io/hdf.py +++ 
b/python/cudf/cudf/io/hdf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import warnings @@ -27,4 +27,4 @@ def to_hdf(path_or_buf, key, value, *args, **kwargs): "be GPU accelerated in the future" ) pd_value = value.to_pandas() - pd.io.pytables.to_hdf(path_or_buf, key, pd_value, *args, **kwargs) + pd_value.to_hdf(path_or_buf, key, *args, **kwargs) diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 4f16263dd05..b2f3fd09146 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -247,9 +247,8 @@ def to_json( pd_value = pd.DataFrame(pd_data) else: pd_value = maybe_return_nullable_pd_obj(cudf_val) - return pd.io.json.to_json( + return pd_value.to_json( path_or_buf, - pd_value, orient=orient, storage_options=storage_options, *args, diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index c4dfe427c93..4fe152cc493 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -50,9 +50,6 @@ xfail_strict = true filterwarnings = [ "error:Sparse:FutureWarning", "error:The SparseArray:FutureWarning", - # Deprecation gives warning on import during pytest collection - "ignore:pandas.core.index is deprecated:FutureWarning:importlib", - "ignore:pandas.util.testing is deprecated:FutureWarning:importlib", # Will be fixed in numba 0.56: https://github.com/numba/numba/issues/7758 "ignore:`np.MachAr` is deprecated:DeprecationWarning:numba", ] diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 62733625485..24d8aa052e8 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1705,14 +1705,14 @@ def test_date_range_raise_overflow(): start = np.datetime64(np.iinfo("int64").max, "ns") periods = 2 freq = cudf.DateOffset(nanoseconds=1) - with pytest.raises(pd._libs.tslibs.np_datetime.OutOfBoundsDatetime): + with pytest.raises(pd.errors.OutOfBoundsDatetime): cudf.date_range(start=start, periods=periods, freq=freq) # Non-fixed offset start = np.datetime64(np.iinfo("int64").max, "ns") periods = 2 freq = cudf.DateOffset(months=1) - with pytest.raises(pd._libs.tslibs.np_datetime.OutOfBoundsDatetime): + with pytest.raises(pd.errors.OutOfBoundsDatetime): # Extending beyond the max value will trigger a warning when pandas # does an internal conversion to a Python built-in datetime.datetime # object, which only supports down to microsecond resolution. 
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 026ab1d304a..454cce76ff2 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -41,7 +41,7 @@ from dask.utils import Dispatch, is_arraylike import cudf -from cudf.api.types import _is_datetime64tz_dtype, is_string_dtype +from cudf.api.types import is_string_dtype from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate from .core import DataFrame, Index, Series @@ -126,7 +126,7 @@ def _get_non_empty_data(s): data = cudf.core.column.as_column(data, dtype=s.dtype) elif is_string_dtype(s.dtype): data = pa.array(["cat", "dog"]) - elif _is_datetime64tz_dtype(s.dtype): + elif isinstance(s.dtype, pd.DatetimeTZDtype): from cudf.utils.dtypes import get_time_unit data = cudf.date_range("2001-01-01", periods=2, freq=get_time_unit(s))