Use more public pandas APIs (#14929)

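Replaces imports of private pandas modules with their public equivalents, using the pandas API reference (https://pandas.pydata.org/docs/reference/index.html) as the definition of what is public. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: #14929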
rapidsai · Feb 1, 2024 · 767dde1 · 767dde1
1 parent bb59715
commit 767dde1
Show file tree

Hide file tree

Showing 13 changed files with 55 additions and 45 deletions.
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
@@ -1,7 +1,7 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 from functools import singledispatch
 
-from pandas.core.groupby.groupby import DataError
+from pandas.errors import DataError
 
 from cudf.api.types import is_string_dtype
 from cudf.core.buffer import acquire_spill_lock

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -2189,7 +2189,7 @@ def as_column(
         elif (
             arbitrary.size != 0
             and arb_dtype.kind in ("O")
-            and isinstance(arbitrary[0], pd._libs.interval.Interval)
+            and isinstance(arbitrary[0], pd.Interval)
         ):
             # changing from pd array to series,possible arrow bug
             interval_series = pd.Series(arbitrary)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -34,8 +34,6 @@
 import pandas as pd
 import pyarrow as pa
 from nvtx import annotate
-from pandas._config import get_option
-from pandas.core.dtypes.common import is_float, is_integer
 from pandas.io.formats import console
 from pandas.io.formats.printing import pprint_thing
 from typing_extensions import Self, assert_never
@@ -1817,12 +1815,12 @@ def _clean_renderable_dataframe(self, output):
         dimensions (rows x columns)
         """
 
-        max_rows = get_option("display.max_rows")
-        min_rows = get_option("display.min_rows")
-        max_cols = get_option("display.max_columns")
-        max_colwidth = get_option("display.max_colwidth")
-        show_dimensions = get_option("display.show_dimensions")
-        if get_option("display.expand_frame_repr"):
+        max_rows = pd.get_option("display.max_rows")
+        min_rows = pd.get_option("display.min_rows")
+        max_cols = pd.get_option("display.max_columns")
+        max_colwidth = pd.get_option("display.max_colwidth")
+        show_dimensions = pd.get_option("display.show_dimensions")
+        if pd.get_option("display.expand_frame_repr"):
             width, _ = console.get_console_size()
         else:
             width = None
@@ -3318,8 +3316,8 @@ def diff(self, periods=1, axis=0):
 
             Diff currently only supports numeric dtype columns.
         """
-        if not is_integer(periods):
-            if not (is_float(periods) and periods.is_integer()):
+        if not isinstance(periods, int):
+            if not (isinstance(periods, float) and periods.is_integer()):
                 raise ValueError("periods must be an integer")
             periods = int(periods)
 

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
@@ -14,10 +14,6 @@
 from pandas.api import types as pd_types
 from pandas.api.extensions import ExtensionDtype
 from pandas.core.arrays.arrow.extension_types import ArrowIntervalType
-from pandas.core.dtypes.dtypes import (
-    CategoricalDtype as pd_CategoricalDtype,
-    CategoricalDtypeType as pd_CategoricalDtypeType,
-)
 
 import cudf
 from cudf._typing import Dtype
@@ -971,7 +967,7 @@ def _is_categorical_dtype(obj):
     if isinstance(
         obj,
         (
-            pd_CategoricalDtype,
+            pd.CategoricalDtype,
             cudf.CategoricalDtype,
             cudf.core.index.CategoricalIndex,
             cudf.core.column.CategoricalColumn,
@@ -988,8 +984,8 @@ def _is_categorical_dtype(obj):
         obj is t
         for t in (
             cudf.CategoricalDtype,
-            pd_CategoricalDtype,
-            pd_CategoricalDtypeType,
+            pd.CategoricalDtype,
+            pd.CategoricalDtype.type,
         )
     ):
         return True
@@ -1010,7 +1006,7 @@ def _is_categorical_dtype(obj):
     ):
         return _is_categorical_dtype(obj.dtype)
     if hasattr(obj, "type"):
-        if obj.type is pd_CategoricalDtypeType:
+        if obj.type is pd.CategoricalDtype.type:
             return True
     # TODO: A lot of the above checks are probably redundant and should be
     # farmed out to this function here instead.

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
@@ -21,7 +21,6 @@
 import cupy
 import numpy as np
 import pandas as pd
-from pandas._config import get_option
 from typing_extensions import Self
 
 import cudf
@@ -1306,7 +1305,7 @@ def get_loc(self, key):
 
     @_cudf_nvtx_annotate
     def __repr__(self):
-        max_seq_items = get_option("max_seq_items") or len(self)
+        max_seq_items = pd.get_option("max_seq_items") or len(self)
         mr = 0
         if 2 * max_seq_items < len(self):
             mr = max_seq_items + 1

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
@@ -15,7 +15,6 @@
 import cupy as cp
 import numpy as np
 import pandas as pd
-from pandas._config import get_option
 
 import cudf
 import cudf._lib as libcudf
@@ -428,7 +427,7 @@ def copy(
 
     @_cudf_nvtx_annotate
     def __repr__(self):
-        max_seq_items = get_option("display.max_seq_items") or len(self)
+        max_seq_items = pd.get_option("display.max_seq_items") or len(self)
 
         if len(self) > max_seq_items:
             n = int(max_seq_items / 2) + 1

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -23,8 +23,6 @@
 import cupy
 import numpy as np
 import pandas as pd
-from pandas._config import get_option
-from pandas.core.dtypes.common import is_float
 from typing_extensions import Self, assert_never
 
 import cudf
@@ -1405,8 +1403,8 @@ def __repr__(self):
         _, height = get_terminal_size()
         max_rows = (
             height
-            if get_option("display.max_rows") == 0
-            else get_option("display.max_rows")
+            if pd.get_option("display.max_rows") == 0
+            else pd.get_option("display.max_rows")
         )
         if max_rows not in (0, None) and len(self) > max_rows:
             top = self.head(int(max_rows / 2 + 1))
@@ -1451,10 +1449,10 @@ def __repr__(self):
         ):
             min_rows = (
                 height
-                if get_option("display.min_rows") == 0
-                else get_option("display.min_rows")
+                if pd.get_option("display.min_rows") == 0
+                else pd.get_option("display.min_rows")
             )
-            show_dimensions = get_option("display.show_dimensions")
+            show_dimensions = pd.get_option("display.show_dimensions")
             if preprocess._column.categories.dtype.kind == "f":
                 pd_series = (
                     preprocess.astype("str")
@@ -3392,7 +3390,7 @@ def diff(self, periods=1):
         dtype: int64
         """
         if not is_integer(periods):
-            if not (is_float(periods) and periods.is_integer()):
+            if not (isinstance(periods, float) and periods.is_integer()):
                 raise ValueError("periods must be an integer")
             periods = int(periods)
 

diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
@@ -9,7 +9,6 @@
 import numpy as np
 import pandas as pd
 import pandas.tseries.offsets as pd_offset
-from pandas.core.tools.datetimes import _unit_map
 from typing_extensions import Self
 
 import cudf
@@ -21,6 +20,31 @@
 from cudf.core import column
 from cudf.core.index import as_index
 
+# Copied from pandas' private _unit_map: https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112
+_unit_map = {
+    "year": "year",
+    "years": "year",
+    "month": "month",
+    "months": "month",
+    "day": "day",
+    "days": "day",
+    "hour": "h",
+    "hours": "h",
+    "minute": "m",
+    "minutes": "m",
+    "second": "s",
+    "seconds": "s",
+    "ms": "ms",
+    "millisecond": "ms",
+    "milliseconds": "ms",
+    "us": "us",
+    "microsecond": "us",
+    "microseconds": "us",
+    "ns": "ns",
+    "nanosecond": "ns",
+    "nanoseconds": "ns",
+}
+
 _unit_dtype_map = {
     "ns": "datetime64[ns]",
     "us": "datetime64[us]",

diff --git a/python/cudf/cudf/io/hdf.py b/python/cudf/cudf/io/hdf.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import warnings
 
@@ -27,4 +27,4 @@ def to_hdf(path_or_buf, key, value, *args, **kwargs):
         "be GPU accelerated in the future"
     )
     pd_value = value.to_pandas()
-    pd.io.pytables.to_hdf(path_or_buf, key, pd_value, *args, **kwargs)
+    pd_value.to_hdf(path_or_buf, key, *args, **kwargs)
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
@@ -247,9 +247,8 @@ def to_json(
             pd_value = pd.DataFrame(pd_data)
         else:
             pd_value = maybe_return_nullable_pd_obj(cudf_val)
-        return pd.io.json.to_json(
+        return pd_value.to_json(
             path_or_buf,
-            pd_value,
             orient=orient,
             storage_options=storage_options,
             *args,

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -50,9 +50,6 @@ xfail_strict = true
 filterwarnings = [
   "error:Sparse:FutureWarning",
   "error:The SparseArray:FutureWarning",
-  # Deprecation gives warning on import during pytest collection
-  "ignore:pandas.core.index is deprecated:FutureWarning:importlib",
-  "ignore:pandas.util.testing is deprecated:FutureWarning:importlib",
   # Will be fixed in numba 0.56: https://github.com/numba/numba/issues/7758
   "ignore:`np.MachAr` is deprecated:DeprecationWarning:numba",
 ]

diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
@@ -1705,14 +1705,14 @@ def test_date_range_raise_overflow():
     start = np.datetime64(np.iinfo("int64").max, "ns")
     periods = 2
     freq = cudf.DateOffset(nanoseconds=1)
-    with pytest.raises(pd._libs.tslibs.np_datetime.OutOfBoundsDatetime):
+    with pytest.raises(pd.errors.OutOfBoundsDatetime):
         cudf.date_range(start=start, periods=periods, freq=freq)
 
     # Non-fixed offset
     start = np.datetime64(np.iinfo("int64").max, "ns")
     periods = 2
     freq = cudf.DateOffset(months=1)
-    with pytest.raises(pd._libs.tslibs.np_datetime.OutOfBoundsDatetime):
+    with pytest.raises(pd.errors.OutOfBoundsDatetime):
         # Extending beyond the max value will trigger a warning when pandas
         # does an internal conversion to a Python built-in datetime.datetime
         # object, which only supports down to microsecond resolution.

diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
@@ -41,7 +41,7 @@
 from dask.utils import Dispatch, is_arraylike
 
 import cudf
-from cudf.api.types import _is_datetime64tz_dtype, is_string_dtype
+from cudf.api.types import is_string_dtype
 from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate
 
 from .core import DataFrame, Index, Series
@@ -126,7 +126,7 @@ def _get_non_empty_data(s):
         data = cudf.core.column.as_column(data, dtype=s.dtype)
     elif is_string_dtype(s.dtype):
         data = pa.array(["cat", "dog"])
-    elif _is_datetime64tz_dtype(s.dtype):
+    elif isinstance(s.dtype, pd.DatetimeTZDtype):
         from cudf.utils.dtypes import get_time_unit
 
         data = cudf.date_range("2001-01-01", periods=2, freq=get_time_unit(s))