Update minimum pandas and numpy pinnings (#12887)

This PR:
- [x] Raises the minimum `pandas` version pin from `1.0` to `1.3`.
- [x] Sets a minimum `numpy` version pin of `>=1.21`.
- [x] Fixes arm conda environment creation by removing the `pandoc` version constraint.

Resolves #12785.

Authors:
- GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
- Bradley Dice (https://github.com/bdice)
- Ray Douglass (https://github.com/raydouglass)
- Lawrence Mitchell (https://github.com/wence-)

URL: #12887
rapidsai · Mar 10, 2023 · e4557cb · e4557cb
1 parent e591f68
commit e4557cb
Show file tree

Hide file tree

Showing 28 changed files with 131 additions and 399 deletions.
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -41,13 +41,13 @@ dependencies:
 - ninja
 - notebook
 - numba>=0.56.2
-- numpy
+- numpy>=1.21
 - numpydoc
 - nvcc_linux-64=11.8
 - nvtx>=0.2.1
 - packaging
-- pandas>=1.0,<1.6.0dev0
-- pandoc<=2.0.0
+- pandas>=1.3,<1.6.0dev0
+- pandoc
 - pip
 - pre-commit
 - protobuf>=4.21.6,<4.22

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
@@ -62,10 +62,10 @@ requirements:
     - protobuf >=4.21.6,<4.22
     - python
     - typing_extensions
-    - pandas >=1.0,<1.6.0dev0
+    - pandas >=1.3,<1.6.0dev0
     - cupy >=9.5.0,<12.0.0a0
     - numba >=0.56.2
-    - numpy
+    - numpy >=1.21
     - {{ pin_compatible('pyarrow', max_pin='x.x.x') }}
     - libcudf {{ version }}
     - fastavro >=0.22.0

diff --git a/dependencies.yaml b/dependencies.yaml
@@ -152,7 +152,7 @@ dependencies:
           - myst-nb
           - nbsphinx
           - numpydoc
-          - pandoc<=2.0.0 # We should check and fix all "<=" pinnings
+          - pandoc
           - pydata-sphinx-theme
           - sphinx
           - sphinx-autobuild
@@ -254,10 +254,10 @@ dependencies:
           - distributed>=2023.1.1
           - fsspec>=0.6.0
           - numba>=0.56.2
-          - numpy
+          - numpy>=1.21
           - nvtx>=0.2.1
           - packaging
-          - pandas>=1.0,<1.6.0dev0
+          - pandas>=1.3,<1.6.0dev0
           - python-confluent-kafka=1.7.0
           - streamz
           - typing_extensions

diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py
@@ -4,10 +4,6 @@
 from packaging import version
 
 PANDAS_VERSION = version.parse(pd.__version__)
-PANDAS_GE_110 = PANDAS_VERSION >= version.parse("1.1")
-PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2")
-PANDAS_LE_122 = PANDAS_VERSION <= version.parse("1.2.2")
-PANDAS_GE_130 = PANDAS_VERSION >= version.parse("1.3.0")
 PANDAS_GE_133 = PANDAS_VERSION >= version.parse("1.3.3")
 PANDAS_GE_134 = PANDAS_VERSION >= version.parse("1.3.4")
 PANDAS_LT_140 = PANDAS_VERSION < version.parse("1.4.0")

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
@@ -21,16 +21,12 @@
     ScalarLike,
 )
 from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype
-from cudf.core._compat import PANDAS_GE_120
 from cudf.core.buffer import Buffer, cuda_array_interface_wrapper
 from cudf.core.column import ColumnBase, as_column, column, string
 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
 from cudf.utils.utils import _fillna_natwise
 
-if PANDAS_GE_120:
-    _guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format
-else:
-    _guess_datetime_format = pd.core.tools.datetimes._guess_datetime_format
+_guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format
 
 # nanoseconds per time_unit
 _dtype_to_format_conversion = {

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
@@ -19,7 +19,7 @@
 
 import cudf
 from cudf._typing import Dtype
-from cudf.core._compat import PANDAS_GE_130, PANDAS_GE_150
+from cudf.core._compat import PANDAS_GE_150
 from cudf.core.abc import Serializable
 from cudf.core.buffer import Buffer
 from cudf.utils.docutils import doc_apply
@@ -875,16 +875,10 @@ def to_arrow(self):
 
     @classmethod
     def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype":
-        if PANDAS_GE_130:
-            return cls(subtype=pd_dtype.subtype, closed=pd_dtype.closed)
-        else:
-            return cls(subtype=pd_dtype.subtype)
+        return cls(subtype=pd_dtype.subtype, closed=pd_dtype.closed)
 
     def to_pandas(self) -> pd.IntervalDtype:
-        if PANDAS_GE_130:
-            return pd.IntervalDtype(subtype=self.subtype, closed=self.closed)
-        else:
-            return pd.IntervalDtype(subtype=self.subtype)
+        return pd.IntervalDtype(subtype=self.subtype, closed=self.closed)
 
     def __eq__(self, other):
         if isinstance(other, str):

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
@@ -21,7 +21,7 @@
 from cudf._typing import DataFrameOrSeries
 from cudf.api.types import is_integer, is_list_like, is_object_dtype
 from cudf.core import column
-from cudf.core._compat import PANDAS_GE_120, PANDAS_GE_150
+from cudf.core._compat import PANDAS_GE_150
 from cudf.core.frame import Frame
 from cudf.core.index import (
     BaseIndex,
@@ -495,7 +495,7 @@ def __repr__(self):
                 )
             )
 
-            if PANDAS_GE_120 and not PANDAS_GE_150:
+            if not PANDAS_GE_150:
                 # Need this whole `if` block,
                 # this is a workaround for the following issue:
                 # https://github.com/pandas-dev/pandas/issues/39984

diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -19,7 +19,6 @@
     is_string_dtype,
     is_struct_dtype,
 )
-from cudf.core._compat import PANDAS_GE_110
 from cudf.core.missing import NA
 
 
@@ -699,28 +698,17 @@ def assert_frame_equal(
         obj=f"{obj}.index",
     )
 
-    if PANDAS_GE_110:
-        pd.testing.assert_index_equal(
-            left._data.to_pandas_index(),
-            right._data.to_pandas_index(),
-            exact=check_column_type,
-            check_names=check_names,
-            check_exact=check_exact,
-            check_categorical=check_categorical,
-            rtol=rtol,
-            atol=atol,
-            obj=f"{obj}.columns",
-        )
-    else:
-        pd.testing.assert_index_equal(
-            left._data.to_pandas_index(),
-            right._data.to_pandas_index(),
-            exact=check_column_type,
-            check_names=check_names,
-            check_exact=check_exact,
-            check_categorical=check_categorical,
-            obj=f"{obj}.columns",
-        )
+    pd.testing.assert_index_equal(
+        left._data.to_pandas_index(),
+        right._data.to_pandas_index(),
+        exact=check_column_type,
+        check_names=check_names,
+        check_exact=check_exact,
+        check_categorical=check_categorical,
+        rtol=rtol,
+        atol=atol,
+        obj=f"{obj}.columns",
+    )
 
     for col in left._column_names:
         assert_column_equal(

diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
@@ -11,7 +11,7 @@
 import pytest
 
 import cudf
-from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_134
+from cudf.core._compat import PANDAS_GE_134
 from cudf.testing._utils import (
     NUMERIC_TYPES,
     assert_eq,
@@ -81,7 +81,6 @@ def test_categorical_basic():
     assert_eq(cat.codes, cudf_cat.codes.to_numpy())
 
 
-@pytest.mark.skipif(not PANDAS_GE_110, reason="requires pandas>=1.1.0")
 def test_categorical_integer():
     cat = pd.Categorical(["a", "_", "_", "c", "a"], categories=["a", "b", "c"])
     pdsr = pd.Series(cat)

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
@@ -22,13 +22,7 @@
 from packaging import version
 
 import cudf
-from cudf.core._compat import (
-    PANDAS_GE_110,
-    PANDAS_GE_120,
-    PANDAS_GE_134,
-    PANDAS_GE_150,
-    PANDAS_LT_140,
-)
+from cudf.core._compat import PANDAS_GE_134, PANDAS_GE_150, PANDAS_LT_140
 from cudf.core.buffer.spill_manager import get_global_manager
 from cudf.core.column import column
 from cudf.testing import _utils as utils
@@ -3227,10 +3221,6 @@ def test_dataframe_reindex_fill_value(
 
 @pytest.mark.parametrize("copy", [True, False])
 def test_dataframe_reindex_change_dtype(copy):
-    if PANDAS_GE_110:
-        kwargs = {"check_freq": False}
-    else:
-        kwargs = {}
     index = pd.date_range("12/29/2009", periods=10, freq="D")
     columns = ["a", "b", "c", "d", "e"]
     gdf = cudf.datasets.randomdata(
@@ -3242,7 +3232,7 @@ def test_dataframe_reindex_change_dtype(copy):
     assert_eq(
         pdf.reindex(index=index, columns=columns, copy=True),
         gdf.reindex(index=index, columns=columns, copy=copy),
-        **kwargs,
+        check_freq=False,
     )
 
 
@@ -4632,10 +4622,6 @@ def test_isin_dataframe(data, values):
     else:
         try:
             expected = pdf.isin(values)
-        except ValueError as e:
-            if str(e) == "Lengths must match." and not PANDAS_GE_110:
-                # https://github.com/pandas-dev/pandas/issues/34256
-                return
         except TypeError as e:
             # Can't do isin with different categories
             if str(e) == (
@@ -5302,12 +5288,7 @@ def test_rowwise_ops_datetime_dtypes_pdbug(data):
     expected = pdf.max(axis=1, skipna=False)
     got = gdf.max(axis=1, skipna=False)
 
-    if PANDAS_GE_120:
-        assert_eq(got, expected)
-    else:
-        # PANDAS BUG: https://github.com/pandas-dev/pandas/issues/36907
-        with pytest.raises(AssertionError, match="numpy array are different"):
-            assert_eq(got, expected)
+    assert_eq(got, expected)
 
 
 @pytest.mark.parametrize(

diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py
@@ -1,12 +1,12 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
 import pytest
 
 import cudf
-from cudf.core._compat import PANDAS_GE_130, PANDAS_GE_150
+from cudf.core._compat import PANDAS_GE_150
 from cudf.core.column import ColumnBase
 from cudf.core.dtypes import (
     CategoricalDtype,
@@ -187,10 +187,6 @@ def test_interval_dtype_pyarrow_round_trip(subtype, closed):
     assert expect.equals(got)
 
 
-@pytest.mark.skipif(
-    not PANDAS_GE_130,
-    reason="pandas<1.3.0 doesn't have a closed argument for IntervalDtype",
-)
 def test_interval_dtype_from_pandas(subtype, closed):
     expect = cudf.IntervalDtype(subtype, closed=closed)
     pd_type = pd.IntervalDtype(subtype, closed=closed)

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
@@ -15,12 +15,7 @@
 
 import cudf
 from cudf import DataFrame, Series
-from cudf.core._compat import (
-    PANDAS_GE_110,
-    PANDAS_GE_130,
-    PANDAS_GE_150,
-    PANDAS_LT_140,
-)
+from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140
 from cudf.core.udf.groupby_typing import SUPPORTED_GROUPBY_NUMPY_TYPES
 from cudf.testing._utils import (
     DATETIME_TYPES,
@@ -573,7 +568,7 @@ def test_groupby_2keys_agg(nelem, func):
     # "func", ["min", "max", "idxmin", "idxmax", "count", "sum"],
 )
 @pytest.mark.xfail(
-    condition=PANDAS_GE_130 and PANDAS_LT_140,
+    condition=PANDAS_LT_140,
     reason="https://github.com/pandas-dev/pandas/issues/43209",
 )
 def test_groupby_agg_decimal(num_groups, nelem_per_group, func):
@@ -1507,9 +1502,6 @@ def test_groupby_median(agg, by):
 
 @pytest.mark.parametrize("agg", [lambda x: x.nunique(), "nunique"])
 @pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]])
-@pytest.mark.xfail(
-    condition=not PANDAS_GE_110, reason="pandas >= 1.1 required"
-)
 def test_groupby_nunique(agg, by):
     pdf = pd.DataFrame(
         {"a": [1, 1, 1, 2, 3], "b": [1, 2, 2, 2, 1], "c": [1, 2, None, 4, 5]}
@@ -1545,7 +1537,6 @@ def test_groupby_nth(n, by):
 
 
 @pytest.mark.xfail(
-    condition=PANDAS_GE_130,
     reason="https://github.com/pandas-dev/pandas/issues/43209",
 )
 def test_raise_data_error():

diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
@@ -11,7 +11,7 @@
 import pytest
 
 import cudf
-from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_133, PANDAS_GE_200
+from cudf.core._compat import PANDAS_GE_133, PANDAS_GE_200
 from cudf.core.index import (
     CategoricalIndex,
     DatetimeIndex,
@@ -811,17 +811,6 @@ def test_index_difference(data, other, sort):
     gd_data = cudf.core.index.as_index(data)
     gd_other = cudf.core.index.as_index(other)
 
-    if (
-        gd_data.dtype.kind == "f"
-        and gd_other.dtype.kind != "f"
-        or (gd_data.dtype.kind != "f" and gd_other.dtype.kind == "f")
-    ):
-        pytest.mark.xfail(
-            condition=not PANDAS_GE_110,
-            reason="Bug in Pandas: "
-            "https://github.com/pandas-dev/pandas/issues/35217",
-        )
-
     expected = pd_data.difference(pd_other, sort=sort)
     actual = gd_data.difference(gd_other, sort=sort)
     assert_eq(expected, actual)
@@ -880,15 +869,6 @@ def test_index_equals(data, other):
     gd_data = cudf.core.index.as_index(data)
     gd_other = cudf.core.index.as_index(other)
 
-    if (
-        gd_data.dtype.kind == "f" or gd_other.dtype.kind == "f"
-    ) and cudf.utils.dtypes.is_mixed_with_object_dtype(gd_data, gd_other):
-        pytest.mark.xfail(
-            condition=not PANDAS_GE_110,
-            reason="Bug in Pandas: "
-            "https://github.com/pandas-dev/pandas/issues/35217",
-        )
-
     expected = pd_data.equals(pd_other)
     actual = gd_data.equals(gd_other)
     assert_eq(expected, actual)
@@ -935,17 +915,6 @@ def test_index_categories_equal(data, other):
     gd_data = cudf.core.index.as_index(data).astype("category")
     gd_other = cudf.core.index.as_index(other)
 
-    if (
-        gd_data.dtype.kind == "f"
-        and gd_other.dtype.kind != "f"
-        or (gd_data.dtype.kind != "f" and gd_other.dtype.kind == "f")
-    ):
-        pytest.mark.xfail(
-            condition=not PANDAS_GE_110,
-            reason="Bug in Pandas: "
-            "https://github.com/pandas-dev/pandas/issues/35217",
-        )
-
     expected = pd_data.equals(pd_other)
     actual = gd_data.equals(gd_other)
     assert_eq(expected, actual)