Change cross-pandas-version testing in cudf (#15145)

This PR removes redundant version checks in a lot of pytests. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) URL: #15145
rapidsai · Mar 13, 2024 · fe9642b · fe9642b
1 parent 39a365b
commit fe9642b
Show file tree

Hide file tree

Showing 26 changed files with 310 additions and 245 deletions.
diff --git a/docs/cudf/source/developer_guide/testing.md b/docs/cudf/source/developer_guide/testing.md
@@ -249,3 +249,22 @@ In particular:
 - `testing._utils.assert_eq` is the biggest hammer to reach for. It can be used to compare any pair of objects.
 - For comparing specific objects, use `testing.testing.assert_[frame|series|index]_equal`.
 - For verifying that the expected assertions are raised, use `testing._utils.assert_exceptions_equal`.
+
+
+### Version testing
+
+It is recommended to have `cudf` pytests only work on the latest supported pandas version i.e., `PANDAS_CURRENT_SUPPORTED_VERSION`. Any anticipated failures should be either `skipped` or `xfailed`.
+
+For example:
+
+```python
+@pytest.mark.skipif(PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, reason="bug in older version of pandas")
+def test_bug_from_older_pandas_versions(...):
+    ...
+
+@pytest.mark.xfail(PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, reason="bug in latest version of pandas")
+def test_bug_in_current_and_maybe_future_versions(...):
+    ...
+```
+
+If pandas makes a bugfix release and fixes this, then we'll see it in CI immediately, patch it, and bump `PANDAS_CURRENT_SUPPORTED_VERSION` which also usually happens during pandas upgrades.
diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py
@@ -3,12 +3,10 @@
 import pandas as pd
 from packaging import version
 
+PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.1")
 PANDAS_VERSION = version.parse(pd.__version__)
-PANDAS_EQ_200 = PANDAS_VERSION == version.parse("2.0.0")
-PANDAS_GE_200 = PANDAS_VERSION >= version.parse("2.0.0")
-PANDAS_GE_201 = PANDAS_VERSION >= version.parse("2.0.1")
+
+
 PANDAS_GE_210 = PANDAS_VERSION >= version.parse("2.1.0")
-PANDAS_GE_214 = PANDAS_VERSION >= version.parse("2.1.4")
-PANDAS_LT_203 = PANDAS_VERSION < version.parse("2.0.3")
 PANDAS_GE_220 = PANDAS_VERSION >= version.parse("2.2.0")
 PANDAS_LT_300 = PANDAS_VERSION < version.parse("3.0.0")
diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py
@@ -5,7 +5,7 @@
 import pytest
 
 import cudf
-from cudf.core._compat import PANDAS_GE_210
+from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.core.index import IntervalIndex, interval_range
 from cudf.testing._utils import assert_eq
 
@@ -315,8 +315,8 @@ def test_interval_index_from_breaks(closed):
             1.0,
             0.2,
             None,
-            marks=pytest.mark.xfail(
-                condition=not PANDAS_GE_210,
+            marks=pytest.mark.skipif(
+                condition=PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
                 reason="https://github.com/pandas-dev/pandas/pull/54477",
             ),
         ),
@@ -327,7 +327,7 @@ def test_interval_index_from_breaks(closed):
             0.1,
             None,
             marks=pytest.mark.xfail(
-                condition=not PANDAS_GE_210,
+                condition=PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
                 reason="https://github.com/pandas-dev/pandas/pull/54477",
             ),
         ),

diff --git a/python/cudf/cudf/tests/test_api_types.py b/python/cudf/cudf/tests/test_api_types.py
@@ -7,8 +7,7 @@
 
 import cudf
 from cudf.api import types
-from cudf.core._compat import PANDAS_GE_210, PANDAS_GE_214, PANDAS_GE_220
-from cudf.testing._utils import expect_warning_if
+from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 
 
 @pytest.mark.parametrize(
@@ -499,8 +498,22 @@ def test_is_integer(obj, expect):
         (pd.Series(dtype="int"), False),
         (pd.Series(dtype="float"), False),
         (pd.Series(dtype="complex"), False),
-        (pd.Series(dtype="str"), PANDAS_GE_220),
-        (pd.Series(dtype="unicode"), PANDAS_GE_220),
+        pytest.param(
+            pd.Series(dtype="str"),
+            True,
+            marks=pytest.mark.skipif(
+                PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+                reason="bug in previous pandas versions",
+            ),
+        ),
+        pytest.param(
+            pd.Series(dtype="unicode"),
+            True,
+            marks=pytest.mark.skipif(
+                PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+                reason="bug in previous pandas versions",
+            ),
+        ),
         (pd.Series(dtype="datetime64[s]"), False),
         (pd.Series(dtype="timedelta64[s]"), False),
         (pd.Series(dtype="category"), False),
@@ -964,6 +977,10 @@ def test_is_decimal_dtype(obj, expect):
     assert types.is_decimal_dtype(obj) == expect
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="inconsistent warnings in older pandas versions",
+)
 @pytest.mark.parametrize(
     "obj",
     (
@@ -1037,9 +1054,7 @@ def test_is_decimal_dtype(obj, expect):
     ),
 )
 def test_pandas_agreement(obj):
-    with expect_warning_if(
-        PANDAS_GE_210, DeprecationWarning if PANDAS_GE_214 else FutureWarning
-    ):
+    with pytest.warns(DeprecationWarning):
         expected = pd_types.is_categorical_dtype(obj)
     with pytest.warns(DeprecationWarning):
         actual = types.is_categorical_dtype(obj)

diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py
@@ -3,10 +3,14 @@
 import pytest
 
 from cudf import NA, DataFrame
-from cudf.core._compat import PANDAS_GE_210, PANDAS_GE_220
+from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.testing import _utils as utils
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="warning not present in all versions of pandas",
+)
 @pytest.mark.parametrize(
     "data",
     [
@@ -29,15 +33,15 @@
 def test_applymap_dataframe(data, func, na_action, request):
     request.applymarker(
         pytest.mark.xfail(
-            PANDAS_GE_220
+            PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION
             and request.node.callspec.id == "None-<lambda>2-data3",
             reason="https://github.com/pandas-dev/pandas/issues/57390",
         )
     )
     gdf = DataFrame(data)
     pdf = gdf.to_pandas(nullable=True)
 
-    with utils.expect_warning_if(PANDAS_GE_210):
+    with pytest.warns(FutureWarning):
         expect = pdf.applymap(func, na_action=na_action)
     with pytest.warns(FutureWarning):
         got = gdf.applymap(func, na_action=na_action)

diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py
@@ -10,7 +10,11 @@
 import pytest
 
 import cudf
-from cudf.core._compat import PANDAS_GE_210, PANDAS_LT_300
+from cudf.core._compat import (
+    PANDAS_CURRENT_SUPPORTED_VERSION,
+    PANDAS_LT_300,
+    PANDAS_VERSION,
+)
 from cudf.testing._utils import (
     assert_eq,
     expect_warning_if,
@@ -143,6 +147,10 @@ def test_binary_ufunc_index_array(ufunc, reflect):
             assert_eq(got, expect, check_exact=False)
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="warning not present in older pandas versions",
+)
 @pytest.mark.parametrize("ufunc", _UFUNCS)
 @pytest.mark.parametrize("has_nulls", [True, False])
 @pytest.mark.parametrize("indexed", [True, False])
@@ -231,8 +239,7 @@ def test_ufunc_series(request, ufunc, has_nulls, indexed):
     else:
         if has_nulls:
             with expect_warning_if(
-                PANDAS_GE_210
-                and fname
+                fname
                 in (
                     "isfinite",
                     "isinf",
@@ -351,6 +358,10 @@ def test_ufunc_cudf_series_error_with_out_kwarg(func):
 
 
 # Skip matmul since it requires aligned shapes.
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="warning not present in older pandas versions",
+)
 @pytest.mark.parametrize("ufunc", (uf for uf in _UFUNCS if uf != np.matmul))
 @pytest.mark.parametrize("has_nulls", [True, False])
 @pytest.mark.parametrize("indexed", [True, False])
@@ -431,8 +442,7 @@ def test_ufunc_dataframe(request, ufunc, has_nulls, indexed):
     else:
         if has_nulls:
             with expect_warning_if(
-                PANDAS_GE_210
-                and fname
+                fname
                 in (
                     "isfinite",
                     "isinf",

diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
@@ -13,7 +13,7 @@
 
 import cudf
 from cudf import Series
-from cudf.core._compat import PANDAS_GE_220
+from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.core.buffer.spill_manager import get_global_manager
 from cudf.core.index import as_index
 from cudf.testing import _utils as utils
@@ -829,7 +829,7 @@ def test_operator_func_series_and_scalar_logical(
 ):
     request.applymarker(
         pytest.mark.xfail(
-            PANDAS_GE_220
+            PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION
             and fill_value == 1.0
             and scalar is np.nan
             and (has_nulls or (not has_nulls and func not in {"eq", "ne"})),
@@ -1719,7 +1719,7 @@ def test_datetime_dateoffset_binaryop(
 ):
     request.applymarker(
         pytest.mark.xfail(
-            PANDAS_GE_220
+            PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION
             and dtype in {"datetime64[ms]", "datetime64[s]"}
             and frequency == "microseconds"
             and n_periods == 0,
@@ -1829,7 +1829,7 @@ def test_datetime_dateoffset_binaryop_reflected(n_periods, frequency, dtype):
 
     # TODO: Remove check_dtype once we get some clarity on:
     # https://github.com/pandas-dev/pandas/issues/57448
-    utils.assert_eq(expect, got, check_dtype=not PANDAS_GE_220)
+    utils.assert_eq(expect, got, check_dtype=False)
 
     with pytest.raises(TypeError):
         poffset - psr

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
@@ -17,7 +17,7 @@
 
 import cudf
 from cudf import read_csv
-from cudf.core._compat import PANDAS_GE_200
+from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.testing._utils import assert_eq, assert_exceptions_equal
 
 
@@ -344,6 +344,10 @@ def test_csv_reader_dtype_extremes(use_names):
     assert_eq(gdf, pdf)
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="https://github.com/pandas-dev/pandas/issues/52449",
+)
 def test_csv_reader_skiprows_skipfooter(tmpdir, pd_mixed_dataframe):
     fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file5.csv")
 
@@ -372,12 +376,8 @@ def test_csv_reader_skiprows_skipfooter(tmpdir, pd_mixed_dataframe):
 
     assert len(out.columns) == len(df_out.columns)
     assert len(out) == len(df_out)
-    if PANDAS_GE_200:
-        # TODO: Remove typecast to `ns` after following
-        # issue is fixed:
-        # https://github.com/pandas-dev/pandas/issues/52449
-        out["2"] = out["2"].astype("datetime64[ns]")
-    assert_eq(df_out, out)
+
+    assert_eq(df_out, out, check_dtype=False)
 
 
 def test_csv_reader_negative_vals(tmpdir):

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
@@ -26,7 +26,7 @@
 
 import cudf
 from cudf.api.extensions import no_default
-from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210
+from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.core.buffer.spill_manager import get_global_manager
 from cudf.core.column import column
 from cudf.errors import MixedTypeError
@@ -1347,11 +1347,7 @@ def test_dataframe_setitem_from_masked_object():
 def test_dataframe_append_to_empty():
     pdf = pd.DataFrame()
     pdf["a"] = []
-    if PANDAS_GE_200:
-        # TODO: Remove this workaround after
-        # the following bug is fixed:
-        # https://github.com/pandas-dev/pandas/issues/56679
-        pdf["a"] = pdf["a"].astype("str")
+    pdf["a"] = pdf["a"].astype("str")
     pdf["b"] = [1, 2, 3]
 
     gdf = cudf.DataFrame()
@@ -6724,7 +6720,8 @@ def test_dataframe_init_from_arrays_cols(data, cols, index):
 def test_dataframe_assign_scalar(request, col_data, assign_val):
     request.applymarker(
         pytest.mark.xfail(
-            condition=PANDAS_GE_200 and len(col_data) == 0,
+            condition=PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION
+            and len(col_data) == 0,
             reason="https://github.com/pandas-dev/pandas/issues/56679",
         )
     )
@@ -9970,6 +9967,10 @@ def test_dataframe_rename_duplicate_column():
 
 
 @pytest_unmark_spilling
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="warning not present in older pandas versions",
+)
 @pytest.mark.parametrize(
     "data",
     [
@@ -9990,8 +9991,7 @@ def test_dataframe_pct_change(data, periods, fill_method):
     with expect_warning_if(fill_method is not no_default):
         actual = gdf.pct_change(periods=periods, fill_method=fill_method)
     with expect_warning_if(
-        PANDAS_GE_210
-        and (fill_method is not no_default or pdf.isna().any().any())
+        fill_method is not no_default or pdf.isna().any().any()
     ):
         expected = pdf.pct_change(periods=periods, fill_method=fill_method)