From f6c00ff376a7affe561e44f4c1af09f717262016 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 20 Feb 2024 05:11:08 -1000 Subject: [PATCH] Deprecate datelike isin casting strings to dates to match pandas 2.2 (#15046) Matching https://github.com/pandas-dev/pandas/pull/56427 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15046 --- python/cudf/cudf/core/tools/datetimes.py | 10 +++++++ python/cudf/cudf/tests/test_index.py | 34 ++++++++++++------------ python/cudf/cudf/tests/test_series.py | 8 ++++-- 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 529296da6a2..0e0df4ecf6e 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -767,10 +767,20 @@ def _isin_datetimelike( rhs = None try: rhs = cudf.core.column.as_column(values) + was_string = len(rhs) and rhs.dtype.kind == "O" if rhs.dtype.kind in {"f", "i", "u"}: return cudf.core.column.full(len(lhs), False, dtype="bool") rhs = rhs.astype(lhs.dtype) + if was_string: + warnings.warn( + f"The behavior of 'isin' with dtype={lhs.dtype} and " + "castable values (e.g. strings) is deprecated. In a " + "future version, these will not be considered matching " + "by isin. Explicitly cast to the appropriate dtype before " + "calling isin instead.", + FutureWarning, + ) res = lhs._isin_earlystop(rhs) if res is not None: return res diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 3cbfea8063f..defd42b3d00 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2497,19 +2497,12 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): @pytest.mark.parametrize( - "data", + "index", [ - [], - pd.Series( - ["this", "is", None, "a", "test"], index=["a", "b", "c", "d", "e"] - ), - pd.Series([0, 15, 10], index=[0, None, 9]), - pd.Series( - range(25), - index=pd.date_range( - start="2019-01-01", end="2019-01-02", freq="h" - ), - ), + pd.Index([]), + pd.Index(["a", "b", "c", "d", "e"]), + pd.Index([0, None, 9]), + pd.date_range("2019-01-01", periods=3), ], ) @pytest.mark.parametrize( @@ -2521,12 +2514,19 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): ["2019-01-01 04:00:00", "2019-01-01 06:00:00", "2018-03-02 10:00:00"], ], ) -def test_isin_index(data, values): - psr = pd.Series(data) - gsr = cudf.Series.from_pandas(psr) +def test_isin_index(index, values): + pidx = index + gidx = cudf.Index.from_pandas(pidx) - got = gsr.index.isin(values) - expected = psr.index.isin(values) + is_dt_str = ( + next(iter(values), None) == "2019-01-01 04:00:00" + and len(pidx) + and pidx.dtype.kind == "M" + ) + with expect_warning_if(is_dt_str): + got = gidx.isin(values) + with expect_warning_if(PANDAS_GE_220 and is_dt_str): + expected = pidx.isin(values) assert_eq(got, expected) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 14006f90b45..252343391be 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -15,6 +15,7 @@ import cudf from cudf.api.extensions import no_default +from cudf.core._compat import PANDAS_GE_220 from cudf.errors import MixedTypeError from cudf.testing._utils import ( NUMERIC_TYPES, @@ -1795,8 +1796,11 @@ def test_isin_datetime(data, values): psr = pd.Series(data) gsr = cudf.Series.from_pandas(psr) - got = gsr.isin(values) - expected = psr.isin(values) + is_len_str = isinstance(next(iter(values), None), str) and len(data) + with expect_warning_if(is_len_str): + got = gsr.isin(values) + with expect_warning_if(PANDAS_GE_220 and is_len_str): + expected = psr.isin(values) assert_eq(got, expected)