From 2f6d45643b7624636620b6f915a47afb8109b8cf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 13 Feb 2024 15:21:25 -0800 Subject: [PATCH 1/3] Deprecate datelike isin casting strings to dates to match pandas 2.2 --- python/cudf/cudf/core/tools/datetimes.py | 9 +++++++++ python/cudf/cudf/tests/test_series.py | 8 ++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 529296da6a2..79d6e2fa72a 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -767,6 +767,15 @@ def _isin_datetimelike( rhs = None try: rhs = cudf.core.column.as_column(values) + if len(rhs) and rhs.dtype.kind == "O": + warnings.warn( + f"The behavior of 'isin' with dtype={lhs.dtype} and " + "castable values (e.g. strings) is deprecated. In a " + "future version, these will not be considered matching " + "by isin. Explicitly cast to the appropriate dtype before " + "calling isin instead.", + FutureWarning, + ) if rhs.dtype.kind in {"f", "i", "u"}: return cudf.core.column.full(len(lhs), False, dtype="bool") diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 14006f90b45..80ad388071b 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -15,6 +15,7 @@ import cudf from cudf.api.extensions import no_default +from cudf.core._compat import PANDAS_GE_220 from cudf.errors import MixedTypeError from cudf.testing._utils import ( NUMERIC_TYPES, @@ -1795,8 +1796,11 @@ def test_isin_datetime(data, values): psr = pd.Series(data) gsr = cudf.Series.from_pandas(psr) - got = gsr.isin(values) - expected = psr.isin(values) + is_str = isinstance(next(iter(values), None), str) + with expect_warning_if(is_str and len(data)): + got = gsr.isin(values) + with expect_warning_if(PANDAS_GE_220 and is_str and len(data)): + expected = psr.isin(values) assert_eq(got, expected) From 31287c152bdd2a86df8ac9344cd28b3daec5dc21 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 13 Feb 2024 15:23:23 -0800 Subject: [PATCH 2/3] Combine variable --- python/cudf/cudf/tests/test_series.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 80ad388071b..252343391be 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1796,10 +1796,10 @@ def test_isin_datetime(data, values): psr = pd.Series(data) gsr = cudf.Series.from_pandas(psr) - is_str = isinstance(next(iter(values), None), str) - with expect_warning_if(is_str and len(data)): + is_len_str = isinstance(next(iter(values), None), str) and len(data) + with expect_warning_if(is_len_str): got = gsr.isin(values) - with expect_warning_if(PANDAS_GE_220 and is_str and len(data)): + with expect_warning_if(PANDAS_GE_220 and is_len_str): expected = psr.isin(values) assert_eq(got, expected) From 8b7b499aab0b060c2e9b30b58d0b3f3f63192593 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 14 Feb 2024 14:15:28 -0800 Subject: [PATCH 3/3] Only raise if conversion was successful --- python/cudf/cudf/core/tools/datetimes.py | 11 ++++---- python/cudf/cudf/tests/test_index.py | 34 ++++++++++++------------ 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 79d6e2fa72a..0e0df4ecf6e 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -767,7 +767,12 @@ def _isin_datetimelike( rhs = None try: rhs = cudf.core.column.as_column(values) - if len(rhs) and rhs.dtype.kind == "O": + was_string = len(rhs) and rhs.dtype.kind == "O" + + if rhs.dtype.kind in {"f", "i", "u"}: + return cudf.core.column.full(len(lhs), False, dtype="bool") + rhs = rhs.astype(lhs.dtype) + if was_string: warnings.warn( f"The behavior of 'isin' with dtype={lhs.dtype} and " "castable values (e.g. strings) is deprecated. In a " @@ -776,10 +781,6 @@ def _isin_datetimelike( "calling isin instead.", FutureWarning, ) - - if rhs.dtype.kind in {"f", "i", "u"}: - return cudf.core.column.full(len(lhs), False, dtype="bool") - rhs = rhs.astype(lhs.dtype) res = lhs._isin_earlystop(rhs) if res is not None: return res diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 3cbfea8063f..defd42b3d00 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2497,19 +2497,12 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): @pytest.mark.parametrize( - "data", + "index", [ - [], - pd.Series( - ["this", "is", None, "a", "test"], index=["a", "b", "c", "d", "e"] - ), - pd.Series([0, 15, 10], index=[0, None, 9]), - pd.Series( - range(25), - index=pd.date_range( - start="2019-01-01", end="2019-01-02", freq="h" - ), - ), + pd.Index([]), + pd.Index(["a", "b", "c", "d", "e"]), + pd.Index([0, None, 9]), + pd.date_range("2019-01-01", periods=3), ], ) @pytest.mark.parametrize( @@ -2521,12 +2514,19 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): ["2019-01-01 04:00:00", "2019-01-01 06:00:00", "2018-03-02 10:00:00"], ], ) -def test_isin_index(data, values): - psr = pd.Series(data) - gsr = cudf.Series.from_pandas(psr) +def test_isin_index(index, values): + pidx = index + gidx = cudf.Index.from_pandas(pidx) - got = gsr.index.isin(values) - expected = psr.index.isin(values) + is_dt_str = ( + next(iter(values), None) == "2019-01-01 04:00:00" + and len(pidx) + and pidx.dtype.kind == "M" + ) + with expect_warning_if(is_dt_str): + got = gidx.isin(values) + with expect_warning_if(PANDAS_GE_220 and is_dt_str): + expected = pidx.isin(values) assert_eq(got, expected)