diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index bbff72722ab..9ecd461cf99 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -2,6 +2,7 @@ from __future__ import annotations +import warnings from collections import abc from functools import cached_property from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Tuple, cast @@ -990,7 +991,7 @@ def find_and_replace( replaced, to_replace_col, replacement_col ) - return column.build_categorical_column( + result = column.build_categorical_column( categories=new_cats["cats"], codes=column.build_column(output.base_data, dtype=output.dtype), mask=output.base_mask, @@ -998,6 +999,16 @@ def find_and_replace( size=output.size, ordered=self.dtype.ordered, ) + if result.dtype != self.dtype: + warnings.warn( + "The behavior of replace with " + "CategoricalDtype is deprecated. In a future version, replace " + "will only be used for cases that preserve the categories. " + "To change the categories, use ser.cat.rename_categories " + "instead.", + FutureWarning, + ) + return result def isnull(self) -> ColumnBase: """ diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 0f8f8de36a1..0b57f9fe846 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -57,13 +57,24 @@ def test_series_replace_all(gsr, to_replace, value): else: pd_value = value - actual = gsr.replace(to_replace=gd_to_replace, value=gd_value) - if pd_value is None: - # TODO: Remove this workaround once cudf - # introduces `no_default` values - expected = psr.replace(to_replace=pd_to_replace) - else: - expected = psr.replace(to_replace=pd_to_replace, value=pd_value) + with expect_warning_if( + isinstance(gsr.dtype, cudf.CategoricalDtype) + and isinstance(gd_to_replace, str) + and gd_to_replace == "one" + ): + actual = gsr.replace(to_replace=gd_to_replace, value=gd_value) + with expect_warning_if( + PANDAS_GE_220 + and isinstance(gsr.dtype, cudf.CategoricalDtype) + and isinstance(gd_to_replace, str) + and gd_to_replace == "one" + ): + if pd_value is None: + # TODO: Remove this workaround once cudf + # introduces `no_default` values + expected = psr.replace(to_replace=pd_to_replace) + else: + expected = psr.replace(to_replace=pd_to_replace, value=pd_value) assert_eq( expected.sort_values().reset_index(drop=True), @@ -82,16 +93,19 @@ def test_series_replace(): # Categorical psr3 = pd.Series(["one", "two", "three"], dtype="category") - psr4 = psr3.replace("one", "two") + with expect_warning_if(PANDAS_GE_220): + psr4 = psr3.replace("one", "two") sr3 = cudf.from_pandas(psr3) - sr4 = sr3.replace("one", "two") + with pytest.warns(FutureWarning): + sr4 = sr3.replace("one", "two") assert_eq( psr4.sort_values().reset_index(drop=True), sr4.sort_values().reset_index(drop=True), ) - - psr5 = psr3.replace("one", "five") - sr5 = sr3.replace("one", "five") + with expect_warning_if(PANDAS_GE_220): + psr5 = psr3.replace("one", "five") + with pytest.warns(FutureWarning): + sr5 = sr3.replace("one", "five") assert_eq(psr5, sr5) @@ -236,11 +250,26 @@ def test_dataframe_replace(df, to_replace, value): else: gd_to_replace = to_replace - if pd_value is None: - expected = pdf.replace(to_replace=pd_to_replace) - else: - expected = pdf.replace(to_replace=pd_to_replace, value=pd_value) - actual = gdf.replace(to_replace=gd_to_replace, value=gd_value) + with expect_warning_if( + PANDAS_GE_220 + and isinstance(df["a"].dtype, cudf.CategoricalDtype) + and isinstance(to_replace, str) + and to_replace == "two" + and isinstance(value, str) + and value == "three" + ): + if pd_value is None: + expected = pdf.replace(to_replace=pd_to_replace) + else: + expected = pdf.replace(to_replace=pd_to_replace, value=pd_value) + with expect_warning_if( + isinstance(df["a"].dtype, cudf.CategoricalDtype) + and isinstance(to_replace, str) + and to_replace == "two" + and isinstance(value, str) + and value == "three" + ): + actual = gdf.replace(to_replace=gd_to_replace, value=gd_value) expected_sorted = expected.sort_values(by=list(expected.columns), axis=0) actual_sorted = actual.sort_values(by=list(actual.columns), axis=0) @@ -1342,7 +1371,8 @@ def test_series_replace_errors(): ], ) def test_replace_nulls(gsr, old, new, expected): - actual = gsr.replace(old, new) + with expect_warning_if(isinstance(gsr.dtype, cudf.CategoricalDtype)): + actual = gsr.replace(old, new) assert_eq( expected.sort_values().reset_index(drop=True), actual.sort_values().reset_index(drop=True),