pandas-dev · mroeschke · Jun 25, 2024 · Apr 15, 2024 · Apr 16, 2024 · Apr 16, 2024
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -381,6 +381,7 @@ Other Removals
 - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`57627`)
 - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`57627`)
 - Enforced deprecation of the behavior of :func:`concat` when ``len(keys) != len(objs)`` would truncate to the shorter of the two. Now this raises a ``ValueError`` (:issue:`43485`)
+- Enforced deprecation of the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype` that would introduce new categories. Now this raises a ``TypeError`` (:issue:`58270`)
 - Enforced deprecation of values "pad", "ffill", "bfill", and "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`57869`)
 - Enforced deprecation removing :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`)
 - Enforced silent-downcasting deprecation for :ref:`all relevant methods <whatsnew_220.silent_downcasting>` (:issue:`54710`)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -10,7 +10,6 @@
     cast,
     overload,
 )
-import warnings
 
 import numpy as np
 
@@ -23,7 +22,6 @@
 )
 from pandas._libs.arrays import NDArrayBacked
 from pandas.compat.numpy import function as nv
-from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import validate_bool_kwarg
 
 from pandas.core.dtypes.cast import (
@@ -2673,62 +2671,6 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
         code_values = code_values[null_mask | (code_values >= 0)]
         return algorithms.isin(self.codes, code_values)
 
-    @overload
-    def _replace(self, *, to_replace, value, inplace: Literal[False] = ...) -> Self: ...
-
-    @overload
-    def _replace(self, *, to_replace, value, inplace: Literal[True]) -> None: ...
-
-    def _replace(self, *, to_replace, value, inplace: bool = False) -> Self | None:
-        from pandas import Index
-
-        orig_dtype = self.dtype
-
-        inplace = validate_bool_kwarg(inplace, "inplace")
-        cat = self if inplace else self.copy()
-
-        mask = isna(np.asarray(value))
-        if mask.any():
-            removals = np.asarray(to_replace)[mask]
-            removals = cat.categories[cat.categories.isin(removals)]
-            new_cat = cat.remove_categories(removals)
-            NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype)
-
-        ser = cat.categories.to_series()
-        ser = ser.replace(to_replace=to_replace, value=value)
-
-        all_values = Index(ser)
-
-        # GH51016: maintain order of existing categories
-        idxr = cat.categories.get_indexer_for(all_values)
-        locs = np.arange(len(ser))
-        locs = np.where(idxr == -1, locs, idxr)
-        locs = locs.argsort()
-
-        new_categories = ser.take(locs)
-        new_categories = new_categories.drop_duplicates(keep="first")
-        index_categories = Index(new_categories)
-        new_codes = recode_for_categories(
-            cat._codes, all_values, index_categories, copy=False
-        )
-        new_dtype = CategoricalDtype(index_categories, ordered=self.dtype.ordered)
-        NDArrayBacked.__init__(cat, new_codes, new_dtype)
-
-        if new_dtype != orig_dtype:
-            warnings.warn(
-                # GH#55147
-                "The behavior of Series.replace (and DataFrame.replace) with "
-                "CategoricalDtype is deprecated. In a future version, replace "
-                "will only be used for cases that preserve the categories. "
-                "To change the categories, use ser.cat.rename_categories "
-                "instead.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
-        if not inplace:
-            return cat
-        return None
-
     # ------------------------------------------------------------------------
     # String methods interface
     def _str_map(

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -100,7 +100,6 @@
 )
 from pandas.core.array_algos.transforms import shift
 from pandas.core.arrays import (
-    Categorical,
     DatetimeArray,
     ExtensionArray,
     IntervalArray,
@@ -696,14 +695,6 @@ def replace(
         #  go through replace_list
         values = self.values
 
-        if isinstance(values, Categorical):
-            # TODO: avoid special-casing
-            # GH49404
-            blk = self._maybe_copy(inplace)
-            values = cast(Categorical, blk.values)
-            values._replace(to_replace=to_replace, value=value, inplace=True)
-            return [blk]
-
         if not self._can_hold_element(to_replace):
             # We cannot hold `to_replace`, so we know immediately that
             #  replacing it is a no-op.
@@ -803,14 +794,6 @@ def replace_list(
         """
         values = self.values
 
-        if isinstance(values, Categorical):
-            # TODO: avoid special-casing
-            # GH49404
-            blk = self._maybe_copy(inplace)
-            values = cast(Categorical, blk.values)
-            values._replace(to_replace=src_list, value=dest_list, inplace=True)
-            return [blk]
-
         # Exclude anything that we know we won't contain
         pairs = [
             (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)

diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py
@@ -6,106 +6,66 @@
 
 
 @pytest.mark.parametrize(
-    "to_replace,value,expected,flip_categories",
+    "to_replace,value,expected",
     [
         # one-to-one
-        (1, 2, [2, 2, 3], False),
-        (1, 4, [4, 2, 3], False),
-        (4, 1, [1, 2, 3], False),
-        (5, 6, [1, 2, 3], False),
+        (4, 1, [1, 2, 3]),
+        (3, 1, [1, 2, 1]),
         # many-to-one
-        ([1], 2, [2, 2, 3], False),
-        ([1, 2], 3, [3, 3, 3], False),
-        ([1, 2], 4, [4, 4, 3], False),
-        ((1, 2, 4), 5, [5, 5, 3], False),
-        ((5, 6), 2, [1, 2, 3], False),
-        ([1], [2], [2, 2, 3], False),
-        ([1, 4], [5, 2], [5, 2, 3], False),
-        # GH49404: overlap between to_replace and value
-        ([1, 2, 3], [2, 3, 4], [2, 3, 4], False),
-        # GH50872, GH46884: replace with null
-        (1, None, [None, 2, 3], False),
-        (1, pd.NA, [None, 2, 3], False),
-        # check_categorical sorts categories, which crashes on mixed dtypes
-        (3, "4", [1, 2, "4"], False),
-        ([1, 2, "3"], "5", ["5", "5", 3], True),
+        ((5, 6), 2, [1, 2, 3]),
+        ((3, 2), 1, [1, 1, 1]),
     ],
 )
-@pytest.mark.filterwarnings(
-    "ignore:.*with CategoricalDtype is deprecated:FutureWarning"
-)
-def test_replace_categorical_series(to_replace, value, expected, flip_categories):
+def test_replace_categorical_series(to_replace, value, expected):
     # GH 31720
-
     ser = pd.Series([1, 2, 3], dtype="category")
     result = ser.replace(to_replace, value)
-    expected = pd.Series(expected, dtype="category")
-    ser.replace(to_replace, value, inplace=True)
-
-    if flip_categories:
-        expected = expected.cat.set_categories(expected.cat.categories[::-1])
-
-    tm.assert_series_equal(expected, result, check_category_order=False)
-    tm.assert_series_equal(expected, ser, check_category_order=False)
+    expected = pd.Series(Categorical(expected, categories=[1, 2, 3]))
+    tm.assert_series_equal(result, expected)
 
 
 @pytest.mark.parametrize(
-    "to_replace, value, result, expected_error_msg",
+    "to_replace,value",
     [
-        ("b", "c", ["a", "c"], "Categorical.categories are different"),
-        ("c", "d", ["a", "b"], None),
-        # https://github.com/pandas-dev/pandas/issues/33288
-        ("a", "a", ["a", "b"], None),
-        ("b", None, ["a", None], "Categorical.categories length are different"),
+        # one-to-one
+        (3, 5),
+        # many-to-one
+        ((3, 2), 5),
     ],
 )
-def test_replace_categorical(to_replace, value, result, expected_error_msg):
-    # GH#26988
-    cat = Categorical(["a", "b"])
-    expected = Categorical(result)
-    msg = (
-        r"The behavior of Series\.replace \(and DataFrame.replace\) "
-        "with CategoricalDtype"
-    )
-    warn = FutureWarning if expected_error_msg is not None else None
-    with tm.assert_produces_warning(warn, match=msg):
-        result = pd.Series(cat, copy=False).replace(to_replace, value)._values
+def test_replace_categorical_series_new_category_raises(to_replace, value):
+    # GH 31720
+    ser = pd.Series([1, 2, 3], dtype="category")
+    with pytest.raises(
+        TypeError, match="Cannot setitem on a Categorical with a new category"
+    ):
+        ser.replace(to_replace, value)
 
-    tm.assert_categorical_equal(result, expected)
-    if to_replace == "b":  # the "c" test is supposed to be unchanged
-        with pytest.raises(AssertionError, match=expected_error_msg):
-            # ensure non-inplace call does not affect original
-            tm.assert_categorical_equal(cat, expected)
 
-    ser = pd.Series(cat, copy=False)
-    with tm.assert_produces_warning(warn, match=msg):
-        ser.replace(to_replace, value, inplace=True)
-    tm.assert_categorical_equal(cat, expected)
+def test_replace_maintain_ordering():
+    # GH51016
+    dtype = pd.CategoricalDtype([0, 1, 2], ordered=True)
+    ser = pd.Series([0, 1, 2], dtype=dtype)
+    result = ser.replace(0, 2)
+    expected = pd.Series([2, 1, 2], dtype=dtype)
+    tm.assert_series_equal(expected, result, check_category_order=True)
 
 
 def test_replace_categorical_ea_dtype():
     # GH49404
-    cat = Categorical(pd.array(["a", "b"], dtype="string"))
-    msg = (
-        r"The behavior of Series\.replace \(and DataFrame.replace\) "
-        "with CategoricalDtype"
+    cat = Categorical(pd.array(["a", "b", "c"], dtype="string"))
+    result = pd.Series(cat).replace(["a", "b"], ["c", "c"])._values
+    expected = Categorical(
+        pd.array(["c"] * 3, dtype="string"),
+        categories=pd.array(["a", "b", "c"], dtype="string"),
     )
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values
-    expected = Categorical(pd.array(["c", pd.NA], dtype="string"))
     tm.assert_categorical_equal(result, expected)
 
 
-def test_replace_maintain_ordering():
-    # GH51016
-    dtype = pd.CategoricalDtype([0, 1, 2], ordered=True)
-    ser = pd.Series([0, 1, 2], dtype=dtype)
-    msg = (
-        r"The behavior of Series\.replace \(and DataFrame.replace\) "
-        "with CategoricalDtype"
-    )
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = ser.replace(0, 2)
-    expected_dtype = pd.CategoricalDtype([1, 2], ordered=True)
-    expected = pd.Series([2, 1, 2], dtype=expected_dtype)
-    tm.assert_series_equal(expected, result, check_category_order=True)
+def test_replace_categorical_ea_dtype_different_cats_raises():
+    # GH49404
+    cat = Categorical(pd.array(["a", "b"], dtype="string"))
+    with pytest.raises(
+        TypeError, match="Cannot setitem on a Categorical with a new category"
+    ):
+        pd.Series(cat).replace(["a", "b"], ["c", pd.NA])
diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py
@@ -129,18 +129,14 @@ def test_replace_to_replace_wrong_dtype():
 def test_replace_list_categorical():
     df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
     arr = get_array(df, "a")
-    msg = (
-        r"The behavior of Series\.replace \(and DataFrame.replace\) "
-        "with CategoricalDtype"
-    )
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        df.replace(["c"], value="a", inplace=True)
+
+    df.replace(["c"], value="a", inplace=True)
     assert np.shares_memory(arr.codes, get_array(df, "a").codes)
     assert df._mgr._has_no_reference(0)
 
     df_orig = df.copy()
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        df2 = df.replace(["b"], value="a")
+    df.replace(["b"], value="a")
+    df2 = df.apply(lambda x: x.cat.rename_categories({"b": "d"}))
     assert not np.shares_memory(arr.codes, get_array(df2, "a").codes)
 
     tm.assert_frame_equal(df, df_orig)
@@ -150,13 +146,7 @@ def test_replace_list_inplace_refs_categorical():
     df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
     view = df[:]
     df_orig = df.copy()
-    msg = (
-        r"The behavior of Series\.replace \(and DataFrame.replace\) "
-        "with CategoricalDtype"
-    )
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        df.replace(["c"], value="a", inplace=True)
-    assert not np.shares_memory(get_array(view, "a").codes, get_array(df, "a").codes)
+    df.replace(["c"], value="a", inplace=True)
     tm.assert_frame_equal(df_orig, view)
 
 
@@ -195,56 +185,34 @@ def test_replace_inplace_reference_no_op(to_replace):
 
 
 @pytest.mark.parametrize("to_replace", [1, [1]])
-@pytest.mark.parametrize("val", [1, 1.5])
-def test_replace_categorical_inplace_reference(val, to_replace):
+def test_replace_categorical_inplace_reference(to_replace):
     df = DataFrame({"a": Categorical([1, 2, 3])})
     df_orig = df.copy()
     arr_a = get_array(df, "a")
     view = df[:]
-    msg = (
-        r"The behavior of Series\.replace \(and DataFrame.replace\) "
-        "with CategoricalDtype"
-    )
-    warn = FutureWarning if val == 1.5 else None
-    with tm.assert_produces_warning(warn, match=msg):
-        df.replace(to_replace=to_replace, value=val, inplace=True)
-
+    df.replace(to_replace=to_replace, value=1, inplace=True)
     assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
     assert df._mgr._has_no_reference(0)
     assert view._mgr._has_no_reference(0)
     tm.assert_frame_equal(view, df_orig)
 
 
-@pytest.mark.parametrize("val", [1, 1.5])
-def test_replace_categorical_inplace(val):
+def test_replace_categorical_inplace():
     df = DataFrame({"a": Categorical([1, 2, 3])})
     arr_a = get_array(df, "a")
-    msg = (
-        r"The behavior of Series\.replace \(and DataFrame.replace\) "
-        "with CategoricalDtype"
-    )
-    warn = FutureWarning if val == 1.5 else None
-    with tm.assert_produces_warning(warn, match=msg):
-        df.replace(to_replace=1, value=val, inplace=True)
+    df.replace(to_replace=1, value=1, inplace=True)
 
     assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
     assert df._mgr._has_no_reference(0)
 
-    expected = DataFrame({"a": Categorical([val, 2, 3])})
+    expected = DataFrame({"a": Categorical([1, 2, 3])})
     tm.assert_frame_equal(df, expected)
 
 
-@pytest.mark.parametrize("val", [1, 1.5])
-def test_replace_categorical(val):
+def test_replace_categorical():
     df = DataFrame({"a": Categorical([1, 2, 3])})
     df_orig = df.copy()
-    msg = (
-        r"The behavior of Series\.replace \(and DataFrame.replace\) "
-        "with CategoricalDtype"
-    )
-    warn = FutureWarning if val == 1.5 else None
-    with tm.assert_produces_warning(warn, match=msg):
-        df2 = df.replace(to_replace=1, value=val)
+    df2 = df.replace(to_replace=1, value=1)
 
     assert df._mgr._has_no_reference(0)
     assert df2._mgr._has_no_reference(0)