Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLN: enforce deprecation of the Series[categorical].replace special-casing #58270

1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,7 @@ Other Removals
- Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`57627`)
- Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`57627`)
- Enforced deprecation of the behavior of :func:`concat` when ``len(keys) != len(objs)`` would truncate to the shorter of the two. Now this raises a ``ValueError`` (:issue:`43485`)
- Enforced deprecation of the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype` that would introduce new categories (:issue:`58270`)
- Enforced deprecation of values "pad", "ffill", "bfill", and "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`57869`)
- Enforced deprecation removing :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`)
- Enforced silent-downcasting deprecation for :ref:`all relevant methods <whatsnew_220.silent_downcasting>` (:issue:`54710`)
Expand Down
58 changes: 0 additions & 58 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
cast,
overload,
)
import warnings

import numpy as np

Expand All @@ -23,7 +22,6 @@
)
from pandas._libs.arrays import NDArrayBacked
from pandas.compat.numpy import function as nv
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -2673,62 +2671,6 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
code_values = code_values[null_mask | (code_values >= 0)]
return algorithms.isin(self.codes, code_values)

@overload
def _replace(self, *, to_replace, value, inplace: Literal[False] = ...) -> Self: ...

@overload
def _replace(self, *, to_replace, value, inplace: Literal[True]) -> None: ...

def _replace(self, *, to_replace, value, inplace: bool = False) -> Self | None:
from pandas import Index

orig_dtype = self.dtype

inplace = validate_bool_kwarg(inplace, "inplace")
cat = self if inplace else self.copy()

mask = isna(np.asarray(value))
if mask.any():
removals = np.asarray(to_replace)[mask]
removals = cat.categories[cat.categories.isin(removals)]
new_cat = cat.remove_categories(removals)
NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype)

ser = cat.categories.to_series()
ser = ser.replace(to_replace=to_replace, value=value)

all_values = Index(ser)

# GH51016: maintain order of existing categories
idxr = cat.categories.get_indexer_for(all_values)
locs = np.arange(len(ser))
locs = np.where(idxr == -1, locs, idxr)
locs = locs.argsort()

new_categories = ser.take(locs)
new_categories = new_categories.drop_duplicates(keep="first")
index_categories = Index(new_categories)
new_codes = recode_for_categories(
cat._codes, all_values, index_categories, copy=False
)
new_dtype = CategoricalDtype(index_categories, ordered=self.dtype.ordered)
NDArrayBacked.__init__(cat, new_codes, new_dtype)

if new_dtype != orig_dtype:
warnings.warn(
# GH#55147
"The behavior of Series.replace (and DataFrame.replace) with "
"CategoricalDtype is deprecated. In a future version, replace "
"will only be used for cases that preserve the categories. "
"To change the categories, use ser.cat.rename_categories "
"instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
if not inplace:
return cat
return None

# ------------------------------------------------------------------------
# String methods interface
def _str_map(
Expand Down
17 changes: 0 additions & 17 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,6 @@
)
from pandas.core.array_algos.transforms import shift
from pandas.core.arrays import (
Categorical,
DatetimeArray,
ExtensionArray,
IntervalArray,
Expand Down Expand Up @@ -696,14 +695,6 @@ def replace(
# go through replace_list
values = self.values

if isinstance(values, Categorical):
# TODO: avoid special-casing
# GH49404
blk = self._maybe_copy(inplace)
values = cast(Categorical, blk.values)
values._replace(to_replace=to_replace, value=value, inplace=True)
return [blk]

if not self._can_hold_element(to_replace):
# We cannot hold `to_replace`, so we know immediately that
# replacing it is a no-op.
Expand Down Expand Up @@ -803,14 +794,6 @@ def replace_list(
"""
values = self.values

if isinstance(values, Categorical):
# TODO: avoid special-casing
# GH49404
blk = self._maybe_copy(inplace)
values = cast(Categorical, blk.values)
values._replace(to_replace=src_list, value=dest_list, inplace=True)
return [blk]

# Exclude anything that we know we won't contain
pairs = [
(x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)
Expand Down
118 changes: 39 additions & 79 deletions pandas/tests/arrays/categorical/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,106 +6,66 @@


@pytest.mark.parametrize(
"to_replace,value,expected,flip_categories",
"to_replace,value,expected",
[
# one-to-one
(1, 2, [2, 2, 3], False),
(1, 4, [4, 2, 3], False),
(4, 1, [1, 2, 3], False),
(5, 6, [1, 2, 3], False),
(4, 1, [1, 2, 3]),
(3, 1, [1, 2, 1]),
# many-to-one
([1], 2, [2, 2, 3], False),
([1, 2], 3, [3, 3, 3], False),
([1, 2], 4, [4, 4, 3], False),
((1, 2, 4), 5, [5, 5, 3], False),
((5, 6), 2, [1, 2, 3], False),
([1], [2], [2, 2, 3], False),
([1, 4], [5, 2], [5, 2, 3], False),
# GH49404: overlap between to_replace and value
([1, 2, 3], [2, 3, 4], [2, 3, 4], False),
# GH50872, GH46884: replace with null
(1, None, [None, 2, 3], False),
(1, pd.NA, [None, 2, 3], False),
# check_categorical sorts categories, which crashes on mixed dtypes
(3, "4", [1, 2, "4"], False),
([1, 2, "3"], "5", ["5", "5", 3], True),
((5, 6), 2, [1, 2, 3]),
((3, 2), 1, [1, 1, 1]),
],
)
@pytest.mark.filterwarnings(
"ignore:.*with CategoricalDtype is deprecated:FutureWarning"
)
def test_replace_categorical_series(to_replace, value, expected, flip_categories):
def test_replace_categorical_series(to_replace, value, expected):
# GH 31720

ser = pd.Series([1, 2, 3], dtype="category")
result = ser.replace(to_replace, value)
expected = pd.Series(expected, dtype="category")
ser.replace(to_replace, value, inplace=True)

if flip_categories:
expected = expected.cat.set_categories(expected.cat.categories[::-1])

tm.assert_series_equal(expected, result, check_category_order=False)
tm.assert_series_equal(expected, ser, check_category_order=False)
expected = pd.Series(Categorical(expected, categories=[1, 2, 3]))
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"to_replace, value, result, expected_error_msg",
"to_replace,value",
[
("b", "c", ["a", "c"], "Categorical.categories are different"),
("c", "d", ["a", "b"], None),
# https://github.com/pandas-dev/pandas/issues/33288
("a", "a", ["a", "b"], None),
("b", None, ["a", None], "Categorical.categories length are different"),
# one-to-one
(3, 5),
# many-to-one
((3, 2), 5),
],
)
def test_replace_categorical(to_replace, value, result, expected_error_msg):
# GH#26988
cat = Categorical(["a", "b"])
expected = Categorical(result)
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
warn = FutureWarning if expected_error_msg is not None else None
with tm.assert_produces_warning(warn, match=msg):
result = pd.Series(cat, copy=False).replace(to_replace, value)._values
def test_replace_categorical_series_new_category_raises(to_replace, value):
# GH 31720
ser = pd.Series([1, 2, 3], dtype="category")
with pytest.raises(
TypeError, match="Cannot setitem on a Categorical with a new category"
):
ser.replace(to_replace, value)

tm.assert_categorical_equal(result, expected)
if to_replace == "b": # the "c" test is supposed to be unchanged
with pytest.raises(AssertionError, match=expected_error_msg):
# ensure non-inplace call does not affect original
tm.assert_categorical_equal(cat, expected)

ser = pd.Series(cat, copy=False)
with tm.assert_produces_warning(warn, match=msg):
ser.replace(to_replace, value, inplace=True)
tm.assert_categorical_equal(cat, expected)
def test_replace_maintain_ordering():
# GH51016
dtype = pd.CategoricalDtype([0, 1, 2], ordered=True)
ser = pd.Series([0, 1, 2], dtype=dtype)
result = ser.replace(0, 2)
expected = pd.Series([2, 1, 2], dtype=dtype)
tm.assert_series_equal(expected, result, check_category_order=True)


def test_replace_categorical_ea_dtype():
# GH49404
cat = Categorical(pd.array(["a", "b"], dtype="string"))
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
cat = Categorical(pd.array(["a", "b", "c"], dtype="string"))
result = pd.Series(cat).replace(["a", "b"], ["c", "c"])._values
expected = Categorical(
pd.array(["c"] * 3, dtype="string"),
categories=pd.array(["a", "b", "c"], dtype="string"),
)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values
expected = Categorical(pd.array(["c", pd.NA], dtype="string"))
tm.assert_categorical_equal(result, expected)


def test_replace_maintain_ordering():
# GH51016
dtype = pd.CategoricalDtype([0, 1, 2], ordered=True)
ser = pd.Series([0, 1, 2], dtype=dtype)
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = ser.replace(0, 2)
expected_dtype = pd.CategoricalDtype([1, 2], ordered=True)
expected = pd.Series([2, 1, 2], dtype=expected_dtype)
tm.assert_series_equal(expected, result, check_category_order=True)
def test_replace_categorical_ea_dtype_different_cats_raises():
# GH49404
cat = Categorical(pd.array(["a", "b"], dtype="string"))
with pytest.raises(
TypeError, match="Cannot setitem on a Categorical with a new category"
):
pd.Series(cat).replace(["a", "b"], ["c", pd.NA])
56 changes: 12 additions & 44 deletions pandas/tests/copy_view/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,18 +129,14 @@ def test_replace_to_replace_wrong_dtype():
def test_replace_list_categorical():
df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
arr = get_array(df, "a")
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
df.replace(["c"], value="a", inplace=True)

df.replace(["c"], value="a", inplace=True)
assert np.shares_memory(arr.codes, get_array(df, "a").codes)
assert df._mgr._has_no_reference(0)

df_orig = df.copy()
with tm.assert_produces_warning(FutureWarning, match=msg):
df2 = df.replace(["b"], value="a")
df.replace(["b"], value="a")
df2 = df.apply(lambda x: x.cat.rename_categories({"b": "d"}))
assert not np.shares_memory(arr.codes, get_array(df2, "a").codes)

tm.assert_frame_equal(df, df_orig)
Expand All @@ -150,13 +146,7 @@ def test_replace_list_inplace_refs_categorical():
df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
view = df[:]
df_orig = df.copy()
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
df.replace(["c"], value="a", inplace=True)
assert not np.shares_memory(get_array(view, "a").codes, get_array(df, "a").codes)
df.replace(["c"], value="a", inplace=True)
tm.assert_frame_equal(df_orig, view)


Expand Down Expand Up @@ -195,56 +185,34 @@ def test_replace_inplace_reference_no_op(to_replace):


@pytest.mark.parametrize("to_replace", [1, [1]])
@pytest.mark.parametrize("val", [1, 1.5])
def test_replace_categorical_inplace_reference(val, to_replace):
def test_replace_categorical_inplace_reference(to_replace):
df = DataFrame({"a": Categorical([1, 2, 3])})
df_orig = df.copy()
arr_a = get_array(df, "a")
view = df[:]
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
warn = FutureWarning if val == 1.5 else None
with tm.assert_produces_warning(warn, match=msg):
df.replace(to_replace=to_replace, value=val, inplace=True)

df.replace(to_replace=to_replace, value=1, inplace=True)
assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
assert df._mgr._has_no_reference(0)
assert view._mgr._has_no_reference(0)
tm.assert_frame_equal(view, df_orig)


@pytest.mark.parametrize("val", [1, 1.5])
def test_replace_categorical_inplace(val):
def test_replace_categorical_inplace():
df = DataFrame({"a": Categorical([1, 2, 3])})
arr_a = get_array(df, "a")
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
warn = FutureWarning if val == 1.5 else None
with tm.assert_produces_warning(warn, match=msg):
df.replace(to_replace=1, value=val, inplace=True)
df.replace(to_replace=1, value=1, inplace=True)

assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
assert df._mgr._has_no_reference(0)

expected = DataFrame({"a": Categorical([val, 2, 3])})
expected = DataFrame({"a": Categorical([1, 2, 3])})
tm.assert_frame_equal(df, expected)


@pytest.mark.parametrize("val", [1, 1.5])
def test_replace_categorical(val):
def test_replace_categorical():
df = DataFrame({"a": Categorical([1, 2, 3])})
df_orig = df.copy()
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
warn = FutureWarning if val == 1.5 else None
with tm.assert_produces_warning(warn, match=msg):
df2 = df.replace(to_replace=1, value=val)
df2 = df.replace(to_replace=1, value=1)

assert df._mgr._has_no_reference(0)
assert df2._mgr._has_no_reference(0)
Expand Down
Loading