Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: Series[categorical].replace special-casing #56385

Merged
merged 1 commit into from
Dec 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,7 @@ Other Deprecations
- Deprecated the ``kind`` keyword in :meth:`Series.resample` and :meth:`DataFrame.resample`, explicitly cast the object's ``index`` instead (:issue:`55895`)
- Deprecated the ``ordinal`` keyword in :class:`PeriodIndex`, use :meth:`PeriodIndex.from_ordinals` instead (:issue:`55960`)
- Deprecated the ``unit`` keyword in :class:`TimedeltaIndex` construction, use :func:`to_timedelta` instead (:issue:`55499`)
- Deprecated the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype`; in a future version ``replace`` will change the values while preserving the categories. To change the categories, use ``ser.cat.rename_categories`` instead (:issue:`55147`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just curious: if one does `Categorical[int].replace(1, 1.2)` in the future, will that case raise?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It will behave like `ser[ser == 1] = 1.2`, which raises `TypeError: Cannot setitem on a Categorical with a new category (1.2), set the categories first`.

- Deprecated the behavior of :meth:`Series.value_counts` and :meth:`Index.value_counts` with object dtype; in a future version these will not perform dtype inference on the resulting :class:`Index`, do ``result.index = result.index.infer_objects()`` to retain the old behavior (:issue:`56161`)
- Deprecated the default of ``observed=False`` in :meth:`DataFrame.pivot_table`; will be ``True`` in a future version (:issue:`56236`)
- Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`)
Expand Down
13 changes: 13 additions & 0 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2626,6 +2626,8 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
def _replace(self, *, to_replace, value, inplace: bool = False):
from pandas import Index

orig_dtype = self.dtype

inplace = validate_bool_kwarg(inplace, "inplace")
cat = self if inplace else self.copy()

Expand Down Expand Up @@ -2656,6 +2658,17 @@ def _replace(self, *, to_replace, value, inplace: bool = False):
new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered)
NDArrayBacked.__init__(cat, new_codes, new_dtype)

if new_dtype != orig_dtype:
warnings.warn(
# GH#55147
"The behavior of Series.replace (and DataFrame.replace) with "
"CategoricalDtype is deprecated. In a future version, replace "
"will only be used for cases that preserve the categories. "
"To change the categories, use ser.cat.rename_categories "
"instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
if not inplace:
return cat

Expand Down
28 changes: 24 additions & 4 deletions pandas/tests/arrays/categorical/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
([1, 2, "3"], "5", ["5", "5", 3], True),
],
)
@pytest.mark.filterwarnings(
"ignore:.*with CategoricalDtype is deprecated:FutureWarning"
)
def test_replace_categorical_series(to_replace, value, expected, flip_categories):
# GH 31720

Expand Down Expand Up @@ -60,7 +63,13 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg):
# GH#26988
cat = Categorical(["a", "b"])
expected = Categorical(result)
result = pd.Series(cat, copy=False).replace(to_replace, value)._values
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
warn = FutureWarning if expected_error_msg is not None else None
with tm.assert_produces_warning(warn, match=msg):
result = pd.Series(cat, copy=False).replace(to_replace, value)._values

tm.assert_categorical_equal(result, expected)
if to_replace == "b": # the "c" test is supposed to be unchanged
Expand All @@ -69,14 +78,20 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg):
tm.assert_categorical_equal(cat, expected)

ser = pd.Series(cat, copy=False)
ser.replace(to_replace, value, inplace=True)
with tm.assert_produces_warning(warn, match=msg):
ser.replace(to_replace, value, inplace=True)
tm.assert_categorical_equal(cat, expected)


def test_replace_categorical_ea_dtype():
# GH49404
cat = Categorical(pd.array(["a", "b"], dtype="string"))
result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values
expected = Categorical(pd.array(["c", pd.NA], dtype="string"))
tm.assert_categorical_equal(result, expected)

Expand All @@ -85,7 +100,12 @@ def test_replace_maintain_ordering():
# GH51016
dtype = pd.CategoricalDtype([0, 1, 2], ordered=True)
ser = pd.Series([0, 1, 2], dtype=dtype)
result = ser.replace(0, 2)
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = ser.replace(0, 2)
expected_dtype = pd.CategoricalDtype([1, 2], ordered=True)
expected = pd.Series([2, 1, 2], dtype=expected_dtype)
tm.assert_series_equal(expected, result, check_category_order=True)
41 changes: 35 additions & 6 deletions pandas/tests/copy_view/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,13 +161,19 @@ def test_replace_to_replace_wrong_dtype(using_copy_on_write):
def test_replace_list_categorical(using_copy_on_write):
df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
arr = get_array(df, "a")
df.replace(["c"], value="a", inplace=True)
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
df.replace(["c"], value="a", inplace=True)
assert np.shares_memory(arr.codes, get_array(df, "a").codes)
if using_copy_on_write:
assert df._mgr._has_no_reference(0)

df_orig = df.copy()
df2 = df.replace(["b"], value="a")
with tm.assert_produces_warning(FutureWarning, match=msg):
df2 = df.replace(["b"], value="a")
assert not np.shares_memory(arr.codes, get_array(df2, "a").codes)

tm.assert_frame_equal(df, df_orig)
Expand All @@ -177,7 +183,12 @@ def test_replace_list_inplace_refs_categorical(using_copy_on_write):
df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
view = df[:]
df_orig = df.copy()
df.replace(["c"], value="a", inplace=True)
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
df.replace(["c"], value="a", inplace=True)
if using_copy_on_write:
assert not np.shares_memory(
get_array(view, "a").codes, get_array(df, "a").codes
Expand Down Expand Up @@ -236,7 +247,13 @@ def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_repl
df_orig = df.copy()
arr_a = get_array(df, "a")
view = df[:]
df.replace(to_replace=to_replace, value=val, inplace=True)
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
warn = FutureWarning if val == 1.5 else None
with tm.assert_produces_warning(warn, match=msg):
df.replace(to_replace=to_replace, value=val, inplace=True)

if using_copy_on_write:
assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
Expand All @@ -251,7 +268,13 @@ def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_repl
def test_replace_categorical_inplace(using_copy_on_write, val):
df = DataFrame({"a": Categorical([1, 2, 3])})
arr_a = get_array(df, "a")
df.replace(to_replace=1, value=val, inplace=True)
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
warn = FutureWarning if val == 1.5 else None
with tm.assert_produces_warning(warn, match=msg):
df.replace(to_replace=1, value=val, inplace=True)

assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
if using_copy_on_write:
Expand All @@ -265,7 +288,13 @@ def test_replace_categorical_inplace(using_copy_on_write, val):
def test_replace_categorical(using_copy_on_write, val):
df = DataFrame({"a": Categorical([1, 2, 3])})
df_orig = df.copy()
df2 = df.replace(to_replace=1, value=val)
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
warn = FutureWarning if val == 1.5 else None
with tm.assert_produces_warning(warn, match=msg):
df2 = df.replace(to_replace=1, value=val)

if using_copy_on_write:
assert df._mgr._has_no_reference(0)
Expand Down
25 changes: 19 additions & 6 deletions pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -1279,7 +1279,9 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data):
b = pd.Categorical(final_data[:, 1], categories=ex_cat)

expected = DataFrame({"a": a, "b": b})
result = df.replace(replace_dict, 3)
msg2 = "with CategoricalDtype is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg2):
result = df.replace(replace_dict, 3)
tm.assert_frame_equal(result, expected)
msg = (
r"Attributes of DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are "
Expand All @@ -1288,7 +1290,8 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data):
with pytest.raises(AssertionError, match=msg):
# ensure non-inplace call does not affect original
tm.assert_frame_equal(df, expected)
return_value = df.replace(replace_dict, 3, inplace=True)
with tm.assert_produces_warning(FutureWarning, match=msg2):
return_value = df.replace(replace_dict, 3, inplace=True)
assert return_value is None
tm.assert_frame_equal(df, expected)

Expand Down Expand Up @@ -1438,9 +1441,14 @@ def test_replace_value_category_type(self):
)

# replace values in input dataframe
input_df = input_df.replace("d", "z")
input_df = input_df.replace("obj1", "obj9")
result = input_df.replace("cat2", "catX")
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
input_df = input_df.replace("d", "z")
input_df = input_df.replace("obj1", "obj9")
result = input_df.replace("cat2", "catX")

tm.assert_frame_equal(result, expected)

Expand All @@ -1466,7 +1474,12 @@ def test_replace_dict_category_type(self):
)

# replace values in input dataframe using a dict
result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"})
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"})

tm.assert_frame_equal(result, expected)

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/groupby/test_groupby_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -546,9 +546,9 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki

gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True)
expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index()
expected["x"] = expected["x"].replace(4, None)
expected["x"] = expected["x"].cat.remove_categories([4])
if index_kind == "multi":
expected["x2"] = expected["x2"].replace(4, None)
expected["x2"] = expected["x2"].cat.remove_categories([4])
if as_index:
if index_kind == "multi":
expected = expected.set_index(["x", "x2"])
Expand Down
10 changes: 9 additions & 1 deletion pandas/tests/io/pytables/test_file_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,15 @@ def test_latin_encoding(tmp_path, setup_path, dtype, val):
ser.to_hdf(store, key=key, format="table", encoding=enc, nan_rep=nan_rep)
retr = read_hdf(store, key)

s_nan = ser.replace(nan_rep, np.nan)
# TODO(3.0): once Categorical replace deprecation is enforced,
# we may be able to re-simplify the construction of s_nan
if dtype == "category":
if nan_rep in ser.cat.categories:
s_nan = ser.cat.remove_categories([nan_rep])
else:
s_nan = ser
else:
s_nan = ser.replace(nan_rep, np.nan)

tm.assert_series_equal(s_nan, retr)

Expand Down
17 changes: 13 additions & 4 deletions pandas/tests/series/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,7 @@ def test_replace_categorical(self, categorical, numeric):
# GH 24971, GH#23305
ser = pd.Series(categorical)
msg = "Downcasting behavior in `replace`"
msg = "with CategoricalDtype is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = ser.replace({"A": 1, "B": 2})
expected = pd.Series(numeric).astype("category")
Expand All @@ -418,7 +419,9 @@ def test_replace_categorical(self, categorical, numeric):
def test_replace_categorical_inplace(self, data, data_exp):
# GH 53358
result = pd.Series(data, dtype="category")
result.replace(to_replace="a", value="b", inplace=True)
msg = "with CategoricalDtype is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result.replace(to_replace="a", value="b", inplace=True)
expected = pd.Series(data_exp, dtype="category")
tm.assert_series_equal(result, expected)

Expand All @@ -434,16 +437,22 @@ def test_replace_categorical_single(self):
expected = expected.cat.remove_unused_categories()
assert c[2] != "foo"

result = c.replace(c[2], "foo")
msg = "with CategoricalDtype is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = c.replace(c[2], "foo")
tm.assert_series_equal(expected, result)
assert c[2] != "foo" # ensure non-inplace call does not alter original

return_value = c.replace(c[2], "foo", inplace=True)
msg = "with CategoricalDtype is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
return_value = c.replace(c[2], "foo", inplace=True)
assert return_value is None
tm.assert_series_equal(expected, c)

first_value = c[0]
return_value = c.replace(c[1], c[0], inplace=True)
msg = "with CategoricalDtype is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
return_value = c.replace(c[1], c[0], inplace=True)
assert return_value is None
assert c[0] == c[1] == first_value # test replacing with existing value

Expand Down
Loading