From 5e173031f2b030d1f833dd12784a83630cda91ba Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 7 Dec 2023 12:07:06 -0800 Subject: [PATCH] DEPR: Series[categorical].replace special-casing --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/categorical.py | 13 ++++++ .../tests/arrays/categorical/test_replace.py | 28 +++++++++++-- pandas/tests/copy_view/test_replace.py | 41 ++++++++++++++++--- pandas/tests/frame/methods/test_replace.py | 25 ++++++++--- pandas/tests/groupby/test_groupby_dropna.py | 4 +- .../tests/io/pytables/test_file_handling.py | 10 ++++- pandas/tests/series/methods/test_replace.py | 17 ++++++-- 8 files changed, 116 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index c878fd2664dc4..5dd04986dcd3e 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -473,6 +473,7 @@ Other Deprecations - Deprecated the ``kind`` keyword in :meth:`Series.resample` and :meth:`DataFrame.resample`, explicitly cast the object's ``index`` instead (:issue:`55895`) - Deprecated the ``ordinal`` keyword in :class:`PeriodIndex`, use :meth:`PeriodIndex.from_ordinals` instead (:issue:`55960`) - Deprecated the ``unit`` keyword in :class:`TimedeltaIndex` construction, use :func:`to_timedelta` instead (:issue:`55499`) +- Deprecated the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype`; in a future version replace will change the values while preserving the categories. To change the categories, use ``ser.cat.rename_categories`` instead (:issue:`55147`) - Deprecated the behavior of :meth:`Series.value_counts` and :meth:`Index.value_counts` with object dtype; in a future version these will not perform dtype inference on the resulting :class:`Index`, do ``result.index = result.index.infer_objects()`` to retain the old behavior (:issue:`56161`) - Deprecated the default of ``observed=False`` in :meth:`DataFrame.pivot_table`; will be ``True`` in a future version (:issue:`56236`) - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index eec833c600177..f0aabbb863a79 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2626,6 +2626,8 @@ def isin(self, values) -> npt.NDArray[np.bool_]: def _replace(self, *, to_replace, value, inplace: bool = False): from pandas import Index + orig_dtype = self.dtype + inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() @@ -2656,6 +2658,17 @@ def _replace(self, *, to_replace, value, inplace: bool = False): new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered) NDArrayBacked.__init__(cat, new_codes, new_dtype) + if new_dtype != orig_dtype: + warnings.warn( + # GH#55147 + "The behavior of Series.replace (and DataFrame.replace) with " + "CategoricalDtype is deprecated. In a future version, replace " + "will only be used for cases that preserve the categories. " + "To change the categories, use ser.cat.rename_categories " + "instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) if not inplace: return cat diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index 0611d04d36d10..3c677142846d7 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -31,6 +31,9 @@ ([1, 2, "3"], "5", ["5", "5", 3], True), ], ) +@pytest.mark.filterwarnings( + "ignore:.*with CategoricalDtype is deprecated:FutureWarning" +) def test_replace_categorical_series(to_replace, value, expected, flip_categories): # GH 31720 @@ -60,7 +63,13 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg): # GH#26988 cat = Categorical(["a", "b"]) expected = Categorical(result) - result = pd.Series(cat, copy=False).replace(to_replace, value)._values + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + warn = FutureWarning if expected_error_msg is not None else None + with tm.assert_produces_warning(warn, match=msg): + result = pd.Series(cat, copy=False).replace(to_replace, value)._values tm.assert_categorical_equal(result, expected) if to_replace == "b": # the "c" test is supposed to be unchanged @@ -69,14 +78,20 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg): tm.assert_categorical_equal(cat, expected) ser = pd.Series(cat, copy=False) - ser.replace(to_replace, value, inplace=True) + with tm.assert_produces_warning(warn, match=msg): + ser.replace(to_replace, value, inplace=True) tm.assert_categorical_equal(cat, expected) def test_replace_categorical_ea_dtype(): # GH49404 cat = Categorical(pd.array(["a", "b"], dtype="string")) - result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values expected = Categorical(pd.array(["c", pd.NA], dtype="string")) tm.assert_categorical_equal(result, expected) @@ -85,7 +100,12 @@ def test_replace_maintain_ordering(): # GH51016 dtype = pd.CategoricalDtype([0, 1, 2], ordered=True) ser = pd.Series([0, 1, 2], dtype=dtype) - result = ser.replace(0, 2) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.replace(0, 2) expected_dtype = pd.CategoricalDtype([1, 2], ordered=True) expected = pd.Series([2, 1, 2], dtype=expected_dtype) tm.assert_series_equal(expected, result, check_category_order=True) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index eb3b1a5ef68e8..99178918ffdbf 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -161,13 +161,19 @@ def test_replace_to_replace_wrong_dtype(using_copy_on_write): def test_replace_list_categorical(using_copy_on_write): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") arr = get_array(df, "a") - df.replace(["c"], value="a", inplace=True) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.replace(["c"], value="a", inplace=True) assert np.shares_memory(arr.codes, get_array(df, "a").codes) if using_copy_on_write: assert df._mgr._has_no_reference(0) df_orig = df.copy() - df2 = df.replace(["b"], value="a") + with tm.assert_produces_warning(FutureWarning, match=msg): + df2 = df.replace(["b"], value="a") assert not np.shares_memory(arr.codes, get_array(df2, "a").codes) tm.assert_frame_equal(df, df_orig) @@ -177,7 +183,12 @@ def test_replace_list_inplace_refs_categorical(using_copy_on_write): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") view = df[:] df_orig = df.copy() - df.replace(["c"], value="a", inplace=True) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.replace(["c"], value="a", inplace=True) if using_copy_on_write: assert not np.shares_memory( get_array(view, "a").codes, get_array(df, "a").codes @@ -236,7 +247,13 @@ def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_repl df_orig = df.copy() arr_a = get_array(df, "a") view = df[:] - df.replace(to_replace=to_replace, value=val, inplace=True) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + warn = FutureWarning if val == 1.5 else None + with tm.assert_produces_warning(warn, match=msg): + df.replace(to_replace=to_replace, value=val, inplace=True) if using_copy_on_write: assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) @@ -251,7 +268,13 @@ def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_repl def test_replace_categorical_inplace(using_copy_on_write, val): df = DataFrame({"a": Categorical([1, 2, 3])}) arr_a = get_array(df, "a") - df.replace(to_replace=1, value=val, inplace=True) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + warn = FutureWarning if val == 1.5 else None + with tm.assert_produces_warning(warn, match=msg): + df.replace(to_replace=1, value=val, inplace=True) assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) if using_copy_on_write: @@ -265,7 +288,13 @@ def test_replace_categorical_inplace(using_copy_on_write, val): def test_replace_categorical(using_copy_on_write, val): df = DataFrame({"a": Categorical([1, 2, 3])}) df_orig = df.copy() - df2 = df.replace(to_replace=1, value=val) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + warn = FutureWarning if val == 1.5 else None + with tm.assert_produces_warning(warn, match=msg): + df2 = df.replace(to_replace=1, value=val) if using_copy_on_write: assert df._mgr._has_no_reference(0) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 13e2c1a249ac2..53c45a5f4b5c6 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1279,7 +1279,9 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data): b = pd.Categorical(final_data[:, 1], categories=ex_cat) expected = DataFrame({"a": a, "b": b}) - result = df.replace(replace_dict, 3) + msg2 = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg2): + result = df.replace(replace_dict, 3) tm.assert_frame_equal(result, expected) msg = ( r"Attributes of DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " @@ -1288,7 +1290,8 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data): with pytest.raises(AssertionError, match=msg): # ensure non-inplace call does not affect original tm.assert_frame_equal(df, expected) - return_value = df.replace(replace_dict, 3, inplace=True) + with tm.assert_produces_warning(FutureWarning, match=msg2): + return_value = df.replace(replace_dict, 3, inplace=True) assert return_value is None tm.assert_frame_equal(df, expected) @@ -1438,9 +1441,14 @@ def test_replace_value_category_type(self): ) # replace values in input dataframe - input_df = input_df.replace("d", "z") - input_df = input_df.replace("obj1", "obj9") - result = input_df.replace("cat2", "catX") + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + input_df = input_df.replace("d", "z") + input_df = input_df.replace("obj1", "obj9") + result = input_df.replace("cat2", "catX") tm.assert_frame_equal(result, expected) @@ -1466,7 +1474,12 @@ def test_replace_dict_category_type(self): ) # replace values in input dataframe using a dict - result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index b40c8f45b6d19..4f54621b19b64 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -546,9 +546,9 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() - expected["x"] = expected["x"].replace(4, None) + expected["x"] = expected["x"].cat.remove_categories([4]) if index_kind == "multi": - expected["x2"] = expected["x2"].replace(4, None) + expected["x2"] = expected["x2"].cat.remove_categories([4]) if as_index: if index_kind == "multi": expected = expected.set_index(["x", "x2"]) diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index e292bc7fe251e..d93de16816725 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -341,7 +341,15 @@ def test_latin_encoding(tmp_path, setup_path, dtype, val): ser.to_hdf(store, key=key, format="table", encoding=enc, nan_rep=nan_rep) retr = read_hdf(store, key) - s_nan = ser.replace(nan_rep, np.nan) + # TODO:(3.0): once Categorical replace deprecation is enforced, + # we may be able to re-simplify the construction of s_nan + if dtype == "category": + if nan_rep in ser.cat.categories: + s_nan = ser.cat.remove_categories([nan_rep]) + else: + s_nan = ser + else: + s_nan = ser.replace(nan_rep, np.nan) tm.assert_series_equal(s_nan, retr) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 477f36bdf4214..4330153c186ca 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -403,6 +403,7 @@ def test_replace_categorical(self, categorical, numeric): # GH 24971, GH#23305 ser = pd.Series(categorical) msg = "Downcasting behavior in `replace`" + msg = "with CategoricalDtype is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): result = ser.replace({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") @@ -418,7 +419,9 @@ def test_replace_categorical(self, categorical, numeric): def test_replace_categorical_inplace(self, data, data_exp): # GH 53358 result = pd.Series(data, dtype="category") - result.replace(to_replace="a", value="b", inplace=True) + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result.replace(to_replace="a", value="b", inplace=True) expected = pd.Series(data_exp, dtype="category") tm.assert_series_equal(result, expected) @@ -434,16 +437,22 @@ def test_replace_categorical_single(self): expected = expected.cat.remove_unused_categories() assert c[2] != "foo" - result = c.replace(c[2], "foo") + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = c.replace(c[2], "foo") tm.assert_series_equal(expected, result) assert c[2] != "foo" # ensure non-inplace call does not alter original - return_value = c.replace(c[2], "foo", inplace=True) + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = c.replace(c[2], "foo", inplace=True) assert return_value is None tm.assert_series_equal(expected, c) first_value = c[0] - return_value = c.replace(c[1], c[0], inplace=True) + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = c.replace(c[1], c[0], inplace=True) assert return_value is None assert c[0] == c[1] == first_value # test replacing with existing value