Skip to content

Commit

Permalink
Deprecate replace with categorical columns (#14988)
Browse files — browse the repository at this point in the history
Matches pandas 2.2 behavior: pandas-dev/pandas#56385

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Michael Wang (https://github.com/isVoid)

URL: #14988
  • Loading branch information
mroeschke authored Feb 9, 2024
1 parent 7294280 commit fbb1f89
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 19 deletions.
13 changes: 12 additions & 1 deletion python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import warnings
from collections import abc
from functools import cached_property
from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Tuple, cast
Expand Down Expand Up @@ -990,14 +991,24 @@ def find_and_replace(
replaced, to_replace_col, replacement_col
)

return column.build_categorical_column(
result = column.build_categorical_column(
categories=new_cats["cats"],
codes=column.build_column(output.base_data, dtype=output.dtype),
mask=output.base_mask,
offset=output.offset,
size=output.size,
ordered=self.dtype.ordered,
)
if result.dtype != self.dtype:
warnings.warn(
"The behavior of replace with "
"CategoricalDtype is deprecated. In a future version, replace "
"will only be used for cases that preserve the categories. "
"To change the categories, use ser.cat.rename_categories "
"instead.",
FutureWarning,
)
return result

def isnull(self) -> ColumnBase:
"""
Expand Down
66 changes: 48 additions & 18 deletions python/cudf/cudf/tests/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,24 @@ def test_series_replace_all(gsr, to_replace, value):
else:
pd_value = value

actual = gsr.replace(to_replace=gd_to_replace, value=gd_value)
if pd_value is None:
# TODO: Remove this workaround once cudf
# introduces `no_default` values
expected = psr.replace(to_replace=pd_to_replace)
else:
expected = psr.replace(to_replace=pd_to_replace, value=pd_value)
with expect_warning_if(
isinstance(gsr.dtype, cudf.CategoricalDtype)
and isinstance(gd_to_replace, str)
and gd_to_replace == "one"
):
actual = gsr.replace(to_replace=gd_to_replace, value=gd_value)
with expect_warning_if(
PANDAS_GE_220
and isinstance(gsr.dtype, cudf.CategoricalDtype)
and isinstance(gd_to_replace, str)
and gd_to_replace == "one"
):
if pd_value is None:
# TODO: Remove this workaround once cudf
# introduces `no_default` values
expected = psr.replace(to_replace=pd_to_replace)
else:
expected = psr.replace(to_replace=pd_to_replace, value=pd_value)

assert_eq(
expected.sort_values().reset_index(drop=True),
Expand All @@ -82,16 +93,19 @@ def test_series_replace():

# Categorical
psr3 = pd.Series(["one", "two", "three"], dtype="category")
psr4 = psr3.replace("one", "two")
with expect_warning_if(PANDAS_GE_220):
psr4 = psr3.replace("one", "two")
sr3 = cudf.from_pandas(psr3)
sr4 = sr3.replace("one", "two")
with pytest.warns(FutureWarning):
sr4 = sr3.replace("one", "two")
assert_eq(
psr4.sort_values().reset_index(drop=True),
sr4.sort_values().reset_index(drop=True),
)

psr5 = psr3.replace("one", "five")
sr5 = sr3.replace("one", "five")
with expect_warning_if(PANDAS_GE_220):
psr5 = psr3.replace("one", "five")
with pytest.warns(FutureWarning):
sr5 = sr3.replace("one", "five")

assert_eq(psr5, sr5)

Expand Down Expand Up @@ -236,11 +250,26 @@ def test_dataframe_replace(df, to_replace, value):
else:
gd_to_replace = to_replace

if pd_value is None:
expected = pdf.replace(to_replace=pd_to_replace)
else:
expected = pdf.replace(to_replace=pd_to_replace, value=pd_value)
actual = gdf.replace(to_replace=gd_to_replace, value=gd_value)
with expect_warning_if(
PANDAS_GE_220
and isinstance(df["a"].dtype, cudf.CategoricalDtype)
and isinstance(to_replace, str)
and to_replace == "two"
and isinstance(value, str)
and value == "three"
):
if pd_value is None:
expected = pdf.replace(to_replace=pd_to_replace)
else:
expected = pdf.replace(to_replace=pd_to_replace, value=pd_value)
with expect_warning_if(
isinstance(df["a"].dtype, cudf.CategoricalDtype)
and isinstance(to_replace, str)
and to_replace == "two"
and isinstance(value, str)
and value == "three"
):
actual = gdf.replace(to_replace=gd_to_replace, value=gd_value)

expected_sorted = expected.sort_values(by=list(expected.columns), axis=0)
actual_sorted = actual.sort_values(by=list(actual.columns), axis=0)
Expand Down Expand Up @@ -1342,7 +1371,8 @@ def test_series_replace_errors():
],
)
def test_replace_nulls(gsr, old, new, expected):
actual = gsr.replace(old, new)
with expect_warning_if(isinstance(gsr.dtype, cudf.CategoricalDtype)):
actual = gsr.replace(old, new)
assert_eq(
expected.sort_values().reset_index(drop=True),
actual.sort_values().reset_index(drop=True),
Expand Down

0 comments on commit fbb1f89

Please sign in to comment.