Skip to content

Commit

Permalink
feat(python): Add replace_all expression to complement replace (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored May 30, 2024
1 parent fd808ea commit 3ac1c40
Show file tree
Hide file tree
Showing 7 changed files with 646 additions and 353 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ Manipulation/selection
Expr.reinterpret
Expr.repeat_by
Expr.replace
Expr.replace_all
Expr.reshape
Expr.reverse
Expr.rle
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series/computation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ Computation
Series.peak_min
Series.rank
Series.replace
Series.replace_all
Series.rolling_apply
Series.rolling_map
Series.rolling_max
Expand Down
5 changes: 1 addition & 4 deletions py-polars/polars/_utils/various.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,10 +312,7 @@ def str_duration_(td: str | None) -> int | None:
.cast(tp)
)
elif tp == Boolean:
cast_cols[c] = F.col(c).replace(
{"true": True, "false": False},
default=None,
)
cast_cols[c] = F.col(c).replace_all({"true": True, "false": False})
elif tp in INTEGER_DTYPES:
int_string = F.col(c).str.replace_all(r"[^\d+-]", "")
cast_cols[c] = (
Expand Down
192 changes: 178 additions & 14 deletions py-polars/polars/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -11371,16 +11371,26 @@ def replace(
Accepts expression input. Sequences are parsed as Series,
other non-expression inputs are parsed as literals.
Length must match the length of `old` or have length 1.
default
Set values that were not replaced to this value.
Defaults to keeping the original value.
Accepts expression input. Non-expression inputs are parsed as literals.
.. deprecated:: 0.20.31
Use :meth:`replace_all` instead to set a default while replacing values.
return_dtype
The data type of the resulting expression. If set to `None` (default),
the data type is determined automatically based on the other inputs.
.. deprecated:: 0.20.31
Use :meth:`replace_all` instead to set a return data type while
replacing values.
See Also
--------
replace_all
str.replace
Notes
Expand Down Expand Up @@ -11422,25 +11432,23 @@ def replace(
└─────┴──────────┘
Passing a mapping with replacements is also supported as syntactic sugar.
Specify a default to set all values that were not matched.
>>> mapping = {2: 100, 3: 200}
>>> df.with_columns(replaced=pl.col("a").replace(mapping, default=-1))
>>> df.with_columns(replaced=pl.col("a").replace(mapping))
shape: (4, 2)
┌─────┬──────────┐
│ a ┆ replaced │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪══════════╡
│ 1 ┆ -1
│ 1 ┆ 1
│ 2 ┆ 100 │
│ 2 ┆ 100 │
│ 3 ┆ 200 │
└─────┴──────────┘
Replacing by values of a different data type sets the return type based on
a combination of the `new` data type and either the original data type or the
default data type if it was set.
a combination of the `new` data type and the original data type.
>>> df = pl.DataFrame({"a": ["x", "y", "z"]})
>>> mapping = {"x": 1, "y": 2, "z": 3}
Expand All @@ -11455,7 +11463,156 @@ def replace(
│ y ┆ 2 │
│ z ┆ 3 │
└─────┴──────────┘
>>> df.with_columns(replaced=pl.col("a").replace(mapping, default=None))
Expression input is supported.
>>> df = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1.5, 2.5, 5.0, 1.0]})
>>> df.with_columns(
... replaced=pl.col("a").replace(
... old=pl.col("a").max(),
... new=pl.col("b").sum(),
... )
... )
shape: (4, 3)
┌─────┬─────┬──────────┐
│ a ┆ b ┆ replaced │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ f64 │
╞═════╪═════╪══════════╡
│ 1 ┆ 1.5 ┆ 1.0 │
│ 2 ┆ 2.5 ┆ 2.0 │
│ 2 ┆ 5.0 ┆ 2.0 │
│ 3 ┆ 1.0 ┆ 10.0 │
└─────┴─────┴──────────┘
"""
if new is no_default and isinstance(old, Mapping):
new = pl.Series(old.values())
old = pl.Series(old.keys())
else:
if isinstance(old, Sequence) and not isinstance(old, (str, pl.Series)):
old = pl.Series(old)
if isinstance(new, Sequence) and not isinstance(new, (str, pl.Series)):
new = pl.Series(new)

old = parse_as_expression(old, str_as_lit=True) # type: ignore[arg-type]
new = parse_as_expression(new, str_as_lit=True) # type: ignore[arg-type]

if default is no_default:
default = None
else:
issue_deprecation_warning(
"The `default` parameter for `replace` is deprecated."
" Use `replace_all` instead to set a default while replacing values.",
version="0.20.31",
)
default = parse_as_expression(default, str_as_lit=True)

if return_dtype is not None:
issue_deprecation_warning(
"The `return_dtype` parameter for `replace` is deprecated."
" Use `replace_all` instead to set a return data type while replacing values.",
version="0.20.31",
)

return self._from_pyexpr(self._pyexpr.replace(old, new, default, return_dtype))

def replace_all(
self,
old: IntoExpr | Sequence[Any] | Mapping[Any, Any],
new: IntoExpr | Sequence[Any] | NoDefault = no_default,
*,
default: IntoExpr = None,
return_dtype: PolarsDataType | None = None,
) -> Self:
"""
Replace all values by different values.
Parameters
----------
old
Value or sequence of values to replace.
Accepts expression input. Sequences are parsed as Series,
other non-expression inputs are parsed as literals.
Also accepts a mapping of values to their replacement as syntactic sugar for
`replace_all(old=Series(mapping.keys()), new=Series(mapping.values()))`.
new
Value or sequence of values to replace by.
Accepts expression input. Sequences are parsed as Series,
other non-expression inputs are parsed as literals.
Length must match the length of `old` or have length 1.
default
Set values that were not replaced to this value. Defaults to null.
Accepts expression input. Non-expression inputs are parsed as literals.
return_dtype
The data type of the resulting expression. If set to `None` (default),
the data type is determined automatically based on the other inputs.
See Also
--------
replace
str.replace
Notes
-----
The global string cache must be enabled when replacing categorical values.
Examples
--------
Replace a single value by another value. Values that were not replaced are set
to null.
>>> df = pl.DataFrame({"a": [1, 2, 2, 3]})
>>> df.with_columns(replaced=pl.col("a").replace_all(2, 100))
shape: (4, 2)
┌─────┬──────────┐
│ a ┆ replaced │
│ --- ┆ --- │
│ i64 ┆ i32 │
╞═════╪══════════╡
│ 1 ┆ null │
│ 2 ┆ 100 │
│ 2 ┆ 100 │
│ 3 ┆ null │
└─────┴──────────┘
Replace multiple values by passing sequences to the `old` and `new` parameters.
>>> df.with_columns(replaced=pl.col("a").replace_all([2, 3], [100, 200]))
shape: (4, 2)
┌─────┬──────────┐
│ a ┆ replaced │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪══════════╡
│ 1 ┆ null │
│ 2 ┆ 100 │
│ 2 ┆ 100 │
│ 3 ┆ 200 │
└─────┴──────────┘
Passing a mapping with replacements is also supported as syntactic sugar.
Specify a default to set all values that were not matched.
>>> mapping = {2: 100, 3: 200}
>>> df.with_columns(replaced=pl.col("a").replace_all(mapping, default=-1))
shape: (4, 2)
┌─────┬──────────┐
│ a ┆ replaced │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪══════════╡
│ 1 ┆ -1 │
│ 2 ┆ 100 │
│ 2 ┆ 100 │
│ 3 ┆ 200 │
└─────┴──────────┘
Replacing by values of a different data type sets the return type based on
a combination of the `new` data type and the `default` data type.
>>> df = pl.DataFrame({"a": ["x", "y", "z"]})
>>> mapping = {"x": 1, "y": 2, "z": 3}
>>> df.with_columns(replaced=pl.col("a").replace_all(mapping))
shape: (3, 2)
┌─────┬──────────┐
│ a ┆ replaced │
Expand All @@ -11466,11 +11623,22 @@ def replace(
│ y ┆ 2 │
│ z ┆ 3 │
└─────┴──────────┘
>>> df.with_columns(replaced=pl.col("a").replace_all(mapping, default="x"))
shape: (3, 2)
┌─────┬──────────┐
│ a ┆ replaced │
│ --- ┆ --- │
│ str ┆ str │
╞═════╪══════════╡
│ x ┆ 1 │
│ y ┆ 2 │
│ z ┆ 3 │
└─────┴──────────┘
Set the `return_dtype` parameter to control the resulting data type directly.
>>> df.with_columns(
... replaced=pl.col("a").replace(mapping, return_dtype=pl.UInt8)
... replaced=pl.col("a").replace_all(mapping, return_dtype=pl.UInt8)
... )
shape: (3, 2)
┌─────┬──────────┐
Expand All @@ -11487,7 +11655,7 @@ def replace(
>>> df = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1.5, 2.5, 5.0, 1.0]})
>>> df.with_columns(
... replaced=pl.col("a").replace(
... replaced=pl.col("a").replace_all(
... old=pl.col("a").max(),
... new=pl.col("b").sum(),
... default=pl.col("b"),
Expand Down Expand Up @@ -11517,11 +11685,7 @@ def replace(
old = parse_as_expression(old, str_as_lit=True) # type: ignore[arg-type]
new = parse_as_expression(new, str_as_lit=True) # type: ignore[arg-type]

default = (
None
if default is no_default
else parse_as_expression(default, str_as_lit=True)
)
default = parse_as_expression(default, str_as_lit=True)

return self._from_pyexpr(self._pyexpr.replace(old, new, default, return_dtype))

Expand Down Expand Up @@ -11974,7 +12138,7 @@ def map_dict(
return_dtype
Set return dtype to override automatic return dtype determination.
"""
return self.replace(mapping, default=default, return_dtype=return_dtype)
return self.replace_all(mapping, default=default, return_dtype=return_dtype)

@classmethod
def from_json(cls, value: str) -> Self:
Expand Down
Loading

0 comments on commit 3ac1c40

Please sign in to comment.