Skip to content

Commit

Permalink
feat: add strip_chars to string namespace (#715)
Browse files Browse the repository at this point in the history
* feat: add `strip_chars` to string namespace

* fix: remove unnecessary "\n"

* doc: complete docstrings /doctests

* perf: hard-code whitespace instead of importing from `string`

* Update narwhals/_arrow/series.py

Co-authored-by: Francesco Bruzzesi <[email protected]>

* doc: better docstrings

---------

Co-authored-by: Francesco Bruzzesi <[email protected]>
  • Loading branch information
lucianosrp and FBruzzesi authored Aug 5, 2024
1 parent b8d9b08 commit e117a5e
Show file tree
Hide file tree
Showing 10 changed files with 145 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/api-reference/expr_str.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
- head
- slice
- starts_with
- strip_chars
- tail
- to_datetime
- to_lowercase
Expand Down
1 change: 1 addition & 0 deletions docs/api-reference/series_str.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
- head
- slice
- starts_with
- strip_chars
- tail
show_source: false
show_bases: false
8 changes: 8 additions & 0 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,14 @@ class ArrowExprStringNamespace:
def __init__(self, expr: ArrowExpr) -> None:
    """Keep a reference to the expression whose string methods this namespace exposes."""
    self._expr = expr

def strip_chars(self, characters: str | None = None) -> ArrowExpr:
    """Remove leading and trailing characters from each string value.

    The work is handed to `reuse_series_namespace_implementation`, which is
    given the wrapped expression, the namespace name ``"str"``, the method
    name ``"strip_chars"`` and the `characters` argument unchanged.

    Arguments:
        characters: Characters to strip, or None (the default).
    """
    namespace, method = "str", "strip_chars"
    return reuse_series_namespace_implementation(
        self._expr, namespace, method, characters
    )

def starts_with(self, prefix: str) -> ArrowExpr:
return reuse_series_namespace_implementation(
self._expr,
Expand Down
10 changes: 10 additions & 0 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -791,6 +791,16 @@ class ArrowSeriesStringNamespace:
def __init__(self: Self, series: ArrowSeries) -> None:
    """Keep a reference to the series whose string methods this namespace exposes."""
    self._arrow_series = series

def strip_chars(self: Self, characters: str | None = None) -> ArrowSeries:
    """Remove leading and trailing characters from each string value.

    Arguments:
        characters: The set of characters to strip from both ends. If None
            (the default), the whitespace characters space, tab, newline,
            carriage return, vertical tab and form feed are stripped instead.
            An empty string is passed through as an empty set rather than
            being treated like None.

    Returns:
        The result of `utf8_trim` on the native series, wrapped via
        `_from_native_series`.
    """
    pc = get_pyarrow_compute()
    if characters is None:
        # Only a missing argument selects the whitespace default. A plain
        # truthiness test would also catch "" and silently strip whitespace,
        # which disagrees with `str.strip("")` on the pandas-like backend.
        characters = " \t\n\r\v\f"
    return self._arrow_series._from_native_series(
        pc.utf8_trim(self._arrow_series._native_series, characters)
    )

def starts_with(self: Self, prefix: str) -> ArrowSeries:
pc = get_pyarrow_compute()
return self._arrow_series._from_native_series(
Expand Down
5 changes: 5 additions & 0 deletions narwhals/_dask/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,11 @@ class DaskExprStringNamespace:
def __init__(self, expr: DaskExpr) -> None:
    """Keep a reference to the expression whose string methods this namespace exposes."""
    self._expr = expr

def strip_chars(self, characters: str | None = None) -> DaskExpr:
    """Remove leading and trailing characters from each string value.

    Builds the result through `self._expr._from_call`, handing it a callable
    that applies `.str.strip(characters)` to its input, the string
    ``"strip"`` and the `characters` argument.

    Arguments:
        characters: Characters to strip, or None (the default); forwarded
            as-is to `.str.strip`.
    """

    def _strip(_input, characters):
        return _input.str.strip(characters)

    # NOTE(review): "strip" is presumably the label `_from_call` attaches to
    # the resulting expression -- confirm against `_from_call`.
    return self._expr._from_call(_strip, "strip", characters)

def starts_with(self, prefix: str) -> DaskExpr:
return self._expr._from_call(
lambda _input, prefix: _input.str.startswith(prefix), "starts_with", prefix
Expand Down
8 changes: 8 additions & 0 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,14 @@ class PandasLikeExprStringNamespace:
def __init__(self, expr: PandasLikeExpr) -> None:
    """Keep a reference to the expression whose string methods this namespace exposes."""
    self._expr = expr

def strip_chars(self, characters: str | None = None) -> PandasLikeExpr:
    """Remove leading and trailing characters from each string value.

    The work is handed to `reuse_series_namespace_implementation`, which is
    given the wrapped expression, the namespace name ``"str"``, the method
    name ``"strip_chars"`` and the `characters` argument unchanged.

    Arguments:
        characters: Characters to strip, or None (the default).
    """
    namespace, method = "str", "strip_chars"
    return reuse_series_namespace_implementation(
        self._expr, namespace, method, characters
    )

def starts_with(self, prefix: str) -> PandasLikeExpr:
return reuse_series_namespace_implementation(
self._expr,
Expand Down
5 changes: 5 additions & 0 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,11 @@ class PandasLikeSeriesStringNamespace:
def __init__(self, series: PandasLikeSeries) -> None:
    """Keep a reference to the series whose string methods this namespace exposes."""
    self._pandas_series = series

def strip_chars(self, characters: str | None = None) -> PandasLikeSeries:
    """Remove leading and trailing characters from each string value.

    Arguments:
        characters: The set of characters to strip from both ends, passed
            straight to the native `.str.strip`. Defaults to None so the
            signature matches the other `strip_chars` implementations.

    Returns:
        The stripped native series, wrapped via `_from_native_series`.
    """
    native = self._pandas_series._native_series
    return self._pandas_series._from_native_series(native.str.strip(characters))

def starts_with(self, prefix: str) -> PandasLikeSeries:
return self._pandas_series._from_native_series(
self._pandas_series._native_series.str.startswith(prefix),
Expand Down
34 changes: 34 additions & 0 deletions narwhals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1833,6 +1833,40 @@ class ExprStringNamespace:
def __init__(self, expr: Expr) -> None:
    """Keep a reference to the expression whose string methods this namespace exposes."""
    self._expr = expr

def strip_chars(self, characters: str | None = None) -> Expr:
    r"""
    Strip characters from both ends of each string.

    Arguments:
        characters: The set of characters to remove. Any combination of
            these characters is stripped from the start and the end of the
            string. When None (the default), leading and trailing
            whitespace is removed instead.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> data = {"fruits": ["apple", "\nmango"]}
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)

        We define a dataframe-agnostic function:

        >>> @nw.narwhalify
        ... def func(df):
        ...     df = df.with_columns(stripped=nw.col("fruits").str.strip_chars())
        ...     return df.to_dict(as_series=False)

        We can then pass either pandas or Polars to `func`:

        >>> func(df_pd)
        {'fruits': ['apple', '\nmango'], 'stripped': ['apple', 'mango']}
        >>> func(df_pl)
        {'fruits': ['apple', '\nmango'], 'stripped': ['apple', 'mango']}
    """

    def _strip(plx):
        # Resolve the wrapped expression for this backend, then strip.
        return self._expr._call(plx).str.strip_chars(characters)

    return self._expr.__class__(_strip)

def starts_with(self, prefix: str) -> Expr:
r"""
Check if string values start with a substring.
Expand Down
34 changes: 34 additions & 0 deletions narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2148,6 +2148,40 @@ class SeriesStringNamespace:
def __init__(self, series: Series) -> None:
    """Keep a reference to the series whose string methods this namespace exposes."""
    self._narwhals_series = series

def strip_chars(self, characters: str | None = None) -> Series:
    r"""
    Strip characters from both ends of each string.

    Arguments:
        characters: The set of characters to remove. Any combination of
            these characters is stripped from the start and the end of the
            string. When None (the default), leading and trailing
            whitespace is removed instead.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> data = ["apple", "\nmango"]
        >>> s_pd = pd.Series(data)
        >>> s_pl = pl.Series(data)

        We define a dataframe-agnostic function:

        >>> @nw.narwhalify
        ... def func(s):
        ...     s = s.str.strip_chars()
        ...     return s.to_list()

        We can then pass either pandas or Polars to `func`:

        >>> func(s_pd)
        ['apple', 'mango']
        >>> func(s_pl)
        ['apple', 'mango']
    """
    series = self._narwhals_series
    stripped = series._compliant_series.str.strip_chars(characters)
    return series._from_compliant_series(stripped)

def starts_with(self, prefix: str) -> Series:
r"""
Check if string values start with a substring.
Expand Down
39 changes: 39 additions & 0 deletions tests/expr_and_series/str/strip_chars_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from __future__ import annotations

from typing import Any

import pytest

import narwhals.stable.v1 as nw
from tests.utils import compare_dicts

# Shared input column: one string with no surrounding whitespace, one with a
# trailing newline, and one with a leading space.
data = {"a": ["foobar", "bar\n", " baz"]}


@pytest.mark.parametrize(
    ("characters", "expected"),
    [
        pytest.param(None, {"a": ["foobar", "bar", "baz"]}),
        pytest.param("foo", {"a": ["bar", "bar\n", " baz"]}),
    ],
)
def test_str_strip_chars(constructor: Any, characters: str | None, expected: Any) -> None:
    """Check `Expr.str.strip_chars` with the whitespace default and an explicit character set."""
    stripped = nw.col("a").str.strip_chars(characters)
    frame = nw.from_native(constructor(data))
    compare_dicts(frame.select(stripped), expected)


@pytest.mark.parametrize(
    ("characters", "expected"),
    [
        pytest.param(None, {"a": ["foobar", "bar", "baz"]}),
        pytest.param("foo", {"a": ["bar", "bar\n", " baz"]}),
    ],
)
def test_str_strip_chars_series(
    constructor_eager: Any, characters: str | None, expected: Any
) -> None:
    """Check `Series.str.strip_chars` with the whitespace default and an explicit character set."""
    frame = nw.from_native(constructor_eager(data), eager_only=True)
    stripped = frame["a"].str.strip_chars(characters).to_list()
    assert stripped == expected["a"]

0 comments on commit e117a5e

Please sign in to comment.