From 28b6191f8290e1432d84f6bcc5396116b0953bbc Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sat, 18 May 2024 15:56:09 +0200 Subject: [PATCH 1/6] feat: series descriptive --- docs/api-reference/series.md | 6 + narwhals/_pandas_like/series.py | 19 +++ narwhals/series.py | 219 ++++++++++++++++++++++++++++++++ tests/test_series.py | 47 +++++++ 4 files changed, 291 insertions(+) diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index e87a30f7c..82ea50900 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -15,12 +15,18 @@ - fill_null - filter - is_between + - is_duplicated + - is_empty + - is_first_distinct - is_in + - is_last_distinct - is_null + - is_unique - max - mean - min - name + - null_count - n_unique - sample - shape diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index a36852afb..503bf7a65 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -439,6 +439,25 @@ def to_pandas(self) -> Any: msg = f"Unknown implementation: {self._implementation}" # pragma: no cover raise AssertionError(msg) + # --- descriptive --- + def is_duplicated(self: Self) -> Self: + return self._from_series(self._series.duplicated(keep=False)) + + def is_empty(self: Self) -> bool: + return self._series.empty # type: ignore[no-any-return] + + def is_unique(self: Self) -> Self: + return self._from_series(~self._series.duplicated(keep=False)) + + def null_count(self: Self) -> int: + return self._series.isnull().sum() # type: ignore[no-any-return] + + def is_first_distinct(self: Self) -> Self: + return self._from_series(~self._series.duplicated(keep="first")) + + def is_last_distinct(self: Self) -> Self: + return self._from_series(~self._series.duplicated(keep="last")) + @property def str(self) -> PandasSeriesStringNamespace: return PandasSeriesStringNamespace(self) diff --git a/narwhals/series.py b/narwhals/series.py index 9954f8906..b2e788b1c 100644 --- a/narwhals/series.py +++ 
b/narwhals/series.py @@ -987,6 +987,225 @@ def __invert__(self) -> Series: def filter(self, other: Any) -> Series: return self._from_series(self._series.filter(self._extract_native(other))) + # --- descriptive --- + def is_duplicated(self: Self) -> Series: + r""" + Get a mask of all duplicated rows in the Series. + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> s_pd = pd.Series([1, 2, 3, 1]) + >>> s_pl = pl.Series([1, 2, 3, 1]) + + Let's define a dataframe-agnostic function: + + >>> def func(s_any): + ... series = nw.from_native(s_any, allow_series=True) + ... duplicated = series.is_duplicated() + ... return nw.to_native(duplicated) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) # doctest: +NORMALIZE_WHITESPACE + 0 True + 1 False + 2 False + 3 True + dtype: bool + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (4,) + Series: '' [bool] + [ + true + false + false + true + ] + """ + return Series(self._series.is_duplicated()) + + def is_empty(self: Self) -> bool: + r""" + Check if the series is empty. + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + + Let's define a dataframe-agnostic function that filters rows in which "foo" + values are greater than 10, and then checks if the result is empty or not: + + >>> def func(s_any): + ... series = nw.from_native(s_any, allow_series=True) + ... return series.filter(series > 10).is_empty() + + We can then pass either pandas or Polars to `func`: + + >>> s_pd = pd.Series([1, 2, 3]) + >>> s_pl = pl.Series([1, 2, 3]) + >>> func(s_pd), func(s_pl) + (True, True) + + >>> s_pd = pd.Series([100, 2, 3]) + >>> s_pl = pl.Series([100, 2, 3]) + >>> func(s_pd), func(s_pl) + (False, False) + """ + return self._series.is_empty() # type: ignore[no-any-return] + + def is_unique(self: Self) -> Series: + r""" + Get a mask of all unique rows in the Series. 
+ + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> s_pd = pd.Series([1, 2, 3, 1]) + >>> s_pl = pl.Series([1, 2, 3, 1]) + + Let's define a dataframe-agnostic function: + + >>> def func(s_any): + ... series = nw.from_native(s_any, allow_series=True) + ... unique = series.is_unique() + ... return nw.to_native(unique) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) # doctest: +NORMALIZE_WHITESPACE + 0 False + 1 True + 2 True + 3 False + dtype: bool + + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (4,) + Series: '' [bool] + [ + false + true + true + false + ] + """ + return Series(self._series.is_unique()) + + def null_count(self: Self) -> int: + r""" + Count the number of null values in the Series. + + Notes: + pandas and Polars handle null values differently. Polars distinguishes + between NaN and Null, whereas pandas doesn't. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> s_pd = pd.Series([1, None, 3]) + >>> s_pl = pl.Series([1, None, None]) + + Let's define a dataframe-agnostic function that returns the null count of + the series: + + >>> def func(s_any): + ... series = nw.from_native(s_any, allow_series=True) + ... return series.null_count() + + We can then pass either pandas or Polars to `func`: + >>> func(s_pd) + 1 + >>> func(s_pl) + 2 + """ + + return self._series.null_count() # type: ignore[no-any-return] + + def is_first_distinct(self: Self) -> Series: + r""" + Return a boolean mask indicating the first occurrence of each distinct value. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> s_pd = pd.Series([1, 1, 2, 3, 2]) + >>> s_pl = pl.Series([1, 1, 2, 3, 2]) + + Let's define a dataframe-agnostic function: + + >>> def func(s_any): + ... series = nw.from_native(s_any, allow_series=True) + ... first_distinct = series.is_first_distinct() + ... 
return nw.to_native(first_distinct) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) # doctest: +NORMALIZE_WHITESPACE + 0 True + 1 False + 2 True + 3 True + 4 False + dtype: bool + + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + """ + return Series(self._series.is_first_distinct()) + + def is_last_distinct(self: Self) -> Series: + r""" + Return a boolean mask indicating the last occurrence of each distinct value. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> s_pd = pd.Series([1, 1, 2, 3, 2]) + >>> s_pl = pl.Series([1, 1, 2, 3, 2]) + + Let's define a dataframe-agnostic function: + + >>> def func(s_any): + ... series = nw.from_native(s_any, allow_series=True) + ... last_distinct = series.is_last_distinct() + ... return nw.to_native(last_distinct) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) # doctest: +NORMALIZE_WHITESPACE + 0 False + 1 True + 2 False + 3 True + 4 True + dtype: bool + + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + """ + return Series(self._series.is_last_distinct()) + @property def str(self) -> SeriesStringNamespace: return SeriesStringNamespace(self) diff --git a/tests/test_series.py b/tests/test_series.py index d6a33dd01..fa7b16a9f 100644 --- a/tests/test_series.py +++ b/tests/test_series.py @@ -321,3 +321,50 @@ def test_to_numpy() -> None: result = nw.Series(s).__array__() assert result.dtype == "float64" assert nw.Series(s).shape == (3,) + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_is_duplicated(df_raw: Any) -> None: + series = nw.Series(df_raw["b"]) + result = series.is_duplicated() # type: ignore [union-attr] + expected = np.array([True, True, False]) + assert (result.to_numpy() == expected).all() + + +@pytest.mark.parametrize("df_raw", 
[df_pandas, df_polars]) +@pytest.mark.parametrize(("threshold", "expected"), [(0, False), (10, True)]) +def test_is_empty(df_raw: Any, threshold: Any, expected: Any) -> None: + series = nw.Series(df_raw["a"]) + result = series.filter(series > threshold).is_empty() + assert result == expected + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_is_unique(df_raw: Any) -> None: + series = nw.Series(df_raw["b"]) + result = series.is_unique() # type: ignore [union-attr] + expected = np.array([False, False, True]) + assert (result.to_numpy() == expected).all() + + +@pytest.mark.parametrize("s_raw", [pd.Series([1, 2, None]), pl.Series([1, 2, None])]) +def test_null_count(s_raw: Any) -> None: + series = nw.Series(s_raw) + result = series.null_count() + assert result == 1 + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_is_first_distinct(df_raw: Any) -> None: + series = nw.Series(df_raw["b"]) + result = series.is_first_distinct() # type: ignore [union-attr] + expected = np.array([True, False, True]) + assert (result.to_numpy() == expected).all() + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_is_last_distinct(df_raw: Any) -> None: + series = nw.Series(df_raw["b"]) + result = series.is_last_distinct() # type: ignore [union-attr] + expected = np.array([False, True, True]) + assert (result.to_numpy() == expected).all() From f78d1d1f45cfb90e9192d272318ed0f5a24be340 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sat, 18 May 2024 16:23:28 +0200 Subject: [PATCH 2/6] add Expr --- docs/api-reference/expressions.md | 5 +++++ narwhals/_pandas_like/expr.py | 15 +++++++++++++++ narwhals/expression.py | 15 +++++++++++++++ tests/expr/is_duplicated_test.py | 24 ++++++++++++++++++++++++ tests/expr/is_first_distinct_test.py | 24 ++++++++++++++++++++++++ tests/expr/is_last_distinct_test.py | 24 ++++++++++++++++++++++++ tests/expr/is_unique_test.py | 24 ++++++++++++++++++++++++ tests/expr/null_count_test.py | 24 
++++++++++++++++++++++++ tests/test_series.py | 8 ++++---- utils/check_api_reference.py | 4 +++- 10 files changed, 162 insertions(+), 5 deletions(-) create mode 100644 tests/expr/is_duplicated_test.py create mode 100644 tests/expr/is_first_distinct_test.py create mode 100644 tests/expr/is_last_distinct_test.py create mode 100644 tests/expr/is_unique_test.py create mode 100644 tests/expr/null_count_test.py diff --git a/docs/api-reference/expressions.md b/docs/api-reference/expressions.md index aa4ab1e61..a889d5b71 100644 --- a/docs/api-reference/expressions.md +++ b/docs/api-reference/expressions.md @@ -14,11 +14,16 @@ - fill_null - filter - is_between + - is_duplicated + - is_first_distinct - is_in + - is_last_distinct - is_null + - is_unique - max - mean - min + - null_count - n_unique - over - unique diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index c3273e1b6..f90208248 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -253,6 +253,21 @@ def func(df: PandasDataFrame) -> list[PandasSeries]: implementation=self._implementation, ) + def is_duplicated(self) -> Self: + return register_expression_call(self, "is_duplicated") + + def is_unique(self) -> Self: + return register_expression_call(self, "is_unique") + + def null_count(self) -> Self: + return register_expression_call(self, "null_count") + + def is_first_distinct(self) -> Self: + return register_expression_call(self, "is_first_distinct") + + def is_last_distinct(self) -> Self: + return register_expression_call(self, "is_last_distinct") + @property def str(self) -> PandasExprStringNamespace: return PandasExprStringNamespace(self) diff --git a/narwhals/expression.py b/narwhals/expression.py index ad99e0aee..2254fa4e5 100644 --- a/narwhals/expression.py +++ b/narwhals/expression.py @@ -1122,6 +1122,21 @@ def over(self, *keys: str | Iterable[str]) -> Expr: """ return self.__class__(lambda plx: self._call(plx).over(flatten(keys))) + def is_duplicated(self) 
-> Expr: + return self.__class__(lambda plx: self._call(plx).is_duplicated()) + + def is_unique(self) -> Expr: + return self.__class__(lambda plx: self._call(plx).is_unique()) + + def null_count(self) -> Expr: + return self.__class__(lambda plx: self._call(plx).null_count()) + + def is_first_distinct(self) -> Expr: + return self.__class__(lambda plx: self._call(plx).is_first_distinct()) + + def is_last_distinct(self) -> Expr: + return self.__class__(lambda plx: self._call(plx).is_last_distinct()) + @property def str(self) -> ExprStringNamespace: return ExprStringNamespace(self) diff --git a/tests/expr/is_duplicated_test.py b/tests/expr/is_duplicated_test.py new file mode 100644 index 000000000..52e18f08a --- /dev/null +++ b/tests/expr/is_duplicated_test.py @@ -0,0 +1,24 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + +data = { + "a": [1, 1, 2], + "b": [1, 2, 3], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_is_duplicated(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.all().is_duplicated()) + expected = { + "a": [True, True, False], + "b": [False, False, False], + } + compare_dicts(result, expected) diff --git a/tests/expr/is_first_distinct_test.py b/tests/expr/is_first_distinct_test.py new file mode 100644 index 000000000..22208c402 --- /dev/null +++ b/tests/expr/is_first_distinct_test.py @@ -0,0 +1,24 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + +data = { + "a": [1, 1, 2, 3, 2], + "b": [1, 2, 3, 2, 1], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_is_first_distinct(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.all().is_first_distinct()) + expected = { 
+ "a": [True, False, True, True, False], + "b": [True, True, True, False, False], + } + compare_dicts(result, expected) diff --git a/tests/expr/is_last_distinct_test.py b/tests/expr/is_last_distinct_test.py new file mode 100644 index 000000000..984e2ee00 --- /dev/null +++ b/tests/expr/is_last_distinct_test.py @@ -0,0 +1,24 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + +data = { + "a": [1, 1, 2, 3, 2], + "b": [1, 2, 3, 2, 1], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_is_last_distinct(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.all().is_last_distinct()) + expected = { + "a": [False, True, False, True, True], + "b": [False, False, True, True, True], + } + compare_dicts(result, expected) diff --git a/tests/expr/is_unique_test.py b/tests/expr/is_unique_test.py new file mode 100644 index 000000000..7ba842add --- /dev/null +++ b/tests/expr/is_unique_test.py @@ -0,0 +1,24 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + +data = { + "a": [1, 1, 2], + "b": [1, 2, 3], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_is_unique(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.all().is_unique()) + expected = { + "a": [False, False, True], + "b": [True, True, True], + } + compare_dicts(result, expected) diff --git a/tests/expr/null_count_test.py b/tests/expr/null_count_test.py new file mode 100644 index 000000000..68b615585 --- /dev/null +++ b/tests/expr/null_count_test.py @@ -0,0 +1,24 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + +data = { + "a": [1.0, None, 
None, 3.0], + "b": [1.0, None, 4, 5.0], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_null_count(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.all().null_count()) + expected = { + "a": [2], + "b": [1], + } + compare_dicts(result, expected) diff --git a/tests/test_series.py b/tests/test_series.py index fa7b16a9f..413ea8bfb 100644 --- a/tests/test_series.py +++ b/tests/test_series.py @@ -326,7 +326,7 @@ def test_to_numpy() -> None: @pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) def test_is_duplicated(df_raw: Any) -> None: series = nw.Series(df_raw["b"]) - result = series.is_duplicated() # type: ignore [union-attr] + result = series.is_duplicated() expected = np.array([True, True, False]) assert (result.to_numpy() == expected).all() @@ -342,7 +342,7 @@ def test_is_empty(df_raw: Any, threshold: Any, expected: Any) -> None: @pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) def test_is_unique(df_raw: Any) -> None: series = nw.Series(df_raw["b"]) - result = series.is_unique() # type: ignore [union-attr] + result = series.is_unique() expected = np.array([False, False, True]) assert (result.to_numpy() == expected).all() @@ -357,7 +357,7 @@ def test_null_count(s_raw: Any) -> None: @pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) def test_is_first_distinct(df_raw: Any) -> None: series = nw.Series(df_raw["b"]) - result = series.is_first_distinct() # type: ignore [union-attr] + result = series.is_first_distinct() expected = np.array([True, False, True]) assert (result.to_numpy() == expected).all() @@ -365,6 +365,6 @@ def test_is_first_distinct(df_raw: Any) -> None: @pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) def test_is_last_distinct(df_raw: Any) -> None: series = nw.Series(df_raw["b"]) - result = series.is_last_distinct() # type: ignore [union-attr] + result = series.is_last_distinct() expected = np.array([False, True, True]) 
assert (result.to_numpy() == expected).all() diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index 9eac56dea..6972ff829 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -133,7 +133,9 @@ if ( extra := set(series) .difference(expr) - .difference({"to_pandas", "to_numpy", "dtype", "name", "shape", "to_frame"}) + .difference( + {"to_pandas", "to_numpy", "dtype", "name", "shape", "to_frame", "is_empty"} + ) ): print("in series but not in expr") # noqa: T201 print(extra) # noqa: T201 From d1735681c241f1194b3e0c11d2c3f8b7bb5a0d66 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sat, 18 May 2024 21:32:29 +0200 Subject: [PATCH 3/6] value_counts, is_sorted, and expr docstring --- docs/api-reference/series.md | 2 + narwhals/_pandas_like/series.py | 19 ++++ narwhals/expression.py | 193 ++++++++++++++++++++++++++++++++ narwhals/series.py | 84 ++++++++++++++ tests/test_series.py | 27 +++++ utils/check_api_reference.py | 13 ++- 6 files changed, 337 insertions(+), 1 deletion(-) diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index 82ea50900..d65262a5b 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -21,6 +21,7 @@ - is_in - is_last_distinct - is_null + - is_sorted - is_unique - max - mean @@ -38,5 +39,6 @@ - to_numpy - to_pandas - unique + - value_counts show_source: false show_bases: false diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 503bf7a65..47c340b99 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -458,6 +458,25 @@ def is_first_distinct(self: Self) -> Self: def is_last_distinct(self: Self) -> Self: return self._from_series(~self._series.duplicated(keep="last")) + def is_sorted(self: Self, *, descending: bool = False) -> bool: + if not isinstance(descending, bool): + msg = f"argument 'descending' should be boolean, found {type(descending)}" + raise TypeError(msg) + + if descending: + return 
self._series.is_monotonic_decreasing # type: ignore[no-any-return] + else: + return self._series.is_monotonic_increasing # type: ignore[no-any-return] + + def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) -> Any: + """Parallel is unused, exists for compatibility""" + from narwhals._pandas_like.dataframe import PandasDataFrame + + return PandasDataFrame( + self._series.value_counts(dropna=False, sort=sort).reset_index(), + implementation=self._implementation, + ) + @property def str(self) -> PandasSeriesStringNamespace: return PandasSeriesStringNamespace(self) diff --git a/narwhals/expression.py b/narwhals/expression.py index 2254fa4e5..6d11cd5e4 100644 --- a/narwhals/expression.py +++ b/narwhals/expression.py @@ -1123,18 +1123,211 @@ def over(self, *keys: str | Iterable[str]) -> Expr: return self.__class__(lambda plx: self._call(plx).over(flatten(keys))) def is_duplicated(self) -> Expr: + r""" + Return a boolean mask indicating duplicated values. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + Let's define a dataframe-agnostic function: + + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... duplicated = df.select(nw.all().is_duplicated()) + ... return nw.to_native(duplicated) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) # doctest: +NORMALIZE_WHITESPACE + a b + 0 True True + 1 False True + 2 False False + 3 True False + >>> func(df_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (4, 2) + ┌───────┬───────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ true ┆ true │ + │ false ┆ true │ + │ false ┆ false │ + │ true ┆ false │ + └───────┴───────┘ + """ return self.__class__(lambda plx: self._call(plx).is_duplicated()) def is_unique(self) -> Expr: + r""" + Return a boolean mask indicating unique values. 
+ + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + Let's define a dataframe-agnostic function: + + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... unique = df.select(nw.all().is_unique()) + ... return nw.to_native(unique) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) # doctest: +NORMALIZE_WHITESPACE + a b + 0 False False + 1 True False + 2 True True + 3 False True + >>> func(df_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (4, 2) + ┌───────┬───────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ true ┆ false │ + │ true ┆ true │ + │ false ┆ true │ + └───────┴───────┘ + """ + return self.__class__(lambda plx: self._call(plx).is_unique()) def null_count(self) -> Expr: + r""" + Count null values. + + Notes: + pandas and Polars handle null values differently. Polars distinguishes + between NaN and Null, whereas pandas doesn't. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data = {"a": [1, 2, None, 1], "b": ["a", None, "b", None]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + Let's define a dataframe-agnostic function: + + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... nulls = df.select(nw.all().null_count()) + ... return nw.to_native(nulls) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) + a b + 0 1 2 + >>> func(df_pl) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + """ return self.__class__(lambda plx: self._call(plx).null_count()) def is_first_distinct(self) -> Expr: + r""" + Return a boolean mask indicating the first occurrence of each distinct value. 
+ + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + Let's define a dataframe-agnostic function: + + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... first_distinct = df.select(nw.all().is_first_distinct()) + ... return nw.to_native(first_distinct) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) # doctest: +NORMALIZE_WHITESPACE + a b + 0 True True + 1 True False + 2 True True + 3 False True + >>> func(df_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (4, 2) + ┌───────┬───────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + │ true ┆ true │ + │ false ┆ true │ + └───────┴───────┘ + """ return self.__class__(lambda plx: self._call(plx).is_first_distinct()) def is_last_distinct(self) -> Expr: + r"""Return a boolean mask indicating the last occurrence of each distinct value. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + Let's define a dataframe-agnostic function: + + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... last_distinct = df.select(nw.all().is_last_distinct()) + ... 
return nw.to_native(last_distinct) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) # doctest: +NORMALIZE_WHITESPACE + a b + 0 False False + 1 True True + 2 True True + 3 True True + >>> func(df_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (4, 2) + ┌───────┬───────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ true ┆ true │ + │ true ┆ true │ + │ true ┆ true │ + └───────┴───────┘ + """ return self.__class__(lambda plx: self._call(plx).is_last_distinct()) @property diff --git a/narwhals/series.py b/narwhals/series.py index b2e788b1c..f9d9df9e1 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -991,6 +991,7 @@ def filter(self, other: Any) -> Series: def is_duplicated(self: Self) -> Series: r""" Get a mask of all duplicated rows in the Series. + Examples: >>> import narwhals as nw >>> import pandas as pd @@ -1028,6 +1029,7 @@ def is_duplicated(self: Self) -> Series: def is_empty(self: Self) -> bool: r""" Check if the series is empty. + Examples: >>> import narwhals as nw >>> import pandas as pd @@ -1206,6 +1208,88 @@ def is_last_distinct(self: Self) -> Series: """ return Series(self._series.is_last_distinct()) + def is_sorted(self: Self, *, descending: bool = False) -> bool: + r""" + Check if the Series is sorted. + + Arguments: + descending: Check if the Series is sorted in descending order. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> unsorted_data = [1, 3, 2] + >>> sorted_data = [3, 2, 1] + + Let's define a dataframe-agnostic function: + + >>> def func(s_any, descending=False): + ... series = nw.from_native(s_any, allow_series=True) + ... 
return series.is_sorted(descending=descending) + + We can then pass either pandas or Polars to `func`: + + >>> func(pl.Series(unsorted_data)) + False + >>> func(pl.Series(sorted_data), descending=True) + True + >>> func(pd.Series(unsorted_data)) + False + >>> func(pd.Series(sorted_data), descending=True) + True + """ + return self._series.is_sorted(descending=descending) # type: ignore[no-any-return] + + def value_counts( + self: Self, *, sort: bool = False, parallel: bool = False + ) -> DataFrame: + r""" + Count the occurrences of unique values. + + Arguments: + sort: Sort the output by count in descending order. If set to False (default), + the order of the output is random. + parallel: Execute the computation in parallel. Unused for pandas-like APIs. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> s_pd = pd.Series([1, 1, 2, 3, 2], name="s") + >>> s_pl = pl.Series(values=[1, 1, 2, 3, 2], name="s") + + Let's define a dataframe-agnostic function: + + >>> def func(s_any): + ... series = nw.from_native(s_any, allow_series=True) + ... val_count = series.value_counts(sort=True) + ... 
return nw.to_native(val_count) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) # doctest: +NORMALIZE_WHITESPACE + s count + 0 1 2 + 1 2 2 + 2 3 1 + + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (3, 2) + ┌─────┬───────┐ + │ s ┆ count │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪═══════╡ + │ 1 ┆ 2 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴───────┘ + """ + from narwhals.dataframe import DataFrame + + return DataFrame(self._series.value_counts(sort=sort, parallel=parallel)) + @property def str(self) -> SeriesStringNamespace: return SeriesStringNamespace(self) diff --git a/tests/test_series.py b/tests/test_series.py index 413ea8bfb..1de79e494 100644 --- a/tests/test_series.py +++ b/tests/test_series.py @@ -368,3 +368,30 @@ def test_is_last_distinct(df_raw: Any) -> None: result = series.is_last_distinct() expected = np.array([False, True, True]) assert (result.to_numpy() == expected).all() + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_value_counts(df_raw: Any) -> None: + series = nw.Series(df_raw["b"]) + result = series.value_counts(sort=True) + expected = np.array([[4, 2], [6, 1]]) + assert (result.to_numpy() == expected).all() + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +@pytest.mark.parametrize( + ("col", "descending", "expected"), + [("a", False, False), ("z", False, True), ("z", True, False)], +) +def test_is_sorted(df_raw: Any, col: str, descending: bool, expected: bool) -> None: # noqa: FBT001 + series = nw.Series(df_raw[col]) + result = series.is_sorted(descending=descending) + assert result == expected + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_is_sorted_invalid(df_raw: Any) -> None: + series = nw.Series(df_raw["z"]) + + with pytest.raises(TypeError): + series.is_sorted(descending="invalid_type") # type: ignore[arg-type] diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index 6972ff829..e1f74ac2b 100644 --- 
a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -126,6 +126,7 @@ series = [ i for i in nw.Series(pl.Series()).__dir__() if not i[0].isupper() and i[0] != "_" ] + if missing := set(expr).difference(series).difference({"over"}): print("In expr but not in series") # noqa: T201 print(missing) # noqa: T201 @@ -134,7 +135,17 @@ extra := set(series) .difference(expr) .difference( - {"to_pandas", "to_numpy", "dtype", "name", "shape", "to_frame", "is_empty"} + { + "to_pandas", + "to_numpy", + "dtype", + "name", + "shape", + "to_frame", + "is_empty", + "is_sorted", + "value_counts", + } ) ): print("in series but not in expr") # noqa: T201 From 15c7dfd3e57a33ce9e6166b2d60434691e16ad90 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sun, 19 May 2024 11:19:01 +0200 Subject: [PATCH 4/6] pandas value count fix? --- narwhals/_pandas_like/series.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 47c340b99..dea54d695 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -472,8 +472,14 @@ def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) -> A """Parallel is unused, exists for compatibility""" from narwhals._pandas_like.dataframe import PandasDataFrame + name_ = self._series.name or "index" + val_count = self._series.value_counts(dropna=False, sort=False).reset_index() + val_count.columns = [name_, "count"] + if sort: + val_count = val_count.sort_values("count", ascending=False) + return PandasDataFrame( - self._series.value_counts(dropna=False, sort=sort).reset_index(), + val_count, implementation=self._implementation, ) From cf0dcf1978e3679b05883c2cacd327c8f76851a5 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sun, 19 May 2024 11:31:35 +0200 Subject: [PATCH 5/6] un sorted test --- tests/test_series.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/test_series.py b/tests/test_series.py
index 1de79e494..bcc702c4f 100644 --- a/tests/test_series.py +++ b/tests/test_series.py @@ -373,9 +373,18 @@ def test_is_last_distinct(df_raw: Any) -> None: @pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) def test_value_counts(df_raw: Any) -> None: series = nw.Series(df_raw["b"]) - result = series.value_counts(sort=True) + sorted_result = series.value_counts(sort=True) + assert sorted_result.columns == ["b", "count"] + expected = np.array([[4, 2], [6, 1]]) - assert (result.to_numpy() == expected).all() + assert (sorted_result.to_numpy() == expected).all() + + unsorted_result = series.value_counts(sort=False) + assert unsorted_result.columns == ["b", "count"] + + a = unsorted_result.to_numpy() + + assert (a[a[:, 0].argsort()] == expected).all() @pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) From 88bccb61bb9696d87e9b6af04ad525ef0a021942 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Sun, 19 May 2024 13:59:29 +0200 Subject: [PATCH 6/6] explicit name is None check --- narwhals/_pandas_like/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index dea54d695..819492bfe 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -472,7 +472,7 @@ def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) -> A """Parallel is unused, exists for compatibility""" from narwhals._pandas_like.dataframe import PandasDataFrame - name_ = self._series.name or "index" + name_ = "index" if self._series.name is None else self._series.name val_count = self._series.value_counts(dropna=False, sort=False).reset_index() val_count.columns = [name_, "count"] if sort: