def is_duplicated(self) -> Self:
    """Build the expression by passing the method name "is_duplicated" to `register_expression_call`."""
    method_name = "is_duplicated"
    return register_expression_call(self, method_name)


def is_unique(self) -> Self:
    """Build the expression by passing the method name "is_unique" to `register_expression_call`."""
    method_name = "is_unique"
    return register_expression_call(self, method_name)


def null_count(self) -> Self:
    """Build the expression by passing the method name "null_count" to `register_expression_call`."""
    method_name = "null_count"
    return register_expression_call(self, method_name)


def is_first_distinct(self) -> Self:
    """Build the expression by passing the method name "is_first_distinct" to `register_expression_call`."""
    method_name = "is_first_distinct"
    return register_expression_call(self, method_name)


def is_last_distinct(self) -> Self:
    """Build the expression by passing the method name "is_last_distinct" to `register_expression_call`."""
    method_name = "is_last_distinct"
    return register_expression_call(self, method_name)
# --- descriptive ---
def is_duplicated(self: Self) -> Self:
    """Return a boolean mask that is True for every value occurring more than once."""
    # keep=False flags *all* members of a duplicate group, not only the repeats.
    mask = self._series.duplicated(keep=False)
    return self._from_series(mask)


def is_empty(self: Self) -> bool:
    """Return True when the underlying series holds no elements."""
    return self._series.empty  # type: ignore[no-any-return]


def is_unique(self: Self) -> Self:
    """Return a boolean mask that is True for values occurring exactly once."""
    # Logical complement of `is_duplicated`.
    mask = ~self._series.duplicated(keep=False)
    return self._from_series(mask)


def null_count(self: Self) -> int:
    """Return the number of missing values as a built-in ``int``."""
    # `Series.sum()` yields a NumPy integer; convert it so the annotated
    # return type is actually honoured.
    return int(self._series.isnull().sum())


def is_first_distinct(self: Self) -> Self:
    """Return a boolean mask that is True at the first occurrence of each distinct value."""
    mask = ~self._series.duplicated(keep="first")
    return self._from_series(mask)


def is_last_distinct(self: Self) -> Self:
    """Return a boolean mask that is True at the last occurrence of each distinct value."""
    mask = ~self._series.duplicated(keep="last")
    return self._from_series(mask)


def is_sorted(self: Self, *, descending: bool = False) -> bool:
    """Check whether the series is monotonic in the requested direction.

    Arguments:
        descending: Check for decreasing instead of increasing order.

    Raises:
        TypeError: If `descending` is not a bool.
    """
    if not isinstance(descending, bool):
        msg = f"argument 'descending' should be boolean, found {type(descending)}"
        raise TypeError(msg)

    if descending:
        return self._series.is_monotonic_decreasing  # type: ignore[no-any-return]
    return self._series.is_monotonic_increasing  # type: ignore[no-any-return]


def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) -> Any:
    """Count the occurrences of each value, nulls included.

    Arguments:
        sort: Order the result by count, largest first. When False the row
            order is whatever the native `value_counts(sort=False)` yields.
        parallel: Unused; accepted only for signature compatibility.

    Returns:
        A `PandasDataFrame` with two columns: the series name (or "index"
        when the series is unnamed) and "count".
    """
    from narwhals._pandas_like.dataframe import PandasDataFrame

    name_ = "index" if self._series.name is None else self._series.name
    val_count = self._series.value_counts(dropna=False, sort=False).reset_index()
    val_count.columns = [name_, "count"]
    if sort:
        # Sort on the counts (descending), not on the values themselves, so
        # that `sort=True` means the same thing as documented in the public API.
        val_count = val_count.sort_values("count", ascending=False)

    return PandasDataFrame(
        val_count,
        implementation=self._implementation,
    )
def is_duplicated(self) -> Expr:
    r"""
    Flag every value that occurs more than once.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]}
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)

        A dataframe-agnostic helper:

        >>> def func(df_any):
        ...     df = nw.from_native(df_any)
        ...     duplicated = df.select(nw.all().is_duplicated())
        ...     return nw.to_native(duplicated)

        It accepts pandas and Polars input alike:

        >>> func(df_pd)  # doctest: +NORMALIZE_WHITESPACE
               a      b
        0   True   True
        1  False   True
        2  False  False
        3   True  False
        >>> func(df_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (4, 2)
        ┌───────┬───────┐
        │ a     ┆ b     │
        │ ---   ┆ ---   │
        │ bool  ┆ bool  │
        ╞═══════╪═══════╡
        │ true  ┆ true  │
        │ false ┆ true  │
        │ false ┆ false │
        │ true  ┆ false │
        └───────┴───────┘
    """

    def _apply(plx):
        return self._call(plx).is_duplicated()

    return self.__class__(_apply)


def is_unique(self) -> Expr:
    r"""
    Flag every value that occurs exactly once.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]}
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)

        A dataframe-agnostic helper:

        >>> def func(df_any):
        ...     df = nw.from_native(df_any)
        ...     unique = df.select(nw.all().is_unique())
        ...     return nw.to_native(unique)

        It accepts pandas and Polars input alike:

        >>> func(df_pd)  # doctest: +NORMALIZE_WHITESPACE
               a      b
        0  False  False
        1   True  False
        2   True   True
        3  False   True
        >>> func(df_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (4, 2)
        ┌───────┬───────┐
        │ a     ┆ b     │
        │ ---   ┆ ---   │
        │ bool  ┆ bool  │
        ╞═══════╪═══════╡
        │ false ┆ false │
        │ true  ┆ false │
        │ true  ┆ true  │
        │ false ┆ true  │
        └───────┴───────┘
    """

    def _apply(plx):
        return self._call(plx).is_unique()

    return self.__class__(_apply)


def null_count(self) -> Expr:
    r"""
    Count the null values of each selected column.

    Notes:
        pandas and Polars handle null values differently. Polars distinguishes
        between NaN and Null, whereas pandas doesn't.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> data = {"a": [1, 2, None, 1], "b": ["a", None, "b", None]}
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)

        A dataframe-agnostic helper:

        >>> def func(df_any):
        ...     df = nw.from_native(df_any)
        ...     nulls = df.select(nw.all().null_count())
        ...     return nw.to_native(nulls)

        It accepts pandas and Polars input alike:

        >>> func(df_pd)  # doctest: +NORMALIZE_WHITESPACE
           a  b
        0  1  2
        >>> func(df_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (1, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ u32 ┆ u32 │
        ╞═════╪═════╡
        │ 1   ┆ 2   │
        └─────┴─────┘
    """

    def _apply(plx):
        return self._call(plx).null_count()

    return self.__class__(_apply)


def is_first_distinct(self) -> Expr:
    r"""
    Flag the first occurrence of each distinct value.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]}
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)

        A dataframe-agnostic helper:

        >>> def func(df_any):
        ...     df = nw.from_native(df_any)
        ...     first_distinct = df.select(nw.all().is_first_distinct())
        ...     return nw.to_native(first_distinct)

        It accepts pandas and Polars input alike:

        >>> func(df_pd)  # doctest: +NORMALIZE_WHITESPACE
               a      b
        0   True   True
        1   True  False
        2   True   True
        3  False   True
        >>> func(df_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (4, 2)
        ┌───────┬───────┐
        │ a     ┆ b     │
        │ ---   ┆ ---   │
        │ bool  ┆ bool  │
        ╞═══════╪═══════╡
        │ true  ┆ true  │
        │ true  ┆ false │
        │ true  ┆ true  │
        │ false ┆ true  │
        └───────┴───────┘
    """

    def _apply(plx):
        return self._call(plx).is_first_distinct()

    return self.__class__(_apply)


def is_last_distinct(self) -> Expr:
    r"""
    Flag the last occurrence of each distinct value.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]}
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)

        A dataframe-agnostic helper:

        >>> def func(df_any):
        ...     df = nw.from_native(df_any)
        ...     last_distinct = df.select(nw.all().is_last_distinct())
        ...     return nw.to_native(last_distinct)

        It accepts pandas and Polars input alike:

        >>> func(df_pd)  # doctest: +NORMALIZE_WHITESPACE
               a      b
        0  False  False
        1   True   True
        2   True   True
        3   True   True
        >>> func(df_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (4, 2)
        ┌───────┬───────┐
        │ a     ┆ b     │
        │ ---   ┆ ---   │
        │ bool  ┆ bool  │
        ╞═══════╪═══════╡
        │ false ┆ false │
        │ true  ┆ true  │
        │ true  ┆ true  │
        │ true  ┆ true  │
        └───────┴───────┘
    """

    def _apply(plx):
        return self._call(plx).is_last_distinct()

    return self.__class__(_apply)
# --- descriptive ---
def is_duplicated(self: Self) -> Series:
    r"""
    Return a boolean mask marking every value that occurs more than once.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> s_pd = pd.Series([1, 2, 3, 1])
        >>> s_pl = pl.Series([1, 2, 3, 1])

        A series-agnostic helper:

        >>> def func(s_any):
        ...     series = nw.from_native(s_any, allow_series=True)
        ...     duplicated = series.is_duplicated()
        ...     return nw.to_native(duplicated)

        It accepts pandas and Polars input alike:

        >>> func(s_pd)  # doctest: +NORMALIZE_WHITESPACE
        0     True
        1    False
        2    False
        3     True
        dtype: bool
        >>> func(s_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (4,)
        Series: '' [bool]
        [
            true
            false
            false
            true
        ]
    """
    mask = self._series.is_duplicated()
    return Series(mask)


def is_empty(self: Self) -> bool:
    r"""
    Check whether the Series contains no elements.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl

        A series-agnostic helper that keeps only the values greater than 10
        and then reports whether anything is left:

        >>> def func(s_any):
        ...     series = nw.from_native(s_any, allow_series=True)
        ...     return series.filter(series > 10).is_empty()

        It accepts pandas and Polars input alike:

        >>> s_pd = pd.Series([1, 2, 3])
        >>> s_pl = pl.Series([1, 2, 3])
        >>> func(s_pd), func(s_pl)
        (True, True)

        >>> s_pd = pd.Series([100, 2, 3])
        >>> s_pl = pl.Series([100, 2, 3])
        >>> func(s_pd), func(s_pl)
        (False, False)
    """
    empty = self._series.is_empty()
    return empty  # type: ignore[no-any-return]


def is_unique(self: Self) -> Series:
    r"""
    Return a boolean mask marking every value that occurs exactly once.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> s_pd = pd.Series([1, 2, 3, 1])
        >>> s_pl = pl.Series([1, 2, 3, 1])

        A series-agnostic helper:

        >>> def func(s_any):
        ...     series = nw.from_native(s_any, allow_series=True)
        ...     unique = series.is_unique()
        ...     return nw.to_native(unique)

        It accepts pandas and Polars input alike:

        >>> func(s_pd)  # doctest: +NORMALIZE_WHITESPACE
        0    False
        1     True
        2     True
        3    False
        dtype: bool

        >>> func(s_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (4,)
        Series: '' [bool]
        [
            false
            true
            true
            false
        ]
    """
    mask = self._series.is_unique()
    return Series(mask)


def null_count(self: Self) -> int:
    r"""
    Count the null values in the Series.

    Notes:
        pandas and Polars handle null values differently. Polars distinguishes
        between NaN and Null, whereas pandas doesn't.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> s_pd = pd.Series([1, None, 3])
        >>> s_pl = pl.Series([1, None, None])

        A series-agnostic helper that returns the null count:

        >>> def func(s_any):
        ...     series = nw.from_native(s_any, allow_series=True)
        ...     return series.null_count()

        It accepts pandas and Polars input alike:

        >>> func(s_pd)
        1
        >>> func(s_pl)
        2
    """
    count = self._series.null_count()
    return count  # type: ignore[no-any-return]


def is_first_distinct(self: Self) -> Series:
    r"""
    Return a boolean mask marking the first occurrence of each distinct value.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> s_pd = pd.Series([1, 1, 2, 3, 2])
        >>> s_pl = pl.Series([1, 1, 2, 3, 2])

        A series-agnostic helper:

        >>> def func(s_any):
        ...     series = nw.from_native(s_any, allow_series=True)
        ...     first_distinct = series.is_first_distinct()
        ...     return nw.to_native(first_distinct)

        It accepts pandas and Polars input alike:

        >>> func(s_pd)  # doctest: +NORMALIZE_WHITESPACE
        0     True
        1    False
        2     True
        3     True
        4    False
        dtype: bool

        >>> func(s_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (5,)
        Series: '' [bool]
        [
            true
            false
            true
            true
            false
        ]
    """
    mask = self._series.is_first_distinct()
    return Series(mask)


def is_last_distinct(self: Self) -> Series:
    r"""
    Return a boolean mask marking the last occurrence of each distinct value.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> s_pd = pd.Series([1, 1, 2, 3, 2])
        >>> s_pl = pl.Series([1, 1, 2, 3, 2])

        A series-agnostic helper:

        >>> def func(s_any):
        ...     series = nw.from_native(s_any, allow_series=True)
        ...     last_distinct = series.is_last_distinct()
        ...     return nw.to_native(last_distinct)

        It accepts pandas and Polars input alike:

        >>> func(s_pd)  # doctest: +NORMALIZE_WHITESPACE
        0    False
        1     True
        2    False
        3     True
        4     True
        dtype: bool

        >>> func(s_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (5,)
        Series: '' [bool]
        [
            false
            true
            false
            true
            true
        ]
    """
    mask = self._series.is_last_distinct()
    return Series(mask)


def is_sorted(self: Self, *, descending: bool = False) -> bool:
    r"""
    Check whether the Series is sorted.

    Arguments:
        descending: Check for descending instead of ascending order.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> unsorted_data = [1, 3, 2]
        >>> sorted_data = [3, 2, 1]

        A series-agnostic helper:

        >>> def func(s_any, descending=False):
        ...     series = nw.from_native(s_any, allow_series=True)
        ...     return series.is_sorted(descending=descending)

        It accepts pandas and Polars input alike:

        >>> func(pl.Series(unsorted_data))
        False
        >>> func(pl.Series(sorted_data), descending=True)
        True
        >>> func(pd.Series(unsorted_data))
        False
        >>> func(pd.Series(sorted_data), descending=True)
        True
    """
    ordered = self._series.is_sorted(descending=descending)
    return ordered  # type: ignore[no-any-return]


def value_counts(
    self: Self, *, sort: bool = False, parallel: bool = False
) -> DataFrame:
    r"""
    Count how often each distinct value occurs.

    Arguments:
        sort: Sort the output by count in descending order. If set to False (default),
            the order of the output is random.
        parallel: Execute the computation in parallel. Unused for pandas-like APIs.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> s_pd = pd.Series([1, 1, 2, 3, 2], name="s")
        >>> s_pl = pl.Series(values=[1, 1, 2, 3, 2], name="s")

        A series-agnostic helper:

        >>> def func(s_any):
        ...     series = nw.from_native(s_any, allow_series=True)
        ...     val_count = series.value_counts(sort=True)
        ...     return nw.to_native(val_count)

        It accepts pandas and Polars input alike:

        >>> func(s_pd)  # doctest: +NORMALIZE_WHITESPACE
           s  count
        0  1      2
        1  2      2
        2  3      1

        >>> func(s_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (3, 2)
        ┌─────┬───────┐
        │ s   ┆ count │
        │ --- ┆ ---   │
        │ i64 ┆ u32   │
        ╞═════╪═══════╡
        │ 1   ┆ 2     │
        │ 2   ┆ 2     │
        │ 3   ┆ 1     │
        └─────┴───────┘
    """
    from narwhals.dataframe import DataFrame

    counts = self._series.value_counts(sort=sort, parallel=parallel)
    return DataFrame(counts)
val_count = series.value_counts(sort=True) + ... return nw.to_native(val_count) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) # doctest: +NORMALIZE_WHITESPACE + s count + 0 1 2 + 1 2 2 + 2 3 1 + + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (3, 2) + ┌─────┬───────┐ + │ s ┆ count │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪═══════╡ + │ 1 ┆ 2 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴───────┘ + """ + from narwhals.dataframe import DataFrame + + return DataFrame(self._series.value_counts(sort=sort, parallel=parallel)) + @property def str(self) -> SeriesStringNamespace: return SeriesStringNamespace(self) diff --git a/tests/expr/is_duplicated_test.py b/tests/expr/is_duplicated_test.py new file mode 100644 index 000000000..52e18f08a --- /dev/null +++ b/tests/expr/is_duplicated_test.py @@ -0,0 +1,24 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + +data = { + "a": [1, 1, 2], + "b": [1, 2, 3], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_is_duplicated(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.all().is_duplicated()) + expected = { + "a": [True, True, False], + "b": [False, False, False], + } + compare_dicts(result, expected) diff --git a/tests/expr/is_first_distinct_test.py b/tests/expr/is_first_distinct_test.py new file mode 100644 index 000000000..22208c402 --- /dev/null +++ b/tests/expr/is_first_distinct_test.py @@ -0,0 +1,24 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + +data = { + "a": [1, 1, 2, 3, 2], + "b": [1, 2, 3, 2, 1], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_is_first_distinct(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result 
= df.select(nw.all().is_first_distinct()) + expected = { + "a": [True, False, True, True, False], + "b": [True, True, True, False, False], + } + compare_dicts(result, expected) diff --git a/tests/expr/is_last_distinct_test.py b/tests/expr/is_last_distinct_test.py new file mode 100644 index 000000000..984e2ee00 --- /dev/null +++ b/tests/expr/is_last_distinct_test.py @@ -0,0 +1,24 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + +data = { + "a": [1, 1, 2, 3, 2], + "b": [1, 2, 3, 2, 1], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_is_last_distinct(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.all().is_last_distinct()) + expected = { + "a": [False, True, False, True, True], + "b": [False, False, True, True, True], + } + compare_dicts(result, expected) diff --git a/tests/expr/is_unique_test.py b/tests/expr/is_unique_test.py new file mode 100644 index 000000000..7ba842add --- /dev/null +++ b/tests/expr/is_unique_test.py @@ -0,0 +1,24 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + +data = { + "a": [1, 1, 2], + "b": [1, 2, 3], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_is_unique(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.all().is_unique()) + expected = { + "a": [False, False, True], + "b": [True, True, True], + } + compare_dicts(result, expected) diff --git a/tests/expr/null_count_test.py b/tests/expr/null_count_test.py new file mode 100644 index 000000000..68b615585 --- /dev/null +++ b/tests/expr/null_count_test.py @@ -0,0 +1,24 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils 
import compare_dicts + +data = { + "a": [1.0, None, None, 3.0], + "b": [1.0, None, 4, 5.0], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_null_count(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.all().null_count()) + expected = { + "a": [2], + "b": [1], + } + compare_dicts(result, expected) diff --git a/tests/test_series.py b/tests/test_series.py index d6a33dd01..bcc702c4f 100644 --- a/tests/test_series.py +++ b/tests/test_series.py @@ -321,3 +321,86 @@ def test_to_numpy() -> None: result = nw.Series(s).__array__() assert result.dtype == "float64" assert nw.Series(s).shape == (3,) + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_is_duplicated(df_raw: Any) -> None: + series = nw.Series(df_raw["b"]) + result = series.is_duplicated() + expected = np.array([True, True, False]) + assert (result.to_numpy() == expected).all() + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +@pytest.mark.parametrize(("threshold", "expected"), [(0, False), (10, True)]) +def test_is_empty(df_raw: Any, threshold: Any, expected: Any) -> None: + series = nw.Series(df_raw["a"]) + result = series.filter(series > threshold).is_empty() + assert result == expected + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_is_unique(df_raw: Any) -> None: + series = nw.Series(df_raw["b"]) + result = series.is_unique() + expected = np.array([False, False, True]) + assert (result.to_numpy() == expected).all() + + +@pytest.mark.parametrize("s_raw", [pd.Series([1, 2, None]), pl.Series([1, 2, None])]) +def test_null_count(s_raw: Any) -> None: + series = nw.Series(s_raw) + result = series.null_count() + assert result == 1 + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_is_first_distinct(df_raw: Any) -> None: + series = nw.Series(df_raw["b"]) + result = series.is_first_distinct() + expected = np.array([True, False, True]) + 
assert (result.to_numpy() == expected).all() + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_is_last_distinct(df_raw: Any) -> None: + series = nw.Series(df_raw["b"]) + result = series.is_last_distinct() + expected = np.array([False, True, True]) + assert (result.to_numpy() == expected).all() + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_value_counts(df_raw: Any) -> None: + series = nw.Series(df_raw["b"]) + sorted_result = series.value_counts(sort=True) + assert sorted_result.columns == ["b", "count"] + + expected = np.array([[4, 2], [6, 1]]) + assert (sorted_result.to_numpy() == expected).all() + + unsorted_result = series.value_counts(sort=False) + assert unsorted_result.columns == ["b", "count"] + + a = unsorted_result.to_numpy() + + assert (a[a[:, 0].argsort()] == expected).all() + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +@pytest.mark.parametrize( + ("col", "descending", "expected"), + [("a", False, False), ("z", False, True), ("z", True, False)], +) +def test_is_sorted(df_raw: Any, col: str, descending: bool, expected: bool) -> None: # noqa: FBT001 + series = nw.Series(df_raw[col]) + result = series.is_sorted(descending=descending) + assert result == expected + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_is_sorted_invalid(df_raw: Any) -> None: + series = nw.Series(df_raw["z"]) + + with pytest.raises(TypeError): + series.is_sorted(descending="invalid_type") # type: ignore[arg-type] diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index 9eac56dea..e1f74ac2b 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -126,6 +126,7 @@ series = [ i for i in nw.Series(pl.Series()).__dir__() if not i[0].isupper() and i[0] != "_" ] + if missing := set(expr).difference(series).difference({"over"}): print("In expr but not in series") # noqa: T201 print(missing) # noqa: T201 @@ -133,7 +134,19 @@ if ( extra := set(series) 
.difference(expr) - .difference({"to_pandas", "to_numpy", "dtype", "name", "shape", "to_frame"}) + .difference( + { + "to_pandas", + "to_numpy", + "dtype", + "name", + "shape", + "to_frame", + "is_empty", + "is_sorted", + "value_counts", + } + ) ): print("in series but not in expr") # noqa: T201 print(extra) # noqa: T201