Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: descriptive #159

Merged
merged 3 commits into from
May 13, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/api-reference/dataframe.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,12 @@
- filter
- group_by
- head
- is_duplicated
- is_empty
- is_unique
- join
- lazy
- null_count
- pipe
- rename
- schema
Expand Down
26 changes: 26 additions & 0 deletions narwhals/_pandas_like/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,3 +263,29 @@ def to_pandas(self) -> Any:
if self._implementation == "modin": # pragma: no cover
return self._dataframe._to_pandas()
return self._dataframe.to_pandas() # pragma: no cover

# --- descriptive ---
def is_duplicated(self: Self) -> PandasSeries:
    """Return a boolean mask flagging the rows that have a duplicate.

    The native ``duplicated`` call is made with ``keep=False`` and its
    result is wrapped in a ``PandasSeries`` that carries this frame's
    implementation tag.
    """
    from narwhals._pandas_like.series import PandasSeries

    duplicated_mask = self._dataframe.duplicated(keep=False)
    return PandasSeries(duplicated_mask, implementation=self._implementation)

def is_empty(self: Self) -> bool:
return self._dataframe.empty # type: ignore[no-any-return]

def is_unique(self: Self) -> PandasSeries:
    """Return a boolean mask flagging the rows that have no duplicate.

    This is the element-wise negation of the native ``duplicated`` mask
    computed with ``keep=False``, wrapped in a ``PandasSeries`` that
    carries this frame's implementation tag.
    """
    from narwhals._pandas_like.series import PandasSeries

    unique_mask = ~self._dataframe.duplicated(keep=False)
    return PandasSeries(unique_mask, implementation=self._implementation)

def null_count(self: Self) -> PandasDataFrame:
    """Return a one-row frame holding the number of nulls in each column.

    Summing the null mask along axis 0 yields a Series, so the result is
    turned back into a frame and transposed before being wrapped in a
    ``PandasDataFrame`` with this frame's implementation tag.
    """
    nulls_per_column = self._dataframe.isnull().sum(axis=0)
    single_row_frame = nulls_per_column.to_frame().transpose()
    return PandasDataFrame(single_row_frame, implementation=self._implementation)
Comment on lines +288 to +291
Copy link
Member Author

@FBruzzesi FBruzzesi May 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All this to return a dataframe (instead of a series) as polars does.
I double checked modin and cudf docs, both support to_frame and transpose so it should be fine.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the fact that pandas reduces these operations to Series kind of annoys me

looks good, thanks

149 changes: 149 additions & 0 deletions narwhals/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1357,6 +1357,155 @@ def join(
"""
return super().join(other, how=how, left_on=left_on, right_on=right_on)

# --- descriptive ---
def is_duplicated(self: Self) -> Series:
    r"""
    Return a boolean mask marking every row that has a duplicate in this DataFrame.

    The mask is computed by the backend dataframe's ``is_duplicated`` and
    wrapped in a narwhals ``Series``.

    Examples:
        >>> import narwhals as nw
        >>> import polars as pl
        >>> import pandas as pd

        >>> df_pl = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 3, 1],
        ...         "b": ["x", "y", "z", "x"],
        ...     }
        ... )
        >>> nw.to_native(nw.from_native(df_pl).is_duplicated())  # doctest: +NORMALIZE_WHITESPACE
        shape: (4,)
        Series: '' [bool]
        [
            true
            false
            false
            true
        ]

        >>> df_pd = pd.DataFrame(
        ...     {
        ...         "a": [1, 2, 3, 1],
        ...         "b": ["x", "y", "z", "x"],
        ...     }
        ... )
        >>> nw.to_native(nw.from_native(df_pd).is_duplicated())
        0     True
        1    False
        2    False
        3     True
        dtype: bool
    """
    from narwhals.series import Series

    backend_mask = self._dataframe.is_duplicated()
    return Series(backend_mask)

def is_empty(self: Self) -> bool:
r"""
Check if the dataframe is empty.

Examples:
>>> import narwhals as nw
>>> import polars as pl
>>> import pandas as pd

>>> df_pl = nw.from_native(pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}))
>>> df_pl.is_empty()
False
>>> df_pl.filter(nw.col("foo") > 99).is_empty()
True

>>> df_pd = nw.from_native(pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}))
>>> df_pd.is_empty()
False
>>> df_pd.filter(nw.col("foo") > 99).is_empty()
True
"""

return self._dataframe.is_empty() # type: ignore[no-any-return]

def is_unique(self: Self) -> Series:
    r"""
    Return a boolean mask marking every row that has no duplicate in this DataFrame.

    The mask is computed by the backend dataframe's ``is_unique`` and
    wrapped in a narwhals ``Series``.

    Examples:
        >>> import narwhals as nw
        >>> import polars as pl
        >>> import pandas as pd

        >>> df_pl = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 3, 1],
        ...         "b": ["x", "y", "z", "x"],
        ...     }
        ... )
        >>> nw.to_native(nw.from_native(df_pl).is_unique())  # doctest: +NORMALIZE_WHITESPACE
        shape: (4,)
        Series: '' [bool]
        [
            false
            true
            true
            false
        ]

        >>> df_pd = pd.DataFrame(
        ...     {
        ...         "a": [1, 2, 3, 1],
        ...         "b": ["x", "y", "z", "x"],
        ...     }
        ... )
        >>> nw.to_native(nw.from_native(df_pd).is_unique())
        0    False
        1     True
        2     True
        3    False
        dtype: bool
    """
    from narwhals.series import Series

    backend_mask = self._dataframe.is_unique()
    return Series(backend_mask)

def null_count(self: Self) -> DataFrame:
r"""
Create a new DataFrame that shows the null counts per column.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could we add a note here on how pandas and Polars treat null values differently? some other docstrings have it

Copy link
Member Author

@FBruzzesi FBruzzesi May 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the note here and adjusted all the examples using func

HypePartyGIF


Examples:
>>> import narwhals as nw
>>> import polars as pl
>>> import pandas as pd

>>> df_pl = pl.DataFrame(
... {
... "foo": [1, None, 3],
... "bar": [6, 7, None],
... "ham": ["a", "b", "c"],
... }
... )
>>> nw.to_native(nw.from_native(df_pl).null_count())
shape: (1, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ u32 ┆ u32 ┆ u32 │
╞═════╪═════╪═════╡
│ 1 ┆ 1 ┆ 0 │
└─────┴─────┴─────┘

>>> df_pd = pd.DataFrame(
... {
... "foo": [1, None, 3],
... "bar": [6, 7, None],
... "ham": ["a", "b", "c"],
... }
... )
>>> nw.to_native(nw.from_native(df_pd).null_count())
foo bar ham
0 1 1 0
"""

return DataFrame(self._dataframe.null_count())


class LazyFrame(BaseFrame):
r"""
Expand Down
Loading