Skip to content

Commit

Permalink
Merge pull request #191 from narwhals-dev/by-dtype
Browse files Browse the repository at this point in the history
add By dtype selector (flixbus PR)
  • Loading branch information
MarcoGorelli authored May 20, 2024
2 parents 457256a + b7e8d86 commit 44772d3
Show file tree
Hide file tree
Showing 7 changed files with 185 additions and 2 deletions.
11 changes: 11 additions & 0 deletions docs/api-reference/selectors.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# `narwhals.selectors`

::: narwhals.selectors
    handler: python
    options:
      members:
        - by_dtype
        - numeric
      show_root_heading: false
      show_source: false
      show_bases: false
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ nav:
- api-reference/expressions_str.md
- api-reference/dtypes.md
- api-reference/dependencies.md
- api-reference/selectors.md
theme:
name: material
font: false
Expand Down
2 changes: 1 addition & 1 deletion narwhals/_pandas_like/group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def func(df: Any) -> Any:
for result_keys in results_keys:
out_group.append(item(result_keys._series))
out_names.append(result_keys.name)
return plx.make_native_series(name="", data=out_group, index=out_names)
return plx._make_native_series(name="", data=out_group, index=out_names)

if implementation == "pandas":
import pandas as pd
Expand Down
7 changes: 6 additions & 1 deletion narwhals/_pandas_like/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from narwhals import dtypes
from narwhals._pandas_like.dataframe import PandasDataFrame
from narwhals._pandas_like.expr import PandasExpr
from narwhals._pandas_like.selectors import PandasSelector
from narwhals._pandas_like.series import PandasSeries
from narwhals._pandas_like.utils import horizontal_concat
from narwhals._pandas_like.utils import parse_into_exprs
Expand Down Expand Up @@ -36,7 +37,11 @@ class PandasNamespace:
String = dtypes.String
Datetime = dtypes.Datetime

def make_native_series(self, name: str, data: list[Any], index: Any) -> Any:
@property
def selectors(self) -> PandasSelector:
    """Return a `PandasSelector` built from this namespace's implementation."""
    implementation = self._implementation
    return PandasSelector(implementation)

def _make_native_series(self, name: str, data: list[Any], index: Any) -> Any:
if self._implementation == "pandas":
import pandas as pd

Expand Down
28 changes: 28 additions & 0 deletions narwhals/_pandas_like/selectors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from narwhals._pandas_like.expr import PandasExpr

if TYPE_CHECKING:
from narwhals._pandas_like.dataframe import PandasDataFrame
from narwhals._pandas_like.series import PandasSeries
from narwhals.dtypes import DType


class PandasSelector:
    """Factory for column-selecting expressions on pandas-like dataframes."""

    def __init__(self, implementation: str) -> None:
        # Stored so that every expression built here carries the same
        # implementation string.
        self._implementation = implementation

    def by_dtype(self, dtypes: list[DType]) -> PandasExpr:
        """
        Build an expression that selects columns whose dtype is in `dtypes`.

        Arguments:
            dtypes: data types to match against each column's schema entry.

        Returns:
            A `PandasExpr` which, when evaluated on a dataframe, yields the
            matching columns in the dataframe's column order.
        """

        def select_matching(df: PandasDataFrame) -> list[PandasSeries]:
            matched = []
            for column_name in df.columns:
                if df.schema[column_name] in dtypes:
                    matched.append(df[column_name])
            return matched

        # Root and output names are left as None: which columns are selected
        # is only known once the expression is evaluated on a dataframe.
        return PandasExpr(
            select_matching,
            depth=0,
            function_name="type_selector",
            root_names=None,
            output_names=None,
            implementation=self._implementation,
        )
110 changes: 110 additions & 0 deletions narwhals/selectors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from __future__ import annotations

from typing import Any

from narwhals import dtypes
from narwhals.dtypes import translate_dtype
from narwhals.expression import Expr
from narwhals.utils import flatten


def by_dtype(*dtypes: Any) -> Expr:
    """
    Select columns based on their dtype.

    Arguments:
        dtypes: one or more data types to select. They may be passed
            individually or grouped in a list, since the arguments are
            flattened before use.

    Examples:
        >>> import narwhals as nw
        >>> import narwhals.selectors as ncs
        >>> import pandas as pd
        >>> import polars as pl
        >>>
        >>> data = {'a': [1, 2], 'b': ['x', 'y'], 'c': [4.1, 2.3]}
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)

        Let's define a dataframe-agnostic function to select int64 and float64
        dtypes and multiply each value by 2:

        >>> def func(df_any):
        ...     df = nw.from_native(df_any)
        ...     df = df.select(ncs.by_dtype(nw.Int64, nw.Float64)*2)
        ...     return nw.to_native(df)

        We can then pass either pandas or Polars dataframes:

        >>> func(df_pd)
           a    c
        0  2  8.2
        1  4  4.6
        >>> func(df_pl)
        shape: (2, 2)
        ┌─────┬─────┐
        │ a   ┆ c   │
        │ --- ┆ --- │
        │ i64 ┆ f64 │
        ╞═════╪═════╡
        │ 2   ┆ 8.2 │
        │ 4   ┆ 4.6 │
        └─────┴─────┘
    """

    def _select(plx: Any) -> Any:
        # Translate each requested dtype for the given namespace before
        # delegating to that namespace's own selector.
        translated = [translate_dtype(plx, dtype) for dtype in flatten(dtypes)]
        return plx.selectors.by_dtype(translated)

    return Expr(_select)


def numeric() -> Expr:
    """
    Select numeric columns.

    Equivalent to calling `by_dtype` with the signed integer, unsigned
    integer and floating-point dtypes listed in the function body.

    Examples:
        >>> import narwhals as nw
        >>> import narwhals.selectors as ncs
        >>> import pandas as pd
        >>> import polars as pl
        >>>
        >>> data = {'a': [1, 2], 'b': ['x', 'y'], 'c': [4.1, 2.3]}
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)

        Let's define a dataframe-agnostic function to select numeric
        dtypes and multiply each value by 2:

        >>> def func(df_any):
        ...     df = nw.from_native(df_any)
        ...     df = df.select(ncs.numeric()*2)
        ...     return nw.to_native(df)

        We can then pass either pandas or Polars dataframes:

        >>> func(df_pd)
           a    c
        0  2  8.2
        1  4  4.6
        >>> func(df_pl)
        shape: (2, 2)
        ┌─────┬─────┐
        │ a   ┆ c   │
        │ --- ┆ --- │
        │ i64 ┆ f64 │
        ╞═════╪═════╡
        │ 2   ┆ 8.2 │
        │ 4   ┆ 4.6 │
        └─────┴─────┘
    """
    return by_dtype(
        dtypes.Int64,
        dtypes.Int32,
        dtypes.Int16,
        dtypes.Int8,
        dtypes.UInt64,
        dtypes.UInt32,
        dtypes.UInt16,
        dtypes.UInt8,
        dtypes.Float64,
        dtypes.Float32,
    )
28 changes: 28 additions & 0 deletions tests/selectors_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from typing import Any

import pandas as pd
import polars as pl
import pytest

import narwhals as nw
from narwhals.selectors import by_dtype
from narwhals.selectors import numeric
from tests.utils import compare_dicts

data = {"a": [1, 1, 2], "b": ["a", "b", "c"], "c": [4.0, 5.0, 6.0]}


@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame])
def test_selecctors(constructor: Any) -> None:
    """`by_dtype` given a list of dtypes keeps only the int and float columns."""
    frame = nw.from_native(constructor(data))
    selection = by_dtype([nw.Int64, nw.Float64]) + 1
    result = nw.to_native(frame.select(selection))
    # Column "b" holds strings, so it must be absent from the result.
    compare_dicts(result, {"a": [2, 2, 3], "c": [5.0, 6.0, 7.0]})


@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame])
def test_numeric(constructor: Any) -> None:
    """`numeric` keeps the int and float columns and drops the string column."""
    frame = nw.from_native(constructor(data))
    selection = numeric() + 1
    result = nw.to_native(frame.select(selection))
    compare_dicts(result, {"a": [2, 2, 3], "c": [5.0, 6.0, 7.0]})

0 comments on commit 44772d3

Please sign in to comment.