Skip to content

Commit

Permalink
feat: join suffix (#934)
Browse files Browse the repository at this point in the history
  • Loading branch information
FBruzzesi authored Sep 9, 2024
1 parent 9246f11 commit 5dc4300
Show file tree
Hide file tree
Showing 8 changed files with 80 additions and 34 deletions.
5 changes: 3 additions & 2 deletions narwhals/_arrow/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ def join(
how: Literal["left", "inner", "outer", "cross", "anti", "semi"] = "inner",
left_on: str | list[str] | None,
right_on: str | list[str] | None,
suffix: str,
) -> Self:
how_to_join_map = {
"anti": "left anti",
Expand All @@ -298,7 +299,7 @@ def join(
keys=key_token,
right_keys=key_token,
join_type="inner",
right_suffix="_right",
right_suffix=suffix,
)
.drop([key_token]),
)
Expand All @@ -309,7 +310,7 @@ def join(
keys=left_on,
right_keys=right_on,
join_type=how_to_join_map[how],
right_suffix="_right",
right_suffix=suffix,
),
)

Expand Down
4 changes: 3 additions & 1 deletion narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,9 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]:
)
raise ValueError(msg)
tmp = df.group_by(*keys).agg(self)
tmp = df.select(*keys).join(tmp, how="left", left_on=keys, right_on=keys)
tmp = df.select(*keys).join(
tmp, how="left", left_on=keys, right_on=keys, suffix="_right"
)
return [tmp[name] for name in self._output_names]

return self.__class__(
Expand Down
7 changes: 4 additions & 3 deletions narwhals/_dask/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ def join(
how: Literal["left", "inner", "outer", "cross", "anti", "semi"] = "inner",
left_on: str | list[str] | None,
right_on: str | list[str] | None,
suffix: str,
) -> Self:
if how == "cross":
key_token = generate_unique_token(
Expand All @@ -221,7 +222,7 @@ def join(
how="inner",
left_on=key_token,
right_on=key_token,
suffixes=("", "_right"),
suffixes=("", suffix),
)
.drop(columns=key_token),
)
Expand Down Expand Up @@ -273,7 +274,7 @@ def join(
how="left",
left_on=left_on,
right_on=right_on,
suffixes=("", "_right"),
suffixes=("", suffix),
)
extra = []
for left_key, right_key in zip(left_on, right_on): # type: ignore[arg-type]
Expand All @@ -289,7 +290,7 @@ def join(
left_on=left_on,
right_on=right_on,
how=how,
suffixes=("", "_right"),
suffixes=("", suffix),
),
)

Expand Down
2 changes: 1 addition & 1 deletion narwhals/_dask/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,7 +629,7 @@ def func(df: DaskLazyFrame) -> list[Any]:
tmp = df.group_by(*keys).agg(self)
tmp_native = (
df.select(*keys)
.join(tmp, how="left", left_on=keys, right_on=keys)
.join(tmp, how="left", left_on=keys, right_on=keys, suffix="_right")
._native_frame
)
return [tmp_native[name] for name in self._output_names]
Expand Down
11 changes: 6 additions & 5 deletions narwhals/_pandas_like/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,7 @@ def join(
how: Literal["left", "inner", "outer", "cross", "anti", "semi"] = "inner",
left_on: str | list[str] | None,
right_on: str | list[str] | None,
suffix: str,
) -> Self:
if isinstance(left_on, str):
left_on = [left_on]
Expand All @@ -427,7 +428,7 @@ def join(
how="inner",
left_on=key_token,
right_on=key_token,
suffixes=("", "_right"),
suffixes=("", suffix),
)
.drop(columns=key_token),
)
Expand All @@ -436,7 +437,7 @@ def join(
self._native_frame.merge(
other._native_frame,
how="cross",
suffixes=("", "_right"),
suffixes=("", suffix),
),
)

Expand Down Expand Up @@ -488,14 +489,14 @@ def join(
how="left",
left_on=left_on,
right_on=right_on,
suffixes=("", "_right"),
suffixes=("", suffix),
)
extra = []
for left_key, right_key in zip(left_on, right_on): # type: ignore[arg-type]
if right_key != left_key and right_key not in self.columns:
extra.append(right_key)
elif right_key != left_key:
extra.append(f"{right_key}_right")
extra.append(f"{right_key}{suffix}")
return self._from_native_frame(result_native.drop(columns=extra))

return self._from_native_frame(
Expand All @@ -504,7 +505,7 @@ def join(
left_on=left_on,
right_on=right_on,
how=how,
suffixes=("", "_right"),
suffixes=("", suffix),
),
)

Expand Down
4 changes: 3 additions & 1 deletion narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,9 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
)
raise ValueError(msg)
tmp = df.group_by(*keys).agg(self)
tmp = df.select(*keys).join(tmp, how="left", left_on=keys, right_on=keys)
tmp = df.select(*keys).join(
tmp, how="left", left_on=keys, right_on=keys, suffix="_right"
)
return [tmp[name] for name in self._output_names]

return self.__class__(
Expand Down
46 changes: 25 additions & 21 deletions narwhals/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,11 +182,12 @@ def sort(
def join(
self,
other: Self,
*,
on: str | list[str] | None = None,
how: Literal["inner", "left", "cross", "semi", "anti"] = "inner",
*,
left_on: str | list[str] | None = None,
right_on: str | list[str] | None = None,
on: str | list[str] | None = None,
suffix: str = "_right",
) -> Self:
_supported_joins = ("inner", "left", "cross", "anti", "semi")

Expand Down Expand Up @@ -219,6 +220,7 @@ def join(
how=how,
left_on=left_on,
right_on=right_on,
suffix=suffix,
)
)

Expand Down Expand Up @@ -1850,30 +1852,29 @@ def sort(
def join(
self,
other: Self,
*,
on: str | list[str] | None = None,
how: Literal["inner", "left", "cross", "semi", "anti"] = "inner",
*,
left_on: str | list[str] | None = None,
right_on: str | list[str] | None = None,
on: str | list[str] | None = None,
suffix: str = "_right",
) -> Self:
r"""
Join in SQL-like fashion.
Arguments:
other: DataFrame to join with.
other: DataFrame to join with.
on: Name(s) of the join columns in both DataFrames. If set, `left_on` and
`right_on` should be None.
how: Join strategy.
* *inner*: Returns rows that have matching values in both tables.
* *left*: Returns all rows from the left table, and the matched rows from the right table.
* *cross*: Returns the Cartesian product of rows from both tables.
* *semi*: Filter rows that have a match in the right table.
* *anti*: Filter rows that do not have a match in the right table.
left_on: Name(s) of the left join column(s).
right_on: Name(s) of the right join column(s).
on: Join column of both DataFrames. If set, left_on and right_on should be None.
left_on: Join column of the left DataFrame.
right_on: Join column of the right DataFrame.
suffix: Suffix to append to columns with a duplicate name.
Returns:
A new joined DataFrame
Expand Down Expand Up @@ -1922,7 +1923,9 @@ def join(
│ 2 ┆ 7.0 ┆ b ┆ y │
└─────┴─────┴─────┴───────┘
"""
return super().join(other, how=how, left_on=left_on, right_on=right_on, on=on)
return super().join(
other, how=how, left_on=left_on, right_on=right_on, on=on, suffix=suffix
)

def join_asof(
self,
Expand Down Expand Up @@ -3578,30 +3581,29 @@ def sort(
def join(
self,
other: Self,
*,
on: str | list[str] | None = None,
how: Literal["inner", "left", "cross", "semi", "anti"] = "inner",
*,
left_on: str | list[str] | None = None,
right_on: str | list[str] | None = None,
on: str | list[str] | None = None,
suffix: str = "_right",
) -> Self:
r"""
Add a join operation to the Logical Plan.
Arguments:
other: Lazy DataFrame to join with.
on: Name(s) of the join columns in both DataFrames. If set, `left_on` and
`right_on` should be None.
how: Join strategy.
* *inner*: Returns rows that have matching values in both tables.
* *left*: Returns all rows from the left table, and the matched rows from the right table.
* *cross*: Returns the Cartesian product of rows from both tables.
* *semi*: Filter rows that have a match in the right table.
* *anti*: Filter rows that do not have a match in the right table.
left_on: Join column of the left DataFrame.
right_on: Join column of the right DataFrame.
on: Join column of both DataFrames. If set, left_on and right_on should be None.
suffix: Suffix to append to columns with a duplicate name.
Returns:
A new joined LazyFrame
Expand Down Expand Up @@ -3650,7 +3652,9 @@ def join(
│ 2 ┆ 7.0 ┆ b ┆ y │
└─────┴─────┴─────┴───────┘
"""
return super().join(other, how=how, left_on=left_on, right_on=right_on, on=on)
return super().join(
other, how=how, left_on=left_on, right_on=right_on, on=on, suffix=suffix
)

def join_asof(
self,
Expand Down
35 changes: 35 additions & 0 deletions tests/frame/join_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,41 @@ def test_cross_join(constructor: Any) -> None:
df.join(df, how="cross", left_on="antananarivo") # type: ignore[arg-type]


@pytest.mark.parametrize("how", ["inner", "left"])
@pytest.mark.parametrize("suffix", ["_right", "_custom_suffix"])
def test_suffix(constructor: Any, how: str, suffix: str) -> None:
    """Self-join on two key columns and check the suffix on the clashing column."""
    frame = nw.from_native(
        constructor(
            {
                "antananarivo": [1, 3, 2],
                "bob": [4, 4, 6],
                "zorro": [7.0, 8, 9],
            }
        )
    )
    joined = frame.join(
        frame,  # type: ignore[arg-type]
        left_on=["antananarivo", "bob"],
        right_on=["antananarivo", "bob"],
        how=how,  # type: ignore[arg-type]
        suffix=suffix,
    )
    # "zorro" is the only non-key column, so it is the only name expected to
    # come back twice: once as-is and once with the suffix appended.
    expected_names = ["antananarivo", "bob", "zorro", f"zorro{suffix}"]
    assert joined.collect_schema().names() == expected_names


@pytest.mark.parametrize("suffix", ["_right", "_custom_suffix"])
def test_cross_join_suffix(constructor: Any, suffix: str) -> None:
    """Cross-join a frame with itself and check the suffixed right-hand column."""
    right_name = f"antananarivo{suffix}"
    frame = nw.from_native(constructor({"antananarivo": [1, 3, 2]}))
    crossed = frame.join(frame, how="cross", suffix=suffix)  # type: ignore[arg-type]
    # Sort on both columns so the comparison does not depend on row order.
    ordered = crossed.sort("antananarivo", right_name)
    compare_dicts(
        ordered,
        {
            "antananarivo": [1, 1, 1, 2, 2, 2, 3, 3, 3],
            right_name: [1, 2, 3, 1, 2, 3, 1, 2, 3],
        },
    )


def test_cross_join_non_pandas() -> None:
data = {"antananarivo": [1, 3, 2]}
df = nw.from_native(pd.DataFrame(data))
Expand Down

0 comments on commit 5dc4300

Please sign in to comment.