feat: join suffix (#934)

narwhals-dev · Sep 9, 2024 · 5dc4300 · 5dc4300
1 parent 9246f11
commit 5dc4300
Show file tree

Hide file tree

Showing 8 changed files with 80 additions and 34 deletions.
diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py
@@ -277,6 +277,7 @@ def join(
         how: Literal["left", "inner", "outer", "cross", "anti", "semi"] = "inner",
         left_on: str | list[str] | None,
         right_on: str | list[str] | None,
+        suffix: str,
     ) -> Self:
         how_to_join_map = {
             "anti": "left anti",
@@ -298,7 +299,7 @@ def join(
                     keys=key_token,
                     right_keys=key_token,
                     join_type="inner",
-                    right_suffix="_right",
+                    right_suffix=suffix,
                 )
                 .drop([key_token]),
             )
@@ -309,7 +310,7 @@ def join(
                 keys=left_on,
                 right_keys=right_on,
                 join_type=how_to_join_map[how],
-                right_suffix="_right",
+                right_suffix=suffix,
             ),
         )
 

diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py
@@ -309,7 +309,9 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]:
                 )
                 raise ValueError(msg)
             tmp = df.group_by(*keys).agg(self)
-            tmp = df.select(*keys).join(tmp, how="left", left_on=keys, right_on=keys)
+            tmp = df.select(*keys).join(
+                tmp, how="left", left_on=keys, right_on=keys, suffix="_right"
+            )
             return [tmp[name] for name in self._output_names]
 
         return self.__class__(

diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py
@@ -208,6 +208,7 @@ def join(
         how: Literal["left", "inner", "outer", "cross", "anti", "semi"] = "inner",
         left_on: str | list[str] | None,
         right_on: str | list[str] | None,
+        suffix: str,
     ) -> Self:
         if how == "cross":
             key_token = generate_unique_token(
@@ -221,7 +222,7 @@ def join(
                     how="inner",
                     left_on=key_token,
                     right_on=key_token,
-                    suffixes=("", "_right"),
+                    suffixes=("", suffix),
                 )
                 .drop(columns=key_token),
             )
@@ -273,7 +274,7 @@ def join(
                 how="left",
                 left_on=left_on,
                 right_on=right_on,
-                suffixes=("", "_right"),
+                suffixes=("", suffix),
             )
             extra = []
             for left_key, right_key in zip(left_on, right_on):  # type: ignore[arg-type]
@@ -289,7 +290,7 @@ def join(
                 left_on=left_on,
                 right_on=right_on,
                 how=how,
-                suffixes=("", "_right"),
+                suffixes=("", suffix),
             ),
         )
 

diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py
@@ -629,7 +629,7 @@ def func(df: DaskLazyFrame) -> list[Any]:
             tmp = df.group_by(*keys).agg(self)
             tmp_native = (
                 df.select(*keys)
-                .join(tmp, how="left", left_on=keys, right_on=keys)
+                .join(tmp, how="left", left_on=keys, right_on=keys, suffix="_right")
                 ._native_frame
             )
             return [tmp_native[name] for name in self._output_names]

diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py
@@ -403,6 +403,7 @@ def join(
         how: Literal["left", "inner", "outer", "cross", "anti", "semi"] = "inner",
         left_on: str | list[str] | None,
         right_on: str | list[str] | None,
+        suffix: str,
     ) -> Self:
         if isinstance(left_on, str):
             left_on = [left_on]
@@ -427,7 +428,7 @@ def join(
                         how="inner",
                         left_on=key_token,
                         right_on=key_token,
-                        suffixes=("", "_right"),
+                        suffixes=("", suffix),
                     )
                     .drop(columns=key_token),
                 )
@@ -436,7 +437,7 @@ def join(
                     self._native_frame.merge(
                         other._native_frame,
                         how="cross",
-                        suffixes=("", "_right"),
+                        suffixes=("", suffix),
                     ),
                 )
 
@@ -488,14 +489,14 @@ def join(
                 how="left",
                 left_on=left_on,
                 right_on=right_on,
-                suffixes=("", "_right"),
+                suffixes=("", suffix),
             )
             extra = []
             for left_key, right_key in zip(left_on, right_on):  # type: ignore[arg-type]
                 if right_key != left_key and right_key not in self.columns:
                     extra.append(right_key)
                 elif right_key != left_key:
-                    extra.append(f"{right_key}_right")
+                    extra.append(f"{right_key}{suffix}")
             return self._from_native_frame(result_native.drop(columns=extra))
 
         return self._from_native_frame(
@@ -504,7 +505,7 @@ def join(
                 left_on=left_on,
                 right_on=right_on,
                 how=how,
-                suffixes=("", "_right"),
+                suffixes=("", suffix),
             ),
         )
 

diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py
@@ -287,7 +287,9 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
                 )
                 raise ValueError(msg)
             tmp = df.group_by(*keys).agg(self)
-            tmp = df.select(*keys).join(tmp, how="left", left_on=keys, right_on=keys)
+            tmp = df.select(*keys).join(
+                tmp, how="left", left_on=keys, right_on=keys, suffix="_right"
+            )
             return [tmp[name] for name in self._output_names]
 
         return self.__class__(

diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py
@@ -182,11 +182,12 @@ def sort(
     def join(
         self,
         other: Self,
-        *,
+        on: str | list[str] | None = None,
         how: Literal["inner", "left", "cross", "semi", "anti"] = "inner",
+        *,
         left_on: str | list[str] | None = None,
         right_on: str | list[str] | None = None,
-        on: str | list[str] | None = None,
+        suffix: str = "_right",
     ) -> Self:
         _supported_joins = ("inner", "left", "cross", "anti", "semi")
 
@@ -219,6 +220,7 @@ def join(
                 how=how,
                 left_on=left_on,
                 right_on=right_on,
+                suffix=suffix,
             )
         )
 
@@ -1850,30 +1852,29 @@ def sort(
     def join(
         self,
         other: Self,
-        *,
+        on: str | list[str] | None = None,
         how: Literal["inner", "left", "cross", "semi", "anti"] = "inner",
+        *,
         left_on: str | list[str] | None = None,
         right_on: str | list[str] | None = None,
-        on: str | list[str] | None = None,
+        suffix: str = "_right",
     ) -> Self:
         r"""
         Join in SQL-like fashion.
 
         Arguments:
-            other: DataFrame to join with.
-
+            other: DataFrame to join with.
+            on: Name(s) of the join columns in both DataFrames. If set, `left_on` and
+                `right_on` should be None.
             how: Join strategy.
 
                   * *inner*: Returns rows that have matching values in both tables.
                   * *cross*: Returns the Cartesian product of rows from both tables.
                   * *semi*: Filter rows that have a match in the right table.
                   * *anti*: Filter rows that do not have a match in the right table.
-
-            left_on: Name(s) of the left join column(s).
-
-            right_on: Name(s) of the right join column(s).
-
-            on: Join column of both DataFrames. If set, left_on and right_on should be None.
+            left_on: Join column of the left DataFrame.
+            right_on: Join column of the right DataFrame.
+            suffix: Suffix to append to columns with a duplicate name.
 
         Returns:
             A new joined DataFrame
@@ -1922,7 +1923,9 @@ def join(
             │ 2   ┆ 7.0 ┆ b   ┆ y     │
             └─────┴─────┴─────┴───────┘
         """
-        return super().join(other, how=how, left_on=left_on, right_on=right_on, on=on)
+        return super().join(
+            other, how=how, left_on=left_on, right_on=right_on, on=on, suffix=suffix
+        )
 
     def join_asof(
         self,
@@ -3578,30 +3581,29 @@ def sort(
     def join(
         self,
         other: Self,
-        *,
+        on: str | list[str] | None = None,
         how: Literal["inner", "left", "cross", "semi", "anti"] = "inner",
+        *,
         left_on: str | list[str] | None = None,
         right_on: str | list[str] | None = None,
-        on: str | list[str] | None = None,
+        suffix: str = "_right",
     ) -> Self:
         r"""
         Add a join operation to the Logical Plan.
 
         Arguments:
             other: Lazy DataFrame to join with.
-
+            on: Name(s) of the join columns in both DataFrames. If set, `left_on` and
+                `right_on` should be None.
             how: Join strategy.
 
                   * *inner*: Returns rows that have matching values in both tables.
                   * *cross*: Returns the Cartesian product of rows from both tables.
                   * *semi*: Filter rows that have a match in the right table.
                   * *anti*: Filter rows that do not have a match in the right table.
-
             left_on: Join column of the left DataFrame.
-
             right_on: Join column of the right DataFrame.
-
-            on: Join column of both DataFrames. If set, left_on and right_on should be None.
+            suffix: Suffix to append to columns with a duplicate name.
 
         Returns:
             A new joined LazyFrame
@@ -3650,7 +3652,9 @@ def join(
             │ 2   ┆ 7.0 ┆ b   ┆ y     │
             └─────┴─────┴─────┴───────┘
         """
-        return super().join(other, how=how, left_on=left_on, right_on=right_on, on=on)
+        return super().join(
+            other, how=how, left_on=left_on, right_on=right_on, on=on, suffix=suffix
+        )
 
     def join_asof(
         self,

diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py
@@ -89,6 +89,41 @@ def test_cross_join(constructor: Any) -> None:
         df.join(df, how="cross", left_on="antananarivo")  # type: ignore[arg-type]
 
 
+@pytest.mark.parametrize("how", ["inner", "left"])
+@pytest.mark.parametrize("suffix", ["_right", "_custom_suffix"])
+def test_suffix(constructor: Any, how: str, suffix: str) -> None:
+    data = {
+        "antananarivo": [1, 3, 2],
+        "bob": [4, 4, 6],
+        "zorro": [7.0, 8, 9],
+    }
+    df = nw.from_native(constructor(data))
+    df_right = df  # self-join: the non-key column "zorro" exists on both sides
+    result = df.join(
+        df_right,  # type: ignore[arg-type]
+        left_on=["antananarivo", "bob"],
+        right_on=["antananarivo", "bob"],
+        how=how,  # type: ignore[arg-type]
+        suffix=suffix,
+    )
+    result_cols = result.collect_schema().names()  # only column names are checked
+    assert result_cols == ["antananarivo", "bob", "zorro", f"zorro{suffix}"]
+
+
+@pytest.mark.parametrize("suffix", ["_right", "_custom_suffix"])
+def test_cross_join_suffix(constructor: Any, suffix: str) -> None:
+    data = {"antananarivo": [1, 3, 2]}  # one column, duplicated by the self cross join
+    df = nw.from_native(constructor(data))
+    result = df.join(df, how="cross", suffix=suffix).sort(  # type: ignore[arg-type]
+        "antananarivo", f"antananarivo{suffix}"  # sort on both columns for a fixed row order
+    )
+    expected = {  # 3 x 3 Cartesian product; right-hand column carries the suffix
+        "antananarivo": [1, 1, 1, 2, 2, 2, 3, 3, 3],
+        f"antananarivo{suffix}": [1, 2, 3, 1, 2, 3, 1, 2, 3],
+    }
+    compare_dicts(result, expected)
+
+
 def test_cross_join_non_pandas() -> None:
     data = {"antananarivo": [1, 3, 2]}
     df = nw.from_native(pd.DataFrame(data))