FIX-#7250: Revert "PERF-#6666: Avoid internal reset_index for left merge" (#7251)

Signed-off-by: Anatoly Myachev <[email protected]>
modin-project · May 13, 2024 · 1f06e70
1 parent 78dd171
commit 1f06e70
Show file tree

Hide file tree

Showing 4 changed files with 42 additions and 45 deletions.
diff --git a/modin/core/storage_formats/pandas/merge.py b/modin/core/storage_formats/pandas/merge.py
@@ -144,23 +144,8 @@ def should_keep_index(left, right):
                     )
                 return keep_index
 
-            def map_func(
-                left, right, *axis_lengths, kwargs=kwargs, **service_kwargs
-            ):  # pragma: no cover
-                df = pandas.merge(left, right, **kwargs)
-
-                if kwargs["how"] == "left":
-                    partition_idx = service_kwargs["partition_idx"]
-                    if len(axis_lengths):
-                        if not should_keep_index(left, right):
-                            # Doesn't work for "inner" case, since the partition sizes of the
-                            # left dataframe may change
-                            start = sum(axis_lengths[:partition_idx])
-                            stop = sum(axis_lengths[: partition_idx + 1])
-
-                            df.index = pandas.RangeIndex(start, stop)
-
-                return df
+            def map_func(left, right):  # pragma: no cover
+                return pandas.merge(left, right, **kwargs)
 
             # Want to ensure that these are python lists
             if left_on is not None and right_on is not None:
@@ -188,7 +173,6 @@ def map_func(
                 left._modin_frame.broadcast_apply_full_axis(
                     axis=1,
                     func=map_func,
-                    enumerate_partitions=how == "left",
                     other=right_to_broadcast,
                     # We're going to explicitly change the shape across the 1-axis,
                     # so we want for partitioning to adapt as well
@@ -199,7 +183,6 @@ def map_func(
                     new_columns=new_columns,
                     sync_labels=False,
                     dtypes=new_dtypes,
-                    pass_axis_lengths_to_partitions=how == "left",
                 )
             )
 
@@ -238,11 +221,7 @@ def map_func(
                         else new_left.sort_rows_by_column_values(on)
                     )
 
-            return (
-                new_left.reset_index(drop=True)
-                if not keep_index and (kwargs["how"] != "left" or sort)
-                else new_left
-            )
+            return new_left if keep_index else new_left.reset_index(drop=True)
         else:
             return left.default_to_pandas(pandas.DataFrame.merge, right, **kwargs)
 

diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py
@@ -1994,8 +1994,6 @@ def sort_rows(self, columns, ascending, ignore_index, na_position):
                 drop_index_cols_after = [
                     col for col in base._index_cols if col in columns
                 ]
-                if not drop_index_cols_after:
-                    drop_index_cols_after = None
 
                 if drop_index_cols_before:
                     exprs = dict()

diff --git a/modin/tests/pandas/dataframe/test_join_sort.py b/modin/tests/pandas/dataframe/test_join_sort.py
@@ -230,20 +230,20 @@ def test_join_6602():
     "test_data, test_data2",
     [
         (
-            np.random.uniform(0, 100, size=(2**6, 2**6)),
-            np.random.uniform(0, 100, size=(2**7, 2**6)),
+            np.random.randint(0, 100, size=(64, 64)),
+            np.random.randint(0, 100, size=(128, 64)),
         ),
         (
-            np.random.uniform(0, 100, size=(2**7, 2**6)),
-            np.random.uniform(0, 100, size=(2**6, 2**6)),
+            np.random.randint(0, 100, size=(128, 64)),
+            np.random.randint(0, 100, size=(64, 64)),
         ),
         (
-            np.random.uniform(0, 100, size=(2**6, 2**6)),
-            np.random.uniform(0, 100, size=(2**6, 2**7)),
+            np.random.randint(0, 100, size=(64, 64)),
+            np.random.randint(0, 100, size=(64, 128)),
         ),
         (
-            np.random.uniform(0, 100, size=(2**6, 2**7)),
-            np.random.uniform(0, 100, size=(2**6, 2**6)),
+            np.random.randint(0, 100, size=(64, 128)),
+            np.random.randint(0, 100, size=(64, 64)),
         ),
     ],
 )
@@ -280,7 +280,9 @@ def test_merge(test_data, test_data2):
             pandas_result = pandas_df.merge(
                 pandas_df2, how=hows[i], on=ons[j], sort=sorts[j]
             )
-            sort_if_range_partitioning(modin_result, pandas_result)
+            sort_if_range_partitioning(
+                modin_result, pandas_result, force=StorageFormat.get() == "Hdk"
+            )
 
             modin_result = modin_df.merge(
                 modin_df2,
@@ -296,7 +298,9 @@ def test_merge(test_data, test_data2):
                 right_on="key",
                 sort=sorts[j],
             )
-            sort_if_range_partitioning(modin_result, pandas_result)
+            sort_if_range_partitioning(
+                modin_result, pandas_result, force=StorageFormat.get() == "Hdk"
+            )
 
     # Test for issue #1771
     modin_df = pd.DataFrame({"name": np.arange(40)})
@@ -305,7 +309,9 @@ def test_merge(test_data, test_data2):
     pandas_df2 = pandas.DataFrame({"name": [39], "position": [0]})
     modin_result = modin_df.merge(modin_df2, on="name", how="inner")
     pandas_result = pandas_df.merge(pandas_df2, on="name", how="inner")
-    sort_if_range_partitioning(modin_result, pandas_result)
+    sort_if_range_partitioning(
+        modin_result, pandas_result, force=StorageFormat.get() == "Hdk"
+    )
 
     frame_data = {
         "col1": [0, 1, 2, 3],
@@ -326,7 +332,9 @@ def test_merge(test_data, test_data2):
         # Defaults
         modin_result = modin_df.merge(modin_df2, how=how)
         pandas_result = pandas_df.merge(pandas_df2, how=how)
-        sort_if_range_partitioning(modin_result, pandas_result)
+        sort_if_range_partitioning(
+            modin_result, pandas_result, force=StorageFormat.get() == "Hdk"
+        )
 
         # left_on and right_index
         modin_result = modin_df.merge(
@@ -335,7 +343,9 @@ def test_merge(test_data, test_data2):
         pandas_result = pandas_df.merge(
             pandas_df2, how=how, left_on="col1", right_index=True
         )
-        sort_if_range_partitioning(modin_result, pandas_result)
+        sort_if_range_partitioning(
+            modin_result, pandas_result, force=StorageFormat.get() == "Hdk"
+        )
 
         # left_index and right_on
         modin_result = modin_df.merge(
@@ -344,7 +354,9 @@ def test_merge(test_data, test_data2):
         pandas_result = pandas_df.merge(
             pandas_df2, how=how, left_index=True, right_on="col1"
         )
-        sort_if_range_partitioning(modin_result, pandas_result)
+        sort_if_range_partitioning(
+            modin_result, pandas_result, force=StorageFormat.get() == "Hdk"
+        )
 
         # left_on and right_on col1
         modin_result = modin_df.merge(
@@ -353,7 +365,9 @@ def test_merge(test_data, test_data2):
         pandas_result = pandas_df.merge(
             pandas_df2, how=how, left_on="col1", right_on="col1"
         )
-        sort_if_range_partitioning(modin_result, pandas_result)
+        sort_if_range_partitioning(
+            modin_result, pandas_result, force=StorageFormat.get() == "Hdk"
+        )
 
         # left_on and right_on col2
         modin_result = modin_df.merge(
@@ -362,7 +376,9 @@ def test_merge(test_data, test_data2):
         pandas_result = pandas_df.merge(
             pandas_df2, how=how, left_on="col2", right_on="col2"
         )
-        sort_if_range_partitioning(modin_result, pandas_result)
+        sort_if_range_partitioning(
+            modin_result, pandas_result, force=StorageFormat.get() == "Hdk"
+        )
 
         # left_index and right_index
         modin_result = modin_df.merge(
@@ -371,7 +387,9 @@ def test_merge(test_data, test_data2):
         pandas_result = pandas_df.merge(
             pandas_df2, how=how, left_index=True, right_index=True
         )
-        sort_if_range_partitioning(modin_result, pandas_result)
+        sort_if_range_partitioning(
+            modin_result, pandas_result, force=StorageFormat.get() == "Hdk"
+        )
 
     # Cannot merge a Series without a name
     ps = pandas.Series(frame_data2.get("col1"))
@@ -382,6 +400,7 @@ def test_merge(test_data, test_data2):
         lambda df: df.merge(ms if isinstance(df, pd.DataFrame) else ps),
         comparator=sort_if_range_partitioning,
         expected_exception=ValueError("Cannot merge a Series without a name"),
+        comparator_kwargs={"force": StorageFormat.get() == "Hdk"},
     )
 
     # merge a Series with a name
@@ -392,6 +411,7 @@ def test_merge(test_data, test_data2):
         pandas_df,
         lambda df: df.merge(ms if isinstance(df, pd.DataFrame) else ps),
         comparator=sort_if_range_partitioning,
+        comparator_kwargs={"force": StorageFormat.get() == "Hdk"},
     )
 
     with pytest.raises(TypeError):

diff --git a/modin/tests/pandas/utils.py b/modin/tests/pandas/utils.py
@@ -697,12 +697,12 @@ def sort_data(data):
         return np.sort(data)
 
 
-def sort_if_range_partitioning(df1, df2, comparator=None):
+def sort_if_range_partitioning(df1, df2, comparator=None, force=False):
     """Sort the passed objects if 'RangePartitioning' is enabled and compare the sorted results."""
     if comparator is None:
         comparator = df_equals
 
-    if RangePartitioning.get() or use_range_partitioning_groupby():
+    if force or (RangePartitioning.get() or use_range_partitioning_groupby()):
         df1, df2 = sort_data(df1), sort_data(df2)
 
     comparator(df1, df2)