Move logic of sort_values into the query compiler (modin-project#1754)

* Move logic of `sort_values` into the query compiler.
  Signed-off-by: Devin Petersohn <[email protected]>
* Remove dead code.
  Signed-off-by: Devin Petersohn <[email protected]>
* Add back `"kind"` parameter.
  Signed-off-by: Devin Petersohn <[email protected]>
* Added a test for bug modin-project#1743. This test runs an in-place sort of a dataframe that has a non-numerical index.
  Signed-off-by: Gregory Shimansky <[email protected]>
* Apply suggestions from code review.
  Co-authored-by: anmyachev <[email protected]>
  Co-authored-by: Gregory Shimansky <[email protected]>
  Co-authored-by: anmyachev <[email protected]>
aregm · Sep 16, 2020 · e647d26 · e647d26
1 parent 8b46126
commit e647d26
Show file tree

Hide file tree

Showing 3 changed files with 106 additions and 47 deletions.
diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py
@@ -1874,3 +1874,80 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, item):
             item_to_distribute=broadcasted_items,
         )
         return self.__constructor__(new_modin_frame)
+
+    def sort_rows_by_column_values(self, columns, ascending=True, **kwargs):
+        """Reorder the rows based on the lexicographic order of the given columns.
+
+        Parameters
+        ----------
+        columns : scalar or list of scalar
+            The column or columns to sort by
+        ascending : bool
+            Sort in ascending order (True) or descending order (False)
+        **kwargs : dict
+            Optional sorting arguments. The keys read here are:
+
+            - ``na_position`` (default ``"last"``) and ``kind`` (default
+              ``"quicksort"``): forwarded to ``pandas.DataFrame.sort_values``.
+            - ``ignore_index`` (default ``False``): when True, the rows of the
+              result are labeled ``0, 1, ..., n - 1`` instead of keeping the
+              (reordered) original labels.
+
+        Returns
+        -------
+        PandasQueryCompiler
+            A new query compiler that contains result of the sort
+        """
+        na_position = kwargs.get("na_position", "last")
+        kind = kwargs.get("kind", "quicksort")
+        ignore_index = kwargs.get("ignore_index", False)
+        if not is_list_like(columns):
+            columns = [columns]
+        # Currently, sort_values will just reindex based on the sorted values.
+        # TODO create a more efficient way to sort
+        ErrorMessage.default_to_pandas("sort_values")
+        broadcast_value_dict = {
+            col: self.getitem_column_array([col]).to_pandas().squeeze(axis=1)
+            for col in columns
+        }
+        # Sort the key columns under a fresh positional index. The original index
+        # may contain duplicates, so it cannot be used to reindex the rows.
+        sort_keys = pandas.DataFrame(broadcast_value_dict).reset_index(drop=True)
+        new_positions = sort_keys.sort_values(
+            by=columns, axis=0, ascending=ascending, kind=kind, na_position=na_position,
+        ).index
+
+        result = self.reset_index(drop=True).reindex(0, new_positions)
+        if ignore_index:
+            # The caller asked for a default index rather than the sorted labels.
+            result.index = pandas.RangeIndex(len(new_positions))
+        else:
+            # Apply the same permutation to the original labels; this yields the
+            # index a label-based sort of the same key values would produce.
+            result.index = self.index.take(new_positions)
+        return result
+
+    def sort_columns_by_row_values(self, rows, ascending=True, **kwargs):
+        """Reorder the columns based on the lexicographic order of the given rows.
+
+        Parameters
+        ----------
+        rows : scalar or list of scalar
+            The row or rows to sort by
+        ascending : bool
+            Sort in ascending order (True) or descending order (False)
+        **kwargs : dict
+            Optional sorting arguments. The keys read here are:
+
+            - ``na_position`` (default ``"last"``) and ``kind`` (default
+              ``"quicksort"``): forwarded to ``pandas.DataFrame.sort_values``.
+            - ``ignore_index`` (default ``False``): when True, the columns of the
+              result are labeled ``0, 1, ..., n - 1`` instead of keeping the
+              (reordered) original labels.
+
+        Returns
+        -------
+        PandasQueryCompiler
+            A new query compiler that contains result of the sort
+        """
+        na_position = kwargs.get("na_position", "last")
+        kind = kwargs.get("kind", "quicksort")
+        ignore_index = kwargs.get("ignore_index", False)
+        if not is_list_like(rows):
+            rows = [rows]
+        ErrorMessage.default_to_pandas("sort_values")
+        # Materialize only the key rows in pandas and stack them into one frame.
+        broadcast_values = pandas.concat(
+            [self.getitem_row_array([row]).to_pandas() for row in rows], copy=False
+        )
+        broadcast_values.columns = self.columns
+        new_columns = broadcast_values.sort_values(
+            by=rows, axis=1, ascending=ascending, kind=kind, na_position=na_position,
+        ).columns
+        result = self.reindex(1, new_columns)
+        if ignore_index:
+            # The caller asked for default column labels rather than the sorted ones.
+            result.columns = pandas.RangeIndex(len(new_columns))
+        return result
diff --git a/modin/pandas/base.py b/modin/pandas/base.py
@@ -2791,59 +2791,24 @@ def sort_values(
              A sorted DataFrame.
         """
         axis = self._get_axis_number(axis)
-        if not is_list_like(by):
-            by = [by]
-        # Currently, sort_values will just reindex based on the sorted values.
-        # TODO create a more efficient way to sort
-        ErrorMessage.default_to_pandas("sort_values")
+        inplace = validate_bool_kwarg(inplace, "inplace")
         if axis == 0:
-            broadcast_value_dict = {col: self[col]._to_pandas() for col in by}
-            # Index may contain duplicates
-            broadcast_values1 = pandas.DataFrame(broadcast_value_dict, index=self.index)
-            # Index without duplicates
-            broadcast_values2 = pandas.DataFrame(broadcast_value_dict)
-            broadcast_values2 = broadcast_values2.reset_index(drop=True)
-            # Index may contain duplicates
-            new_index1 = broadcast_values1.sort_values(
-                by=by,
-                axis=axis,
+            result = self._query_compiler.sort_rows_by_column_values(
+                by,
                 ascending=ascending,
                 kind=kind,
                 na_position=na_position,
-            ).index
-            # Index without duplicates
-            new_index2 = broadcast_values2.sort_values(
-                by=by,
-                axis=axis,
-                ascending=ascending,
-                kind=kind,
-                na_position=na_position,
-            ).index
-            if inplace:
-                self.reindex(index=new_index2, copy=False)
-                self.index = new_index1
-            else:
-                result = self.reset_index(drop=True)
-                result = result.reindex(index=new_index2, copy=True)
-                result.index = new_index1
-                return result
-        else:
-            broadcast_value_list = [
-                self[row :: len(self.index)]._to_pandas() for row in by
-            ]
-            index_builder = list(zip(broadcast_value_list, by))
-            broadcast_values = pandas.concat(
-                [row for row, idx in index_builder], copy=False
+                ignore_index=ignore_index,
             )
-            broadcast_values.columns = self.columns
-            new_columns = broadcast_values.sort_values(
-                by=by,
-                axis=axis,
+        else:
+            result = self._query_compiler.sort_columns_by_row_values(
+                by,
                 ascending=ascending,
                 kind=kind,
                 na_position=na_position,
-            ).columns
-            return self.reindex(columns=new_columns, copy=not inplace)
+                ignore_index=ignore_index,
+            )
+        return self._create_or_update_from_compiler(result, inplace)
 
     def std(
         self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs

diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py
@@ -6020,10 +6020,27 @@ def test_sort_values_with_duplicates(self):
         pandas_df = pandas.DataFrame({"col": [2, 1, 1]}, index=[1, 1, 0])
 
         key = modin_df.columns[0]
-        modin_result = modin_df.sort_values(key, inplace=False,)
-        pandas_result = pandas_df.sort_values(key, inplace=False,)
+        modin_result = modin_df.sort_values(key, inplace=False)
+        pandas_result = pandas_df.sort_values(key, inplace=False)
         df_equals(modin_result, pandas_result)
 
+        modin_df.sort_values(key, inplace=True)
+        pandas_df.sort_values(key, inplace=True)
+        df_equals(modin_df, pandas_df)
+
+    def test_sort_values_with_string_index(self):
+        """Check `sort_values` against pandas on a frame with a string index.
+
+        Covers both the copying (``inplace=False``) and the in-place
+        (``inplace=True``) variants of the call.
+        """
+        frame_data = {"col": [25, 17, 1]}
+        labels = ["ccc", "bbb", "aaa"]
+        modin_df = pd.DataFrame(frame_data, index=labels)
+        pandas_df = pandas.DataFrame(frame_data, index=labels)
+
+        sort_by = modin_df.columns[0]
+
+        # Out-of-place sort: compare the returned frames.
+        df_equals(
+            modin_df.sort_values(sort_by, inplace=False),
+            pandas_df.sort_values(sort_by, inplace=False),
+        )
+
+        # In-place sort: compare the mutated frames themselves.
+        modin_df.sort_values(sort_by, inplace=True)
+        pandas_df.sort_values(sort_by, inplace=True)
+        df_equals(modin_df, pandas_df)
+
     def test_where(self):
         frame_data = random_state.randn(100, 10)
         pandas_df = pandas.DataFrame(frame_data, columns=list("abcdefghij"))