Skip to content

Commit

Permalink
Move logic of sort_values into the query compiler (modin-project#1754)
Browse files Browse the repository at this point in the history
* Move logic of `sort_values` into the query compiler

Signed-off-by: Devin Petersohn <[email protected]>

* Remove dead code

Signed-off-by: Devin Petersohn <[email protected]>

* Add back `"kind"` parameter

Signed-off-by: Devin Petersohn <[email protected]>

* Added a test for bug modin-project#1743

This test runs an in-place sort of a DataFrame that has a non-numerical index.

Signed-off-by: Gregory Shimansky <[email protected]>

* Apply suggestions from code review

Co-authored-by: anmyachev <[email protected]>

Co-authored-by: Gregory Shimansky <[email protected]>
Co-authored-by: anmyachev <[email protected]>
  • Loading branch information
3 people authored and aregm committed Sep 16, 2020
1 parent 8b46126 commit e647d26
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 47 deletions.
77 changes: 77 additions & 0 deletions modin/backends/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1874,3 +1874,80 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, item):
item_to_distribute=broadcasted_items,
)
return self.__constructor__(new_modin_frame)

def sort_rows_by_column_values(self, columns, ascending=True, **kwargs):
    """Reorder the rows based on the lexicographic order of the given columns.

    Parameters
    ----------
    columns : scalar or list of scalar
        The column or columns to sort by
    ascending : bool
        Sort in ascending order (True) or descending order (False)
    **kwargs : dict
        Optional sort settings. The keys read here are ``na_position``
        (default ``"last"``), ``kind`` (default ``"quicksort"``) and
        ``ignore_index`` (default ``False``). When ``ignore_index`` is true
        the result is labeled ``0 .. n-1`` instead of carrying the original
        row labels in sorted order.

    Returns
    -------
    PandasQueryCompiler
        A new query compiler that contains result of the sort
    """
    na_position = kwargs.get("na_position", "last")
    kind = kwargs.get("kind", "quicksort")
    ignore_index = kwargs.get("ignore_index", False)
    if not is_list_like(columns):
        columns = [columns]
    # Currently, sort_values will just reindex based on the sorted values.
    # TODO create a more efficient way to sort
    ErrorMessage.default_to_pandas("sort_values")
    # Materialize only the key columns as pandas Series.
    broadcast_value_dict = {
        col: self.getitem_column_array([col]).to_pandas().squeeze(axis=1)
        for col in columns
    }
    sort_kwargs = {
        "by": columns,
        "axis": 0,
        "ascending": ascending,
        "kind": kind,
        "na_position": na_position,
    }
    # Sorted copy of the original labels (these may contain duplicates, so
    # they cannot be used to reindex the frame).
    sorted_labels = (
        pandas.DataFrame(broadcast_value_dict, index=self.index)
        .sort_values(**sort_kwargs)
        .index
    )
    # Same sort over unique positional labels; this is what drives the
    # actual row reordering.
    sorted_positions = (
        pandas.DataFrame(broadcast_value_dict)
        .reset_index(drop=True)
        .sort_values(**sort_kwargs)
        .index
    )

    result = self.reset_index(drop=True).reindex(0, sorted_positions)
    if ignore_index:
        # After the reindex the labels are the permuted positions; the caller
        # asked for a fresh positional index, so drop them.
        result = result.reset_index(drop=True)
    else:
        result.index = sorted_labels
    return result

def sort_columns_by_row_values(self, rows, ascending=True, **kwargs):
    """Reorder the columns based on the lexicographic order of the given rows.

    Parameters
    ----------
    rows : scalar or list of scalar
        The row or rows to sort by
    ascending : bool
        Sort in ascending order (True) or descending order (False)
    **kwargs : dict
        Optional sort settings. The keys read here are ``na_position``
        (default ``"last"``) and ``kind`` (default ``"quicksort"``).

    Returns
    -------
    PandasQueryCompiler
        A new query compiler that contains result of the sort
    """
    # NOTE(review): an ``ignore_index`` entry in kwargs is not read here;
    # confirm whether the column sort is expected to honor it.
    na_position = kwargs.get("na_position", "last")
    kind = kwargs.get("kind", "quicksort")
    if not is_list_like(rows):
        rows = [rows]
    ErrorMessage.default_to_pandas("sort_values")
    # Materialize only the key rows and stack them into one pandas frame.
    broadcast_values = pandas.concat(
        [self.getitem_row_array([row]).to_pandas() for row in rows], copy=False
    )
    broadcast_values.columns = self.columns
    new_columns = broadcast_values.sort_values(
        by=rows, axis=1, ascending=ascending, kind=kind, na_position=na_position,
    ).columns
    return self.reindex(1, new_columns)
55 changes: 10 additions & 45 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2791,59 +2791,24 @@ def sort_values(
A sorted DataFrame.
"""
axis = self._get_axis_number(axis)
if not is_list_like(by):
by = [by]
# Currently, sort_values will just reindex based on the sorted values.
# TODO create a more efficient way to sort
ErrorMessage.default_to_pandas("sort_values")
inplace = validate_bool_kwarg(inplace, "inplace")
if axis == 0:
broadcast_value_dict = {col: self[col]._to_pandas() for col in by}
# Index may contain duplicates
broadcast_values1 = pandas.DataFrame(broadcast_value_dict, index=self.index)
# Index without duplicates
broadcast_values2 = pandas.DataFrame(broadcast_value_dict)
broadcast_values2 = broadcast_values2.reset_index(drop=True)
# Index may contain duplicates
new_index1 = broadcast_values1.sort_values(
by=by,
axis=axis,
result = self._query_compiler.sort_rows_by_column_values(
by,
ascending=ascending,
kind=kind,
na_position=na_position,
).index
# Index without duplicates
new_index2 = broadcast_values2.sort_values(
by=by,
axis=axis,
ascending=ascending,
kind=kind,
na_position=na_position,
).index
if inplace:
self.reindex(index=new_index2, copy=False)
self.index = new_index1
else:
result = self.reset_index(drop=True)
result = result.reindex(index=new_index2, copy=True)
result.index = new_index1
return result
else:
broadcast_value_list = [
self[row :: len(self.index)]._to_pandas() for row in by
]
index_builder = list(zip(broadcast_value_list, by))
broadcast_values = pandas.concat(
[row for row, idx in index_builder], copy=False
ignore_index=ignore_index,
)
broadcast_values.columns = self.columns
new_columns = broadcast_values.sort_values(
by=by,
axis=axis,
else:
result = self._query_compiler.sort_columns_by_row_values(
by,
ascending=ascending,
kind=kind,
na_position=na_position,
).columns
return self.reindex(columns=new_columns, copy=not inplace)
ignore_index=ignore_index,
)
return self._create_or_update_from_compiler(result, inplace)

def std(
self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
Expand Down
21 changes: 19 additions & 2 deletions modin/pandas/test/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6020,10 +6020,27 @@ def test_sort_values_with_duplicates(self):
pandas_df = pandas.DataFrame({"col": [2, 1, 1]}, index=[1, 1, 0])

key = modin_df.columns[0]
modin_result = modin_df.sort_values(key, inplace=False,)
pandas_result = pandas_df.sort_values(key, inplace=False,)
modin_result = modin_df.sort_values(key, inplace=False)
pandas_result = pandas_df.sort_values(key, inplace=False)
df_equals(modin_result, pandas_result)

modin_df.sort_values(key, inplace=True)
pandas_df.sort_values(key, inplace=True)
df_equals(modin_df, pandas_df)

def test_sort_values_with_string_index(self):
    """Check sort_values, both copying and in place, on a frame with string row labels."""
    # Build the same frame with each library (modin first, then pandas).
    modin_df, pandas_df = (
        lib.DataFrame({"col": [25, 17, 1]}, index=["ccc", "bbb", "aaa"])
        for lib in (pd, pandas)
    )
    sort_key = modin_df.columns[0]

    # Out-of-place sort must give matching results.
    df_equals(
        modin_df.sort_values(sort_key, inplace=False),
        pandas_df.sort_values(sort_key, inplace=False),
    )

    # In-place sort must leave both frames in the same state.
    for frame in (modin_df, pandas_df):
        frame.sort_values(sort_key, inplace=True)
    df_equals(modin_df, pandas_df)

def test_where(self):
frame_data = random_state.randn(100, 10)
pandas_df = pandas.DataFrame(frame_data, columns=list("abcdefghij"))
Expand Down

0 comments on commit e647d26

Please sign in to comment.