Skip to content

Commit

Permalink
Make groupby transform-like op order match original data order (#8720)
Browse files Browse the repository at this point in the history
Closes #8714 

This PR makes transform-like ops return results with orders matching that of inputs. For example: `groupby.shift`

```python
In [21]: df.head(8)
Out[21]:
   key  val1
0    1    70
1    1    86
2    0    18
3    1    91
4    1    74
5    1    97
6    0    43
7    0    48

In [22]: df.groupby('key').shift(1).head(8)
Out[22]:
   val1
0  <NA>
1    70
2  <NA>
3    86
4    91
5    74
6    18
7    43
```

This would affect `groupby.scan` and `groupby.shift`.

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)
  - Ashwin Srinath (https://github.com/shwina)

URL: #8720
  • Loading branch information
isVoid authored Aug 4, 2021
1 parent ec99bed commit fdf47af
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 28 deletions.
18 changes: 10 additions & 8 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,10 @@ def agg(self, func):
else:
raise

if libgroupby._is_all_scan_aggregate(normalized_aggs):
# Scan aggregations return rows in original index order
return self._mimic_pandas_order(result)

# set index names to be group key names
if len(result):
result.index.names = self.grouping.names
Expand Down Expand Up @@ -981,22 +985,20 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
if not axis == 0:
raise NotImplementedError("Only axis=0 is supported.")

value_column_names = [
x for x in self.obj._column_names if x not in self.grouping.names
]
num_columns_to_shift = len(value_column_names)
value_columns = self.grouping.values
if is_list_like(fill_value):
if not len(fill_value) == num_columns_to_shift:
if not len(fill_value) == len(value_columns._data):
raise ValueError(
"Mismatched number of columns and values to fill."
)
else:
fill_value = [fill_value] * num_columns_to_shift
fill_value = [fill_value] * len(value_columns._data)

value_columns = self.obj._data.select_by_label(value_column_names)
return self.obj.__class__._from_data(
result = self.obj.__class__._from_data(
*self._groupby.shift(Table(value_columns), periods, fill_value)
)
result = self._mimic_pandas_order(result)
return result._copy_type_metadata(value_columns)

def _mimic_pandas_order(
self, result: DataFrameOrSeries
Expand Down
20 changes: 0 additions & 20 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1702,7 +1702,6 @@ def test_groupby_2keys_scan(nelem, func):
# pd.groupby.cumcount returns a series.
if isinstance(expect_df, pd.Series):
expect_df = expect_df.to_frame("val")
expect_df = expect_df.set_index([pdf["x"], pdf["y"]]).sort_index()

check_dtype = False if func in _index_type_aggs else True
assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype)
Expand Down Expand Up @@ -1734,10 +1733,6 @@ def test_groupby_shift_row(nelem, shift_perc, direction, fill_value):
)
got = gdf.groupby(["x", "y"]).shift(periods=n_shift, fill_value=fill_value)

# Pandas returns shifted column in original row order. We set its index
# to be the key columns, so that `assert_groupby_results_equal` can sort
# rows by key columns to make sure cudf and pandas results matches.
expected.index = pd.MultiIndex.from_frame(gdf[["x", "y"]].to_pandas())
assert_groupby_results_equal(
expected[["val", "val2"]], got[["val", "val2"]]
)
Expand Down Expand Up @@ -1776,10 +1771,6 @@ def test_groupby_shift_row_mixed_numerics(
expected = pdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value)
got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value)

# Pandas returns shifted column in original row order. We set its index
# to be the key columns, so that `assert_groupby_results_equal` can sort
# rows by key columns to make sure cudf and pandas results matches.
expected.index = gdf["0"].to_pandas()
assert_groupby_results_equal(
expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]]
)
Expand Down Expand Up @@ -1817,10 +1808,6 @@ def test_groupby_shift_row_mixed(nelem, shift_perc, direction):
expected = pdf.groupby(["0"]).shift(periods=n_shift)
got = gdf.groupby(["0"]).shift(periods=n_shift)

# Pandas returns shifted column in original row order. We set its index
# to be the key columns, so that `assert_groupby_results_equal` can sort
# rows by key columns to make sure cudf and pandas results matches.
expected.index = gdf["0"].to_pandas()
assert_groupby_results_equal(
expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]]
)
Expand Down Expand Up @@ -1880,10 +1867,6 @@ def test_groupby_shift_row_mixed_fill(

got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value)

# Pandas returns shifted column in original row order. We set its index
# to be the key columns, so that `assert_groupby_results_equal` can sort
# rows by key columns to make sure cudf and pandas results matches.
expected.index = gdf["0"].to_pandas()
assert_groupby_results_equal(
expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]]
)
Expand Down Expand Up @@ -1916,9 +1899,6 @@ def test_groupby_shift_row_zero_shift(nelem, fill_value):
expected = gdf
got = gdf.groupby(["0"]).shift(periods=0, fill_value=fill_value)

# Here, the result should be the same as input due to 0-shift, only the
# key orders are different.
expected = expected.set_index("0")
assert_groupby_results_equal(
expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]]
)
Expand Down

0 comments on commit fdf47af

Please sign in to comment.