Make groupby transform-like op order match original data order (#8720)

Closes #8714 This PR makes transform-like ops return results with orders matching that of inputs. For example: `groupby.shift` ```python In [21]: df.head(8) Out[21]: key val1 0 1 70 1 1 86 2 0 18 3 1 91 4 1 74 5 1 97 6 0 43 7 0 48 In [22]: df.groupby('key').shift(1).head(8) Out[22]: val1 0 <NA> 1 70 2 <NA> 3 86 4 91 5 74 6 18 7 43 ``` This would affect `groupby.scan` and `groupby.shift`. Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Charles Blackmon-Luca (https://github.com/charlesbluca) - Ashwin Srinath (https://github.com/shwina) URL: #8720
rapidsai · Aug 4, 2021 · fdf47af · fdf47af
1 parent ec99bed
commit fdf47af
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 28 deletions.
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
@@ -220,6 +220,10 @@ def agg(self, func):
                     else:
                         raise
 
+        if libgroupby._is_all_scan_aggregate(normalized_aggs):
+            # Scan aggregations return rows in original index order
+            return self._mimic_pandas_order(result)
+
         # set index names to be group key names
         if len(result):
             result.index.names = self.grouping.names
@@ -981,22 +985,20 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         if not axis == 0:
             raise NotImplementedError("Only axis=0 is supported.")
 
-        value_column_names = [
-            x for x in self.obj._column_names if x not in self.grouping.names
-        ]
-        num_columns_to_shift = len(value_column_names)
+        value_columns = self.grouping.values
         if is_list_like(fill_value):
-            if not len(fill_value) == num_columns_to_shift:
+            if not len(fill_value) == len(value_columns._data):
                 raise ValueError(
                     "Mismatched number of columns and values to fill."
                 )
         else:
-            fill_value = [fill_value] * num_columns_to_shift
+            fill_value = [fill_value] * len(value_columns._data)
 
-        value_columns = self.obj._data.select_by_label(value_column_names)
-        return self.obj.__class__._from_data(
+        result = self.obj.__class__._from_data(
             *self._groupby.shift(Table(value_columns), periods, fill_value)
         )
+        result = self._mimic_pandas_order(result)
+        return result._copy_type_metadata(value_columns)
 
     def _mimic_pandas_order(
         self, result: DataFrameOrSeries

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
@@ -1702,7 +1702,6 @@ def test_groupby_2keys_scan(nelem, func):
     # pd.groupby.cumcount returns a series.
     if isinstance(expect_df, pd.Series):
         expect_df = expect_df.to_frame("val")
-    expect_df = expect_df.set_index([pdf["x"], pdf["y"]]).sort_index()
 
     check_dtype = False if func in _index_type_aggs else True
     assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype)
@@ -1734,10 +1733,6 @@ def test_groupby_shift_row(nelem, shift_perc, direction, fill_value):
     )
     got = gdf.groupby(["x", "y"]).shift(periods=n_shift, fill_value=fill_value)
 
-    # Pandas returns shifted column in original row order. We set its index
-    # to be the key columns, so that `assert_groupby_results_equal` can sort
-    # rows by key columns to make sure cudf and pandas results matches.
-    expected.index = pd.MultiIndex.from_frame(gdf[["x", "y"]].to_pandas())
     assert_groupby_results_equal(
         expected[["val", "val2"]], got[["val", "val2"]]
     )
@@ -1776,10 +1771,6 @@ def test_groupby_shift_row_mixed_numerics(
     expected = pdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value)
     got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value)
 
-    # Pandas returns shifted column in original row order. We set its index
-    # to be the key columns, so that `assert_groupby_results_equal` can sort
-    # rows by key columns to make sure cudf and pandas results matches.
-    expected.index = gdf["0"].to_pandas()
     assert_groupby_results_equal(
         expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]]
     )
@@ -1817,10 +1808,6 @@ def test_groupby_shift_row_mixed(nelem, shift_perc, direction):
     expected = pdf.groupby(["0"]).shift(periods=n_shift)
     got = gdf.groupby(["0"]).shift(periods=n_shift)
 
-    # Pandas returns shifted column in original row order. We set its index
-    # to be the key columns, so that `assert_groupby_results_equal` can sort
-    # rows by key columns to make sure cudf and pandas results matches.
-    expected.index = gdf["0"].to_pandas()
     assert_groupby_results_equal(
         expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]]
     )
@@ -1880,10 +1867,6 @@ def test_groupby_shift_row_mixed_fill(
 
     got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value)
 
-    # Pandas returns shifted column in original row order. We set its index
-    # to be the key columns, so that `assert_groupby_results_equal` can sort
-    # rows by key columns to make sure cudf and pandas results matches.
-    expected.index = gdf["0"].to_pandas()
     assert_groupby_results_equal(
         expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]]
     )
@@ -1916,9 +1899,6 @@ def test_groupby_shift_row_zero_shift(nelem, fill_value):
     expected = gdf
     got = gdf.groupby(["0"]).shift(periods=0, fill_value=fill_value)
 
-    # Here, the result should be the same as input due to 0-shift, only the
-    # key orders are different.
-    expected = expected.set_index("0")
     assert_groupby_results_equal(
         expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]]
     )