From 77df03a3a8e5e4e0395afff7fa3dc5c891b7ceff Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 12 Jul 2021 16:43:54 -0700 Subject: [PATCH 1/2] Initial, groupby shift --- python/cudf/cudf/core/groupby/groupby.py | 18 +++++++++--------- python/cudf/cudf/tests/test_groupby.py | 19 ------------------- 2 files changed, 9 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 1641f821a38..d39189914be 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -938,21 +938,21 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): if not axis == 0: raise NotImplementedError("Only axis=0 is supported.") - value_column_names = [ - x for x in self.obj._column_names if x not in self.grouping.names - ] - num_columns_to_shift = len(value_column_names) + value_columns = self.grouping.values if is_list_like(fill_value): - if not len(fill_value) == num_columns_to_shift: + if not len(fill_value) == len(value_columns._data): raise ValueError( "Mismatched number of columns and values to fill." ) else: - fill_value = [fill_value] * num_columns_to_shift + fill_value = [fill_value] * len(value_columns._data) - value_columns = self.obj._data.select_by_label(value_column_names) - result = self._groupby.shift(Table(value_columns), periods, fill_value) - return self.obj.__class__._from_table(result) + result = self._groupby.shift( + Table(value_columns._data), periods, fill_value + ) + result = self.obj.__class__._from_table(result) + result = self._mimic_pandas_order(result) + return result._copy_type_metadata(value_columns) def _mimic_pandas_order( self, result: DataFrameOrSeries diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index e423a64fe4d..e5287ccc2e5 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1734,10 +1734,6 @@ def test_groupby_shift_row(nelem, shift_perc, direction, fill_value): ) got = gdf.groupby(["x", "y"]).shift(periods=n_shift, fill_value=fill_value) - # Pandas returns shifted column in original row order. We set its index - # to be the key columns, so that `assert_groupby_results_equal` can sort - # rows by key columns to make sure cudf and pandas results matches. - expected.index = pd.MultiIndex.from_frame(gdf[["x", "y"]].to_pandas()) assert_groupby_results_equal( expected[["val", "val2"]], got[["val", "val2"]] ) @@ -1776,10 +1772,6 @@ def test_groupby_shift_row_mixed_numerics( expected = pdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) - # Pandas returns shifted column in original row order. We set its index - # to be the key columns, so that `assert_groupby_results_equal` can sort - # rows by key columns to make sure cudf and pandas results matches. - expected.index = gdf["0"].to_pandas() assert_groupby_results_equal( expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] ) @@ -1817,10 +1809,6 @@ def test_groupby_shift_row_mixed(nelem, shift_perc, direction): expected = pdf.groupby(["0"]).shift(periods=n_shift) got = gdf.groupby(["0"]).shift(periods=n_shift) - # Pandas returns shifted column in original row order. We set its index - # to be the key columns, so that `assert_groupby_results_equal` can sort - # rows by key columns to make sure cudf and pandas results matches. - expected.index = gdf["0"].to_pandas() assert_groupby_results_equal( expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] ) @@ -1880,10 +1868,6 @@ def test_groupby_shift_row_mixed_fill( got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) - # Pandas returns shifted column in original row order. We set its index - # to be the key columns, so that `assert_groupby_results_equal` can sort - # rows by key columns to make sure cudf and pandas results matches. - expected.index = gdf["0"].to_pandas() assert_groupby_results_equal( expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] ) @@ -1916,9 +1900,6 @@ def test_groupby_shift_row_zero_shift(nelem, fill_value): expected = gdf got = gdf.groupby(["0"]).shift(periods=0, fill_value=fill_value) - # Here, the result should be the same as input due to 0-shift, only the - # key orders are different. - expected = expected.set_index("0") assert_groupby_results_equal( expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] ) From 45940e7750a2bab360dea5216023bd0b4261edcd Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 13 Jul 2021 18:30:33 -0700 Subject: [PATCH 2/2] scan_agg ordering --- python/cudf/cudf/core/groupby/groupby.py | 4 ++++ python/cudf/cudf/tests/test_groupby.py | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index d39189914be..8477e5bde39 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -220,6 +220,10 @@ def agg(self, func): else: raise + if libgroupby._is_all_scan_aggregate(normalized_aggs): + # Scan aggregations return rows in original index order + return self._mimic_pandas_order(result) + # set index names to be group key names if len(result): result.index.names = self.grouping.names diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index e5287ccc2e5..de7d8e35bce 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1702,7 +1702,6 @@ def test_groupby_2keys_scan(nelem, func): # pd.groupby.cumcount returns a series. if isinstance(expect_df, pd.Series): expect_df = expect_df.to_frame("val") - expect_df = expect_df.set_index([pdf["x"], pdf["y"]]).sort_index() check_dtype = False if func in _index_type_aggs else True assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype)