From b647c1a2765adab10c906f14ce821e1d769c964f Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 2 Mar 2022 10:49:15 -0800 Subject: [PATCH 1/8] create new pr --- python/cudf/cudf/core/groupby/groupby.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b76f5dcc261..97fc3c5cc68 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1807,3 +1807,7 @@ def _is_multi_agg(aggs): if is_list_like(aggs): return True return False + + +def _cov_or_corr(): + pass From eff451d80cbfabba17efb2890339df2e41cab870 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 2 Mar 2022 15:18:21 -0800 Subject: [PATCH 2/8] created _cov_or_corr() method --- python/cudf/cudf/core/groupby/groupby.py | 96 +++++++++++++++++++++++- 1 file changed, 92 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 97fc3c5cc68..8255a8bd162 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1155,6 +1155,98 @@ def combine_columns(gb_cov, ys): return res + def _cov_or_corr(self, min_periods=0, ddof=1): + """ + internal function that calls to either corr() or cov() + for sort groupby computations + + """ + # create expanded dataframe consisting all combinations of the + # struct columns-pairs used in the covariance calculation + # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) + column_names = self.grouping.values._column_names + num_cols = len(column_names) + + column_pair_structs = {} + for x, y in itertools.combinations_with_replacement(column_names, 2): + # The number of output columns is the number of input columns + # squared. We directly call the struct column factory here to + # reduce overhead and avoid copying data. Since libcudf groupby + # maintains a cache of aggregation requests, reusing the same + # column also makes use of previously cached column means and + # reduces kernel costs. + + # checks if input column names are string, raise a warning if + # not so and cast them to strings + if not (isinstance(x, str) and isinstance(y, str)): + warnings.warn( + "DataFrame contains non-string column name(s). " + "Struct columns require field names to be strings. " + "Non-string column names will be cast to strings " + "in the result's field names." + ) + x, y = str(x), str(y) + + column_pair_structs[(x, y)] = cudf.core.column.build_struct_column( + names=(x, y), + children=(self.obj._data[x], self.obj._data[y]), + size=len(self.obj), + ) + + column_pair_groupby = cudf.DataFrame._from_data( + column_pair_structs + ).groupby(by=self.grouping.keys) + + try: + gb_cov = column_pair_groupby.agg( + lambda x: x.cov(min_periods, ddof) + ) + except RuntimeError as e: + if "Unsupported groupby reduction type-agg combination" in str(e): + raise TypeError( + "Covariance accepts only numerical column-pairs" + ) + raise + + # ensure that column-pair labels are arranged in ascending order + cols_list = [ + (y, x) if i > j else (x, y) + for j, y in enumerate(column_names) + for i, x in enumerate(column_names) + ] + cols_split = [ + cols_list[i : i + num_cols] + for i in range(0, len(cols_list), num_cols) + ] + + def combine_columns(gb_cov, ys): + list_of_columns = [gb_cov._data[y] for y in ys] + frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys) + return interleave_columns(frame) + + # interleave: combine the correlation results for each column-pair + # into a single column + res = cudf.DataFrame._from_data( + { + x: combine_columns(gb_cov, ys) + for ys, x in zip(cols_split, column_names) + } + ) + + # create a multiindex for the groupby correlated dataframe, + # to match pandas behavior + unsorted_idx = gb_cov.index.repeat(num_cols) + idx_sort_order = unsorted_idx._get_sorted_inds() + sorted_idx = unsorted_idx._gather(idx_sort_order) + if len(gb_cov): + # TO-DO: Should the operation below be done on the CPU instead? + sorted_idx._data[None] = as_column( + np.tile(column_names, len(gb_cov.index)) + ) + res.index = MultiIndex._from_data(sorted_idx._data) + + return res + def var(self, ddof=1): """Compute the column-wise variance of the values in each group. @@ -1807,7 +1899,3 @@ def _is_multi_agg(aggs): if is_list_like(aggs): return True return False - - -def _cov_or_corr(): - pass From 9e013a6c7901b8937f40ddd6d7581bc57f81d6be Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 9 Mar 2022 16:08:15 -0800 Subject: [PATCH 3/8] added func- check to assign correct aggregation --- python/cudf/cudf/core/groupby/groupby.py | 171 +++-------------------- 1 file changed, 19 insertions(+), 152 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e37a848a10f..d26278611a7 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -942,63 +942,12 @@ def corr(self, method="pearson", min_periods=1): "Only pearson correlation is currently supported" ) - # create expanded dataframe consisting all combinations of the - # struct columns-pairs to be correlated - # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) - _cols = self.grouping.values._data.to_pandas_index().tolist() - len_cols = len(_cols) - - new_df_data = {} - for x, y in itertools.combinations_with_replacement(_cols, 2): - new_df_data[(x, y)] = cudf.DataFrame._from_data( - {"x": self.obj._data[x], "y": self.obj._data[y]} - ).to_struct() - new_gb = cudf.DataFrame._from_data(new_df_data).groupby( - by=self.grouping.keys - ) - - try: - gb_corr = new_gb.agg(lambda x: x.corr(method, min_periods)) - except RuntimeError as e: - if "Unsupported groupby reduction type-agg combination" in str(e): - raise TypeError( - "Correlation accepts only numerical column-pairs" - ) - raise - - # ensure that column-pair labels are arranged in ascending order - cols_list = [ - (y, x) if i > j else (x, y) - for j, y in enumerate(_cols) - for i, x in enumerate(_cols) - ] - cols_split = [ - cols_list[i : i + len_cols] - for i in range(0, len(cols_list), len_cols) - ] - - # interleave: combine the correlation results for each column-pair - # into a single column - res = cudf.DataFrame._from_data( - { - x: gb_corr.loc[:, i].interleave_columns() - for i, x in zip(cols_split, _cols) - } - ) - - # create a multiindex for the groupby correlated dataframe, - # to match pandas behavior - unsorted_idx = gb_corr.index.repeat(len_cols) - idx_sort_order = unsorted_idx._get_sorted_inds() - sorted_idx = unsorted_idx._gather(idx_sort_order) - if len(gb_corr): - # TO-DO: Should the operation below be done on the CPU instead? - sorted_idx._data[None] = as_column( - cudf.Series(_cols).tile(len(gb_corr.index)) + def func(column_pair_groupby): + return column_pair_groupby.agg( + lambda x: x.corr(method, min_periods) ) - res.index = MultiIndex._from_data(sorted_idx._data) - return res + return self._cov_or_corr(func, "Correlation") def cov(self, min_periods=0, ddof=1): """ @@ -1085,96 +1034,16 @@ def cov(self, min_periods=0, ddof=1): val3 3.833333 12.333333 12.333333 """ - # create expanded dataframe consisting all combinations of the - # struct columns-pairs used in the covariance calculation - # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) - column_names = self.grouping.values._column_names - num_cols = len(column_names) - - column_pair_structs = {} - for x, y in itertools.combinations_with_replacement(column_names, 2): - # The number of output columns is the number of input columns - # squared. We directly call the struct column factory here to - # reduce overhead and avoid copying data. Since libcudf groupby - # maintains a cache of aggregation requests, reusing the same - # column also makes use of previously cached column means and - # reduces kernel costs. - - # checks if input column names are string, raise a warning if - # not so and cast them to strings - if not (isinstance(x, str) and isinstance(y, str)): - warnings.warn( - "DataFrame contains non-string column name(s). " - "Struct columns require field names to be strings. " - "Non-string column names will be cast to strings " - "in the result's field names." - ) - x, y = str(x), str(y) - - column_pair_structs[(x, y)] = cudf.core.column.build_struct_column( - names=(x, y), - children=(self.obj._data[x], self.obj._data[y]), - size=len(self.obj), - ) - - column_pair_groupby = cudf.DataFrame._from_data( - column_pair_structs - ).groupby(by=self.grouping.keys) - - try: - gb_cov = column_pair_groupby.agg( - lambda x: x.cov(min_periods, ddof) - ) - except RuntimeError as e: - if "Unsupported groupby reduction type-agg combination" in str(e): - raise TypeError( - "Covariance accepts only numerical column-pairs" - ) - raise - - # ensure that column-pair labels are arranged in ascending order - cols_list = [ - (y, x) if i > j else (x, y) - for j, y in enumerate(column_names) - for i, x in enumerate(column_names) - ] - cols_split = [ - cols_list[i : i + num_cols] - for i in range(0, len(cols_list), num_cols) - ] - - def combine_columns(gb_cov, ys): - list_of_columns = [gb_cov._data[y] for y in ys] - frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys) - return interleave_columns(frame) + def func(column_pair_groupby): + return column_pair_groupby.agg(lambda x: x.cov(min_periods, ddof)) - # interleave: combine the correlation results for each column-pair - # into a single column - res = cudf.DataFrame._from_data( - { - x: combine_columns(gb_cov, ys) - for ys, x in zip(cols_split, column_names) - } - ) + return self._cov_or_corr(func, "Covariance") - # create a multiindex for the groupby correlated dataframe, - # to match pandas behavior - unsorted_idx = gb_cov.index.repeat(num_cols) - idx_sort_order = unsorted_idx._get_sorted_inds() - sorted_idx = unsorted_idx._gather(idx_sort_order) - if len(gb_cov): - # TO-DO: Should the operation below be done on the CPU instead? - sorted_idx._data[None] = as_column( - np.tile(column_names, len(gb_cov.index)) - ) - res.index = MultiIndex._from_data(sorted_idx._data) - - return res - - def _cov_or_corr(self, min_periods=0, ddof=1): + def _cov_or_corr(self, func, method_name): """ - internal function that calls to either corr() or cov() - for sort groupby computations + internal function that is called by either corr() or cov() + for sort groupby correlation and covariance computations, + respectively. """ # create expanded dataframe consisting all combinations of the @@ -1214,13 +1083,11 @@ def _cov_or_corr(self, min_periods=0, ddof=1): ).groupby(by=self.grouping.keys) try: - gb_cov = column_pair_groupby.agg( - lambda x: x.cov(min_periods, ddof) - ) + gb_cov_corr = func(column_pair_groupby) except RuntimeError as e: if "Unsupported groupby reduction type-agg combination" in str(e): raise TypeError( - "Covariance accepts only numerical column-pairs" + f"{method_name} accepts only numerical column-pairs" ) raise @@ -1235,8 +1102,8 @@ def _cov_or_corr(self, min_periods=0, ddof=1): for i in range(0, len(cols_list), num_cols) ] - def combine_columns(gb_cov, ys): - list_of_columns = [gb_cov._data[y] for y in ys] + def combine_columns(gb_cov_corr, ys): + list_of_columns = [gb_cov_corr._data[y] for y in ys] frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys) return interleave_columns(frame) @@ -1244,20 +1111,20 @@ def combine_columns(gb_cov, ys): # into a single column res = cudf.DataFrame._from_data( { - x: combine_columns(gb_cov, ys) + x: combine_columns(gb_cov_corr, ys) for ys, x in zip(cols_split, column_names) } ) # create a multiindex for the groupby correlated dataframe, # to match pandas behavior - unsorted_idx = gb_cov.index.repeat(num_cols) + unsorted_idx = gb_cov_corr.index.repeat(num_cols) idx_sort_order = unsorted_idx._get_sorted_inds() sorted_idx = unsorted_idx._gather(idx_sort_order) - if len(gb_cov): + if len(gb_cov_corr): # TO-DO: Should the operation below be done on the CPU instead? sorted_idx._data[None] = as_column( - np.tile(column_names, len(gb_cov.index)) + np.tile(column_names, len(gb_cov_corr.index)) ) res.index = MultiIndex._from_data(sorted_idx._data) From b7325a1adb14e20f7ae4a6efc4c1ec464c123cf3 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 9 Mar 2022 16:14:26 -0800 Subject: [PATCH 4/8] minor comment-fixes --- python/cudf/cudf/core/groupby/groupby.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index d26278611a7..7ea8b698a6b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1047,7 +1047,7 @@ def _cov_or_corr(self, func, method_name): """ # create expanded dataframe consisting all combinations of the - # struct columns-pairs used in the covariance calculation + # struct columns-pairs to be used in the correlation or covariance # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) column_names = self.grouping.values._column_names num_cols = len(column_names) @@ -1107,8 +1107,8 @@ def combine_columns(gb_cov_corr, ys): frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys) return interleave_columns(frame) - # interleave: combine the correlation results for each column-pair - # into a single column + # interleave: combine the correlation or covariance results for each + # column-pair into a single column res = cudf.DataFrame._from_data( { x: combine_columns(gb_cov_corr, ys) @@ -1116,8 +1116,8 @@ def combine_columns(gb_cov_corr, ys): } ) - # create a multiindex for the groupby correlated dataframe, - # to match pandas behavior + # create a multiindex for the groupby covariance or correlation + # dataframe, to match pandas behavior unsorted_idx = gb_cov_corr.index.repeat(num_cols) idx_sort_order = unsorted_idx._get_sorted_inds() sorted_idx = unsorted_idx._gather(idx_sort_order) From 73127ed479ddd6d0a6be81a531c76f076b4974f7 Mon Sep 17 00:00:00 2001 From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com> Date: Fri, 11 Mar 2022 11:53:46 -0800 Subject: [PATCH 5/8] Update python/cudf/cudf/core/groupby/groupby.py Co-authored-by: Michael Wang --- python/cudf/cudf/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 7ea8b698a6b..22d771bdfc0 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -947,7 +947,7 @@ def func(column_pair_groupby): lambda x: x.corr(method, min_periods) ) - return self._cov_or_corr(func, "Correlation") + return self._cov_or_corr(lambda x: x.corr(method, min_periods), "Correlation") def cov(self, min_periods=0, ddof=1): """ From cf24593380133744581ecbf4dbf175a2baee79de Mon Sep 17 00:00:00 2001 From: Sheilah Date: Fri, 11 Mar 2022 12:44:29 -0800 Subject: [PATCH 6/8] addressed michael's review --- python/cudf/cudf/core/groupby/groupby.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 22d771bdfc0..faba4e7a4e9 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -942,12 +942,9 @@ def corr(self, method="pearson", min_periods=1): "Only pearson correlation is currently supported" ) - def func(column_pair_groupby): - return column_pair_groupby.agg( - lambda x: x.corr(method, min_periods) - ) - - return self._cov_or_corr(lambda x: x.corr(method, min_periods), "Correlation") + return self._cov_or_corr( + lambda x: x.corr(method, min_periods), "Correlation" + ) def cov(self, min_periods=0, ddof=1): """ @@ -1034,10 +1031,9 @@ def cov(self, min_periods=0, ddof=1): val3 3.833333 12.333333 12.333333 """ - def func(column_pair_groupby): - return column_pair_groupby.agg(lambda x: x.cov(min_periods, ddof)) - - return self._cov_or_corr(func, "Covariance") + return self._cov_or_corr( + lambda x: x.cov(min_periods, ddof), "Covariance" + ) def _cov_or_corr(self, func, method_name): """ @@ -1083,7 +1079,7 @@ def _cov_or_corr(self, func, method_name): ).groupby(by=self.grouping.keys) try: - gb_cov_corr = func(column_pair_groupby) + gb_cov_corr = column_pair_groupby.agg(func) except RuntimeError as e: if "Unsupported groupby reduction type-agg combination" in str(e): raise TypeError( From 289ba26bee1985c5e6d037f1f63744eadccf3496 Mon Sep 17 00:00:00 2001 From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com> Date: Fri, 11 Mar 2022 14:16:21 -0800 Subject: [PATCH 7/8] Update python/cudf/cudf/core/groupby/groupby.py Co-authored-by: Michael Wang --- python/cudf/cudf/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index faba4e7a4e9..5f3953b8d79 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1040,7 +1040,6 @@ def _cov_or_corr(self, func, method_name): internal function that is called by either corr() or cov() for sort groupby correlation and covariance computations, respectively. - """ # create expanded dataframe consisting all combinations of the # struct columns-pairs to be used in the correlation or covariance From 6dfb4a5b58afc10aa20dd404a8ef2e6117affdae Mon Sep 17 00:00:00 2001 From: Sheilah Date: Fri, 11 Mar 2022 14:30:31 -0800 Subject: [PATCH 8/8] . --- python/cudf/cudf/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 5f3953b8d79..a1a4596ba45 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1102,7 +1102,7 @@ def combine_columns(gb_cov_corr, ys): frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys) return interleave_columns(frame) - # interleave: combine the correlation or covariance results for each + # interleave: combines the correlation or covariance results for each # column-pair into a single column res = cudf.DataFrame._from_data( {