From b647c1a2765adab10c906f14ce821e1d769c964f Mon Sep 17 00:00:00 2001
From: Sheilah <kirui.sheilah@gmail.com>
Date: Wed, 2 Mar 2022 10:49:15 -0800
Subject: [PATCH 1/8] create new pr

---
 python/cudf/cudf/core/groupby/groupby.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index b76f5dcc261..97fc3c5cc68 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1807,3 +1807,7 @@ def _is_multi_agg(aggs):
     if is_list_like(aggs):
         return True
     return False
+
+
+def _cov_or_corr():
+    pass

From eff451d80cbfabba17efb2890339df2e41cab870 Mon Sep 17 00:00:00 2001
From: Sheilah <kirui.sheilah@gmail.com>
Date: Wed, 2 Mar 2022 15:18:21 -0800
Subject: [PATCH 2/8] created _cov_or_corr() method

---
 python/cudf/cudf/core/groupby/groupby.py | 96 +++++++++++++++++++++++-
 1 file changed, 92 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 97fc3c5cc68..8255a8bd162 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1155,6 +1155,98 @@ def combine_columns(gb_cov, ys):
 
         return res
 
+    def _cov_or_corr(self, min_periods=0, ddof=1):
+        """
+        internal function that calls to either corr() or cov()
+        for sort groupby computations
+
+        """
+        # create expanded dataframe consisting all combinations of the
+        # struct columns-pairs used in the covariance calculation
+        # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2'))
+        column_names = self.grouping.values._column_names
+        num_cols = len(column_names)
+
+        column_pair_structs = {}
+        for x, y in itertools.combinations_with_replacement(column_names, 2):
+            # The number of output columns is the number of input columns
+            # squared. We directly call the struct column factory here to
+            # reduce overhead and avoid copying data. Since libcudf groupby
+            # maintains a cache of aggregation requests, reusing the same
+            # column also makes use of previously cached column means and
+            # reduces kernel costs.
+
+            # checks if input column names are string, raise a warning if
+            # not so and cast them to strings
+            if not (isinstance(x, str) and isinstance(y, str)):
+                warnings.warn(
+                    "DataFrame contains non-string column name(s). "
+                    "Struct columns require field names to be strings. "
+                    "Non-string column names will be cast to strings "
+                    "in the result's field names."
+                )
+                x, y = str(x), str(y)
+
+            column_pair_structs[(x, y)] = cudf.core.column.build_struct_column(
+                names=(x, y),
+                children=(self.obj._data[x], self.obj._data[y]),
+                size=len(self.obj),
+            )
+
+        column_pair_groupby = cudf.DataFrame._from_data(
+            column_pair_structs
+        ).groupby(by=self.grouping.keys)
+
+        try:
+            gb_cov = column_pair_groupby.agg(
+                lambda x: x.cov(min_periods, ddof)
+            )
+        except RuntimeError as e:
+            if "Unsupported groupby reduction type-agg combination" in str(e):
+                raise TypeError(
+                    "Covariance accepts only numerical column-pairs"
+                )
+            raise
+
+        # ensure that column-pair labels are arranged in ascending order
+        cols_list = [
+            (y, x) if i > j else (x, y)
+            for j, y in enumerate(column_names)
+            for i, x in enumerate(column_names)
+        ]
+        cols_split = [
+            cols_list[i : i + num_cols]
+            for i in range(0, len(cols_list), num_cols)
+        ]
+
+        def combine_columns(gb_cov, ys):
+            list_of_columns = [gb_cov._data[y] for y in ys]
+            frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys)
+            return interleave_columns(frame)
+
+        # interleave: combine the correlation results for each column-pair
+        # into a single column
+        res = cudf.DataFrame._from_data(
+            {
+                x: combine_columns(gb_cov, ys)
+                for ys, x in zip(cols_split, column_names)
+            }
+        )
+
+        # create a multiindex for the groupby correlated dataframe,
+        # to match pandas behavior
+        unsorted_idx = gb_cov.index.repeat(num_cols)
+        idx_sort_order = unsorted_idx._get_sorted_inds()
+        sorted_idx = unsorted_idx._gather(idx_sort_order)
+        if len(gb_cov):
+            # TO-DO: Should the operation below be done on the CPU instead?
+            sorted_idx._data[None] = as_column(
+                np.tile(column_names, len(gb_cov.index))
+            )
+        res.index = MultiIndex._from_data(sorted_idx._data)
+
+        return res
+
     def var(self, ddof=1):
         """Compute the column-wise variance of the values in each group.
 
@@ -1807,7 +1899,3 @@ def _is_multi_agg(aggs):
     if is_list_like(aggs):
         return True
     return False
-
-
-def _cov_or_corr():
-    pass

From 9e013a6c7901b8937f40ddd6d7581bc57f81d6be Mon Sep 17 00:00:00 2001
From: Sheilah <kirui.sheilah@gmail.com>
Date: Wed, 9 Mar 2022 16:08:15 -0800
Subject: [PATCH 3/8] added func- check to assign correct aggregation

---
 python/cudf/cudf/core/groupby/groupby.py | 171 +++--------------------
 1 file changed, 19 insertions(+), 152 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index e37a848a10f..d26278611a7 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -942,63 +942,12 @@ def corr(self, method="pearson", min_periods=1):
                 "Only pearson correlation is currently supported"
             )
 
-        # create expanded dataframe consisting all combinations of the
-        # struct columns-pairs to be correlated
-        # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2'))
-        _cols = self.grouping.values._data.to_pandas_index().tolist()
-        len_cols = len(_cols)
-
-        new_df_data = {}
-        for x, y in itertools.combinations_with_replacement(_cols, 2):
-            new_df_data[(x, y)] = cudf.DataFrame._from_data(
-                {"x": self.obj._data[x], "y": self.obj._data[y]}
-            ).to_struct()
-        new_gb = cudf.DataFrame._from_data(new_df_data).groupby(
-            by=self.grouping.keys
-        )
-
-        try:
-            gb_corr = new_gb.agg(lambda x: x.corr(method, min_periods))
-        except RuntimeError as e:
-            if "Unsupported groupby reduction type-agg combination" in str(e):
-                raise TypeError(
-                    "Correlation accepts only numerical column-pairs"
-                )
-            raise
-
-        # ensure that column-pair labels are arranged in ascending order
-        cols_list = [
-            (y, x) if i > j else (x, y)
-            for j, y in enumerate(_cols)
-            for i, x in enumerate(_cols)
-        ]
-        cols_split = [
-            cols_list[i : i + len_cols]
-            for i in range(0, len(cols_list), len_cols)
-        ]
-
-        # interleave: combine the correlation results for each column-pair
-        # into a single column
-        res = cudf.DataFrame._from_data(
-            {
-                x: gb_corr.loc[:, i].interleave_columns()
-                for i, x in zip(cols_split, _cols)
-            }
-        )
-
-        # create a multiindex for the groupby correlated dataframe,
-        # to match pandas behavior
-        unsorted_idx = gb_corr.index.repeat(len_cols)
-        idx_sort_order = unsorted_idx._get_sorted_inds()
-        sorted_idx = unsorted_idx._gather(idx_sort_order)
-        if len(gb_corr):
-            # TO-DO: Should the operation below be done on the CPU instead?
-            sorted_idx._data[None] = as_column(
-                cudf.Series(_cols).tile(len(gb_corr.index))
+        def func(column_pair_groupby):
+            return column_pair_groupby.agg(
+                lambda x: x.corr(method, min_periods)
             )
-        res.index = MultiIndex._from_data(sorted_idx._data)
 
-        return res
+        return self._cov_or_corr(func, "Correlation")
 
     def cov(self, min_periods=0, ddof=1):
         """
@@ -1085,96 +1034,16 @@ def cov(self, min_periods=0, ddof=1):
            val3  3.833333  12.333333  12.333333
         """
 
-        # create expanded dataframe consisting all combinations of the
-        # struct columns-pairs used in the covariance calculation
-        # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2'))
-        column_names = self.grouping.values._column_names
-        num_cols = len(column_names)
-
-        column_pair_structs = {}
-        for x, y in itertools.combinations_with_replacement(column_names, 2):
-            # The number of output columns is the number of input columns
-            # squared. We directly call the struct column factory here to
-            # reduce overhead and avoid copying data. Since libcudf groupby
-            # maintains a cache of aggregation requests, reusing the same
-            # column also makes use of previously cached column means and
-            # reduces kernel costs.
-
-            # checks if input column names are string, raise a warning if
-            # not so and cast them to strings
-            if not (isinstance(x, str) and isinstance(y, str)):
-                warnings.warn(
-                    "DataFrame contains non-string column name(s). "
-                    "Struct columns require field names to be strings. "
-                    "Non-string column names will be cast to strings "
-                    "in the result's field names."
-                )
-                x, y = str(x), str(y)
-
-            column_pair_structs[(x, y)] = cudf.core.column.build_struct_column(
-                names=(x, y),
-                children=(self.obj._data[x], self.obj._data[y]),
-                size=len(self.obj),
-            )
-
-        column_pair_groupby = cudf.DataFrame._from_data(
-            column_pair_structs
-        ).groupby(by=self.grouping.keys)
-
-        try:
-            gb_cov = column_pair_groupby.agg(
-                lambda x: x.cov(min_periods, ddof)
-            )
-        except RuntimeError as e:
-            if "Unsupported groupby reduction type-agg combination" in str(e):
-                raise TypeError(
-                    "Covariance accepts only numerical column-pairs"
-                )
-            raise
-
-        # ensure that column-pair labels are arranged in ascending order
-        cols_list = [
-            (y, x) if i > j else (x, y)
-            for j, y in enumerate(column_names)
-            for i, x in enumerate(column_names)
-        ]
-        cols_split = [
-            cols_list[i : i + num_cols]
-            for i in range(0, len(cols_list), num_cols)
-        ]
-
-        def combine_columns(gb_cov, ys):
-            list_of_columns = [gb_cov._data[y] for y in ys]
-            frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys)
-            return interleave_columns(frame)
+        def func(column_pair_groupby):
+            return column_pair_groupby.agg(lambda x: x.cov(min_periods, ddof))
 
-        # interleave: combine the correlation results for each column-pair
-        # into a single column
-        res = cudf.DataFrame._from_data(
-            {
-                x: combine_columns(gb_cov, ys)
-                for ys, x in zip(cols_split, column_names)
-            }
-        )
+        return self._cov_or_corr(func, "Covariance")
 
-        # create a multiindex for the groupby correlated dataframe,
-        # to match pandas behavior
-        unsorted_idx = gb_cov.index.repeat(num_cols)
-        idx_sort_order = unsorted_idx._get_sorted_inds()
-        sorted_idx = unsorted_idx._gather(idx_sort_order)
-        if len(gb_cov):
-            # TO-DO: Should the operation below be done on the CPU instead?
-            sorted_idx._data[None] = as_column(
-                np.tile(column_names, len(gb_cov.index))
-            )
-        res.index = MultiIndex._from_data(sorted_idx._data)
-
-        return res
-
-    def _cov_or_corr(self, min_periods=0, ddof=1):
+    def _cov_or_corr(self, func, method_name):
         """
-        internal function that calls to either corr() or cov()
-        for sort groupby computations
+        internal function that is called by either corr() or cov()
+        for sort groupby correlation and covariance computations,
+        respectively.
 
         """
         # create expanded dataframe consisting all combinations of the
@@ -1214,13 +1083,11 @@ def _cov_or_corr(self, min_periods=0, ddof=1):
         ).groupby(by=self.grouping.keys)
 
         try:
-            gb_cov = column_pair_groupby.agg(
-                lambda x: x.cov(min_periods, ddof)
-            )
+            gb_cov_corr = func(column_pair_groupby)
         except RuntimeError as e:
             if "Unsupported groupby reduction type-agg combination" in str(e):
                 raise TypeError(
-                    "Covariance accepts only numerical column-pairs"
+                    f"{method_name} accepts only numerical column-pairs"
                 )
             raise
 
@@ -1235,8 +1102,8 @@ def _cov_or_corr(self, min_periods=0, ddof=1):
             for i in range(0, len(cols_list), num_cols)
         ]
 
-        def combine_columns(gb_cov, ys):
-            list_of_columns = [gb_cov._data[y] for y in ys]
+        def combine_columns(gb_cov_corr, ys):
+            list_of_columns = [gb_cov_corr._data[y] for y in ys]
             frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys)
             return interleave_columns(frame)
 
@@ -1244,20 +1111,20 @@ def combine_columns(gb_cov, ys):
         # into a single column
         res = cudf.DataFrame._from_data(
             {
-                x: combine_columns(gb_cov, ys)
+                x: combine_columns(gb_cov_corr, ys)
                 for ys, x in zip(cols_split, column_names)
             }
         )
 
         # create a multiindex for the groupby correlated dataframe,
         # to match pandas behavior
-        unsorted_idx = gb_cov.index.repeat(num_cols)
+        unsorted_idx = gb_cov_corr.index.repeat(num_cols)
         idx_sort_order = unsorted_idx._get_sorted_inds()
         sorted_idx = unsorted_idx._gather(idx_sort_order)
-        if len(gb_cov):
+        if len(gb_cov_corr):
             # TO-DO: Should the operation below be done on the CPU instead?
             sorted_idx._data[None] = as_column(
-                np.tile(column_names, len(gb_cov.index))
+                np.tile(column_names, len(gb_cov_corr.index))
             )
         res.index = MultiIndex._from_data(sorted_idx._data)
 

From b7325a1adb14e20f7ae4a6efc4c1ec464c123cf3 Mon Sep 17 00:00:00 2001
From: Sheilah <kirui.sheilah@gmail.com>
Date: Wed, 9 Mar 2022 16:14:26 -0800
Subject: [PATCH 4/8] minor comment-fixes

---
 python/cudf/cudf/core/groupby/groupby.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index d26278611a7..7ea8b698a6b 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1047,7 +1047,7 @@ def _cov_or_corr(self, func, method_name):
 
         """
         # create expanded dataframe consisting all combinations of the
-        # struct columns-pairs used in the covariance calculation
+        # struct columns-pairs to be used in the correlation or covariance
         # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2'))
         column_names = self.grouping.values._column_names
         num_cols = len(column_names)
@@ -1107,8 +1107,8 @@ def combine_columns(gb_cov_corr, ys):
             frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys)
             return interleave_columns(frame)
 
-        # interleave: combine the correlation results for each column-pair
-        # into a single column
+        # interleave: combine the correlation or covariance results for each
+        # column-pair into a single column
         res = cudf.DataFrame._from_data(
             {
                 x: combine_columns(gb_cov_corr, ys)
@@ -1116,8 +1116,8 @@ def combine_columns(gb_cov_corr, ys):
             }
         )
 
-        # create a multiindex for the groupby correlated dataframe,
-        # to match pandas behavior
+        # create a multiindex for the groupby covariance or correlation
+        # dataframe, to match pandas behavior
         unsorted_idx = gb_cov_corr.index.repeat(num_cols)
         idx_sort_order = unsorted_idx._get_sorted_inds()
         sorted_idx = unsorted_idx._gather(idx_sort_order)

From 73127ed479ddd6d0a6be81a531c76f076b4974f7 Mon Sep 17 00:00:00 2001
From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com>
Date: Fri, 11 Mar 2022 11:53:46 -0800
Subject: [PATCH 5/8] Update python/cudf/cudf/core/groupby/groupby.py

Co-authored-by: Michael Wang <isVoid@users.noreply.github.com>
---
 python/cudf/cudf/core/groupby/groupby.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 7ea8b698a6b..22d771bdfc0 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -947,7 +947,7 @@ def func(column_pair_groupby):
                 lambda x: x.corr(method, min_periods)
             )
 
-        return self._cov_or_corr(func, "Correlation")
+        return self._cov_or_corr(lambda x: x.corr(method, min_periods), "Correlation")
 
     def cov(self, min_periods=0, ddof=1):
         """

From cf24593380133744581ecbf4dbf175a2baee79de Mon Sep 17 00:00:00 2001
From: Sheilah <kirui.sheilah@gmail.com>
Date: Fri, 11 Mar 2022 12:44:29 -0800
Subject: [PATCH 6/8] addressed michael's review

---
 python/cudf/cudf/core/groupby/groupby.py | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 22d771bdfc0..faba4e7a4e9 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -942,12 +942,9 @@ def corr(self, method="pearson", min_periods=1):
                 "Only pearson correlation is currently supported"
             )
 
-        def func(column_pair_groupby):
-            return column_pair_groupby.agg(
-                lambda x: x.corr(method, min_periods)
-            )
-
-        return self._cov_or_corr(lambda x: x.corr(method, min_periods), "Correlation")
+        return self._cov_or_corr(
+            lambda x: x.corr(method, min_periods), "Correlation"
+        )
 
     def cov(self, min_periods=0, ddof=1):
         """
@@ -1034,10 +1031,9 @@ def cov(self, min_periods=0, ddof=1):
            val3  3.833333  12.333333  12.333333
         """
 
-        def func(column_pair_groupby):
-            return column_pair_groupby.agg(lambda x: x.cov(min_periods, ddof))
-
-        return self._cov_or_corr(func, "Covariance")
+        return self._cov_or_corr(
+            lambda x: x.cov(min_periods, ddof), "Covariance"
+        )
 
     def _cov_or_corr(self, func, method_name):
         """
@@ -1083,7 +1079,7 @@ def _cov_or_corr(self, func, method_name):
         ).groupby(by=self.grouping.keys)
 
         try:
-            gb_cov_corr = func(column_pair_groupby)
+            gb_cov_corr = column_pair_groupby.agg(func)
         except RuntimeError as e:
             if "Unsupported groupby reduction type-agg combination" in str(e):
                 raise TypeError(

From 289ba26bee1985c5e6d037f1f63744eadccf3496 Mon Sep 17 00:00:00 2001
From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com>
Date: Fri, 11 Mar 2022 14:16:21 -0800
Subject: [PATCH 7/8] Update python/cudf/cudf/core/groupby/groupby.py

Co-authored-by: Michael Wang <isVoid@users.noreply.github.com>
---
 python/cudf/cudf/core/groupby/groupby.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index faba4e7a4e9..5f3953b8d79 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1040,7 +1040,6 @@ def _cov_or_corr(self, func, method_name):
         internal function that is called by either corr() or cov()
         for sort groupby correlation and covariance computations,
         respectively.
-
         """
         # create expanded dataframe consisting all combinations of the
         # struct columns-pairs to be used in the correlation or covariance

From 6dfb4a5b58afc10aa20dd404a8ef2e6117affdae Mon Sep 17 00:00:00 2001
From: Sheilah <kirui.sheilah@gmail.com>
Date: Fri, 11 Mar 2022 14:30:31 -0800
Subject: [PATCH 8/8] .

---
 python/cudf/cudf/core/groupby/groupby.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 5f3953b8d79..a1a4596ba45 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1102,7 +1102,7 @@ def combine_columns(gb_cov_corr, ys):
             frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys)
             return interleave_columns(frame)
 
-        # interleave: combine the correlation or covariance results for each
+        # interleave: combines the correlation or covariance results for each
         # column-pair into a single column
         res = cudf.DataFrame._from_data(
             {