From 26bafd02b7f37a3c20eeaa1fdc8d59e13015a57f Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 24 Mar 2021 12:29:57 -0700
Subject: [PATCH 01/49] Don't identify decimals as strings.

---
 python/cudf/cudf/utils/dtypes.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 8875a36dba8..67b54ef33f0 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -154,7 +154,11 @@ def is_numerical_dtype(obj):
 
 
 def is_string_dtype(obj):
-    return pd.api.types.is_string_dtype(obj) and not is_categorical_dtype(obj)
+    return (
+        not is_decimal_dtype(obj)
+        and pd.api.types.is_string_dtype(obj)
+        and not is_categorical_dtype(obj)
+    )
 
 
 def is_datetime_dtype(obj):

From babcdfcfa9598eb4b20d458eb0d48a3ee035eedb Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 24 Mar 2021 17:10:53 -0700
Subject: [PATCH 02/49] Reject all extension types as string types.

---
 python/cudf/cudf/utils/dtypes.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 67b54ef33f0..8af225ecb58 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -155,9 +155,13 @@ def is_numerical_dtype(obj):
 
 def is_string_dtype(obj):
     return (
-        not is_decimal_dtype(obj)
-        and pd.api.types.is_string_dtype(obj)
+        pd.api.types.is_string_dtype(obj)
+        # Reject all cudf extension types.
         and not is_categorical_dtype(obj)
+        and not is_decimal_dtype(obj)
+        and not is_list_dtype(obj)
+        and not is_struct_dtype(obj)
+        and not is_interval_dtype(obj)
     )
 
 

From 2b0061146238557005aa4bc2f6496dc1038fa7c3 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 24 Mar 2021 18:33:34 -0700
Subject: [PATCH 03/49] Create separate lists for extension type methods.

---
 python/cudf/cudf/_lib/groupby.pyx | 65 ++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 0f5cdc73d3b..b47430770ea 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -51,13 +51,52 @@ _STRING_AGGS = {
     "min",
     "nunique",
     "nth",
-    "collect"
+    "collect",
+    "variance"
 }
 
 _LIST_AGGS = {
     "collect"
 }
 
+# TODO: For now, the aggs for all of these extension types are the same as strings until we can find a way to fix that.
+_STRUCT_AGGS = {
+    "count",
+    "size",
+    "max",
+    "min",
+    "nunique",
+    "nth",
+    "collect"
+}
+
+_INTERVAL_AGGS = {
+    "count",
+    "size",
+    "max",
+    "min",
+    "nunique",
+    "nth",
+    "collect"
+}
+
+_DECIMAL_AGGS = {
+    "count",
+    "sum",
+    "idxmin",  # Silently fails.
+    "idxmax",  # Silently fails.
+    "min",
+    "max",
+    "mean",  # Errors internally in C++
+    "var",  # Silently fails.
+    "std",
+    "quantile",  # Errors internally in C++
+    "median",  # Errors internally in C++
+    "nunique",
+    "nth",
+    "collect"  # Errors internally in Python
+}
+
 cdef class GroupBy:
     cdef unique_ptr[libcudf_groupby.groupby] c_obj
     cdef dict __dict__
@@ -182,10 +221,14 @@ def _drop_unsupported_aggs(Table values, aggs):
     from cudf.utils.dtypes import (
         is_categorical_dtype,
         is_string_dtype,
-        is_list_dtype
+        is_list_dtype,
+        is_interval_dtype,
+        is_struct_dtype,
+        is_decimal_dtype,
     )
     result = aggs.copy()
 
+        # and not is_decimal_dtype(obj)
     for col_name in aggs:
         if (
             is_list_dtype(values._data[col_name].dtype)
@@ -205,6 +248,24 @@ def _drop_unsupported_aggs(Table values, aggs):
             for i, agg_name in enumerate(aggs[col_name]):
                 if Aggregation(agg_name).kind not in _CATEGORICAL_AGGS:
                     del result[col_name][i]
+        elif (
+                is_struct_dtype(values._data[col_name].dtype)
+        ):
+            for i, agg_name in enumerate(aggs[col_name]):
+                if Aggregation(agg_name).kind not in _STRUCT_AGGS:
+                    del result[col_name][i]
+        elif (
+                is_interval_dtype(values._data[col_name].dtype)
+        ):
+            for i, agg_name in enumerate(aggs[col_name]):
+                if Aggregation(agg_name).kind not in _INTERVAL_AGGS:
+                    del result[col_name][i]
+        elif (
+                is_decimal_dtype(values._data[col_name].dtype)
+        ):
+            for i, agg_name in enumerate(aggs[col_name]):
+                if Aggregation(agg_name).kind not in _DECIMAL_AGGS:
+                    del result[col_name][i]
 
     if all(len(v) == 0 for v in result.values()):
         raise DataError("No numeric types to aggregate")

From 1ebde51ccc666777d7eb3b71ead6ff0ff372296d Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 09:48:42 -0700
Subject: [PATCH 04/49] Enable collect for decimals.

---
 python/cudf/cudf/_lib/groupby.pyx | 2 +-
 python/cudf/cudf/utils/dtypes.py  | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index b47430770ea..739ab19993a 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -94,7 +94,7 @@ _DECIMAL_AGGS = {
     "median",  # Errors internally in C++
     "nunique",
     "nth",
-    "collect"  # Errors internally in Python
+    "collect"
 }
 
 cdef class GroupBy:
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 8af225ecb58..e1d9b71daf0 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -294,9 +294,8 @@ def cudf_dtype_to_pa_type(dtype):
     """
     if is_categorical_dtype(dtype):
         raise NotImplementedError()
-    elif is_list_dtype(dtype):
-        return dtype.to_arrow()
-    elif is_struct_dtype(dtype):
+    elif (is_list_dtype(dtype) or is_struct_dtype(dtype)
+          or is_decimal_dtype(dtype)):
         return dtype.to_arrow()
     else:
         return np_to_pa_dtype(np.dtype(dtype))

From 4c5d87604d6bbc09b1b737e8aaea47a270e0b102 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 10:23:54 -0700
Subject: [PATCH 05/49] Enable argmin and argmax.

---
 python/cudf/cudf/_lib/groupby.pyx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 739ab19993a..28e051d1234 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -83,13 +83,13 @@ _INTERVAL_AGGS = {
 _DECIMAL_AGGS = {
     "count",
     "sum",
-    "idxmin",  # Silently fails.
-    "idxmax",  # Silently fails.
+    "argmin",
+    "argmax",
     "min",
     "max",
     "mean",  # Errors internally in C++
-    "var",  # Silently fails.
-    "std",
+    "var",  # Column silently vanishes.
+    "std",  # Gives all nulls.
     "quantile",  # Errors internally in C++
     "median",  # Errors internally in C++
     "nunique",

From 4134e43f7531e30f404142e34653d938e6fe5b8f Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 10:26:58 -0700
Subject: [PATCH 06/49] Fix variance key name.

---
 python/cudf/cudf/_lib/groupby.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 28e051d1234..319259177d5 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -88,7 +88,7 @@ _DECIMAL_AGGS = {
     "min",
     "max",
     "mean",  # Errors internally in C++
-    "var",  # Column silently vanishes.
+    "variance",  # Gives all nulls.
     "std",  # Gives all nulls.
     "quantile",  # Errors internally in C++
     "median",  # Errors internally in C++

From 43cf580dda3a614356f4d53a319295e7254583e3 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 10:40:12 -0700
Subject: [PATCH 07/49] Move groupby aggregation list to groupby.py and clean
 up the assignment of aggregators.

---
 python/cudf/cudf/_lib/groupby.pyx        | 18 ------------
 python/cudf/cudf/core/groupby/groupby.py | 36 ++++++++++++++++++------
 2 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 319259177d5..5df01dbe6e3 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -20,24 +20,6 @@ cimport cudf._lib.cpp.groupby as libcudf_groupby
 cimport cudf._lib.cpp.aggregation as libcudf_aggregation
 
 
-_GROUPBY_AGGS = {
-    "count",
-    "size",
-    "sum",
-    "idxmin",
-    "idxmax",
-    "min",
-    "max",
-    "mean",
-    "var",
-    "std",
-    "quantile",
-    "median",
-    "nunique",
-    "nth",
-    "collect"
-}
-
 _CATEGORICAL_AGGS = {
     "count",
     "size",
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 86e1f5cfe30..bde946f2e1d 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -13,6 +13,8 @@
 from cudf.utils.utils import cached_property
 
 
+# Note that all valid aggregation methods (e.g. GroupBy.min) are bound to the
+# class after its definition (see below).
 class GroupBy(Serializable):
 
     _MAX_GROUPS_BEFORE_WARN = 100
@@ -58,14 +60,6 @@ def __init__(
         else:
             self.grouping = _Grouping(obj, by, level)
 
-    def __getattribute__(self, key):
-        try:
-            return super().__getattribute__(key)
-        except AttributeError:
-            if key in libgroupby._GROUPBY_AGGS:
-                return functools.partial(self._agg_func_name_with_args, key)
-            raise
-
     def __iter__(self):
         group_names, offsets, _, grouped_values = self._grouped()
         if isinstance(group_names, cudf.Index):
@@ -590,6 +584,32 @@ def rolling(self, *args, **kwargs):
         return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs)
 
 
+# Set of valid groupby aggregations.
+_VALID_GROUPBY_AGGS = {
+    "count",
+    "size",
+    "sum",
+    "idxmin",
+    "idxmax",
+    "min",
+    "max",
+    "mean",
+    "var",
+    "std",
+    "quantile",
+    "median",
+    "nunique",
+    "nth",
+    "collect"
+}
+
+
+# Dynamically bind the different aggregation methods.
+for key in _VALID_GROUPBY_AGGS:
+    setattr(GroupBy, key,
+            functools.partialmethod(GroupBy._agg_func_name_with_args, key))
+
+
 class DataFrameGroupBy(GroupBy):
     def __init__(
         self, obj, by=None, level=None, sort=False, as_index=True, dropna=True

From 474a179e8b64f80f8c1789cd3a67eb97028a58fb Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 12:21:41 -0700
Subject: [PATCH 08/49] Disable aggs that are overrides of actual methods.

---
 python/cudf/cudf/core/groupby/groupby.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index bde946f2e1d..fbab73c183b 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -587,7 +587,7 @@ def rolling(self, *args, **kwargs):
 # Set of valid groupby aggregations.
 _VALID_GROUPBY_AGGS = {
     "count",
-    "size",
+    # "size",  # This aggregation will never happen because GroupBy.Size exists
     "sum",
     "idxmin",
     "idxmax",
@@ -599,7 +599,7 @@ def rolling(self, *args, **kwargs):
     "quantile",
     "median",
     "nunique",
-    "nth",
+    # "nth",  # This aggregation will never happen because GroupBy.Size exists
     "collect"
 }
 

From 25e74ef08ec68a7064be4ce12530a949c579d419 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 12:34:55 -0700
Subject: [PATCH 09/49] Move more logic out of the GroupBy class.

---
 python/cudf/cudf/_lib/groupby.pyx        |  3 +++
 python/cudf/cudf/core/groupby/groupby.py | 34 ++++++++++++------------
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 5df01dbe6e3..94f9ff9ca6b 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -20,6 +20,9 @@ cimport cudf._lib.cpp.groupby as libcudf_groupby
 cimport cudf._lib.cpp.aggregation as libcudf_aggregation
 
 
+# The sets below define the possible aggregations that can be performed on
+# different dtypes. The uppercased versions of these strings correspond to
+# elements of the AggregationKind enum.
 _CATEGORICAL_AGGS = {
     "count",
     "size",
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index fbab73c183b..9cc6e098322 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -261,19 +261,6 @@ def _grouped(self):
         group_names = grouped_keys.unique()
         return (group_names, offsets, grouped_keys, grouped_values)
 
-    def _agg_func_name_with_args(self, func_name, *args, **kwargs):
-        """
-        Aggregate given an aggregate function name
-        and arguments to the function, e.g.,
-        `_agg_func_name_with_args("quantile", 0.5)`
-        """
-
-        def func(x):
-            return getattr(x, func_name)(*args, **kwargs)
-
-        func.__name__ = func_name
-        return self.agg(func)
-
     def _normalize_aggs(self, aggs):
         """
         Normalize aggs to a dict mapping column names
@@ -584,10 +571,10 @@ def rolling(self, *args, **kwargs):
         return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs)
 
 
-# Set of valid groupby aggregations.
+# Set of valid groupby aggregations that are monkey-patched into the GroupBy
+# namespace.
 _VALID_GROUPBY_AGGS = {
     "count",
-    # "size",  # This aggregation will never happen because GroupBy.Size exists
     "sum",
     "idxmin",
     "idxmax",
@@ -599,15 +586,28 @@ def rolling(self, *args, **kwargs):
     "quantile",
     "median",
     "nunique",
-    # "nth",  # This aggregation will never happen because GroupBy.Size exists
     "collect"
 }
 
 
 # Dynamically bind the different aggregation methods.
 for key in _VALID_GROUPBY_AGGS:
+    def _agg_func_name_with_args(self, func_name, *args, **kwargs):
+        """
+        Aggregate given an aggregate function name and arguments to the
+        function, e.g., `_agg_func_name_with_args("quantile", 0.5)`. The named
+        aggregations must be members of _AggregationFactory.
+        """
+
+        def func(x):
+            """Compute the {} of the group.""".format(func_name)
+            return getattr(x, func_name)(*args, **kwargs)
+
+        func.__name__ = func_name
+        return self.agg(func)
+
     setattr(GroupBy, key,
-            functools.partialmethod(GroupBy._agg_func_name_with_args, key))
+            functools.partialmethod(_agg_func_name_with_args, key))
 
 
 class DataFrameGroupBy(GroupBy):

From 8a4482736a0e368b842f872b450b3be4410c4d89 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 12:54:20 -0700
Subject: [PATCH 10/49] Simplify getattr usage.

---
 python/cudf/cudf/_lib/groupby.pyx        |  3 ++-
 python/cudf/cudf/core/groupby/groupby.py | 14 +++++---------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 94f9ff9ca6b..6ffc34ca345 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -44,7 +44,8 @@ _LIST_AGGS = {
     "collect"
 }
 
-# TODO: For now, the aggs for all of these extension types are the same as strings until we can find a way to fix that.
+# TODO: For now, the aggs for struct and interval types are the same as strings.
+# Need to evaluate what's actually reasonable.
 _STRUCT_AGGS = {
     "count",
     "size",
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 9cc6e098322..76274dcd545 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -705,15 +705,11 @@ def __init__(
             dropna=dropna,
         )
 
-    def __getattribute__(self, key):
-        try:
-            return super().__getattribute__(key)
-        except AttributeError:
-            if key in self.obj:
-                return self.obj[key].groupby(
-                    self.grouping, dropna=self._dropna, sort=self._sort
-                )
-            raise
+    def __getattr__(self, key):
+        if key in self.obj:
+            return self.obj[key].groupby(
+                self.grouping, dropna=self._dropna, sort=self._sort
+            )
 
     def __getitem__(self, key):
         return self.obj[key].groupby(

From 6b5c67fbbd8fde3f0e8120b2687f1ec521c0904d Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 13:47:13 -0700
Subject: [PATCH 11/49] Clearly documented unknown failures.

---
 python/cudf/cudf/_lib/groupby.pyx | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 6ffc34ca345..ccd734cef91 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -73,9 +73,20 @@ _DECIMAL_AGGS = {
     "argmax",
     "min",
     "max",
-    "mean",  # Errors internally in C++
+    # TODO: As far as I can tell, variance and std are using the reduction operations
+    # defined in reductions/compound.cuh. reductions/var.cu defines it as using the
+    # element_type_dispatcher, which checks validity using std::is_arithmetic. I assume
+    # that this is intentional, in which case var and std are not supported. However,
+    # instead of raising any errors, I just get columns of NULLs if I try to actually
+    # compute the variance of a GroupBy object containing decimals, so something else
+    # might be wrong as well.
     "variance",  # Gives all nulls.
     "std",  # Gives all nulls.
+    # TODO: Need to resolve the three below. They fail with a cudf error, but the error
+    # indicates that the column type is wrong (it wants a fixed point column but isn't
+    # receiving one). I suspect that a special-cased conversion for decimal types is
+    # missing somewhere.
+    "mean",  # Errors internally in C++
     "quantile",  # Errors internally in C++
     "median",  # Errors internally in C++
     "nunique",

From 8e45ad00d3f899bac149ff84c0d9199818c0dac5 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 13:47:48 -0700
Subject: [PATCH 12/49] Match other class groupbys to strings.

---
 python/cudf/cudf/_lib/groupby.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index ccd734cef91..fd8f0f61f54 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -54,6 +54,7 @@ _STRUCT_AGGS = {
     "nunique",
     "nth",
     "collect"
+    "variance"
 }
 
 _INTERVAL_AGGS = {
@@ -64,6 +65,7 @@ _INTERVAL_AGGS = {
     "nunique",
     "nth",
     "collect"
+    "variance"
 }
 
 _DECIMAL_AGGS = {

From 81ffe0af4538daf9ad0918fdfb1911273a900e1e Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 14:56:09 -0700
Subject: [PATCH 13/49] Fix style and remove unsupported operations.

---
 python/cudf/cudf/_lib/groupby.pyx | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index fd8f0f61f54..cc6706e667b 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -44,8 +44,8 @@ _LIST_AGGS = {
     "collect"
 }
 
-# TODO: For now, the aggs for struct and interval types are the same as strings.
-# Need to evaluate what's actually reasonable.
+# TODO: For now, the aggs for struct and interval types are the same as
+# strings.  Need to evaluate what's actually reasonable.
 _STRUCT_AGGS = {
     "count",
     "size",
@@ -75,22 +75,6 @@ _DECIMAL_AGGS = {
     "argmax",
     "min",
     "max",
-    # TODO: As far as I can tell, variance and std are using the reduction operations
-    # defined in reductions/compound.cuh. reductions/var.cu defines it as using the
-    # element_type_dispatcher, which checks validity using std::is_arithmetic. I assume
-    # that this is intentional, in which case var and std are not supported. However,
-    # instead of raising any errors, I just get columns of NULLs if I try to actually
-    # compute the variance of a GroupBy object containing decimals, so something else
-    # might be wrong as well.
-    "variance",  # Gives all nulls.
-    "std",  # Gives all nulls.
-    # TODO: Need to resolve the three below. They fail with a cudf error, but the error
-    # indicates that the column type is wrong (it wants a fixed point column but isn't
-    # receiving one). I suspect that a special-cased conversion for decimal types is
-    # missing somewhere.
-    "mean",  # Errors internally in C++
-    "quantile",  # Errors internally in C++
-    "median",  # Errors internally in C++
     "nunique",
     "nth",
     "collect"
@@ -227,7 +211,6 @@ def _drop_unsupported_aggs(Table values, aggs):
     )
     result = aggs.copy()
 
-        # and not is_decimal_dtype(obj)
     for col_name in aggs:
         if (
             is_list_dtype(values._data[col_name].dtype)

From 6d3fad376bf13bcf17be28a3df778af4ba97829d Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 15:00:41 -0700
Subject: [PATCH 14/49] Apply black reformattings.

---
 python/cudf/cudf/core/groupby/groupby.py | 8 +++++---
 python/cudf/cudf/utils/dtypes.py         | 7 +++++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 76274dcd545..3f520c60d4f 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -586,12 +586,13 @@ def rolling(self, *args, **kwargs):
     "quantile",
     "median",
     "nunique",
-    "collect"
+    "collect",
 }
 
 
 # Dynamically bind the different aggregation methods.
 for key in _VALID_GROUPBY_AGGS:
+
     def _agg_func_name_with_args(self, func_name, *args, **kwargs):
         """
         Aggregate given an aggregate function name and arguments to the
@@ -606,8 +607,9 @@ def func(x):
         func.__name__ = func_name
         return self.agg(func)
 
-    setattr(GroupBy, key,
-            functools.partialmethod(_agg_func_name_with_args, key))
+    setattr(
+        GroupBy, key, functools.partialmethod(_agg_func_name_with_args, key)
+    )
 
 
 class DataFrameGroupBy(GroupBy):
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index e1d9b71daf0..b5fecc5cd3c 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -294,8 +294,11 @@ def cudf_dtype_to_pa_type(dtype):
     """
     if is_categorical_dtype(dtype):
         raise NotImplementedError()
-    elif (is_list_dtype(dtype) or is_struct_dtype(dtype)
-          or is_decimal_dtype(dtype)):
+    elif (
+        is_list_dtype(dtype)
+        or is_struct_dtype(dtype)
+        or is_decimal_dtype(dtype)
+    ):
         return dtype.to_arrow()
     else:
         return np_to_pa_dtype(np.dtype(dtype))

From 714742d433a63f51f2485a3fbda9e356bbc075b0 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 15:09:26 -0700
Subject: [PATCH 15/49] Remove variance from obviously unsupported types.

---
 python/cudf/cudf/_lib/groupby.pyx | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index cc6706e667b..fe14a6ac4ac 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -37,7 +37,6 @@ _STRING_AGGS = {
     "nunique",
     "nth",
     "collect",
-    "variance"
 }
 
 _LIST_AGGS = {
@@ -54,7 +53,6 @@ _STRUCT_AGGS = {
     "nunique",
     "nth",
     "collect"
-    "variance"
 }
 
 _INTERVAL_AGGS = {
@@ -65,7 +63,6 @@ _INTERVAL_AGGS = {
     "nunique",
     "nth",
     "collect"
-    "variance"
 }
 
 _DECIMAL_AGGS = {

From ea4ed2e0b0e7cf768201817b9812044673586042 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 15:19:14 -0700
Subject: [PATCH 16/49] Defer getattr to getitem if possible.

---
 python/cudf/cudf/core/groupby/groupby.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 3f520c60d4f..b6260ebf643 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -708,10 +708,10 @@ def __init__(
         )
 
     def __getattr__(self, key):
-        if key in self.obj:
-            return self.obj[key].groupby(
-                self.grouping, dropna=self._dropna, sort=self._sort
-            )
+        try:
+            return self[key]
+        except KeyError:
+            raise AttributeError
 
     def __getitem__(self, key):
         return self.obj[key].groupby(

From 026bb4e16ce97cbd3bc97b854e65582cc9b24f5a Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 16:01:59 -0700
Subject: [PATCH 17/49] Make getattr safe for copying.

---
 python/cudf/cudf/core/groupby/groupby.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index b6260ebf643..34064193506 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -708,6 +708,11 @@ def __init__(
         )
 
     def __getattr__(self, key):
+        # Without this check, copying can trigger a RecursionError. See
+        # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html  #noqa: E501
+        # for an explanation.
+        if key == "obj":
+            raise AttributeError
         try:
             return self[key]
         except KeyError:

From 1259032e376710f40a81f702914fabb0c9ceeef6 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 16:07:48 -0700
Subject: [PATCH 18/49] Remove support for aggregating structs.

---
 python/cudf/cudf/_lib/groupby.pyx | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index fe14a6ac4ac..4f3d3599eef 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -43,26 +43,10 @@ _LIST_AGGS = {
     "collect"
 }
 
-# TODO: For now, the aggs for struct and interval types are the same as
-# strings.  Need to evaluate what's actually reasonable.
 _STRUCT_AGGS = {
-    "count",
-    "size",
-    "max",
-    "min",
-    "nunique",
-    "nth",
-    "collect"
 }
 
 _INTERVAL_AGGS = {
-    "count",
-    "size",
-    "max",
-    "min",
-    "nunique",
-    "nth",
-    "collect"
 }
 
 _DECIMAL_AGGS = {

From 6c61806fb22a96b3f283ec1a4b1ddb84a9ce72ca Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 16:10:45 -0700
Subject: [PATCH 19/49] Update documented list of groupby operations.

---
 docs/cudf/source/groupby.md | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/docs/cudf/source/groupby.md b/docs/cudf/source/groupby.md
index 7e96d4fe38c..a74e6285296 100644
--- a/docs/cudf/source/groupby.md
+++ b/docs/cudf/source/groupby.md
@@ -120,23 +120,23 @@ a
 
 The following table summarizes the available aggregations and the types that support them:
 
-| Aggregations\dtypes | Numeric  | Datetime | String   | Categorical | List | Struct |
-| ------------------- | -------- | -------  | -------- | ----------- | ---- | ------ |
-| count               | ✅       | ✅       | ✅       | ✅          |      |        |
-| size                | ✅       | ✅       | ✅       | ✅          |      |        |
-| sum                 | ✅       | ✅       |          |             |      |        |
-| idxmin              | ✅       | ✅       |          |             |      |        |
-| idxmax              | ✅       | ✅       |          |             |      |        |
-| min                 | ✅       | ✅       | ✅       |             |      |        |
-| max                 | ✅       | ✅       | ✅       |             |      |        |
-| mean                | ✅       | ✅       |          |             |      |        |
-| var                 | ✅       | ✅       |          |             |      |        |
-| std                 | ✅       | ✅       |          |             |      |        |
-| quantile            | ✅       | ✅       |          |             |      |        |
-| median              | ✅       | ✅       |          |             |      |        |
-| nunique             | ✅       | ✅       | ✅       | ✅          |      |        |
-| nth                 | ✅       | ✅       | ✅       |             |      |        |
-| collect             | ✅       | ✅       | ✅       |             | ✅   |        |
+| Aggregations\dtypes | Numeric  | Datetime | String   | Categorical | List | Struct | Interval | Decimal |
+| ------------------- | -------- | -------  | -------- | ----------- | ---- | ------ | -------- | ------- |
+| count               | ✅       | ✅       | ✅       | ✅          |      |        |          | ✅      |
+| size                | ✅       | ✅       | ✅       | ✅          |      |        |          | ✅      |
+| sum                 | ✅       | ✅       |          |             |      |        |          | ✅      |
+| idxmin              | ✅       | ✅       |          |             |      |        |          | ✅      |
+| idxmax              | ✅       | ✅       |          |             |      |        |          | ✅      |
+| min                 | ✅       | ✅       | ✅       |             |      |        |          | ✅      |
+| max                 | ✅       | ✅       | ✅       |             |      |        |          | ✅      |
+| mean                | ✅       | ✅       |          |             |      |        |          |         |
+| var                 | ✅       | ✅       |          |             |      |        |          |         |
+| std                 | ✅       | ✅       |          |             |      |        |          |         |
+| quantile            | ✅       | ✅       |          |             |      |        |          |         |
+| median              | ✅       | ✅       |          |             |      |        |          |         |
+| nunique             | ✅       | ✅       | ✅       | ✅          |      |        |          | ✅      |
+| nth                 | ✅       | ✅       | ✅       |             |      |        |          | ✅      |
+| collect             | ✅       | ✅       | ✅       |             | ✅   |        |          | ✅      |
 
 ## GroupBy apply
 

From a14d30f0c13d2849b9fa9fab5a4a64a7309fc185 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 25 Mar 2021 16:51:10 -0700
Subject: [PATCH 20/49] Move function out of loop.

---
 python/cudf/cudf/core/groupby/groupby.py | 26 ++++++++++++------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 34064193506..37819b30c45 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -591,22 +591,22 @@ def rolling(self, *args, **kwargs):
 
 
 # Dynamically bind the different aggregation methods.
-for key in _VALID_GROUPBY_AGGS:
+def _agg_func_name_with_args(self, func_name, *args, **kwargs):
+    """
+    Aggregate given an aggregate function name and arguments to the
+    function, e.g., `_agg_func_name_with_args("quantile", 0.5)`. The named
+    aggregations must be members of _AggregationFactory.
+    """
 
-    def _agg_func_name_with_args(self, func_name, *args, **kwargs):
-        """
-        Aggregate given an aggregate function name and arguments to the
-        function, e.g., `_agg_func_name_with_args("quantile", 0.5)`. The named
-        aggregations must be members of _AggregationFactory.
-        """
+    def func(x):
+        """Compute the {} of the group.""".format(func_name)
+        return getattr(x, func_name)(*args, **kwargs)
 
-        def func(x):
-            """Compute the {} of the group.""".format(func_name)
-            return getattr(x, func_name)(*args, **kwargs)
+    func.__name__ = func_name
+    return self.agg(func)
 
-        func.__name__ = func_name
-        return self.agg(func)
 
+for key in _VALID_GROUPBY_AGGS:
     setattr(
         GroupBy, key, functools.partialmethod(_agg_func_name_with_args, key)
     )
@@ -709,7 +709,7 @@ def __init__(
 
     def __getattr__(self, key):
         # Without this check, copying can trigger a RecursionError. See
-        # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html  #noqa: E501
+        # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html  # noqa: E501
         # for an explanation.
         if key == "obj":
             raise AttributeError

From 5c71bfee04bfcdd693abc424d6bf0aba9df05f77 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 26 Mar 2021 17:26:55 -0700
Subject: [PATCH 21/49] Remove redundant test, add test of decimal.

---
 python/cudf/cudf/tests/test_groupby.py | 64 ++++++++++++++------------
 1 file changed, 35 insertions(+), 29 deletions(-)

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index a96db59dee3..e53d1b4f2ec 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -7,7 +7,7 @@
 import pandas as pd
 import pytest
 from numba import cuda
-from numpy.testing import assert_array_equal
+from numpy.testing import assert_allclose, assert_array_equal
 
 import cudf
 from cudf.core import DataFrame, Series
@@ -148,26 +148,6 @@ def test_groupby_agg_min_max_dictlist(nelem):
     assert_eq(got_df, expect_df)
 
 
-@pytest.mark.parametrize("nelem", [2, 3, 100, 1000])
-@pytest.mark.parametrize(
-    "func", ["mean", "min", "max", "idxmin", "idxmax", "count", "sum"]
-)
-def test_groupby_2keys_agg(nelem, func):
-    # gdf (Note: lack of multiIndex)
-    expect_df = (
-        make_frame(pd.DataFrame, nelem=nelem)
-        .groupby(["x", "y"], sort=True)
-        .agg(func)
-    )
-    got_df = (
-        make_frame(DataFrame, nelem=nelem)
-        .groupby(["x", "y"], sort=True)
-        .agg(func)
-    )
-    check_dtype = False if func in _index_type_aggs else True
-    assert_eq(got_df, expect_df, check_dtype=check_dtype)
-
-
 @pytest.mark.parametrize("as_index", [True, False])
 def test_groupby_as_index_single_agg(pdf, gdf, as_index):
     gdf = gdf.groupby("y", as_index=as_index, sort=True).agg({"x": "mean"})
@@ -331,28 +311,54 @@ def emulate(df):
     assert_eq(expect, got)
 
 
-@pytest.mark.parametrize("nelem", [100, 500])
+@pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000])
 @pytest.mark.parametrize(
     "func",
     ["mean", "std", "var", "min", "max", "idxmin", "idxmax", "count", "sum"],
 )
-def test_groupby_cudf_2keys_agg(nelem, func):
-    got_df = (
-        make_frame(DataFrame, nelem=nelem)
+def test_groupby_2keys_agg(nelem, func):
+    # gdf (Note: lack of multiIndex)
+    expect_df = (
+        make_frame(pd.DataFrame, nelem=nelem)
         .groupby(["x", "y"], sort=True)
         .agg(func)
     )
-
-    # pandas
-    expect_df = (
-        make_frame(pd.DataFrame, nelem=nelem)
+    got_df = (
+        make_frame(DataFrame, nelem=nelem)
         .groupby(["x", "y"], sort=True)
         .agg(func)
     )
+
     check_dtype = False if func in _index_type_aggs else True
     assert_eq(got_df, expect_df, check_dtype=check_dtype)
 
 
+@pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000])
+@pytest.mark.parametrize(
+    "func",
+    ["min", "max", "idxmin", "idxmax", "count", "sum"],
+)
+def test_groupby_agg_decimal(nelem, func):
+    idx_col = np.arange(nelem)
+    x = (np.random.rand(nelem) * 100).round(2)
+    y = (np.random.rand(nelem) * 100).round(2)
+    pdf = pd.DataFrame({'idx': idx_col, 'x': x, 'y': y})
+    gdf = DataFrame({
+        'idx': idx_col,
+        'x': cudf.Series(x).astype(cudf.Decimal64Dtype(3, 2)),
+        'y': cudf.Series(y).astype(cudf.Decimal64Dtype(3, 2)),
+    })
+
+    expect_df = pdf.groupby('idx', sort=True).agg(func)
+    got_df = gdf.groupby('idx', sort=True).agg(func).astype(float)
+
+    check_dtype = False if func in _index_type_aggs else True
+    assert_allclose(got_df['x'].to_array(), expect_df['x'],
+                    atol=1e-2, rtol=1e-2)
+    assert_allclose(got_df['y'].to_array(), expect_df['y'],
+                    atol=1e-2, rtol=1e-2)
+
+
 @pytest.mark.parametrize(
     "agg", ["min", "max", "idxmin", "idxmax", "count", "sum", "mean"]
 )

From 25811b0465bc570ff00d18770da2ea093a111714 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 26 Mar 2021 17:28:07 -0700
Subject: [PATCH 22/49] Fix formatting.

---
 python/cudf/cudf/tests/test_groupby.py | 32 ++++++++++++++------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index e53d1b4f2ec..e6547438538 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -335,28 +335,30 @@ def test_groupby_2keys_agg(nelem, func):
 
 @pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000])
 @pytest.mark.parametrize(
-    "func",
-    ["min", "max", "idxmin", "idxmax", "count", "sum"],
+    "func", ["min", "max", "idxmin", "idxmax", "count", "sum"],
 )
 def test_groupby_agg_decimal(nelem, func):
     idx_col = np.arange(nelem)
     x = (np.random.rand(nelem) * 100).round(2)
     y = (np.random.rand(nelem) * 100).round(2)
-    pdf = pd.DataFrame({'idx': idx_col, 'x': x, 'y': y})
-    gdf = DataFrame({
-        'idx': idx_col,
-        'x': cudf.Series(x).astype(cudf.Decimal64Dtype(3, 2)),
-        'y': cudf.Series(y).astype(cudf.Decimal64Dtype(3, 2)),
-    })
+    pdf = pd.DataFrame({"idx": idx_col, "x": x, "y": y})
+    gdf = DataFrame(
+        {
+            "idx": idx_col,
+            "x": cudf.Series(x).astype(cudf.Decimal64Dtype(3, 2)),
+            "y": cudf.Series(y).astype(cudf.Decimal64Dtype(3, 2)),
+        }
+    )
 
-    expect_df = pdf.groupby('idx', sort=True).agg(func)
-    got_df = gdf.groupby('idx', sort=True).agg(func).astype(float)
+    expect_df = pdf.groupby("idx", sort=True).agg(func)
+    got_df = gdf.groupby("idx", sort=True).agg(func).astype(float)
 
-    check_dtype = False if func in _index_type_aggs else True
-    assert_allclose(got_df['x'].to_array(), expect_df['x'],
-                    atol=1e-2, rtol=1e-2)
-    assert_allclose(got_df['y'].to_array(), expect_df['y'],
-                    atol=1e-2, rtol=1e-2)
+    assert_allclose(
+        got_df["x"].to_array(), expect_df["x"], atol=1e-2, rtol=1e-2
+    )
+    assert_allclose(
+        got_df["y"].to_array(), expect_df["y"], atol=1e-2, rtol=1e-2
+    )
 
 
 @pytest.mark.parametrize(

From 4a0f27d83c0ed693cd0fe21e4cc291c4e2fdf51b Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Sun, 28 Mar 2021 08:35:53 -0700
Subject: [PATCH 23/49] Simplify drop aggs logic.

---
 python/cudf/cudf/_lib/groupby.pyx | 129 +++++++++---------------------
 1 file changed, 39 insertions(+), 90 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 4a949d80129..1024f23e773 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -19,48 +19,29 @@ cimport cudf._lib.cpp.types as libcudf_types
 cimport cudf._lib.cpp.groupby as libcudf_groupby
 cimport cudf._lib.cpp.aggregation as libcudf_aggregation
 
+from pandas.core.groupby.groupby import DataError
+from cudf.utils.dtypes import (
+    is_categorical_dtype,
+    is_string_dtype,
+    is_list_dtype,
+    is_interval_dtype,
+    is_struct_dtype,
+    is_decimal_dtype,
+)
+
 
 # The sets below define the possible aggregations that can be performed on
 # different dtypes. The uppercased versions of these strings correspond to
 # elements of the AggregationKind enum.
-_CATEGORICAL_AGGS = {
-    "count",
-    "size",
-    "nunique",
-    "unique",
-}
-
-_STRING_AGGS = {
-    "count",
-    "size",
-    "max",
-    "min",
-    "nunique",
-    "nth",
-    "collect",
-    "unique",
-}
-
-_LIST_AGGS = {
-    "collect",
-}
-
-_STRUCT_AGGS = {
-}
-
-_INTERVAL_AGGS = {
-}
-
-_DECIMAL_AGGS = {
-    "count",
-    "sum",
-    "argmin",
-    "argmax",
-    "min",
-    "max",
-    "nunique",
-    "nth",
-    "collect"
+_VALID_AGGS_BY_TYPE = {
+    "CATEGORICAL":  {"count", "size", "nunique", "unique"},
+    "STRING": {"count", "size", "max", "min", "nunique", "nth", "collect",
+               "unique"},
+    "LIST":  {"collect" },
+    "STRUCT": set(),
+    "INTERVAL": set(),
+    "DECIMAL": {"count", "sum", "argmin", "argmax", "min", "max", "nunique",
+                "nth", "collect"},
 }
 
 
@@ -187,61 +168,29 @@ cdef class GroupBy:
         return result
 
 
-def _drop_unsupported_aggs(Table values, aggs):
+def _drop_unsupported_aggregations(Table values, aggregations):
     """
     Drop any aggregations that are not supported.
     """
-    from pandas.core.groupby.groupby import DataError
-
-    if all(len(v) == 0 for v in aggs.values()):
-        return aggs
-
-    from cudf.utils.dtypes import (
-        is_categorical_dtype,
-        is_string_dtype,
-        is_list_dtype,
-        is_interval_dtype,
-        is_struct_dtype,
-        is_decimal_dtype,
-    )
-    result = aggs.copy()
-
-    for col_name in aggs:
-        if (
-            is_list_dtype(values._data[col_name].dtype)
-        ):
-            for i, agg_name in enumerate(aggs[col_name]):
-                if Aggregation(agg_name).kind not in _LIST_AGGS:
-                    del result[col_name][i]
-        elif (
-            is_string_dtype(values._data[col_name].dtype)
-        ):
-            for i, agg_name in enumerate(aggs[col_name]):
-                if Aggregation(agg_name).kind not in _STRING_AGGS:
-                    del result[col_name][i]
-        elif (
-                is_categorical_dtype(values._data[col_name].dtype)
-        ):
-            for i, agg_name in enumerate(aggs[col_name]):
-                if Aggregation(agg_name).kind not in _CATEGORICAL_AGGS:
-                    del result[col_name][i]
-        elif (
-                is_struct_dtype(values._data[col_name].dtype)
-        ):
-            for i, agg_name in enumerate(aggs[col_name]):
-                if Aggregation(agg_name).kind not in _STRUCT_AGGS:
-                    del result[col_name][i]
-        elif (
-                is_interval_dtype(values._data[col_name].dtype)
-        ):
-            for i, agg_name in enumerate(aggs[col_name]):
-                if Aggregation(agg_name).kind not in _INTERVAL_AGGS:
-                    del result[col_name][i]
-        elif (
-                is_decimal_dtype(values._data[col_name].dtype)
-        ):
-            for i, agg_name in enumerate(aggs[col_name]):
-                if Aggregation(agg_name).kind not in _DECIMAL_AGGS:
+    if all(len(v) == 0 for v in aggregations.values()):
+        return aggregations
+
+    result = aggregations.copy()
+
+    for col_name in aggregations:
+        valid_aggregations_key = (
+            "LIST" if is_list_dtype(values._data[col_name].dtype)
+            else "STRING" if is_string_dtype(values._data[col_name].dtype)
+            else "CATEGORICAL" if is_categorical_dtype(values._data[col_name].dtype)
+            else "STRUCT" if is_struct_dtype(values._data[col_name].dtype)
+            else "INTERVAL" if is_interval_dtype(values._data[col_name].dtype)
+            else "DECIMAL" if is_decimal_dtype(values._data[col_name].dtype)
+            else None
+        )
+        if valid_aggregations_key is not None:
+            valid_aggregations = _VALID_AGGS_BY_TYPE[valid_aggregations_key]
+            for i, agg_name in enumerate(aggregations[col_name]):
+                if Aggregation(agg_name).kind not in valid_aggregations:
                     del result[col_name][i]
 
     if all(len(v) == 0 for v in result.values()):

From 596496f50b2e585a86adf937330ca3080f5f738b Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Sun, 28 Mar 2021 09:32:17 -0700
Subject: [PATCH 24/49] Inline drop logic.

---
 python/cudf/cudf/_lib/groupby.pyx | 75 ++++++++++++++-----------------
 1 file changed, 33 insertions(+), 42 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 1024f23e773..d7ad0f43ab6 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -33,16 +33,14 @@ from cudf.utils.dtypes import (
 # The sets below define the possible aggregations that can be performed on
 # different dtypes. The uppercased versions of these strings correspond to
 # elements of the AggregationKind enum.
-_VALID_AGGS_BY_TYPE = {
-    "CATEGORICAL":  {"count", "size", "nunique", "unique"},
-    "STRING": {"count", "size", "max", "min", "nunique", "nth", "collect",
-               "unique"},
-    "LIST":  {"collect" },
-    "STRUCT": set(),
-    "INTERVAL": set(),
-    "DECIMAL": {"count", "sum", "argmin", "argmax", "min", "max", "nunique",
-                "nth", "collect"},
-}
+_CATEGORICAL_AGGS = {"count", "size", "nunique", "unique"}
+_STRING_AGGS = {"count", "size", "max", "min", "nunique", "nth", "collect",
+                "unique"}
+_LIST_AGGS = {"collect" }
+_STRUCT_AGGS = set()
+_INTERVAL_AGGS = set()
+_DECIMAL_AGGS = {"count", "sum", "argmin", "argmax", "min", "max", "nunique",
+                 "nth", "collect"}
 
 
 cdef class GroupBy:
@@ -114,7 +112,31 @@ cdef class GroupBy:
         cdef vector[libcudf_groupby.aggregation_request] c_agg_requests
         cdef Column col
 
-        aggregations = _drop_unsupported_aggs(values, aggregations)
+        # Drop unsupported aggregations.
+        # TODO: Is allowing users to provide empty aggregations something we do
+        # to support pandas semantics? Whereas we need to throw an error if
+        # some aggregations are requested when in fact none are possible?
+        if any(len(v) for v in aggregations.values()):
+            aggregations = aggregations.copy()
+
+            for col_name in aggregations:
+                valid_aggregations = (
+                    _LIST_AGGS if is_list_dtype(values._data[col_name].dtype)
+                    else _STRING_AGGS if is_string_dtype(values._data[col_name].dtype)
+                    else _CATEGORICAL_AGGS if is_categorical_dtype(values._data[col_name].dtype)
+                    else _STRING_AGGS if is_struct_dtype(values._data[col_name].dtype)
+                    else _INTERVAL_AGGS if is_interval_dtype(values._data[col_name].dtype)
+                    else _DECIMAL_AGGS if is_decimal_dtype(values._data[col_name].dtype)
+                    else None
+                )
+                if valid_aggregations is not None:
+                    for i, agg_name in enumerate(list(aggregations[col_name])):
+                        if Aggregation(agg_name).kind not in valid_aggregations:
+                            del aggregations[col_name][i]
+
+            if all(len(v) == 0 for v in aggregations.values()):
+                raise DataError("No numeric types to aggregate")
+
 
         for i, (col_name, aggs) in enumerate(aggregations.items()):
             col = values._data[col_name]
@@ -166,34 +188,3 @@ cdef class GroupBy:
 
         result = Table(data=result_data, index=grouped_keys)
         return result
-
-
-def _drop_unsupported_aggregations(Table values, aggregations):
-    """
-    Drop any aggregations that are not supported.
-    """
-    if all(len(v) == 0 for v in aggregations.values()):
-        return aggregations
-
-    result = aggregations.copy()
-
-    for col_name in aggregations:
-        valid_aggregations_key = (
-            "LIST" if is_list_dtype(values._data[col_name].dtype)
-            else "STRING" if is_string_dtype(values._data[col_name].dtype)
-            else "CATEGORICAL" if is_categorical_dtype(values._data[col_name].dtype)
-            else "STRUCT" if is_struct_dtype(values._data[col_name].dtype)
-            else "INTERVAL" if is_interval_dtype(values._data[col_name].dtype)
-            else "DECIMAL" if is_decimal_dtype(values._data[col_name].dtype)
-            else None
-        )
-        if valid_aggregations_key is not None:
-            valid_aggregations = _VALID_AGGS_BY_TYPE[valid_aggregations_key]
-            for i, agg_name in enumerate(aggregations[col_name]):
-                if Aggregation(agg_name).kind not in valid_aggregations:
-                    del result[col_name][i]
-
-    if all(len(v) == 0 for v in result.values()):
-        raise DataError("No numeric types to aggregate")
-
-    return result

From 1f5daddfba627a21e7f7f16f1f0bac969ed97f67 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Sun, 28 Mar 2021 19:31:07 -0700
Subject: [PATCH 25/49] Remove empty aggregations in place rather than
 beforehand.

---
 python/cudf/cudf/_lib/groupby.pyx | 59 ++++++++++++++++---------------
 1 file changed, 31 insertions(+), 28 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index d7ad0f43ab6..4aa2b753ffa 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -112,42 +112,45 @@ cdef class GroupBy:
         cdef vector[libcudf_groupby.aggregation_request] c_agg_requests
         cdef Column col
 
-        # Drop unsupported aggregations.
         # TODO: Is allowing users to provide empty aggregations something we do
         # to support pandas semantics? Whereas we need to throw an error if
         # some aggregations are requested when in fact none are possible?
-        if any(len(v) for v in aggregations.values()):
-            aggregations = aggregations.copy()
-
-            for col_name in aggregations:
-                valid_aggregations = (
-                    _LIST_AGGS if is_list_dtype(values._data[col_name].dtype)
-                    else _STRING_AGGS if is_string_dtype(values._data[col_name].dtype)
-                    else _CATEGORICAL_AGGS if is_categorical_dtype(values._data[col_name].dtype)
-                    else _STRING_AGGS if is_struct_dtype(values._data[col_name].dtype)
-                    else _INTERVAL_AGGS if is_interval_dtype(values._data[col_name].dtype)
-                    else _DECIMAL_AGGS if is_decimal_dtype(values._data[col_name].dtype)
-                    else None
-                )
-                if valid_aggregations is not None:
-                    for i, agg_name in enumerate(list(aggregations[col_name])):
-                        if Aggregation(agg_name).kind not in valid_aggregations:
-                            del aggregations[col_name][i]
-
-            if all(len(v) == 0 for v in aggregations.values()):
-                raise DataError("No numeric types to aggregate")
-
+        allow_empty = all(len(v) == 0 for v in aggregations.values())
+        empty_aggs = True
 
+        included_aggregations = defaultdict(list)
+        idx = 0
         for i, (col_name, aggs) in enumerate(aggregations.items()):
             col = values._data[col_name]
+            dtype = col.dtype
+
+            valid_aggregations = (
+                _LIST_AGGS if is_list_dtype(dtype)
+                else _STRING_AGGS if is_string_dtype(dtype)
+                else _CATEGORICAL_AGGS if is_categorical_dtype(dtype)
+                else _STRING_AGGS if is_struct_dtype(dtype)
+                else _INTERVAL_AGGS if is_interval_dtype(dtype)
+                else _DECIMAL_AGGS if is_decimal_dtype(dtype)
+                else None
+            )
+
             c_agg_requests.push_back(
                 move(libcudf_groupby.aggregation_request())
             )
-            c_agg_requests[i].values = col.view()
+            c_agg_requests[idx].values = col.view()
             for agg in aggs:
-                c_agg_requests[i].aggregations.push_back(
-                    move(make_aggregation(agg))
-                )
+                if valid_aggregations is None or Aggregation(agg).kind in valid_aggregations:
+                    empty_aggs = False
+                    included_aggregations[col_name].append(agg)
+                    c_agg_requests[idx].aggregations.push_back(
+                        move(make_aggregation(agg))
+                    )
+            if not c_agg_requests[idx].aggregations.size():
+                c_agg_requests.pop_back()
+            else:
+                idx += 1
+        if empty_aggs and not allow_empty:
+            raise DataError("No numeric types to aggregate")
 
         cdef pair[
             unique_ptr[table],
@@ -178,8 +181,8 @@ cdef class GroupBy:
         )
 
         result_data = ColumnAccessor(multiindex=True)
-        for i, col_name in enumerate(aggregations):
-            for j, agg_name in enumerate(aggregations[col_name]):
+        for i, col_name in enumerate(included_aggregations):
+            for j, agg_name in enumerate(included_aggregations[col_name]):
                 if callable(agg_name):
                     agg_name = agg_name.__name__
                 result_data[(col_name, agg_name)] = (

From f02231f04cebb760de9747e79098eaea43707d6f Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Sun, 28 Mar 2021 20:51:25 -0700
Subject: [PATCH 26/49] Reuse aggregation object.

---
 python/cudf/cudf/_lib/groupby.pyx | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 4aa2b753ffa..3400f921391 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -139,11 +139,12 @@ cdef class GroupBy:
             )
             c_agg_requests[idx].values = col.view()
             for agg in aggs:
-                if valid_aggregations is None or Aggregation(agg).kind in valid_aggregations:
+                agg_obj = Aggregation(agg)
+                if valid_aggregations is None or agg_obj.kind in valid_aggregations:
                     empty_aggs = False
                     included_aggregations[col_name].append(agg)
                     c_agg_requests[idx].aggregations.push_back(
-                        move(make_aggregation(agg))
+                        move(agg_obj.c_obj)
                     )
             if not c_agg_requests[idx].aggregations.size():
                 c_agg_requests.pop_back()

From 0e09786bd99d07fe2d48d085de3cc5c6f6c9f198 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 30 Mar 2021 11:50:04 -0700
Subject: [PATCH 27/49] Write a new make_aggregation2 function that returns the
 cdef class rather than the underlying C++ class.

---
 python/cudf/cudf/_lib/aggregation.pxd |  2 ++
 python/cudf/cudf/_lib/aggregation.pyx | 35 +++++++++++++++++++++++++++
 python/cudf/cudf/_lib/groupby.pyx     |  5 ++--
 3 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/_lib/aggregation.pxd b/python/cudf/cudf/_lib/aggregation.pxd
index bb332c44237..116e6260a57 100644
--- a/python/cudf/cudf/_lib/aggregation.pxd
+++ b/python/cudf/cudf/_lib/aggregation.pxd
@@ -8,3 +8,5 @@ cdef unique_ptr[aggregation] make_aggregation(op, kwargs=*) except *
 
 cdef class Aggregation:
     cdef unique_ptr[aggregation] c_obj
+
+cdef Aggregation make_aggregation2(op, kwargs=*)
diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 7138bb49743..e29e0e92859 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -97,6 +97,41 @@ cdef unique_ptr[aggregation] make_aggregation(op, kwargs={}) except *:
         raise TypeError("Unknown aggregation {}".format(op))
     return move(agg.c_obj)
 
+
+cdef Aggregation make_aggregation2(op, kwargs={}):
+    """
+    Parameters
+    ----------
+    op : str or callable
+        If callable, must meet one of the following requirements:
+
+        * Is of the form lambda x: x.agg(*args, **kwargs), where
+          `agg` is the name of a supported aggregation. Used to
+          to specify aggregations that take arguments, e.g.,
+          `lambda x: x.quantile(0.5)`.
+        * Is a user defined aggregation function that operates on
+          group values. In this case, the output dtype must be
+          specified in the `kwargs` dictionary.
+
+    Returns
+    -------
+    unique_ptr[aggregation]
+    """
+    cdef Aggregation agg
+    if isinstance(op, str):
+        agg = getattr(_AggregationFactory, op)(**kwargs)
+    elif callable(op):
+        if op is list:
+            agg = _AggregationFactory.collect()
+        elif "dtype" in kwargs:
+            agg = _AggregationFactory.from_udf(op, **kwargs)
+        else:
+            agg = op(_AggregationFactory)
+    else:
+        raise TypeError("Unknown aggregation {}".format(op))
+    return agg
+
+
 # The Cython pattern below enables us to create an Aggregation
 # without ever calling its `__init__` method, which would otherwise
 # result in a RecursionError.
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 3400f921391..2a4580055fc 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -12,7 +12,7 @@ from libcpp cimport bool
 
 from cudf._lib.column cimport Column
 from cudf._lib.table cimport Table
-from cudf._lib.aggregation cimport make_aggregation, Aggregation
+from cudf._lib.aggregation cimport Aggregation, make_aggregation2
 
 from cudf._lib.cpp.table.table cimport table, table_view
 cimport cudf._lib.cpp.types as libcudf_types
@@ -111,6 +111,7 @@ cdef class GroupBy:
         from cudf.core.column_accessor import ColumnAccessor
         cdef vector[libcudf_groupby.aggregation_request] c_agg_requests
         cdef Column col
+        cdef Aggregation agg_obj
 
         # TODO: Is allowing users to provide empty aggregations something we do
         # to support pandas semantics? Whereas we need to throw an error if
@@ -139,7 +140,7 @@ cdef class GroupBy:
             )
             c_agg_requests[idx].values = col.view()
             for agg in aggs:
-                agg_obj = Aggregation(agg)
+                agg_obj = make_aggregation2(agg)
                 if valid_aggregations is None or agg_obj.kind in valid_aggregations:
                     empty_aggs = False
                     included_aggregations[col_name].append(agg)

From 63897c12baf2cbba3aa6bae3f40e857b3b71ca09 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 30 Mar 2021 11:50:56 -0700
Subject: [PATCH 28/49] Remove constructor of Aggregation to avoid recursion
 issue.

---
 python/cudf/cudf/_lib/aggregation.pyx | 45 +++++++++++++--------------
 1 file changed, 21 insertions(+), 24 deletions(-)

diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index e29e0e92859..7f155a7daf5 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -56,9 +56,6 @@ class AggregationKind(Enum):
 
 
 cdef class Aggregation:
-    def __init__(self, op, **kwargs):
-        self.c_obj = move(make_aggregation(op, kwargs))
-
     @property
     def kind(self):
         return AggregationKind(self.c_obj.get()[0].kind).name.lower()
@@ -139,37 +136,37 @@ cdef class _AggregationFactory:
 
     @classmethod
     def sum(cls):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_sum_aggregation())
         return agg
 
     @classmethod
     def min(cls):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_min_aggregation())
         return agg
 
     @classmethod
     def max(cls):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_max_aggregation())
         return agg
 
     @classmethod
     def idxmin(cls):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_argmin_aggregation())
         return agg
 
     @classmethod
     def idxmax(cls):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_argmax_aggregation())
         return agg
 
     @classmethod
     def mean(cls):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_mean_aggregation())
         return agg
 
@@ -181,7 +178,7 @@ cdef class _AggregationFactory:
         else:
             c_null_handling = libcudf_types.null_policy.INCLUDE
 
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_count_aggregation(
             c_null_handling
         ))
@@ -189,7 +186,7 @@ cdef class _AggregationFactory:
 
     @classmethod
     def size(cls):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_count_aggregation(
             <libcudf_types.null_policy><underlying_type_t_null_policy>(
                 NullHandling.INCLUDE
@@ -199,13 +196,13 @@ cdef class _AggregationFactory:
 
     @classmethod
     def nunique(cls):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_nunique_aggregation())
         return agg
 
     @classmethod
     def nth(cls, libcudf_types.size_type size):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(
             libcudf_aggregation.make_nth_element_aggregation(size)
         )
@@ -213,49 +210,49 @@ cdef class _AggregationFactory:
 
     @classmethod
     def any(cls):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_any_aggregation())
         return agg
 
     @classmethod
     def all(cls):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_all_aggregation())
         return agg
 
     @classmethod
     def product(cls):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_product_aggregation())
         return agg
 
     @classmethod
     def sum_of_squares(cls):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_sum_of_squares_aggregation())
         return agg
 
     @classmethod
     def var(cls, ddof=1):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_variance_aggregation(ddof))
         return agg
 
     @classmethod
     def std(cls, ddof=1):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_std_aggregation(ddof))
         return agg
 
     @classmethod
     def median(cls):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_median_aggregation())
         return agg
 
     @classmethod
     def quantile(cls, q=0.5, interpolation="linear"):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
 
         if not pd.api.types.is_list_like(q):
             q = [q]
@@ -275,19 +272,19 @@ cdef class _AggregationFactory:
 
     @classmethod
     def collect(cls):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_collect_list_aggregation())
         return agg
 
     @classmethod
     def unique(cls):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
         agg.c_obj = move(libcudf_aggregation.make_collect_set_aggregation())
         return agg
 
     @classmethod
     def from_udf(cls, op, *args, **kwargs):
-        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        cdef Aggregation agg = Aggregation()
 
         cdef libcudf_types.type_id tid
         cdef libcudf_types.data_type out_dtype

From 5fbde6f0d57f9de71e61ff62da49b5632301160a Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 30 Mar 2021 11:54:21 -0700
Subject: [PATCH 29/49] Remove _AggregationFactory and just use classmethods of
 Aggregation.

---
 python/cudf/cudf/_lib/aggregation.pyx | 142 ++++++++++++--------------
 1 file changed, 68 insertions(+), 74 deletions(-)

diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 7f155a7daf5..c07e92f9dc6 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -60,80 +60,6 @@ cdef class Aggregation:
     def kind(self):
         return AggregationKind(self.c_obj.get()[0].kind).name.lower()
 
-
-cdef unique_ptr[aggregation] make_aggregation(op, kwargs={}) except *:
-    """
-    Parameters
-    ----------
-    op : str or callable
-        If callable, must meet one of the following requirements:
-
-        * Is of the form lambda x: x.agg(*args, **kwargs), where
-          `agg` is the name of a supported aggregation. Used to
-          to specify aggregations that take arguments, e.g.,
-          `lambda x: x.quantile(0.5)`.
-        * Is a user defined aggregation function that operates on
-          group values. In this case, the output dtype must be
-          specified in the `kwargs` dictionary.
-
-    Returns
-    -------
-    unique_ptr[aggregation]
-    """
-    cdef Aggregation agg
-    if isinstance(op, str):
-        agg = getattr(_AggregationFactory, op)(**kwargs)
-    elif callable(op):
-        if op is list:
-            agg = _AggregationFactory.collect()
-        elif "dtype" in kwargs:
-            agg = _AggregationFactory.from_udf(op, **kwargs)
-        else:
-            agg = op(_AggregationFactory)
-    else:
-        raise TypeError("Unknown aggregation {}".format(op))
-    return move(agg.c_obj)
-
-
-cdef Aggregation make_aggregation2(op, kwargs={}):
-    """
-    Parameters
-    ----------
-    op : str or callable
-        If callable, must meet one of the following requirements:
-
-        * Is of the form lambda x: x.agg(*args, **kwargs), where
-          `agg` is the name of a supported aggregation. Used to
-          to specify aggregations that take arguments, e.g.,
-          `lambda x: x.quantile(0.5)`.
-        * Is a user defined aggregation function that operates on
-          group values. In this case, the output dtype must be
-          specified in the `kwargs` dictionary.
-
-    Returns
-    -------
-    unique_ptr[aggregation]
-    """
-    cdef Aggregation agg
-    if isinstance(op, str):
-        agg = getattr(_AggregationFactory, op)(**kwargs)
-    elif callable(op):
-        if op is list:
-            agg = _AggregationFactory.collect()
-        elif "dtype" in kwargs:
-            agg = _AggregationFactory.from_udf(op, **kwargs)
-        else:
-            agg = op(_AggregationFactory)
-    else:
-        raise TypeError("Unknown aggregation {}".format(op))
-    return agg
-
-
-# The Cython pattern below enables us to create an Aggregation
-# without ever calling its `__init__` method, which would otherwise
-# result in a RecursionError.
-cdef class _AggregationFactory:
-
     @classmethod
     def sum(cls):
         cdef Aggregation agg = Aggregation()
@@ -314,3 +240,71 @@ cdef class _AggregationFactory:
             libcudf_aggregation.udf_type.PTX, cpp_str, out_dtype
         ))
         return agg
+
+
+cdef unique_ptr[aggregation] make_aggregation(op, kwargs={}) except *:
+    """
+    Parameters
+    ----------
+    op : str or callable
+        If callable, must meet one of the following requirements:
+
+        * Is of the form lambda x: x.agg(*args, **kwargs), where
+          `agg` is the name of a supported aggregation. Used to
+          to specify aggregations that take arguments, e.g.,
+          `lambda x: x.quantile(0.5)`.
+        * Is a user defined aggregation function that operates on
+          group values. In this case, the output dtype must be
+          specified in the `kwargs` dictionary.
+
+    Returns
+    -------
+    unique_ptr[aggregation]
+    """
+    cdef Aggregation agg
+    if isinstance(op, str):
+        agg = getattr(Aggregation, op)(**kwargs)
+    elif callable(op):
+        if op is list:
+            agg = Aggregation.collect()
+        elif "dtype" in kwargs:
+            agg = Aggregation.from_udf(op, **kwargs)
+        else:
+            agg = op(Aggregation)
+    else:
+        raise TypeError("Unknown aggregation {}".format(op))
+    return move(agg.c_obj)
+
+
+cdef Aggregation make_aggregation2(op, kwargs={}):
+    """
+    Parameters
+    ----------
+    op : str or callable
+        If callable, must meet one of the following requirements:
+
+        * Is of the form lambda x: x.agg(*args, **kwargs), where
+          `agg` is the name of a supported aggregation. Used to
+          to specify aggregations that take arguments, e.g.,
+          `lambda x: x.quantile(0.5)`.
+        * Is a user defined aggregation function that operates on
+          group values. In this case, the output dtype must be
+          specified in the `kwargs` dictionary.
+
+    Returns
+    -------
+    unique_ptr[aggregation]
+    """
+    cdef Aggregation agg
+    if isinstance(op, str):
+        agg = getattr(Aggregation, op)(**kwargs)
+    elif callable(op):
+        if op is list:
+            agg = Aggregation.collect()
+        elif "dtype" in kwargs:
+            agg = Aggregation.from_udf(op, **kwargs)
+        else:
+            agg = op(Aggregation)
+    else:
+        raise TypeError("Unknown aggregation {}".format(op))
+    return agg

From f9508feefa061b32becd111dba3a34b0888fb841 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 30 Mar 2021 12:04:44 -0700
Subject: [PATCH 30/49] Remove old API for aggregations, use new Cython object
 everywhere.

---
 python/cudf/cudf/_lib/aggregation.pxd |  6 ++---
 python/cudf/cudf/_lib/aggregation.pyx | 36 +--------------------------
 python/cudf/cudf/_lib/groupby.pyx     |  4 +--
 python/cudf/cudf/_lib/reduce.pyx      | 12 ++++-----
 python/cudf/cudf/_lib/rolling.pyx     | 11 ++++----
 5 files changed, 17 insertions(+), 52 deletions(-)

diff --git a/python/cudf/cudf/_lib/aggregation.pxd b/python/cudf/cudf/_lib/aggregation.pxd
index 116e6260a57..13d04575700 100644
--- a/python/cudf/cudf/_lib/aggregation.pxd
+++ b/python/cudf/cudf/_lib/aggregation.pxd
@@ -4,9 +4,9 @@ from libcpp.memory cimport unique_ptr
 from cudf._lib.cpp.aggregation cimport aggregation
 
 
-cdef unique_ptr[aggregation] make_aggregation(op, kwargs=*) except *
-
 cdef class Aggregation:
     cdef unique_ptr[aggregation] c_obj
 
-cdef Aggregation make_aggregation2(op, kwargs=*)
+# TODO: the old version of this function was declared as except *, I'm not sure
+# why but I should figure that out before calling this done.
+cdef Aggregation make_aggregation(op, kwargs=*)
diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index c07e92f9dc6..bc7cc1fbfec 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -242,41 +242,7 @@ cdef class Aggregation:
         return agg
 
 
-cdef unique_ptr[aggregation] make_aggregation(op, kwargs={}) except *:
-    """
-    Parameters
-    ----------
-    op : str or callable
-        If callable, must meet one of the following requirements:
-
-        * Is of the form lambda x: x.agg(*args, **kwargs), where
-          `agg` is the name of a supported aggregation. Used to
-          to specify aggregations that take arguments, e.g.,
-          `lambda x: x.quantile(0.5)`.
-        * Is a user defined aggregation function that operates on
-          group values. In this case, the output dtype must be
-          specified in the `kwargs` dictionary.
-
-    Returns
-    -------
-    unique_ptr[aggregation]
-    """
-    cdef Aggregation agg
-    if isinstance(op, str):
-        agg = getattr(Aggregation, op)(**kwargs)
-    elif callable(op):
-        if op is list:
-            agg = Aggregation.collect()
-        elif "dtype" in kwargs:
-            agg = Aggregation.from_udf(op, **kwargs)
-        else:
-            agg = op(Aggregation)
-    else:
-        raise TypeError("Unknown aggregation {}".format(op))
-    return move(agg.c_obj)
-
-
-cdef Aggregation make_aggregation2(op, kwargs={}):
+cdef Aggregation make_aggregation(op, kwargs={}):
     """
     Parameters
     ----------
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 2a4580055fc..f5340f23710 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -12,7 +12,7 @@ from libcpp cimport bool
 
 from cudf._lib.column cimport Column
 from cudf._lib.table cimport Table
-from cudf._lib.aggregation cimport Aggregation, make_aggregation2
+from cudf._lib.aggregation cimport Aggregation, make_aggregation
 
 from cudf._lib.cpp.table.table cimport table, table_view
 cimport cudf._lib.cpp.types as libcudf_types
@@ -140,7 +140,7 @@ cdef class GroupBy:
             )
             c_agg_requests[idx].values = col.view()
             for agg in aggs:
-                agg_obj = make_aggregation2(agg)
+                agg_obj = make_aggregation(agg)
                 if valid_aggregations is None or agg_obj.kind in valid_aggregations:
                     empty_aggs = False
                     included_aggregations[col_name].append(agg)
diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 2185cb089a7..2cdcaa2d9e0 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -10,7 +10,7 @@ from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.column cimport Column
 from cudf._lib.types import np_to_cudf_types
 from cudf._lib.types cimport underlying_type_t_type_id
-from cudf._lib.aggregation cimport make_aggregation, aggregation
+from cudf._lib.aggregation cimport make_aggregation, aggregation, Aggregation
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move, pair
 import numpy as np
@@ -38,9 +38,8 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
 
     cdef column_view c_incol_view = incol.view()
     cdef unique_ptr[scalar] c_result
-    cdef unique_ptr[aggregation] c_agg = move(make_aggregation(
-        reduction_op, kwargs
-    ))
+    cdef Aggregation cython_agg = make_aggregation(reduction_op, kwargs)
+    cdef unique_ptr[aggregation] c_agg = move(cython_agg.c_obj)
     cdef type_id tid = (
         <type_id> (
             <underlying_type_t_type_id> (
@@ -88,9 +87,8 @@ def scan(scan_op, Column incol, inclusive, **kwargs):
     """
     cdef column_view c_incol_view = incol.view()
     cdef unique_ptr[column] c_result
-    cdef unique_ptr[aggregation] c_agg = move(
-        make_aggregation(scan_op, kwargs)
-    )
+    cdef Aggregation cython_agg = make_aggregation(scan_op, kwargs)
+    cdef unique_ptr[aggregation] c_agg = move(cython_agg.c_obj)
 
     cdef scan_type c_inclusive
     if inclusive is True:
diff --git a/python/cudf/cudf/_lib/rolling.pyx b/python/cudf/cudf/_lib/rolling.pyx
index 9c818f39c38..c985f60f52f 100644
--- a/python/cudf/cudf/_lib/rolling.pyx
+++ b/python/cudf/cudf/_lib/rolling.pyx
@@ -8,7 +8,7 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.aggregation cimport make_aggregation
+from cudf._lib.aggregation cimport Aggregation, make_aggregation
 
 from cudf._lib.cpp.types cimport size_type
 from cudf._lib.cpp.column.column cimport column
@@ -48,13 +48,14 @@ def rolling(Column source_column, Column pre_column_window,
     cdef column_view pre_column_window_view
     cdef column_view fwd_column_window_view
     cdef unique_ptr[aggregation] agg
+    cdef Aggregation cython_agg
 
     if callable(op):
-        agg = move(
-            make_aggregation(op, {'dtype': source_column.dtype})
-        )
+        cython_agg = make_aggregation(op, {'dtype': source_column.dtype})
+        agg = move(cython_agg.c_obj)
     else:
-        agg = move(make_aggregation(op))
+        cython_agg = make_aggregation(op)
+        agg = move(cython_agg.c_obj)
 
     if window is None:
         if center:

From 4c04b5f3c22b8d17554e6567cc9c31a7cf4a3302 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 30 Mar 2021 12:09:19 -0700
Subject: [PATCH 31/49] Remove direct usage of the C++ groupby object in most
 of the Cython code.

---
 python/cudf/cudf/_lib/groupby.pyx | 1 -
 python/cudf/cudf/_lib/reduce.pyx  | 8 +++-----
 python/cudf/cudf/_lib/rolling.pyx | 8 ++------
 3 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index f5340f23710..a4d913d0849 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -17,7 +17,6 @@ from cudf._lib.aggregation cimport Aggregation, make_aggregation
 from cudf._lib.cpp.table.table cimport table, table_view
 cimport cudf._lib.cpp.types as libcudf_types
 cimport cudf._lib.cpp.groupby as libcudf_groupby
-cimport cudf._lib.cpp.aggregation as libcudf_aggregation
 
 from pandas.core.groupby.groupby import DataError
 from cudf.utils.dtypes import (
diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 2cdcaa2d9e0..924b2fa8782 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -10,7 +10,7 @@ from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.column cimport Column
 from cudf._lib.types import np_to_cudf_types
 from cudf._lib.types cimport underlying_type_t_type_id
-from cudf._lib.aggregation cimport make_aggregation, aggregation, Aggregation
+from cudf._lib.aggregation cimport make_aggregation, Aggregation
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move, pair
 import numpy as np
@@ -39,7 +39,6 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
     cdef column_view c_incol_view = incol.view()
     cdef unique_ptr[scalar] c_result
     cdef Aggregation cython_agg = make_aggregation(reduction_op, kwargs)
-    cdef unique_ptr[aggregation] c_agg = move(cython_agg.c_obj)
     cdef type_id tid = (
         <type_id> (
             <underlying_type_t_type_id> (
@@ -64,7 +63,7 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
     with nogil:
         c_result = move(cpp_reduce(
             c_incol_view,
-            c_agg,
+            cython_agg.c_obj,
             c_out_dtype
         ))
 
@@ -88,7 +87,6 @@ def scan(scan_op, Column incol, inclusive, **kwargs):
     cdef column_view c_incol_view = incol.view()
     cdef unique_ptr[column] c_result
     cdef Aggregation cython_agg = make_aggregation(scan_op, kwargs)
-    cdef unique_ptr[aggregation] c_agg = move(cython_agg.c_obj)
 
     cdef scan_type c_inclusive
     if inclusive is True:
@@ -99,7 +97,7 @@ def scan(scan_op, Column incol, inclusive, **kwargs):
     with nogil:
         c_result = move(cpp_scan(
             c_incol_view,
-            c_agg,
+            cython_agg.c_obj,
             c_inclusive
         ))
 
diff --git a/python/cudf/cudf/_lib/rolling.pyx b/python/cudf/cudf/_lib/rolling.pyx
index c985f60f52f..d67fb431ec4 100644
--- a/python/cudf/cudf/_lib/rolling.pyx
+++ b/python/cudf/cudf/_lib/rolling.pyx
@@ -13,7 +13,6 @@ from cudf._lib.aggregation cimport Aggregation, make_aggregation
 from cudf._lib.cpp.types cimport size_type
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.aggregation cimport aggregation
 from cudf._lib.cpp.rolling cimport (
     rolling_window as cpp_rolling_window
 )
@@ -47,15 +46,12 @@ def rolling(Column source_column, Column pre_column_window,
     cdef column_view source_column_view = source_column.view()
     cdef column_view pre_column_window_view
     cdef column_view fwd_column_window_view
-    cdef unique_ptr[aggregation] agg
     cdef Aggregation cython_agg
 
     if callable(op):
         cython_agg = make_aggregation(op, {'dtype': source_column.dtype})
-        agg = move(cython_agg.c_obj)
     else:
         cython_agg = make_aggregation(op)
-        agg = move(cython_agg.c_obj)
 
     if window is None:
         if center:
@@ -72,7 +68,7 @@ def rolling(Column source_column, Column pre_column_window,
                     pre_column_window_view,
                     fwd_column_window_view,
                     c_min_periods,
-                    agg)
+                    cython_agg.c_obj)
             )
     else:
         c_min_periods = min_periods
@@ -90,7 +86,7 @@ def rolling(Column source_column, Column pre_column_window,
                     c_window,
                     c_forward_window,
                     c_min_periods,
-                    agg)
+                    cython_agg.c_obj)
             )
 
     return Column.from_unique_ptr(move(c_result))

From b21dfafb8b647d998d99a44869df7506049a08ce Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 30 Mar 2021 12:22:15 -0700
Subject: [PATCH 32/49] Add a docstring for the Aggregation class.

---
 python/cudf/cudf/_lib/aggregation.pyx | 54 ++++++++++++++++-----------
 1 file changed, 33 insertions(+), 21 deletions(-)

diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index bc7cc1fbfec..e52cb0762ee 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -56,43 +56,55 @@ class AggregationKind(Enum):
 
 
 cdef class Aggregation:
+    """A Cython wrapper for aggregations.
+
+    **This class should never be instantiated using a standard constructor,
+    only using one of its many factories.** These factories handle mapping
+    different cudf operations to their libcudf analogs, e.g.
+    `cudf.DataFrame.idxmin` -> `libcudf.argmin`. Additionally, they perform
+    any additional configuration needed to translate Python arguments into
+    their corresponding C++ types (for instance, C++ enumerations used for
+    flag arguments). The factory approach is necessary to support operations
+    like `df.agg(lambda x: x.sum())`; such functions are called with this
+    class as an argument to generation the desired aggregation.
+    """
     @property
     def kind(self):
         return AggregationKind(self.c_obj.get()[0].kind).name.lower()
 
     @classmethod
     def sum(cls):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_sum_aggregation())
         return agg
 
     @classmethod
     def min(cls):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_min_aggregation())
         return agg
 
     @classmethod
     def max(cls):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_max_aggregation())
         return agg
 
     @classmethod
     def idxmin(cls):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_argmin_aggregation())
         return agg
 
     @classmethod
     def idxmax(cls):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_argmax_aggregation())
         return agg
 
     @classmethod
     def mean(cls):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_mean_aggregation())
         return agg
 
@@ -104,7 +116,7 @@ cdef class Aggregation:
         else:
             c_null_handling = libcudf_types.null_policy.INCLUDE
 
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_count_aggregation(
             c_null_handling
         ))
@@ -112,7 +124,7 @@ cdef class Aggregation:
 
     @classmethod
     def size(cls):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_count_aggregation(
             <libcudf_types.null_policy><underlying_type_t_null_policy>(
                 NullHandling.INCLUDE
@@ -122,13 +134,13 @@ cdef class Aggregation:
 
     @classmethod
     def nunique(cls):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_nunique_aggregation())
         return agg
 
     @classmethod
     def nth(cls, libcudf_types.size_type size):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(
             libcudf_aggregation.make_nth_element_aggregation(size)
         )
@@ -136,49 +148,49 @@ cdef class Aggregation:
 
     @classmethod
     def any(cls):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_any_aggregation())
         return agg
 
     @classmethod
     def all(cls):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_all_aggregation())
         return agg
 
     @classmethod
     def product(cls):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_product_aggregation())
         return agg
 
     @classmethod
     def sum_of_squares(cls):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_sum_of_squares_aggregation())
         return agg
 
     @classmethod
     def var(cls, ddof=1):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_variance_aggregation(ddof))
         return agg
 
     @classmethod
     def std(cls, ddof=1):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_std_aggregation(ddof))
         return agg
 
     @classmethod
     def median(cls):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_median_aggregation())
         return agg
 
     @classmethod
     def quantile(cls, q=0.5, interpolation="linear"):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
 
         if not pd.api.types.is_list_like(q):
             q = [q]
@@ -198,19 +210,19 @@ cdef class Aggregation:
 
     @classmethod
     def collect(cls):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_collect_list_aggregation())
         return agg
 
     @classmethod
     def unique(cls):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
         agg.c_obj = move(libcudf_aggregation.make_collect_set_aggregation())
         return agg
 
     @classmethod
     def from_udf(cls, op, *args, **kwargs):
-        cdef Aggregation agg = Aggregation()
+        cdef Aggregation agg = cls()
 
         cdef libcudf_types.type_id tid
         cdef libcudf_types.data_type out_dtype

From e3ae492d37710e765154224d1f088e6dd8c59571 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 30 Mar 2021 12:24:17 -0700
Subject: [PATCH 33/49] Don't use a dict as a default argument.

---
 python/cudf/cudf/_lib/aggregation.pyx | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index e52cb0762ee..717c206397c 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -254,8 +254,8 @@ cdef class Aggregation:
         return agg
 
 
-cdef Aggregation make_aggregation(op, kwargs={}):
-    """
+cdef Aggregation make_aggregation(op, kwargs=None):
+    r"""
     Parameters
     ----------
     op : str or callable
@@ -268,11 +268,16 @@ cdef Aggregation make_aggregation(op, kwargs={}):
         * Is a user defined aggregation function that operates on
           group values. In this case, the output dtype must be
           specified in the `kwargs` dictionary.
+    \*\*kwargs : dict, optional
+        Any keyword arguments to be passed to the op.
 
     Returns
     -------
     unique_ptr[aggregation]
     """
+    if kwargs is None:
+        kwargs = {}
+
     cdef Aggregation agg
     if isinstance(op, str):
         agg = getattr(Aggregation, op)(**kwargs)

From bac0de5558f8aeb05046d8cfdd76dbbc90c53990 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 30 Mar 2021 12:32:23 -0700
Subject: [PATCH 34/49] Use the uppercased versions of the acceptable
 aggregations.

---
 python/cudf/cudf/_lib/aggregation.pyx |  2 +-
 python/cudf/cudf/_lib/groupby.pyx     | 15 +++++++--------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 717c206397c..020745422a8 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -70,7 +70,7 @@ cdef class Aggregation:
     """
     @property
     def kind(self):
-        return AggregationKind(self.c_obj.get()[0].kind).name.lower()
+        return AggregationKind(self.c_obj.get()[0].kind).name
 
     @classmethod
     def sum(cls):
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index a4d913d0849..82a8a905b0b 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -30,16 +30,15 @@ from cudf.utils.dtypes import (
 
 
 # The sets below define the possible aggregations that can be performed on
-# different dtypes. The uppercased versions of these strings correspond to
-# elements of the AggregationKind enum.
-_CATEGORICAL_AGGS = {"count", "size", "nunique", "unique"}
-_STRING_AGGS = {"count", "size", "max", "min", "nunique", "nth", "collect",
-                "unique"}
-_LIST_AGGS = {"collect" }
+# different dtypes. These strings must be elements of the AggregationKind enum.
+_CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"}
+_STRING_AGGS = {"COUNT", "SIZE", "MAX", "MIN", "NUNIQUE", "NTH", "COLLECT",
+                "UNIQUE"}
+_LIST_AGGS = {"COLLECT" }
 _STRUCT_AGGS = set()
 _INTERVAL_AGGS = set()
-_DECIMAL_AGGS = {"count", "sum", "argmin", "argmax", "min", "max", "nunique",
-                 "nth", "collect"}
+_DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE",
+                 "NTH", "COLLECT"}
 
 
 cdef class GroupBy:

From 4c7f8d270d85b1e1fea0966d63a0bf501ec03859 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 30 Mar 2021 13:02:58 -0700
Subject: [PATCH 35/49] Just define all the aggregations available on the
 GroupBy as methods.

---
 python/cudf/cudf/core/groupby/groupby.py | 99 +++++++++++++++---------
 1 file changed, 62 insertions(+), 37 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index cc94548d9a2..29efb74afa0 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1,6 +1,5 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
 import collections
-import functools
 import pickle
 import warnings
 
@@ -570,47 +569,73 @@ def rolling(self, *args, **kwargs):
         """
         return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs)
 
+    def count(self, dropna=True):
+        """Compute the sizes of each column."""
+        def func(x):
+            return getattr(x, "count")(dropna=dropna)
 
-# Set of valid groupby aggregations that are monkey-patched into the GroupBy
-# namespace.
-_VALID_GROUPBY_AGGS = {
-    "count",
-    "sum",
-    "idxmin",
-    "idxmax",
-    "min",
-    "max",
-    "mean",
-    "var",
-    "std",
-    "quantile",
-    "median",
-    "nunique",
-    "collect",
-    "unique",
-}
-
-
-# Dynamically bind the different aggregation methods.
-def _agg_func_name_with_args(self, func_name, *args, **kwargs):
-    """
-    Aggregate given an aggregate function name and arguments to the
-    function, e.g., `_agg_func_name_with_args("quantile", 0.5)`. The named
-    aggregations must be members of _AggregationFactory.
-    """
+        return self.agg(func)
+
+    def sum(self):
+        """Compute the sum of each column."""
+        return self.agg("sum")
+
+    def idxmin(self):
+        """Compute the index of the minimum value in each column."""
+        return self.agg("idxmin")
+
+    def idxmax(self):
+        """Compute the index of the maximum value in each column."""
+        return self.agg("idxmax")
+
+    def min(self):
+        """Compute the minimum value in each column."""
+        return self.agg("min")
+
+    def max(self):
+        """Compute the maximum value in each column."""
+        return self.agg("max")
+
+    def mean(self):
+        """Compute the mean value in each column."""
+        return self.agg("mean")
 
-    def func(x):
-        """Compute the {} of the group.""".format(func_name)
-        return getattr(x, func_name)(*args, **kwargs)
+    def median(self):
+        """Compute the median value in each column."""
+        return self.agg("median")
 
-    func.__name__ = func_name
-    return self.agg(func)
+    def var(self, ddof=1):
+        """Compute the variance of the data in each column."""
+        def func(x):
+            return getattr(x, "var")(ddof=ddof)
+
+        return self.agg(func)
+
+    def std(self, ddof=1):
+        """Compute the standard deviation of the data in each column."""
+        def func(x):
+            return getattr(x, "std")(ddof=ddof)
+
+        return self.agg(func)
+
+    def quantile(self, q=0.5, interpolation="linear"):
+        """Compute the quantiles of each column."""
+        def func(x):
+            return getattr(x, "quantile")(q=q, interpolation=interpolation)
+
+        return self.agg(func)
+
+    def nunique(self):
+        """Compute the number of unique elements in each column."""
+        return self.agg("nunique")
 
+    def collect(self):
+        """Get a list of the items in each column."""
+        return self.agg("collect")
 
-for key in _VALID_GROUPBY_AGGS:
-    setattr(
-        GroupBy, key, functools.partialmethod(_agg_func_name_with_args, key)
-    )
+    def unique(self):
+        """Compute the unique elements in each column."""
+        return self.agg("unique")
 
 
 class DataFrameGroupBy(GroupBy):

From 46e054563c6d9ce766b0f5fb283d5d30f1f03785 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 1 Apr 2021 09:31:45 -0700
Subject: [PATCH 36/49] Fix up merge.

---
 python/cudf/cudf/_lib/groupby.pyx        |  6 ++++
 python/cudf/cudf/core/groupby/groupby.py | 42 ------------------------
 2 files changed, 6 insertions(+), 42 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 40fd9dac98d..2903832e7c5 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -133,6 +133,12 @@ cdef class GroupBy:
                 else _DECIMAL_AGGS if is_decimal_dtype(dtype)
                 else None
             )
+            if (valid_aggregations is _DECIMAL_AGGS
+                and rmm._cuda.gpu.runtimeGetVersion() < 11000):
+                raise RuntimeError(
+                    "Decimal aggregations are only supported on CUDA >= 11 "
+                    "due to an nvcc compiler bug."
+                )
 
             c_agg_requests.push_back(
                 move(libcudf_groupby.aggregation_request())
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 4acf8e7749c..29efb74afa0 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -638,48 +638,6 @@ def unique(self):
         return self.agg("unique")
 
 
-# Set of valid groupby aggregations that are monkey-patched into the GroupBy
-# namespace.
-_VALID_GROUPBY_AGGS = {
-    "count",
-    "sum",
-    "idxmin",
-    "idxmax",
-    "min",
-    "max",
-    "mean",
-    "var",
-    "std",
-    "quantile",
-    "median",
-    "nunique",
-    "collect",
-    "unique",
-}
-
-
-# Dynamically bind the different aggregation methods.
-def _agg_func_name_with_args(self, func_name, *args, **kwargs):
-    """
-    Aggregate given an aggregate function name and arguments to the
-    function, e.g., `_agg_func_name_with_args("quantile", 0.5)`. The named
-    aggregations must be members of _AggregationFactory.
-    """
-
-    def func(x):
-        """Compute the {} of the group.""".format(func_name)
-        return getattr(x, func_name)(*args, **kwargs)
-
-    func.__name__ = func_name
-    return self.agg(func)
-
-
-for key in _VALID_GROUPBY_AGGS:
-    setattr(
-        GroupBy, key, functools.partialmethod(_agg_func_name_with_args, key)
-    )
-
-
 class DataFrameGroupBy(GroupBy):
     def __init__(
         self, obj, by=None, level=None, sort=False, as_index=True, dropna=True

From 26f101663ef723c8b795d5a765e62f49439796c4 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 1 Apr 2021 10:21:24 -0700
Subject: [PATCH 37/49] Some minor cleanup.

---
 python/cudf/cudf/_lib/aggregation.pyx |  2 +-
 python/cudf/cudf/_lib/groupby.pyx     | 16 ++++++----------
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 020745422a8..f512cddf8f0 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -273,7 +273,7 @@ cdef Aggregation make_aggregation(op, kwargs=None):
 
     Returns
     -------
-    unique_ptr[aggregation]
+    Aggregation
     """
     if kwargs is None:
         kwargs = {}
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 2903832e7c5..91451f8563f 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -112,11 +112,7 @@ cdef class GroupBy:
         cdef Column col
         cdef Aggregation agg_obj
 
-        # TODO: Is allowing users to provide empty aggregations something we do
-        # to support pandas semantics? Whereas we need to throw an error if
-        # some aggregations are requested when in fact none are possible?
         allow_empty = all(len(v) == 0 for v in aggregations.values())
-        empty_aggs = True
 
         included_aggregations = defaultdict(list)
         idx = 0
@@ -143,20 +139,19 @@ cdef class GroupBy:
             c_agg_requests.push_back(
                 move(libcudf_groupby.aggregation_request())
             )
-            c_agg_requests[idx].values = col.view()
             for agg in aggs:
                 agg_obj = make_aggregation(agg)
                 if valid_aggregations is None or agg_obj.kind in valid_aggregations:
-                    empty_aggs = False
                     included_aggregations[col_name].append(agg)
                     c_agg_requests[idx].aggregations.push_back(
                         move(agg_obj.c_obj)
                     )
-            if not c_agg_requests[idx].aggregations.size():
+            if c_agg_requests[idx].aggregations.empty():
                 c_agg_requests.pop_back()
             else:
+                c_agg_requests[idx].values = col.view()
                 idx += 1
-        if empty_aggs and not allow_empty:
+        if c_agg_requests.empty() and not allow_empty:
             raise DataError("No numeric types to aggregate")
 
         cdef pair[
@@ -188,6 +183,8 @@ cdef class GroupBy:
         )
 
         result_data = ColumnAccessor(multiindex=True)
+        # Note: This loop relies on the included_aggregations dict being
+        # insertion ordered to map results to requested aggregations by index.
         for i, col_name in enumerate(included_aggregations):
             for j, agg_name in enumerate(included_aggregations[col_name]):
                 if callable(agg_name):
@@ -196,5 +193,4 @@ cdef class GroupBy:
                     Column.from_unique_ptr(move(c_result.second[i].results[j]))
                 )
 
-        result = Table(data=result_data, index=grouped_keys)
-        return result
+        return Table(data=result_data, index=grouped_keys)

From a7148a4d40580cf52849cbab37caadac196d4c02 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 1 Apr 2021 10:29:02 -0700
Subject: [PATCH 38/49] Fix docstrings.

---
 python/cudf/cudf/core/groupby/groupby.py | 37 +++++++++++++++++++++---
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 29efb74afa0..dd462427ff3 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -570,7 +570,13 @@ def rolling(self, *args, **kwargs):
         return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs)
 
     def count(self, dropna=True):
-        """Compute the sizes of each column."""
+        """Compute the sizes of each column.
+
+        Parameters
+        ----------
+        dropna : bool
+            If ``True``, don't include null values in the count.
+        """
         def func(x):
             return getattr(x, "count")(dropna=dropna)
 
@@ -605,21 +611,44 @@ def median(self):
         return self.agg("median")
 
     def var(self, ddof=1):
-        """Compute the variance of the data in each column."""
+        """Compute the variance of the data in each column.
+
+        Parameters
+        ----------
+        ddof : int
+            The delta degrees of freedom. N - ddof is the divisor used to
+            normalize the variance.
+        """
         def func(x):
             return getattr(x, "var")(ddof=ddof)
 
         return self.agg(func)
 
     def std(self, ddof=1):
-        """Compute the standard deviation of the data in each column."""
+        """Compute the standard deviation of the data in each column.
+
+        Parameters
+        ----------
+        ddof : int
+            The delta degrees of freedom. N - ddof is the divisor used to
+            normalize the standard deviation.
+        """
         def func(x):
             return getattr(x, "std")(ddof=ddof)
 
         return self.agg(func)
 
     def quantile(self, q=0.5, interpolation="linear"):
-        """Compute the quantiles of each column."""
+        """Compute the quantiles of each column.
+
+        Parameters
+        ----------
+        q : float or array-like
+            The quantiles to compute.
+        interpolation : {"linear", "lower", "higher", "midpoint", "nearest"}
+            The interpolation method to use when the desired quantile lies
+            between two data points. Defaults to "linear".
+       """
         def func(x):
             return getattr(x, "quantile")(q=q, interpolation=interpolation)
 

From 4dc1d7995c4cc881c002af2b91ed7c83a9ac14af Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 1 Apr 2021 11:02:36 -0700
Subject: [PATCH 39/49] Fix style.

---
 python/cudf/cudf/_lib/groupby.pyx | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 91451f8563f..e76e4333f09 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -1,6 +1,15 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
 
 from collections import defaultdict
+from pandas.core.groupby.groupby import DataError
+from cudf.utils.dtypes import (
+    is_categorical_dtype,
+    is_string_dtype,
+    is_list_dtype,
+    is_interval_dtype,
+    is_struct_dtype,
+    is_decimal_dtype,
+)
 
 import numpy as np
 import rmm
@@ -19,15 +28,6 @@ from cudf._lib.cpp.table.table cimport table, table_view
 cimport cudf._lib.cpp.types as libcudf_types
 cimport cudf._lib.cpp.groupby as libcudf_groupby
 
-from pandas.core.groupby.groupby import DataError
-from cudf.utils.dtypes import (
-    is_categorical_dtype,
-    is_string_dtype,
-    is_list_dtype,
-    is_interval_dtype,
-    is_struct_dtype,
-    is_decimal_dtype,
-)
 
 
 # The sets below define the possible aggregations that can be performed on
@@ -35,7 +35,7 @@ from cudf.utils.dtypes import (
 _CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"}
 _STRING_AGGS = {"COUNT", "SIZE", "MAX", "MIN", "NUNIQUE", "NTH", "COLLECT",
                 "UNIQUE"}
-_LIST_AGGS = {"COLLECT" }
+_LIST_AGGS = {"COLLECT"}
 _STRUCT_AGGS = set()
 _INTERVAL_AGGS = set()
 _DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE",
@@ -127,10 +127,10 @@ cdef class GroupBy:
                 else _STRING_AGGS if is_struct_dtype(dtype)
                 else _INTERVAL_AGGS if is_interval_dtype(dtype)
                 else _DECIMAL_AGGS if is_decimal_dtype(dtype)
-                else None
+                else "ALL"
             )
             if (valid_aggregations is _DECIMAL_AGGS
-                and rmm._cuda.gpu.runtimeGetVersion() < 11000):
+                    and rmm._cuda.gpu.runtimeGetVersion() < 11000):
                 raise RuntimeError(
                     "Decimal aggregations are only supported on CUDA >= 11 "
                     "due to an nvcc compiler bug."
@@ -141,7 +141,8 @@ cdef class GroupBy:
             )
             for agg in aggs:
                 agg_obj = make_aggregation(agg)
-                if valid_aggregations is None or agg_obj.kind in valid_aggregations:
+                if (valid_aggregations == "ALL"
+                        or agg_obj.kind in valid_aggregations):
                     included_aggregations[col_name].append(agg)
                     c_agg_requests[idx].aggregations.push_back(
                         move(agg_obj.c_obj)

From 0c18a161400adcf997f650be8cb465022d456c5d Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 1 Apr 2021 12:20:29 -0700
Subject: [PATCH 40/49] Remove extra blank line.

---
 python/cudf/cudf/_lib/groupby.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index e76e4333f09..4f9d9a524af 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -29,7 +29,6 @@ cimport cudf._lib.cpp.types as libcudf_types
 cimport cudf._lib.cpp.groupby as libcudf_groupby
 
 
-
 # The sets below define the possible aggregations that can be performed on
 # different dtypes. These strings must be elements of the AggregationKind enum.
 _CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"}

From c9892b18ebacfcdcba6c28b660efebfbcd54d8b9 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 1 Apr 2021 13:57:59 -0700
Subject: [PATCH 41/49] Remove TODO.

---
 python/cudf/cudf/_lib/aggregation.pxd | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/cudf/cudf/_lib/aggregation.pxd b/python/cudf/cudf/_lib/aggregation.pxd
index 13d04575700..972f95d5aab 100644
--- a/python/cudf/cudf/_lib/aggregation.pxd
+++ b/python/cudf/cudf/_lib/aggregation.pxd
@@ -7,6 +7,4 @@ from cudf._lib.cpp.aggregation cimport aggregation
 cdef class Aggregation:
     cdef unique_ptr[aggregation] c_obj
 
-# TODO: the old version of this function was declared as except *, I'm not sure
-# why but I should figure that out before calling this done.
 cdef Aggregation make_aggregation(op, kwargs=*)

From 61e84b25ac19eb3c2ad0029aae44bbc54a5e6ad1 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 1 Apr 2021 14:05:28 -0700
Subject: [PATCH 42/49] Apply black formatting.

---
 python/cudf/cudf/core/groupby/groupby.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index dd462427ff3..ee7dd7c5335 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -577,6 +577,7 @@ def count(self, dropna=True):
         dropna : bool
             If ``True``, don't include null values in the count.
         """
+
         def func(x):
             return getattr(x, "count")(dropna=dropna)
 
@@ -619,6 +620,7 @@ def var(self, ddof=1):
             The delta degrees of freedom. N - ddof is the divisor used to
             normalize the variance.
         """
+
         def func(x):
             return getattr(x, "var")(ddof=ddof)
 
@@ -633,6 +635,7 @@ def std(self, ddof=1):
             The delta degrees of freedom. N - ddof is the divisor used to
             normalize the standard deviation.
         """
+
         def func(x):
             return getattr(x, "std")(ddof=ddof)
 
@@ -649,6 +652,7 @@ def quantile(self, q=0.5, interpolation="linear"):
             The interpolation method to use when the desired quantile lies
             between two data points. Defaults to "linear".
        """
+
         def func(x):
             return getattr(x, "quantile")(q=q, interpolation=interpolation)
 

From 67c6ebcd2970cb1e4284fe9b208641e2d4770f3e Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 12 Apr 2021 10:11:39 -0700
Subject: [PATCH 43/49] Remove use of index and just use std::vector::back.

---
 python/cudf/cudf/_lib/groupby.pyx | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 4f9d9a524af..2f4f6e89aff 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -114,7 +114,6 @@ cdef class GroupBy:
         allow_empty = all(len(v) == 0 for v in aggregations.values())
 
         included_aggregations = defaultdict(list)
-        idx = 0
         for i, (col_name, aggs) in enumerate(aggregations.items()):
             col = values._data[col_name]
             dtype = col.dtype
@@ -143,14 +142,13 @@ cdef class GroupBy:
                 if (valid_aggregations == "ALL"
                         or agg_obj.kind in valid_aggregations):
                     included_aggregations[col_name].append(agg)
-                    c_agg_requests[idx].aggregations.push_back(
+                    c_agg_requests.back().aggregations.push_back(
                         move(agg_obj.c_obj)
                     )
-            if c_agg_requests[idx].aggregations.empty():
+            if c_agg_requests.back().aggregations.empty():
                 c_agg_requests.pop_back()
             else:
-                c_agg_requests[idx].values = col.view()
-                idx += 1
+                c_agg_requests.back().values = col.view()
         if c_agg_requests.empty() and not allow_empty:
             raise DataError("No numeric types to aggregate")
 

From 9cd725e8af4467226dc73b826e9051d64b7162df Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 12 Apr 2021 10:23:11 -0700
Subject: [PATCH 44/49] Update error message.

---
 python/cudf/cudf/_lib/groupby.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 2f4f6e89aff..c055c174636 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -150,7 +150,7 @@ cdef class GroupBy:
             else:
                 c_agg_requests.back().values = col.view()
         if c_agg_requests.empty() and not allow_empty:
-            raise DataError("No numeric types to aggregate")
+            raise DataError("All requested aggregations are unsupported.")
 
         cdef pair[
             unique_ptr[table],

From 9f01b607879df089bd1569a1061547f0405e26ea Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 12 Apr 2021 10:34:50 -0700
Subject: [PATCH 45/49] Create a temporary agg variable rather than popping
 from the vector.

---
 python/cudf/cudf/_lib/groupby.pyx | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index c055c174636..1679256e4d0 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -108,6 +108,7 @@ cdef class GroupBy:
         """
         from cudf.core.column_accessor import ColumnAccessor
         cdef vector[libcudf_groupby.aggregation_request] c_agg_requests
+        cdef libcudf_groupby.aggregation_request c_agg_request
         cdef Column col
         cdef Aggregation agg_obj
 
@@ -134,23 +135,23 @@ cdef class GroupBy:
                     "due to an nvcc compiler bug."
                 )
 
-            c_agg_requests.push_back(
-                move(libcudf_groupby.aggregation_request())
-            )
+            c_agg_request = move(libcudf_groupby.aggregation_request())
             for agg in aggs:
                 agg_obj = make_aggregation(agg)
                 if (valid_aggregations == "ALL"
                         or agg_obj.kind in valid_aggregations):
                     included_aggregations[col_name].append(agg)
-                    c_agg_requests.back().aggregations.push_back(
+                    c_agg_request.aggregations.push_back(
                         move(agg_obj.c_obj)
                     )
-            if c_agg_requests.back().aggregations.empty():
-                c_agg_requests.pop_back()
-            else:
-                c_agg_requests.back().values = col.view()
+            if not c_agg_request.aggregations.empty():
+                c_agg_request.values = col.view()
+                c_agg_requests.push_back(
+                    move(c_agg_request)
+                )
+
         if c_agg_requests.empty() and not allow_empty:
-            raise DataError("All requested aggregations are unsupported.")
+            raise DataError("No numeric types to aggregate")
 
         cdef pair[
             unique_ptr[table],

From cb3f5a0c26d42e57653146804178dc3d8829f870 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 13 Apr 2021 09:35:30 -0700
Subject: [PATCH 46/49] Update docstring.

---
 python/cudf/cudf/core/groupby/groupby.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index ee7dd7c5335..989fded48f7 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -570,7 +570,7 @@ def rolling(self, *args, **kwargs):
         return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs)
 
     def count(self, dropna=True):
-        """Compute the sizes of each column.
+        """Compute the number of values in each column.
 
         Parameters
         ----------
@@ -584,7 +584,7 @@ def func(x):
         return self.agg(func)
 
     def sum(self):
-        """Compute the sum of each column."""
+        """Compute the sum of the values in each column."""
         return self.agg("sum")
 
     def idxmin(self):
@@ -604,15 +604,15 @@ def max(self):
         return self.agg("max")
 
     def mean(self):
-        """Compute the mean value in each column."""
+        """Compute the mean of the values in each column."""
         return self.agg("mean")
 
     def median(self):
-        """Compute the median value in each column."""
+        """Compute the median of the values in each column."""
         return self.agg("median")
 
     def var(self, ddof=1):
-        """Compute the variance of the data in each column.
+        """Compute the variance of the values in each column.
 
         Parameters
         ----------
@@ -627,7 +627,7 @@ def func(x):
         return self.agg(func)
 
     def std(self, ddof=1):
-        """Compute the standard deviation of the data in each column.
+        """Compute the standard deviation of the values in each column.
 
         Parameters
         ----------
@@ -642,7 +642,7 @@ def func(x):
         return self.agg(func)
 
     def quantile(self, q=0.5, interpolation="linear"):
-        """Compute the quantiles of each column.
+        """Compute the quantiles of the values in each column.
 
         Parameters
         ----------
@@ -659,15 +659,15 @@ def func(x):
         return self.agg(func)
 
     def nunique(self):
-        """Compute the number of unique elements in each column."""
+        """Compute the number of unique values in each column."""
         return self.agg("nunique")
 
     def collect(self):
-        """Get a list of the items in each column."""
+        """Get a list of all the values in each column."""
         return self.agg("collect")
 
     def unique(self):
-        """Compute the unique elements in each column."""
+        """Get a list of the unique values in each column."""
         return self.agg("unique")
 
 

From e96a7958018f0ca045fe8018f99716d657fcd685 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 13 Apr 2021 09:41:07 -0700
Subject: [PATCH 47/49] Update error message.

---
 python/cudf/cudf/_lib/groupby.pyx      | 2 +-
 python/cudf/cudf/tests/test_groupby.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 1679256e4d0..3c2b541f728 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -151,7 +151,7 @@ cdef class GroupBy:
                 )
 
         if c_agg_requests.empty() and not allow_empty:
-            raise DataError("No numeric types to aggregate")
+            raise DataError("All requested aggregations are unsupported.")
 
         cdef pair[
             unique_ptr[table],
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 4dbe608af82..868387b100e 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -1236,7 +1236,11 @@ def test_raise_data_error():
     pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
     gdf = cudf.from_pandas(pdf)
 
-    assert_exceptions_equal(pdf.groupby("a").mean, gdf.groupby("a").mean)
+    assert_exceptions_equal(
+        pdf.groupby("a").mean,
+        gdf.groupby("a").mean,
+        compare_error_message=False,
+    )
 
 
 def test_drop_unsupported_multi_agg():

From f52fffb98ab9cc8b510285ee910c5e8e306fe6d3 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 14 Apr 2021 12:23:12 -0700
Subject: [PATCH 48/49] Update docstrings.

---
 python/cudf/cudf/core/groupby/groupby.py | 26 ++++++++++++------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 989fded48f7..a52fae994e7 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -584,35 +584,35 @@ def func(x):
         return self.agg(func)
 
     def sum(self):
-        """Compute the sum of the values in each column."""
+        """Compute the column-wise sum of the values in each group."""
         return self.agg("sum")
 
     def idxmin(self):
-        """Compute the index of the minimum value in each column."""
+        """Get the column-wise index of the minimum value in each group."""
         return self.agg("idxmin")
 
     def idxmax(self):
-        """Compute the index of the maximum value in each column."""
+        """Get the column-wise index of the maximum value in each group."""
         return self.agg("idxmax")
 
     def min(self):
-        """Compute the minimum value in each column."""
+        """Get the column-wise minimum value in each group."""
         return self.agg("min")
 
     def max(self):
-        """Compute the maximum value in each column."""
+        """Get the column-wise maximum value in each group."""
         return self.agg("max")
 
     def mean(self):
-        """Compute the mean of the values in each column."""
+        """Compute the column-wise mean of the values in each group."""
         return self.agg("mean")
 
     def median(self):
-        """Compute the median of the values in each column."""
+        """Get the column-wise median of the values in each group."""
         return self.agg("median")
 
     def var(self, ddof=1):
-        """Compute the variance of the values in each column.
+        """Compute the column-wise variance of the values in each group.
 
         Parameters
         ----------
@@ -627,7 +627,7 @@ def func(x):
         return self.agg(func)
 
     def std(self, ddof=1):
-        """Compute the standard deviation of the values in each column.
+        """Compute the column-wise std of the values in each group.
 
         Parameters
         ----------
@@ -642,7 +642,7 @@ def func(x):
         return self.agg(func)
 
     def quantile(self, q=0.5, interpolation="linear"):
-        """Compute the quantiles of the values in each column.
+        """Compute the column-wise quantiles of the values in each group.
 
         Parameters
         ----------
@@ -659,15 +659,15 @@ def func(x):
         return self.agg(func)
 
     def nunique(self):
-        """Compute the number of unique values in each column."""
+        """Compute the number of unique values in each column in each group."""
         return self.agg("nunique")
 
     def collect(self):
-        """Get a list of all the values in each column."""
+        """Get a list of all the values for each column in each group."""
         return self.agg("collect")
 
     def unique(self):
-        """Get a list of the unique values in each column."""
+        """Get a list of the unique values for each column in each group."""
         return self.agg("unique")
 
 

From affc9c95b07429d3cac293e6ddf63432791e2ae3 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 14 Apr 2021 16:40:38 -0500
Subject: [PATCH 49/49] Update python/cudf/cudf/_lib/aggregation.pyx

---
 python/cudf/cudf/_lib/aggregation.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index f512cddf8f0..682d8cbf329 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -289,5 +289,5 @@ cdef Aggregation make_aggregation(op, kwargs=None):
         else:
             agg = op(Aggregation)
     else:
-        raise TypeError("Unknown aggregation {}".format(op))
+        raise TypeError(f"Unknown aggregation {op}")
     return agg