From 26bafd02b7f37a3c20eeaa1fdc8d59e13015a57f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 24 Mar 2021 12:29:57 -0700 Subject: [PATCH 01/49] Don't identify decimals as strings. --- python/cudf/cudf/utils/dtypes.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 8875a36dba8..67b54ef33f0 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -154,7 +154,11 @@ def is_numerical_dtype(obj): def is_string_dtype(obj): - return pd.api.types.is_string_dtype(obj) and not is_categorical_dtype(obj) + return ( + not is_decimal_dtype(obj) + and pd.api.types.is_string_dtype(obj) + and not is_categorical_dtype(obj) + ) def is_datetime_dtype(obj): From babcdfcfa9598eb4b20d458eb0d48a3ee035eedb Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 24 Mar 2021 17:10:53 -0700 Subject: [PATCH 02/49] Reject all extension types as string types. --- python/cudf/cudf/utils/dtypes.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 67b54ef33f0..8af225ecb58 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -155,9 +155,13 @@ def is_numerical_dtype(obj): def is_string_dtype(obj): return ( - not is_decimal_dtype(obj) - and pd.api.types.is_string_dtype(obj) + pd.api.types.is_string_dtype(obj) + # Reject all cudf extension types. and not is_categorical_dtype(obj) + and not is_decimal_dtype(obj) + and not is_list_dtype(obj) + and not is_struct_dtype(obj) + and not is_interval_dtype(obj) ) From 2b0061146238557005aa4bc2f6496dc1038fa7c3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 24 Mar 2021 18:33:34 -0700 Subject: [PATCH 03/49] Create separate lists for extension type methods. --- python/cudf/cudf/_lib/groupby.pyx | 65 ++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 0f5cdc73d3b..b47430770ea 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -51,13 +51,52 @@ _STRING_AGGS = { "min", "nunique", "nth", - "collect" + "collect", + "variance" } _LIST_AGGS = { "collect" } +# TODO: For now, the aggs for all of these extension types are the same as strings until we can find a way to fix that. +_STRUCT_AGGS = { + "count", + "size", + "max", + "min", + "nunique", + "nth", + "collect" +} + +_INTERVAL_AGGS = { + "count", + "size", + "max", + "min", + "nunique", + "nth", + "collect" +} + +_DECIMAL_AGGS = { + "count", + "sum", + "idxmin", # Silently fails. + "idxmax", # Silently fails. + "min", + "max", + "mean", # Errors internally in C++ + "var", # Silently fails. + "std", + "quantile", # Errors internally in C++ + "median", # Errors internally in C++ + "nunique", + "nth", + "collect" # Errors internally in Python +} + cdef class GroupBy: cdef unique_ptr[libcudf_groupby.groupby] c_obj cdef dict __dict__ @@ -182,10 +221,14 @@ def _drop_unsupported_aggs(Table values, aggs): from cudf.utils.dtypes import ( is_categorical_dtype, is_string_dtype, - is_list_dtype + is_list_dtype, + is_interval_dtype, + is_struct_dtype, + is_decimal_dtype, ) result = aggs.copy() + # and not is_decimal_dtype(obj) for col_name in aggs: if ( is_list_dtype(values._data[col_name].dtype) @@ -205,6 +248,24 @@ def _drop_unsupported_aggs(Table values, aggs): for i, agg_name in enumerate(aggs[col_name]): if Aggregation(agg_name).kind not in _CATEGORICAL_AGGS: del result[col_name][i] + elif ( + is_struct_dtype(values._data[col_name].dtype) + ): + for i, agg_name in enumerate(aggs[col_name]): + if Aggregation(agg_name).kind not in _STRUCT_AGGS: + del result[col_name][i] + elif ( + is_interval_dtype(values._data[col_name].dtype) + ): + for i, agg_name in enumerate(aggs[col_name]): + if Aggregation(agg_name).kind not in _INTERVAL_AGGS: + del result[col_name][i] + elif ( + is_decimal_dtype(values._data[col_name].dtype) + ): + for i, agg_name in enumerate(aggs[col_name]): + if Aggregation(agg_name).kind not in _DECIMAL_AGGS: + del result[col_name][i] if all(len(v) == 0 for v in result.values()): raise DataError("No numeric types to aggregate") From 1ebde51ccc666777d7eb3b71ead6ff0ff372296d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 09:48:42 -0700 Subject: [PATCH 04/49] Enable collect for decimals. --- python/cudf/cudf/_lib/groupby.pyx | 2 +- python/cudf/cudf/utils/dtypes.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index b47430770ea..739ab19993a 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -94,7 +94,7 @@ _DECIMAL_AGGS = { "median", # Errors internally in C++ "nunique", "nth", - "collect" # Errors internally in Python + "collect" } cdef class GroupBy: diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 8af225ecb58..e1d9b71daf0 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -294,9 +294,8 @@ def cudf_dtype_to_pa_type(dtype): """ if is_categorical_dtype(dtype): raise NotImplementedError() - elif is_list_dtype(dtype): - return dtype.to_arrow() - elif is_struct_dtype(dtype): + elif (is_list_dtype(dtype) or is_struct_dtype(dtype) + or is_decimal_dtype(dtype)): return dtype.to_arrow() else: return np_to_pa_dtype(np.dtype(dtype)) From 4c5d87604d6bbc09b1b737e8aaea47a270e0b102 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 10:23:54 -0700 Subject: [PATCH 05/49] Enable argmin and argmax. --- python/cudf/cudf/_lib/groupby.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 739ab19993a..28e051d1234 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -83,13 +83,13 @@ _INTERVAL_AGGS = { _DECIMAL_AGGS = { "count", "sum", - "idxmin", # Silently fails. - "idxmax", # Silently fails. + "argmin", + "argmax", "min", "max", "mean", # Errors internally in C++ - "var", # Silently fails. - "std", + "var", # Column silently vanishes. + "std", # Gives all nulls. "quantile", # Errors internally in C++ "median", # Errors internally in C++ "nunique", From 4134e43f7531e30f404142e34653d938e6fe5b8f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 10:26:58 -0700 Subject: [PATCH 06/49] Fix variance key name. --- python/cudf/cudf/_lib/groupby.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 28e051d1234..319259177d5 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -88,7 +88,7 @@ _DECIMAL_AGGS = { "min", "max", "mean", # Errors internally in C++ - "var", # Column silently vanishes. + "variance", # Gives all nulls. "std", # Gives all nulls. "quantile", # Errors internally in C++ "median", # Errors internally in C++ From 43cf580dda3a614356f4d53a319295e7254583e3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 10:40:12 -0700 Subject: [PATCH 07/49] Move groupby aggregation list to groupby.py and clean up the assignment of aggregators. --- python/cudf/cudf/_lib/groupby.pyx | 18 ------------ python/cudf/cudf/core/groupby/groupby.py | 36 ++++++++++++++++++------ 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 319259177d5..5df01dbe6e3 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -20,24 +20,6 @@ cimport cudf._lib.cpp.groupby as libcudf_groupby cimport cudf._lib.cpp.aggregation as libcudf_aggregation -_GROUPBY_AGGS = { - "count", - "size", - "sum", - "idxmin", - "idxmax", - "min", - "max", - "mean", - "var", - "std", - "quantile", - "median", - "nunique", - "nth", - "collect" -} - _CATEGORICAL_AGGS = { "count", "size", diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 86e1f5cfe30..bde946f2e1d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -13,6 +13,8 @@ from cudf.utils.utils import cached_property +# Note that all valid aggregation methods (e.g. GroupBy.min) are bound to the +# class after its definition (see below). class GroupBy(Serializable): _MAX_GROUPS_BEFORE_WARN = 100 @@ -58,14 +60,6 @@ def __init__( else: self.grouping = _Grouping(obj, by, level) - def __getattribute__(self, key): - try: - return super().__getattribute__(key) - except AttributeError: - if key in libgroupby._GROUPBY_AGGS: - return functools.partial(self._agg_func_name_with_args, key) - raise - def __iter__(self): group_names, offsets, _, grouped_values = self._grouped() if isinstance(group_names, cudf.Index): @@ -590,6 +584,32 @@ def rolling(self, *args, **kwargs): return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs) +# Set of valid groupby aggregations. +_VALID_GROUPBY_AGGS = { + "count", + "size", + "sum", + "idxmin", + "idxmax", + "min", + "max", + "mean", + "var", + "std", + "quantile", + "median", + "nunique", + "nth", + "collect" +} + + +# Dynamically bind the different aggregation methods. +for key in _VALID_GROUPBY_AGGS: + setattr(GroupBy, key, + functools.partialmethod(GroupBy._agg_func_name_with_args, key)) + + class DataFrameGroupBy(GroupBy): def __init__( self, obj, by=None, level=None, sort=False, as_index=True, dropna=True From 474a179e8b64f80f8c1789cd3a67eb97028a58fb Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 12:21:41 -0700 Subject: [PATCH 08/49] Disable aggs that are overrides of actual methods. --- python/cudf/cudf/core/groupby/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index bde946f2e1d..fbab73c183b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -587,7 +587,7 @@ def rolling(self, *args, **kwargs): # Set of valid groupby aggregations. _VALID_GROUPBY_AGGS = { "count", - "size", + # "size", # This aggregation will never happen because GroupBy.Size exists "sum", "idxmin", "idxmax", @@ -599,7 +599,7 @@ def rolling(self, *args, **kwargs): "quantile", "median", "nunique", - "nth", + # "nth", # This aggregation will never happen because GroupBy.Size exists "collect" } From 25e74ef08ec68a7064be4ce12530a949c579d419 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 12:34:55 -0700 Subject: [PATCH 09/49] Move more logic out of the GroupBy class. --- python/cudf/cudf/_lib/groupby.pyx | 3 +++ python/cudf/cudf/core/groupby/groupby.py | 34 ++++++++++++------------ 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 5df01dbe6e3..94f9ff9ca6b 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -20,6 +20,9 @@ cimport cudf._lib.cpp.groupby as libcudf_groupby cimport cudf._lib.cpp.aggregation as libcudf_aggregation +# The sets below define the possible aggregations that can be performed on +# different dtypes. The uppercased versions of these strings correspond to +# elements of the AggregationKind enum. _CATEGORICAL_AGGS = { "count", "size", diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index fbab73c183b..9cc6e098322 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -261,19 +261,6 @@ def _grouped(self): group_names = grouped_keys.unique() return (group_names, offsets, grouped_keys, grouped_values) - def _agg_func_name_with_args(self, func_name, *args, **kwargs): - """ - Aggregate given an aggregate function name - and arguments to the function, e.g., - `_agg_func_name_with_args("quantile", 0.5)` - """ - - def func(x): - return getattr(x, func_name)(*args, **kwargs) - - func.__name__ = func_name - return self.agg(func) - def _normalize_aggs(self, aggs): """ Normalize aggs to a dict mapping column names @@ -584,10 +571,10 @@ def rolling(self, *args, **kwargs): return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs) -# Set of valid groupby aggregations. +# Set of valid groupby aggregations that are monkey-patched into the GroupBy +# namespace. _VALID_GROUPBY_AGGS = { "count", - # "size", # This aggregation will never happen because GroupBy.Size exists "sum", "idxmin", "idxmax", @@ -599,15 +586,28 @@ def rolling(self, *args, **kwargs): "quantile", "median", "nunique", - # "nth", # This aggregation will never happen because GroupBy.Size exists "collect" } # Dynamically bind the different aggregation methods. for key in _VALID_GROUPBY_AGGS: + def _agg_func_name_with_args(self, func_name, *args, **kwargs): + """ + Aggregate given an aggregate function name and arguments to the + function, e.g., `_agg_func_name_with_args("quantile", 0.5)`. The named + aggregations must be members of _AggregationFactory. + """ + + def func(x): + """Compute the {} of the group.""".format(func_name) + return getattr(x, func_name)(*args, **kwargs) + + func.__name__ = func_name + return self.agg(func) + setattr(GroupBy, key, - functools.partialmethod(GroupBy._agg_func_name_with_args, key)) + functools.partialmethod(_agg_func_name_with_args, key)) class DataFrameGroupBy(GroupBy): From 8a4482736a0e368b842f872b450b3be4410c4d89 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 12:54:20 -0700 Subject: [PATCH 10/49] Simplify getattr usage. --- python/cudf/cudf/_lib/groupby.pyx | 3 ++- python/cudf/cudf/core/groupby/groupby.py | 14 +++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 94f9ff9ca6b..6ffc34ca345 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -44,7 +44,8 @@ _LIST_AGGS = { "collect" } -# TODO: For now, the aggs for all of these extension types are the same as strings until we can find a way to fix that. +# TODO: For now, the aggs for struct and interval types are the same as strings. +# Need to evaluate what's actually reasonable. _STRUCT_AGGS = { "count", "size", diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 9cc6e098322..76274dcd545 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -705,15 +705,11 @@ def __init__( dropna=dropna, ) - def __getattribute__(self, key): - try: - return super().__getattribute__(key) - except AttributeError: - if key in self.obj: - return self.obj[key].groupby( - self.grouping, dropna=self._dropna, sort=self._sort - ) - raise + def __getattr__(self, key): + if key in self.obj: + return self.obj[key].groupby( + self.grouping, dropna=self._dropna, sort=self._sort + ) def __getitem__(self, key): return self.obj[key].groupby( From 6b5c67fbbd8fde3f0e8120b2687f1ec521c0904d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 13:47:13 -0700 Subject: [PATCH 11/49] Clearly documented unknown failures. --- python/cudf/cudf/_lib/groupby.pyx | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 6ffc34ca345..ccd734cef91 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -73,9 +73,20 @@ _DECIMAL_AGGS = { "argmax", "min", "max", - "mean", # Errors internally in C++ + # TODO: As far as I can tell, variance and std are using the reduction operations + # defined in reductions/compound.cuh. reductions/var.cu defines it as using the + # element_type_dispatcher, which checks validity using std::is_arithmetic. I assume + # that this is intentional, in which case var and std are not supported. However, + # instead of raising any errors, I just get columns of NULLs if I try to actually + # compute the variance of a GroupBy object containing decimals, so something else + # might be wrong as well. "variance", # Gives all nulls. "std", # Gives all nulls. + # TODO: Need to resolve the three below. They fail with a cudf error, but the error + # indicates that the column type is wrong (it wants a fixed point column but isn't + # receiving one). I suspect that a special-cased conversion for decimal types is + # missing somewhere. + "mean", # Errors internally in C++ "quantile", # Errors internally in C++ "median", # Errors internally in C++ "nunique", From 8e45ad00d3f899bac149ff84c0d9199818c0dac5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 13:47:48 -0700 Subject: [PATCH 12/49] Match other class groupbys to strings. --- python/cudf/cudf/_lib/groupby.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index ccd734cef91..fd8f0f61f54 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -54,6 +54,7 @@ _STRUCT_AGGS = { "nunique", "nth", "collect" + "variance" } _INTERVAL_AGGS = { @@ -64,6 +65,7 @@ _INTERVAL_AGGS = { "nunique", "nth", "collect" + "variance" } _DECIMAL_AGGS = { From 81ffe0af4538daf9ad0918fdfb1911273a900e1e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 14:56:09 -0700 Subject: [PATCH 13/49] Fix style and remove unsupported operations. --- python/cudf/cudf/_lib/groupby.pyx | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index fd8f0f61f54..cc6706e667b 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -44,8 +44,8 @@ _LIST_AGGS = { "collect" } -# TODO: For now, the aggs for struct and interval types are the same as strings. -# Need to evaluate what's actually reasonable. +# TODO: For now, the aggs for struct and interval types are the same as +# strings. Need to evaluate what's actually reasonable. _STRUCT_AGGS = { "count", "size", @@ -75,22 +75,6 @@ _DECIMAL_AGGS = { "argmax", "min", "max", - # TODO: As far as I can tell, variance and std are using the reduction operations - # defined in reductions/compound.cuh. reductions/var.cu defines it as using the - # element_type_dispatcher, which checks validity using std::is_arithmetic. I assume - # that this is intentional, in which case var and std are not supported. However, - # instead of raising any errors, I just get columns of NULLs if I try to actually - # compute the variance of a GroupBy object containing decimals, so something else - # might be wrong as well. - "variance", # Gives all nulls. - "std", # Gives all nulls. - # TODO: Need to resolve the three below. They fail with a cudf error, but the error - # indicates that the column type is wrong (it wants a fixed point column but isn't - # receiving one). I suspect that a special-cased conversion for decimal types is - # missing somewhere. - "mean", # Errors internally in C++ - "quantile", # Errors internally in C++ - "median", # Errors internally in C++ "nunique", "nth", "collect" @@ -227,7 +211,6 @@ def _drop_unsupported_aggs(Table values, aggs): ) result = aggs.copy() - # and not is_decimal_dtype(obj) for col_name in aggs: if ( is_list_dtype(values._data[col_name].dtype) From 6d3fad376bf13bcf17be28a3df778af4ba97829d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 15:00:41 -0700 Subject: [PATCH 14/49] Apply black reformattings. --- python/cudf/cudf/core/groupby/groupby.py | 8 +++++--- python/cudf/cudf/utils/dtypes.py | 7 +++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 76274dcd545..3f520c60d4f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -586,12 +586,13 @@ def rolling(self, *args, **kwargs): "quantile", "median", "nunique", - "collect" + "collect", } # Dynamically bind the different aggregation methods. for key in _VALID_GROUPBY_AGGS: + def _agg_func_name_with_args(self, func_name, *args, **kwargs): """ Aggregate given an aggregate function name and arguments to the @@ -606,8 +607,9 @@ def func(x): func.__name__ = func_name return self.agg(func) - setattr(GroupBy, key, - functools.partialmethod(_agg_func_name_with_args, key)) + setattr( + GroupBy, key, functools.partialmethod(_agg_func_name_with_args, key) + ) class DataFrameGroupBy(GroupBy): diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index e1d9b71daf0..b5fecc5cd3c 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -294,8 +294,11 @@ def cudf_dtype_to_pa_type(dtype): """ if is_categorical_dtype(dtype): raise NotImplementedError() - elif (is_list_dtype(dtype) or is_struct_dtype(dtype) - or is_decimal_dtype(dtype)): + elif ( + is_list_dtype(dtype) + or is_struct_dtype(dtype) + or is_decimal_dtype(dtype) + ): return dtype.to_arrow() else: return np_to_pa_dtype(np.dtype(dtype)) From 714742d433a63f51f2485a3fbda9e356bbc075b0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 15:09:26 -0700 Subject: [PATCH 15/49] Remove variance from obviously unsupported types. --- python/cudf/cudf/_lib/groupby.pyx | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index cc6706e667b..fe14a6ac4ac 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -37,7 +37,6 @@ _STRING_AGGS = { "nunique", "nth", "collect", - "variance" } _LIST_AGGS = { @@ -54,7 +53,6 @@ _STRUCT_AGGS = { "nunique", "nth", "collect" - "variance" } _INTERVAL_AGGS = { @@ -65,7 +63,6 @@ _INTERVAL_AGGS = { "nunique", "nth", "collect" - "variance" } _DECIMAL_AGGS = { From ea4ed2e0b0e7cf768201817b9812044673586042 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 15:19:14 -0700 Subject: [PATCH 16/49] Defer getattr to getitem if possible. --- python/cudf/cudf/core/groupby/groupby.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 3f520c60d4f..b6260ebf643 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -708,10 +708,10 @@ def __init__( ) def __getattr__(self, key): - if key in self.obj: - return self.obj[key].groupby( - self.grouping, dropna=self._dropna, sort=self._sort - ) + try: + return self[key] + except KeyError: + raise AttributeError def __getitem__(self, key): return self.obj[key].groupby( From 026bb4e16ce97cbd3bc97b854e65582cc9b24f5a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 16:01:59 -0700 Subject: [PATCH 17/49] Make getattr safe for copying. --- python/cudf/cudf/core/groupby/groupby.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b6260ebf643..34064193506 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -708,6 +708,11 @@ def __init__( ) def __getattr__(self, key): + # Without this check, copying can trigger a RecursionError. See + # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html #noqa: E501 + # for an explanation. + if key == "obj": + raise AttributeError try: return self[key] except KeyError: From 1259032e376710f40a81f702914fabb0c9ceeef6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 16:07:48 -0700 Subject: [PATCH 18/49] Remove support for aggregating structs. --- python/cudf/cudf/_lib/groupby.pyx | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index fe14a6ac4ac..4f3d3599eef 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -43,26 +43,10 @@ _LIST_AGGS = { "collect" } -# TODO: For now, the aggs for struct and interval types are the same as -# strings. Need to evaluate what's actually reasonable. _STRUCT_AGGS = { - "count", - "size", - "max", - "min", - "nunique", - "nth", - "collect" } _INTERVAL_AGGS = { - "count", - "size", - "max", - "min", - "nunique", - "nth", - "collect" } _DECIMAL_AGGS = { From 6c61806fb22a96b3f283ec1a4b1ddb84a9ce72ca Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 16:10:45 -0700 Subject: [PATCH 19/49] Update documented list of groupby operations. --- docs/cudf/source/groupby.md | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/cudf/source/groupby.md b/docs/cudf/source/groupby.md index 7e96d4fe38c..a74e6285296 100644 --- a/docs/cudf/source/groupby.md +++ b/docs/cudf/source/groupby.md @@ -120,23 +120,23 @@ a The following table summarizes the available aggregations and the types that support them: -| Aggregations\dtypes | Numeric | Datetime | String | Categorical | List | Struct | -| ------------------- | -------- | ------- | -------- | ----------- | ---- | ------ | -| count | ✅ | ✅ | ✅ | ✅ | | | -| size | ✅ | ✅ | ✅ | ✅ | | | -| sum | ✅ | ✅ | | | | | -| idxmin | ✅ | ✅ | | | | | -| idxmax | ✅ | ✅ | | | | | -| min | ✅ | ✅ | ✅ | | | | -| max | ✅ | ✅ | ✅ | | | | -| mean | ✅ | ✅ | | | | | -| var | ✅ | ✅ | | | | | -| std | ✅ | ✅ | | | | | -| quantile | ✅ | ✅ | | | | | -| median | ✅ | ✅ | | | | | -| nunique | ✅ | ✅ | ✅ | ✅ | | | -| nth | ✅ | ✅ | ✅ | | | | -| collect | ✅ | ✅ | ✅ | | ✅ | | +| Aggregations\dtypes | Numeric | Datetime | String | Categorical | List | Struct | Interval | Decimal | +| ------------------- | -------- | ------- | -------- | ----------- | ---- | ------ | -------- | ------- | +| count | ✅ | ✅ | ✅ | ✅ | | | | ✅ | +| size | ✅ | ✅ | ✅ | ✅ | | | | ✅ | +| sum | ✅ | ✅ | | | | | | ✅ | +| idxmin | ✅ | ✅ | | | | | | ✅ | +| idxmax | ✅ | ✅ | | | | | | ✅ | +| min | ✅ | ✅ | ✅ | | | | | ✅ | +| max | ✅ | ✅ | ✅ | | | | | ✅ | +| mean | ✅ | ✅ | | | | | | | +| var | ✅ | ✅ | | | | | | | +| std | ✅ | ✅ | | | | | | | +| quantile | ✅ | ✅ | | | | | | | +| median | ✅ | ✅ | | | | | | | +| nunique | ✅ | ✅ | ✅ | ✅ | | | | ✅ | +| nth | ✅ | ✅ | ✅ | | | | | ✅ | +| collect | ✅ | ✅ | ✅ | | ✅ | | | ✅ | ## GroupBy apply From a14d30f0c13d2849b9fa9fab5a4a64a7309fc185 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Mar 2021 16:51:10 -0700 Subject: [PATCH 20/49] Move function out of loop. --- python/cudf/cudf/core/groupby/groupby.py | 26 ++++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 34064193506..37819b30c45 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -591,22 +591,22 @@ def rolling(self, *args, **kwargs): # Dynamically bind the different aggregation methods. -for key in _VALID_GROUPBY_AGGS: +def _agg_func_name_with_args(self, func_name, *args, **kwargs): + """ + Aggregate given an aggregate function name and arguments to the + function, e.g., `_agg_func_name_with_args("quantile", 0.5)`. The named + aggregations must be members of _AggregationFactory. + """ - def _agg_func_name_with_args(self, func_name, *args, **kwargs): - """ - Aggregate given an aggregate function name and arguments to the - function, e.g., `_agg_func_name_with_args("quantile", 0.5)`. The named - aggregations must be members of _AggregationFactory. - """ + def func(x): + """Compute the {} of the group.""".format(func_name) + return getattr(x, func_name)(*args, **kwargs) - def func(x): - """Compute the {} of the group.""".format(func_name) - return getattr(x, func_name)(*args, **kwargs) + func.__name__ = func_name + return self.agg(func) - func.__name__ = func_name - return self.agg(func) +for key in _VALID_GROUPBY_AGGS: setattr( GroupBy, key, functools.partialmethod(_agg_func_name_with_args, key) ) @@ -709,7 +709,7 @@ def __init__( def __getattr__(self, key): # Without this check, copying can trigger a RecursionError. See - # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html #noqa: E501 + # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html # noqa: E501 # for an explanation. if key == "obj": raise AttributeError From 5c71bfee04bfcdd693abc424d6bf0aba9df05f77 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 26 Mar 2021 17:26:55 -0700 Subject: [PATCH 21/49] Remove redundant test, add test of decimal. --- python/cudf/cudf/tests/test_groupby.py | 64 ++++++++++++++------------ 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index a96db59dee3..e53d1b4f2ec 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -7,7 +7,7 @@ import pandas as pd import pytest from numba import cuda -from numpy.testing import assert_array_equal +from numpy.testing import assert_allclose, assert_array_equal import cudf from cudf.core import DataFrame, Series @@ -148,26 +148,6 @@ def test_groupby_agg_min_max_dictlist(nelem): assert_eq(got_df, expect_df) -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -@pytest.mark.parametrize( - "func", ["mean", "min", "max", "idxmin", "idxmax", "count", "sum"] -) -def test_groupby_2keys_agg(nelem, func): - # gdf (Note: lack of multiIndex) - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .agg(func) - ) - got_df = ( - make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .agg(func) - ) - check_dtype = False if func in _index_type_aggs else True - assert_eq(got_df, expect_df, check_dtype=check_dtype) - - @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_as_index_single_agg(pdf, gdf, as_index): gdf = gdf.groupby("y", as_index=as_index, sort=True).agg({"x": "mean"}) @@ -331,28 +311,54 @@ def emulate(df): assert_eq(expect, got) -@pytest.mark.parametrize("nelem", [100, 500]) +@pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000]) @pytest.mark.parametrize( "func", ["mean", "std", "var", "min", "max", "idxmin", "idxmax", "count", "sum"], ) -def test_groupby_cudf_2keys_agg(nelem, func): - got_df = ( - make_frame(DataFrame, nelem=nelem) +def test_groupby_2keys_agg(nelem, func): + # gdf (Note: lack of multiIndex) + expect_df = ( + make_frame(pd.DataFrame, nelem=nelem) .groupby(["x", "y"], sort=True) .agg(func) ) - - # pandas - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem) + got_df = ( + make_frame(DataFrame, nelem=nelem) .groupby(["x", "y"], sort=True) .agg(func) ) + check_dtype = False if func in _index_type_aggs else True assert_eq(got_df, expect_df, check_dtype=check_dtype) +@pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000]) +@pytest.mark.parametrize( + "func", + ["min", "max", "idxmin", "idxmax", "count", "sum"], +) +def test_groupby_agg_decimal(nelem, func): + idx_col = np.arange(nelem) + x = (np.random.rand(nelem) * 100).round(2) + y = (np.random.rand(nelem) * 100).round(2) + pdf = pd.DataFrame({'idx': idx_col, 'x': x, 'y': y}) + gdf = DataFrame({ + 'idx': idx_col, + 'x': cudf.Series(x).astype(cudf.Decimal64Dtype(3, 2)), + 'y': cudf.Series(y).astype(cudf.Decimal64Dtype(3, 2)), + }) + + expect_df = pdf.groupby('idx', sort=True).agg(func) + got_df = gdf.groupby('idx', sort=True).agg(func).astype(float) + + check_dtype = False if func in _index_type_aggs else True + assert_allclose(got_df['x'].to_array(), expect_df['x'], + atol=1e-2, rtol=1e-2) + assert_allclose(got_df['y'].to_array(), expect_df['y'], + atol=1e-2, rtol=1e-2) + + @pytest.mark.parametrize( "agg", ["min", "max", "idxmin", "idxmax", "count", "sum", "mean"] ) From 25811b0465bc570ff00d18770da2ea093a111714 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 26 Mar 2021 17:28:07 -0700 Subject: [PATCH 22/49] Fix formatting. --- python/cudf/cudf/tests/test_groupby.py | 32 ++++++++++++++------------ 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index e53d1b4f2ec..e6547438538 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -335,28 +335,30 @@ def test_groupby_2keys_agg(nelem, func): @pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000]) @pytest.mark.parametrize( - "func", - ["min", "max", "idxmin", "idxmax", "count", "sum"], + "func", ["min", "max", "idxmin", "idxmax", "count", "sum"], ) def test_groupby_agg_decimal(nelem, func): idx_col = np.arange(nelem) x = (np.random.rand(nelem) * 100).round(2) y = (np.random.rand(nelem) * 100).round(2) - pdf = pd.DataFrame({'idx': idx_col, 'x': x, 'y': y}) - gdf = DataFrame({ - 'idx': idx_col, - 'x': cudf.Series(x).astype(cudf.Decimal64Dtype(3, 2)), - 'y': cudf.Series(y).astype(cudf.Decimal64Dtype(3, 2)), - }) + pdf = pd.DataFrame({"idx": idx_col, "x": x, "y": y}) + gdf = DataFrame( + { + "idx": idx_col, + "x": cudf.Series(x).astype(cudf.Decimal64Dtype(3, 2)), + "y": cudf.Series(y).astype(cudf.Decimal64Dtype(3, 2)), + } + ) - expect_df = pdf.groupby('idx', sort=True).agg(func) - got_df = gdf.groupby('idx', sort=True).agg(func).astype(float) + expect_df = pdf.groupby("idx", sort=True).agg(func) + got_df = gdf.groupby("idx", sort=True).agg(func).astype(float) - check_dtype = False if func in _index_type_aggs else True - assert_allclose(got_df['x'].to_array(), expect_df['x'], - atol=1e-2, rtol=1e-2) - assert_allclose(got_df['y'].to_array(), expect_df['y'], - atol=1e-2, rtol=1e-2) + assert_allclose( + got_df["x"].to_array(), expect_df["x"], atol=1e-2, rtol=1e-2 + ) + assert_allclose( + got_df["y"].to_array(), expect_df["y"], atol=1e-2, rtol=1e-2 + ) @pytest.mark.parametrize( From 4a0f27d83c0ed693cd0fe21e4cc291c4e2fdf51b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 28 Mar 2021 08:35:53 -0700 Subject: [PATCH 23/49] Simplify drop aggs logic. --- python/cudf/cudf/_lib/groupby.pyx | 129 +++++++++--------------------- 1 file changed, 39 insertions(+), 90 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 4a949d80129..1024f23e773 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -19,48 +19,29 @@ cimport cudf._lib.cpp.types as libcudf_types cimport cudf._lib.cpp.groupby as libcudf_groupby cimport cudf._lib.cpp.aggregation as libcudf_aggregation +from pandas.core.groupby.groupby import DataError +from cudf.utils.dtypes import ( + is_categorical_dtype, + is_string_dtype, + is_list_dtype, + is_interval_dtype, + is_struct_dtype, + is_decimal_dtype, +) + # The sets below define the possible aggregations that can be performed on # different dtypes. The uppercased versions of these strings correspond to # elements of the AggregationKind enum. -_CATEGORICAL_AGGS = { - "count", - "size", - "nunique", - "unique", -} - -_STRING_AGGS = { - "count", - "size", - "max", - "min", - "nunique", - "nth", - "collect", - "unique", -} - -_LIST_AGGS = { - "collect", -} - -_STRUCT_AGGS = { -} - -_INTERVAL_AGGS = { -} - -_DECIMAL_AGGS = { - "count", - "sum", - "argmin", - "argmax", - "min", - "max", - "nunique", - "nth", - "collect" +_VALID_AGGS_BY_TYPE = { + "CATEGORICAL": {"count", "size", "nunique", "unique"}, + "STRING": {"count", "size", "max", "min", "nunique", "nth", "collect", + "unique"}, + "LIST": {"collect" }, + "STRUCT": set(), + "INTERVAL": set(), + "DECIMAL": {"count", "sum", "argmin", "argmax", "min", "max", "nunique", + "nth", "collect"}, } @@ -187,61 +168,29 @@ cdef class GroupBy: return result -def _drop_unsupported_aggs(Table values, aggs): +def _drop_unsupported_aggregations(Table values, aggregations): """ Drop any aggregations that are not supported. """ - from pandas.core.groupby.groupby import DataError - - if all(len(v) == 0 for v in aggs.values()): - return aggs - - from cudf.utils.dtypes import ( - is_categorical_dtype, - is_string_dtype, - is_list_dtype, - is_interval_dtype, - is_struct_dtype, - is_decimal_dtype, - ) - result = aggs.copy() - - for col_name in aggs: - if ( - is_list_dtype(values._data[col_name].dtype) - ): - for i, agg_name in enumerate(aggs[col_name]): - if Aggregation(agg_name).kind not in _LIST_AGGS: - del result[col_name][i] - elif ( - is_string_dtype(values._data[col_name].dtype) - ): - for i, agg_name in enumerate(aggs[col_name]): - if Aggregation(agg_name).kind not in _STRING_AGGS: - del result[col_name][i] - elif ( - is_categorical_dtype(values._data[col_name].dtype) - ): - for i, agg_name in enumerate(aggs[col_name]): - if Aggregation(agg_name).kind not in _CATEGORICAL_AGGS: - del result[col_name][i] - elif ( - is_struct_dtype(values._data[col_name].dtype) - ): - for i, agg_name in enumerate(aggs[col_name]): - if Aggregation(agg_name).kind not in _STRUCT_AGGS: - del result[col_name][i] - elif ( - is_interval_dtype(values._data[col_name].dtype) - ): - for i, agg_name in enumerate(aggs[col_name]): - if Aggregation(agg_name).kind not in _INTERVAL_AGGS: - del result[col_name][i] - elif ( - is_decimal_dtype(values._data[col_name].dtype) - ): - for i, agg_name in enumerate(aggs[col_name]): - if Aggregation(agg_name).kind not in _DECIMAL_AGGS: + if all(len(v) == 0 for v in aggregations.values()): + return aggregations + + result = aggregations.copy() + + for col_name in aggregations: + valid_aggregations_key = ( + "LIST" if is_list_dtype(values._data[col_name].dtype) + else "STRING" if is_string_dtype(values._data[col_name].dtype) + else "CATEGORICAL" if is_categorical_dtype(values._data[col_name].dtype) + else "STRUCT" if is_struct_dtype(values._data[col_name].dtype) + else "INTERVAL" if is_interval_dtype(values._data[col_name].dtype) + else "DECIMAL" if is_decimal_dtype(values._data[col_name].dtype) + else None + ) + if valid_aggregations_key is not None: + valid_aggregations = _VALID_AGGS_BY_TYPE[valid_aggregations_key] + for i, agg_name in enumerate(aggregations[col_name]): + if Aggregation(agg_name).kind not in valid_aggregations: del result[col_name][i] if all(len(v) == 0 for v in result.values()): From 596496f50b2e585a86adf937330ca3080f5f738b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 28 Mar 2021 09:32:17 -0700 Subject: [PATCH 24/49] Inline drop logic. --- python/cudf/cudf/_lib/groupby.pyx | 75 ++++++++++++++----------------- 1 file changed, 33 insertions(+), 42 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 1024f23e773..d7ad0f43ab6 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -33,16 +33,14 @@ from cudf.utils.dtypes import ( # The sets below define the possible aggregations that can be performed on # different dtypes. The uppercased versions of these strings correspond to # elements of the AggregationKind enum. -_VALID_AGGS_BY_TYPE = { - "CATEGORICAL": {"count", "size", "nunique", "unique"}, - "STRING": {"count", "size", "max", "min", "nunique", "nth", "collect", - "unique"}, - "LIST": {"collect" }, - "STRUCT": set(), - "INTERVAL": set(), - "DECIMAL": {"count", "sum", "argmin", "argmax", "min", "max", "nunique", - "nth", "collect"}, -} +_CATEGORICAL_AGGS = {"count", "size", "nunique", "unique"} +_STRING_AGGS = {"count", "size", "max", "min", "nunique", "nth", "collect", + "unique"} +_LIST_AGGS = {"collect" } +_STRUCT_AGGS = set() +_INTERVAL_AGGS = set() +_DECIMAL_AGGS = {"count", "sum", "argmin", "argmax", "min", "max", "nunique", + "nth", "collect"} cdef class GroupBy: @@ -114,7 +112,31 @@ cdef class GroupBy: cdef vector[libcudf_groupby.aggregation_request] c_agg_requests cdef Column col - aggregations = _drop_unsupported_aggs(values, aggregations) + # Drop unsupported aggregations. + # TODO: Is allowing users to provide empty aggregations something we do + # to support pandas semantics? Whereas we need to throw an error if + # some aggregations are requested when in fact none are possible? + if any(len(v) for v in aggregations.values()): + aggregations = aggregations.copy() + + for col_name in aggregations: + valid_aggregations = ( + _LIST_AGGS if is_list_dtype(values._data[col_name].dtype) + else _STRING_AGGS if is_string_dtype(values._data[col_name].dtype) + else _CATEGORICAL_AGGS if is_categorical_dtype(values._data[col_name].dtype) + else _STRING_AGGS if is_struct_dtype(values._data[col_name].dtype) + else _INTERVAL_AGGS if is_interval_dtype(values._data[col_name].dtype) + else _DECIMAL_AGGS if is_decimal_dtype(values._data[col_name].dtype) + else None + ) + if valid_aggregations is not None: + for i, agg_name in enumerate(list(aggregations[col_name])): + if Aggregation(agg_name).kind not in valid_aggregations: + del aggregations[col_name][i] + + if all(len(v) == 0 for v in aggregations.values()): + raise DataError("No numeric types to aggregate") + for i, (col_name, aggs) in enumerate(aggregations.items()): col = values._data[col_name] @@ -166,34 +188,3 @@ cdef class GroupBy: result = Table(data=result_data, index=grouped_keys) return result - - -def _drop_unsupported_aggregations(Table values, aggregations): - """ - Drop any aggregations that are not supported. - """ - if all(len(v) == 0 for v in aggregations.values()): - return aggregations - - result = aggregations.copy() - - for col_name in aggregations: - valid_aggregations_key = ( - "LIST" if is_list_dtype(values._data[col_name].dtype) - else "STRING" if is_string_dtype(values._data[col_name].dtype) - else "CATEGORICAL" if is_categorical_dtype(values._data[col_name].dtype) - else "STRUCT" if is_struct_dtype(values._data[col_name].dtype) - else "INTERVAL" if is_interval_dtype(values._data[col_name].dtype) - else "DECIMAL" if is_decimal_dtype(values._data[col_name].dtype) - else None - ) - if valid_aggregations_key is not None: - valid_aggregations = _VALID_AGGS_BY_TYPE[valid_aggregations_key] - for i, agg_name in enumerate(aggregations[col_name]): - if Aggregation(agg_name).kind not in valid_aggregations: - del result[col_name][i] - - if all(len(v) == 0 for v in result.values()): - raise DataError("No numeric types to aggregate") - - return result From 1f5daddfba627a21e7f7f16f1f0bac969ed97f67 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 28 Mar 2021 19:31:07 -0700 Subject: [PATCH 25/49] Remove empty aggregations in place rather than beforehand. --- python/cudf/cudf/_lib/groupby.pyx | 59 ++++++++++++++++--------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index d7ad0f43ab6..4aa2b753ffa 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -112,42 +112,45 @@ cdef class GroupBy: cdef vector[libcudf_groupby.aggregation_request] c_agg_requests cdef Column col - # Drop unsupported aggregations. # TODO: Is allowing users to provide empty aggregations something we do # to support pandas semantics? Whereas we need to throw an error if # some aggregations are requested when in fact none are possible? - if any(len(v) for v in aggregations.values()): - aggregations = aggregations.copy() - - for col_name in aggregations: - valid_aggregations = ( - _LIST_AGGS if is_list_dtype(values._data[col_name].dtype) - else _STRING_AGGS if is_string_dtype(values._data[col_name].dtype) - else _CATEGORICAL_AGGS if is_categorical_dtype(values._data[col_name].dtype) - else _STRING_AGGS if is_struct_dtype(values._data[col_name].dtype) - else _INTERVAL_AGGS if is_interval_dtype(values._data[col_name].dtype) - else _DECIMAL_AGGS if is_decimal_dtype(values._data[col_name].dtype) - else None - ) - if valid_aggregations is not None: - for i, agg_name in enumerate(list(aggregations[col_name])): - if Aggregation(agg_name).kind not in valid_aggregations: - del aggregations[col_name][i] - - if all(len(v) == 0 for v in aggregations.values()): - raise DataError("No numeric types to aggregate") - + allow_empty = all(len(v) == 0 for v in aggregations.values()) + empty_aggs = True + included_aggregations = defaultdict(list) + idx = 0 for i, (col_name, aggs) in enumerate(aggregations.items()): col = values._data[col_name] + dtype = col.dtype + + valid_aggregations = ( + _LIST_AGGS if is_list_dtype(dtype) + else _STRING_AGGS if is_string_dtype(dtype) + else _CATEGORICAL_AGGS if is_categorical_dtype(dtype) + else _STRING_AGGS if is_struct_dtype(dtype) + else _INTERVAL_AGGS if is_interval_dtype(dtype) + else _DECIMAL_AGGS if is_decimal_dtype(dtype) + else None + ) + c_agg_requests.push_back( move(libcudf_groupby.aggregation_request()) ) - c_agg_requests[i].values = col.view() + c_agg_requests[idx].values = col.view() for agg in aggs: - c_agg_requests[i].aggregations.push_back( - move(make_aggregation(agg)) - ) + if valid_aggregations is None or Aggregation(agg).kind in valid_aggregations: + empty_aggs = False + included_aggregations[col_name].append(agg) + c_agg_requests[idx].aggregations.push_back( + move(make_aggregation(agg)) + ) + if not c_agg_requests[idx].aggregations.size(): + c_agg_requests.pop_back() + else: + idx += 1 + if empty_aggs and not allow_empty: + raise DataError("No numeric types to aggregate") cdef pair[ unique_ptr[table], @@ -178,8 +181,8 @@ cdef class GroupBy: ) result_data = ColumnAccessor(multiindex=True) - for i, col_name in enumerate(aggregations): - for j, agg_name in enumerate(aggregations[col_name]): + for i, col_name in enumerate(included_aggregations): + for j, agg_name in enumerate(included_aggregations[col_name]): if callable(agg_name): agg_name = agg_name.__name__ result_data[(col_name, agg_name)] = ( From f02231f04cebb760de9747e79098eaea43707d6f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 28 Mar 2021 20:51:25 -0700 Subject: [PATCH 26/49] Reuse aggregation object. --- python/cudf/cudf/_lib/groupby.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 4aa2b753ffa..3400f921391 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -139,11 +139,12 @@ cdef class GroupBy: ) c_agg_requests[idx].values = col.view() for agg in aggs: - if valid_aggregations is None or Aggregation(agg).kind in valid_aggregations: + agg_obj = Aggregation(agg) + if valid_aggregations is None or agg_obj.kind in valid_aggregations: empty_aggs = False included_aggregations[col_name].append(agg) c_agg_requests[idx].aggregations.push_back( - move(make_aggregation(agg)) + move(agg_obj.c_obj) ) if not c_agg_requests[idx].aggregations.size(): c_agg_requests.pop_back() From 0e09786bd99d07fe2d48d085de3cc5c6f6c9f198 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Mar 2021 11:50:04 -0700 Subject: [PATCH 27/49] Write a new make_aggregation2 function that returns the cdef class rather than the underlying C++ class. --- python/cudf/cudf/_lib/aggregation.pxd | 2 ++ python/cudf/cudf/_lib/aggregation.pyx | 35 +++++++++++++++++++++++++++ python/cudf/cudf/_lib/groupby.pyx | 5 ++-- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pxd b/python/cudf/cudf/_lib/aggregation.pxd index bb332c44237..116e6260a57 100644 --- a/python/cudf/cudf/_lib/aggregation.pxd +++ b/python/cudf/cudf/_lib/aggregation.pxd @@ -8,3 +8,5 @@ cdef unique_ptr[aggregation] make_aggregation(op, kwargs=*) except * cdef class Aggregation: cdef unique_ptr[aggregation] c_obj + +cdef Aggregation make_aggregation2(op, kwargs=*) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 7138bb49743..e29e0e92859 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -97,6 +97,41 @@ cdef unique_ptr[aggregation] make_aggregation(op, kwargs={}) except *: raise TypeError("Unknown aggregation {}".format(op)) return move(agg.c_obj) + +cdef Aggregation make_aggregation2(op, kwargs={}): + """ + Parameters + ---------- + op : str or callable + If callable, must meet one of the following requirements: + + * Is of the form lambda x: x.agg(*args, **kwargs), where + `agg` is the name of a supported aggregation. Used to + to specify aggregations that take arguments, e.g., + `lambda x: x.quantile(0.5)`. + * Is a user defined aggregation function that operates on + group values. In this case, the output dtype must be + specified in the `kwargs` dictionary. + + Returns + ------- + unique_ptr[aggregation] + """ + cdef Aggregation agg + if isinstance(op, str): + agg = getattr(_AggregationFactory, op)(**kwargs) + elif callable(op): + if op is list: + agg = _AggregationFactory.collect() + elif "dtype" in kwargs: + agg = _AggregationFactory.from_udf(op, **kwargs) + else: + agg = op(_AggregationFactory) + else: + raise TypeError("Unknown aggregation {}".format(op)) + return agg + + # The Cython pattern below enables us to create an Aggregation # without ever calling its `__init__` method, which would otherwise # result in a RecursionError. diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 3400f921391..2a4580055fc 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -12,7 +12,7 @@ from libcpp cimport bool from cudf._lib.column cimport Column from cudf._lib.table cimport Table -from cudf._lib.aggregation cimport make_aggregation, Aggregation +from cudf._lib.aggregation cimport Aggregation, make_aggregation2 from cudf._lib.cpp.table.table cimport table, table_view cimport cudf._lib.cpp.types as libcudf_types @@ -111,6 +111,7 @@ cdef class GroupBy: from cudf.core.column_accessor import ColumnAccessor cdef vector[libcudf_groupby.aggregation_request] c_agg_requests cdef Column col + cdef Aggregation agg_obj # TODO: Is allowing users to provide empty aggregations something we do # to support pandas semantics? Whereas we need to throw an error if @@ -139,7 +140,7 @@ cdef class GroupBy: ) c_agg_requests[idx].values = col.view() for agg in aggs: - agg_obj = Aggregation(agg) + agg_obj = make_aggregation2(agg) if valid_aggregations is None or agg_obj.kind in valid_aggregations: empty_aggs = False included_aggregations[col_name].append(agg) From 63897c12baf2cbba3aa6bae3f40e857b3b71ca09 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Mar 2021 11:50:56 -0700 Subject: [PATCH 28/49] Remove constructor of Aggregation to avoid recursion issue. --- python/cudf/cudf/_lib/aggregation.pyx | 45 +++++++++++++-------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index e29e0e92859..7f155a7daf5 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -56,9 +56,6 @@ class AggregationKind(Enum): cdef class Aggregation: - def __init__(self, op, **kwargs): - self.c_obj = move(make_aggregation(op, kwargs)) - @property def kind(self): return AggregationKind(self.c_obj.get()[0].kind).name.lower() @@ -139,37 +136,37 @@ cdef class _AggregationFactory: @classmethod def sum(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_sum_aggregation()) return agg @classmethod def min(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_min_aggregation()) return agg @classmethod def max(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_max_aggregation()) return agg @classmethod def idxmin(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_argmin_aggregation()) return agg @classmethod def idxmax(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_argmax_aggregation()) return agg @classmethod def mean(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_mean_aggregation()) return agg @@ -181,7 +178,7 @@ cdef class _AggregationFactory: else: c_null_handling = libcudf_types.null_policy.INCLUDE - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_count_aggregation( c_null_handling )) @@ -189,7 +186,7 @@ cdef class _AggregationFactory: @classmethod def size(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_count_aggregation( ( NullHandling.INCLUDE @@ -199,13 +196,13 @@ cdef class _AggregationFactory: @classmethod def nunique(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_nunique_aggregation()) return agg @classmethod def nth(cls, libcudf_types.size_type size): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move( libcudf_aggregation.make_nth_element_aggregation(size) ) @@ -213,49 +210,49 @@ cdef class _AggregationFactory: @classmethod def any(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_any_aggregation()) return agg @classmethod def all(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_all_aggregation()) return agg @classmethod def product(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_product_aggregation()) return agg @classmethod def sum_of_squares(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_sum_of_squares_aggregation()) return agg @classmethod def var(cls, ddof=1): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_variance_aggregation(ddof)) return agg @classmethod def std(cls, ddof=1): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_std_aggregation(ddof)) return agg @classmethod def median(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_median_aggregation()) return agg @classmethod def quantile(cls, q=0.5, interpolation="linear"): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() if not pd.api.types.is_list_like(q): q = [q] @@ -275,19 +272,19 @@ cdef class _AggregationFactory: @classmethod def collect(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_collect_list_aggregation()) return agg @classmethod def unique(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() agg.c_obj = move(libcudf_aggregation.make_collect_set_aggregation()) return agg @classmethod def from_udf(cls, op, *args, **kwargs): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = Aggregation() cdef libcudf_types.type_id tid cdef libcudf_types.data_type out_dtype From 5fbde6f0d57f9de71e61ff62da49b5632301160a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Mar 2021 11:54:21 -0700 Subject: [PATCH 29/49] Remove _AggregationFactory and just use classmethods of Aggregation. --- python/cudf/cudf/_lib/aggregation.pyx | 142 ++++++++++++-------------- 1 file changed, 68 insertions(+), 74 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 7f155a7daf5..c07e92f9dc6 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -60,80 +60,6 @@ cdef class Aggregation: def kind(self): return AggregationKind(self.c_obj.get()[0].kind).name.lower() - -cdef unique_ptr[aggregation] make_aggregation(op, kwargs={}) except *: - """ - Parameters - ---------- - op : str or callable - If callable, must meet one of the following requirements: - - * Is of the form lambda x: x.agg(*args, **kwargs), where - `agg` is the name of a supported aggregation. Used to - to specify aggregations that take arguments, e.g., - `lambda x: x.quantile(0.5)`. - * Is a user defined aggregation function that operates on - group values. In this case, the output dtype must be - specified in the `kwargs` dictionary. - - Returns - ------- - unique_ptr[aggregation] - """ - cdef Aggregation agg - if isinstance(op, str): - agg = getattr(_AggregationFactory, op)(**kwargs) - elif callable(op): - if op is list: - agg = _AggregationFactory.collect() - elif "dtype" in kwargs: - agg = _AggregationFactory.from_udf(op, **kwargs) - else: - agg = op(_AggregationFactory) - else: - raise TypeError("Unknown aggregation {}".format(op)) - return move(agg.c_obj) - - -cdef Aggregation make_aggregation2(op, kwargs={}): - """ - Parameters - ---------- - op : str or callable - If callable, must meet one of the following requirements: - - * Is of the form lambda x: x.agg(*args, **kwargs), where - `agg` is the name of a supported aggregation. Used to - to specify aggregations that take arguments, e.g., - `lambda x: x.quantile(0.5)`. - * Is a user defined aggregation function that operates on - group values. In this case, the output dtype must be - specified in the `kwargs` dictionary. - - Returns - ------- - unique_ptr[aggregation] - """ - cdef Aggregation agg - if isinstance(op, str): - agg = getattr(_AggregationFactory, op)(**kwargs) - elif callable(op): - if op is list: - agg = _AggregationFactory.collect() - elif "dtype" in kwargs: - agg = _AggregationFactory.from_udf(op, **kwargs) - else: - agg = op(_AggregationFactory) - else: - raise TypeError("Unknown aggregation {}".format(op)) - return agg - - -# The Cython pattern below enables us to create an Aggregation -# without ever calling its `__init__` method, which would otherwise -# result in a RecursionError. -cdef class _AggregationFactory: - @classmethod def sum(cls): cdef Aggregation agg = Aggregation() @@ -314,3 +240,71 @@ cdef class _AggregationFactory: libcudf_aggregation.udf_type.PTX, cpp_str, out_dtype )) return agg + + +cdef unique_ptr[aggregation] make_aggregation(op, kwargs={}) except *: + """ + Parameters + ---------- + op : str or callable + If callable, must meet one of the following requirements: + + * Is of the form lambda x: x.agg(*args, **kwargs), where + `agg` is the name of a supported aggregation. Used to + to specify aggregations that take arguments, e.g., + `lambda x: x.quantile(0.5)`. + * Is a user defined aggregation function that operates on + group values. In this case, the output dtype must be + specified in the `kwargs` dictionary. + + Returns + ------- + unique_ptr[aggregation] + """ + cdef Aggregation agg + if isinstance(op, str): + agg = getattr(Aggregation, op)(**kwargs) + elif callable(op): + if op is list: + agg = Aggregation.collect() + elif "dtype" in kwargs: + agg = Aggregation.from_udf(op, **kwargs) + else: + agg = op(Aggregation) + else: + raise TypeError("Unknown aggregation {}".format(op)) + return move(agg.c_obj) + + +cdef Aggregation make_aggregation2(op, kwargs={}): + """ + Parameters + ---------- + op : str or callable + If callable, must meet one of the following requirements: + + * Is of the form lambda x: x.agg(*args, **kwargs), where + `agg` is the name of a supported aggregation. Used to + to specify aggregations that take arguments, e.g., + `lambda x: x.quantile(0.5)`. + * Is a user defined aggregation function that operates on + group values. In this case, the output dtype must be + specified in the `kwargs` dictionary. + + Returns + ------- + unique_ptr[aggregation] + """ + cdef Aggregation agg + if isinstance(op, str): + agg = getattr(Aggregation, op)(**kwargs) + elif callable(op): + if op is list: + agg = Aggregation.collect() + elif "dtype" in kwargs: + agg = Aggregation.from_udf(op, **kwargs) + else: + agg = op(Aggregation) + else: + raise TypeError("Unknown aggregation {}".format(op)) + return agg From f9508feefa061b32becd111dba3a34b0888fb841 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Mar 2021 12:04:44 -0700 Subject: [PATCH 30/49] Remove old API for aggregations, use new Cython object everywhere. --- python/cudf/cudf/_lib/aggregation.pxd | 6 ++--- python/cudf/cudf/_lib/aggregation.pyx | 36 +-------------------------- python/cudf/cudf/_lib/groupby.pyx | 4 +-- python/cudf/cudf/_lib/reduce.pyx | 12 ++++----- python/cudf/cudf/_lib/rolling.pyx | 11 ++++---- 5 files changed, 17 insertions(+), 52 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pxd b/python/cudf/cudf/_lib/aggregation.pxd index 116e6260a57..13d04575700 100644 --- a/python/cudf/cudf/_lib/aggregation.pxd +++ b/python/cudf/cudf/_lib/aggregation.pxd @@ -4,9 +4,9 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp.aggregation cimport aggregation -cdef unique_ptr[aggregation] make_aggregation(op, kwargs=*) except * - cdef class Aggregation: cdef unique_ptr[aggregation] c_obj -cdef Aggregation make_aggregation2(op, kwargs=*) +# TODO: the old version of this function was declared as except *, I'm not sure +# why but I should figure that out before calling this done. +cdef Aggregation make_aggregation(op, kwargs=*) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index c07e92f9dc6..bc7cc1fbfec 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -242,41 +242,7 @@ cdef class Aggregation: return agg -cdef unique_ptr[aggregation] make_aggregation(op, kwargs={}) except *: - """ - Parameters - ---------- - op : str or callable - If callable, must meet one of the following requirements: - - * Is of the form lambda x: x.agg(*args, **kwargs), where - `agg` is the name of a supported aggregation. Used to - to specify aggregations that take arguments, e.g., - `lambda x: x.quantile(0.5)`. - * Is a user defined aggregation function that operates on - group values. In this case, the output dtype must be - specified in the `kwargs` dictionary. - - Returns - ------- - unique_ptr[aggregation] - """ - cdef Aggregation agg - if isinstance(op, str): - agg = getattr(Aggregation, op)(**kwargs) - elif callable(op): - if op is list: - agg = Aggregation.collect() - elif "dtype" in kwargs: - agg = Aggregation.from_udf(op, **kwargs) - else: - agg = op(Aggregation) - else: - raise TypeError("Unknown aggregation {}".format(op)) - return move(agg.c_obj) - - -cdef Aggregation make_aggregation2(op, kwargs={}): +cdef Aggregation make_aggregation(op, kwargs={}): """ Parameters ---------- diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 2a4580055fc..f5340f23710 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -12,7 +12,7 @@ from libcpp cimport bool from cudf._lib.column cimport Column from cudf._lib.table cimport Table -from cudf._lib.aggregation cimport Aggregation, make_aggregation2 +from cudf._lib.aggregation cimport Aggregation, make_aggregation from cudf._lib.cpp.table.table cimport table, table_view cimport cudf._lib.cpp.types as libcudf_types @@ -140,7 +140,7 @@ cdef class GroupBy: ) c_agg_requests[idx].values = col.view() for agg in aggs: - agg_obj = make_aggregation2(agg) + agg_obj = make_aggregation(agg) if valid_aggregations is None or agg_obj.kind in valid_aggregations: empty_aggs = False included_aggregations[col_name].append(agg) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 2185cb089a7..2cdcaa2d9e0 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -10,7 +10,7 @@ from cudf._lib.scalar cimport DeviceScalar from cudf._lib.column cimport Column from cudf._lib.types import np_to_cudf_types from cudf._lib.types cimport underlying_type_t_type_id -from cudf._lib.aggregation cimport make_aggregation, aggregation +from cudf._lib.aggregation cimport make_aggregation, aggregation, Aggregation from libcpp.memory cimport unique_ptr from libcpp.utility cimport move, pair import numpy as np @@ -38,9 +38,8 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): cdef column_view c_incol_view = incol.view() cdef unique_ptr[scalar] c_result - cdef unique_ptr[aggregation] c_agg = move(make_aggregation( - reduction_op, kwargs - )) + cdef Aggregation cython_agg = make_aggregation(reduction_op, kwargs) + cdef unique_ptr[aggregation] c_agg = move(cython_agg.c_obj) cdef type_id tid = ( ( ( @@ -88,9 +87,8 @@ def scan(scan_op, Column incol, inclusive, **kwargs): """ cdef column_view c_incol_view = incol.view() cdef unique_ptr[column] c_result - cdef unique_ptr[aggregation] c_agg = move( - make_aggregation(scan_op, kwargs) - ) + cdef Aggregation cython_agg = make_aggregation(scan_op, kwargs) + cdef unique_ptr[aggregation] c_agg = move(cython_agg.c_obj) cdef scan_type c_inclusive if inclusive is True: diff --git a/python/cudf/cudf/_lib/rolling.pyx b/python/cudf/cudf/_lib/rolling.pyx index 9c818f39c38..c985f60f52f 100644 --- a/python/cudf/cudf/_lib/rolling.pyx +++ b/python/cudf/cudf/_lib/rolling.pyx @@ -8,7 +8,7 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from cudf._lib.column cimport Column -from cudf._lib.aggregation cimport make_aggregation +from cudf._lib.aggregation cimport Aggregation, make_aggregation from cudf._lib.cpp.types cimport size_type from cudf._lib.cpp.column.column cimport column @@ -48,13 +48,14 @@ def rolling(Column source_column, Column pre_column_window, cdef column_view pre_column_window_view cdef column_view fwd_column_window_view cdef unique_ptr[aggregation] agg + cdef Aggregation cython_agg if callable(op): - agg = move( - make_aggregation(op, {'dtype': source_column.dtype}) - ) + cython_agg = make_aggregation(op, {'dtype': source_column.dtype}) + agg = move(cython_agg.c_obj) else: - agg = move(make_aggregation(op)) + cython_agg = make_aggregation(op) + agg = move(cython_agg.c_obj) if window is None: if center: From 4c04b5f3c22b8d17554e6567cc9c31a7cf4a3302 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Mar 2021 12:09:19 -0700 Subject: [PATCH 31/49] Remove direct usage of the C++ groupby object in most of the Cython code. --- python/cudf/cudf/_lib/groupby.pyx | 1 - python/cudf/cudf/_lib/reduce.pyx | 8 +++----- python/cudf/cudf/_lib/rolling.pyx | 8 ++------ 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index f5340f23710..a4d913d0849 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -17,7 +17,6 @@ from cudf._lib.aggregation cimport Aggregation, make_aggregation from cudf._lib.cpp.table.table cimport table, table_view cimport cudf._lib.cpp.types as libcudf_types cimport cudf._lib.cpp.groupby as libcudf_groupby -cimport cudf._lib.cpp.aggregation as libcudf_aggregation from pandas.core.groupby.groupby import DataError from cudf.utils.dtypes import ( diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 2cdcaa2d9e0..924b2fa8782 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -10,7 +10,7 @@ from cudf._lib.scalar cimport DeviceScalar from cudf._lib.column cimport Column from cudf._lib.types import np_to_cudf_types from cudf._lib.types cimport underlying_type_t_type_id -from cudf._lib.aggregation cimport make_aggregation, aggregation, Aggregation +from cudf._lib.aggregation cimport make_aggregation, Aggregation from libcpp.memory cimport unique_ptr from libcpp.utility cimport move, pair import numpy as np @@ -39,7 +39,6 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): cdef column_view c_incol_view = incol.view() cdef unique_ptr[scalar] c_result cdef Aggregation cython_agg = make_aggregation(reduction_op, kwargs) - cdef unique_ptr[aggregation] c_agg = move(cython_agg.c_obj) cdef type_id tid = ( ( ( @@ -64,7 +63,7 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): with nogil: c_result = move(cpp_reduce( c_incol_view, - c_agg, + cython_agg.c_obj, c_out_dtype )) @@ -88,7 +87,6 @@ def scan(scan_op, Column incol, inclusive, **kwargs): cdef column_view c_incol_view = incol.view() cdef unique_ptr[column] c_result cdef Aggregation cython_agg = make_aggregation(scan_op, kwargs) - cdef unique_ptr[aggregation] c_agg = move(cython_agg.c_obj) cdef scan_type c_inclusive if inclusive is True: @@ -99,7 +97,7 @@ def scan(scan_op, Column incol, inclusive, **kwargs): with nogil: c_result = move(cpp_scan( c_incol_view, - c_agg, + cython_agg.c_obj, c_inclusive )) diff --git a/python/cudf/cudf/_lib/rolling.pyx b/python/cudf/cudf/_lib/rolling.pyx index c985f60f52f..d67fb431ec4 100644 --- a/python/cudf/cudf/_lib/rolling.pyx +++ b/python/cudf/cudf/_lib/rolling.pyx @@ -13,7 +13,6 @@ from cudf._lib.aggregation cimport Aggregation, make_aggregation from cudf._lib.cpp.types cimport size_type from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.aggregation cimport aggregation from cudf._lib.cpp.rolling cimport ( rolling_window as cpp_rolling_window ) @@ -47,15 +46,12 @@ def rolling(Column source_column, Column pre_column_window, cdef column_view source_column_view = source_column.view() cdef column_view pre_column_window_view cdef column_view fwd_column_window_view - cdef unique_ptr[aggregation] agg cdef Aggregation cython_agg if callable(op): cython_agg = make_aggregation(op, {'dtype': source_column.dtype}) - agg = move(cython_agg.c_obj) else: cython_agg = make_aggregation(op) - agg = move(cython_agg.c_obj) if window is None: if center: @@ -72,7 +68,7 @@ def rolling(Column source_column, Column pre_column_window, pre_column_window_view, fwd_column_window_view, c_min_periods, - agg) + cython_agg.c_obj) ) else: c_min_periods = min_periods @@ -90,7 +86,7 @@ def rolling(Column source_column, Column pre_column_window, c_window, c_forward_window, c_min_periods, - agg) + cython_agg.c_obj) ) return Column.from_unique_ptr(move(c_result)) From b21dfafb8b647d998d99a44869df7506049a08ce Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Mar 2021 12:22:15 -0700 Subject: [PATCH 32/49] Add a docstring for the Aggregation class. --- python/cudf/cudf/_lib/aggregation.pyx | 54 ++++++++++++++++----------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index bc7cc1fbfec..e52cb0762ee 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -56,43 +56,55 @@ class AggregationKind(Enum): cdef class Aggregation: + """A Cython wrapper for aggregations. + + **This class should never be instantiated using a standard constructor, + only using one of its many factories.** These factories handle mapping + different cudf operations to their libcudf analogs, e.g. + `cudf.DataFrame.idxmin` -> `libcudf.argmin`. Additionally, they perform + any additional configuration needed to translate Python arguments into + their corresponding C++ types (for instance, C++ enumerations used for + flag arguments). The factory approach is necessary to support operations + like `df.agg(lambda x: x.sum())`; such functions are called with this + class as an argument to generation the desired aggregation. + """ @property def kind(self): return AggregationKind(self.c_obj.get()[0].kind).name.lower() @classmethod def sum(cls): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_sum_aggregation()) return agg @classmethod def min(cls): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_min_aggregation()) return agg @classmethod def max(cls): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_max_aggregation()) return agg @classmethod def idxmin(cls): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_argmin_aggregation()) return agg @classmethod def idxmax(cls): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_argmax_aggregation()) return agg @classmethod def mean(cls): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_mean_aggregation()) return agg @@ -104,7 +116,7 @@ cdef class Aggregation: else: c_null_handling = libcudf_types.null_policy.INCLUDE - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_count_aggregation( c_null_handling )) @@ -112,7 +124,7 @@ cdef class Aggregation: @classmethod def size(cls): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_count_aggregation( ( NullHandling.INCLUDE @@ -122,13 +134,13 @@ cdef class Aggregation: @classmethod def nunique(cls): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_nunique_aggregation()) return agg @classmethod def nth(cls, libcudf_types.size_type size): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move( libcudf_aggregation.make_nth_element_aggregation(size) ) @@ -136,49 +148,49 @@ cdef class Aggregation: @classmethod def any(cls): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_any_aggregation()) return agg @classmethod def all(cls): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_all_aggregation()) return agg @classmethod def product(cls): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_product_aggregation()) return agg @classmethod def sum_of_squares(cls): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_sum_of_squares_aggregation()) return agg @classmethod def var(cls, ddof=1): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_variance_aggregation(ddof)) return agg @classmethod def std(cls, ddof=1): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_std_aggregation(ddof)) return agg @classmethod def median(cls): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_median_aggregation()) return agg @classmethod def quantile(cls, q=0.5, interpolation="linear"): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() if not pd.api.types.is_list_like(q): q = [q] @@ -198,19 +210,19 @@ cdef class Aggregation: @classmethod def collect(cls): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_collect_list_aggregation()) return agg @classmethod def unique(cls): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_collect_set_aggregation()) return agg @classmethod def from_udf(cls, op, *args, **kwargs): - cdef Aggregation agg = Aggregation() + cdef Aggregation agg = cls() cdef libcudf_types.type_id tid cdef libcudf_types.data_type out_dtype From e3ae492d37710e765154224d1f088e6dd8c59571 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Mar 2021 12:24:17 -0700 Subject: [PATCH 33/49] Don't use a dict as a default argument. --- python/cudf/cudf/_lib/aggregation.pyx | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index e52cb0762ee..717c206397c 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -254,8 +254,8 @@ cdef class Aggregation: return agg -cdef Aggregation make_aggregation(op, kwargs={}): - """ +cdef Aggregation make_aggregation(op, kwargs=None): + r""" Parameters ---------- op : str or callable @@ -268,11 +268,16 @@ cdef Aggregation make_aggregation(op, kwargs={}): * Is a user defined aggregation function that operates on group values. In this case, the output dtype must be specified in the `kwargs` dictionary. + \*\*kwargs : dict, optional + Any keyword arguments to be passed to the op. Returns ------- unique_ptr[aggregation] """ + if kwargs is None: + kwargs = {} + cdef Aggregation agg if isinstance(op, str): agg = getattr(Aggregation, op)(**kwargs) From bac0de5558f8aeb05046d8cfdd76dbbc90c53990 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Mar 2021 12:32:23 -0700 Subject: [PATCH 34/49] Use the uppercased versions of the acceptable aggregations. --- python/cudf/cudf/_lib/aggregation.pyx | 2 +- python/cudf/cudf/_lib/groupby.pyx | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 717c206397c..020745422a8 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -70,7 +70,7 @@ cdef class Aggregation: """ @property def kind(self): - return AggregationKind(self.c_obj.get()[0].kind).name.lower() + return AggregationKind(self.c_obj.get()[0].kind).name @classmethod def sum(cls): diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index a4d913d0849..82a8a905b0b 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -30,16 +30,15 @@ from cudf.utils.dtypes import ( # The sets below define the possible aggregations that can be performed on -# different dtypes. The uppercased versions of these strings correspond to -# elements of the AggregationKind enum. -_CATEGORICAL_AGGS = {"count", "size", "nunique", "unique"} -_STRING_AGGS = {"count", "size", "max", "min", "nunique", "nth", "collect", - "unique"} -_LIST_AGGS = {"collect" } +# different dtypes. These strings must be elements of the AggregationKind enum. +_CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"} +_STRING_AGGS = {"COUNT", "SIZE", "MAX", "MIN", "NUNIQUE", "NTH", "COLLECT", + "UNIQUE"} +_LIST_AGGS = {"COLLECT" } _STRUCT_AGGS = set() _INTERVAL_AGGS = set() -_DECIMAL_AGGS = {"count", "sum", "argmin", "argmax", "min", "max", "nunique", - "nth", "collect"} +_DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE", + "NTH", "COLLECT"} cdef class GroupBy: From 4c7f8d270d85b1e1fea0966d63a0bf501ec03859 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Mar 2021 13:02:58 -0700 Subject: [PATCH 35/49] Just define all the aggregations available on the GroupBy as methods. --- python/cudf/cudf/core/groupby/groupby.py | 99 +++++++++++++++--------- 1 file changed, 62 insertions(+), 37 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index cc94548d9a2..29efb74afa0 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,6 +1,5 @@ # Copyright (c) 2020, NVIDIA CORPORATION. import collections -import functools import pickle import warnings @@ -570,47 +569,73 @@ def rolling(self, *args, **kwargs): """ return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs) + def count(self, dropna=True): + """Compute the sizes of each column.""" + def func(x): + return getattr(x, "count")(dropna=dropna) -# Set of valid groupby aggregations that are monkey-patched into the GroupBy -# namespace. -_VALID_GROUPBY_AGGS = { - "count", - "sum", - "idxmin", - "idxmax", - "min", - "max", - "mean", - "var", - "std", - "quantile", - "median", - "nunique", - "collect", - "unique", -} - - -# Dynamically bind the different aggregation methods. -def _agg_func_name_with_args(self, func_name, *args, **kwargs): - """ - Aggregate given an aggregate function name and arguments to the - function, e.g., `_agg_func_name_with_args("quantile", 0.5)`. The named - aggregations must be members of _AggregationFactory. - """ + return self.agg(func) + + def sum(self): + """Compute the sum of each column.""" + return self.agg("sum") + + def idxmin(self): + """Compute the index of the minimum value in each column.""" + return self.agg("idxmin") + + def idxmax(self): + """Compute the index of the maximum value in each column.""" + return self.agg("idxmax") + + def min(self): + """Compute the minimum value in each column.""" + return self.agg("min") + + def max(self): + """Compute the maximum value in each column.""" + return self.agg("max") + + def mean(self): + """Compute the mean value in each column.""" + return self.agg("mean") - def func(x): - """Compute the {} of the group.""".format(func_name) - return getattr(x, func_name)(*args, **kwargs) + def median(self): + """Compute the median value in each column.""" + return self.agg("median") - func.__name__ = func_name - return self.agg(func) + def var(self, ddof=1): + """Compute the variance of the data in each column.""" + def func(x): + return getattr(x, "var")(ddof=ddof) + + return self.agg(func) + + def std(self, ddof=1): + """Compute the standard deviation of the data in each column.""" + def func(x): + return getattr(x, "std")(ddof=ddof) + + return self.agg(func) + + def quantile(self, q=0.5, interpolation="linear"): + """Compute the quantiles of each column.""" + def func(x): + return getattr(x, "quantile")(q=q, interpolation=interpolation) + + return self.agg(func) + + def nunique(self): + """Compute the number of unique elements in each column.""" + return self.agg("nunique") + def collect(self): + """Get a list of the items in each column.""" + return self.agg("collect") -for key in _VALID_GROUPBY_AGGS: - setattr( - GroupBy, key, functools.partialmethod(_agg_func_name_with_args, key) - ) + def unique(self): + """Compute the unique elements in each column.""" + return self.agg("unique") class DataFrameGroupBy(GroupBy): From 46e054563c6d9ce766b0f5fb283d5d30f1f03785 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 1 Apr 2021 09:31:45 -0700 Subject: [PATCH 36/49] Fix up merge. --- python/cudf/cudf/_lib/groupby.pyx | 6 ++++ python/cudf/cudf/core/groupby/groupby.py | 42 ------------------------ 2 files changed, 6 insertions(+), 42 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 40fd9dac98d..2903832e7c5 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -133,6 +133,12 @@ cdef class GroupBy: else _DECIMAL_AGGS if is_decimal_dtype(dtype) else None ) + if (valid_aggregations is _DECIMAL_AGGS + and rmm._cuda.gpu.runtimeGetVersion() < 11000): + raise RuntimeError( + "Decimal aggregations are only supported on CUDA >= 11 " + "due to an nvcc compiler bug." + ) c_agg_requests.push_back( move(libcudf_groupby.aggregation_request()) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 4acf8e7749c..29efb74afa0 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -638,48 +638,6 @@ def unique(self): return self.agg("unique") -# Set of valid groupby aggregations that are monkey-patched into the GroupBy -# namespace. -_VALID_GROUPBY_AGGS = { - "count", - "sum", - "idxmin", - "idxmax", - "min", - "max", - "mean", - "var", - "std", - "quantile", - "median", - "nunique", - "collect", - "unique", -} - - -# Dynamically bind the different aggregation methods. -def _agg_func_name_with_args(self, func_name, *args, **kwargs): - """ - Aggregate given an aggregate function name and arguments to the - function, e.g., `_agg_func_name_with_args("quantile", 0.5)`. The named - aggregations must be members of _AggregationFactory. - """ - - def func(x): - """Compute the {} of the group.""".format(func_name) - return getattr(x, func_name)(*args, **kwargs) - - func.__name__ = func_name - return self.agg(func) - - -for key in _VALID_GROUPBY_AGGS: - setattr( - GroupBy, key, functools.partialmethod(_agg_func_name_with_args, key) - ) - - class DataFrameGroupBy(GroupBy): def __init__( self, obj, by=None, level=None, sort=False, as_index=True, dropna=True From 26f101663ef723c8b795d5a765e62f49439796c4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 1 Apr 2021 10:21:24 -0700 Subject: [PATCH 37/49] Some minor cleanup. --- python/cudf/cudf/_lib/aggregation.pyx | 2 +- python/cudf/cudf/_lib/groupby.pyx | 16 ++++++---------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 020745422a8..f512cddf8f0 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -273,7 +273,7 @@ cdef Aggregation make_aggregation(op, kwargs=None): Returns ------- - unique_ptr[aggregation] + Aggregation """ if kwargs is None: kwargs = {} diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 2903832e7c5..91451f8563f 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -112,11 +112,7 @@ cdef class GroupBy: cdef Column col cdef Aggregation agg_obj - # TODO: Is allowing users to provide empty aggregations something we do - # to support pandas semantics? Whereas we need to throw an error if - # some aggregations are requested when in fact none are possible? allow_empty = all(len(v) == 0 for v in aggregations.values()) - empty_aggs = True included_aggregations = defaultdict(list) idx = 0 @@ -143,20 +139,19 @@ cdef class GroupBy: c_agg_requests.push_back( move(libcudf_groupby.aggregation_request()) ) - c_agg_requests[idx].values = col.view() for agg in aggs: agg_obj = make_aggregation(agg) if valid_aggregations is None or agg_obj.kind in valid_aggregations: - empty_aggs = False included_aggregations[col_name].append(agg) c_agg_requests[idx].aggregations.push_back( move(agg_obj.c_obj) ) - if not c_agg_requests[idx].aggregations.size(): + if c_agg_requests[idx].aggregations.empty(): c_agg_requests.pop_back() else: + c_agg_requests[idx].values = col.view() idx += 1 - if empty_aggs and not allow_empty: + if c_agg_requests.empty() and not allow_empty: raise DataError("No numeric types to aggregate") cdef pair[ @@ -188,6 +183,8 @@ cdef class GroupBy: ) result_data = ColumnAccessor(multiindex=True) + # Note: This loop relies on the included_aggregations dict being + # insertion ordered to map results to requested aggregations by index. for i, col_name in enumerate(included_aggregations): for j, agg_name in enumerate(included_aggregations[col_name]): if callable(agg_name): @@ -196,5 +193,4 @@ cdef class GroupBy: Column.from_unique_ptr(move(c_result.second[i].results[j])) ) - result = Table(data=result_data, index=grouped_keys) - return result + return Table(data=result_data, index=grouped_keys) From a7148a4d40580cf52849cbab37caadac196d4c02 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 1 Apr 2021 10:29:02 -0700 Subject: [PATCH 38/49] Fix docstrings. --- python/cudf/cudf/core/groupby/groupby.py | 37 +++++++++++++++++++++--- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 29efb74afa0..dd462427ff3 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -570,7 +570,13 @@ def rolling(self, *args, **kwargs): return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs) def count(self, dropna=True): - """Compute the sizes of each column.""" + """Compute the sizes of each column. + + Parameters + ---------- + dropna : bool + If ``True``, don't include null values in the count. + """ def func(x): return getattr(x, "count")(dropna=dropna) @@ -605,21 +611,44 @@ def median(self): return self.agg("median") def var(self, ddof=1): - """Compute the variance of the data in each column.""" + """Compute the variance of the data in each column. + + Parameters + ---------- + ddof : int + The delta degrees of freedom. N - ddof is the divisor used to + normalize the variance. + """ def func(x): return getattr(x, "var")(ddof=ddof) return self.agg(func) def std(self, ddof=1): - """Compute the standard deviation of the data in each column.""" + """Compute the standard deviation of the data in each column. + + Parameters + ---------- + ddof : int + The delta degrees of freedom. N - ddof is the divisor used to + normalize the standard deviation. + """ def func(x): return getattr(x, "std")(ddof=ddof) return self.agg(func) def quantile(self, q=0.5, interpolation="linear"): - """Compute the quantiles of each column.""" + """Compute the quantiles of each column. + + Parameters + ---------- + q : float or array-like + The quantiles to compute. + interpolation : {"linear", "lower", "higher", "midpoint", "nearest"} + The interpolation method to use when the desired quantile lies + between two data points. Defaults to "linear". + """ def func(x): return getattr(x, "quantile")(q=q, interpolation=interpolation) From 4dc1d7995c4cc881c002af2b91ed7c83a9ac14af Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 1 Apr 2021 11:02:36 -0700 Subject: [PATCH 39/49] Fix style. --- python/cudf/cudf/_lib/groupby.pyx | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 91451f8563f..e76e4333f09 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,6 +1,15 @@ # Copyright (c) 2020, NVIDIA CORPORATION. from collections import defaultdict +from pandas.core.groupby.groupby import DataError +from cudf.utils.dtypes import ( + is_categorical_dtype, + is_string_dtype, + is_list_dtype, + is_interval_dtype, + is_struct_dtype, + is_decimal_dtype, +) import numpy as np import rmm @@ -19,15 +28,6 @@ from cudf._lib.cpp.table.table cimport table, table_view cimport cudf._lib.cpp.types as libcudf_types cimport cudf._lib.cpp.groupby as libcudf_groupby -from pandas.core.groupby.groupby import DataError -from cudf.utils.dtypes import ( - is_categorical_dtype, - is_string_dtype, - is_list_dtype, - is_interval_dtype, - is_struct_dtype, - is_decimal_dtype, -) # The sets below define the possible aggregations that can be performed on @@ -35,7 +35,7 @@ from cudf.utils.dtypes import ( _CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"} _STRING_AGGS = {"COUNT", "SIZE", "MAX", "MIN", "NUNIQUE", "NTH", "COLLECT", "UNIQUE"} -_LIST_AGGS = {"COLLECT" } +_LIST_AGGS = {"COLLECT"} _STRUCT_AGGS = set() _INTERVAL_AGGS = set() _DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE", @@ -127,10 +127,10 @@ cdef class GroupBy: else _STRING_AGGS if is_struct_dtype(dtype) else _INTERVAL_AGGS if is_interval_dtype(dtype) else _DECIMAL_AGGS if is_decimal_dtype(dtype) - else None + else "ALL" ) if (valid_aggregations is _DECIMAL_AGGS - and rmm._cuda.gpu.runtimeGetVersion() < 11000): + and rmm._cuda.gpu.runtimeGetVersion() < 11000): raise RuntimeError( "Decimal aggregations are only supported on CUDA >= 11 " "due to an nvcc compiler bug." @@ -141,7 +141,8 @@ cdef class GroupBy: ) for agg in aggs: agg_obj = make_aggregation(agg) - if valid_aggregations is None or agg_obj.kind in valid_aggregations: + if (valid_aggregations == "ALL" + or agg_obj.kind in valid_aggregations): included_aggregations[col_name].append(agg) c_agg_requests[idx].aggregations.push_back( move(agg_obj.c_obj) From 0c18a161400adcf997f650be8cb465022d456c5d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 1 Apr 2021 12:20:29 -0700 Subject: [PATCH 40/49] Remove extra blank line. --- python/cudf/cudf/_lib/groupby.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index e76e4333f09..4f9d9a524af 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -29,7 +29,6 @@ cimport cudf._lib.cpp.types as libcudf_types cimport cudf._lib.cpp.groupby as libcudf_groupby - # The sets below define the possible aggregations that can be performed on # different dtypes. These strings must be elements of the AggregationKind enum. _CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"} From c9892b18ebacfcdcba6c28b660efebfbcd54d8b9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 1 Apr 2021 13:57:59 -0700 Subject: [PATCH 41/49] Remove TODO. --- python/cudf/cudf/_lib/aggregation.pxd | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pxd b/python/cudf/cudf/_lib/aggregation.pxd index 13d04575700..972f95d5aab 100644 --- a/python/cudf/cudf/_lib/aggregation.pxd +++ b/python/cudf/cudf/_lib/aggregation.pxd @@ -7,6 +7,4 @@ from cudf._lib.cpp.aggregation cimport aggregation cdef class Aggregation: cdef unique_ptr[aggregation] c_obj -# TODO: the old version of this function was declared as except *, I'm not sure -# why but I should figure that out before calling this done. cdef Aggregation make_aggregation(op, kwargs=*) From 61e84b25ac19eb3c2ad0029aae44bbc54a5e6ad1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 1 Apr 2021 14:05:28 -0700 Subject: [PATCH 42/49] Apply black formatting. --- python/cudf/cudf/core/groupby/groupby.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index dd462427ff3..ee7dd7c5335 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -577,6 +577,7 @@ def count(self, dropna=True): dropna : bool If ``True``, don't include null values in the count. """ + def func(x): return getattr(x, "count")(dropna=dropna) @@ -619,6 +620,7 @@ def var(self, ddof=1): The delta degrees of freedom. N - ddof is the divisor used to normalize the variance. """ + def func(x): return getattr(x, "var")(ddof=ddof) @@ -633,6 +635,7 @@ def std(self, ddof=1): The delta degrees of freedom. N - ddof is the divisor used to normalize the standard deviation. """ + def func(x): return getattr(x, "std")(ddof=ddof) @@ -649,6 +652,7 @@ def quantile(self, q=0.5, interpolation="linear"): The interpolation method to use when the desired quantile lies between two data points. Defaults to "linear". """ + def func(x): return getattr(x, "quantile")(q=q, interpolation=interpolation) From 67c6ebcd2970cb1e4284fe9b208641e2d4770f3e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 12 Apr 2021 10:11:39 -0700 Subject: [PATCH 43/49] Remove use of index and just use std::vector::back. --- python/cudf/cudf/_lib/groupby.pyx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 4f9d9a524af..2f4f6e89aff 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -114,7 +114,6 @@ cdef class GroupBy: allow_empty = all(len(v) == 0 for v in aggregations.values()) included_aggregations = defaultdict(list) - idx = 0 for i, (col_name, aggs) in enumerate(aggregations.items()): col = values._data[col_name] dtype = col.dtype @@ -143,14 +142,13 @@ cdef class GroupBy: if (valid_aggregations == "ALL" or agg_obj.kind in valid_aggregations): included_aggregations[col_name].append(agg) - c_agg_requests[idx].aggregations.push_back( + c_agg_requests.back().aggregations.push_back( move(agg_obj.c_obj) ) - if c_agg_requests[idx].aggregations.empty(): + if c_agg_requests.back().aggregations.empty(): c_agg_requests.pop_back() else: - c_agg_requests[idx].values = col.view() - idx += 1 + c_agg_requests.back().values = col.view() if c_agg_requests.empty() and not allow_empty: raise DataError("No numeric types to aggregate") From 9cd725e8af4467226dc73b826e9051d64b7162df Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 12 Apr 2021 10:23:11 -0700 Subject: [PATCH 44/49] Update error message. --- python/cudf/cudf/_lib/groupby.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 2f4f6e89aff..c055c174636 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -150,7 +150,7 @@ cdef class GroupBy: else: c_agg_requests.back().values = col.view() if c_agg_requests.empty() and not allow_empty: - raise DataError("No numeric types to aggregate") + raise DataError("All requested aggregations are unsupported.") cdef pair[ unique_ptr[table], From 9f01b607879df089bd1569a1061547f0405e26ea Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 12 Apr 2021 10:34:50 -0700 Subject: [PATCH 45/49] Create a temporary agg variable rather than popping from the vector. --- python/cudf/cudf/_lib/groupby.pyx | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index c055c174636..1679256e4d0 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -108,6 +108,7 @@ cdef class GroupBy: """ from cudf.core.column_accessor import ColumnAccessor cdef vector[libcudf_groupby.aggregation_request] c_agg_requests + cdef libcudf_groupby.aggregation_request c_agg_request cdef Column col cdef Aggregation agg_obj @@ -134,23 +135,23 @@ cdef class GroupBy: "due to an nvcc compiler bug." ) - c_agg_requests.push_back( - move(libcudf_groupby.aggregation_request()) - ) + c_agg_request = move(libcudf_groupby.aggregation_request()) for agg in aggs: agg_obj = make_aggregation(agg) if (valid_aggregations == "ALL" or agg_obj.kind in valid_aggregations): included_aggregations[col_name].append(agg) - c_agg_requests.back().aggregations.push_back( + c_agg_request.aggregations.push_back( move(agg_obj.c_obj) ) - if c_agg_requests.back().aggregations.empty(): - c_agg_requests.pop_back() - else: - c_agg_requests.back().values = col.view() + if not c_agg_request.aggregations.empty(): + c_agg_request.values = col.view() + c_agg_requests.push_back( + move(c_agg_request) + ) + if c_agg_requests.empty() and not allow_empty: - raise DataError("All requested aggregations are unsupported.") + raise DataError("No numeric types to aggregate") cdef pair[ unique_ptr[table], From cb3f5a0c26d42e57653146804178dc3d8829f870 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 13 Apr 2021 09:35:30 -0700 Subject: [PATCH 46/49] Update docstring. --- python/cudf/cudf/core/groupby/groupby.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ee7dd7c5335..989fded48f7 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -570,7 +570,7 @@ def rolling(self, *args, **kwargs): return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs) def count(self, dropna=True): - """Compute the sizes of each column. + """Compute the number of values in each column. Parameters ---------- @@ -584,7 +584,7 @@ def func(x): return self.agg(func) def sum(self): - """Compute the sum of each column.""" + """Compute the sum of the values in each column.""" return self.agg("sum") def idxmin(self): @@ -604,15 +604,15 @@ def max(self): return self.agg("max") def mean(self): - """Compute the mean value in each column.""" + """Compute the mean of the values in each column.""" return self.agg("mean") def median(self): - """Compute the median value in each column.""" + """Compute the median of the values in each column.""" return self.agg("median") def var(self, ddof=1): - """Compute the variance of the data in each column. + """Compute the variance of the values in each column. Parameters ---------- @@ -627,7 +627,7 @@ def func(x): return self.agg(func) def std(self, ddof=1): - """Compute the standard deviation of the data in each column. + """Compute the standard deviation of the values in each column. Parameters ---------- @@ -642,7 +642,7 @@ def func(x): return self.agg(func) def quantile(self, q=0.5, interpolation="linear"): - """Compute the quantiles of each column. + """Compute the quantiles of the values in each column. Parameters ---------- @@ -659,15 +659,15 @@ def func(x): return self.agg(func) def nunique(self): - """Compute the number of unique elements in each column.""" + """Compute the number of unique values in each column.""" return self.agg("nunique") def collect(self): - """Get a list of the items in each column.""" + """Get a list of all the values in each column.""" return self.agg("collect") def unique(self): - """Compute the unique elements in each column.""" + """Get a list of the unique values in each column.""" return self.agg("unique") From e96a7958018f0ca045fe8018f99716d657fcd685 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 13 Apr 2021 09:41:07 -0700 Subject: [PATCH 47/49] Update error message. --- python/cudf/cudf/_lib/groupby.pyx | 2 +- python/cudf/cudf/tests/test_groupby.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 1679256e4d0..3c2b541f728 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -151,7 +151,7 @@ cdef class GroupBy: ) if c_agg_requests.empty() and not allow_empty: - raise DataError("No numeric types to aggregate") + raise DataError("All requested aggregations are unsupported.") cdef pair[ unique_ptr[table], diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 4dbe608af82..868387b100e 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1236,7 +1236,11 @@ def test_raise_data_error(): pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) gdf = cudf.from_pandas(pdf) - assert_exceptions_equal(pdf.groupby("a").mean, gdf.groupby("a").mean) + assert_exceptions_equal( + pdf.groupby("a").mean, + gdf.groupby("a").mean, + compare_error_message=False, + ) def test_drop_unsupported_multi_agg(): From f52fffb98ab9cc8b510285ee910c5e8e306fe6d3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 14 Apr 2021 12:23:12 -0700 Subject: [PATCH 48/49] Update docstrings. --- python/cudf/cudf/core/groupby/groupby.py | 26 ++++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 989fded48f7..a52fae994e7 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -584,35 +584,35 @@ def func(x): return self.agg(func) def sum(self): - """Compute the sum of the values in each column.""" + """Compute the column-wise sum of the values in each group.""" return self.agg("sum") def idxmin(self): - """Compute the index of the minimum value in each column.""" + """Get the column-wise index of the minimum value in each group.""" return self.agg("idxmin") def idxmax(self): - """Compute the index of the maximum value in each column.""" + """Get the column-wise index of the maximum value in each group.""" return self.agg("idxmax") def min(self): - """Compute the minimum value in each column.""" + """Get the column-wise minimum value in each group.""" return self.agg("min") def max(self): - """Compute the maximum value in each column.""" + """Get the column-wise maximum value in each group.""" return self.agg("max") def mean(self): - """Compute the mean of the values in each column.""" + """Compute the column-wise mean of the values in each group.""" return self.agg("mean") def median(self): - """Compute the median of the values in each column.""" + """Get the column-wise median of the values in each group.""" return self.agg("median") def var(self, ddof=1): - """Compute the variance of the values in each column. + """Compute the column-wise variance of the values in each group. Parameters ---------- @@ -627,7 +627,7 @@ def func(x): return self.agg(func) def std(self, ddof=1): - """Compute the standard deviation of the values in each column. + """Compute the column-wise std of the values in each group. Parameters ---------- @@ -642,7 +642,7 @@ def func(x): return self.agg(func) def quantile(self, q=0.5, interpolation="linear"): - """Compute the quantiles of the values in each column. + """Compute the column-wise quantiles of the values in each group. Parameters ---------- @@ -659,15 +659,15 @@ def func(x): return self.agg(func) def nunique(self): - """Compute the number of unique values in each column.""" + """Compute the number of unique values in each column in each group.""" return self.agg("nunique") def collect(self): - """Get a list of all the values in each column.""" + """Get a list of all the values for each column in each group.""" return self.agg("collect") def unique(self): - """Get a list of the unique values in each column.""" + """Get a list of the unique values for each column in each group.""" return self.agg("unique") From affc9c95b07429d3cac293e6ddf63432791e2ae3 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 14 Apr 2021 16:40:38 -0500 Subject: [PATCH 49/49] Update python/cudf/cudf/_lib/aggregation.pyx --- python/cudf/cudf/_lib/aggregation.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index f512cddf8f0..682d8cbf329 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -289,5 +289,5 @@ cdef Aggregation make_aggregation(op, kwargs=None): else: agg = op(Aggregation) else: - raise TypeError("Unknown aggregation {}".format(op)) + raise TypeError(f"Unknown aggregation {op}") return agg