From b3a736a89aa7d1264c3a118e34b1dbbfd27232b0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 10 Apr 2023 11:56:23 -0700 Subject: [PATCH 1/7] initial untested --- python/cudf/cudf/core/frame.py | 4 +++ python/cudf/cudf/core/groupby/groupby.py | 20 +++++++++------ python/cudf/cudf/core/udf/groupby_utils.py | 29 ++++++++++++++++++++++ 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d8b9ee4d006..44a3b16bcd9 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -91,6 +91,10 @@ def _dtypes(self): zip(self._data.names, (col.dtype for col in self._data.columns)) ) + @property + def _has_nulls(self): + any(col.has_nulls() for col in self._data.values()) + def serialize(self): header = { "type-serialized": pickle.dumps(type(self)), diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index cb4c0f6b48b..da8899f55bb 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -25,7 +25,10 @@ from cudf.core.column_accessor import ColumnAccessor from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex -from cudf.core.udf.groupby_utils import jit_groupby_apply +from cudf.core.udf.groupby_utils import ( + _jit_groupby_eligible, + jit_groupby_apply, +) from cudf.utils.utils import GetAttrGetItemMixin, _cudf_nvtx_annotate @@ -1144,11 +1147,9 @@ def _jit_groupby_apply( self, function, group_names, offsets, group_keys, grouped_values, *args ): # Nulls are not yet supported - for colname in self.grouping.values._data.keys(): - if self.obj._data[colname].has_nulls(): - raise ValueError( - "Nulls not yet supported with groupby JIT engine" - ) + # TODO: don't check this twice under `engine='auto'` + if self.grouping.has_nulls: + raise ValueError("Nulls not yet supported with groupby JIT engine") chunk_results = jit_groupby_apply( offsets, grouped_values, function, *args @@ -1198,7 +1199,7 @@ def _iterative_groupby_apply( result.index = cudf.MultiIndex._from_data(index_data) return result - def apply(self, function, *args, engine="cudf"): + def apply(self, function, *args, engine="auto"): """Apply a python transformation function over the grouped chunk. Parameters @@ -1294,6 +1295,11 @@ def mult(df): raise TypeError(f"type {type(function)} is not callable") group_names, offsets, group_keys, grouped_values = self._grouped() + if engine == "auto": + if _jit_groupby_eligible(grouped_values, function, args): + engine = "jit" + else: + engine = "cudf" if engine == "jit": result = self._jit_groupby_apply( function, diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index ebf8c677e55..939916c8d98 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -4,6 +4,7 @@ import cupy as cp import numpy as np from numba import cuda, types +from numba.core.errors import TypingError from numba.cuda.cudadrv.devices import get_context from numba.np import numpy_support from numba.types import Record @@ -104,6 +105,7 @@ def _groupby_apply_kernel_string_from_template(frame, args): def _get_groupby_apply_kernel(frame, func, args): + breakpoint() np_field_types = np.dtype( list( _supported_dtypes_from_frame( @@ -202,3 +204,30 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): specialized[ngroups, tpb](*launch_args) return output + + +def _jit_groupby_eligible(frame, func, args): + return (not frame.has_nulls) and _can_be_jitted(frame, func, args) + + +def _can_be_jitted(frame, func, args): + """ + Determine if this UDF is supported through the JIT engine + by attempting to compile just the function to PTX using the + target set of types + """ + np_field_types = np.dtype( + list( + _supported_dtypes_from_frame( + frame, supported_types=SUPPORTED_GROUPBY_NUMPY_TYPES + ).items() + ) + ) + dataframe_group_type = _get_frame_groupby_type( + np_field_types, frame.index.dtype + ) + try: + _get_udf_return_type(dataframe_group_type, func, args) + return True + except TypingError: + return False From da6659f4a0ea26c847e2d9c6c833503a0564be81 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 11 Apr 2023 07:22:27 -0700 Subject: [PATCH 2/7] cleanup --- python/cudf/cudf/core/udf/groupby_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 939916c8d98..16dbdb2d43a 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -105,7 +105,6 @@ def _groupby_apply_kernel_string_from_template(frame, args): def _get_groupby_apply_kernel(frame, func, args): - breakpoint() np_field_types = np.dtype( list( _supported_dtypes_from_frame( From 7795929f3f23bc1304f5eb792f1f2898f8f623de Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 21 Apr 2023 11:43:02 -0700 Subject: [PATCH 3/7] fixes --- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/groupby/groupby.py | 5 ++++- python/cudf/cudf/core/udf/groupby_utils.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 44a3b16bcd9..b67b71fc4fa 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -93,7 +93,7 @@ def _dtypes(self): @property def _has_nulls(self): - any(col.has_nulls() for col in self._data.values()) + return any(col.has_nulls() for col in self._data.values()) def serialize(self): header = { diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b81187d130e..13dc7a55531 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1148,7 +1148,7 @@ def _jit_groupby_apply( ): # Nulls are not yet supported # TODO: don't check this twice under `engine='auto'` - if self.grouping.has_nulls: + if self.grouping._obj._has_nulls: raise ValueError("Nulls not yet supported with groupby JIT engine") chunk_results = jit_groupby_apply( @@ -1291,6 +1291,9 @@ def mult(df): 1 2 1 2 3 1 """ + + if self.obj.empty: + return self.obj if not callable(function): raise TypeError(f"type {type(function)} is not callable") group_names, offsets, group_keys, grouped_values = self._grouped() diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 16dbdb2d43a..3834e3ae92a 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -206,7 +206,7 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): def _jit_groupby_eligible(frame, func, args): - return (not frame.has_nulls) and _can_be_jitted(frame, func, args) + return (not frame._has_nulls) and _can_be_jitted(frame, func, args) def _can_be_jitted(frame, func, args): From 9dadf35eaa2decedce5c22623071ba080a902e3d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 26 Apr 2023 10:00:40 -0700 Subject: [PATCH 4/7] update docs --- python/cudf/cudf/core/groupby/groupby.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index bcb89097fb4..d99f71c30fd 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1253,7 +1253,7 @@ def apply(self, function, *args, engine="auto"): on the grouped chunk. args : tuple Optional positional arguments to pass to the function. - engine: {'cudf', 'jit'}, default 'cudf' + engine: {'cudf', 'jit'}, default 'auto' Selects the GroupBy.apply implementation. Use `jit` to select the numba JIT pipeline. Only certain operations are allowed within the function when using this option: min, max, sum, mean, var, @@ -1262,6 +1262,11 @@ def apply(self, function, *args, engine="auto"): `df['x'] * 2` is not yet allowed. For more information, see the `cuDF guide to user defined functions `__. + Use `cudf` to select the iterative groupby apply algorithm which aims + to provide maximum flexibility at the expense of performance. + The default value `auto` will attempt to use the numba JIT pipeline + where possible and will fall back to the iterative algorithm if + necessary. Examples -------- From 064f52ddb24cae8f3a150597bcf3a797f6822b1f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 26 Apr 2023 10:01:05 -0700 Subject: [PATCH 5/7] remove todo --- python/cudf/cudf/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index d99f71c30fd..1668bb16f78 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1164,7 +1164,6 @@ def _jit_groupby_apply( self, function, group_names, offsets, group_keys, grouped_values, *args ): # Nulls are not yet supported - # TODO: don't check this twice under `engine='auto'` if self.grouping._obj._has_nulls: raise ValueError("Nulls not yet supported with groupby JIT engine") From ad4dbaa9efad40fe4801a41400927d1d518fa293 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 11 May 2023 07:47:32 -0700 Subject: [PATCH 6/7] inline function --- python/cudf/cudf/core/groupby/groupby.py | 9 ++++----- python/cudf/cudf/core/udf/groupby_utils.py | 4 ---- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 1668bb16f78..b102a29d896 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -25,10 +25,7 @@ from cudf.core.column_accessor import ColumnAccessor from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex -from cudf.core.udf.groupby_utils import ( - _jit_groupby_eligible, - jit_groupby_apply, -) +from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply from cudf.utils.utils import GetAttrGetItemMixin, _cudf_nvtx_annotate @@ -1347,7 +1344,9 @@ def mult(df): group_names, offsets, group_keys, grouped_values = self._grouped() if engine == "auto": - if _jit_groupby_eligible(grouped_values, function, args): + if (not grouped_values._has_nulls) and _can_be_jitted( + grouped_values, function, args + ): engine = "jit" else: engine = "cudf" diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 0fd5cbf91db..8ccf7b710dc 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -204,10 +204,6 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): return output -def _jit_groupby_eligible(frame, func, args): - return (not frame._has_nulls) and _can_be_jitted(frame, func, args) - - def _can_be_jitted(frame, func, args): """ Determine if this UDF is supported through the JIT engine From 11d4f732a92161e1bc27ba3a3c60a15b5c4dd2f5 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 15 May 2023 09:34:35 +0100 Subject: [PATCH 7/7] Mention auto as a valid engine type --- python/cudf/cudf/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b102a29d896..163b106d73c 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1249,7 +1249,7 @@ def apply(self, function, *args, engine="auto"): on the grouped chunk. args : tuple Optional positional arguments to pass to the function. - engine: {'cudf', 'jit'}, default 'auto' + engine: 'auto', 'cudf', or 'jit', default 'auto' Selects the GroupBy.apply implementation. Use `jit` to select the numba JIT pipeline. Only certain operations are allowed within the function when using this option: min, max, sum, mean, var,