Skip to content

Commit

Permalink
Automatically select GroupBy.apply algorithm based on if the UDF is…
Browse files Browse the repository at this point in the history
… jittable (#13113)

Closes #13103

Authors:
  - https://github.com/brandon-b-miller
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Lawrence Mitchell (https://github.com/wence-)
  - Bradley Dice (https://github.com/bdice)

URL: #13113
  • Loading branch information
brandon-b-miller authored May 16, 2023
1 parent 4c456cb commit 89feac7
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 8 deletions.
4 changes: 4 additions & 0 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ def _dtypes(self):
zip(self._data.names, (col.dtype for col in self._data.columns))
)

@property
def _has_nulls(self):
    """Whether any column stored in ``self._data`` reports nulls.

    Walks the values of ``self._data`` and stops at the first one whose
    ``has_nulls()`` call is truthy.
    """
    for column in self._data.values():
        if column.has_nulls():
            return True
    return False

def serialize(self):
header = {
"type-serialized": pickle.dumps(type(self)),
Expand Down
28 changes: 20 additions & 8 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.mixins import Reducible, Scannable
from cudf.core.multiindex import MultiIndex
from cudf.core.udf.groupby_utils import jit_groupby_apply
from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply
from cudf.utils.utils import GetAttrGetItemMixin, _cudf_nvtx_annotate


Expand Down Expand Up @@ -1166,11 +1166,8 @@ def _jit_groupby_apply(
self, function, group_names, offsets, group_keys, grouped_values, *args
):
# Nulls are not yet supported
for colname in self.grouping.values._data.keys():
if self.obj._data[colname].has_nulls():
raise ValueError(
"Nulls not yet supported with groupby JIT engine"
)
if self.grouping._obj._has_nulls:
raise ValueError("Nulls not yet supported with groupby JIT engine")

chunk_results = jit_groupby_apply(
offsets, grouped_values, function, *args
Expand Down Expand Up @@ -1247,7 +1244,7 @@ def _post_process_chunk_results(
return result

@_cudf_nvtx_annotate
def apply(self, function, *args, engine="cudf"):
def apply(self, function, *args, engine="auto"):
"""Apply a python transformation function over the grouped chunk.
Parameters
Expand All @@ -1257,7 +1254,7 @@ def apply(self, function, *args, engine="cudf"):
on the grouped chunk.
args : tuple
Optional positional arguments to pass to the function.
engine: {'cudf', 'jit'}, default 'cudf'
engine: 'auto', 'cudf', or 'jit', default 'auto'
Selects the GroupBy.apply implementation. Use `jit` to
select the numba JIT pipeline. Only certain operations are allowed
within the function when using this option: min, max, sum, mean, var,
Expand All @@ -1266,6 +1263,11 @@ def apply(self, function, *args, engine="cudf"):
`df['x'] * 2` is not yet allowed.
For more information, see the `cuDF guide to user defined functions
<https://docs.rapids.ai/api/cudf/stable/user_guide/guide-to-udfs.html>`__.
Use `cudf` to select the iterative groupby apply algorithm which aims
to provide maximum flexibility at the expense of performance.
The default value `auto` will attempt to use the numba JIT pipeline
where possible and will fall back to the iterative algorithm if
necessary.
Examples
--------
Expand Down Expand Up @@ -1339,10 +1341,20 @@ def mult(df):
1 2 1
2 3 1
"""

if self.obj.empty:
return self.obj
if not callable(function):
raise TypeError(f"type {type(function)} is not callable")
group_names, offsets, group_keys, grouped_values = self._grouped()

if engine == "auto":
if (not grouped_values._has_nulls) and _can_be_jitted(
grouped_values, function, args
):
engine = "jit"
else:
engine = "cudf"
if engine == "jit":
result = self._jit_groupby_apply(
function,
Expand Down
24 changes: 24 additions & 0 deletions python/cudf/cudf/core/udf/groupby_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import cupy as cp
import numpy as np
from numba import cuda, types
from numba.core.errors import TypingError
from numba.cuda.cudadrv.devices import get_context
from numba.np import numpy_support

Expand Down Expand Up @@ -201,3 +202,26 @@ def jit_groupby_apply(offsets, grouped_values, function, *args):
specialized[ngroups, tpb](*launch_args)

return output


def _can_be_jitted(frame, func, args):
    """
    Determine if this UDF is supported through the JIT engine
    by attempting to compile just the function to PTX using the
    target set of types

    Parameters
    ----------
    frame
        The grouped values; its supported column dtypes and its
        ``index.dtype`` are used to build the group type the UDF is
        typed against.
    func : callable
        The user defined function to check.
    args : tuple
        Extra positional arguments forwarded with ``func`` to the
        return-type inference.

    Returns
    -------
    bool
        ``True`` if return-type inference succeeds, ``False`` if ``func``
        exposes no bytecode or inference raises ``TypingError``.
    """
    if not hasattr(func, "__code__"):
        # Numba compiles from the function's bytecode. Callables that do
        # not expose one (builtins, functools.partial objects, callable
        # instances) cannot be compiled, and the resulting failure is not
        # a TypingError, so report them as not jittable up front instead
        # of letting that error escape the "auto" engine selection.
        return False

    np_field_types = np.dtype(
        list(
            _supported_dtypes_from_frame(
                frame, supported_types=SUPPORTED_GROUPBY_NUMPY_TYPES
            ).items()
        )
    )
    dataframe_group_type = _get_frame_groupby_type(
        np_field_types, frame.index.dtype
    )
    # Keep the try body limited to the one call expected to raise.
    try:
        _get_udf_return_type(dataframe_group_type, func, args)
    except TypingError:
        return False
    return True

0 comments on commit 89feac7

Please sign in to comment.