rapidsai · rapids-bot · May 16, 2023 · Apr 10, 2023 · Apr 11, 2023 · Apr 21, 2023
@@ -91,6 +91,10 @@ def _dtypes(self):
             zip(self._data.names, (col.dtype for col in self._data.columns))
         )
 
+    @property
+    def _has_nulls(self):
+        return any(col.has_nulls() for col in self._data.values())
+
     def serialize(self):
         header = {
             "type-serialized": pickle.dumps(type(self)),

@@ -25,7 +25,7 @@
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.mixins import Reducible, Scannable
 from cudf.core.multiindex import MultiIndex
-from cudf.core.udf.groupby_utils import jit_groupby_apply
+from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply
 from cudf.utils.utils import GetAttrGetItemMixin, _cudf_nvtx_annotate
 
 
@@ -1161,11 +1161,8 @@ def _jit_groupby_apply(
         self, function, group_names, offsets, group_keys, grouped_values, *args
     ):
         # Nulls are not yet supported
-        for colname in self.grouping.values._data.keys():
-            if self.obj._data[colname].has_nulls():
-                raise ValueError(
-                    "Nulls not yet supported with groupby JIT engine"
-                )
+        if self.grouping._obj._has_nulls:
+            raise ValueError("Nulls not yet supported with groupby JIT engine")
 
         chunk_results = jit_groupby_apply(
             offsets, grouped_values, function, *args
@@ -1242,7 +1239,7 @@ def _post_process_chunk_results(
         return result
 
     @_cudf_nvtx_annotate
-    def apply(self, function, *args, engine="cudf"):
+    def apply(self, function, *args, engine="auto"):
         """Apply a python transformation function over the grouped chunk.
 
         Parameters
@@ -1252,7 +1249,7 @@ def apply(self, function, *args, engine="cudf"):
           on the grouped chunk.
         args : tuple
             Optional positional arguments to pass to the function.
-        engine: {'cudf', 'jit'}, default 'cudf'
+        engine: {'cudf', 'jit', 'auto'}, default 'auto'
           Selects the GroupBy.apply implementation. Use `jit` to
           select the numba JIT pipeline. Only certain operations are allowed
           within the function when using this option: min, max, sum, mean, var,
@@ -1261,6 +1258,11 @@ def apply(self, function, *args, engine="cudf"):
           `df['x'] * 2` is not yet allowed.
           For more information, see the `cuDF guide to user defined functions
           <https://docs.rapids.ai/api/cudf/stable/user_guide/guide-to-udfs.html>`__.
+          Use `cudf` to select the iterative groupby apply algorithm which aims
+          to provide maximum flexibility at the expense of performance.
+          The default value `auto` will attempt to use the numba JIT pipeline
+          where possible and will fall back to the iterative algorithm if
+          necessary.
 
         Examples
         --------
@@ -1334,10 +1336,20 @@ def mult(df):
         1  2     1
         2  3     1
         """
+
+        if self.obj.empty:
+            return self.obj
         if not callable(function):
             raise TypeError(f"type {type(function)} is not callable")
         group_names, offsets, group_keys, grouped_values = self._grouped()
 
+        if engine == "auto":
+            if (not grouped_values._has_nulls) and _can_be_jitted(
+                grouped_values, function, args
+            ):
+                engine = "jit"
+            else:
+                engine = "cudf"
         if engine == "jit":
             result = self._jit_groupby_apply(
                 function,

@@ -4,6 +4,7 @@
 import cupy as cp
 import numpy as np
 from numba import cuda, types
+from numba.core.errors import TypingError
 from numba.cuda.cudadrv.devices import get_context
 from numba.np import numpy_support
 
@@ -201,3 +202,26 @@ def jit_groupby_apply(offsets, grouped_values, function, *args):
     specialized[ngroups, tpb](*launch_args)
 
     return output
+
+
+def _can_be_jitted(frame, func, args):
+    """
+    Return True if this UDF is supported through the JIT engine,
+    determined by attempting to compile just the function to PTX using
+    the target set of types; a numba TypingError means it is not.
+    """
+    np_field_types = np.dtype(
+        list(
+            _supported_dtypes_from_frame(
+                frame, supported_types=SUPPORTED_GROUPBY_NUMPY_TYPES
+            ).items()
+        )
+    )
+    dataframe_group_type = _get_frame_groupby_type(
+        np_field_types, frame.index.dtype
+    )
+    try:
+        _get_udf_return_type(dataframe_group_type, func, args)
+        return True
+    except TypingError:
+        return False