From b3a736a89aa7d1264c3a118e34b1dbbfd27232b0 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 10 Apr 2023 11:56:23 -0700
Subject: [PATCH 1/7] Add engine='auto' default to GroupBy.apply (initial, untested)

---
 python/cudf/cudf/core/frame.py             |  4 +++
 python/cudf/cudf/core/groupby/groupby.py   | 20 +++++++++------
 python/cudf/cudf/core/udf/groupby_utils.py | 29 ++++++++++++++++++++++
 3 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index d8b9ee4d006..44a3b16bcd9 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -91,6 +91,10 @@ def _dtypes(self):
             zip(self._data.names, (col.dtype for col in self._data.columns))
         )
 
+    @property
+    def _has_nulls(self):
+        any(col.has_nulls() for col in self._data.values())
+
     def serialize(self):
         header = {
             "type-serialized": pickle.dumps(type(self)),
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index cb4c0f6b48b..da8899f55bb 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -25,7 +25,10 @@
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.mixins import Reducible, Scannable
 from cudf.core.multiindex import MultiIndex
-from cudf.core.udf.groupby_utils import jit_groupby_apply
+from cudf.core.udf.groupby_utils import (
+    _jit_groupby_eligible,
+    jit_groupby_apply,
+)
 from cudf.utils.utils import GetAttrGetItemMixin, _cudf_nvtx_annotate
 
 
@@ -1144,11 +1147,9 @@ def _jit_groupby_apply(
         self, function, group_names, offsets, group_keys, grouped_values, *args
     ):
         # Nulls are not yet supported
-        for colname in self.grouping.values._data.keys():
-            if self.obj._data[colname].has_nulls():
-                raise ValueError(
-                    "Nulls not yet supported with groupby JIT engine"
-                )
+        # TODO: don't check this twice under `engine='auto'`
+        if self.grouping.has_nulls:
+            raise ValueError("Nulls not yet supported with groupby JIT engine")
 
         chunk_results = jit_groupby_apply(
             offsets, grouped_values, function, *args
@@ -1198,7 +1199,7 @@ def _iterative_groupby_apply(
                 result.index = cudf.MultiIndex._from_data(index_data)
         return result
 
-    def apply(self, function, *args, engine="cudf"):
+    def apply(self, function, *args, engine="auto"):
         """Apply a python transformation function over the grouped chunk.
 
         Parameters
@@ -1294,6 +1295,11 @@ def mult(df):
             raise TypeError(f"type {type(function)} is not callable")
         group_names, offsets, group_keys, grouped_values = self._grouped()
 
+        if engine == "auto":
+            if _jit_groupby_eligible(grouped_values, function, args):
+                engine = "jit"
+            else:
+                engine = "cudf"
         if engine == "jit":
             result = self._jit_groupby_apply(
                 function,
diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py
index ebf8c677e55..939916c8d98 100644
--- a/python/cudf/cudf/core/udf/groupby_utils.py
+++ b/python/cudf/cudf/core/udf/groupby_utils.py
@@ -4,6 +4,7 @@
 import cupy as cp
 import numpy as np
 from numba import cuda, types
+from numba.core.errors import TypingError
 from numba.cuda.cudadrv.devices import get_context
 from numba.np import numpy_support
 from numba.types import Record
@@ -104,6 +105,7 @@ def _groupby_apply_kernel_string_from_template(frame, args):
 
 
 def _get_groupby_apply_kernel(frame, func, args):
+    breakpoint()
     np_field_types = np.dtype(
         list(
             _supported_dtypes_from_frame(
@@ -202,3 +204,30 @@ def jit_groupby_apply(offsets, grouped_values, function, *args):
     specialized[ngroups, tpb](*launch_args)
 
     return output
+
+
+def _jit_groupby_eligible(frame, func, args):
+    return (not frame.has_nulls) and _can_be_jitted(frame, func, args)
+
+
+def _can_be_jitted(frame, func, args):
+    """
+    Determine if this UDF is supported through the JIT engine
+    by attempting to compile just the function to PTX using the
+    target set of types
+    """
+    np_field_types = np.dtype(
+        list(
+            _supported_dtypes_from_frame(
+                frame, supported_types=SUPPORTED_GROUPBY_NUMPY_TYPES
+            ).items()
+        )
+    )
+    dataframe_group_type = _get_frame_groupby_type(
+        np_field_types, frame.index.dtype
+    )
+    try:
+        _get_udf_return_type(dataframe_group_type, func, args)
+        return True
+    except TypingError:
+        return False

From da6659f4a0ea26c847e2d9c6c833503a0564be81 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 11 Apr 2023 07:22:27 -0700
Subject: [PATCH 2/7] Cleanup: remove leftover breakpoint() from _get_groupby_apply_kernel

---
 python/cudf/cudf/core/udf/groupby_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py
index 939916c8d98..16dbdb2d43a 100644
--- a/python/cudf/cudf/core/udf/groupby_utils.py
+++ b/python/cudf/cudf/core/udf/groupby_utils.py
@@ -105,7 +105,6 @@ def _groupby_apply_kernel_string_from_template(frame, args):
 
 
 def _get_groupby_apply_kernel(frame, func, args):
-    breakpoint()
     np_field_types = np.dtype(
         list(
             _supported_dtypes_from_frame(

From 7795929f3f23bc1304f5eb792f1f2898f8f623de Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 21 Apr 2023 11:43:02 -0700
Subject: [PATCH 3/7] Fix _has_nulls return value and call sites; return early on empty input

---
 python/cudf/cudf/core/frame.py             | 2 +-
 python/cudf/cudf/core/groupby/groupby.py   | 5 ++++-
 python/cudf/cudf/core/udf/groupby_utils.py | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 44a3b16bcd9..b67b71fc4fa 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -93,7 +93,7 @@ def _dtypes(self):
 
     @property
     def _has_nulls(self):
-        any(col.has_nulls() for col in self._data.values())
+        return any(col.has_nulls() for col in self._data.values())
 
     def serialize(self):
         header = {
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index b81187d130e..13dc7a55531 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1148,7 +1148,7 @@ def _jit_groupby_apply(
     ):
         # Nulls are not yet supported
         # TODO: don't check this twice under `engine='auto'`
-        if self.grouping.has_nulls:
+        if self.grouping._obj._has_nulls:
             raise ValueError("Nulls not yet supported with groupby JIT engine")
 
         chunk_results = jit_groupby_apply(
@@ -1291,6 +1291,9 @@ def mult(df):
         1  2     1
         2  3     1
         """
+
+        if self.obj.empty:
+            return self.obj
         if not callable(function):
             raise TypeError(f"type {type(function)} is not callable")
         group_names, offsets, group_keys, grouped_values = self._grouped()
diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py
index 16dbdb2d43a..3834e3ae92a 100644
--- a/python/cudf/cudf/core/udf/groupby_utils.py
+++ b/python/cudf/cudf/core/udf/groupby_utils.py
@@ -206,7 +206,7 @@ def jit_groupby_apply(offsets, grouped_values, function, *args):
 
 
 def _jit_groupby_eligible(frame, func, args):
-    return (not frame.has_nulls) and _can_be_jitted(frame, func, args)
+    return (not frame._has_nulls) and _can_be_jitted(frame, func, args)
 
 
 def _can_be_jitted(frame, func, args):

From 9dadf35eaa2decedce5c22623071ba080a902e3d Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 26 Apr 2023 10:00:40 -0700
Subject: [PATCH 4/7] update docs

---
 python/cudf/cudf/core/groupby/groupby.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index bcb89097fb4..d99f71c30fd 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1253,7 +1253,7 @@ def apply(self, function, *args, engine="auto"):
           on the grouped chunk.
         args : tuple
             Optional positional arguments to pass to the function.
-        engine: {'cudf', 'jit'}, default 'cudf'
+        engine: {'cudf', 'jit'}, default 'auto'
           Selects the GroupBy.apply implementation. Use `jit` to
           select the numba JIT pipeline. Only certain operations are allowed
           within the function when using this option: min, max, sum, mean, var,
@@ -1262,6 +1262,11 @@ def apply(self, function, *args, engine="auto"):
           `df['x'] * 2` is not yet allowed.
           For more information, see the `cuDF guide to user defined functions
           <https://docs.rapids.ai/api/cudf/stable/user_guide/guide-to-udfs.html>`__.
+          Use `cudf` to select the iterative groupby apply algorithm which aims
+          to provide maximum flexibility at the expense of performance.
+          The default value `auto` will attempt to use the numba JIT pipeline
+          where possible and will fall back to the iterative algorithm if
+          necessary.
 
         Examples
         --------

From 064f52ddb24cae8f3a150597bcf3a797f6822b1f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 26 Apr 2023 10:01:05 -0700
Subject: [PATCH 5/7] remove todo

---
 python/cudf/cudf/core/groupby/groupby.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index d99f71c30fd..1668bb16f78 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1164,7 +1164,6 @@ def _jit_groupby_apply(
         self, function, group_names, offsets, group_keys, grouped_values, *args
     ):
         # Nulls are not yet supported
-        # TODO: don't check this twice under `engine='auto'`
         if self.grouping._obj._has_nulls:
             raise ValueError("Nulls not yet supported with groupby JIT engine")
 

From ad4dbaa9efad40fe4801a41400927d1d518fa293 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 11 May 2023 07:47:32 -0700
Subject: [PATCH 6/7] Inline _jit_groupby_eligible into GroupBy.apply

---
 python/cudf/cudf/core/groupby/groupby.py   | 9 ++++-----
 python/cudf/cudf/core/udf/groupby_utils.py | 4 ----
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 1668bb16f78..b102a29d896 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -25,10 +25,7 @@
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.mixins import Reducible, Scannable
 from cudf.core.multiindex import MultiIndex
-from cudf.core.udf.groupby_utils import (
-    _jit_groupby_eligible,
-    jit_groupby_apply,
-)
+from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply
 from cudf.utils.utils import GetAttrGetItemMixin, _cudf_nvtx_annotate
 
 
@@ -1347,7 +1344,9 @@ def mult(df):
         group_names, offsets, group_keys, grouped_values = self._grouped()
 
         if engine == "auto":
-            if _jit_groupby_eligible(grouped_values, function, args):
+            if (not grouped_values._has_nulls) and _can_be_jitted(
+                grouped_values, function, args
+            ):
                 engine = "jit"
             else:
                 engine = "cudf"
diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py
index 0fd5cbf91db..8ccf7b710dc 100644
--- a/python/cudf/cudf/core/udf/groupby_utils.py
+++ b/python/cudf/cudf/core/udf/groupby_utils.py
@@ -204,10 +204,6 @@ def jit_groupby_apply(offsets, grouped_values, function, *args):
     return output
 
 
-def _jit_groupby_eligible(frame, func, args):
-    return (not frame._has_nulls) and _can_be_jitted(frame, func, args)
-
-
 def _can_be_jitted(frame, func, args):
     """
     Determine if this UDF is supported through the JIT engine

From 11d4f732a92161e1bc27ba3a3c60a15b5c4dd2f5 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <wence@gmx.li>
Date: Mon, 15 May 2023 09:34:35 +0100
Subject: [PATCH 7/7] Mention auto as a valid engine type

---
 python/cudf/cudf/core/groupby/groupby.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index b102a29d896..163b106d73c 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1249,7 +1249,7 @@ def apply(self, function, *args, engine="auto"):
           on the grouped chunk.
         args : tuple
             Optional positional arguments to pass to the function.
-        engine: {'cudf', 'jit'}, default 'auto'
+        engine: 'auto', 'cudf', or 'jit', default 'auto'
           Selects the GroupBy.apply implementation. Use `jit` to
           select the numba JIT pipeline. Only certain operations are allowed
           within the function when using this option: min, max, sum, mean, var,