
Implement groupby apply with JIT #11452

Merged (157 commits), Jan 27, 2023
Commits (157)
8db918f
Groupby Apply with JIT (First Commit)
bwyogatama Aug 3, 2022
2d6b4c9
Fix error in Pytest
bwyogatama Aug 4, 2022
fd8680e
JIT Caching Support
bwyogatama Aug 5, 2022
9220658
Add IdxMax and IdxMin
bwyogatama Aug 5, 2022
f4bc7c4
Add IdxMax and IdxMin
bwyogatama Aug 5, 2022
8659149
Dynamic Launch Parameter
bwyogatama Aug 14, 2022
b7ede43
Code cleanup #1
bwyogatama Aug 19, 2022
1dbbb77
Merge remote-tracking branch 'origin/branch-22.10' into fea-groupby-a…
vyasr Sep 23, 2022
f98fc63
Add support for building the JIT functions with the rest of the build.
vyasr Sep 23, 2022
11edd37
Make engine name consistent with tests
vyasr Sep 23, 2022
1e12416
Generalize compiled PTX selection for CUDA arch.
vyasr Sep 23, 2022
d348fb8
Cleanup of strings_udf PTX detection
vyasr Sep 23, 2022
795e580
Fix tests with some hacks so that we can start validating.
vyasr Sep 23, 2022
0ce0a90
Standardize the engine argument handling so that we get clear errors.
vyasr Sep 23, 2022
3493d49
Update style.
vyasr Sep 24, 2022
7f9ea1f
Refactoring C++ function
bwyogatama Oct 26, 2022
3d76a44
Fix bug in C++ and Python Cleanup
Nov 2, 2022
ad878ac
merge latest, resolve conflicts, pass tests
brandon-b-miller Nov 21, 2022
d876ad7
pass style, cleanup
brandon-b-miller Nov 21, 2022
f600196
start to move files
brandon-b-miller Nov 21, 2022
6cbdaf8
starting to refactor
brandon-b-miller Nov 22, 2022
3a11fe1
continue to refactor typing
brandon-b-miller Nov 28, 2022
8161548
move lowering to its own file
brandon-b-miller Nov 28, 2022
52656ab
continue refactoring idxmin and idxmax functions
brandon-b-miller Nov 28, 2022
b9096f3
add tests for idxmin and idxmax, not currently passing
brandon-b-miller Nov 28, 2022
d21a099
normalize call_cuda_functions keys
brandon-b-miller Nov 28, 2022
62aad1e
continued refactoring
brandon-b-miller Nov 28, 2022
9ff058a
refactoring lowering
brandon-b-miller Nov 28, 2022
5f07ca2
continued refactoring
brandon-b-miller Nov 29, 2022
38c3560
Merge branch 'branch-23.02' into groupby-apply-updates
brandon-b-miller Dec 5, 2022
c12a9e3
CMake changes
bwyogatama Jan 2, 2023
e650c21
C++ changes
bwyogatama Jan 2, 2023
0fd6e22
Merge branch 'branch-23.02' into fea-groupby-apply-jit
brandon-b-miller Jan 5, 2023
aabdc5f
Merge branch 'branch-23.02' into fea-groupby-apply-jit
brandon-b-miller Jan 6, 2023
301eea1
style
brandon-b-miller Jan 6, 2023
e50f4a6
found the bug
brandon-b-miller Jan 8, 2023
df1485d
minor refactoring
brandon-b-miller Jan 8, 2023
353078c
Merge branch 'branch-23.02' into fea-groupby-apply-jit
brandon-b-miller Jan 8, 2023
14fe3cb
update/add tests
brandon-b-miller Jan 9, 2023
f7791b4
continue refactoring tests
brandon-b-miller Jan 9, 2023
7f63c90
add docs, switch to partials
brandon-b-miller Jan 12, 2023
902223a
continue addressing reviews
brandon-b-miller Jan 17, 2023
78f8b6f
Update python/cudf/cudf/core/udf/groupby_typing.py
brandon-b-miller Jan 17, 2023
611b864
Merge branch 'fea-groupby-apply-jit' of github.com:bwyogatama/cudf in…
brandon-b-miller Jan 17, 2023
2849680
merge remote
brandon-b-miller Jan 17, 2023
33109f5
address more reviews
brandon-b-miller Jan 17, 2023
07444eb
move utilities around
brandon-b-miller Jan 17, 2023
39eb8f9
template throughout c++
brandon-b-miller Jan 18, 2023
f30cd8b
Merge branch 'branch-23.02' into fea-groupby-apply-jit
brandon-b-miller Jan 18, 2023
6158cb7
cpp code formatting
PointKernel Jan 18, 2023
93df707
Merge branch 'fea-groupby-apply-jit' of github.com:bwyogatama/cudf in…
brandon-b-miller Jan 18, 2023
5ae896a
style
brandon-b-miller Jan 18, 2023
a42d307
Use cudf size_type
PointKernel Jan 18, 2023
0e0b750
Use std limits instead of macros
PointKernel Jan 18, 2023
e891e5f
merge latest, resolve conflicts, don't unset warnings
brandon-b-miller Jan 19, 2023
8188508
Merge branch 'fea-groupby-apply-jit' of github.com:bwyogatama/cudf in…
brandon-b-miller Jan 19, 2023
95fa402
remove redundant comments
brandon-b-miller Jan 19, 2023
3de3add
style fixes
brandon-b-miller Jan 19, 2023
865bb5d
debug statements
brandon-b-miller Jan 20, 2023
7788944
patch numba linker based off groupby ptx file
brandon-b-miller Jan 20, 2023
bdea84c
Fix idxmin/max bug
PointKernel Jan 20, 2023
0110075
Use static_cast to avoid raw casting
PointKernel Jan 20, 2023
b039ce7
Cleanups: reinterpret_cast + remove redundant sync
PointKernel Jan 20, 2023
a8c3a75
Merge branch 'branch-23.02' into fea-groupby-apply-jit
brandon-b-miller Jan 20, 2023
21792c6
Merge branch 'fea-groupby-apply-jit' of github.com:bwyogatama/cudf in…
brandon-b-miller Jan 20, 2023
14dc674
Simplify block min/max logic
PointKernel Jan 20, 2023
3a48a96
Merge branch 'fea-groupby-apply-jit' of github.com:bwyogatama/cudf in…
PointKernel Jan 20, 2023
9766233
Merge remote-tracking branch 'upstream/branch-23.02' into groupby
PointKernel Jan 20, 2023
99af3f2
Replace custom atomic add with cuda atomic_ref
PointKernel Jan 20, 2023
321fdab
Simplify block sum logic
PointKernel Jan 20, 2023
c91a589
Simplify block var logic
PointKernel Jan 20, 2023
aa47763
Merge branch 'branch-23.02' into fea-groupby-apply-jit
brandon-b-miller Jan 20, 2023
cbc13e6
Refactor block idxmin/idxmax
PointKernel Jan 20, 2023
ab20731
Fix a minor bug
PointKernel Jan 20, 2023
f721f9c
Merge branch 'fea-groupby-apply-jit' of github.com:bwyogatama/cudf in…
PointKernel Jan 20, 2023
a24f09e
Fix the floating point min value bug
PointKernel Jan 20, 2023
d908621
Refactor with CUDA CG
PointKernel Jan 20, 2023
595746a
C++ changes: (1) Addressing more reviewer's comment, (2) Replacing cu…
bwyogatama Jan 22, 2023
9a93af7
Merge branch 'branch-23.02' into fea-groupby-apply-jit
brandon-b-miller Jan 23, 2023
9d3c431
Merge branch 'fea-groupby-apply-jit' of github.com:bwyogatama/cudf in…
brandon-b-miller Jan 23, 2023
bb8b7c3
style
brandon-b-miller Jan 23, 2023
1f475f0
Use proper cuda thread scope
PointKernel Jan 23, 2023
0bb68bc
Merge remote-tracking branch 'upstream/branch-23.02' into groupby
PointKernel Jan 23, 2023
209188f
Request cpp review for udf_cpp
PointKernel Jan 23, 2023
826ed25
Remove unsupported data types
PointKernel Jan 23, 2023
1cf91ea
Use exclusively thread 0 to write var output + minor cleanups
PointKernel Jan 23, 2023
381dd00
Addressing more reviewer's commment and fix mean bug
bwyogatama Jan 23, 2023
ee87548
error if nulls are present, dont not patch numba linker in strings_udf
brandon-b-miller Jan 23, 2023
1d4edc8
Style
brandon-b-miller Jan 23, 2023
4ff80f4
Merge branch 'fea-groupby-apply-jit' of github.com:bwyogatama/cudf in…
PointKernel Jan 23, 2023
4fe21fb
Replace int with int32_t
PointKernel Jan 23, 2023
3fbe3ff
Remove unused template specializations
PointKernel Jan 23, 2023
b59d31e
Merge remote-tracking branch 'upstream/branch-23.02' into groupby
PointKernel Jan 23, 2023
9af3670
update utility functions to no longer be strings_udf specific
brandon-b-miller Jan 24, 2023
afd0949
tweak thread guard logic in groupby template
brandon-b-miller Jan 24, 2023
c05e889
Merge branch 'branch-23.02' into fea-groupby-apply-jit
brandon-b-miller Jan 24, 2023
1828ef7
Apply suggestions from code review
brandon-b-miller Jan 24, 2023
6489950
fix small bug
brandon-b-miller Jan 24, 2023
9b83d78
refactor group constructor lowering
brandon-b-miller Jan 24, 2023
73a2ba1
partially address reviews
brandon-b-miller Jan 24, 2023
4dfb790
continue addressing reviews
brandon-b-miller Jan 24, 2023
43be944
extraneous copyright
brandon-b-miller Jan 24, 2023
b5f8f63
fix small comment error in cmake
brandon-b-miller Jan 24, 2023
8bbd725
inline _is_jit_supported_type
brandon-b-miller Jan 24, 2023
2df3216
adjust logic in maybe_patch_numba_linker
brandon-b-miller Jan 24, 2023
e8137e3
Apply suggestions from code review
PointKernel Jan 24, 2023
c5e7445
Correct numerical limits
PointKernel Jan 24, 2023
761261c
Apply suggestions from code review
brandon-b-miller Jan 24, 2023
9884897
fix small syntax error
brandon-b-miller Jan 24, 2023
41b42c7
add an updater to update-version.sh
brandon-b-miller Jan 24, 2023
5855f5c
refactor groupby.apply top level impl into separate methods
brandon-b-miller Jan 24, 2023
d6a3ef2
GroupType.size_type -> GroupType.group_size_type
brandon-b-miller Jan 24, 2023
9b60a62
introduce group_size_type as a global
brandon-b-miller Jan 24, 2023
f0a9af8
use index_default_type in idxmax/idxmin lowering
brandon-b-miller Jan 24, 2023
6708655
rename some utility functions and add docs
brandon-b-miller Jan 24, 2023
3e5149d
tweak previous function
brandon-b-miller Jan 24, 2023
c253b8f
Addressing reviewers' comments
bwyogatama Jan 24, 2023
51d7ec7
Merge branch 'fea-groupby-apply-jit' of https://github.com/bwyogatama…
bwyogatama Jan 24, 2023
bae845d
unused import
brandon-b-miller Jan 24, 2023
e91b641
Replace std numeric limits with cudf device operators
PointKernel Jan 24, 2023
0d35554
Merge branch 'fea-groupby-apply-jit' of github.com:bwyogatama/cudf in…
PointKernel Jan 24, 2023
73892e1
add tests for special values
brandon-b-miller Jan 24, 2023
08cbcb7
Merge branch 'fea-groupby-apply-jit' of github.com:bwyogatama/cudf in…
brandon-b-miller Jan 24, 2023
81bfeb1
Apply suggestions from code review
PointKernel Jan 25, 2023
3d76481
Code formatting
PointKernel Jan 25, 2023
97490af
Add more special value tests
PointKernel Jan 25, 2023
43694f7
Fix bugs when all values are nans
PointKernel Jan 25, 2023
928d404
Fix a result init bug in are_all_nans
PointKernel Jan 25, 2023
8079047
separate out idmax and idxmin tests with special values
brandon-b-miller Jan 25, 2023
11c0eb6
remove redundant tests
brandon-b-miller Jan 25, 2023
3a5afa6
answer is the offsets not just an array of zeroes
brandon-b-miller Jan 25, 2023
4d719b5
dynamically register reductions
brandon-b-miller Jan 25, 2023
5c5e37c
Add corner cases handling to idxmin/idxmax
PointKernel Jan 25, 2023
fac8d70
Apply suggestions from code review
PointKernel Jan 25, 2023
2f9cc76
Remove unroll pragma
PointKernel Jan 25, 2023
961b3b9
Merge branch 'fea-groupby-apply-jit' of github.com:bwyogatama/cudf in…
PointKernel Jan 25, 2023
5e6d4aa
Merge remote-tracking branch 'upstream/branch-23.02' into groupby
PointKernel Jan 25, 2023
6665ef9
address remaining reviews
brandon-b-miller Jan 26, 2023
62b5a99
Merge branch 'fea-groupby-apply-jit' of github.com:bwyogatama/cudf in…
brandon-b-miller Jan 26, 2023
62a8928
fix tests
brandon-b-miller Jan 26, 2023
de6b54c
Code formatting
PointKernel Jan 26, 2023
0c3d5a0
go back to index_default_type = types.int64 explicitly
brandon-b-miller Jan 26, 2023
b0e8c29
Merge branch 'fea-groupby-apply-jit' of github.com:bwyogatama/cudf in…
brandon-b-miller Jan 26, 2023
5a5c0fb
merge resolve conflicts
brandon-b-miller Jan 26, 2023
5db0b6c
style
brandon-b-miller Jan 26, 2023
7e2ca13
Update python/cudf/udf_cpp/groupby/CMakeLists.txt
brandon-b-miller Jan 26, 2023
40b8ce9
Cast mean results to double
PointKernel Jan 26, 2023
bb8e0c3
Merge branch 'fea-groupby-apply-jit' of github.com:bwyogatama/cudf in…
PointKernel Jan 26, 2023
0952784
Merge remote-tracking branch 'upstream/branch-23.02' into groupby
PointKernel Jan 26, 2023
e0d0230
Merge branch 'branch-23.02' into fea-groupby-apply-jit
PointKernel Jan 26, 2023
81860c5
address reviews
brandon-b-miller Jan 27, 2023
2f352bc
minor edits
brandon-b-miller Jan 27, 2023
0b407c8
Compute blockstd via blockvar
PointKernel Jan 27, 2023
568ab97
Merge branch 'fea-groupby-apply-jit' of github.com:bwyogatama/cudf in…
PointKernel Jan 27, 2023
83f8d88
Merge remote-tracking branch 'upstream/branch-23.02' into groupby
PointKernel Jan 27, 2023
dbd5eeb
Use atomic operations to avoid concurrent writes
PointKernel Jan 27, 2023
eaa8ff7
Use int64_t atomic ref
PointKernel Jan 27, 2023
3 changes: 2 additions & 1 deletion .github/CODEOWNERS
@@ -1,5 +1,6 @@
 #cpp code owners
-cpp/ @rapidsai/cudf-cpp-codeowners
+cpp/ @rapidsai/cudf-cpp-codeowners
+python/cudf/udf_cpp/ @rapidsai/cudf-cpp-codeowners
 
 #python code owners
 python/ @rapidsai/cudf-python-codeowners
1 change: 1 addition & 0 deletions .gitignore
@@ -31,6 +31,7 @@ python/cudf/*/_cuda/*.cpp
 python/cudf/*.ipynb
 python/cudf/.ipynb_checkpoints
 python/*/record.txt
+python/cudf/cudf/core/udf/*.ptx
 python/cudf_kafka/*/_lib/**/*.cpp
 python/cudf_kafka/*/_lib/**/*.h
 python/custreamz/*/_lib/**/*.cpp
11 changes: 9 additions & 2 deletions python/cudf/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -17,6 +17,8 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
 set(cudf_version 23.02.00)
 
 include(../../fetch_rapids.cmake)
+include(rapids-cuda)
+rapids_cuda_init_architectures(cudf-python)
 
 project(
   cudf-python
@@ -25,7 +27,11 @@ project(
   # language to be enabled here. The test project that is built in scikit-build to verify
   # various linking options for the python library is hardcoded to build with C, so until
   # that is fixed we need to keep C.
-  C CXX
+  C
+  CXX
+  # Temporarily enabling for groupby UDFs compilation until we come up with a better
Review comment (Contributor): Can we do two things?

  1. Open an issue tracking this temporary problem.
  2. Mark it with a TODO, i.e. change the comment to
     "# TODO: Temporarily enabling for groupby UDFs compilation until we come up with a better"

So that this doesn't disappear into the mists of time.

Reply (Contributor): Is this even a problem anymore? It's a hard requirement now and needs to be enabled to build the python package, so I'd assume it belongs here indefinitely at this point unless this is hacky for some reason.
+  # solution.
+  CUDA
 )

option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files"
@@ -117,6 +123,7 @@ endif()
 rapids_cython_init()
 
 add_subdirectory(cudf/_lib)
+add_subdirectory(udf_cpp/groupby)
 
 include(cmake/Modules/ProtobufHelpers.cmake)
 codegen_protoc(cudf/utils/metadata/orc_column_statistics.proto)
11 changes: 9 additions & 2 deletions python/cudf/cudf/__init__.py
@@ -1,8 +1,9 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2023, NVIDIA CORPORATION.
 
 from cudf.utils.gpu_utils import validate_setup
 
 validate_setup()
+import os
Review comment (Contributor): Unused import. (I think flake8 might be disabled on this file... it should have caught that.) Suggested change: remove the `import os` line.

 import cupy
 from numba import config as numba_config, cuda
@@ -88,7 +89,13 @@
     pass
 else:
     # Patch Numba to support CUDA enhanced compatibility.
-    patch_numba_linker_if_needed()
+    # cuDF requires a stronger set of conditions than what is
+    # checked by patch_numba_linker_if_needed due to the PTX
+    # files needed for JIT Groupby Apply and string UDFs
+    from cudf.core.udf.utils import _setup_numba_linker
+
+    _setup_numba_linker(os.path.dirname(__file__) + "/core/udf/", "function_")
 
     del patch_numba_linker_if_needed
 
 cuda.set_memory_manager(rmm.RMMNumbaManager)
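The patched setup above hands the linker a directory derived from the package's `__file__`. A small pure-Python sketch of that path construction (the install path below is a made-up example, and `os.path.join` stands in for the string concatenation used in the diff):

```python
import os

# Illustrative sketch: derive the PTX search directory that is handed to
# _setup_numba_linker from the package's __init__.py location.
# The package_file path is hypothetical, not a real install location.
package_file = "/opt/env/lib/python3.10/site-packages/cudf/__init__.py"
ptx_dir = os.path.join(os.path.dirname(package_file), "core/udf/")
print(ptx_dir)  # /opt/env/lib/python3.10/site-packages/cudf/core/udf/
```

This mirrors `os.path.dirname(__file__) + "/core/udf/"` in the diff; the second argument, `"function_"`, is the prefix the linker setup looks for in the shipped PTX files.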
57 changes: 40 additions & 17 deletions python/cudf/cudf/core/groupby/groupby.py
@@ -23,6 +23,7 @@
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.mixins import Reducible, Scannable
 from cudf.core.multiindex import MultiIndex
+from cudf.core.udf.groupby_utils import jit_groupby_apply
 from cudf.utils.utils import GetAttrGetItemMixin, _cudf_nvtx_annotate
@@ -786,14 +787,19 @@ def pipe(self, func, *args, **kwargs):
"""
return cudf.core.common.pipe(self, func, *args, **kwargs)

def apply(self, function, *args):
def apply(self, function, *args, engine="cudf"):
brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
bdice marked this conversation as resolved.
Show resolved Hide resolved
"""Apply a python transformation function over the grouped chunk.

Parameters
----------
func : function
brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
The python transformation function that will be applied
on the grouped chunk.
engine: {'cudf', 'jit'}, default 'cudf'
Selects the GroupBy.apply implementation. Use `jit` to
select the numba JIT pipeline.
For more information, see the `cuDF guide to user defined functions
Review comment (Contributor): It sounds like this link should explain the Numba JIT pipeline, but it doesn't. Will we update that notebook in this PR (or before the 23.02 release)? If we don't have time to update that notebook, maybe we can explain the choice of 'cudf' or 'jit' a little more in this docstring. Currently I don't think users will know enough to decide which is appropriate.

Reply (Contributor): There will definitely be a notebook update asap! It's in progress. I'll expound a bit here regardless though.
+            <https://docs.rapids.ai/api/cudf/stable/user_guide/guide-to-udfs.html>`__.
 
         Examples
         --------
@@ -855,25 +861,40 @@ def mult(df):
             raise TypeError(f"type {type(function)} is not callable")
         group_names, offsets, group_keys, grouped_values = self._grouped()
 
-        ngroups = len(offsets) - 1
-        if ngroups > self._MAX_GROUPS_BEFORE_WARN:
-            warnings.warn(
-                f"GroupBy.apply() performance scales poorly with "
-                f"number of groups. Got {ngroups} groups."
-            )
-
-        chunks = [
-            grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:])
-        ]
-        chunk_results = [function(chk, *args) for chk in chunks]
-        if not len(chunk_results):
-            return self.obj.head(0)
-
-        if cudf.api.types.is_scalar(chunk_results[0]):
-            result = cudf.Series(chunk_results, index=group_names)
-            result.index.names = self.grouping.names
-        else:
-            if isinstance(chunk_results[0], cudf.Series) and isinstance(
+        if engine == "jit":
+            # Nulls are not yet supported
+            for colname in self.grouping.values._data.keys():
+                if self.obj._data[colname].has_nulls():
+                    raise ValueError(
+                        "Nulls not yet supported with groupby JIT engine"
+                    )
+
+            chunk_results = jit_groupby_apply(
+                offsets, grouped_values, function, *args
+            )
+            result = cudf.Series(chunk_results, index=group_names)
+            result.index.names = self.grouping.names
+            result = result.reset_index()
+            result[None] = result.pop(0)
+        elif engine == "cudf":
+            ngroups = len(offsets) - 1
+            if ngroups > self._MAX_GROUPS_BEFORE_WARN:
+                warnings.warn(
+                    f"GroupBy.apply() performance scales poorly with "
+                    f"number of groups. Got {ngroups} groups."
Review comment (Contributor): Should we update this message with a suggestion to try using engine="jit"?

Reply (Contributor): Added a note about this.
+                )
+
+            chunks = [
+                grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:])
+            ]
+            chunk_results = [function(chk, *args) for chk in chunks]
Review comment (Contributor): We know this is slow because it involves a lot of Python invocations of GPU kernels when there are many groups. However, have we tried using some thread parallelism or other means to reduce/conceal that overhead? Out of scope for this PR but I would like to know that we've made an attempt at improving this code path since we can't always use "jit".

Reply (Contributor): We've tried streams and a number of other approaches. @shwina led the charge on this a while back I believe. I think the result was that the python overhead was dominating the runtime regardless of being able to parallelize over kernel launches somewhat. @wence- had some more advanced ideas we brainstormed earlier this year for a more holistic groupby.apply combining a few elements from across our learnings I believe. Maybe time to restart that conversation. Ultimately though the lack of traction with any previous approach is what led to this PR in the first place.

Reply (Contributor): As Brandon says, the fact that our previous attempts didn't really improve much is the reason we ended up moving towards JIT. @isVoid put together the streams prototype IIRC. To really improve the performance we'll need to reduce Python overhead all the way down the call stack.
+            if not len(chunk_results):
+                return self.obj.head(0)
+
+            if cudf.api.types.is_scalar(chunk_results[0]):
+                result = cudf.Series(chunk_results, index=group_names)
+                result.index.names = self.grouping.names
+            elif isinstance(chunk_results[0], cudf.Series) and isinstance(
                 self.obj, cudf.DataFrame
             ):
                 result = cudf.concat(chunk_results, axis=1).T
@@ -884,6 +905,8 @@ def mult(df):
                 index_data = group_keys._data.copy(deep=True)
                 index_data[None] = grouped_values.index._column
                 result.index = cudf.MultiIndex._from_data(index_data)
+        else:
+            raise ValueError(f"Unsupported engine '{engine}'")
 
         if self._sort:
             result = result.sort_index()
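The "cudf" engine branch above slices the grouped values into per-group chunks using the group offsets, then calls the UDF once per chunk. A pure-Python sketch of that slicing, with a plain list standing in for the grouped cudf frame and made-up illustrative values:

```python
# Pure-Python sketch of the offsets-based chunking in the "cudf" engine
# path; grouped_values and offsets here are hypothetical example data.
grouped_values = [10, 11, 20, 21, 22, 30]
offsets = [0, 2, 5, 6]  # group i spans grouped_values[offsets[i]:offsets[i+1]]

# mirrors: chunks = [grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:])]
chunks = [grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:])]
print(chunks)  # [[10, 11], [20, 21, 22], [30]]

# mirrors: chunk_results = [function(chk, *args) for chk in chunks]
# with max() as a stand-in UDF
chunk_results = [max(chunk) for chunk in chunks]
print(chunk_results)  # [11, 22, 30]
```

The per-chunk Python call is exactly the overhead the "jit" engine avoids: `jit_groupby_apply` hands the same offsets and values to a single compiled kernel instead of looping in Python.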
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/udf/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 
 from functools import lru_cache
 
@@ -9,7 +9,7 @@
 from cudf.core.udf import api, row_function, utils
 from cudf.utils.dtypes import STRING_TYPES
 
-from . import masked_lowering, masked_typing
+from . import groupby_lowering, groupby_typing, masked_lowering, masked_typing
 
 _units = ["ns", "ms", "us", "s"]
 _datetime_cases = {types.NPDatetime(u) for u in _units}
169 changes: 169 additions & 0 deletions python/cudf/cudf/core/udf/groupby_lowering.py
@@ -0,0 +1,169 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION.

from functools import partial

from numba import types
from numba.core import cgutils
from numba.core.extending import lower_builtin
from numba.core.typing import signature as nb_signature
from numba.cuda.cudaimpl import lower as cuda_lower

from cudf.core.udf.groupby_typing import (
    SUPPORTED_GROUPBY_NUMBA_TYPES,
    Group,
    GroupType,
    call_cuda_functions,
    index_default_type,
)


def lowering_function(context, builder, sig, args, function):
    """
    Instruction boilerplate used for calling a groupby reduction
    __device__ function. Centers around a forward declaration of
    this function and adds the pre/post processing instructions
    necessary for calling it.
    """
    # return type
    retty = sig.return_type

    # a variable logically corresponding to the calling `Group`
    grp = cgutils.create_struct_proxy(sig.args[0])(
        context, builder, value=args[0]
    )

    # what specific (numba) GroupType
    grp_type = sig.args[0]
    group_dataty = grp_type.group_data_type

    # logically take the address of the group's data pointer
    group_data_ptr = builder.alloca(grp.group_data.type)
    builder.store(grp.group_data, group_data_ptr)

    # obtain the correct forward declaration from registry
    type_key = (sig.return_type, grp_type.group_scalar_type)
    func = call_cuda_functions[function][type_key]

    # insert the forward declaration and return its result
    # pass it the data pointer and the group's size
    return context.compile_internal(
        builder,
        func,
        nb_signature(retty, group_dataty, grp_type.size_type),
        (builder.load(group_data_ptr), grp.size),
    )
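The registry lookup in `lowering_function` is a two-level dictionary: reduction name first, then a `(return_type, group_scalar_type)` key. A pure-Python sketch of that pattern, with string type names and trivial callables standing in for real numba types and device-function declarations:

```python
# Sketch of the call_cuda_functions registry lookup; keys and
# implementations are illustrative stand-ins, not the real cudf tables.
call_cuda_functions = {
    "max": {("float64", "float64"): lambda data, size: "BlockMax_float64"},
    "sum": {("int64", "int64"): lambda data, size: "BlockSum_int64"},
}

def lookup(function, return_type, scalar_type):
    # mirrors: func = call_cuda_functions[function][type_key]
    return call_cuda_functions[function][(return_type, scalar_type)]

impl = lookup("max", "float64", "float64")
print(impl(None, 0))  # BlockMax_float64
```

Keying on the type pair is what lets one generic lowering body dispatch to a separately compiled device function per dtype.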


@lower_builtin(Group, types.Array, types.int64, types.Array)
def group_constructor(context, builder, sig, args):
    """
    Instruction boilerplate used for instantiating a Group
    struct from a data pointer, an index pointer, and a size
    """

    group_data, size, index = args

    # a variable logically corresponding to the calling `Group`
    grp = cgutils.create_struct_proxy(sig.return_type)(context, builder)

    # the group data array and its pointer
    arr_group_data = cgutils.create_struct_proxy(sig.args[0])(
        context, builder, value=group_data
    )
    group_data_ptr = arr_group_data.data

    # the group index array and its pointer
    arr_index = cgutils.create_struct_proxy(sig.args[2])(
        context, builder, value=index
    )
    index_ptr = arr_index.data

    # fill the struct explicitly
    grp.group_data = group_data_ptr
    grp.index = index_ptr
    grp.size = size

    # return the struct by value
    return grp._getvalue()

Review comment (Contributor): Maybe just my stylistic preference, but you may as well assign to the struct members directly rather than storing in intermediates (including replacing size with grp.size in the tuple unpacking of args, which will require a slight reordering to create the group before that unpacking). IMO the extra redirection only reduces readability in this case.

Reply (Contributor): I refactored this as you suggested, it's definitely better now!


Review comment (Contributor): Why is Group capitalized? This seems unusual for a function name. Same for variables below.

Reply (Contributor): I think this was changed by the time this comment landed. Let me know if the new names look good to you. Happy to iterate still :)

Review comment (Contributor): Would it be possible for this function to call lowering_function, or for both of these to use a common helper function? It would be nice to reduce duplication. This function name should also mirror lowering_function so that it's obvious that this is a specialized version of that. Right now the two seem like completely independent functions based on the names.

Reply (Contributor): I didn't manage to come up with a way of fusing these that really reduced the lines of code. I found that the addition of the extra index argument and what's needed to form it makes it hard to excise much common code in a way that seems helpful. Open to suggestions here but leaving as-is for now.

def cuda_Group_idx_max_or_min(context, builder, sig, args, function):
"""
Instruction boilerplate used for calling a groupby reduction
__device__ function in the case where the function is either
`idxmax` or `idxmin`. See `lowering_function` for details. This
lowering differs from other reductions due to the presence of
the index. This results in the forward declaration expecting
an extra arg.
"""
retty = sig.return_type

grp = cgutils.create_struct_proxy(sig.args[0])(
context, builder, value=args[0]
)
grp_type = sig.args[0]

if grp_type.index_type != index_default_type:
brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
raise TypeError(
f"Only inputs with default index dtype {index_default_type} "
"are supported."
)

group_dataty = grp_type.group_data_type
group_data_ptr = builder.alloca(grp.group_data.type)
builder.store(grp.group_data, group_data_ptr)

index_dataty = grp_type.group_index_type
index_ptr = builder.alloca(grp.index.type)
builder.store(grp.index, index_ptr)
type_key = (types.int64, grp_type.group_scalar_type)
func = call_cuda_functions[function][type_key]

return context.compile_internal(
builder,
func,
nb_signature(retty, group_dataty, index_dataty, grp_type.size_type),
(builder.load(group_data_ptr), builder.load(index_ptr), grp.size),
)


cuda_Group_max = partial(lowering_function, function="max")
cuda_Group_min = partial(lowering_function, function="min")
cuda_Group_sum = partial(lowering_function, function="sum")
cuda_Group_mean = partial(lowering_function, function="mean")
cuda_Group_std = partial(lowering_function, function="std")
cuda_Group_var = partial(lowering_function, function="var")

cuda_Group_idxmax = partial(cuda_Group_idx_max_or_min, function="idxmax")
cuda_Group_idxmin = partial(cuda_Group_idx_max_or_min, function="idxmin")


def cuda_Group_size(context, builder, sig, args):
    grp = cgutils.create_struct_proxy(sig.args[0])(
        context, builder, value=args[0]
    )
    return grp.size


def cuda_Group_count(context, builder, sig, args):
    grp = cgutils.create_struct_proxy(sig.args[0])(
        context, builder, value=args[0]
    )
    return grp.size


for ty in SUPPORTED_GROUPBY_NUMBA_TYPES:
    cuda_lower("GroupType.max", GroupType(ty))(cuda_Group_max)
    cuda_lower("GroupType.min", GroupType(ty))(cuda_Group_min)
    cuda_lower("GroupType.sum", GroupType(ty))(cuda_Group_sum)
    cuda_lower("GroupType.count", GroupType(ty))(cuda_Group_count)
    cuda_lower("GroupType.size", GroupType(ty))(cuda_Group_size)
    cuda_lower("GroupType.mean", GroupType(ty))(cuda_Group_mean)
    cuda_lower("GroupType.std", GroupType(ty))(cuda_Group_std)
    cuda_lower("GroupType.var", GroupType(ty))(cuda_Group_var)
    cuda_lower("GroupType.idxmax", GroupType(ty, types.int64))(
        cuda_Group_idxmax
    )
    cuda_lower("GroupType.idxmin", GroupType(ty, types.int64))(
        cuda_Group_idxmin
    )
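The registration loop uses numba's `cuda_lower(...)(impl)` idiom: calling the registrar returns a decorator that is immediately applied to the implementation, recording it under a (name, type signature) key. A pure-Python sketch of that call-then-apply shape, with a plain dict standing in for numba's internal lowering tables:

```python
# Sketch of the registrar-returns-a-decorator pattern behind
# cuda_lower(...)(impl); registry and the type strings are
# illustrative stand-ins for numba's internal machinery.
registry = {}

def lower(name, ty):
    def decorator(impl):
        registry[(name, ty)] = impl
        return impl
    return decorator

def group_max_impl(context, builder, sig, args):
    return "max lowered"

# register one implementation for several element types
for ty in ("int64", "float64"):
    lower("GroupType.max", ty)(group_max_impl)

print(registry[("GroupType.max", "float64")] is group_max_impl)  # True
```

Looping over `SUPPORTED_GROUPBY_NUMBA_TYPES` this way registers every reduction for every supported dtype without writing one decorated function per combination.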