Series apply method backed by masked UDFs (#9217)

Depends on #9174. Adds `Series.apply`, which applies a scalar UDF elementwise to the series data and returns a new series. Null sensitive. Works in terms of our numba `MaskedType` extension type. Similar to `pd.Series.apply`. Authors: - https://github.com/brandon-b-miller Approvers: - Ashwin Srinath (https://github.com/shwina) - H. Thomson Comer (https://github.com/thomcom) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: #9217
rapidsai · Oct 1, 2021 · 3648783 · 3648783
1 parent cf0b2ca
commit 3648783
Show file tree

Hide file tree

Showing 3 changed files with 208 additions and 3 deletions.
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -3341,6 +3341,94 @@ def _return_sentinel_series():
         return codes
 
     # UDF related
+    def apply(self, func, convert_dtype=True, args=(), **kwargs):
+        """
+        Apply a scalar function elementwise to the values of a Series.
+
+        Similar to `pandas.Series.apply`. The user defined function
+        is evaluated once per element of the series.
+
+        Parameters
+        ----------
+        func : function
+            Scalar Python function to apply.
+        convert_dtype : bool, default True
+            This argument is not inspected; cuDF always behaves as if
+            it were True. Because cuDF does not support arbitrary
+            object dtypes, the result will always be the common type
+            as determined by numba based on the function logic and
+            argument types. See examples for details.
+        args : tuple
+            Not supported; must be empty.
+        **kwargs
+            Not supported; must be empty.
+
+        Returns
+        -------
+        Series
+            The result of applying ``func`` to each element.
+
+        Raises
+        ------
+        ValueError
+            If ``args`` or ``kwargs`` are supplied.
+
+        Notes
+        -----
+        UDFs are cached in memory to avoid recompilation. The first
+        call to the UDF will incur compilation overhead.
+
+        Examples
+        --------
+
+        Apply a basic function to a series
+        >>> sr = cudf.Series([1,2,3])
+        >>> def f(x):
+        ...     return x + 1
+        >>> sr.apply(f)
+        0    2
+        1    3
+        2    4
+        dtype: int64
+
+        Apply a basic function to a series with nulls
+        >>> sr = cudf.Series([1,cudf.NA,3])
+        >>> def f(x):
+        ...     return x + 1
+        >>> sr.apply(f)
+        0       2
+        1    <NA>
+        2       4
+        dtype: int64
+
+        Use a function that does something conditionally,
+        based on whether the value is or is not null
+        >>> sr = cudf.Series([1,cudf.NA,3])
+        >>> def f(x):
+        ...     if x is cudf.NA:
+        ...         return 42
+        ...     else:
+        ...         return x - 1
+        >>> sr.apply(f)
+        0     0
+        1    42
+        2     2
+        dtype: int64
+
+        Results will be upcast to the common dtype required
+        as derived from the UDF's logic. Note that this means
+        the common type will be returned even if the data that
+        is passed would not result in any values of that
+        dtype.
+
+        >>> sr = cudf.Series([1,cudf.NA,3])
+        >>> def f(x):
+        ...     return x + 1.5
+        >>> sr.apply(f)
+        0     2.5
+        1    <NA>
+        2     4.5
+        dtype: float64
+        """
+        # Happy path first: with no extra arguments, hand the UDF to the
+        # parent class implementation.
+        if not (args or kwargs):
+            return super()._apply(func)
+
+        raise ValueError(
+            "UDFs using *args or **kwargs are not yet supported."
+        )
 
     def applymap(self, udf, out_dtype=None):
         """Apply an elementwise function to transform the values in the Column.

diff --git a/python/cudf/cudf/core/udf/pipeline.py b/python/cudf/cudf/core/udf/pipeline.py
@@ -130,7 +130,6 @@ def _kernel(retval, {input_columns}, {input_offsets}, size):
 def _define_function(df, scalar_return=False):
     # Create argument list for kernel
     input_columns = ", ".join([f"input_col_{i}" for i in range(len(df._data))])
-
     input_offsets = ", ".join([f"offset_{i}" for i in range(len(df._data))])
 
     # Create argument list to pass to device function
@@ -177,15 +176,17 @@ def compile_or_get(df, f):
     """
 
     # check to see if we already compiled this function
+    frame_dtypes = tuple(col.dtype for col in df._data.values())
     cache_key = (
-        *cudautils.make_cache_key(f, tuple(df.dtypes)),
+        *cudautils.make_cache_key(f, frame_dtypes),
         *(col.mask is None for col in df._data.values()),
     )
     if precompiled.get(cache_key) is not None:
         kernel, scalar_return_type = precompiled[cache_key]
         return kernel, scalar_return_type
 
-    numba_return_type = get_udf_return_type(f, df.dtypes)
+    numba_return_type = get_udf_return_type(f, frame_dtypes)
+
     _is_scalar_return = not isinstance(numba_return_type, MaskedType)
     scalar_return_type = (
         numba_return_type

diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -40,6 +40,15 @@ def run_masked_udf_test(func_pdf, func_gdf, data, **kwargs):
     assert_eq(expect, obtain, **kwargs)
 
 
+def run_masked_udf_series(func_psr, func_gsr, data, **kwargs):
+    """Compare ``Series.apply`` between pandas and cuDF.
+
+    ``data`` is a cuDF series. It is converted to a nullable pandas
+    series, ``func_psr`` is applied to the pandas copy and ``func_gsr``
+    to the cuDF original, and the two results are compared with
+    ``assert_eq``. Extra keyword arguments are forwarded to ``assert_eq``.
+    """
+    # pandas result is the reference value
+    expected = data.to_pandas(nullable=True).apply(func_psr)
+    actual = data.apply(func_gsr)
+    assert_eq(expected, actual, **kwargs)
+
+
 @pytest.mark.parametrize("op", arith_ops)
 def test_arith_masked_vs_masked(op):
     # This test should test all the typing
@@ -314,3 +323,110 @@ def func_gdf(w, x, y, z):
         }
     )
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
+
+
+###
+
+
+@pytest.mark.parametrize(
+    "data", [cudf.Series([1, 2, 3]), cudf.Series([1, cudf.NA, 3])]
+)
+def test_series_apply_basic(data):
+    """Adding one via ``Series.apply`` agrees with pandas.
+
+    Parametrized over a series without nulls and one containing a null.
+    """
+
+    def add_one(value):
+        return value + 1
+
+    # The same UDF is used for both the pandas and the cuDF side.
+    run_masked_udf_series(add_one, add_one, data, check_dtype=False)
+
+
+def test_series_apply_null_conditional():
+    """A UDF that branches on nullness agrees between pandas and cuDF.
+
+    The pandas version tests against ``pd.NA`` and the cuDF version
+    against ``cudf.NA``; otherwise the two functions are the same.
+    """
+
+    def pandas_udf(value):
+        if value is pd.NA:
+            return 42
+        return value - 1
+
+    def cudf_udf(value):
+        if value is cudf.NA:
+            return 42
+        return value - 1
+
+    series = cudf.Series([1, cudf.NA, 3])
+
+    run_masked_udf_series(pandas_udf, cudf_udf, series)
+
+
+###
+
+
+@pytest.mark.parametrize("op", arith_ops)
+def test_series_arith_masked_vs_masked(op):
+    """Each arithmetic op applied to an element and itself matches pandas."""
+
+    def combine_with_self(value):
+        return op(value, value)
+
+    series = cudf.Series([1, cudf.NA, 3])
+    run_masked_udf_series(
+        combine_with_self, combine_with_self, series, check_dtype=False
+    )
+
+
+@pytest.mark.parametrize("op", comparison_ops)
+def test_series_compare_masked_vs_masked(op):
+    """
+    Each comparison op applied to an element and itself matches pandas.
+
+    In the series case there is only one other MaskedType to compare
+    with - the element itself.
+    """
+
+    def compare_with_self(value):
+        return op(value, value)
+
+    series = cudf.Series([1, cudf.NA, 3])
+    run_masked_udf_series(
+        compare_with_self, compare_with_self, series, check_dtype=False
+    )
+
+
+@pytest.mark.parametrize("op", arith_ops)
+@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA])
+def test_series_arith_masked_vs_constant(op, constant):
+    """Check ``op(x, constant)`` via ``Series.apply`` against pandas.
+
+    ``constant`` is an int, a float, or ``cudf.NA``. The
+    ``x ** NA`` combination is marked as an expected failure.
+    """
+
+    def func(x):
+        return op(x, constant)
+
+    # Single-column input that includes a null element.
+    data = cudf.Series([1, 2, cudf.NA])
+    if constant is cudf.NA and op is operator.pow:
+        # pandas evaluates 1**NA to 1, and the cuDF result is not expected
+        # to agree. ``pytest.xfail`` raises as soon as it is called (it is
+        # not a context manager), so call it directly rather than wrapping
+        # a comparison that would never execute.
+        pytest.xfail(
+            reason="pandas evaluates 1**NA to 1; cuDF result differs"
+        )
+    run_masked_udf_series(func, func, data, check_dtype=False)
+
+
+@pytest.mark.parametrize("op", arith_ops)
+@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA])
+def test_series_arith_masked_vs_constant_reflected(op, constant):
+    """Check ``op(constant, x)`` via ``Series.apply`` against pandas.
+
+    ``constant`` is an int, a float, or ``cudf.NA``. The
+    ``1 ** x`` combination is marked as an expected failure.
+    """
+
+    def func(x):
+        return op(constant, x)
+
+    # Single-column input that includes a null element.
+    data = cudf.Series([1, 2, cudf.NA])
+    # The ``is not cudf.NA`` check comes first so that ``constant == 1``
+    # is only evaluated for real numbers.
+    if constant is not cudf.NA and constant == 1 and op is operator.pow:
+        # pandas evaluates 1**NA to 1, and the cuDF result is not expected
+        # to agree. ``pytest.xfail`` raises as soon as it is called (it is
+        # not a context manager), so call it directly rather than wrapping
+        # a comparison that would never execute.
+        pytest.xfail(
+            reason="pandas evaluates 1**NA to 1; cuDF result differs"
+        )
+    run_masked_udf_series(func, func, data, check_dtype=False)
+
+
+def test_series_masked_is_null_conditional():
+    """Replacing nulls with a constant inside a UDF agrees with pandas.
+
+    Non-null elements are returned unchanged; null elements become 42.
+    The pandas version tests against ``pd.NA`` and the cuDF version
+    against ``cudf.NA``.
+    """
+
+    def pandas_udf(value):
+        if value is pd.NA:
+            return 42
+        return value
+
+    def cudf_udf(value):
+        if value is cudf.NA:
+            return 42
+        return value
+
+    series = cudf.Series([1, cudf.NA, 3, cudf.NA])
+
+    run_masked_udf_series(pandas_udf, cudf_udf, series, check_dtype=False)