Skip to content

Commit

Permalink
Series apply method backed by masked UDFs (#9217)
Browse files Browse the repository at this point in the history
Depends on #9174

Adds `Series.apply`, which applies a scalar UDF elementwise to the series data and returns a new series. The UDF is null-aware: it is implemented in terms of our numba `MaskedType` extension type, so nulls are visible to (and propagated through) the function. The API is similar to `pd.Series.apply`.

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - H. Thomson Comer (https://github.com/thomcom)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #9217
  • Loading branch information
brandon-b-miller authored Oct 1, 2021
1 parent cf0b2ca commit 3648783
Show file tree
Hide file tree
Showing 3 changed files with 208 additions and 3 deletions.
88 changes: 88 additions & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3341,6 +3341,94 @@ def _return_sentinel_series():
return codes

# UDF related
def apply(self, func, convert_dtype=True, args=(), **kwargs):
    """
    Apply a scalar function to the values of a Series.

    Similar to `pandas.Series.apply`. Applies a user
    defined function elementwise over a series.

    Parameters
    ----------
    func : function
        Scalar Python function to apply.
    convert_dtype : bool, default True
        In cuDF, this parameter is always True. Because
        cuDF does not support arbitrary object dtypes,
        the result will always be the common type as determined
        by numba based on the function logic and argument types.
        See examples for details. The value passed here is not
        consulted by this method.
    args : tuple
        Not supported
    **kwargs
        Not supported

    Returns
    -------
    Series
        The result of applying ``func`` to each element
        (see Examples).

    Raises
    ------
    ValueError
        If ``args`` is non-empty or any ``**kwargs`` are passed.

    Notes
    -----
    UDFs are cached in memory to avoid recompilation. The first
    call to the UDF will incur compilation overhead.

    Examples
    --------
    Apply a basic function to a series

    >>> sr = cudf.Series([1,2,3])
    >>> def f(x):
    ...     return x + 1
    >>> sr.apply(f)
    0    2
    1    3
    2    4
    dtype: int64

    Apply a basic function to a series with nulls

    >>> sr = cudf.Series([1,cudf.NA,3])
    >>> def f(x):
    ...     return x + 1
    >>> sr.apply(f)
    0       2
    1    <NA>
    2       4
    dtype: int64

    Use a function that does something conditionally,
    based on if the value is or is not null

    >>> sr = cudf.Series([1,cudf.NA,3])
    >>> def f(x):
    ...     if x is cudf.NA:
    ...         return 42
    ...     else:
    ...         return x - 1
    >>> sr.apply(f)
    0     0
    1    42
    2     2
    dtype: int64

    Results will be upcast to the common dtype required
    as derived from the UDFs logic. Note that this means
    the common type will be returned even if such data
    is passed that would not result in any values of that
    dtype.

    >>> sr = cudf.Series([1,cudf.NA,3])
    >>> def f(x):
    ...     return x + 1.5
    >>> sr.apply(f)
    0     2.5
    1    <NA>
    2     4.5
    dtype: float64
    """
    # Extra positional/keyword arguments for the UDF are rejected up
    # front, before any delegation to the parent implementation.
    if args or kwargs:
        raise ValueError(
            "UDFs using *args or **kwargs are not yet supported."
        )

    # The actual work is delegated to the parent class's `_apply`.
    return super()._apply(func)

def applymap(self, udf, out_dtype=None):
"""Apply an elementwise function to transform the values in the Column.
Expand Down
7 changes: 4 additions & 3 deletions python/cudf/cudf/core/udf/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ def _kernel(retval, {input_columns}, {input_offsets}, size):
def _define_function(df, scalar_return=False):
# Create argument list for kernel
input_columns = ", ".join([f"input_col_{i}" for i in range(len(df._data))])

input_offsets = ", ".join([f"offset_{i}" for i in range(len(df._data))])

# Create argument list to pass to device function
Expand Down Expand Up @@ -177,15 +176,17 @@ def compile_or_get(df, f):
"""

# check to see if we already compiled this function
frame_dtypes = tuple(col.dtype for col in df._data.values())
cache_key = (
*cudautils.make_cache_key(f, tuple(df.dtypes)),
*cudautils.make_cache_key(f, frame_dtypes),
*(col.mask is None for col in df._data.values()),
)
if precompiled.get(cache_key) is not None:
kernel, scalar_return_type = precompiled[cache_key]
return kernel, scalar_return_type

numba_return_type = get_udf_return_type(f, df.dtypes)
numba_return_type = get_udf_return_type(f, frame_dtypes)

_is_scalar_return = not isinstance(numba_return_type, MaskedType)
scalar_return_type = (
numba_return_type
Expand Down
116 changes: 116 additions & 0 deletions python/cudf/cudf/tests/test_udf_masked_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,15 @@ def run_masked_udf_test(func_pdf, func_gdf, data, **kwargs):
assert_eq(expect, obtain, **kwargs)


def run_masked_udf_series(func_psr, func_gsr, data, **kwargs):
    """Compare ``Series.apply`` on pandas against ``Series.apply`` on cuDF.

    ``data`` is used directly as the cuDF-side input and is converted
    with ``to_pandas(nullable=True)`` for the pandas side. ``func_psr``
    is applied to the pandas series and ``func_gsr`` to the cuDF series;
    the two results are then compared with ``assert_eq``, to which any
    extra keyword arguments are forwarded.
    """
    pandas_input = data.to_pandas(nullable=True)

    expected = pandas_input.apply(func_psr)
    actual = data.apply(func_gsr)
    assert_eq(expected, actual, **kwargs)


@pytest.mark.parametrize("op", arith_ops)
def test_arith_masked_vs_masked(op):
# This test should test all the typing
Expand Down Expand Up @@ -314,3 +323,110 @@ def func_gdf(w, x, y, z):
}
)
run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)


###


@pytest.mark.parametrize(
    "data", [cudf.Series([1, 2, 3]), cudf.Series([1, cudf.NA, 3])]
)
def test_series_apply_basic(data):
    """Apply a simple ``x + 1`` UDF to series with and without nulls."""

    def add_one(x):
        return x + 1

    # The same function is used for both the pandas and the cuDF side.
    run_masked_udf_series(add_one, add_one, data, check_dtype=False)


def test_series_apply_null_conditional():
    """Apply a UDF that branches on whether the element is null.

    Two copies of the function are needed because each library has its
    own NA singleton to compare against (``pd.NA`` vs ``cudf.NA``).
    """
    series = cudf.Series([1, cudf.NA, 3])

    def pandas_udf(x):
        if x is pd.NA:
            return 42
        return x - 1

    def cudf_udf(x):
        if x is cudf.NA:
            return 42
        return x - 1

    run_masked_udf_series(pandas_udf, cudf_udf, series)


###


@pytest.mark.parametrize("op", arith_ops)
def test_series_arith_masked_vs_masked(op):
    """Apply each arithmetic operator to an element and itself."""
    series = cudf.Series([1, cudf.NA, 3])

    def self_op(x):
        return op(x, x)

    run_masked_udf_series(self_op, self_op, series, check_dtype=False)


@pytest.mark.parametrize("op", comparison_ops)
def test_series_compare_masked_vs_masked(op):
    """
    Apply each comparison operator to an element and itself.

    In the series case there is only one other MaskedType value
    available to compare with: the element itself.
    """
    series = cudf.Series([1, cudf.NA, 3])

    def self_compare(x):
        return op(x, x)

    run_masked_udf_series(
        self_compare, self_compare, series, check_dtype=False
    )


@pytest.mark.parametrize("op", arith_ops)
@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA])
def test_series_arith_masked_vs_constant(op, constant):
    """Apply ``op(x, constant)`` elementwise for int, float and NA constants.

    The ``x ** NA`` combination is a known mismatch with pandas and is
    reported as an expected failure rather than executed.
    """

    def func(x):
        return op(x, constant)

    # Single column containing a null; when `constant` is NA as well,
    # every result is expected to be NA.
    data = cudf.Series([1, 2, cudf.NA])
    if constant is cudf.NA and op is operator.pow:
        # pandas evaluates 1**NA to 1; cuDF is expected to disagree here
        # (presumably by propagating NA -- TODO confirm). `pytest.xfail`
        # raises as soon as it is called, so the comparison is never run
        # for this case; it is called directly instead of being used as
        # a context manager, which it is not.
        pytest.xfail(reason="pandas 1**NA == 1 is not matched by cudf")
    run_masked_udf_series(func, func, data, check_dtype=False)


@pytest.mark.parametrize("op", arith_ops)
@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA])
def test_series_arith_masked_vs_constant_reflected(op, constant):
    """Apply ``op(constant, x)`` elementwise for int, float and NA constants.

    The ``1 ** x`` combination is a known mismatch with pandas (the data
    contains a null) and is reported as an expected failure rather than
    executed.
    """

    def func(x):
        return op(constant, x)

    # Single column containing a null; when `constant` is NA as well,
    # every result is expected to be NA.
    data = cudf.Series([1, 2, cudf.NA])
    # The `is not cudf.NA` guard runs first so that `constant == 1` is
    # never evaluated against the NA singleton.
    if constant is not cudf.NA and constant == 1 and op is operator.pow:
        # pandas evaluates 1**NA to 1; cuDF is expected to disagree here
        # (presumably by propagating NA -- TODO confirm). `pytest.xfail`
        # raises as soon as it is called, so the comparison is never run
        # for this case; it is called directly instead of being used as
        # a context manager, which it is not.
        pytest.xfail(reason="pandas 1**NA == 1 is not matched by cudf")
    run_masked_udf_series(func, func, data, check_dtype=False)


def test_series_masked_is_null_conditional():
    """Apply a UDF that replaces nulls with 42 and passes other values through.

    Separate pandas and cuDF functions are used because each compares
    against its own library's NA singleton.
    """
    series = cudf.Series([1, cudf.NA, 3, cudf.NA])

    def pandas_udf(x):
        if x is pd.NA:
            return 42
        return x

    def cudf_udf(x):
        if x is cudf.NA:
            return 42
        return x

    run_masked_udf_series(pandas_udf, cudf_udf, series, check_dtype=False)

0 comments on commit 3648783

Please sign in to comment.