Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support args= in Series.apply #9982

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
cff4d1f
basic
brandon-b-miller Jan 5, 2022
4b7181d
enough for now
brandon-b-miller Jan 6, 2022
be9deb4
stuff works
brandon-b-miller Jan 6, 2022
c3ed817
unify get_udf_return_type and change how its called
brandon-b-miller Jan 6, 2022
76fe430
lots of progress here
brandon-b-miller Jan 6, 2022
9131e23
bugfixes
brandon-b-miller Jan 6, 2022
4e9c876
all passing
brandon-b-miller Jan 6, 2022
af56462
moving a few things around
brandon-b-miller Jan 6, 2022
21c4b95
rename things
brandon-b-miller Jan 6, 2022
1feed89
updates
brandon-b-miller Jan 6, 2022
c77333d
more updates, slowly refactoring
brandon-b-miller Jan 7, 2022
3543e15
pretty close now
brandon-b-miller Jan 7, 2022
caff641
move _apply from Frame to IndexedFrame
brandon-b-miller Jan 7, 2022
8338061
merge latest
brandon-b-miller Jan 11, 2022
ebdd9d4
Merge branch 'branch-22.02' into fea-series-apply-args
brandon-b-miller Jan 11, 2022
ac442b2
rename confusing function names
brandon-b-miller Jan 11, 2022
d87fcbe
Apply suggestions from code review
brandon-b-miller Jan 18, 2022
460a45d
push compile_or_get down and adjust signature
brandon-b-miller Jan 18, 2022
e1635f8
lambda -> scalar
brandon-b-miller Jan 18, 2022
34f5c57
bugfix
brandon-b-miller Jan 18, 2022
841cad8
prefix everything with an underscore
brandon-b-miller Jan 18, 2022
c784b12
address more reviews
brandon-b-miller Jan 18, 2022
809b4c1
style
brandon-b-miller Jan 18, 2022
1f320e6
factor out common logic
brandon-b-miller Jan 18, 2022
b099ddc
Merge branch 'branch-22.02' into fea-series-apply-args
brandon-b-miller Jan 19, 2022
ac0bb27
dont use a locals dict
brandon-b-miller Jan 19, 2022
f9b6bbd
merge latest and resolve conflicts
brandon-b-miller Jan 19, 2022
18c256d
Apply suggestions from code review
brandon-b-miller Jan 19, 2022
25ffcdb
partially address reviews
brandon-b-miller Jan 19, 2022
1844344
shorten comments
brandon-b-miller Jan 24, 2022
c84ac0a
move to a TypingError
brandon-b-miller Jan 24, 2022
8d41c89
address more reviews
brandon-b-miller Jan 24, 2022
2415a98
merge 22.04 and resolve conflicts
brandon-b-miller Jan 26, 2022
643d55e
Update python/cudf/cudf/core/udf/utils.py
brandon-b-miller Jan 27, 2022
0af638f
updates
brandon-b-miller Jan 27, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
from cudf.core.multiindex import MultiIndex
from cudf.core.resample import DataFrameResampler
from cudf.core.series import Series
from cudf.core.udf.row_function import _get_row_kernel
from cudf.utils import applyutils, docutils, ioutils, queryutils, utils
from cudf.utils.docutils import copy_docstring
from cudf.utils.dtypes import (
Expand Down Expand Up @@ -3926,10 +3927,8 @@ def apply(
raise ValueError("The `raw` kwarg is not yet supported.")
if result_type is not None:
raise ValueError("The `result_type` kwarg is not yet supported.")
if kwargs:
raise ValueError("UDFs using **kwargs are not yet supported.")

return self._apply(func, *args)
return self._apply(func, _get_row_kernel, *args, **kwargs)

@applyutils.doc_apply()
def apply_rows(
Expand Down
34 changes: 0 additions & 34 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@
)
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.join import Merge, MergeSemi
from cudf.core.udf.pipeline import compile_or_get, supported_cols_from_frame
from cudf.core.window import Rolling
from cudf.utils import ioutils
from cudf.utils.docutils import copy_docstring
Expand Down Expand Up @@ -1367,39 +1366,6 @@ def _quantiles(
result._copy_type_metadata(self)
return result

@annotate("APPLY", color="purple", domain="cudf_python")
def _apply(self, func, *args):
    """
    Run the user-defined function `func` over the rows of the frame.

    A kernel and its return dtype are obtained from `compile_or_get`. The
    kernel is launched over `len(self)` elements and writes into a
    preallocated data buffer and boolean mask, which are then combined
    into the single column of the returned Series (indexed by
    `self._index`).
    """
    kernel, retty = compile_or_get(self, func, args)

    n_rows = len(self)

    # Preallocate the output: a data buffer plus a boolean mask.
    out_data = cupy.empty(n_rows, dtype=retty)
    out_mask = cudf.core.column.column_empty(n_rows, dtype="bool")

    # Once compile_or_get has succeeded it is safe to feed the kernel only
    # the columns whose dtype is supported.
    column_args = []
    column_offsets = []
    for column in supported_cols_from_frame(self).values():
        data, mask = column.data, column.mask
        # A column without a mask is passed as bare data, otherwise as a
        # (data, mask) pair.
        column_args.append(data if mask is None else (data, mask))
        column_offsets.append(column.offset)

    # Launch argument order: outputs, size, columns, offsets, extra args.
    launch_args = [
        (out_data, out_mask),
        n_rows,
        *column_args,
        *column_offsets,
        *args,
    ]
    kernel.forall(n_rows)(*launch_args)

    result_col = as_column(out_data)
    result_col.set_base_mask(libcudf.transform.bools_to_mask(out_mask))
    return cudf.Series._from_data({None: result_col}, self._index)

def rank(
self,
axis=0,
Expand Down
48 changes: 47 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,12 @@
is_integer_dtype,
is_list_like,
)
from cudf.core.column import arange
from cudf.core.column import arange, as_column
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.frame import Frame
from cudf.core.index import Index, RangeIndex, _index_from_columns
from cudf.core.multiindex import MultiIndex
from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame
from cudf.utils.utils import cached_property

doc_reset_index_template = """
Expand Down Expand Up @@ -756,6 +757,51 @@ def add_suffix(self, suffix):
Use `Series.add_suffix` or `DataFrame.add_suffix`"
)

@annotate("APPLY", color="purple", domain="cudf_python")
def _apply(self, func, kernel_getter, *args, **kwargs):
    """Run the user-defined function `func` over the rows of the frame.

    The kernel and its return dtype come from `_compile_or_get`, which is
    handed `kernel_getter`. The kernel is launched over `len(self)`
    elements and fills a preallocated data buffer and boolean mask; these
    become the single column of the returned Series, which carries
    `self._index`.

    Raises
    ------
    ValueError
        If any keyword arguments are supplied, or if `_compile_or_get`
        raises (the original exception is chained as the cause).
    RuntimeError
        If launching the kernel raises (the original exception is
        chained as the cause).
    """
    if kwargs:
        raise ValueError("UDFs using **kwargs are not yet supported.")

    try:
        kernel, retty = _compile_or_get(
            self, func, args, kernel_getter=kernel_getter
        )
    except Exception as e:
        raise ValueError("user defined function compilation failed.") from e

    n_rows = len(self)

    # Preallocate the output: a data buffer plus a boolean mask.
    out_data = cp.empty(n_rows, dtype=retty)
    out_mask = cudf.core.column.column_empty(n_rows, dtype="bool")

    # Once _compile_or_get has succeeded it is safe to feed the kernel only
    # the columns whose dtype is supported.
    column_args = []
    column_offsets = []
    for column in _supported_cols_from_frame(self).values():
        data, mask = column.data, column.mask
        # A column without a mask is passed as bare data, otherwise as a
        # (data, mask) pair.
        column_args.append(data if mask is None else (data, mask))
        column_offsets.append(column.offset)

    # Launch argument order: outputs, size, columns, offsets, extra args.
    launch_args = [
        (out_data, out_mask),
        n_rows,
        *column_args,
        *column_offsets,
        *args,
    ]

    try:
        kernel.forall(n_rows)(*launch_args)
    except Exception as e:
        raise RuntimeError("UDF kernel execution failed.") from e

    result_col = as_column(out_data)
    result_col.set_base_mask(libcudf.transform.bools_to_mask(out_mask))
    return cudf.Series._from_data({None: result_col}, self._index)

def sort_values(
self,
by,
Expand Down
21 changes: 5 additions & 16 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import cupy
import numpy as np
import pandas as pd
from numba import cuda
from pandas._config import get_option

import cudf
Expand Down Expand Up @@ -67,6 +66,7 @@
doc_reset_index_template,
)
from cudf.core.single_column_frame import SingleColumnFrame
from cudf.core.udf.scalar_function import _get_scalar_kernel
from cudf.utils import cudautils, docutils
from cudf.utils.docutils import copy_docstring
from cudf.utils.dtypes import (
Expand Down Expand Up @@ -2374,7 +2374,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs):
by numba based on the function logic and argument types.
See examples for details.
args : tuple
Not supported
Positional arguments passed to func after the series value.
**kwargs
Not supported

Expand Down Expand Up @@ -2440,20 +2440,9 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs):
2 4.5
dtype: float64
"""
if args or kwargs:
raise ValueError(
"UDFs using *args or **kwargs are not yet supported."
)

# these functions are generally written as functions of scalar
# values rather than rows. Rather than writing an entirely separate
# numba kernel that is not built around a row object, its simpler
# to just turn this into the equivalent single column dataframe case
name = self.name or "__temp_srname"
df = cudf.DataFrame({name: self})
f_ = cuda.jit(device=True)(func)

return df.apply(lambda row: f_(row[name]))
if convert_dtype is not True:
raise ValueError("Series.apply only supports convert_dtype=True")
return self._apply(func, _get_scalar_kernel, *args, **kwargs)
vyasr marked this conversation as resolved.
Show resolved Hide resolved

def applymap(self, udf, out_dtype=None):
"""Apply an elementwise function to transform the values in the Column.
Expand Down
Loading