Skip to content

Commit

Permalink
Refactor Frame scans (#9021)
Browse files Browse the repository at this point in the history
This pull request is a substantial refactor of the internals of scan operations like `cummax` and `cumsum`. The new implementation moves nearly all logic to the `Frame` level. The resulting code improves performance and adds support for new features. In particular:
- For data sizes where Python overheads dominate, `Series` operations are now 10-20% faster. More importantly, `DataFrame` operations are 2-3x faster.
- Prefix sums are now automatically supported for Index types as well.
- Prefix sums for `DataFrame` now support axis=1 (previously only reductions like `sum` did so).
- Total code is halved.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Marlene (https://github.com/marlenezw)

URL: #9021
  • Loading branch information
vyasr authored Aug 17, 2021
1 parent 3402fec commit 368890f
Show file tree
Hide file tree
Showing 4 changed files with 300 additions and 427 deletions.
168 changes: 18 additions & 150 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
"max": "nanmax",
"sum": "nansum",
"prod": "nanprod",
"product": "nanprod",
"mean": "nanmean",
"std": "nanstd",
"var": "nanvar",
Expand Down Expand Up @@ -5852,7 +5853,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
)

if data.ndim == 2:
num_cols = len(data[0])
num_cols = data.shape[1]
else:
# Since we validate ndim to be either 1 or 2 above,
# this case can be assumed to be ndim == 1.
Expand Down Expand Up @@ -6226,7 +6227,7 @@ def _prepare_for_rowwise_op(self, method, skipna):
col.nullable for col in self._columns
):
msg = (
f"Row-wise operations to calculate '{method}' is not "
f"Row-wise operations to calculate '{method}' do not "
f"currently support columns with null values. "
f"Consider removing them with .dropna() "
f"or using .fillna()."
Expand Down Expand Up @@ -6341,154 +6342,15 @@ def _reduce(
elif axis == 1:
return self._apply_support_method_axis_1(op, **kwargs)

def cummin(self, axis=None, skipna=True, *args, **kwargs):
"""
Return cumulative minimum of the DataFrame.
Parameters
----------
skipna: bool, default True
Exclude NA/null values. If an entire row/column is NA,
the result will be NA.
Returns
-------
DataFrame
Notes
-----
Parameters currently not supported is `axis`
Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]})
>>> df.cummin()
a b
0 1 7
1 1 7
2 1 7
3 1 7
"""
if axis not in (0, "index", None):
raise NotImplementedError("Only axis=0 is currently supported.")

return self._apply_support_method(
"cummin", axis=axis, skipna=skipna, *args, **kwargs
)

def cummax(self, axis=None, skipna=True, *args, **kwargs):
"""
Return cumulative maximum of the DataFrame.
Parameters
----------
skipna: bool, default True
Exclude NA/null values. If an entire row/column is NA,
the result will be NA.
Returns
-------
DataFrame
Notes
-----
Parameters currently not supported is `axis`
Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]})
>>> df.cummax()
a b
0 1 7
1 2 8
2 3 9
3 4 10
"""
if axis not in (0, "index", None):
raise NotImplementedError("Only axis=0 is currently supported.")

return self._apply_support_method(
"cummax", axis=axis, skipna=skipna, *args, **kwargs
)

def cumsum(self, axis=None, skipna=True, *args, **kwargs):
"""
Return cumulative sum of the DataFrame.
Parameters
----------
skipna: bool, default True
Exclude NA/null values. If an entire row/column is NA,
the result will be NA.
Returns
-------
DataFrame
Notes
-----
Parameters currently not supported is `axis`
Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]})
>>> s.cumsum()
a b
0 1 7
1 3 15
2 6 24
3 10 34
"""
if axis not in (0, "index", None):
raise NotImplementedError("Only axis=0 is currently supported.")

return self._apply_support_method(
"cumsum", axis=axis, skipna=skipna, *args, **kwargs
)

def cumprod(self, axis=None, skipna=True, *args, **kwargs):
"""
Return cumulative product of the DataFrame.
Parameters
----------
skipna: bool, default True
Exclude NA/null values. If an entire row/column is NA,
the result will be NA.
Returns
-------
DataFrame
Notes
-----
Parameters currently not supported is `axis`
Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]})
>>> s.cumprod()
a b
0 1 7
1 2 56
2 6 504
3 24 5040
"""
if axis not in (0, "index", None):
raise NotImplementedError("Only axis=0 is currently supported.")
def _scan(
self, op, axis=None, *args, **kwargs,
):
axis = self._get_axis_from_axis_arg(axis)

return self._apply_support_method(
"cumprod", axis=axis, skipna=skipna, *args, **kwargs
)
if axis == 0:
return super()._scan(op, axis=axis, *args, **kwargs)
elif axis == 1:
return self._apply_support_method_axis_1(f"cum{op}", **kwargs)

def mode(self, axis=0, numeric_only=False, dropna=True):
"""
Expand Down Expand Up @@ -6716,13 +6578,14 @@ def _apply_support_method_axis_0(self, method, *args, **kwargs):
def _apply_support_method_axis_1(self, method, *args, **kwargs):
# for dask metadata compatibility
skipna = kwargs.pop("skipna", None)
skipna = True if skipna is None else skipna
if method not in _cupy_nan_methods_map and skipna not in (
None,
True,
1,
):
raise NotImplementedError(
f"Row-wise operation to calculate '{method}'"
f"Row-wise operations to calculate '{method}'"
f" currently do not support `skipna=False`."
)

Expand Down Expand Up @@ -6751,6 +6614,11 @@ def _apply_support_method_axis_1(self, method, *args, **kwargs):
"Row-wise operations currently do not " "support `bool_only`."
)

# This parameter is only necessary for axis 0 reductions that cuDF
# performs internally. cupy already upcasts smaller integer/bool types
# to int64 when accumulating.
kwargs.pop("cast_to_int", None)

prepared, mask, common_dtype = self._prepare_for_rowwise_op(
method, skipna
)
Expand Down
Loading

0 comments on commit 368890f

Please sign in to comment.