diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index db333328692..87e1a87e68b 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -12,9 +12,10 @@ from cudf import _lib as libcudf from cudf._typing import ScalarLike from cudf.core.column import ColumnBase +from cudf.core.mixins import Scannable -class NumericalBaseColumn(ColumnBase): +class NumericalBaseColumn(ColumnBase, Scannable): """A column composed of numerical data. This class encodes a standard interface for different types of columns @@ -32,6 +33,13 @@ class NumericalBaseColumn(ColumnBase): "std", } + _VALID_SCANS = { + "cumsum", + "cumprod", + "cummin", + "cummax", + } + def _can_return_nan(self, skipna: bool = None) -> bool: return not skipna and self.has_nulls() @@ -174,7 +182,7 @@ def round( """Round the values in the Column to the given number of decimals.""" return libcudf.round.round(self, decimal_places=decimals, how=how) - def _apply_scan_op(self, op: str) -> ColumnBase: - return libcudf.reduce.scan(op, self, True)._with_type_metadata( - self.dtype - ) + def _scan(self, op: str) -> ColumnBase: + return libcudf.reduce.scan( + op.replace("cum", ""), self, True + )._with_type_metadata(self.dtype) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3ef178fd7ff..bf880e3b25a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5158,7 +5158,7 @@ def _scan( if axis == 0: return super()._scan(op, axis=axis, *args, **kwargs) elif axis == 1: - return self._apply_cupy_method_axis_1(f"cum{op}", **kwargs) + return self._apply_cupy_method_axis_1(op, **kwargs) @annotate("DATAFRAME_MODE", color="green", domain="cudf_python") def mode(self, axis=0, numeric_only=False, dropna=True): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 27b87d2cfb7..58a45645d59 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -47,7 +47,7 @@ ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.join import Merge, MergeSemi -from cudf.core.mixins import BinaryOperand +from cudf.core.mixins import BinaryOperand, Scannable from cudf.core.window import Rolling from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring @@ -99,7 +99,7 @@ } -class Frame(BinaryOperand): +class Frame(BinaryOperand, Scannable): """A collection of Column objects with an optional index. Parameters @@ -118,6 +118,21 @@ class Frame(BinaryOperand): _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS + _VALID_SCANS = { + "cumsum", + "cumprod", + "cummin", + "cummax", + } + + # Necessary because the function names don't directly map to the docs. + _SCAN_DOCSTRINGS = { + "cumsum": {"op_name": "cumulative sum"}, + "cumprod": {"op_name": "cumulative product"}, + "cummin": {"op_name": "cumulative min"}, + "cummax": {"op_name": "cumulative max"}, + } + def __init__(self, data=None, index=None): if data is None: data = {} @@ -4368,151 +4383,23 @@ def median( # Scans @annotate("FRAME_SCAN", color="green", domain="cudf_python") - def _scan(self, op, axis=None, skipna=True, cast_to_int=False): - skipna = True if skipna is None else skipna - - results = {} - for name, col in self._data.items(): - if skipna: - try: - result_col = col.nans_to_nulls() - except AttributeError: - result_col = col - else: - if col.has_nulls(include_nan=True): - # Workaround as find_first_value doesn't seem to work - # incase of bools. - first_index = int( - col.isnull().astype("int8").find_first_value(1) - ) - result_col = col.copy() - result_col[first_index:] = None - else: - result_col = col - - if ( - cast_to_int - and not is_decimal_dtype(result_col.dtype) - and ( - np.issubdtype(result_col.dtype, np.integer) - or np.issubdtype(result_col.dtype, np.bool_) - ) - ): - # For reductions that accumulate a value (e.g. sum, not max) - # pandas returns an int64 dtype for all int or bool dtypes. - result_col = result_col.astype(np.int64) - results[name] = result_col._apply_scan_op(op) - # TODO: This will work for Index because it's passing self._index - # (which is None), but eventually we may want to remove that parameter - # for Index._from_data and simplify. - return self._from_data(results, index=self._index) - - @annotate("FRAME_CUMMIN", color="green", domain="cudf_python") - def cummin(self, axis=None, skipna=True, *args, **kwargs): + def _scan(self, op, axis=None, skipna=True): """ - Return cumulative minimum of the Series or DataFrame. + Return {op_name} of the {cls}. Parameters ---------- - axis: {index (0), columns(1)} + axis: {{index (0), columns(1)}} Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. - Returns - ------- - Series or DataFrame - - Examples - -------- - **Series** - - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cummin() - 0 1 - 1 1 - 2 1 - 3 1 - 4 1 - - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.cummin() - a b - 0 1 7 - 1 1 7 - 2 1 7 - 3 1 7 - """ - return self._scan("min", axis=axis, skipna=skipna, *args, **kwargs) - - @annotate("FRAME_CUMMAX", color="green", domain="cudf_python") - def cummax(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative maximum of the Series or DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. Returns ------- - Series or DataFrame - - Examples - -------- - **Series** - - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cummax() - 0 1 - 1 5 - 2 5 - 3 5 - 4 5 - - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.cummax() - a b - 0 1 7 - 1 2 8 - 2 3 9 - 3 4 10 - """ - return self._scan("max", axis=axis, skipna=skipna, *args, **kwargs) - - @annotate("FRAME_CUMSUM", color="green", domain="cudf_python") - def cumsum(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative sum of the Series or DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - - Returns - ------- - Series or DataFrame + {cls} Examples -------- @@ -4530,7 +4417,7 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): **DataFrame** >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df = cudf.DataFrame({{'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}}) >>> s.cumsum() a b 0 1 7 @@ -4538,55 +4425,44 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): 2 6 24 3 10 34 """ - return self._scan( - "sum", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs - ) - - @annotate("FRAME_CUMPROD", color="green", domain="cudf_python") - def cumprod(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative product of the Series or DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - Series or DataFrame - - Examples - -------- - **Series** - - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cumprod() - 0 1 - 1 5 - 2 10 - 3 40 - 4 120 + cast_to_int = op in ("cumsum", "cumprod") + skipna = True if skipna is None else skipna - **DataFrame** + results = {} + for name, col in self._data.items(): + if skipna: + try: + result_col = col.nans_to_nulls() + except AttributeError: + result_col = col + else: + if col.has_nulls(include_nan=True): + # Workaround as find_first_value doesn't seem to work + # incase of bools. + first_index = int( + col.isnull().astype("int8").find_first_value(1) + ) + result_col = col.copy() + result_col[first_index:] = None + else: + result_col = col - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> s.cumprod() - a b - 0 1 7 - 1 2 56 - 2 6 504 - 3 24 5040 - """ - return self._scan( - "prod", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs - ) + if ( + cast_to_int + and not is_decimal_dtype(result_col.dtype) + and ( + np.issubdtype(result_col.dtype, np.integer) + or np.issubdtype(result_col.dtype, np.bool_) + ) + ): + # For reductions that accumulate a value (e.g. sum, not max) + # pandas returns an int64 dtype for all int or bool dtypes. + result_col = result_col.astype(np.int64) + results[name] = getattr(result_col, op)() + # TODO: This will work for Index because it's passing self._index + # (which is None), but eventually we may want to remove that parameter + # for Index._from_data and simplify. + return self._from_data(results, index=self._index) @annotate("FRAME_TO_JSON", color="green", domain="cudf_python") @ioutils.doc_to_json() diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b76f5dcc261..8af73a5175b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -16,7 +16,7 @@ from cudf.api.types import is_list_like from cudf.core.abc import Serializable from cudf.core.column.column import arange, as_column -from cudf.core.mixins import Reducible +from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.utils.utils import GetAttrGetItemMixin @@ -36,7 +36,7 @@ def _quantile_75(x): return x.quantile(0.75) -class GroupBy(Serializable, Reducible): +class GroupBy(Serializable, Reducible, Scannable): _VALID_REDUCTIONS = { "sum", @@ -54,6 +54,19 @@ class GroupBy(Serializable, Reducible): "std", } + _VALID_SCANS = { + "cumsum", + "cummin", + "cummax", + } + + # Necessary because the function names don't directly map to the docs. + _SCAN_DOCSTRINGS = { + "cumsum": {"op_name": "Cumulative sum"}, + "cummin": {"op_name": "Cumulative min"}, + "cummax": {"op_name": "Cumulative max"}, + } + _MAX_GROUPS_BEFORE_WARN = 100 def __init__( @@ -353,6 +366,10 @@ def _reduce( ) return self.agg(op) + def _scan(self, op: str, *args, **kwargs): + """{op_name} for each group.""" + return self.agg(op) + aggregate = agg def nth(self, n): @@ -1210,19 +1227,6 @@ def unique(self): """Get a list of the unique values for each column in each group.""" return self.agg("unique") - def cumsum(self): - """Compute the column-wise cumulative sum of the values in - each group.""" - return self.agg("cumsum") - - def cummin(self): - """Get the column-wise cumulative minimum value in each group.""" - return self.agg("cummin") - - def cummax(self): - """Get the column-wise cumulative maximum value in each group.""" - return self.agg("cummax") - def diff(self, periods=1, axis=0): """Get the difference between the values in each group. diff --git a/python/cudf/cudf/core/mixins/__init__.py b/python/cudf/cudf/core/mixins/__init__.py index dd3dcd6d388..8306f3f11b3 100644 --- a/python/cudf/cudf/core/mixins/__init__.py +++ b/python/cudf/cudf/core/mixins/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -from .reductions import Reducible from .binops import BinaryOperand +from .reductions import Reducible +from .scans import Scannable -__all__ = ["Reducible", "BinaryOperand"] +__all__ = ["BinaryOperand", "Reducible", "Scannable"] diff --git a/python/cudf/cudf/core/mixins/scans.py b/python/cudf/cudf/core/mixins/scans.py new file mode 100644 index 00000000000..723fc758b13 --- /dev/null +++ b/python/cudf/cudf/core/mixins/scans.py @@ -0,0 +1,11 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from .mixin_factory import _create_delegating_mixin + +Scannable = _create_delegating_mixin( + "Scannable", + "Mixin encapsulating scan operations.", + "SCAN", + "_scan", + {"cumsum", "cumprod", "cummin", "cummax",}, # noqa: E231 +) diff --git a/python/cudf/cudf/core/mixins/scans.pyi b/python/cudf/cudf/core/mixins/scans.pyi new file mode 100644 index 00000000000..38cb9af284f --- /dev/null +++ b/python/cudf/cudf/core/mixins/scans.pyi @@ -0,0 +1,18 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from typing import Set + +class Scannable: + _SUPPORTED_SCANS: Set + + def cumsum(self): + ... + + def cumprod(self): + ... + + def cummin(self): + ... + + def cummax(self): + ...