diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c85ed0c8555..b1dc1c0e1cb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -47,6 +47,7 @@ ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.join import Merge, MergeSemi +from cudf.core.reductions import Reducible from cudf.core.udf.pipeline import compile_or_get, supported_cols_from_frame from cudf.core.window import Rolling from cudf.utils import ioutils @@ -57,7 +58,7 @@ T = TypeVar("T", bound="Frame") -class Frame: +class Frame(Reducible): """A collection of Column objects with an optional index. Parameters @@ -73,6 +74,97 @@ class Frame: # attribute should be moved to IndexedFrame. _index: Optional[cudf.core.index.BaseIndex] + _VALID_REDUCTIONS = { + "sum", + "min", + "max", + "mean", + "sum_of_squares", + "median", + } + + _REDUCTION_INFO = { + "docstring_params": { + "min": { + "example": """>>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.min() + a 1 + b 7 + dtype: int64 + """, + "extra_parameters": "", + }, + "max": { + "example": """>>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.max() + a 4 + b 10 + dtype: int64 + """, + "extra_parameters": "", + }, + "mean": { + "example": """>>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.mean() + a 2.5 + b 8.5 + dtype: float64 + """, + "extra_parameters": "", + }, + "median": { + "example": """>>> import cudf + >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) + >>> ser + 0 10 + 1 25 + 2 3 + 3 25 + 4 24 + 5 6 + dtype: int64 + >>> ser.median() + 17.0 + """, + "extra_parameters": "", + }, + "sum_of_squares": { + "example": """>>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) + >>> df.sum_of_squares() + a 38 + b 249 + dtype: int64 + """, + "extra_parameters": "", + }, + "sum": { + "example": """>>> import cudf + >>> df = 
cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.sum() + a 10 + b 34 + dtype: int64 + """, + "extra_parameters": """ + min_count: int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result + will be NA. + + The default being 0. This means the sum of an all-NA or empty + Series is 0, and the product of an all-NA or empty Series is 1. + """, + }, + }, + "signature_templates": { + "sum": lambda self, axis=None, skipna=None, level=None, numeric_only=None, min_count=0, *args, **kwargs: None # noqa: E501 + }, + } + def __init__(self, data=None, index=None): if data is None: data = {} @@ -4111,130 +4203,25 @@ def _get_axis_from_axis_arg(cls, axis): except KeyError: raise ValueError(f"No axis named {axis} for object type {cls}") - def _reduce(self, *args, **kwargs): - raise NotImplementedError( - f"Reductions are not supported for objects of type {type(self)}." - ) - - def min( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, - ): - """ - Return the minimum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - level: int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only: bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.min() - a 1 - b 7 - dtype: int64 - """ - return self._reduce( - "min", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - def max( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, - ): - """ - Return the maximum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - level: int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only: bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.max() - a 4 - b 10 - dtype: int64 - """ - return self._reduce( - "max", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - def sum( + def _reduce( self, + op: str, axis=None, skipna=None, - dtype=None, level=None, numeric_only=None, - min_count=0, + *args, **kwargs, ): - """ - Return sum of the values in the DataFrame. + """Return {op} of the values in the Frame. Parameters ---------- - - axis: {index (0), columns(1)} + axis: {{index (0), columns(1)}} Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. 
- - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. + {extra_parameters} Returns ------- @@ -4246,22 +4233,10 @@ def sum( Examples -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.sum() - a 10 - b 34 - dtype: int64 + {example} """ - return self._reduce( - "sum", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, + raise NotImplementedError( + f"Reductions are not supported for objects of type {type(self)}." ) def product( @@ -4328,50 +4303,6 @@ def product( # Alias for pandas compatibility. prod = product - def mean( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return the mean of the values for the requested axis. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'} - Axis for the function to be applied on. - skipna : bool, default True - Exclude NA/null values when computing the result. - level : int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only : bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. Not implemented for - Series. - **kwargs - Additional keyword arguments to be passed to the function. - - Returns - ------- - mean : Series or DataFrame (if level specified) - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.mean() - a 2.5 - b 8.5 - dtype: float64 - """ - return self._reduce( - "mean", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - def std( self, axis=None, @@ -4538,18 +4469,7 @@ def kurtosis( **kwargs, ) - # Alias for kurtosis. 
- @copy_docstring(kurtosis) - def kurt( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - return self.kurtosis( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) + kurt = kurtosis def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs @@ -4678,73 +4598,6 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): "any", axis=axis, skipna=skipna, level=level, **kwargs, ) - def sum_of_squares(self, dtype=None): - """Return the sum of squares of values. - - Parameters - ---------- - dtype: data type - Data type to cast the result to. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) - >>> df.sum_of_squares() - a 38 - b 249 - dtype: int64 - """ - return self._reduce("sum_of_squares", dtype=dtype) - - def median( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return the median of the values for the requested axis. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `level` and `numeric_only`. 
- - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) - >>> ser - 0 10 - 1 25 - 2 3 - 3 25 - 4 24 - 5 6 - dtype: int64 - >>> ser.median() - 17.0 - """ - return self._reduce( - "median", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - # Scans def _scan(self, op, axis=None, skipna=True, cast_to_int=False): skipna = True if skipna is None else skipna diff --git a/python/cudf/cudf/core/reductions.py b/python/cudf/cudf/core/reductions.py index c7c8b10118e..d3f0b35b704 100644 --- a/python/cudf/cudf/core/reductions.py +++ b/python/cudf/cudf/core/reductions.py @@ -77,50 +77,44 @@ def _reduce(self, op: str, *args, **kwargs): @classmethod def _add_reduction(cls, reduction): - # This function creates reduction operations on-the-fly and assigns - # them to the class. + reduction_info = getattr(cls, "_REDUCTION_INFO", {}) # Generate a signature without the `op` parameter. - signature = inspect.signature(cls._reduce) - new_params = signature.parameters.copy() - new_params.pop("op") - signature = signature.replace(parameters=new_params.values()) + signature_templates = reduction_info.get("signature_templates", {}) + if reduction in signature_templates: + signature = inspect.signature(signature_templates[reduction]) + else: + signature = inspect.signature(cls._reduce) + new_params = signature.parameters.copy() + new_params.pop("op") + signature = signature.replace(parameters=new_params.values()) # Generate the list of arguments forwarded to _reduce. arglist = ", ".join( - [ - f"{key}={key}" - for key in signature.parameters - if key not in ("self", "args", "kwargs") - ] + f"{key}={key}" + for key in signature.parameters + if key not in ("self", "args", "kwargs") ) - if arglist: - arglist += ", *args, **kwargs" - else: - arglist = "*args, **kwargs" + arglist += (", " if arglist else "") + "*args, **kwargs" # The default docstring is that of the _reduce method. 
Additional # formatting arguments may be provided in a class-level dictionary - # of the form _REDUCTION_DOCSTRINGS + # of the form _REDUCTION_INFO["docstring_params"] + docstring_info = reduction_info.get("docstring_params", {}) docstring = cls._reduce.__doc__.format( cls=cls.__name__, op=reduction, - **getattr(cls, "_REDUCTION_DOCSTRINGS", {}).get(reduction, {}), + **docstring_info.get(reduction, {}), ) # Create the desired function. namespace = {} - out = """ + out = f""" def {reduction}{signature}: \"\"\"{docstring} \"\"\" return self._reduce(op="{reduction}", {arglist}) - """.format( - reduction=reduction, - signature=str(signature), - arglist=arglist, - docstring=docstring, - ) + """ exec(out, namespace) setattr(cls, reduction, namespace[reduction])