From 2c074b1c9806c018d0280917b805374a55ca09ff Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 21 Jul 2021 15:36:41 -0700 Subject: [PATCH 01/39] Move reductions from NumericalBase into Reducible mixin. --- .../cudf/cudf/core/column/numerical_base.py | 48 ++-------- python/cudf/cudf/core/reductions.py | 90 +++++++++++++++++++ 2 files changed, 98 insertions(+), 40 deletions(-) create mode 100644 python/cudf/cudf/core/reductions.py diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 1f84cb88e37..68604124288 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -10,11 +10,12 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import Dtype, ScalarLike +from cudf._typing import ScalarLike from cudf.core.column import ColumnBase +from cudf.core.reductions import Reducible -class NumericalBaseColumn(ColumnBase): +class NumericalBaseColumn(Reducible, ColumnBase): """A column composed of numerical data. This class encodes a standard interface for different types of columns @@ -23,7 +24,7 @@ class NumericalBaseColumn(ColumnBase): point, should be encoded here. """ - def reduce( + def _reduce( self, op: str, skipna: bool = None, min_count: int = 0, **kwargs ) -> ScalarLike: """Perform a reduction operation. @@ -31,7 +32,10 @@ def reduce( op : str The operation to perform. skipna : bool - Whether or not na values must be + Whether or not na values must be skipped. + min_count : int, default 0 + The minimum number of entries for the reduction, otherwise the + reduction returns NaN. """ preprocessed = self._process_for_reduction( skipna=skipna, min_count=min_count @@ -41,42 +45,6 @@ def reduce( else: return preprocessed - def sum( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 - ) -> ScalarLike: - return self.reduce( - "sum", skipna=skipna, dtype=dtype, min_count=min_count - ) - - def product( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 - ) -> ScalarLike: - return self.reduce( - "product", skipna=skipna, dtype=dtype, min_count=min_count - ) - - def mean( - self, skipna: bool = None, dtype: Dtype = np.float64 - ) -> ScalarLike: - return self.reduce("mean", skipna=skipna, dtype=dtype) - - def var( - self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 - ) -> ScalarLike: - return self.reduce("var", skipna=skipna, dtype=dtype, ddof=ddof) - - def std( - self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 - ) -> ScalarLike: - return self.reduce("std", skipna=skipna, dtype=dtype, ddof=ddof) - - def sum_of_squares( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 - ) -> ScalarLike: - return self.reduce( - "sum_of_squares", skipna=skipna, dtype=dtype, min_count=min_count - ) - def _can_return_nan(self, skipna: bool = None) -> bool: return not skipna and self.has_nulls() diff --git a/python/cudf/cudf/core/reductions.py b/python/cudf/cudf/core/reductions.py new file mode 100644 index 00000000000..148e1d78ab8 --- /dev/null +++ b/python/cudf/cudf/core/reductions.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +import numpy as np + + +class Reducible: + """Mixin encapsulating for reduction operations. + + Various classes in cuDF support + `reductions `__. In + practice the reductions are implemented via dispatch to a lower-level API. + For example, Frame-like objects dispatch to Columns, which in turn dispatch + to libcudf implementations. As a result, rather than encoding the logic for + different types of reductions, most classes can implement all reductions + in terms of a single function that performs necessary pre- and + post-processing of a result generated by calling to a lower-level API. This + class encapsulates that paradigm. + """ + + # TODO: We need a way to indicate supported/unsupported operations. + # Alternatively, we could just leave that up to the implementation of + # _reduce. + # TODO: If supported operations are indicated using a class member list + # (_SUPPORTED_OPERATIONS or so) we could use __init_subclass__ to combine + # all parent lists via walking the MRO. + + def _reduce(self, op: str, *args, **kwargs): + raise NotImplementedError + + def sum(self, *args, **kwargs): + return self._reduce("sum", *args, **kwargs) + + def product(self, *args, **kwargs): + return self._reduce("product", *args, **kwargs) + + # def min(self, *args, **kwargs): + # return self._reduce("min", *args, **kwargs) + # + # def max(self, *args, **kwargs): + # return self._reduce("max", *args, **kwargs) + + # def count(self, *args, **kwargs): + # return self._reduce("count", *args, **kwargs) + # + # def size(self, *args, **kwargs): + # return self._reduce("size", *args, **kwargs) + # + # def any(self, *args, **kwargs): + # return self._reduce("any", *args, **kwargs) + # + # def all(self, *args, **kwargs): + # return self._reduce("all", *args, **kwargs) + + def sum_of_squares(self, *args, **kwargs): + return self._reduce("sum_of_squares", *args, **kwargs) + + # TODO: Need a better way of setting a default dtype. If nothing else, I + # guess we'll just need to override this parameter in the relevant column + # types (numerical_base). + def mean(self, dtype=np.float64, *args, **kwargs): + return self._reduce("mean", dtype=dtype, *args, **kwargs) + + def var(self, dtype=np.float64, *args, **kwargs): + return self._reduce("var", dtype=dtype, *args, **kwargs) + + def std(self, dtype=np.float64, *args, **kwargs): + return self._reduce("std", dtype=dtype, *args, **kwargs) + + # def median(self, *args, **kwargs): + # return self._reduce("median", *args, **kwargs) + # + # def quantile(self, *args, **kwargs): + # return self._reduce("quantile", *args, **kwargs) + # + # def argmax(self, *args, **kwargs): + # return self._reduce("argmax", *args, **kwargs) + # + # def argmin(self, *args, **kwargs): + # return self._reduce("argmin", *args, **kwargs) + # + # def nunique(self, *args, **kwargs): + # return self._reduce("nunique", *args, **kwargs) + # + # def nth(self, *args, **kwargs): + # return self._reduce("nth", *args, **kwargs) + + # def collect(self, *args, **kwargs): + # return self._reduce("sum", *args, **kwargs) + # + # def unique(self, *args, **kwargs): + # return self._reduce("sum", *args, **kwargs) From 970b45d0bda643670eec1ed42ddf055dc3e7d6b5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 21 Jul 2021 15:43:41 -0700 Subject: [PATCH 02/39] Override dtype for numerical type methods that require it. --- python/cudf/cudf/core/column/numerical_base.py | 9 +++++++++ python/cudf/cudf/core/reductions.py | 16 ++++++---------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 68604124288..de9cb139519 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -116,6 +116,15 @@ def quantile( ) return result + def mean(self, dtype=np.float64, *args, **kwargs): + return super().mean(dtype=dtype, *args, **kwargs) + + def var(self, dtype=np.float64, *args, **kwargs): + return super().var(dtype=dtype, *args, **kwargs) + + def std(self, dtype=np.float64, *args, **kwargs): + return super().std(dtype=dtype, *args, **kwargs) + def median(self, skipna: bool = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna diff --git a/python/cudf/cudf/core/reductions.py b/python/cudf/cudf/core/reductions.py index 148e1d78ab8..7d787d54ab7 100644 --- a/python/cudf/cudf/core/reductions.py +++ b/python/cudf/cudf/core/reductions.py @@ -1,5 +1,4 @@ # Copyright (c) 2021, NVIDIA CORPORATION. -import numpy as np class Reducible: @@ -53,17 +52,14 @@ def product(self, *args, **kwargs): def sum_of_squares(self, *args, **kwargs): return self._reduce("sum_of_squares", *args, **kwargs) - # TODO: Need a better way of setting a default dtype. If nothing else, I - # guess we'll just need to override this parameter in the relevant column - # types (numerical_base). - def mean(self, dtype=np.float64, *args, **kwargs): - return self._reduce("mean", dtype=dtype, *args, **kwargs) + def mean(self, *args, **kwargs): + return self._reduce("mean", *args, **kwargs) - def var(self, dtype=np.float64, *args, **kwargs): - return self._reduce("var", dtype=dtype, *args, **kwargs) + def var(self, *args, **kwargs): + return self._reduce("var", *args, **kwargs) - def std(self, dtype=np.float64, *args, **kwargs): - return self._reduce("std", dtype=dtype, *args, **kwargs) + def std(self, *args, **kwargs): + return self._reduce("std", *args, **kwargs) # def median(self, *args, **kwargs): # return self._reduce("median", *args, **kwargs) From 0a774f628bd234ef9a5be64447ae83e69c8b93fd Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 18 Nov 2021 17:02:58 -0800 Subject: [PATCH 03/39] Generate reductions programmatically. --- python/cudf/cudf/core/column/column.py | 36 ----- .../cudf/cudf/core/column/numerical_base.py | 21 ++- python/cudf/cudf/core/column/timedelta.py | 5 +- python/cudf/cudf/core/dataframe.py | 11 +- python/cudf/cudf/core/reductions.py | 130 +++++++++--------- python/cudf/cudf/core/series.py | 10 +- python/cudf/cudf/core/single_column_frame.py | 5 +- 7 files changed, 106 insertions(+), 112 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1a83194489d..d28e12c1eaf 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1176,42 +1176,6 @@ def max(self, skipna: bool = None, dtype: Dtype = None): return libcudf.reduce.reduce("max", result_col, dtype=dtype) return result_col - def sum( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 - ): - raise TypeError(f"cannot perform sum with type {self.dtype}") - - def product( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 - ): - raise TypeError(f"cannot perform product with type {self.dtype}") - - def mean(self, skipna: bool = None, dtype: Dtype = None): - raise TypeError(f"cannot perform mean with type {self.dtype}") - - def std(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64): - raise TypeError(f"cannot perform std with type {self.dtype}") - - def var(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64): - raise TypeError(f"cannot perform var with type {self.dtype}") - - def kurtosis(self, skipna: bool = None): - raise TypeError(f"cannot perform kurtosis with type {self.dtype}") - - def skew(self, skipna: bool = None): - raise TypeError(f"cannot perform skew with type {self.dtype}") - - def cov(self, other: ColumnBase): - raise TypeError( - f"cannot perform covarience with types {self.dtype}, " - f"{other.dtype}" - ) - - def corr(self, other: ColumnBase): - raise TypeError( - f"cannot perform corr with types {self.dtype}, {other.dtype}" - ) - def nans_to_nulls(self: T) -> T: # Only floats can contain nan. if self.dtype.kind != "f": diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index de9cb139519..866e864d407 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -24,8 +24,17 @@ class NumericalBaseColumn(Reducible, ColumnBase): point, should be encoded here. """ + _VALID_REDUCTIONS = { + "sum", + "product", + "sum_of_squares", + "mean", + "var", + "std", + } + def _reduce( - self, op: str, skipna: bool = None, min_count: int = 0, **kwargs + self, op: str, skipna: bool = None, min_count: int = 0, *args, **kwargs ) -> ScalarLike: """Perform a reduction operation. @@ -117,13 +126,13 @@ def quantile( return result def mean(self, dtype=np.float64, *args, **kwargs): - return super().mean(dtype=dtype, *args, **kwargs) + return self._reduce("mean", dtype=dtype, *args, **kwargs) def var(self, dtype=np.float64, *args, **kwargs): - return super().var(dtype=dtype, *args, **kwargs) + return self._reduce("var", dtype=dtype, *args, **kwargs) def std(self, dtype=np.float64, *args, **kwargs): - return super().std(dtype=dtype, *args, **kwargs) + return self._reduce("std", dtype=dtype, *args, **kwargs) def median(self, skipna: bool = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna @@ -148,7 +157,7 @@ def _numeric_quantile( self, quant, interpolation, sorted_indices, exact ) - def cov(self, other: ColumnBase) -> float: + def cov(self, other: NumericalBaseColumn) -> float: if ( len(self) == 0 or len(other) == 0 @@ -160,7 +169,7 @@ def cov(self, other: ColumnBase) -> float: cov_sample = result.sum() / (len(self) - 1) return cov_sample - def corr(self, other: ColumnBase) -> float: + def corr(self, other: NumericalBaseColumn) -> float: if len(self) == 0 or len(other) == 0: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 4b7a3bcc197..400214475e7 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -397,7 +397,10 @@ def sum( self, skipna: bool = None, dtype: Dtype = None, min_count=0 ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.sum( + # TODO: mypy doesn't fully support monkey-patching, so the + # monkey-patching of reductions creates some serious limitations. + # We'll have to see how bad the problem is. + self.as_numerical.sum( # type: ignore skipna=skipna, dtype=dtype, min_count=min_count ), unit=self.time_unit, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c686cd0fd39..f07f2c46e41 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5328,10 +5328,13 @@ def _reduce( axis = self._get_axis_from_axis_arg(axis) if axis == 0: - result = [ - getattr(self._data[col], op)(**kwargs) - for col in self._data.names - ] + try: + result = [ + getattr(self._data[col], op)(**kwargs) + for col in self._data.names + ] + except AttributeError: + raise TypeError(f"cannot perform {op} with type {self.dtype}") return Series._from_data( {None: result}, as_index(self._data.names) diff --git a/python/cudf/cudf/core/reductions.py b/python/cudf/cudf/core/reductions.py index 7d787d54ab7..c813aeda1d8 100644 --- a/python/cudf/cudf/core/reductions.py +++ b/python/cudf/cudf/core/reductions.py @@ -1,6 +1,31 @@ # Copyright (c) 2021, NVIDIA CORPORATION. +# TODO: Implement a factory that generates classes encapsulating passthrough +# operations like this. Then we can programmatically create suitable classes +# for reductions, scans, and binops (and perhaps others). + + +# TODO: The docstring for a class's reductions should be taken from formatting +# the reduce method since it needs to support all the same things. + + +# TODO: We have 3 options for how to handle unsupported operations: +# 1. We only monkey-patch in the operations that are well-defined. This is what +# I'm currently doing, and it works, but it precludes mypy checking. +# 2. We define _all_ aggregations in Reducible and then delete the ill-defined +# ones in __init_subclass__. This is basically the inverse of what I'm +# currently doing. Relative to option 1, it replaces mypy false positives +# for false negatives: instead of saying attributes don't exist when they +# do, it will never detect missing attributes that really shouldn't exist +# for certain column types. +# 3. Define all aggregations like in 2 but don't delete them. mypy will treat +# this essentially identically to 2, so we don't get any more useful +# diagnostics, but we run into the additional problem that now all +# reduction functions are defined for all subclasses, even if they aren't +# well-defined operations. + + class Reducible: """Mixin encapsulating for reduction operations. @@ -15,72 +40,53 @@ class Reducible: class encapsulates that paradigm. """ - # TODO: We need a way to indicate supported/unsupported operations. - # Alternatively, we could just leave that up to the implementation of - # _reduce. - # TODO: If supported operations are indicated using a class member list - # (_SUPPORTED_OPERATIONS or so) we could use __init_subclass__ to combine - # all parent lists via walking the MRO. + _SUPPORTED_REDUCTIONS = { + "sum", + "product", + "min", + "max", + "count", + "size", + "any", + "all", + "sum_of_squares", + "mean", + "var", + "std", + "median", + "quantile", + "argmax", + "argmin", + "nunique", + "nth", + "collect", + "unique", + } def _reduce(self, op: str, *args, **kwargs): raise NotImplementedError - def sum(self, *args, **kwargs): - return self._reduce("sum", *args, **kwargs) - - def product(self, *args, **kwargs): - return self._reduce("product", *args, **kwargs) - - # def min(self, *args, **kwargs): - # return self._reduce("min", *args, **kwargs) - # - # def max(self, *args, **kwargs): - # return self._reduce("max", *args, **kwargs) - - # def count(self, *args, **kwargs): - # return self._reduce("count", *args, **kwargs) - # - # def size(self, *args, **kwargs): - # return self._reduce("size", *args, **kwargs) - # - # def any(self, *args, **kwargs): - # return self._reduce("any", *args, **kwargs) - # - # def all(self, *args, **kwargs): - # return self._reduce("all", *args, **kwargs) - - def sum_of_squares(self, *args, **kwargs): - return self._reduce("sum_of_squares", *args, **kwargs) - - def mean(self, *args, **kwargs): - return self._reduce("mean", *args, **kwargs) - - def var(self, *args, **kwargs): - return self._reduce("var", *args, **kwargs) + @classmethod + def _add_reduction(cls, reduction): + def op(self, *args, **kwargs): + return self._reduce(reduction, *args, **kwargs) - def std(self, *args, **kwargs): - return self._reduce("std", *args, **kwargs) + setattr(cls, reduction, op) - # def median(self, *args, **kwargs): - # return self._reduce("median", *args, **kwargs) - # - # def quantile(self, *args, **kwargs): - # return self._reduce("quantile", *args, **kwargs) - # - # def argmax(self, *args, **kwargs): - # return self._reduce("argmax", *args, **kwargs) - # - # def argmin(self, *args, **kwargs): - # return self._reduce("argmin", *args, **kwargs) - # - # def nunique(self, *args, **kwargs): - # return self._reduce("nunique", *args, **kwargs) - # - # def nth(self, *args, **kwargs): - # return self._reduce("nth", *args, **kwargs) + @classmethod + def __init_subclass__(cls): + # Only add the set of reductions that are valid for a particular class. + # Subclasses may override the methods directly if they need special + # behavior. + valid_reductions = set() + for base_cls in cls.__mro__: + valid_reductions |= getattr(base_cls, "_VALID_REDUCTIONS", set()) - # def collect(self, *args, **kwargs): - # return self._reduce("sum", *args, **kwargs) - # - # def unique(self, *args, **kwargs): - # return self._reduce("sum", *args, **kwargs) + assert len(valid_reductions - cls._SUPPORTED_REDUCTIONS) == 0, ( + "Invalid requested reductions " + f"{valid_reductions - cls._SUPPORTED_REDUCTIONS}" + ) + for reduction in valid_reductions: + # Allow overriding the operation in the classes. + if not hasattr(cls, reduction): + cls._add_reduction(reduction) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e96531d4b1c..f2c07d8b492 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2752,7 +2752,10 @@ def cov(self, other, min_periods=None): lhs, rhs = _align_indices([lhs, rhs], how="inner") - return lhs._column.cov(rhs._column) + try: + return lhs._column.cov(rhs._column) + except AttributeError: + raise TypeError(f"cannot perform cov with type {self.dtype}") def transpose(self): """Return the transpose, which is by definition self. @@ -2788,7 +2791,10 @@ def corr(self, other, method="pearson", min_periods=None): rhs = other.nans_to_nulls().dropna() lhs, rhs = _align_indices([lhs, rhs], how="inner") - return lhs._column.corr(rhs._column) + try: + return lhs._column.corr(rhs._column) + except AttributeError: + raise TypeError(f"cannot perform corr with type {self.dtype}") def autocorr(self, lag=1): """Compute the lag-N autocorrelation. This method computes the Pearson diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 7793a2fdf29..44779ff54a7 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -45,7 +45,10 @@ def _reduce( raise NotImplementedError( "numeric_only parameter is not implemented yet" ) - return getattr(self._column, op)(**kwargs) + try: + return getattr(self._column, op)(**kwargs) + except AttributeError: + raise TypeError(f"cannot perform {op} with type {self.dtype}") def _scan(self, op, axis=None, *args, **kwargs): if axis not in (None, 0): From 3435e90ffb3111c7a8d5f6f62e2524acb34316a1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 18 Nov 2021 18:02:43 -0800 Subject: [PATCH 04/39] Make ColumnBase Reducible. --- python/cudf/cudf/core/column/column.py | 40 +++++++++++++------ .../cudf/cudf/core/column/numerical_base.py | 23 +---------- 2 files changed, 29 insertions(+), 34 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d28e12c1eaf..5651867d822 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -68,6 +68,7 @@ ListDtype, StructDtype, ) +from cudf.core.reductions import Reducible from cudf.utils import utils from cudf.utils.dtypes import ( cudf_dtype_from_pa_type, @@ -82,7 +83,14 @@ T = TypeVar("T", bound="ColumnBase") -class ColumnBase(Column, Serializable): +class ColumnBase(Column, Serializable, Reducible): + _VALID_REDUCTIONS = { + "any", + "all", + "max", + "min", + } + def as_frame(self) -> "cudf.core.frame.Frame": """ Converts a Column to Frame @@ -1164,17 +1172,25 @@ def _minmax(self, skipna: bool = None): return libcudf.reduce.minmax(result_col) return result_col - def min(self, skipna: bool = None, dtype: Dtype = None): - result_col = self._process_for_reduction(skipna=skipna) - if isinstance(result_col, ColumnBase): - return libcudf.reduce.reduce("min", result_col, dtype=dtype) - return result_col - - def max(self, skipna: bool = None, dtype: Dtype = None): - result_col = self._process_for_reduction(skipna=skipna) - if isinstance(result_col, ColumnBase): - return libcudf.reduce.reduce("max", result_col, dtype=dtype) - return result_col + def _reduce( + self, op: str, skipna: bool = None, min_count: int = 0, *args, **kwargs + ) -> ScalarLike: + """Perform a reduction operation. + + op : str + The operation to perform. + skipna : bool + Whether or not na values must be skipped. + min_count : int, default 0 + The minimum number of entries for the reduction, otherwise the + reduction returns NaN. + """ + preprocessed = self._process_for_reduction( + skipna=skipna, min_count=min_count + ) + if isinstance(preprocessed, ColumnBase): + return libcudf.reduce.reduce(op, preprocessed, **kwargs) + return preprocessed def nans_to_nulls(self: T) -> T: # Only floats can contain nan. diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 866e864d407..725e15aa3e2 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -15,7 +15,7 @@ from cudf.core.reductions import Reducible -class NumericalBaseColumn(Reducible, ColumnBase): +class NumericalBaseColumn(ColumnBase, Reducible): """A column composed of numerical data. This class encodes a standard interface for different types of columns @@ -33,27 +33,6 @@ class NumericalBaseColumn(Reducible, ColumnBase): "std", } - def _reduce( - self, op: str, skipna: bool = None, min_count: int = 0, *args, **kwargs - ) -> ScalarLike: - """Perform a reduction operation. - - op : str - The operation to perform. - skipna : bool - Whether or not na values must be skipped. - min_count : int, default 0 - The minimum number of entries for the reduction, otherwise the - reduction returns NaN. - """ - preprocessed = self._process_for_reduction( - skipna=skipna, min_count=min_count - ) - if isinstance(preprocessed, ColumnBase): - return libcudf.reduce.reduce(op, preprocessed, **kwargs) - else: - return preprocessed - def _can_return_nan(self, skipna: bool = None) -> bool: return not skipna and self.has_nulls() From 77c77678f2a27b1a09e9877233a5a66e58bd2d53 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Nov 2021 12:15:14 -0800 Subject: [PATCH 05/39] Use Reducible in GroupBy. --- python/cudf/cudf/core/groupby/groupby.py | 70 +++++++----------------- python/cudf/cudf/core/reductions.py | 6 ++ python/cudf/cudf/core/resample.py | 2 + 3 files changed, 27 insertions(+), 51 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index a393d8e9457..441b594c34f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -16,6 +16,7 @@ from cudf.core.abc import Serializable from cudf.core.column.column import arange, as_column from cudf.core.multiindex import MultiIndex +from cudf.core.reductions import Reducible from cudf.utils.utils import GetAttrGetItemMixin, cached_property @@ -34,7 +35,21 @@ def _quantile_75(x): return x.quantile(0.75) -class GroupBy(Serializable): +class GroupBy(Serializable, Reducible): + + _VALID_REDUCTIONS = { + "sum", + "prod", + "idxmin", + "idxmax", + "min", + "max", + "mean", + "median", + "nunique", + "first", + "last", + } _MAX_GROUPS_BEFORE_WARN = 100 @@ -295,6 +310,7 @@ def agg(self, func): return result + _reduce = agg aggregate = agg def nth(self, n): @@ -811,38 +827,6 @@ def describe(self, include=None, exclude=None): ) return res - def sum(self): - """Compute the column-wise sum of the values in each group.""" - return self.agg("sum") - - def prod(self): - """Compute the column-wise product of the values in each group.""" - return self.agg("prod") - - def idxmin(self): - """Get the column-wise index of the minimum value in each group.""" - return self.agg("idxmin") - - def idxmax(self): - """Get the column-wise index of the maximum value in each group.""" - return self.agg("idxmax") - - def min(self): - """Get the column-wise minimum value in each group.""" - return self.agg("min") - - def max(self): - """Get the column-wise maximum value in each group.""" - return self.agg("max") - - def mean(self): - """Compute the column-wise mean of the values in each group.""" - return self.agg("mean") - - def median(self): - """Get the column-wise median of the values in each group.""" - return self.agg("median") - def corr(self, method="pearson", min_periods=1): """ Compute pairwise correlation of columns, excluding NA/null values. @@ -1005,10 +989,6 @@ def func(x): return self.agg(func) - def nunique(self): - """Compute the number of unique values in each column in each group.""" - return self.agg("nunique") - def collect(self): """Get a list of all the values for each column in each group.""" return self.agg("collect") @@ -1030,14 +1010,6 @@ def cummax(self): """Get the column-wise cumulative maximum value in each group.""" return self.agg("cummax") - def first(self): - """Get the first non-null value in each group.""" - return self.agg("first") - - def last(self): - """Get the last non-null value in each group.""" - return self.agg("last") - def diff(self, periods=1, axis=0): """Get the difference between the values in each group. @@ -1367,12 +1339,6 @@ def __getitem__(self, key): by=self.grouping.keys, dropna=self._dropna, sort=self._sort ) - def nunique(self): - """ - Return the number of unique values per group - """ - return self.agg("nunique") - class SeriesGroupBy(GroupBy): """ @@ -1451,6 +1417,8 @@ def agg(self, func): return result + _reduce = agg + def apply(self, func): result = super().apply(func) diff --git a/python/cudf/cudf/core/reductions.py b/python/cudf/cudf/core/reductions.py index c813aeda1d8..a1e4c678f7a 100644 --- a/python/cudf/cudf/core/reductions.py +++ b/python/cudf/cudf/core/reductions.py @@ -61,6 +61,12 @@ class encapsulates that paradigm. "nth", "collect", "unique", + "prod", + "idxmin", + "idxmax", + "nunique", + "first", + "last", } def _reduce(self, op: str, *args, **kwargs): diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index a4810701781..e51b829da95 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -49,6 +49,8 @@ def agg(self, func): else: return result.sort_index() + _reduce = agg + def asfreq(self): return self.obj._align_to_index( self.grouping.bin_labels, From 4d40e902e42c011c138e66c298d686653856a6d9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Nov 2021 15:13:13 -0800 Subject: [PATCH 06/39] Format _reduce to generate docstrings. --- python/cudf/cudf/core/column/column.py | 4 +-- python/cudf/cudf/core/groupby/groupby.py | 43 ++++++++++++++++++++++-- python/cudf/cudf/core/reductions.py | 26 +++++++------- python/cudf/cudf/core/resample.py | 2 -- 4 files changed, 53 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5651867d822..d1543d6fb69 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1175,10 +1175,8 @@ def _minmax(self, skipna: bool = None): def _reduce( self, op: str, skipna: bool = None, min_count: int = 0, *args, **kwargs ) -> ScalarLike: - """Perform a reduction operation. + """Compute {op} of column values. - op : str - The operation to perform. skipna : bool Whether or not na values must be skipped. min_count : int, default 0 diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 441b594c34f..e9dc6aa17e2 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -310,7 +310,46 @@ def agg(self, func): return result - _reduce = agg + def _reduce( + self, + op: str, + numeric_only: bool = False, + min_count: int = 0, + *args, + **kwargs, + ): + """Compute {op} of group values. + + Parameters + ---------- + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. + min_count : int, default 0 + The required number of valid values to perform the operation. If + fewer than ``min_count`` non-NA values are present the result will + be NA. + + Returns + ------- + Series or DataFrame + Computed {op} of values within each group. + + Notes + ----- + Difference from pandas: + * Not supporting: numeric_only, min_count + """ + if numeric_only: + raise NotImplementedError( + "numeric_only parameter is not implemented yet" + ) + if min_count != 0: + raise NotImplementedError( + "min_count parameter is not implemented yet" + ) + return self.agg(op) + aggregate = agg def nth(self, n): @@ -1417,8 +1456,6 @@ def agg(self, func): return result - _reduce = agg - def apply(self, func): result = super().apply(func) diff --git a/python/cudf/cudf/core/reductions.py b/python/cudf/cudf/core/reductions.py index a1e4c678f7a..e554dd34e35 100644 --- a/python/cudf/cudf/core/reductions.py +++ b/python/cudf/cudf/core/reductions.py @@ -10,20 +10,8 @@ # the reduce method since it needs to support all the same things. -# TODO: We have 3 options for how to handle unsupported operations: -# 1. We only monkey-patch in the operations that are well-defined. This is what -# I'm currently doing, and it works, but it precludes mypy checking. -# 2. We define _all_ aggregations in Reducible and then delete the ill-defined -# ones in __init_subclass__. This is basically the inverse of what I'm -# currently doing. Relative to option 1, it replaces mypy false positives -# for false negatives: instead of saying attributes don't exist when they -# do, it will never detect missing attributes that really shouldn't exist -# for certain column types. -# 3. Define all aggregations like in 2 but don't delete them. mypy will treat -# this essentially identically to 2, so we don't get any more useful -# diagnostics, but we run into the additional problem that now all -# reduction functions are defined for all subclasses, even if they aren't -# well-defined operations. +# TODO: Consider using a pyi file to trick mypy into seeing the monkey-patched +# methods. class Reducible: @@ -38,6 +26,13 @@ class Reducible: in terms of a single function that performs necessary pre- and post-processing of a result generated by calling to a lower-level API. This class encapsulates that paradigm. + + Notes + ----- + The documentation for the reductions is generated by formatting the + docstring for _reduce via `cls._reduce.__doc__.format(op=reduction)`. + Therefore, subclasses are responsible for writing an appropriate docstring + for the _reduce method (one that omits documentation of the op parameter). """ _SUPPORTED_REDUCTIONS = { @@ -69,6 +64,7 @@ class encapsulates that paradigm. "last", } + # TODO: Add a return type. def _reduce(self, op: str, *args, **kwargs): raise NotImplementedError @@ -77,6 +73,8 @@ def _add_reduction(cls, reduction): def op(self, *args, **kwargs): return self._reduce(reduction, *args, **kwargs) + # The default docstring is that + op.__doc__ = cls._reduce.__doc__.format(op=reduction) setattr(cls, reduction, op) @classmethod diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index e51b829da95..a4810701781 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -49,8 +49,6 @@ def agg(self, func): else: return result.sort_index() - _reduce = agg - def asfreq(self): return self.obj._align_to_index( self.grouping.bin_labels, From 31772178a57b243da6294a8307ca868c7f2021de Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Nov 2021 15:52:34 -0800 Subject: [PATCH 07/39] Make Rolling Reducible. --- python/cudf/cudf/core/groupby/groupby.py | 2 ++ python/cudf/cudf/core/window/rolling.py | 32 ++++++++++++++++-------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e9dc6aa17e2..b2298f42fb1 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -49,6 +49,8 @@ class GroupBy(Serializable, Reducible): "nunique", "first", "last", + "var", + "std", } _MAX_GROUPS_BEFORE_WARN = 100 diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 0f4256e49a6..059daf99ecd 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -11,11 +11,12 @@ from cudf.api.types import is_integer, is_number from cudf.core import column from cudf.core.column.column import as_column +from cudf.core.reductions import Reducible from cudf.utils import cudautils from cudf.utils.utils import GetAttrGetItemMixin -class Rolling(GetAttrGetItemMixin): +class Rolling(GetAttrGetItemMixin, Reducible): """ Rolling window calculations. @@ -163,6 +164,15 @@ class Rolling(GetAttrGetItemMixin): _time_window = False + _VALID_REDUCTIONS = { + "sum", + "min", + "max", + "mean", + "var", + "std", + } + def __init__( self, obj, @@ -256,17 +266,17 @@ def _apply_agg(self, agg_name): else: return self._apply_agg_dataframe(self.obj, agg_name) - def sum(self): - return self._apply_agg("sum") - - def min(self): - return self._apply_agg("min") - - def max(self): - return self._apply_agg("max") + def _reduce( + self, op: str, *args, **kwargs, + ): + """Calculate the rolling {op}. - def mean(self): - return self._apply_agg("mean") + Returns + ------- + Series or DataFrame + Return type is the same as the original object. + """ + return self._apply_agg(op) def var(self, ddof=1): self.agg_params["ddof"] = ddof From 10eeab2bf710e067475501eb351c6c5654dcdca5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 22 Nov 2021 13:09:53 -0800 Subject: [PATCH 08/39] Add full support for new reduction functions to match docstrings and signatures. --- python/cudf/cudf/core/reductions.py | 61 +++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/reductions.py b/python/cudf/cudf/core/reductions.py index e554dd34e35..c7c8b10118e 100644 --- a/python/cudf/cudf/core/reductions.py +++ b/python/cudf/cudf/core/reductions.py @@ -6,13 +6,20 @@ # for reductions, scans, and binops (and perhaps others). -# TODO: The docstring for a class's reductions should be taken from formatting -# the reduce method since it needs to support all the same things. +# TODO: Figure out how to support examples in the docstrings. # TODO: Consider using a pyi file to trick mypy into seeing the monkey-patched # methods. +# TODO: Add type annotations for the class variables. + +# TODO: Extract signature from _reduce + +# TODO: Take advantage of the new approach to also format the cls into the doc. + +import inspect + class Reducible: """Mixin encapsulating for reduction operations. @@ -70,12 +77,52 @@ def _reduce(self, op: str, *args, **kwargs): @classmethod def _add_reduction(cls, reduction): - def op(self, *args, **kwargs): - return self._reduce(reduction, *args, **kwargs) + # This function creates reduction operations on-the-fly and assigns + # them to the class. + + # Generate a signature without the `op` parameter. + signature = inspect.signature(cls._reduce) + new_params = signature.parameters.copy() + new_params.pop("op") + signature = signature.replace(parameters=new_params.values()) + + # Generate the list of arguments forwarded to _reduce. + arglist = ", ".join( + [ + f"{key}={key}" + for key in signature.parameters + if key not in ("self", "args", "kwargs") + ] + ) + if arglist: + arglist += ", *args, **kwargs" + else: + arglist = "*args, **kwargs" + + # The default docstring is that of the _reduce method. Additional + # formatting arguments may be provided in a class-level dictionary + # of the form _REDUCTION_DOCSTRINGS + docstring = cls._reduce.__doc__.format( + cls=cls.__name__, + op=reduction, + **getattr(cls, "_REDUCTION_DOCSTRINGS", {}).get(reduction, {}), + ) - # The default docstring is that - op.__doc__ = cls._reduce.__doc__.format(op=reduction) - setattr(cls, reduction, op) + # Create the desired function. + namespace = {} + out = """ +def {reduction}{signature}: + \"\"\"{docstring} + \"\"\" + return self._reduce(op="{reduction}", {arglist}) + """.format( + reduction=reduction, + signature=str(signature), + arglist=arglist, + docstring=docstring, + ) + exec(out, namespace) + setattr(cls, reduction, namespace[reduction]) @classmethod def __init_subclass__(cls): From 34cd0b9d1b3390aa3ff7c697840e7b71e01dc825 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 14 Jan 2022 14:36:19 -0800 Subject: [PATCH 09/39] Initial version of pyi file, all reductions working except for quantile and size. --- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/column.py | 12 +-- python/cudf/cudf/core/column/datetime.py | 6 +- .../cudf/cudf/core/column/numerical_base.py | 2 +- python/cudf/cudf/core/column/string.py | 2 +- python/cudf/cudf/core/column/timedelta.py | 10 +-- python/cudf/cudf/core/reductions.py | 8 -- python/cudf/cudf/core/reductions.pyi | 88 +++++++++++++++++++ 8 files changed, 105 insertions(+), 25 deletions(-) create mode 100644 python/cudf/cudf/core/reductions.pyi diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index de06e62cbb1..c210d631aff 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1000,7 +1000,7 @@ def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase": def data_array_view(self) -> cuda.devicearray.DeviceNDArray: return self.codes.data_array_view - def unique(self) -> CategoricalColumn: + def unique(self, *args, **kwargs) -> CategoricalColumn: codes = self.as_numerical.unique() return column.build_categorical_column( categories=self.categories, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d1543d6fb69..741a2b5a860 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -187,7 +187,7 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: return False return self.binary_operator("NULL_EQUALS", other).all() - def all(self, skipna: bool = True) -> bool: + def all(self, skipna: bool = True, *args, **kwargs) -> bool: # If all entries are null the result is True, including when the column # is empty. result_col = self.nans_to_nulls() if skipna else self @@ -200,7 +200,7 @@ def all(self, skipna: bool = True) -> bool: return result_col - def any(self, skipna: bool = True) -> bool: + def any(self, skipna: bool = True, *args, **kwargs) -> bool: # Early exit for fast cases. result_col = self.nans_to_nulls() if skipna else self if not skipna and result_col.has_nulls(): @@ -686,12 +686,12 @@ def append(self, other: ColumnBase) -> ColumnBase: def quantile( self, q: Union[float, Sequence[float]], - interpolation: builtins.str, - exact: bool, + interpolation: str, + exact: bool, *args, **kwargs, ) -> ColumnBase: raise TypeError(f"cannot perform quantile with type {self.dtype}") - def median(self, skipna: bool = None) -> ScalarLike: + def median(self, skipna: bool = None, *args, **kwargs) -> ScalarLike: raise TypeError(f"cannot perform median with type {self.dtype}") def take( @@ -1106,7 +1106,7 @@ def searchsorted( values, side, ascending=ascending, na_position=na_position ) - def unique(self) -> ColumnBase: + def unique(self, *args, **kwargs) -> ColumnBase: """ Get unique values in the data """ diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b763790986a..60645b0110d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -350,21 +350,21 @@ def _default_na_value(self) -> DatetimeLikeScalar: """Returns the default NA value for this column""" return np.datetime64("nat", self.time_unit) - def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: + def mean(self, skipna=None, dtype=np.float64, *args, **kwargs) -> ScalarLike: return pd.Timestamp( self.as_numerical.mean(skipna=skipna, dtype=dtype), unit=self.time_unit, ) def std( - self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 + self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64, *args, **kwargs ) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype) * _numpy_to_pandas_conversion[self.time_unit], ) - def median(self, skipna: bool = None) -> pd.Timestamp: + def median(self, skipna: bool = None, *args, **kwargs) -> pd.Timestamp: return pd.Timestamp( self.as_numerical.median(skipna=skipna), unit=self.time_unit ) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 725e15aa3e2..68adf594462 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -113,7 +113,7 @@ def var(self, dtype=np.float64, *args, **kwargs): def std(self, dtype=np.float64, *args, **kwargs): return self._reduce("std", dtype=dtype, *args, **kwargs) - def median(self, skipna: bool = None) -> NumericalBaseColumn: + def median(self, skipna: bool = None, *args, **kwargs) -> NumericalBaseColumn: skipna = True if skipna is None else skipna if self._can_return_nan(skipna=skipna): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 9b44b4e6831..b286be78117 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5089,7 +5089,7 @@ def to_arrow(self) -> pa.Array: return super().to_arrow() def sum( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0, *args, **kwargs ): result_col = self._process_for_reduction( skipna=skipna, min_count=min_count diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 400214475e7..e92347d42d1 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -369,13 +369,13 @@ def as_timedelta_column(self, dtype: Dtype, **kwargs) -> TimeDeltaColumn: return self return libcudf.unary.cast(self, dtype=dtype) - def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: + def mean(self, skipna=None, dtype: Dtype = np.float64, *args, **kwargs) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.mean(skipna=skipna, dtype=dtype), unit=self.time_unit, ) - def median(self, skipna: bool = None) -> pd.Timedelta: + def median(self, skipna: bool = None, *args, **kwargs) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.median(skipna=skipna), unit=self.time_unit ) @@ -384,7 +384,7 @@ def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) def quantile( - self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool, *args, **kwargs ) -> "column.ColumnBase": result = self.as_numerical.quantile( q=q, interpolation=interpolation, exact=exact @@ -394,7 +394,7 @@ def quantile( return result.astype(self.dtype) def sum( - self, skipna: bool = None, dtype: Dtype = None, min_count=0 + self, skipna: bool = None, dtype: Dtype = None, min_count=0, *args, **kwargs ) -> pd.Timedelta: return pd.Timedelta( # TODO: mypy doesn't fully support monkey-patching, so the @@ -407,7 +407,7 @@ def sum( ) def std( - self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 + self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64, *args, **kwargs ) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype), diff --git a/python/cudf/cudf/core/reductions.py b/python/cudf/cudf/core/reductions.py index c7c8b10118e..ba7026faac4 100644 --- a/python/cudf/cudf/core/reductions.py +++ b/python/cudf/cudf/core/reductions.py @@ -6,18 +6,11 @@ # for reductions, scans, and binops (and perhaps others). -# TODO: Figure out how to support examples in the docstrings. - - # TODO: Consider using a pyi file to trick mypy into seeing the monkey-patched # methods. # TODO: Add type annotations for the class variables. -# TODO: Extract signature from _reduce - -# TODO: Take advantage of the new approach to also format the cls into the doc. - import inspect @@ -66,7 +59,6 @@ class encapsulates that paradigm. "prod", "idxmin", "idxmax", - "nunique", "first", "last", } diff --git a/python/cudf/cudf/core/reductions.pyi b/python/cudf/cudf/core/reductions.pyi new file mode 100644 index 00000000000..d50b551dfa2 --- /dev/null +++ b/python/cudf/cudf/core/reductions.pyi @@ -0,0 +1,88 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from __future__ import annotations + +from typing import Sequence, Union +# +# from cudf._typing import Dtype, DtypeObj, ScalarLike +# from cudf.core.buffer import Buffer +# from cudf.core.column import ColumnBase +# +# T = TypeVar("T") + + +class Reducible: + def sum(self, *args, **kwargs): + ... + + def product(self, *args, **kwargs): + ... + + def min(self, *args, **kwargs): + ... + + def max(self, *args, **kwargs): + ... + + def count(self, *args, **kwargs): + ... + + def size(self, *args, **kwargs): + ... + + def any(self, *args, **kwargs): + ... + + def all(self, *args, **kwargs): + ... + + def sum_of_squares(self, *args, **kwargs): + ... + + def mean(self, *args, **kwargs): + ... + + def var(self, *args, **kwargs): + ... + + def std(self, *args, **kwargs): + ... + + def median(self, *args, **kwargs): + ... + + def quantile(self, q: Union[float, Sequence[float]], interpolation: str, *args, **kwargs): + ... + + def argmax(self, *args, **kwargs): + ... + + def argmin(self, *args, **kwargs): + ... + + def nunique(self, *args, **kwargs): + ... + + def nth(self, *args, **kwargs): + ... + + def collect(self, *args, **kwargs): + ... + + def unique(self, *args, **kwargs): + ... + + def prod(self, *args, **kwargs): + ... + + def idxmin(self, *args, **kwargs): + ... + + def idxmax(self, *args, **kwargs): + ... + + def first(self, *args, **kwargs): + ... + + def last(self, *args, **kwargs): + ... From fce53b89358f5f34b3387fa55fe324cb9d7f9cfa Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 18 Jan 2022 14:39:05 -0800 Subject: [PATCH 10/39] Fix quantile issues. --- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/numerical_base.py | 2 +- python/cudf/cudf/core/reductions.py | 2 +- python/cudf/cudf/core/reductions.pyi | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 60645b0110d..a1431025cf1 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -370,7 +370,7 @@ def median(self, skipna: bool = None, *args, **kwargs) -> pd.Timestamp: ) def quantile( - self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool, *args, **kwargs ) -> ColumnBase: result = self.as_numerical.quantile( q=q, interpolation=interpolation, exact=exact diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 68adf594462..56b56ad3c5c 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -85,7 +85,7 @@ def skew(self, skipna: bool = None) -> ScalarLike: return skew def quantile( - self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool, *args, **kwargs ) -> NumericalBaseColumn: if isinstance(q, Number) or cudf.api.types.is_list_like(q): np_array_q = np.asarray(q) diff --git a/python/cudf/cudf/core/reductions.py b/python/cudf/cudf/core/reductions.py index ba7026faac4..6046714fbcf 100644 --- a/python/cudf/cudf/core/reductions.py +++ b/python/cudf/cudf/core/reductions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2022, NVIDIA CORPORATION. # TODO: Implement a factory that generates classes encapsulating passthrough diff --git a/python/cudf/cudf/core/reductions.pyi b/python/cudf/cudf/core/reductions.pyi index d50b551dfa2..aa1d0e46b71 100644 --- a/python/cudf/cudf/core/reductions.pyi +++ b/python/cudf/cudf/core/reductions.pyi @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2022, NVIDIA CORPORATION. from __future__ import annotations @@ -51,7 +51,7 @@ class Reducible: def median(self, *args, **kwargs): ... - def quantile(self, q: Union[float, Sequence[float]], interpolation: str, *args, **kwargs): + def quantile(self, q: Union[float, Sequence[float]], interpolation: str, exact: bool, *args, **kwargs): ... def argmax(self, *args, **kwargs): From 9d1fb4f20854f761d24ae8d6a5965d0e42d0dc2d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 18 Jan 2022 14:39:34 -0800 Subject: [PATCH 11/39] Remove size reduction. --- python/cudf/cudf/core/reductions.py | 1 - python/cudf/cudf/core/reductions.pyi | 3 --- 2 files changed, 4 deletions(-) diff --git a/python/cudf/cudf/core/reductions.py b/python/cudf/cudf/core/reductions.py index 6046714fbcf..2b8fd1fae14 100644 --- a/python/cudf/cudf/core/reductions.py +++ b/python/cudf/cudf/core/reductions.py @@ -41,7 +41,6 @@ class encapsulates that paradigm. "min", "max", "count", - "size", "any", "all", "sum_of_squares", diff --git a/python/cudf/cudf/core/reductions.pyi b/python/cudf/cudf/core/reductions.pyi index aa1d0e46b71..19b2d7c9306 100644 --- a/python/cudf/cudf/core/reductions.pyi +++ b/python/cudf/cudf/core/reductions.pyi @@ -27,9 +27,6 @@ class Reducible: def count(self, *args, **kwargs): ... - def size(self, *args, **kwargs): - ... - def any(self, *args, **kwargs): ... From 4eaecaa24cc0f38f640919264d57b043b04c91f9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 18 Jan 2022 17:10:53 -0800 Subject: [PATCH 12/39] Write a factory for delegating mixins. --- python/cudf/cudf/core/reductions.py | 256 ++++++++++++++++----------- python/cudf/cudf/core/reductions.pyi | 6 - 2 files changed, 156 insertions(+), 106 deletions(-) diff --git a/python/cudf/cudf/core/reductions.py b/python/cudf/cudf/core/reductions.py index 2b8fd1fae14..e2b1ddaa152 100644 --- a/python/cudf/cudf/core/reductions.py +++ b/python/cudf/cudf/core/reductions.py @@ -1,41 +1,167 @@ # Copyright (c) 2022, NVIDIA CORPORATION. +import inspect -# TODO: Implement a factory that generates classes encapsulating passthrough -# operations like this. Then we can programmatically create suitable classes -# for reductions, scans, and binops (and perhaps others). +def _create_delegating_mixin( + mixin_name, docstring, category_name, operation_name, supported_operations +): + """Factory for mixins defining collections of delegated operations. + + This function generates mixins based on two common paradigms in cuDF: + + 1. Many classes implement similar operations (e.g. `sum`) via a sequence of + calls to lower-level APIs terminating in a libcudf C++ function call. + 2. libcudf groups many operations into categories using a common API. These + APIs usually accept an enum to delinate the specific operation to + perform, e.g. binary operations use the `binary_operator` enum when + calling the `binary_operation` function. cuDF Python mimics this + structure by having operations within a category delegate to a common + internal function (e.g. DataFrame.__add__ calls DataFrame._binaryop). + + This factory creates mixins for a category of operations implemented by via + this delegator pattern. Its usage is best demonstrated by example below. + + Parameters + ---------- + category_name : str + The category of operations for which a mixin is being created. This + name will be used to define the following attributes as shown in the + example below: + - f'_{category_name}_DOCSTRINGS' + - f'_VALID_{category_name}S' + - f'_SUPPORTED_{category_name}S' + operation_name : str + The name given to the core function implementing this category of + operations. The corresponding function is the entrypoint for child + classes. + supported_ops : List[str] + The list of valid operations that subclasses of the resulting mixin may + request to be implemented. + + Examples + -------- + >>> MyFoo = _create_delegating_mixin( + ... "MyFoo", "MyFoo's docstring", "FOO", "do_foo", {"foo1", "foo2"} + ... ) + + >>> # The above is equivalent to defining a class like so: + ... class MyFoo: + ... # The set of valid foo operations. + ... _VALID_FOOS = {"foo1", "foo2"} + ... + ... # This is the method for child classes to override. Note that the + ... # first parameter is always called "op". + ... def do_foo(self, op, *args, **kwargs): + ... raise NotImplementedError + + >>> # MyFoo can be used as follows. + >>> class BarImplementsFoo(MyFoo): + ... _SUPPORTED_FOOS = ("foo1",) + ... _FOO_DOCSTRINGS = {"ret": "42"} + ... + ... # This method's docstring will be formatted and used for all valid + ... # operations. The `op` key is always automatically filled in, while + ... # other keys are formatted using _FOO_DOCSTRINGS. + ... def do_foo(self, op, *args, **kwargs): + ... '''Perform the operation {op}, which returns {ret}.''' + ... return 42 + + >>> bar = BarImplementsFoo() + >>> print(bar.foo1()) + 42 + + >>> # This will raise an AttributeError because foo2 is not supported by + >>> # the BarImplementsFoo subclass of MyFoo. + >>> # print(bar.foo2()) + + >>> # The docstring is formatted with the operation name as well as any + >>> # additional keys provided via the _FOO_DOCSTRINGS parameter. + >>> print(bar.foo1.__doc__) + Perform the operation foo1, which returns 42. + """ -# TODO: Consider using a pyi file to trick mypy into seeing the monkey-patched -# methods. + docstring_attr = f"{category_name}_DOCSTRINGS" + validity_attr = f"_VALID_{category_name}S" + + class OperationMixin: + @classmethod + def _add_operation(cls, operation): + # This function creates operations on-the-fly. + + # Generate a signature without the `op` parameter. + base_operation = getattr(cls, operation_name) + signature = inspect.signature(base_operation) + new_params = signature.parameters.copy() + new_params.pop("op") + signature = signature.replace(parameters=new_params.values()) + + # Generate the list of arguments forwarded to _reduce. + arglist = ", ".join( + [ + f"{key}={key}" + for key in signature.parameters + if key not in ("self", "args", "kwargs") + ] + ) + if arglist: + arglist += ", *args, **kwargs" + else: + arglist = "*args, **kwargs" + + # Apply the formatted docstring of the base operation to the + # operation being created here. + docstring = base_operation.__doc__.format( + cls=cls.__name__, + op=operation, + **getattr(cls, docstring_attr, {}).get(operation, {}), + ) + + namespace = {} + out = f""" +def {operation}{str(signature)}: + '''{docstring} + ''' + return self.{operation_name}(op="{operation}", {arglist}) +""" + exec(out, namespace) + setattr(cls, operation, namespace[operation]) + + @classmethod + def __init_subclass__(cls): + # Only add the valid set of operations for a particular class. + valid_operations = set() + for base_cls in cls.__mro__: + valid_operations |= getattr(base_cls, validity_attr, set()) + + invalid_operations = valid_operations - supported_operations + assert len(invalid_operations) == 0, ( + f"Invalid requested operations: {invalid_operations}" + ) + for operation in valid_operations: + # Check if the operation is already defined so that subclasses + # can override the method if desired. + if not hasattr(cls, operation): + cls._add_operation(operation) + + def _operation(self, op: str, *args, **kwargs): + raise NotImplementedError -# TODO: Add type annotations for the class variables. + OperationMixin.__name__ = mixin_name + OperationMixin.__doc__ = docstring + setattr(OperationMixin, operation_name, _operation) + setattr(OperationMixin, f"_SUPPORTED_{category_name}S", + supported_operations) -import inspect + return OperationMixin -class Reducible: - """Mixin encapsulating for reduction operations. - - Various classes in cuDF support - `reductions `__. In - practice the reductions are implemented via dispatch to a lower-level API. - For example, Frame-like objects dispatch to Columns, which in turn dispatch - to libcudf implementations. As a result, rather than encoding the logic for - different types of reductions, most classes can implement all reductions - in terms of a single function that performs necessary pre- and - post-processing of a result generated by calling to a lower-level API. This - class encapsulates that paradigm. - - Notes - ----- - The documentation for the reductions is generated by formatting the - docstring for _reduce via `cls._reduce.__doc__.format(op=reduction)`. - Therefore, subclasses are responsible for writing an appropriate docstring - for the _reduce method (one that omits documentation of the op parameter). - """ - - _SUPPORTED_REDUCTIONS = { +Reducible = _create_delegating_mixin( + "Reducible", + "Mixin encapsulating for reduction operations.", + "REDUCTION", + "_reduce", + { "sum", "product", "min", @@ -61,74 +187,4 @@ class encapsulates that paradigm. "first", "last", } - - # TODO: Add a return type. - def _reduce(self, op: str, *args, **kwargs): - raise NotImplementedError - - @classmethod - def _add_reduction(cls, reduction): - # This function creates reduction operations on-the-fly and assigns - # them to the class. - - # Generate a signature without the `op` parameter. - signature = inspect.signature(cls._reduce) - new_params = signature.parameters.copy() - new_params.pop("op") - signature = signature.replace(parameters=new_params.values()) - - # Generate the list of arguments forwarded to _reduce. - arglist = ", ".join( - [ - f"{key}={key}" - for key in signature.parameters - if key not in ("self", "args", "kwargs") - ] - ) - if arglist: - arglist += ", *args, **kwargs" - else: - arglist = "*args, **kwargs" - - # The default docstring is that of the _reduce method. Additional - # formatting arguments may be provided in a class-level dictionary - # of the form _REDUCTION_DOCSTRINGS - docstring = cls._reduce.__doc__.format( - cls=cls.__name__, - op=reduction, - **getattr(cls, "_REDUCTION_DOCSTRINGS", {}).get(reduction, {}), - ) - - # Create the desired function. - namespace = {} - out = """ -def {reduction}{signature}: - \"\"\"{docstring} - \"\"\" - return self._reduce(op="{reduction}", {arglist}) - """.format( - reduction=reduction, - signature=str(signature), - arglist=arglist, - docstring=docstring, - ) - exec(out, namespace) - setattr(cls, reduction, namespace[reduction]) - - @classmethod - def __init_subclass__(cls): - # Only add the set of reductions that are valid for a particular class. - # Subclasses may override the methods directly if they need special - # behavior. - valid_reductions = set() - for base_cls in cls.__mro__: - valid_reductions |= getattr(base_cls, "_VALID_REDUCTIONS", set()) - - assert len(valid_reductions - cls._SUPPORTED_REDUCTIONS) == 0, ( - "Invalid requested reductions " - f"{valid_reductions - cls._SUPPORTED_REDUCTIONS}" - ) - for reduction in valid_reductions: - # Allow overriding the operation in the classes. - if not hasattr(cls, reduction): - cls._add_reduction(reduction) +) diff --git a/python/cudf/cudf/core/reductions.pyi b/python/cudf/cudf/core/reductions.pyi index 19b2d7c9306..360bf73482f 100644 --- a/python/cudf/cudf/core/reductions.pyi +++ b/python/cudf/cudf/core/reductions.pyi @@ -3,12 +3,6 @@ from __future__ import annotations from typing import Sequence, Union -# -# from cudf._typing import Dtype, DtypeObj, ScalarLike -# from cudf.core.buffer import Buffer -# from cudf.core.column import ColumnBase -# -# T = TypeVar("T") class Reducible: From c6c2e808f81627d79b6e3840773486c208d39596 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 19 Jan 2022 10:12:49 -0800 Subject: [PATCH 13/39] Move reducible to new subpackage. --- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/column/numerical_base.py | 2 +- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/cudf/cudf/core/mixins/__init__.py | 5 +++++ python/cudf/cudf/core/{ => mixins}/reductions.py | 0 python/cudf/cudf/core/{ => mixins}/reductions.pyi | 0 python/cudf/cudf/core/window/rolling.py | 2 +- 7 files changed, 9 insertions(+), 4 deletions(-) create mode 100644 python/cudf/cudf/core/mixins/__init__.py rename python/cudf/cudf/core/{ => mixins}/reductions.py (100%) rename python/cudf/cudf/core/{ => mixins}/reductions.pyi (100%) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 741a2b5a860..70883f717f4 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -68,7 +68,7 @@ ListDtype, StructDtype, ) -from cudf.core.reductions import Reducible +from cudf.core.mixins import Reducible from cudf.utils import utils from cudf.utils.dtypes import ( cudf_dtype_from_pa_type, diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 56b56ad3c5c..5776363f16c 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -12,7 +12,7 @@ from cudf import _lib as libcudf from cudf._typing import ScalarLike from cudf.core.column import ColumnBase -from cudf.core.reductions import Reducible +from cudf.core.mixins import Reducible class NumericalBaseColumn(ColumnBase, Reducible): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b2298f42fb1..b0bd3694850 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -16,7 +16,7 @@ from cudf.core.abc import Serializable from cudf.core.column.column import arange, as_column from cudf.core.multiindex import MultiIndex -from cudf.core.reductions import Reducible +from cudf.core.mixins import Reducible from cudf.utils.utils import GetAttrGetItemMixin, cached_property diff --git a/python/cudf/cudf/core/mixins/__init__.py b/python/cudf/cudf/core/mixins/__init__.py new file mode 100644 index 00000000000..cecf4c1c7ed --- /dev/null +++ b/python/cudf/cudf/core/mixins/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from .reductions import Reducible + +__all__ = ["Reducible"] diff --git a/python/cudf/cudf/core/reductions.py b/python/cudf/cudf/core/mixins/reductions.py similarity index 100% rename from python/cudf/cudf/core/reductions.py rename to python/cudf/cudf/core/mixins/reductions.py diff --git a/python/cudf/cudf/core/reductions.pyi b/python/cudf/cudf/core/mixins/reductions.pyi similarity index 100% rename from python/cudf/cudf/core/reductions.pyi rename to python/cudf/cudf/core/mixins/reductions.pyi diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 059daf99ecd..ce57c2c1215 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -11,7 +11,7 @@ from cudf.api.types import is_integer, is_number from cudf.core import column from cudf.core.column.column import as_column -from cudf.core.reductions import Reducible +from cudf.core.mixins import Reducible from cudf.utils import cudautils from cudf.utils.utils import GetAttrGetItemMixin From c43a1a66e1a56c89a5086f977900d077e0765f66 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 19 Jan 2022 16:33:38 -0800 Subject: [PATCH 14/39] Move factory to a separate module. --- python/cudf/cudf/core/mixins/mixin_factory.py | 156 ++++++++++++++++++ python/cudf/cudf/core/mixins/reductions.py | 155 +---------------- 2 files changed, 157 insertions(+), 154 deletions(-) create mode 100644 python/cudf/cudf/core/mixins/mixin_factory.py diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py new file mode 100644 index 00000000000..3070af36f0f --- /dev/null +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -0,0 +1,156 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +import inspect + + +def _create_delegating_mixin( + mixin_name, docstring, category_name, operation_name, supported_operations +): + """Factory for mixins defining collections of delegated operations. + + This function generates mixins based on two common paradigms in cuDF: + + 1. Many classes implement similar operations (e.g. `sum`) via a sequence of + calls to lower-level APIs terminating in a libcudf C++ function call. + 2. libcudf groups many operations into categories using a common API. These + APIs usually accept an enum to delinate the specific operation to + perform, e.g. binary operations use the `binary_operator` enum when + calling the `binary_operation` function. cuDF Python mimics this + structure by having operations within a category delegate to a common + internal function (e.g. DataFrame.__add__ calls DataFrame._binaryop). + + This factory creates mixins for a category of operations implemented by via + this delegator pattern. Its usage is best demonstrated by example below. + + Parameters + ---------- + category_name : str + The category of operations for which a mixin is being created. This + name will be used to define the following attributes as shown in the + example below: + - f'_{category_name}_DOCSTRINGS' + - f'_VALID_{category_name}S' + - f'_SUPPORTED_{category_name}S' + operation_name : str + The name given to the core function implementing this category of + operations. The corresponding function is the entrypoint for child + classes. + supported_ops : List[str] + The list of valid operations that subclasses of the resulting mixin may + request to be implemented. + + Examples + -------- + >>> MyFoo = _create_delegating_mixin( + ... "MyFoo", "MyFoo's docstring", "FOO", "do_foo", {"foo1", "foo2"} + ... ) + + >>> # The above is equivalent to defining a class like so: + ... class MyFoo: + ... # The set of valid foo operations. + ... _VALID_FOOS = {"foo1", "foo2"} + ... + ... # This is the method for child classes to override. Note that the + ... # first parameter is always called "op". + ... def do_foo(self, op, *args, **kwargs): + ... raise NotImplementedError + + >>> # MyFoo can be used as follows. + >>> class BarImplementsFoo(MyFoo): + ... _SUPPORTED_FOOS = ("foo1",) + ... _FOO_DOCSTRINGS = {"ret": "42"} + ... + ... # This method's docstring will be formatted and used for all valid + ... # operations. The `op` key is always automatically filled in, while + ... # other keys are formatted using _FOO_DOCSTRINGS. + ... def do_foo(self, op, *args, **kwargs): + ... '''Perform the operation {op}, which returns {ret}.''' + ... return 42 + + >>> bar = BarImplementsFoo() + >>> print(bar.foo1()) + 42 + + >>> # This will raise an AttributeError because foo2 is not supported by + >>> # the BarImplementsFoo subclass of MyFoo. + >>> # print(bar.foo2()) + + >>> # The docstring is formatted with the operation name as well as any + >>> # additional keys provided via the _FOO_DOCSTRINGS parameter. + >>> print(bar.foo1.__doc__) + Perform the operation foo1, which returns 42. + """ + + docstring_attr = f"{category_name}_DOCSTRINGS" + validity_attr = f"_VALID_{category_name}S" + + class OperationMixin: + @classmethod + def _add_operation(cls, operation): + # This function creates operations on-the-fly. + + # Generate a signature without the `op` parameter. + base_operation = getattr(cls, operation_name) + signature = inspect.signature(base_operation) + new_params = signature.parameters.copy() + new_params.pop("op") + signature = signature.replace(parameters=new_params.values()) + + # Generate the list of arguments forwarded to _reduce. + arglist = ", ".join( + [ + f"{key}={key}" + for key in signature.parameters + if key not in ("self", "args", "kwargs") + ] + ) + if arglist: + arglist += ", *args, **kwargs" + else: + arglist = "*args, **kwargs" + + # Apply the formatted docstring of the base operation to the + # operation being created here. + docstring = base_operation.__doc__.format( + cls=cls.__name__, + op=operation, + **getattr(cls, docstring_attr, {}).get(operation, {}), + ) + + namespace = {} + out = f""" +def {operation}{str(signature)}: + '''{docstring} + ''' + return self.{operation_name}(op="{operation}", {arglist}) +""" + exec(out, namespace) + setattr(cls, operation, namespace[operation]) + + @classmethod + def __init_subclass__(cls): + # Only add the valid set of operations for a particular class. + valid_operations = set() + for base_cls in cls.__mro__: + valid_operations |= getattr(base_cls, validity_attr, set()) + + invalid_operations = valid_operations - supported_operations + assert len(invalid_operations) == 0, ( + f"Invalid requested operations: {invalid_operations}" + ) + for operation in valid_operations: + # Check if the operation is already defined so that subclasses + # can override the method if desired. + if not hasattr(cls, operation): + cls._add_operation(operation) + + def _operation(self, op: str, *args, **kwargs): + raise NotImplementedError + + OperationMixin.__name__ = mixin_name + OperationMixin.__doc__ = docstring + setattr(OperationMixin, operation_name, _operation) + setattr(OperationMixin, f"_SUPPORTED_{category_name}S", + supported_operations) + + return OperationMixin diff --git a/python/cudf/cudf/core/mixins/reductions.py b/python/cudf/cudf/core/mixins/reductions.py index e2b1ddaa152..8a3e436cb54 100644 --- a/python/cudf/cudf/core/mixins/reductions.py +++ b/python/cudf/cudf/core/mixins/reductions.py @@ -1,159 +1,6 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -import inspect - - -def _create_delegating_mixin( - mixin_name, docstring, category_name, operation_name, supported_operations -): - """Factory for mixins defining collections of delegated operations. - - This function generates mixins based on two common paradigms in cuDF: - - 1. Many classes implement similar operations (e.g. `sum`) via a sequence of - calls to lower-level APIs terminating in a libcudf C++ function call. - 2. libcudf groups many operations into categories using a common API. These - APIs usually accept an enum to delinate the specific operation to - perform, e.g. binary operations use the `binary_operator` enum when - calling the `binary_operation` function. cuDF Python mimics this - structure by having operations within a category delegate to a common - internal function (e.g. DataFrame.__add__ calls DataFrame._binaryop). - - This factory creates mixins for a category of operations implemented by via - this delegator pattern. Its usage is best demonstrated by example below. - - Parameters - ---------- - category_name : str - The category of operations for which a mixin is being created. This - name will be used to define the following attributes as shown in the - example below: - - f'_{category_name}_DOCSTRINGS' - - f'_VALID_{category_name}S' - - f'_SUPPORTED_{category_name}S' - operation_name : str - The name given to the core function implementing this category of - operations. The corresponding function is the entrypoint for child - classes. - supported_ops : List[str] - The list of valid operations that subclasses of the resulting mixin may - request to be implemented. - - Examples - -------- - >>> MyFoo = _create_delegating_mixin( - ... "MyFoo", "MyFoo's docstring", "FOO", "do_foo", {"foo1", "foo2"} - ... ) - - >>> # The above is equivalent to defining a class like so: - ... class MyFoo: - ... # The set of valid foo operations. - ... _VALID_FOOS = {"foo1", "foo2"} - ... - ... # This is the method for child classes to override. Note that the - ... # first parameter is always called "op". - ... def do_foo(self, op, *args, **kwargs): - ... raise NotImplementedError - - >>> # MyFoo can be used as follows. - >>> class BarImplementsFoo(MyFoo): - ... _SUPPORTED_FOOS = ("foo1",) - ... _FOO_DOCSTRINGS = {"ret": "42"} - ... - ... # This method's docstring will be formatted and used for all valid - ... # operations. The `op` key is always automatically filled in, while - ... # other keys are formatted using _FOO_DOCSTRINGS. - ... def do_foo(self, op, *args, **kwargs): - ... '''Perform the operation {op}, which returns {ret}.''' - ... return 42 - - >>> bar = BarImplementsFoo() - >>> print(bar.foo1()) - 42 - - >>> # This will raise an AttributeError because foo2 is not supported by - >>> # the BarImplementsFoo subclass of MyFoo. - >>> # print(bar.foo2()) - - >>> # The docstring is formatted with the operation name as well as any - >>> # additional keys provided via the _FOO_DOCSTRINGS parameter. - >>> print(bar.foo1.__doc__) - Perform the operation foo1, which returns 42. - """ - - docstring_attr = f"{category_name}_DOCSTRINGS" - validity_attr = f"_VALID_{category_name}S" - - class OperationMixin: - @classmethod - def _add_operation(cls, operation): - # This function creates operations on-the-fly. - - # Generate a signature without the `op` parameter. - base_operation = getattr(cls, operation_name) - signature = inspect.signature(base_operation) - new_params = signature.parameters.copy() - new_params.pop("op") - signature = signature.replace(parameters=new_params.values()) - - # Generate the list of arguments forwarded to _reduce. - arglist = ", ".join( - [ - f"{key}={key}" - for key in signature.parameters - if key not in ("self", "args", "kwargs") - ] - ) - if arglist: - arglist += ", *args, **kwargs" - else: - arglist = "*args, **kwargs" - - # Apply the formatted docstring of the base operation to the - # operation being created here. - docstring = base_operation.__doc__.format( - cls=cls.__name__, - op=operation, - **getattr(cls, docstring_attr, {}).get(operation, {}), - ) - - namespace = {} - out = f""" -def {operation}{str(signature)}: - '''{docstring} - ''' - return self.{operation_name}(op="{operation}", {arglist}) -""" - exec(out, namespace) - setattr(cls, operation, namespace[operation]) - - @classmethod - def __init_subclass__(cls): - # Only add the valid set of operations for a particular class. - valid_operations = set() - for base_cls in cls.__mro__: - valid_operations |= getattr(base_cls, validity_attr, set()) - - invalid_operations = valid_operations - supported_operations - assert len(invalid_operations) == 0, ( - f"Invalid requested operations: {invalid_operations}" - ) - for operation in valid_operations: - # Check if the operation is already defined so that subclasses - # can override the method if desired. - if not hasattr(cls, operation): - cls._add_operation(operation) - - def _operation(self, op: str, *args, **kwargs): - raise NotImplementedError - - OperationMixin.__name__ = mixin_name - OperationMixin.__doc__ = docstring - setattr(OperationMixin, operation_name, _operation) - setattr(OperationMixin, f"_SUPPORTED_{category_name}S", - supported_operations) - - return OperationMixin +from .mixin_factory import _create_delegating_mixin Reducible = _create_delegating_mixin( From 01d15743c1ca73f84974fbb0f0e5c1d02e6df843 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 19 Jan 2022 17:05:03 -0800 Subject: [PATCH 15/39] Some final cleanup and polishing of documentation. --- python/cudf/cudf/core/mixins/mixin_factory.py | 31 +++++++++++++------ python/cudf/cudf/core/mixins/reductions.py | 5 ++- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index 3070af36f0f..dc0554e0341 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -10,17 +10,24 @@ def _create_delegating_mixin( This function generates mixins based on two common paradigms in cuDF: - 1. Many classes implement similar operations (e.g. `sum`) via a sequence of - calls to lower-level APIs terminating in a libcudf C++ function call. - 2. libcudf groups many operations into categories using a common API. These + 1. libcudf groups many operations into categories using a common API. These APIs usually accept an enum to delinate the specific operation to perform, e.g. binary operations use the `binary_operator` enum when calling the `binary_operation` function. cuDF Python mimics this structure by having operations within a category delegate to a common internal function (e.g. DataFrame.__add__ calls DataFrame._binaryop). + 2. Many cuDF classes implement similar operations (e.g. `sum`) via + delegation to lower-level APIs before reaching a libcudf C++ function + call. As a result, many API function calls actually involve multiple + delegations to lower-level APIs that can look essentially identical. An + example of such a sequence would be DataFrame.sum -> DataFrame._reduce + -> Column.sum -> Column._reduce -> libcudf. This factory creates mixins for a category of operations implemented by via - this delegator pattern. Its usage is best demonstrated by example below. + this delegator pattern. The resulting mixins make it easy to share common + functions across various classes while also providing a common entrypoint + for implementing the centralized logic for a given category of operations. + Its usage is best demonstrated by example below. Parameters ---------- @@ -96,7 +103,7 @@ def _add_operation(cls, operation): new_params.pop("op") signature = signature.replace(parameters=new_params.values()) - # Generate the list of arguments forwarded to _reduce. + # Generate the list of arguments forwarded to the base operation. arglist = ", ".join( [ f"{key}={key}" @@ -135,9 +142,9 @@ def __init_subclass__(cls): valid_operations |= getattr(base_cls, validity_attr, set()) invalid_operations = valid_operations - supported_operations - assert len(invalid_operations) == 0, ( - f"Invalid requested operations: {invalid_operations}" - ) + assert ( + len(invalid_operations) == 0 + ), f"Invalid requested operations: {invalid_operations}" for operation in valid_operations: # Check if the operation is already defined so that subclasses # can override the method if desired. @@ -150,7 +157,11 @@ def _operation(self, op: str, *args, **kwargs): OperationMixin.__name__ = mixin_name OperationMixin.__doc__ = docstring setattr(OperationMixin, operation_name, _operation) - setattr(OperationMixin, f"_SUPPORTED_{category_name}S", - supported_operations) + # This attribute is set in case lookup is convenient at a later point, but + # it is not strictly necessary since `supported_operations` is part of the + # closure associated with the class's creation. + setattr( + OperationMixin, f"_SUPPORTED_{category_name}S", supported_operations + ) return OperationMixin diff --git a/python/cudf/cudf/core/mixins/reductions.py b/python/cudf/cudf/core/mixins/reductions.py index 8a3e436cb54..24e5319b9a3 100644 --- a/python/cudf/cudf/core/mixins/reductions.py +++ b/python/cudf/cudf/core/mixins/reductions.py @@ -2,10 +2,9 @@ from .mixin_factory import _create_delegating_mixin - Reducible = _create_delegating_mixin( "Reducible", - "Mixin encapsulating for reduction operations.", + "Mixin encapsulating reduction operations.", "REDUCTION", "_reduce", { @@ -33,5 +32,5 @@ "idxmax", "first", "last", - } + }, ) From ba6e98f5fa24ace3a2c5bb3bfbee91ff09f6ea1f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 24 Jan 2022 12:06:49 -0800 Subject: [PATCH 16/39] Fix style. --- python/cudf/cudf/core/column/column.py | 4 ++- python/cudf/cudf/core/column/datetime.py | 18 ++++++++++--- .../cudf/cudf/core/column/numerical_base.py | 11 ++++++-- python/cudf/cudf/core/column/string.py | 7 +++++- python/cudf/cudf/core/column/timedelta.py | 25 ++++++++++++++++--- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/cudf/cudf/core/mixins/reductions.pyi | 1 - 7 files changed, 55 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 70883f717f4..6b95d9e99fe 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -687,7 +687,9 @@ def quantile( self, q: Union[float, Sequence[float]], interpolation: str, - exact: bool, *args, **kwargs, + exact: bool, + *args, + **kwargs, ) -> ColumnBase: raise TypeError(f"cannot perform quantile with type {self.dtype}") diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index a1431025cf1..f3a6da93cd0 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -350,14 +350,21 @@ def _default_na_value(self) -> DatetimeLikeScalar: """Returns the default NA value for this column""" return np.datetime64("nat", self.time_unit) - def mean(self, skipna=None, dtype=np.float64, *args, **kwargs) -> ScalarLike: + def mean( + self, skipna=None, dtype=np.float64, *args, **kwargs + ) -> ScalarLike: return pd.Timestamp( self.as_numerical.mean(skipna=skipna, dtype=dtype), unit=self.time_unit, ) def std( - self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64, *args, **kwargs + self, + skipna: bool = None, + ddof: int = 1, + dtype: Dtype = np.float64, + *args, + **kwargs, ) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype) @@ -370,7 +377,12 @@ def median(self, skipna: bool = None, *args, **kwargs) -> pd.Timestamp: ) def quantile( - self, q: Union[float, Sequence[float]], interpolation: str, exact: bool, *args, **kwargs + self, + q: Union[float, Sequence[float]], + interpolation: str, + exact: bool, + *args, + **kwargs, ) -> ColumnBase: result = self.as_numerical.quantile( q=q, interpolation=interpolation, exact=exact diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 5776363f16c..583eb59bb4b 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -85,7 +85,12 @@ def skew(self, skipna: bool = None) -> ScalarLike: return skew def quantile( - self, q: Union[float, Sequence[float]], interpolation: str, exact: bool, *args, **kwargs + self, + q: Union[float, Sequence[float]], + interpolation: str, + exact: bool, + *args, + **kwargs, ) -> NumericalBaseColumn: if isinstance(q, Number) or cudf.api.types.is_list_like(q): np_array_q = np.asarray(q) @@ -113,7 +118,9 @@ def var(self, dtype=np.float64, *args, **kwargs): def std(self, dtype=np.float64, *args, **kwargs): return self._reduce("std", dtype=dtype, *args, **kwargs) - def median(self, skipna: bool = None, *args, **kwargs) -> NumericalBaseColumn: + def median( + self, skipna: bool = None, *args, **kwargs + ) -> NumericalBaseColumn: skipna = True if skipna is None else skipna if self._can_return_nan(skipna=skipna): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index b286be78117..4c503ba5a0b 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5089,7 +5089,12 @@ def to_arrow(self) -> pa.Array: return super().to_arrow() def sum( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0, *args, **kwargs + self, + skipna: bool = None, + dtype: Dtype = None, + min_count: int = 0, + *args, + **kwargs, ): result_col = self._process_for_reduction( skipna=skipna, min_count=min_count diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index e92347d42d1..cea261205f6 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -369,7 +369,9 @@ def as_timedelta_column(self, dtype: Dtype, **kwargs) -> TimeDeltaColumn: return self return libcudf.unary.cast(self, dtype=dtype) - def mean(self, skipna=None, dtype: Dtype = np.float64, *args, **kwargs) -> pd.Timedelta: + def mean( + self, skipna=None, dtype: Dtype = np.float64, *args, **kwargs + ) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.mean(skipna=skipna, dtype=dtype), unit=self.time_unit, @@ -384,7 +386,12 @@ def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) def quantile( - self, q: Union[float, Sequence[float]], interpolation: str, exact: bool, *args, **kwargs + self, + q: Union[float, Sequence[float]], + interpolation: str, + exact: bool, + *args, + **kwargs, ) -> "column.ColumnBase": result = self.as_numerical.quantile( q=q, interpolation=interpolation, exact=exact @@ -394,7 +401,12 @@ def quantile( return result.astype(self.dtype) def sum( - self, skipna: bool = None, dtype: Dtype = None, min_count=0, *args, **kwargs + self, + skipna: bool = None, + dtype: Dtype = None, + min_count=0, + *args, + **kwargs, ) -> pd.Timedelta: return pd.Timedelta( # TODO: mypy doesn't fully support monkey-patching, so the @@ -407,7 +419,12 @@ def sum( ) def std( - self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64, *args, **kwargs + self, + skipna: bool = None, + ddof: int = 1, + dtype: Dtype = np.float64, + *args, + **kwargs, ) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype), diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b0bd3694850..ec53b4ed93a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -15,8 +15,8 @@ from cudf.api.types import is_list_like from cudf.core.abc import Serializable from cudf.core.column.column import arange, as_column -from cudf.core.multiindex import MultiIndex from cudf.core.mixins import Reducible +from cudf.core.multiindex import MultiIndex from cudf.utils.utils import GetAttrGetItemMixin, cached_property diff --git a/python/cudf/cudf/core/mixins/reductions.pyi b/python/cudf/cudf/core/mixins/reductions.pyi index 360bf73482f..ece904f5c2a 100644 --- a/python/cudf/cudf/core/mixins/reductions.pyi +++ b/python/cudf/cudf/core/mixins/reductions.pyi @@ -4,7 +4,6 @@ from __future__ import annotations from typing import Sequence, Union - class Reducible: def sum(self, *args, **kwargs): ... From 8fbcd73957bf1d318505fa8fa9cd950543ffeb98 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 24 Jan 2022 15:08:41 -0800 Subject: [PATCH 17/39] Some minor simplifications and clarifications. --- python/cudf/cudf/core/mixins/mixin_factory.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index dc0554e0341..d7efe2674d3 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -98,10 +98,9 @@ def _add_operation(cls, operation): # Generate a signature without the `op` parameter. base_operation = getattr(cls, operation_name) - signature = inspect.signature(base_operation) - new_params = signature.parameters.copy() - new_params.pop("op") - signature = signature.replace(parameters=new_params.values()) + base_params = inspect.signature(base_operation).parameters.values() + params = [p for p in base_params if p.name != "op"] + signature = inspect.Signature(parameters=params) # Generate the list of arguments forwarded to the base operation. arglist = ", ".join( @@ -111,10 +110,7 @@ def _add_operation(cls, operation): if key not in ("self", "args", "kwargs") ] ) - if arglist: - arglist += ", *args, **kwargs" - else: - arglist = "*args, **kwargs" + arglist = arglist + (", " if arglist else "") + "*args, **kwargs" # Apply the formatted docstring of the base operation to the # operation being created here. @@ -125,13 +121,13 @@ def _add_operation(cls, operation): ) namespace = {} - out = f""" + exec_str = f""" def {operation}{str(signature)}: '''{docstring} ''' return self.{operation_name}(op="{operation}", {arglist}) """ - exec(out, namespace) + exec(exec_str, namespace) setattr(cls, operation, namespace[operation]) @classmethod From d2829cf35d0786720d800be0ed87c931e0f3e877 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 24 Jan 2022 15:25:17 -0800 Subject: [PATCH 18/39] Fix error messages and remove unnecessary comparison in tests. --- python/cudf/cudf/core/series.py | 9 +++++++-- python/cudf/cudf/tests/test_stats.py | 9 ++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f2c07d8b492..7e1b8549f06 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2755,7 +2755,10 @@ def cov(self, other, min_periods=None): try: return lhs._column.cov(rhs._column) except AttributeError: - raise TypeError(f"cannot perform cov with type {self.dtype}") + raise TypeError( + f"cannot perform covariance with types {self.dtype}, " + f"{other.dtype}" + ) def transpose(self): """Return the transpose, which is by definition self. @@ -2794,7 +2797,9 @@ def corr(self, other, method="pearson", min_periods=None): try: return lhs._column.corr(rhs._column) except AttributeError: - raise TypeError(f"cannot perform corr with type {self.dtype}") + raise TypeError( + f"cannot perform corr with types {self.dtype}, {other.dtype}" + ) def autocorr(self, lag=1): """Compute the lag-N autocorrelation. This method computes the Pearson diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 142ca6c6831..8ae77936e46 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -1,6 +1,5 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. -import re from concurrent.futures import ThreadPoolExecutor import numpy as np @@ -510,9 +509,7 @@ def test_cov_corr_invalid_dtypes(gsr): rfunc=gsr.corr, lfunc_args_and_kwargs=([psr],), rfunc_args_and_kwargs=([gsr],), - expected_error_message=re.escape( - f"cannot perform corr with types {gsr.dtype}, {gsr.dtype}" - ), + compare_error_message=False, ) assert_exceptions_equal( @@ -520,7 +517,5 @@ def test_cov_corr_invalid_dtypes(gsr): rfunc=gsr.cov, lfunc_args_and_kwargs=([psr],), rfunc_args_and_kwargs=([gsr],), - expected_error_message=re.escape( - f"cannot perform covarience with types {gsr.dtype}, {gsr.dtype}" - ), + compare_error_message=False, ) From a63e0c3966b8cfc42a90d2899d211a0562d43b01 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 24 Jan 2022 15:29:52 -0800 Subject: [PATCH 19/39] Remove quantile from reductions since it doesn't strictly follow the paradigm (may result in the same dimensionality if multiple quantiles are requested) and it is always overridden anyway. --- python/cudf/cudf/core/column/column.py | 7 +------ python/cudf/cudf/core/column/datetime.py | 7 +------ python/cudf/cudf/core/column/numerical_base.py | 7 +------ python/cudf/cudf/core/column/timedelta.py | 7 +------ python/cudf/cudf/core/mixins/reductions.py | 1 - python/cudf/cudf/core/mixins/reductions.pyi | 5 ----- 6 files changed, 4 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 6b95d9e99fe..57767e019da 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -684,12 +684,7 @@ def append(self, other: ColumnBase) -> ColumnBase: return concat_columns([self, as_column(other)]) def quantile( - self, - q: Union[float, Sequence[float]], - interpolation: str, - exact: bool, - *args, - **kwargs, + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool ) -> ColumnBase: raise TypeError(f"cannot perform quantile with type {self.dtype}") diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index f3a6da93cd0..64f44763454 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -377,12 +377,7 @@ def median(self, skipna: bool = None, *args, **kwargs) -> pd.Timestamp: ) def quantile( - self, - q: Union[float, Sequence[float]], - interpolation: str, - exact: bool, - *args, - **kwargs, + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool ) -> ColumnBase: result = self.as_numerical.quantile( q=q, interpolation=interpolation, exact=exact diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 583eb59bb4b..70f7fc4444d 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -85,12 +85,7 @@ def skew(self, skipna: bool = None) -> ScalarLike: return skew def quantile( - self, - q: Union[float, Sequence[float]], - interpolation: str, - exact: bool, - *args, - **kwargs, + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool ) -> NumericalBaseColumn: if isinstance(q, Number) or cudf.api.types.is_list_like(q): np_array_q = np.asarray(q) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index cea261205f6..42c294cfd0c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -386,12 +386,7 @@ def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) def quantile( - self, - q: Union[float, Sequence[float]], - interpolation: str, - exact: bool, - *args, - **kwargs, + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool ) -> "column.ColumnBase": result = self.as_numerical.quantile( q=q, interpolation=interpolation, exact=exact diff --git a/python/cudf/cudf/core/mixins/reductions.py b/python/cudf/cudf/core/mixins/reductions.py index 24e5319b9a3..f73f0e8fbc6 100644 --- a/python/cudf/cudf/core/mixins/reductions.py +++ b/python/cudf/cudf/core/mixins/reductions.py @@ -20,7 +20,6 @@ "var", "std", "median", - "quantile", "argmax", "argmin", "nunique", diff --git a/python/cudf/cudf/core/mixins/reductions.pyi b/python/cudf/cudf/core/mixins/reductions.pyi index ece904f5c2a..1ce2d8acdd6 100644 --- a/python/cudf/cudf/core/mixins/reductions.pyi +++ b/python/cudf/cudf/core/mixins/reductions.pyi @@ -2,8 +2,6 @@ from __future__ import annotations -from typing import Sequence, Union - class Reducible: def sum(self, *args, **kwargs): ... @@ -41,9 +39,6 @@ class Reducible: def median(self, *args, **kwargs): ... - def quantile(self, q: Union[float, Sequence[float]], interpolation: str, exact: bool, *args, **kwargs): - ... - def argmax(self, *args, **kwargs): ... From 2150f60b492f8d9c557648a36a0afc7252373b58 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 27 Jan 2022 11:35:25 -0800 Subject: [PATCH 20/39] Document missing parameters and remove unnecessary Reducible inheritance. --- python/cudf/cudf/core/column/numerical_base.py | 3 +-- python/cudf/cudf/core/mixins/mixin_factory.py | 6 ++++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 70f7fc4444d..5e7d63693e9 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -12,10 +12,9 @@ from cudf import _lib as libcudf from cudf._typing import ScalarLike from cudf.core.column import ColumnBase -from cudf.core.mixins import Reducible -class NumericalBaseColumn(ColumnBase, Reducible): +class NumericalBaseColumn(ColumnBase): """A column composed of numerical data. This class encodes a standard interface for different types of columns diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index d7efe2674d3..5377f0f1002 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -31,6 +31,12 @@ def _create_delegating_mixin( Parameters ---------- + mixin_name : str + The name of the class. This argument should be the same as the object + that this function's output is assigned to, e.g. + :code:`Baz = _create_delegating_mixin("Baz", ...)`. + docstring : str + The documentation string for the mixin class. category_name : str The category of operations for which a mixin is being created. This name will be used to define the following attributes as shown in the From c9395cb3be271c5782a0a2930b0368e64f24456d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 27 Jan 2022 12:14:47 -0800 Subject: [PATCH 21/39] Remove all superfluous args/kwargs parameters. --- python/cudf/cudf/core/column/column.py | 9 ++-- python/cudf/cudf/core/column/datetime.py | 6 +-- .../cudf/cudf/core/column/numerical_base.py | 4 +- python/cudf/cudf/core/column/string.py | 7 +-- python/cudf/cudf/core/mixins/reductions.pyi | 47 +++++++++---------- 5 files changed, 29 insertions(+), 44 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 57767e019da..8640eaa7432 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -187,7 +187,7 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: return False return self.binary_operator("NULL_EQUALS", other).all() - def all(self, skipna: bool = True, *args, **kwargs) -> bool: + def all(self, skipna: bool = True) -> bool: # If all entries are null the result is True, including when the column # is empty. result_col = self.nans_to_nulls() if skipna else self @@ -200,7 +200,7 @@ def all(self, skipna: bool = True, *args, **kwargs) -> bool: return result_col - def any(self, skipna: bool = True, *args, **kwargs) -> bool: + def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. result_col = self.nans_to_nulls() if skipna else self if not skipna and result_col.has_nulls(): @@ -688,9 +688,6 @@ def quantile( ) -> ColumnBase: raise TypeError(f"cannot perform quantile with type {self.dtype}") - def median(self, skipna: bool = None, *args, **kwargs) -> ScalarLike: - raise TypeError(f"cannot perform median with type {self.dtype}") - def take( self: T, indices: ColumnBase, nullify: bool = False, check_bounds=True ) -> T: @@ -1103,7 +1100,7 @@ def searchsorted( values, side, ascending=ascending, na_position=na_position ) - def unique(self, *args, **kwargs) -> ColumnBase: + def unique(self) -> ColumnBase: """ Get unique values in the data """ diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 64f44763454..e7442e1236b 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -350,9 +350,7 @@ def _default_na_value(self) -> DatetimeLikeScalar: """Returns the default NA value for this column""" return np.datetime64("nat", self.time_unit) - def mean( - self, skipna=None, dtype=np.float64, *args, **kwargs - ) -> ScalarLike: + def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: return pd.Timestamp( self.as_numerical.mean(skipna=skipna, dtype=dtype), unit=self.time_unit, @@ -371,7 +369,7 @@ def std( * _numpy_to_pandas_conversion[self.time_unit], ) - def median(self, skipna: bool = None, *args, **kwargs) -> pd.Timestamp: + def median(self, skipna: bool = None) -> pd.Timestamp: return pd.Timestamp( self.as_numerical.median(skipna=skipna), unit=self.time_unit ) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 5e7d63693e9..a6a124e1ac4 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -112,9 +112,7 @@ def var(self, dtype=np.float64, *args, **kwargs): def std(self, dtype=np.float64, *args, **kwargs): return self._reduce("std", dtype=dtype, *args, **kwargs) - def median( - self, skipna: bool = None, *args, **kwargs - ) -> NumericalBaseColumn: + def median(self, skipna: bool = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna if self._can_return_nan(skipna=skipna): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 4c503ba5a0b..e2c321381d8 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5089,12 +5089,7 @@ def to_arrow(self) -> pa.Array: return super().to_arrow() def sum( - self, - skipna: bool = None, - dtype: Dtype = None, - min_count: int = 0, - *args, - **kwargs, + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0, ): result_col = self._process_for_reduction( skipna=skipna, min_count=min_count diff --git a/python/cudf/cudf/core/mixins/reductions.pyi b/python/cudf/cudf/core/mixins/reductions.pyi index 1ce2d8acdd6..600f30e9372 100644 --- a/python/cudf/cudf/core/mixins/reductions.pyi +++ b/python/cudf/cudf/core/mixins/reductions.pyi @@ -3,71 +3,68 @@ from __future__ import annotations class Reducible: - def sum(self, *args, **kwargs): + def sum(self): ... - def product(self, *args, **kwargs): + def product(self): ... - def min(self, *args, **kwargs): + def min(self): ... - def max(self, *args, **kwargs): + def max(self): ... - def count(self, *args, **kwargs): + def count(self): ... - def any(self, *args, **kwargs): + def any(self): ... - def all(self, *args, **kwargs): + def all(self): ... - def sum_of_squares(self, *args, **kwargs): + def sum_of_squares(self): ... - def mean(self, *args, **kwargs): + def mean(self): ... - def var(self, *args, **kwargs): + def var(self): ... - def std(self, *args, **kwargs): + def std(self): ... - def median(self, *args, **kwargs): + def median(self): ... - def argmax(self, *args, **kwargs): + def argmax(self): ... - def argmin(self, *args, **kwargs): + def argmin(self): ... - def nunique(self, *args, **kwargs): + def nunique(self): ... - def nth(self, *args, **kwargs): + def nth(self): ... - def collect(self, *args, **kwargs): + def collect(self): ... - def unique(self, *args, **kwargs): + def prod(self): ... - def prod(self, *args, **kwargs): + def idxmin(self): ... - def idxmin(self, *args, **kwargs): + def idxmax(self): ... - def idxmax(self, *args, **kwargs): + def first(self): ... - def first(self, *args, **kwargs): - ... - - def last(self, *args, **kwargs): + def last(self): ... From 6f6cc6fbc7b50eb08625b2fdc8f9b2530b618094 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 27 Jan 2022 12:27:38 -0800 Subject: [PATCH 22/39] Fix signatures for methods of numerical columns called by datetime or timedelta columns. --- python/cudf/cudf/core/column/datetime.py | 17 ++++++++----- .../cudf/cudf/core/column/numerical_base.py | 22 ++++++++++++----- python/cudf/cudf/core/column/timedelta.py | 24 ++++++++----------- 3 files changed, 37 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index e7442e1236b..d7da342bdcf 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -350,22 +350,27 @@ def _default_na_value(self) -> DatetimeLikeScalar: """Returns the default NA value for this column""" return np.datetime64("nat", self.time_unit) - def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: + def mean( + self, skipna=None, min_count: int = 0, dtype=np.float64 + ) -> ScalarLike: return pd.Timestamp( - self.as_numerical.mean(skipna=skipna, dtype=dtype), + self.as_numerical.mean( + skipna=skipna, min_count=min_count, dtype=dtype + ), unit=self.time_unit, ) def std( self, skipna: bool = None, - ddof: int = 1, + min_count: int = 0, dtype: Dtype = np.float64, - *args, - **kwargs, + ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype) + self.as_numerical.std( + skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + ) * _numpy_to_pandas_conversion[self.time_unit], ) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index a6a124e1ac4..f491aae9fab 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -103,14 +103,24 @@ def quantile( ) return result - def mean(self, dtype=np.float64, *args, **kwargs): - return self._reduce("mean", dtype=dtype, *args, **kwargs) + def mean(self, skipna: bool = None, min_count: int = 0, dtype=np.float64): + return self._reduce( + "mean", skipna=skipna, min_count=min_count, dtype=dtype + ) - def var(self, dtype=np.float64, *args, **kwargs): - return self._reduce("var", dtype=dtype, *args, **kwargs) + def var( + self, skipna: bool = None, min_count: int = 0, dtype=np.float64, ddof=1 + ): + return self._reduce( + "var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + ) - def std(self, dtype=np.float64, *args, **kwargs): - return self._reduce("std", dtype=dtype, *args, **kwargs) + def std( + self, skipna: bool = None, min_count: int = 0, dtype=np.float64, ddof=1 + ): + return self._reduce( + "std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + ) def median(self, skipna: bool = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 42c294cfd0c..55adeb20a2a 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -396,19 +396,14 @@ def quantile( return result.astype(self.dtype) def sum( - self, - skipna: bool = None, - dtype: Dtype = None, - min_count=0, - *args, - **kwargs, + self, skipna: bool = None, min_count: int = 0, dtype: Dtype = None, ) -> pd.Timedelta: return pd.Timedelta( - # TODO: mypy doesn't fully support monkey-patching, so the - # monkey-patching of reductions creates some serious limitations. - # We'll have to see how bad the problem is. + # Since sum isn't overriden in Numerical[Base]Column, mypy only + # sees the signature from Reducible (which doesn't have the extra + # parameters from ColumnBase._reduce) so we have to ignore this. self.as_numerical.sum( # type: ignore - skipna=skipna, dtype=dtype, min_count=min_count + skipna=skipna, min_count=min_count, dtype=dtype ), unit=self.time_unit, ) @@ -416,13 +411,14 @@ def sum( def std( self, skipna: bool = None, - ddof: int = 1, + min_count: int = 0, dtype: Dtype = np.float64, - *args, - **kwargs, + ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype), + self.as_numerical.std( + skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype + ), unit=self.time_unit, ) From ac0a2452d32a87c4c7bfd16a5bee572f573bc7cf Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 27 Jan 2022 12:36:39 -0800 Subject: [PATCH 23/39] Remove more args/kwargs. --- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/timedelta.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index c210d631aff..de06e62cbb1 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1000,7 +1000,7 @@ def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase": def data_array_view(self) -> cuda.devicearray.DeviceNDArray: return self.codes.data_array_view - def unique(self, *args, **kwargs) -> CategoricalColumn: + def unique(self) -> CategoricalColumn: codes = self.as_numerical.unique() return column.build_categorical_column( categories=self.categories, diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 55adeb20a2a..0e5feb347aa 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -369,15 +369,13 @@ def as_timedelta_column(self, dtype: Dtype, **kwargs) -> TimeDeltaColumn: return self return libcudf.unary.cast(self, dtype=dtype) - def mean( - self, skipna=None, dtype: Dtype = np.float64, *args, **kwargs - ) -> pd.Timedelta: + def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.mean(skipna=skipna, dtype=dtype), unit=self.time_unit, ) - def median(self, skipna: bool = None, *args, **kwargs) -> pd.Timedelta: + def median(self, skipna: bool = None) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.median(skipna=skipna), unit=self.time_unit ) From eb651bcd70ffc395fad981796563f090cf3f0d19 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 14 Feb 2022 11:38:42 -0500 Subject: [PATCH 24/39] Use a descriptor instead --- python/cudf/cudf/core/mixins/mixin_factory.py | 91 ++++++++++--------- 1 file changed, 49 insertions(+), 42 deletions(-) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index 5377f0f1002..67f49650d18 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -1,8 +1,17 @@ # Copyright (c) 2022, NVIDIA CORPORATION. +import functools import inspect +def _partialmethod(method, *args1, **kwargs1): + @functools.wraps(method) + def wrapper(self, *args2, **kwargs2): + return method(self, *args1, *args2, **kwargs1, **kwargs2) + + return wrapper + + def _create_delegating_mixin( mixin_name, docstring, category_name, operation_name, supported_operations ): @@ -93,49 +102,46 @@ def _create_delegating_mixin( >>> print(bar.foo1.__doc__) Perform the operation foo1, which returns 42. """ - docstring_attr = f"{category_name}_DOCSTRINGS" validity_attr = f"_VALID_{category_name}S" - class OperationMixin: - @classmethod - def _add_operation(cls, operation): - # This function creates operations on-the-fly. - - # Generate a signature without the `op` parameter. - base_operation = getattr(cls, operation_name) - base_params = inspect.signature(base_operation).parameters.values() - params = [p for p in base_params if p.name != "op"] - signature = inspect.Signature(parameters=params) - - # Generate the list of arguments forwarded to the base operation. - arglist = ", ".join( - [ - f"{key}={key}" - for key in signature.parameters - if key not in ("self", "args", "kwargs") - ] - ) - arglist = arglist + (", " if arglist else "") + "*args, **kwargs" - - # Apply the formatted docstring of the base operation to the - # operation being created here. - docstring = base_operation.__doc__.format( - cls=cls.__name__, - op=operation, - **getattr(cls, docstring_attr, {}).get(operation, {}), - ) + class Operation: + """ + """ + + def __init__(self): + self._operation_name = operation_name - namespace = {} - exec_str = f""" -def {operation}{str(signature)}: - '''{docstring} - ''' - return self.{operation_name}(op="{operation}", {arglist}) -""" - exec(exec_str, namespace) - setattr(cls, operation, namespace[operation]) + def __set_name__(self, owner, name): + self._name = name + def __get__(self, obj, owner=None): + base_operation = getattr(owner, self._operation_name) + retfunc = _partialmethod(base_operation, op=self._name) + + retfunc.__doc__ = base_operation.__doc__.format( + cls=owner.__name__, + op=self._name, + **getattr(owner, docstring_attr, {}).get(self._name, {}), + ) + retfunc.__name__ = self._name + retfunc_params = { + k: v + for k, v in inspect.signature( + base_operation + ).parameters.items() + if k != "op" + }.values() + retfunc.__signature__ = inspect.Signature(retfunc_params) + + setattr(owner, self._name, retfunc) + + if obj is None: + return getattr(owner, self._name) + else: + return getattr(obj, self._name) + + class OperationMixin: @classmethod def __init_subclass__(cls): # Only add the valid set of operations for a particular class. @@ -147,11 +153,12 @@ def __init_subclass__(cls): assert ( len(invalid_operations) == 0 ), f"Invalid requested operations: {invalid_operations}" + for operation in valid_operations: - # Check if the operation is already defined so that subclasses - # can override the method if desired. - if not hasattr(cls, operation): - cls._add_operation(operation) + if operation not in dir(cls): + op_attr = Operation() + setattr(cls, operation, op_attr) + op_attr.__set_name__(cls, operation) def _operation(self, op: str, *args, **kwargs): raise NotImplementedError From 7e2bc87c96b20c1f60c1b5ef9ea278f6776b6041 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 14 Feb 2022 11:39:49 -0500 Subject: [PATCH 25/39] Doc --- python/cudf/cudf/core/mixins/mixin_factory.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index 67f49650d18..0018985ae83 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -4,6 +4,8 @@ import inspect +# A simple `partialmethod` that allows setting attributes such as +# __doc__ on instances. def _partialmethod(method, *args1, **kwargs1): @functools.wraps(method) def wrapper(self, *args2, **kwargs2): From 53a623de4c5f06e48787ce44a77eb422755c7acf Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Mon, 14 Feb 2022 18:52:00 -0500 Subject: [PATCH 26/39] Update python/cudf/cudf/core/mixins/mixin_factory.py Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/core/mixins/mixin_factory.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index 0018985ae83..ac89584b7f5 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -127,13 +127,13 @@ def __get__(self, obj, owner=None): **getattr(owner, docstring_attr, {}).get(self._name, {}), ) retfunc.__name__ = self._name - retfunc_params = { - k: v + retfunc_params = [ + v for k, v in inspect.signature( base_operation ).parameters.items() if k != "op" - }.values() + ] retfunc.__signature__ = inspect.Signature(retfunc_params) setattr(owner, self._name, retfunc) From 9b78852fec19b0742384bdffe2e36890f3d8de32 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 14 Feb 2022 18:54:28 -0500 Subject: [PATCH 27/39] Merge __set_name__ into __init__ --- python/cudf/cudf/core/mixins/mixin_factory.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index 0018985ae83..920f273bdf0 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -108,17 +108,11 @@ def _create_delegating_mixin( validity_attr = f"_VALID_{category_name}S" class Operation: - """ - """ - - def __init__(self): - self._operation_name = operation_name - - def __set_name__(self, owner, name): + def __init__(self, name): self._name = name def __get__(self, obj, owner=None): - base_operation = getattr(owner, self._operation_name) + base_operation = getattr(owner, operation_name) retfunc = _partialmethod(base_operation, op=self._name) retfunc.__doc__ = base_operation.__doc__.format( @@ -158,9 +152,8 @@ def __init_subclass__(cls): for operation in valid_operations: if operation not in dir(cls): - op_attr = Operation() + op_attr = Operation(operation) setattr(cls, operation, op_attr) - op_attr.__set_name__(cls, operation) def _operation(self, op: str, *args, **kwargs): raise NotImplementedError From 1e580f71b6cdeac9a146433005b1cd7cf7d30405 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 15 Feb 2022 10:34:48 -0800 Subject: [PATCH 28/39] Replace functools with explicitly patching attributes (necessary for correctness). --- python/cudf/cudf/core/mixins/mixin_factory.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index e8b5e128dc9..94cbfbc9053 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -1,13 +1,11 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -import functools import inspect # A simple `partialmethod` that allows setting attributes such as # __doc__ on instances. def _partialmethod(method, *args1, **kwargs1): - @functools.wraps(method) def wrapper(self, *args2, **kwargs2): return method(self, *args1, *args2, **kwargs1, **kwargs2) @@ -121,6 +119,10 @@ def __get__(self, obj, owner=None): **getattr(owner, docstring_attr, {}).get(self._name, {}), ) retfunc.__name__ = self._name + retfunc.__qualname__ = ".".join([owner.__name__, self._name]) + retfunc.__module__ = base_operation.__module__ + retfunc.__annotations__ = base_operation.__annotations__.copy() + retfunc.__annotations__.pop("op") retfunc_params = [ v for k, v in inspect.signature( From f5ec1d672259e4737219dc1e13131b1be704d454 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 15 Feb 2022 10:36:23 -0800 Subject: [PATCH 29/39] Fix copyrights. --- ci/checks/copyright.py | 2 +- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/numerical_base.py | 2 +- python/cudf/cudf/core/column/timedelta.py | 2 +- python/cudf/cudf/core/window/rolling.py | 2 +- python/cudf/cudf/tests/test_stats.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py index d72fd95fea3..9c37a6ab041 100644 --- a/ci/checks/copyright.py +++ b/ci/checks/copyright.py @@ -230,4 +230,4 @@ def checkCopyright_main(): if __name__ == "__main__": import sys - sys.exit(checkCopyright_main()) \ No newline at end of file + sys.exit(checkCopyright_main()) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 86bf30170bd..44e5c6f62f1 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index f491aae9fab..db333328692 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. """Define an interface for columns that can perform numerical operations.""" from __future__ import annotations diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index f620135eecf..7a5f777e88e 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index ce57c2c1215..578a6c33c2c 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION +# Copyright (c) 2020-2022, NVIDIA CORPORATION import itertools diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index bed5fe2ca99..7bf339d6ab7 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. from concurrent.futures import ThreadPoolExecutor From b8884f18e28d0ce439a48f171ffdc7e6b29f27ae Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 15 Feb 2022 10:43:16 -0800 Subject: [PATCH 30/39] Address PR comments. --- python/cudf/cudf/core/mixins/mixin_factory.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index 94cbfbc9053..45375bd2daa 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -13,14 +13,18 @@ def wrapper(self, *args2, **kwargs2): def _create_delegating_mixin( - mixin_name, docstring, category_name, operation_name, supported_operations + mixin_name, + docstring, + category_name, + category_operation_name, + supported_operations, ): """Factory for mixins defining collections of delegated operations. This function generates mixins based on two common paradigms in cuDF: 1. libcudf groups many operations into categories using a common API. These - APIs usually accept an enum to delinate the specific operation to + APIs usually accept an enum to delineate the specific operation to perform, e.g. binary operations use the `binary_operator` enum when calling the `binary_operation` function. cuDF Python mimics this structure by having operations within a category delegate to a common @@ -53,7 +57,7 @@ def _create_delegating_mixin( - f'_{category_name}_DOCSTRINGS' - f'_VALID_{category_name}S' - f'_SUPPORTED_{category_name}S' - operation_name : str + category_operation_name : str The name given to the core function implementing this category of operations. The corresponding function is the entrypoint for child classes. @@ -110,7 +114,7 @@ def __init__(self, name): self._name = name def __get__(self, obj, owner=None): - base_operation = getattr(owner, operation_name) + base_operation = getattr(owner, category_operation_name) retfunc = _partialmethod(base_operation, op=self._name) retfunc.__doc__ = base_operation.__doc__.format( @@ -162,7 +166,7 @@ def _operation(self, op: str, *args, **kwargs): OperationMixin.__name__ = mixin_name OperationMixin.__doc__ = docstring - setattr(OperationMixin, operation_name, _operation) + setattr(OperationMixin, category_operation_name, _operation) # This attribute is set in case lookup is convenient at a later point, but # it is not strictly necessary since `supported_operations` is part of the # closure associated with the class's creation. From f88f78453a9e3a95f14e392d936ff28374c9bc69 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 15 Feb 2022 10:55:58 -0800 Subject: [PATCH 31/39] Move Operation out of the function to reduce local state. --- python/cudf/cudf/core/mixins/mixin_factory.py | 76 ++++++++++--------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index 45375bd2daa..3be31fe55b4 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -12,6 +12,42 @@ def wrapper(self, *args2, **kwargs2): return wrapper +class Operation: + def __init__(self, name, category_name, base_operation): + self._name = name + self._docstring_attr = f"{category_name}_DOCSTRINGS" + self._base_operation = base_operation + + def __get__(self, obj, owner=None): + retfunc = _partialmethod(self._base_operation, op=self._name) + + retfunc.__doc__ = self._base_operation.__doc__.format( + cls=owner.__name__, + op=self._name, + **getattr(owner, self._docstring_attr, {}).get(self._name, {}), + ) + retfunc.__name__ = self._name + retfunc.__qualname__ = ".".join([owner.__name__, self._name]) + retfunc.__module__ = self._base_operation.__module__ + retfunc.__annotations__ = self._base_operation.__annotations__.copy() + retfunc.__annotations__.pop("op") + retfunc_params = [ + v + for k, v in inspect.signature( + self._base_operation + ).parameters.items() + if k != "op" + ] + retfunc.__signature__ = inspect.Signature(retfunc_params) + + setattr(owner, self._name, retfunc) + + if obj is None: + return getattr(owner, self._name) + else: + return getattr(obj, self._name) + + def _create_delegating_mixin( mixin_name, docstring, @@ -106,43 +142,8 @@ def _create_delegating_mixin( >>> print(bar.foo1.__doc__) Perform the operation foo1, which returns 42. """ - docstring_attr = f"{category_name}_DOCSTRINGS" validity_attr = f"_VALID_{category_name}S" - class Operation: - def __init__(self, name): - self._name = name - - def __get__(self, obj, owner=None): - base_operation = getattr(owner, category_operation_name) - retfunc = _partialmethod(base_operation, op=self._name) - - retfunc.__doc__ = base_operation.__doc__.format( - cls=owner.__name__, - op=self._name, - **getattr(owner, docstring_attr, {}).get(self._name, {}), - ) - retfunc.__name__ = self._name - retfunc.__qualname__ = ".".join([owner.__name__, self._name]) - retfunc.__module__ = base_operation.__module__ - retfunc.__annotations__ = base_operation.__annotations__.copy() - retfunc.__annotations__.pop("op") - retfunc_params = [ - v - for k, v in inspect.signature( - base_operation - ).parameters.items() - if k != "op" - ] - retfunc.__signature__ = inspect.Signature(retfunc_params) - - setattr(owner, self._name, retfunc) - - if obj is None: - return getattr(owner, self._name) - else: - return getattr(obj, self._name) - class OperationMixin: @classmethod def __init_subclass__(cls): @@ -156,9 +157,12 @@ def __init_subclass__(cls): len(invalid_operations) == 0 ), f"Invalid requested operations: {invalid_operations}" + base_operation = getattr(cls, category_operation_name) for operation in valid_operations: if operation not in dir(cls): - op_attr = Operation(operation) + op_attr = Operation( + operation, category_name, base_operation + ) setattr(cls, operation, op_attr) def _operation(self, op: str, *args, **kwargs): From 502d07e0ddced05c6bd3fbb2d087a4d3a9a1d23a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 15 Feb 2022 11:21:34 -0800 Subject: [PATCH 32/39] Some cleanup. --- python/cudf/cudf/core/mixins/mixin_factory.py | 62 +++++++++++++++---- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index 3be31fe55b4..6281bce039a 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -13,18 +13,37 @@ def wrapper(self, *args2, **kwargs2): class Operation: - def __init__(self, name, category_name, base_operation): + """Descriptor used to define operations for delegating mixins. + + This class is designed to be assigned to the attributes (the delegating + methods) defined by the OperationMixin. This class will create the method + and mimic all the expected attributes for that method to appear as though + it was originally designed on the class. The use of the descriptor pattern + ensures that the method is only created the first time it is invoked, after + which all further calls use the callable generated on the first invocation. + + Parameters + ---------- + name : str + The name of the operation. + docstring_format_args : str + The attribute of the owning class from which to pull format parameters + for this operation's docstring. + base_operation : str + The underlying operation function to be invoked when operation `name` + is called on the owning class. + """ + + def __init__(self, name, docstring_format_args, base_operation): self._name = name - self._docstring_attr = f"{category_name}_DOCSTRINGS" + self._docstring_format_args = docstring_format_args self._base_operation = base_operation def __get__(self, obj, owner=None): retfunc = _partialmethod(self._base_operation, op=self._name) retfunc.__doc__ = self._base_operation.__doc__.format( - cls=owner.__name__, - op=self._name, - **getattr(owner, self._docstring_attr, {}).get(self._name, {}), + cls=owner.__name__, op=self._name, **self._docstring_format_args, ) retfunc.__name__ = self._name retfunc.__qualname__ = ".".join([owner.__name__, self._name]) @@ -88,8 +107,8 @@ def _create_delegating_mixin( The documentation string for the mixin class. category_name : str The category of operations for which a mixin is being created. This - name will be used to define the following attributes as shown in the - example below: + name will be used to define or access the following attributes as shown + in the example below: - f'_{category_name}_DOCSTRINGS' - f'_VALID_{category_name}S' - f'_SUPPORTED_{category_name}S' @@ -142,7 +161,14 @@ def _create_delegating_mixin( >>> print(bar.foo1.__doc__) Perform the operation foo1, which returns 42. """ + # The first two attributes may be defined on subclasses of the generated + # OperationMixin to indicate valid attributes and parameters to use when + # formatting docstrings. The supported_attr will be defined on the + # OperationMixin itself to indicate what operations its subclass may + # inherit from it. validity_attr = f"_VALID_{category_name}S" + docstring_attr = f"{category_name}_DOCSTRINGS" + supported_attr = f"_SUPPORTED_{category_name}S" class OperationMixin: @classmethod @@ -160,22 +186,32 @@ def __init_subclass__(cls): base_operation = getattr(cls, category_operation_name) for operation in valid_operations: if operation not in dir(cls): + docstring_format_args = getattr( + cls, docstring_attr, {} + ).get(operation, {}) op_attr = Operation( - operation, category_name, base_operation + operation, docstring_format_args, base_operation ) setattr(cls, operation, op_attr) + OperationMixin.__name__ = mixin_name + OperationMixin.__qualname__ = mixin_name + OperationMixin.__doc__ = docstring + def _operation(self, op: str, *args, **kwargs): raise NotImplementedError - OperationMixin.__name__ = mixin_name - OperationMixin.__doc__ = docstring + _operation.__name__ = category_operation_name + _operation.__qualname__ = ".".join([mixin_name, category_operation_name]) + _operation.__doc__ = ( + f"The core {category_name.lower()} function. Must be overridden by " + "subclasses, the default implementation raises a NotImplementedError." + ) + setattr(OperationMixin, category_operation_name, _operation) # This attribute is set in case lookup is convenient at a later point, but # it is not strictly necessary since `supported_operations` is part of the # closure associated with the class's creation. - setattr( - OperationMixin, f"_SUPPORTED_{category_name}S", supported_operations - ) + setattr(OperationMixin, supported_attr, supported_operations) return OperationMixin From 6ed61aaf56553aa4212b90225cfd241e8bf3ab6c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 15 Feb 2022 11:28:29 -0800 Subject: [PATCH 33/39] Revert changes to copyright file. --- ci/checks/copyright.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py index 9c37a6ab041..d72fd95fea3 100644 --- a/ci/checks/copyright.py +++ b/ci/checks/copyright.py @@ -230,4 +230,4 @@ def checkCopyright_main(): if __name__ == "__main__": import sys - sys.exit(checkCopyright_main()) + sys.exit(checkCopyright_main()) \ No newline at end of file From b7859dd44b59001609e47dc22ab53b63fa764c4c Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 16 Feb 2022 11:23:06 -0500 Subject: [PATCH 34/39] Better explain the need for a custom partialmethod --- python/cudf/cudf/core/mixins/mixin_factory.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index 6281bce039a..93a4d394966 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -3,8 +3,9 @@ import inspect -# A simple `partialmethod` that allows setting attributes such as -# __doc__ on instances. +# `functools.partialmethod` does not allow setting attributes such as +# __doc__ on the resulting method. So we use a simple alternative to +# it here: def _partialmethod(method, *args1, **kwargs1): def wrapper(self, *args2, **kwargs2): return method(self, *args1, *args2, **kwargs1, **kwargs2) From 764eb8106afe73251c2c82e0d961db1a4e455a26 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 17 Feb 2022 12:35:45 -0800 Subject: [PATCH 35/39] Address most PR comments. --- python/cudf/cudf/core/mixins/mixin_factory.py | 36 +++++++++++-------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index 93a4d394966..2e9dce0633c 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -43,14 +43,20 @@ def __init__(self, name, docstring_format_args, base_operation): def __get__(self, obj, owner=None): retfunc = _partialmethod(self._base_operation, op=self._name) - retfunc.__doc__ = self._base_operation.__doc__.format( - cls=owner.__name__, op=self._name, **self._docstring_format_args, - ) + # Required attributes that will exist. retfunc.__name__ = self._name retfunc.__qualname__ = ".".join([owner.__name__, self._name]) retfunc.__module__ = self._base_operation.__module__ + + if self._base_operation.__doc__ is not None: + retfunc.__doc__ = self._base_operation.__doc__.format( + cls=owner.__name__, + op=self._name, + **self._docstring_format_args, + ) + retfunc.__annotations__ = self._base_operation.__annotations__.copy() - retfunc.__annotations__.pop("op") + retfunc.__annotations__.pop("op", None) retfunc_params = [ v for k, v in inspect.signature( @@ -72,7 +78,7 @@ def _create_delegating_mixin( mixin_name, docstring, category_name, - category_operation_name, + base_operation_name, supported_operations, ): """Factory for mixins defining collections of delegated operations. @@ -111,9 +117,9 @@ def _create_delegating_mixin( name will be used to define or access the following attributes as shown in the example below: - f'_{category_name}_DOCSTRINGS' - - f'_VALID_{category_name}S' - - f'_SUPPORTED_{category_name}S' - category_operation_name : str + - f'_VALID_{category_name}S' # The subset of ops a subclass allows + - f'_SUPPORTED_{category_name}S' # The ops supported by the mixin + base_operation_name : str The name given to the core function implementing this category of operations. The corresponding function is the entrypoint for child classes. @@ -130,7 +136,7 @@ def _create_delegating_mixin( >>> # The above is equivalent to defining a class like so: ... class MyFoo: ... # The set of valid foo operations. - ... _VALID_FOOS = {"foo1", "foo2"} + ... _SUPPORTED_FOOS = {"foo1", "foo2"} ... ... # This is the method for child classes to override. Note that the ... # first parameter is always called "op". @@ -139,7 +145,7 @@ def _create_delegating_mixin( >>> # MyFoo can be used as follows. >>> class BarImplementsFoo(MyFoo): - ... _SUPPORTED_FOOS = ("foo1",) + ... _VALID_FOOS = ("foo1",) ... _FOO_DOCSTRINGS = {"ret": "42"} ... ... # This method's docstring will be formatted and used for all valid @@ -168,7 +174,7 @@ def _create_delegating_mixin( # OperationMixin itself to indicate what operations its subclass may # inherit from it. validity_attr = f"_VALID_{category_name}S" - docstring_attr = f"{category_name}_DOCSTRINGS" + docstring_attr = f"_{category_name}_DOCSTRINGS" supported_attr = f"_SUPPORTED_{category_name}S" class OperationMixin: @@ -184,7 +190,7 @@ def __init_subclass__(cls): len(invalid_operations) == 0 ), f"Invalid requested operations: {invalid_operations}" - base_operation = getattr(cls, category_operation_name) + base_operation = getattr(cls, base_operation_name) for operation in valid_operations: if operation not in dir(cls): docstring_format_args = getattr( @@ -202,14 +208,14 @@ def __init_subclass__(cls): def _operation(self, op: str, *args, **kwargs): raise NotImplementedError - _operation.__name__ = category_operation_name - _operation.__qualname__ = ".".join([mixin_name, category_operation_name]) + _operation.__name__ = base_operation_name + _operation.__qualname__ = ".".join([mixin_name, base_operation_name]) _operation.__doc__ = ( f"The core {category_name.lower()} function. Must be overridden by " "subclasses, the default implementation raises a NotImplementedError." ) - setattr(OperationMixin, category_operation_name, _operation) + setattr(OperationMixin, base_operation_name, _operation) # This attribute is set in case lookup is convenient at a later point, but # it is not strictly necessary since `supported_operations` is part of the # closure associated with the class's creation. From 1303bb7a773d4ef94016762cdf35c79430ac8f3c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 18 Feb 2022 12:08:22 -0800 Subject: [PATCH 36/39] Enable overriding of the base operation in child classes. --- python/cudf/cudf/core/mixins/mixin_factory.py | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index 2e9dce0633c..8c88e886a54 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -74,6 +74,30 @@ def __get__(self, obj, owner=None): return getattr(obj, self._name) +def _should_define_operation(cls, operation, base_operation_name): + if operation not in dir(cls): + return True + + # If the class doesn't override the base operation we stick to whatever + # parent implementation exists. + if base_operation_name not in cls.__dict__: + return False + + # There are two possibilities here: + # 1. A parent class manually overrode the operation. That override takes + # precedence even if the current class overrode the base_operation. + # 2. A parent class has an auto-generated operation. The current class must + # override it so that its base operation is used rather than the parent. + for base_cls in cls.__mro__: + # The first attribute in the MRO is the one that will be used. + if operation in base_cls.__dict__: + return isinstance(base_cls.__dict__[operation], Operation) + + # This line should be unreachable since we know the attribute exists + # somewhere in the MRO if the for loop was entered. + assert False, "Operation attribute not found in hierarchy." + + def _create_delegating_mixin( mixin_name, docstring, @@ -192,7 +216,9 @@ def __init_subclass__(cls): base_operation = getattr(cls, base_operation_name) for operation in valid_operations: - if operation not in dir(cls): + if _should_define_operation( + cls, operation, base_operation_name + ): docstring_format_args = getattr( cls, docstring_attr, {} ).get(operation, {}) From 9f7f207b0b0a93717b7fcb9876c0b57835136a5c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 23 Feb 2022 16:12:41 -0800 Subject: [PATCH 37/39] Properly support composition of OperationMixins. --- python/cudf/cudf/core/mixins/mixin_factory.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index 8c88e886a54..ae79941baf9 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -204,6 +204,12 @@ def _create_delegating_mixin( class OperationMixin: @classmethod def __init_subclass__(cls): + # Support composition of various OperationMixins. Note that since + # this __init_subclass__ is defined on mixins, it does not prohibit + # classes that inherit it from implementing this method as well as + # long as those implementations also include this super call. + super().__init_subclass__() + # Only add the valid set of operations for a particular class. valid_operations = set() for base_cls in cls.__mro__: From 8eb72debc3853ac10d44b3aa5ce4fa45bd4cc2fa Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 24 Feb 2022 17:36:19 -0800 Subject: [PATCH 38/39] Add a slightly extended comment. --- python/cudf/cudf/core/mixins/mixin_factory.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index ae79941baf9..b44a7d6a8c6 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -83,11 +83,16 @@ def _should_define_operation(cls, operation, base_operation_name): if base_operation_name not in cls.__dict__: return False - # There are two possibilities here: - # 1. A parent class manually overrode the operation. That override takes - # precedence even if the current class overrode the base_operation. - # 2. A parent class has an auto-generated operation. The current class must - # override it so that its base operation is used rather than the parent. + # At this point we know that the class has the operation defined but it + # also overrides the base operation. Since this function is called before + # the operation is defined on the current class, we know that it inherited + # the operation from a parent. We therefore have two possibilities: + # 1. A parent class manually defined the operation. That override takes + # precedence even if the current class defined the base operation. + # 2. A parent class has an auto-generated operation, i.e. it is of type + # Operation and was created by OperationMixin.__init_subclass__. The + # current class must override it so that its base operation is used + # rather than the parent's base operation. for base_cls in cls.__mro__: # The first attribute in the MRO is the one that will be used. if operation in base_cls.__dict__: From 74e7445bb9d4277bdf59f975967201eb71eb2cb3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 25 Feb 2022 10:23:21 -0800 Subject: [PATCH 39/39] Update docstring with concrete example. --- python/cudf/cudf/core/mixins/mixin_factory.py | 70 +++++++++---------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index b44a7d6a8c6..ecb18f61830 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -158,44 +158,42 @@ def _create_delegating_mixin( Examples -------- - >>> MyFoo = _create_delegating_mixin( - ... "MyFoo", "MyFoo's docstring", "FOO", "do_foo", {"foo1", "foo2"} - ... ) - - >>> # The above is equivalent to defining a class like so: - ... class MyFoo: - ... # The set of valid foo operations. - ... _SUPPORTED_FOOS = {"foo1", "foo2"} + >>> # The class below: + >>> class Person: + ... def _greet(self, op): + ... print(op) ... - ... # This is the method for child classes to override. Note that the - ... # first parameter is always called "op". - ... def do_foo(self, op, *args, **kwargs): - ... raise NotImplementedError - - >>> # MyFoo can be used as follows. - >>> class BarImplementsFoo(MyFoo): - ... _VALID_FOOS = ("foo1",) - ... _FOO_DOCSTRINGS = {"ret": "42"} + ... def hello(self): + ... self._greet("hello") + ... + ... def goodbye(self): + ... self._greet("goodbye") + >>> # can be rewritten using a delegating mixin as follows: + >>> Greeter = _create_delegating_mixin( + ... "Greeter", "", "GREETING", "_greet", {"hello", "goodbye", "hey"} + ... ) + >>> # The `hello` and `goodbye` methods will now be automatically generated + >>> # for the Person class below. + >>> class Person(Greeter): + ... _VALID_GREETINGS = {"hello", "goodbye"} ... - ... # This method's docstring will be formatted and used for all valid - ... # operations. The `op` key is always automatically filled in, while - ... # other keys are formatted using _FOO_DOCSTRINGS. - ... def do_foo(self, op, *args, **kwargs): - ... '''Perform the operation {op}, which returns {ret}.''' - ... return 42 - - >>> bar = BarImplementsFoo() - >>> print(bar.foo1()) - 42 - - >>> # This will raise an AttributeError because foo2 is not supported by - >>> # the BarImplementsFoo subclass of MyFoo. - >>> # print(bar.foo2()) - - >>> # The docstring is formatted with the operation name as well as any - >>> # additional keys provided via the _FOO_DOCSTRINGS parameter. - >>> print(bar.foo1.__doc__) - Perform the operation foo1, which returns 42. + ... def _greet(self, op: str): + ... '''Say {op}.''' + ... print(op) + >>> mom = Person() + >>> mom.hello() + hello + >>> # The Greeter class could also enable the `hey` method, but Person did + >>> # not include it in the _VALID_GREETINGS set so it will not exist. + >>> mom.hey() + Traceback (most recent call last): + ... + AttributeError: 'Person' object has no attribute 'hey' + >>> # The docstrings for each method are generated by formatting the _greet + >>> # docstring with the operation name as well as any additional keys + >>> # provided via the _GREETING_DOCSTRINGS parameter. + >>> print(mom.hello.__doc__) + Say hello. """ # The first two attributes may be defined on subclasses of the generated # OperationMixin to indicate valid attributes and parameters to use when