Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement a mixin for reductions #9925

Merged
merged 46 commits into from
Feb 25, 2022
Merged
Show file tree
Hide file tree
Changes from 40 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
2c074b1
Move reductions from NumericalBase into Reducible mixin.
vyasr Jul 21, 2021
970b45d
Override dtype for numerical type methods that require it.
vyasr Jul 21, 2021
0a774f6
Generate reductions programmatically.
vyasr Nov 19, 2021
3435e90
Make ColumnBase Reducible.
vyasr Nov 19, 2021
77c7767
Use Reducible in GroupBy.
vyasr Nov 19, 2021
4d40e90
Format _reduce to generate docstrings.
vyasr Nov 19, 2021
3177217
Make Rolling Reducible.
vyasr Nov 19, 2021
10eeab2
Add full support for new reduction functions to match docstrings and …
vyasr Nov 22, 2021
34cd0b9
Initial version of pyi file, all reductions working except for quanti…
vyasr Jan 14, 2022
fce53b8
Fix quantile issues.
vyasr Jan 18, 2022
9d1fb4f
Remove size reduction.
vyasr Jan 18, 2022
4eaecaa
Write a factory for delegating mixins.
vyasr Jan 19, 2022
c6c2e80
Move reducible to new subpackage.
vyasr Jan 19, 2022
c43a1a6
Move factory to a separate module.
vyasr Jan 20, 2022
01d1574
Some final cleanup and polishing of documentation.
vyasr Jan 20, 2022
ba6e98f
Fix style.
vyasr Jan 24, 2022
8fbcd73
Some minor simplifications and clarifications.
vyasr Jan 24, 2022
d2829cf
Fix error messages and remove unnecessary comparison in tests.
vyasr Jan 24, 2022
a63e0c3
Remove quantile from reductions since it doesn't strictly follow the …
vyasr Jan 24, 2022
2150f60
Document missing parameters and remove unnecessary Reducible inherita…
vyasr Jan 27, 2022
c9395cb
Remove all superfluous args/kwargs parameters.
vyasr Jan 27, 2022
6f6cc6f
Fix signatures for methods of numerical columns called by datetime or…
vyasr Jan 27, 2022
ac0a245
Remove more args/kwargs.
vyasr Jan 27, 2022
671ed05
Merge branch 'branch-22.04' into refactor/reductions
vyasr Jan 29, 2022
9d95b25
Merge branch 'branch-22.04' into refactor/reductions
vyasr Feb 8, 2022
eb651bc
Use a descriptor instead
shwina Feb 14, 2022
7e2bc87
Doc
shwina Feb 14, 2022
53a623d
Update python/cudf/cudf/core/mixins/mixin_factory.py
shwina Feb 14, 2022
9b78852
Merge __set_name__ into __init__
shwina Feb 14, 2022
a2b1a36
Merge branch 'refactor-reductions-descriptor' of github.com:shwina/cu…
shwina Feb 14, 2022
9b5c680
Merge pull request #2 from shwina/refactor-reductions-descriptor
vyasr Feb 15, 2022
2cd96d4
Merge remote-tracking branch 'origin/branch-22.04' into refactor/redu…
vyasr Feb 15, 2022
e7d181e
Merge branch 'refactor/reductions' of github.com:vyasr/cudf into refa…
vyasr Feb 15, 2022
1e580f7
Replace functools with explicitly patching attributes (necessary for …
vyasr Feb 15, 2022
f5ec1d6
Fix copyrights.
vyasr Feb 15, 2022
b8884f1
Address PR comments.
vyasr Feb 15, 2022
f88f784
Move Operation out of the function to reduce local state.
vyasr Feb 15, 2022
502d07e
Some cleanup.
vyasr Feb 15, 2022
6ed61aa
Revert changes to copyright file.
vyasr Feb 15, 2022
b7859dd
Better explain the need for a custom partialmethod
shwina Feb 16, 2022
764eb81
Address most PR comments.
vyasr Feb 17, 2022
1303bb7
Enable overriding of the base operation in child classes.
vyasr Feb 18, 2022
9f7f207
Properly support composition of OperationMixins.
vyasr Feb 24, 2022
8eb72de
Add a slightly extended comment.
vyasr Feb 25, 2022
a83d5ba
Merge remote-tracking branch 'origin/branch-22.04' into refactor/redu…
vyasr Feb 25, 2022
74e7445
Update docstring with concrete example.
vyasr Feb 25, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 25 additions & 53 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
ListDtype,
StructDtype,
)
from cudf.core.mixins import Reducible
from cudf.utils import utils
from cudf.utils.dtypes import (
cudf_dtype_from_pa_type,
Expand All @@ -82,7 +83,14 @@
T = TypeVar("T", bound="ColumnBase")


class ColumnBase(Column, Serializable, NotIterable):
class ColumnBase(Column, Serializable, Reducible, NotIterable):
_VALID_REDUCTIONS = {
vyasr marked this conversation as resolved.
Show resolved Hide resolved
"any",
"all",
"max",
"min",
}

def as_frame(self) -> "cudf.core.frame.Frame":
"""
Converts a Column to Frame
Expand Down Expand Up @@ -622,16 +630,10 @@ def append(self, other: ColumnBase) -> ColumnBase:
return concat_columns([self, as_column(other)])

def quantile(
self,
q: Union[float, Sequence[float]],
interpolation: builtins.str,
exact: bool,
self, q: Union[float, Sequence[float]], interpolation: str, exact: bool
) -> ColumnBase:
raise TypeError(f"cannot perform quantile with type {self.dtype}")

def median(self, skipna: bool = None) -> ScalarLike:
raise TypeError(f"cannot perform median with type {self.dtype}")

def take(
self: T, indices: ColumnBase, nullify: bool = False, check_bounds=True
) -> T:
Expand Down Expand Up @@ -1110,53 +1112,23 @@ def _minmax(self, skipna: bool = None):
return libcudf.reduce.minmax(result_col)
return result_col

def min(self, skipna: bool = None, dtype: Dtype = None):
result_col = self._process_for_reduction(skipna=skipna)
if isinstance(result_col, ColumnBase):
return libcudf.reduce.reduce("min", result_col, dtype=dtype)
return result_col

def max(self, skipna: bool = None, dtype: Dtype = None):
result_col = self._process_for_reduction(skipna=skipna)
if isinstance(result_col, ColumnBase):
return libcudf.reduce.reduce("max", result_col, dtype=dtype)
return result_col

def sum(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
):
raise TypeError(f"cannot perform sum with type {self.dtype}")

def product(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
):
raise TypeError(f"cannot perform product with type {self.dtype}")

def mean(self, skipna: bool = None, dtype: Dtype = None):
raise TypeError(f"cannot perform mean with type {self.dtype}")

def std(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64):
raise TypeError(f"cannot perform std with type {self.dtype}")

def var(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64):
raise TypeError(f"cannot perform var with type {self.dtype}")

def kurtosis(self, skipna: bool = None):
raise TypeError(f"cannot perform kurtosis with type {self.dtype}")
def _reduce(
self, op: str, skipna: bool = None, min_count: int = 0, *args, **kwargs
) -> ScalarLike:
"""Compute {op} of column values.

def skew(self, skipna: bool = None):
raise TypeError(f"cannot perform skew with type {self.dtype}")

def cov(self, other: ColumnBase):
raise TypeError(
f"cannot perform covarience with types {self.dtype}, "
f"{other.dtype}"
)

def corr(self, other: ColumnBase):
raise TypeError(
f"cannot perform corr with types {self.dtype}, {other.dtype}"
skipna : bool
Whether or not na values must be skipped.
min_count : int, default 0
The minimum number of entries for the reduction, otherwise the
reduction returns NaN.
"""
preprocessed = self._process_for_reduction(
skipna=skipna, min_count=min_count
)
if isinstance(preprocessed, ColumnBase):
return libcudf.reduce.reduce(op, preprocessed, **kwargs)
return preprocessed

@property
def contains_na_entries(self) -> bool:
Expand Down
20 changes: 15 additions & 5 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
# Copyright (c) 2019-2022, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -346,17 +346,27 @@ def as_string_column(
column.column_empty(0, dtype="object", masked=False),
)

def mean(self, skipna=None, dtype=np.float64) -> ScalarLike:
def mean(
self, skipna=None, min_count: int = 0, dtype=np.float64
) -> ScalarLike:
return pd.Timestamp(
self.as_numerical.mean(skipna=skipna, dtype=dtype),
self.as_numerical.mean(
skipna=skipna, min_count=min_count, dtype=dtype
),
unit=self.time_unit,
)

def std(
self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64
self,
skipna: bool = None,
min_count: int = 0,
dtype: Dtype = np.float64,
ddof: int = 1,
) -> pd.Timedelta:
return pd.Timedelta(
self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype)
self.as_numerical.std(
skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
)
* _numpy_to_pandas_conversion[self.time_unit],
)

Expand Down
88 changes: 31 additions & 57 deletions python/cudf/cudf/core/column/numerical_base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
# Copyright (c) 2018-2022, NVIDIA CORPORATION.
"""Define an interface for columns that can perform numerical operations."""

from __future__ import annotations
Expand All @@ -10,7 +10,7 @@

import cudf
from cudf import _lib as libcudf
from cudf._typing import Dtype, ScalarLike
from cudf._typing import ScalarLike
from cudf.core.column import ColumnBase


Expand All @@ -23,59 +23,14 @@ class NumericalBaseColumn(ColumnBase):
point, should be encoded here.
"""

def reduce(
self, op: str, skipna: bool = None, min_count: int = 0, **kwargs
) -> ScalarLike:
"""Perform a reduction operation.

op : str
The operation to perform.
skipna : bool
Whether or not na values must be
"""
preprocessed = self._process_for_reduction(
skipna=skipna, min_count=min_count
)
if isinstance(preprocessed, ColumnBase):
return libcudf.reduce.reduce(op, preprocessed, **kwargs)
else:
return preprocessed

def sum(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
) -> ScalarLike:
return self.reduce(
"sum", skipna=skipna, dtype=dtype, min_count=min_count
)

def product(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
) -> ScalarLike:
return self.reduce(
"product", skipna=skipna, dtype=dtype, min_count=min_count
)

def mean(
self, skipna: bool = None, dtype: Dtype = np.float64
) -> ScalarLike:
return self.reduce("mean", skipna=skipna, dtype=dtype)

def var(
self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64
) -> ScalarLike:
return self.reduce("var", skipna=skipna, dtype=dtype, ddof=ddof)

def std(
self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64
) -> ScalarLike:
return self.reduce("std", skipna=skipna, dtype=dtype, ddof=ddof)

def sum_of_squares(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
) -> ScalarLike:
return self.reduce(
"sum_of_squares", skipna=skipna, dtype=dtype, min_count=min_count
)
_VALID_REDUCTIONS = {
"sum",
"product",
"sum_of_squares",
"mean",
"var",
"std",
}

def _can_return_nan(self, skipna: bool = None) -> bool:
return not skipna and self.has_nulls()
Expand Down Expand Up @@ -148,6 +103,25 @@ def quantile(
)
return result

def mean(self, skipna: bool = None, min_count: int = 0, dtype=np.float64):
return self._reduce(
"mean", skipna=skipna, min_count=min_count, dtype=dtype
)

def var(
self, skipna: bool = None, min_count: int = 0, dtype=np.float64, ddof=1
):
return self._reduce(
"var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
)

def std(
self, skipna: bool = None, min_count: int = 0, dtype=np.float64, ddof=1
):
return self._reduce(
"std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
)

def median(self, skipna: bool = None) -> NumericalBaseColumn:
skipna = True if skipna is None else skipna

Expand All @@ -171,7 +145,7 @@ def _numeric_quantile(
self, quant, interpolation, sorted_indices, exact
)

def cov(self, other: ColumnBase) -> float:
def cov(self, other: NumericalBaseColumn) -> float:
if (
len(self) == 0
or len(other) == 0
Expand All @@ -183,7 +157,7 @@ def cov(self, other: ColumnBase) -> float:
cov_sample = result.sum() / (len(self) - 1)
return cov_sample

def corr(self, other: ColumnBase) -> float:
def corr(self, other: NumericalBaseColumn) -> float:
if len(self) == 0 or len(other) == 0:
return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5107,7 +5107,7 @@ def to_arrow(self) -> pa.Array:
return super().to_arrow()

def sum(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0,
):
result_col = self._process_for_reduction(
skipna=skipna, min_count=min_count
Expand Down
21 changes: 15 additions & 6 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -385,20 +385,29 @@ def quantile(
return result.astype(self.dtype)

def sum(
self, skipna: bool = None, dtype: Dtype = None, min_count=0
self, skipna: bool = None, min_count: int = 0, dtype: Dtype = None,
) -> pd.Timedelta:
return pd.Timedelta(
self.as_numerical.sum(
skipna=skipna, dtype=dtype, min_count=min_count
# Since sum isn't overridden in Numerical[Base]Column, mypy only
# sees the signature from Reducible (which doesn't have the extra
# parameters from ColumnBase._reduce) so we have to ignore this.
self.as_numerical.sum( # type: ignore
skipna=skipna, min_count=min_count, dtype=dtype
),
unit=self.time_unit,
)

def std(
self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64
self,
skipna: bool = None,
min_count: int = 0,
dtype: Dtype = np.float64,
ddof: int = 1,
) -> pd.Timedelta:
return pd.Timedelta(
self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype),
self.as_numerical.std(
skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype
),
unit=self.time_unit,
)

Expand Down
11 changes: 7 additions & 4 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5326,10 +5326,13 @@ def _reduce(
axis = self._get_axis_from_axis_arg(axis)

if axis == 0:
result = [
getattr(self._data[col], op)(**kwargs)
for col in self._data.names
]
try:
result = [
getattr(self._data[col], op)(**kwargs)
for col in self._data.names
]
except AttributeError:
raise TypeError(f"cannot perform {op} with type {self.dtype}")

return Series._from_data(
{None: result}, as_index(self._data.names)
Expand Down
Loading