Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix any, all reduction behavior for axis=None and warn for other reductions #13831

Merged
merged 15 commits into from
Aug 9, 2023
Merged
46 changes: 37 additions & 9 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1333,6 +1333,11 @@ def __array_function__(self, func, types, args, kwargs):
return NotImplemented

try:
if func.__name__ in {"any", "all"}:
# NumPy default for `axis` is
# different from `cudf`/`pandas`
# hence need this special handling.
kwargs.setdefault("axis", None)
if cudf_func := getattr(self.__class__, func.__name__, None):
out = cudf_func(*args, **kwargs)
# The dot product of two DataFrames returns an array in pandas.
Expand Down Expand Up @@ -2557,7 +2562,7 @@ def reindex(
"Cannot specify both 'axis' and any of 'index' or 'columns'."
)

axis = self._get_axis_from_axis_arg(axis)
axis = 0 if axis is None else self._get_axis_from_axis_arg(axis)
if axis == 0:
if index is None:
index = labels
Expand Down Expand Up @@ -5798,7 +5803,6 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs):
_SUPPORT_AXIS_LOOKUP = {
0: 0,
1: 1,
None: 0,
"index": 0,
"columns": 1,
}
Expand Down Expand Up @@ -5826,9 +5830,26 @@ def _reduce(
if source.empty:
return Series(index=cudf.Index([], dtype="str"))

axis = source._get_axis_from_axis_arg(axis)
if axis is None:
if op in {"any", "all"}:
axis = 2
else:
# Do not remove until pandas 2.0 support is added.
warnings.warn(
f"In a future version, {type(self).__name__}"
f".{op}(axis=None) will return a scalar {op} over "
"the entire DataFrame. To retain the old behavior, "
f"use '{type(self).__name__}.{op}(axis=0)' or "
f"just '{type(self)}.{op}()'",
FutureWarning,
)
axis = 0
elif axis is no_default:
axis = 0
else:
axis = source._get_axis_from_axis_arg(axis)

if axis == 0:
if axis in {0, 2}:
try:
result = [
getattr(source._data[col], op)(**kwargs)
Expand Down Expand Up @@ -5867,7 +5888,10 @@ def _reduce(
)
source = self._get_columns_by_label(numeric_cols)
if source.empty:
return Series(index=cudf.Index([], dtype="str"))
if axis == 2:
return getattr(as_column([]), op)(**kwargs)
else:
return Series(index=cudf.Index([], dtype="str"))
try:
result = [
getattr(source._data[col], op)(**kwargs)
Expand All @@ -5879,12 +5903,16 @@ def _reduce(
)
else:
raise

return Series._from_data(
{None: result}, as_index(source._data.names)
)
if axis == 2:
return getattr(as_column(result), op)(**kwargs)
else:
return Series._from_data(
{None: result}, as_index(source._data.names)
)
elif axis == 1:
return source._apply_cupy_method_axis_1(op, **kwargs)
else:
raise ValueError(f"Incorrect value of {axis=} received for {op}")
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved

@_cudf_nvtx_annotate
def _scan(
Expand Down
64 changes: 49 additions & 15 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import cudf
from cudf import _lib as libcudf
from cudf._typing import Dtype
from cudf.api.extensions import no_default
from cudf.api.types import is_bool_dtype, is_dtype_equal, is_scalar
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column import (
Expand Down Expand Up @@ -1885,7 +1886,7 @@ def _reduce(self, *args, **kwargs):
@_cudf_nvtx_annotate
def min(
self,
axis=None,
axis=no_default,
skipna=True,
level=None,
numeric_only=None,
Expand Down Expand Up @@ -1936,7 +1937,7 @@ def min(
@_cudf_nvtx_annotate
def max(
self,
axis=None,
axis=no_default,
skipna=True,
level=None,
numeric_only=None,
Expand Down Expand Up @@ -1987,7 +1988,7 @@ def max(
@_cudf_nvtx_annotate
def sum(
self,
axis=None,
axis=no_default,
skipna=True,
dtype=None,
level=None,
Expand Down Expand Up @@ -2045,7 +2046,7 @@ def sum(
@_cudf_nvtx_annotate
def product(
self,
axis=None,
axis=no_default,
skipna=True,
dtype=None,
level=None,
Expand Down Expand Up @@ -2089,11 +2090,11 @@ def product(
b 5040
dtype: int64
"""
axis = self._get_axis_from_axis_arg(axis)

return self._reduce(
# cuDF columns use "product" as the op name, but cupy uses "prod"
# and we need cupy if axis == 1.
"product" if axis == 0 else "prod",
"prod" if axis in {1, "columns"} else "product",
axis=axis,
skipna=skipna,
dtype=dtype,
Expand All @@ -2108,7 +2109,12 @@ def product(

@_cudf_nvtx_annotate
def mean(
self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs
self,
axis=no_default,
skipna=True,
level=None,
numeric_only=None,
**kwargs,
):
"""
Return the mean of the values for the requested axis.
Expand Down Expand Up @@ -2154,7 +2160,7 @@ def mean(
@_cudf_nvtx_annotate
def std(
self,
axis=None,
axis=no_default,
skipna=True,
level=None,
ddof=1,
Expand Down Expand Up @@ -2210,7 +2216,7 @@ def std(
@_cudf_nvtx_annotate
def var(
self,
axis=None,
axis=no_default,
skipna=True,
level=None,
ddof=1,
Expand Down Expand Up @@ -2264,7 +2270,12 @@ def var(

@_cudf_nvtx_annotate
def kurtosis(
self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs
self,
axis=no_default,
skipna=True,
level=None,
numeric_only=None,
**kwargs,
):
"""
Return Fisher's unbiased kurtosis of a sample.
Expand Down Expand Up @@ -2305,7 +2316,7 @@ def kurtosis(
b -1.2
dtype: float64
"""
if axis not in (0, "index", None):
if axis not in (0, "index", None, no_default):
raise NotImplementedError("Only axis=0 is currently supported.")

return self._reduce(
Expand All @@ -2322,7 +2333,12 @@ def kurtosis(

@_cudf_nvtx_annotate
def skew(
self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs
self,
axis=no_default,
skipna=True,
level=None,
numeric_only=None,
**kwargs,
):
"""
Return unbiased Fisher-Pearson skew of a sample.
Expand Down Expand Up @@ -2366,7 +2382,7 @@ def skew(
b -0.37037
dtype: float64
"""
if axis not in (0, "index", None):
if axis not in (0, "index", None, no_default):
raise NotImplementedError("Only axis=0 is currently supported.")

return self._reduce(
Expand All @@ -2385,6 +2401,15 @@ def all(self, axis=0, skipna=True, level=None, **kwargs):
Parameters
----------
axis : {0 or 'index', 1 or 'columns', None}, default 0
Indicate which axis or axes should be reduced. For `Series`
this parameter is unused and defaults to `0`.
- 0 or 'index' : reduce the index, return a Series
whose index is the original column labels.
- 1 or 'columns' : reduce the columns, return a Series
whose index is the original index.
- None : reduce all axes, return a scalar.
skipna: bool, default True
Exclude NA/null values. If the entire row/column is NA and
skipna is True, then the result will be True, as for an
Expand All @@ -2398,7 +2423,7 @@ def all(self, axis=0, skipna=True, level=None, **kwargs):
Notes
-----
Parameters currently not supported are `axis`, `bool_only`, `level`.
Parameters currently not supported are `bool_only`, `level`.
Examples
--------
Expand All @@ -2424,6 +2449,15 @@ def any(self, axis=0, skipna=True, level=None, **kwargs):
Parameters
----------
axis : {0 or 'index', 1 or 'columns', None}, default 0
Indicate which axis or axes should be reduced. For `Series`
this parameter is unused and defaults to `0`.
- 0 or 'index' : reduce the index, return a Series
whose index is the original column labels.
- 1 or 'columns' : reduce the columns, return a Series
whose index is the original index.
- None : reduce all axes, return a scalar.
skipna: bool, default True
Exclude NA/null values. If the entire row/column is NA and
skipna is True, then the result will be False, as for an
Expand All @@ -2437,7 +2471,7 @@ def any(self, axis=0, skipna=True, level=None, **kwargs):
Notes
-----
Parameters currently not supported are `axis`, `bool_only`, `level`.
Parameters currently not supported are `bool_only`, `level`.
Examples
--------
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3421,7 +3421,7 @@ def sample(
0 1 3
1 2 4
"""
axis = self._get_axis_from_axis_arg(axis)
axis = 0 if axis is None else self._get_axis_from_axis_arg(axis)
size = self.shape[axis]

# Compute `n` from parameter `frac`.
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/single_column_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import cudf
from cudf._typing import Dtype, NotImplementedType, ScalarLike
from cudf.api.extensions import no_default
from cudf.api.types import (
_is_scalar_or_zero_d_array,
is_bool_dtype,
Expand All @@ -30,20 +31,19 @@ class SingleColumnFrame(Frame, NotIterable):

_SUPPORT_AXIS_LOOKUP = {
0: 0,
None: 0,
"index": 0,
}

@_cudf_nvtx_annotate
def _reduce(
self,
op,
axis=None,
axis=no_default,
level=None,
numeric_only=None,
**kwargs,
):
if axis not in (None, 0):
if axis not in (None, 0, no_default):
raise NotImplementedError("axis parameter is not implemented yet")

if level is not None:
Expand Down
5 changes: 5 additions & 0 deletions python/cudf/cudf/tests/test_array_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ def test_array_func_cudf_series(np_ar, func):
lambda x: np.sum(x, axis=0),
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
lambda x: np.var(x, ddof=1),
lambda x: np.dot(x, x.transpose()),
lambda x: np.all(x),
lambda x: np.any(x),
lambda x: np.product(x),
lambda x: np.product(x, axis=0),
lambda x: np.product(x, axis=1),
],
)
def test_array_func_cudf_dataframe(pd_df, func):
Expand Down
60 changes: 59 additions & 1 deletion python/cudf/cudf/tests/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@
from cudf import Series
from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
from cudf.testing import _utils as utils
from cudf.testing._utils import NUMERIC_TYPES, assert_eq, gen_rand
from cudf.testing._utils import (
NUMERIC_TYPES,
assert_eq,
expect_warning_if,
gen_rand,
)

params_dtype = NUMERIC_TYPES

Expand Down Expand Up @@ -306,3 +311,56 @@ def test_categorical_reductions(op):
psr = gsr.to_pandas()

utils.assert_exceptions_equal(getattr(psr, op), getattr(gsr, op))


@pytest.mark.parametrize(
"data",
[
{"a": [1, 2, 3], "b": [10, 11, 12]},
{"a": [1, 0, 3], "b": [10, 11, 12]},
{"a": [1, 2, 3], "b": [10, 11, None]},
{
"a": [],
},
{},
],
)
@pytest.mark.parametrize("op", ["all", "any"])
def test_any_all_axis_none(data, op):
gdf = cudf.DataFrame(data)
pdf = gdf.to_pandas()

expected = getattr(pdf, op)(axis=None)
actual = getattr(gdf, op)(axis=None)

assert expected == actual


@pytest.mark.parametrize(
"op",
[
"sum",
"product",
"std",
"var",
"kurt",
"kurtosis",
"skew",
"min",
"max",
"mean",
"median",
],
)
def test_reductions_axis_none_warning(op):
df = cudf.DataFrame({"a": [1, 2, 3], "b": [10, 2, 3]})
pdf = df.to_pandas()
with pytest.warns(FutureWarning):
actual = getattr(df, op)(axis=None)
with expect_warning_if(
op in {"kurt", "kurtosis", "skew", "min", "max", "mean", "median"},
FutureWarning,
):
expected = getattr(pdf, op)(axis=None)

assert_eq(expected, actual, check_dtype=False)