From edb25a84aaafa0d65d36ebb94113393d4b6474fb Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 8 Aug 2023 20:52:11 -0500 Subject: [PATCH] Fix `any`, `all` reduction behavior for `axis=None` and warn for other reductions (#13831) Fixes: #13827 This PR: - [x] Fixes `axis=None` behavior for `any` & `all` reductions. - [x] Introduces `FutureWarning` for upcoming change in behavior for the rest of the reductions since some of the reductions are only updated in pandas-2.0 and the rest would be updated in pandas-3.0. - [x] Fixed numpy array function inconsistency because of mismatching default for `axis` parameter. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13831 --- python/cudf/cudf/core/dataframe.py | 48 +++++++++++--- python/cudf/cudf/core/frame.py | 64 ++++++++++++++----- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/core/single_column_frame.py | 6 +- python/cudf/cudf/tests/test_array_function.py | 5 ++ python/cudf/cudf/tests/test_reductions.py | 60 ++++++++++++++++- 6 files changed, 156 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0298dd103f5..dabef4adde0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1333,6 +1333,11 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented try: + if func.__name__ in {"any", "all"}: + # NumPy default for `axis` is + # different from `cudf`/`pandas` + # hence need this special handling. + kwargs.setdefault("axis", None) if cudf_func := getattr(self.__class__, func.__name__, None): out = cudf_func(*args, **kwargs) # The dot product of two DataFrames returns an array in pandas. @@ -2557,7 +2562,7 @@ def reindex( "Cannot specify both 'axis' and any of 'index' or 'columns'." ) - axis = self._get_axis_from_axis_arg(axis) + axis = 0 if axis is None else self._get_axis_from_axis_arg(axis) if axis == 0: if index is None: index = labels @@ -5798,7 +5803,6 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): _SUPPORT_AXIS_LOOKUP = { 0: 0, 1: 1, - None: 0, "index": 0, "columns": 1, } @@ -5826,9 +5830,26 @@ def _reduce( if source.empty: return Series(index=cudf.Index([], dtype="str")) - axis = source._get_axis_from_axis_arg(axis) + if axis is None: + if op in {"any", "all"}: + axis = 2 + else: + # Do not remove until pandas 2.0 support is added. + warnings.warn( + f"In a future version, {type(self).__name__}" + f".{op}(axis=None) will return a scalar {op} over " + "the entire DataFrame. To retain the old behavior, " + f"use '{type(self).__name__}.{op}(axis=0)' or " + f"just '{type(self)}.{op}()'", + FutureWarning, + ) + axis = 0 + elif axis is no_default: + axis = 0 + else: + axis = source._get_axis_from_axis_arg(axis) - if axis == 0: + if axis in {0, 2}: try: result = [ getattr(source._data[col], op)(**kwargs) @@ -5867,7 +5888,10 @@ def _reduce( ) source = self._get_columns_by_label(numeric_cols) if source.empty: - return Series(index=cudf.Index([], dtype="str")) + if axis == 2: + return getattr(as_column([]), op)(**kwargs) + else: + return Series(index=cudf.Index([], dtype="str")) try: result = [ getattr(source._data[col], op)(**kwargs) @@ -5879,12 +5903,16 @@ def _reduce( ) else: raise - - return Series._from_data( - {None: result}, as_index(source._data.names) - ) + if axis == 2: + return getattr(as_column(result), op)(**kwargs) + else: + return Series._from_data( + {None: result}, as_index(source._data.names) + ) elif axis == 1: return source._apply_cupy_method_axis_1(op, **kwargs) + else: + raise ValueError(f"Invalid value of {axis=} received for {op}") @_cudf_nvtx_annotate def _scan( @@ -5894,6 +5922,8 @@ def _scan( *args, **kwargs, ): + if axis is None: + axis = 0 axis = self._get_axis_from_axis_arg(axis) if axis == 0: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 466a704c56e..69757fe900d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -31,6 +31,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import Dtype +from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype, is_dtype_equal, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( @@ -1885,7 +1886,7 @@ def _reduce(self, *args, **kwargs): @_cudf_nvtx_annotate def min( self, - axis=None, + axis=no_default, skipna=True, level=None, numeric_only=None, @@ -1936,7 +1937,7 @@ def min( @_cudf_nvtx_annotate def max( self, - axis=None, + axis=no_default, skipna=True, level=None, numeric_only=None, @@ -1987,7 +1988,7 @@ def max( @_cudf_nvtx_annotate def sum( self, - axis=None, + axis=no_default, skipna=True, dtype=None, level=None, @@ -2045,7 +2046,7 @@ def sum( @_cudf_nvtx_annotate def product( self, - axis=None, + axis=no_default, skipna=True, dtype=None, level=None, @@ -2089,11 +2090,11 @@ def product( b 5040 dtype: int64 """ - axis = self._get_axis_from_axis_arg(axis) + return self._reduce( # cuDF columns use "product" as the op name, but cupy uses "prod" # and we need cupy if axis == 1. - "product" if axis == 0 else "prod", + "prod" if axis in {1, "columns"} else "product", axis=axis, skipna=skipna, dtype=dtype, @@ -2108,7 +2109,12 @@ def product( @_cudf_nvtx_annotate def mean( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs + self, + axis=no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, ): """ Return the mean of the values for the requested axis. @@ -2154,7 +2160,7 @@ def mean( @_cudf_nvtx_annotate def std( self, - axis=None, + axis=no_default, skipna=True, level=None, ddof=1, @@ -2210,7 +2216,7 @@ def std( @_cudf_nvtx_annotate def var( self, - axis=None, + axis=no_default, skipna=True, level=None, ddof=1, @@ -2264,7 +2270,12 @@ def var( @_cudf_nvtx_annotate def kurtosis( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs + self, + axis=no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, ): """ Return Fisher's unbiased kurtosis of a sample. @@ -2305,7 +2316,7 @@ def kurtosis( b -1.2 dtype: float64 """ - if axis not in (0, "index", None): + if axis not in (0, "index", None, no_default): raise NotImplementedError("Only axis=0 is currently supported.") return self._reduce( @@ -2322,7 +2333,12 @@ def kurtosis( @_cudf_nvtx_annotate def skew( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs + self, + axis=no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, ): """ Return unbiased Fisher-Pearson skew of a sample. @@ -2366,7 +2382,7 @@ def skew( b -0.37037 dtype: float64 """ - if axis not in (0, "index", None): + if axis not in (0, "index", None, no_default): raise NotImplementedError("Only axis=0 is currently supported.") return self._reduce( @@ -2385,6 +2401,15 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): Parameters ---------- + axis : {0 or 'index', 1 or 'columns', None}, default 0 + Indicate which axis or axes should be reduced. For `Series` + this parameter is unused and defaults to `0`. + + - 0 or 'index' : reduce the index, return a Series + whose index is the original column labels. + - 1 or 'columns' : reduce the columns, return a Series + whose index is the original index. + - None : reduce all axes, return a scalar. skipna: bool, default True Exclude NA/null values. If the entire row/column is NA and skipna is True, then the result will be True, as for an @@ -2398,7 +2423,7 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): Notes ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. + Parameters currently not supported are `bool_only`, `level`. Examples -------- @@ -2424,6 +2449,15 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): Parameters ---------- + axis : {0 or 'index', 1 or 'columns', None}, default 0 + Indicate which axis or axes should be reduced. For `Series` + this parameter is unused and defaults to `0`. + + - 0 or 'index' : reduce the index, return a Series + whose index is the original column labels. + - 1 or 'columns' : reduce the columns, return a Series + whose index is the original index. + - None : reduce all axes, return a scalar. skipna: bool, default True Exclude NA/null values. If the entire row/column is NA and skipna is True, then the result will be False, as for an @@ -2437,7 +2471,7 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): Notes ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. + Parameters currently not supported are `bool_only`, `level`. Examples -------- diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e6ac34f2290..51a2d085d00 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3421,7 +3421,7 @@ def sample( 0 1 3 1 2 4 """ - axis = self._get_axis_from_axis_arg(axis) + axis = 0 if axis is None else self._get_axis_from_axis_arg(axis) size = self.shape[axis] # Compute `n` from parameter `frac`. diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 0edad039444..ffb432ed14a 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -11,6 +11,7 @@ import cudf from cudf._typing import Dtype, NotImplementedType, ScalarLike +from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, is_bool_dtype, @@ -30,7 +31,6 @@ class SingleColumnFrame(Frame, NotIterable): _SUPPORT_AXIS_LOOKUP = { 0: 0, - None: 0, "index": 0, } @@ -38,12 +38,12 @@ class SingleColumnFrame(Frame, NotIterable): def _reduce( self, op, - axis=None, + axis=no_default, level=None, numeric_only=None, **kwargs, ): - if axis not in (None, 0): + if axis not in (None, 0, no_default): raise NotImplementedError("axis parameter is not implemented yet") if level is not None: diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index a355ebb40b2..758a8cbb535 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -67,6 +67,11 @@ def test_array_func_cudf_series(np_ar, func): lambda x: np.sum(x, axis=0), lambda x: np.var(x, ddof=1), lambda x: np.dot(x, x.transpose()), + lambda x: np.all(x), + lambda x: np.any(x), + lambda x: np.product(x), + lambda x: np.product(x, axis=0), + lambda x: np.product(x, axis=1), ], ) def test_array_func_cudf_dataframe(pd_df, func): diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index c549ac20f59..47968ec1d97 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -12,7 +12,12 @@ from cudf import Series from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing import _utils as utils -from cudf.testing._utils import NUMERIC_TYPES, assert_eq, gen_rand +from cudf.testing._utils import ( + NUMERIC_TYPES, + assert_eq, + expect_warning_if, + gen_rand, +) params_dtype = NUMERIC_TYPES @@ -306,3 +311,56 @@ def test_categorical_reductions(op): psr = gsr.to_pandas() utils.assert_exceptions_equal(getattr(psr, op), getattr(gsr, op)) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": [10, 11, 12]}, + {"a": [1, 0, 3], "b": [10, 11, 12]}, + {"a": [1, 2, 3], "b": [10, 11, None]}, + { + "a": [], + }, + {}, + ], +) +@pytest.mark.parametrize("op", ["all", "any"]) +def test_any_all_axis_none(data, op): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + expected = getattr(pdf, op)(axis=None) + actual = getattr(gdf, op)(axis=None) + + assert expected == actual + + +@pytest.mark.parametrize( + "op", + [ + "sum", + "product", + "std", + "var", + "kurt", + "kurtosis", + "skew", + "min", + "max", + "mean", + "median", + ], +) +def test_reductions_axis_none_warning(op): + df = cudf.DataFrame({"a": [1, 2, 3], "b": [10, 2, 3]}) + pdf = df.to_pandas() + with pytest.warns(FutureWarning): + actual = getattr(df, op)(axis=None) + with expect_warning_if( + op in {"kurt", "kurtosis", "skew", "min", "max", "mean", "median"}, + FutureWarning, + ): + expected = getattr(pdf, op)(axis=None) + + assert_eq(expected, actual, check_dtype=False)