From edb25a84aaafa0d65d36ebb94113393d4b6474fb Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 8 Aug 2023 20:52:11 -0500
Subject: [PATCH] Fix `any`, `all` reduction behavior for `axis=None` and warn
 for other reductions (#13831)

Fixes: #13827

This PR:

- [x] Fixes `axis=None` behavior for `any` & `all` reductions.
- [x] Introduces `FutureWarning` for upcoming change in behavior for the rest of the reductions since some of the reductions are only updated in pandas-2.0 and the rest would be updated in pandas-3.0.
- [x] Fixed numpy array function inconsistency because of mismatching default for `axis` parameter.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/13831
---
 python/cudf/cudf/core/dataframe.py            | 48 +++++++++++---
 python/cudf/cudf/core/frame.py                | 64 ++++++++++++++-----
 python/cudf/cudf/core/indexed_frame.py        |  2 +-
 python/cudf/cudf/core/single_column_frame.py  |  6 +-
 python/cudf/cudf/tests/test_array_function.py |  5 ++
 python/cudf/cudf/tests/test_reductions.py     | 60 ++++++++++++++++-
 6 files changed, 156 insertions(+), 29 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 0298dd103f5..dabef4adde0 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1333,6 +1333,11 @@ def __array_function__(self, func, types, args, kwargs):
             return NotImplemented
 
         try:
+            if func.__name__ in {"any", "all"}:
+                # NumPy default for `axis` is
+                # different from `cudf`/`pandas`
+                # hence need this special handling.
+                kwargs.setdefault("axis", None)
             if cudf_func := getattr(self.__class__, func.__name__, None):
                 out = cudf_func(*args, **kwargs)
                 # The dot product of two DataFrames returns an array in pandas.
@@ -2557,7 +2562,7 @@ def reindex(
                 "Cannot specify both 'axis' and any of 'index' or 'columns'."
             )
 
-        axis = self._get_axis_from_axis_arg(axis)
+        axis = 0 if axis is None else self._get_axis_from_axis_arg(axis)
         if axis == 0:
             if index is None:
                 index = labels
@@ -5798,7 +5803,6 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs):
     _SUPPORT_AXIS_LOOKUP = {
         0: 0,
         1: 1,
-        None: 0,
         "index": 0,
         "columns": 1,
     }
@@ -5826,9 +5830,26 @@ def _reduce(
             if source.empty:
                 return Series(index=cudf.Index([], dtype="str"))
 
-        axis = source._get_axis_from_axis_arg(axis)
+        if axis is None:
+            if op in {"any", "all"}:
+                axis = 2
+            else:
+                # Do not remove until pandas 2.0 support is added.
+                warnings.warn(
+                    f"In a future version, {type(self).__name__}"
+                    f".{op}(axis=None) will return a scalar {op} over "
+                    "the entire DataFrame. To retain the old behavior, "
+                    f"use '{type(self).__name__}.{op}(axis=0)' or "
+                    f"just '{type(self)}.{op}()'",
+                    FutureWarning,
+                )
+                axis = 0
+        elif axis is no_default:
+            axis = 0
+        else:
+            axis = source._get_axis_from_axis_arg(axis)
 
-        if axis == 0:
+        if axis in {0, 2}:
             try:
                 result = [
                     getattr(source._data[col], op)(**kwargs)
@@ -5867,7 +5888,10 @@ def _reduce(
                     )
                     source = self._get_columns_by_label(numeric_cols)
                     if source.empty:
-                        return Series(index=cudf.Index([], dtype="str"))
+                        if axis == 2:
+                            return getattr(as_column([]), op)(**kwargs)
+                        else:
+                            return Series(index=cudf.Index([], dtype="str"))
                     try:
                         result = [
                             getattr(source._data[col], op)(**kwargs)
@@ -5879,12 +5903,16 @@ def _reduce(
                         )
                 else:
                     raise
-
-            return Series._from_data(
-                {None: result}, as_index(source._data.names)
-            )
+            if axis == 2:
+                return getattr(as_column(result), op)(**kwargs)
+            else:
+                return Series._from_data(
+                    {None: result}, as_index(source._data.names)
+                )
         elif axis == 1:
             return source._apply_cupy_method_axis_1(op, **kwargs)
+        else:
+            raise ValueError(f"Invalid value of {axis=} received for {op}")
 
     @_cudf_nvtx_annotate
     def _scan(
@@ -5894,6 +5922,8 @@ def _scan(
         *args,
         **kwargs,
     ):
+        if axis is None:
+            axis = 0
         axis = self._get_axis_from_axis_arg(axis)
 
         if axis == 0:
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 466a704c56e..69757fe900d 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -31,6 +31,7 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._typing import Dtype
+from cudf.api.extensions import no_default
 from cudf.api.types import is_bool_dtype, is_dtype_equal, is_scalar
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import (
@@ -1885,7 +1886,7 @@ def _reduce(self, *args, **kwargs):
     @_cudf_nvtx_annotate
     def min(
         self,
-        axis=None,
+        axis=no_default,
         skipna=True,
         level=None,
         numeric_only=None,
@@ -1936,7 +1937,7 @@ def min(
     @_cudf_nvtx_annotate
     def max(
         self,
-        axis=None,
+        axis=no_default,
         skipna=True,
         level=None,
         numeric_only=None,
@@ -1987,7 +1988,7 @@ def max(
     @_cudf_nvtx_annotate
     def sum(
         self,
-        axis=None,
+        axis=no_default,
         skipna=True,
         dtype=None,
         level=None,
@@ -2045,7 +2046,7 @@ def sum(
     @_cudf_nvtx_annotate
     def product(
         self,
-        axis=None,
+        axis=no_default,
         skipna=True,
         dtype=None,
         level=None,
@@ -2089,11 +2090,11 @@ def product(
         b    5040
         dtype: int64
         """
-        axis = self._get_axis_from_axis_arg(axis)
+
         return self._reduce(
             # cuDF columns use "product" as the op name, but cupy uses "prod"
             # and we need cupy if axis == 1.
-            "product" if axis == 0 else "prod",
+            "prod" if axis in {1, "columns"} else "product",
             axis=axis,
             skipna=skipna,
             dtype=dtype,
@@ -2108,7 +2109,12 @@ def product(
 
     @_cudf_nvtx_annotate
     def mean(
-        self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs
+        self,
+        axis=no_default,
+        skipna=True,
+        level=None,
+        numeric_only=None,
+        **kwargs,
     ):
         """
         Return the mean of the values for the requested axis.
@@ -2154,7 +2160,7 @@ def mean(
     @_cudf_nvtx_annotate
     def std(
         self,
-        axis=None,
+        axis=no_default,
         skipna=True,
         level=None,
         ddof=1,
@@ -2210,7 +2216,7 @@ def std(
     @_cudf_nvtx_annotate
     def var(
         self,
-        axis=None,
+        axis=no_default,
         skipna=True,
         level=None,
         ddof=1,
@@ -2264,7 +2270,12 @@ def var(
 
     @_cudf_nvtx_annotate
     def kurtosis(
-        self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs
+        self,
+        axis=no_default,
+        skipna=True,
+        level=None,
+        numeric_only=None,
+        **kwargs,
     ):
         """
         Return Fisher's unbiased kurtosis of a sample.
@@ -2305,7 +2316,7 @@ def kurtosis(
         b   -1.2
         dtype: float64
         """
-        if axis not in (0, "index", None):
+        if axis not in (0, "index", None, no_default):
             raise NotImplementedError("Only axis=0 is currently supported.")
 
         return self._reduce(
@@ -2322,7 +2333,12 @@ def kurtosis(
 
     @_cudf_nvtx_annotate
     def skew(
-        self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs
+        self,
+        axis=no_default,
+        skipna=True,
+        level=None,
+        numeric_only=None,
+        **kwargs,
     ):
         """
         Return unbiased Fisher-Pearson skew of a sample.
@@ -2366,7 +2382,7 @@ def skew(
         b   -0.37037
         dtype: float64
         """
-        if axis not in (0, "index", None):
+        if axis not in (0, "index", None, no_default):
             raise NotImplementedError("Only axis=0 is currently supported.")
 
         return self._reduce(
@@ -2385,6 +2401,15 @@ def all(self, axis=0, skipna=True, level=None, **kwargs):
 
         Parameters
         ----------
+        axis : {0 or 'index', 1 or 'columns', None}, default 0
+            Indicate which axis or axes should be reduced. For `Series`
+            this parameter is unused and defaults to `0`.
+
+            - 0 or 'index' : reduce the index, return a Series
+                whose index is the original column labels.
+            - 1 or 'columns' : reduce the columns, return a Series
+                whose index is the original index.
+            - None : reduce all axes, return a scalar.
         skipna: bool, default True
             Exclude NA/null values. If the entire row/column is NA and
             skipna is True, then the result will be True, as for an
@@ -2398,7 +2423,7 @@ def all(self, axis=0, skipna=True, level=None, **kwargs):
 
         Notes
         -----
-        Parameters currently not supported are `axis`, `bool_only`, `level`.
+        Parameters currently not supported are `bool_only`, `level`.
 
         Examples
         --------
@@ -2424,6 +2449,15 @@ def any(self, axis=0, skipna=True, level=None, **kwargs):
 
         Parameters
         ----------
+        axis : {0 or 'index', 1 or 'columns', None}, default 0
+            Indicate which axis or axes should be reduced. For `Series`
+            this parameter is unused and defaults to `0`.
+
+            - 0 or 'index' : reduce the index, return a Series
+                whose index is the original column labels.
+            - 1 or 'columns' : reduce the columns, return a Series
+                whose index is the original index.
+            - None : reduce all axes, return a scalar.
         skipna: bool, default True
             Exclude NA/null values. If the entire row/column is NA and
             skipna is True, then the result will be False, as for an
@@ -2437,7 +2471,7 @@ def any(self, axis=0, skipna=True, level=None, **kwargs):
 
         Notes
         -----
-        Parameters currently not supported are `axis`, `bool_only`, `level`.
+        Parameters currently not supported are `bool_only`, `level`.
 
         Examples
         --------
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index e6ac34f2290..51a2d085d00 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -3421,7 +3421,7 @@ def sample(
         0  1  3
         1  2  4
         """
-        axis = self._get_axis_from_axis_arg(axis)
+        axis = 0 if axis is None else self._get_axis_from_axis_arg(axis)
         size = self.shape[axis]
 
         # Compute `n` from parameter `frac`.
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 0edad039444..ffb432ed14a 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -11,6 +11,7 @@
 
 import cudf
 from cudf._typing import Dtype, NotImplementedType, ScalarLike
+from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_scalar_or_zero_d_array,
     is_bool_dtype,
@@ -30,7 +31,6 @@ class SingleColumnFrame(Frame, NotIterable):
 
     _SUPPORT_AXIS_LOOKUP = {
         0: 0,
-        None: 0,
         "index": 0,
     }
 
@@ -38,12 +38,12 @@ class SingleColumnFrame(Frame, NotIterable):
     def _reduce(
         self,
         op,
-        axis=None,
+        axis=no_default,
         level=None,
         numeric_only=None,
         **kwargs,
     ):
-        if axis not in (None, 0):
+        if axis not in (None, 0, no_default):
             raise NotImplementedError("axis parameter is not implemented yet")
 
         if level is not None:
diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py
index a355ebb40b2..758a8cbb535 100644
--- a/python/cudf/cudf/tests/test_array_function.py
+++ b/python/cudf/cudf/tests/test_array_function.py
@@ -67,6 +67,11 @@ def test_array_func_cudf_series(np_ar, func):
         lambda x: np.sum(x, axis=0),
         lambda x: np.var(x, ddof=1),
         lambda x: np.dot(x, x.transpose()),
+        lambda x: np.all(x),
+        lambda x: np.any(x),
+        lambda x: np.product(x),
+        lambda x: np.product(x, axis=0),
+        lambda x: np.product(x, axis=1),
     ],
 )
 def test_array_func_cudf_dataframe(pd_df, func):
diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py
index c549ac20f59..47968ec1d97 100644
--- a/python/cudf/cudf/tests/test_reductions.py
+++ b/python/cudf/cudf/tests/test_reductions.py
@@ -12,7 +12,12 @@
 from cudf import Series
 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
 from cudf.testing import _utils as utils
-from cudf.testing._utils import NUMERIC_TYPES, assert_eq, gen_rand
+from cudf.testing._utils import (
+    NUMERIC_TYPES,
+    assert_eq,
+    expect_warning_if,
+    gen_rand,
+)
 
 params_dtype = NUMERIC_TYPES
 
@@ -306,3 +311,56 @@ def test_categorical_reductions(op):
     psr = gsr.to_pandas()
 
     utils.assert_exceptions_equal(getattr(psr, op), getattr(gsr, op))
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {"a": [1, 2, 3], "b": [10, 11, 12]},
+        {"a": [1, 0, 3], "b": [10, 11, 12]},
+        {"a": [1, 2, 3], "b": [10, 11, None]},
+        {
+            "a": [],
+        },
+        {},
+    ],
+)
+@pytest.mark.parametrize("op", ["all", "any"])
+def test_any_all_axis_none(data, op):
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    expected = getattr(pdf, op)(axis=None)
+    actual = getattr(gdf, op)(axis=None)
+
+    assert expected == actual
+
+
+@pytest.mark.parametrize(
+    "op",
+    [
+        "sum",
+        "product",
+        "std",
+        "var",
+        "kurt",
+        "kurtosis",
+        "skew",
+        "min",
+        "max",
+        "mean",
+        "median",
+    ],
+)
+def test_reductions_axis_none_warning(op):
+    df = cudf.DataFrame({"a": [1, 2, 3], "b": [10, 2, 3]})
+    pdf = df.to_pandas()
+    with pytest.warns(FutureWarning):
+        actual = getattr(df, op)(axis=None)
+    with expect_warning_if(
+        op in {"kurt", "kurtosis", "skew", "min", "max", "mean", "median"},
+        FutureWarning,
+    ):
+        expected = getattr(pdf, op)(axis=None)
+
+    assert_eq(expected, actual, check_dtype=False)