CLN: Cleanups in groupby due to numeric_only deprecations (#49761)

* DEPR: Enforce deprecation of dropping columns when numeric_only=False in groupby / resample * Change to TypeError * Better error message * WIP * WIP * CLN: Cleanups in groupby due to numeric_only deprecations * revert * Remove ops from groupby.String * fixup * fixup
pandas-dev · Nov 22, 2022 · 2f751ad · jbrockmendel · Nov 28, 2022 · 2f751ad
1 parent 025fbd0
commit 2f751ad
Show file tree

Hide file tree

Showing 5 changed files with 60 additions and 86 deletions.
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -671,12 +671,8 @@ class String:
         ["str", "string[python]"],
         [
             "sum",
-            "prod",
             "min",
             "max",
-            "mean",
-            "median",
-            "var",
             "first",
             "last",
             "any",

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -863,7 +863,7 @@ def skew(
         self,
         axis: Axis | lib.NoDefault = lib.no_default,
         skipna: bool = True,
-        numeric_only: bool | None = None,
+        numeric_only: bool = False,
         **kwargs,
     ) -> Series:
         result = self._op_via_apply(
@@ -1357,9 +1357,7 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:
 
         # We could use `mgr.apply` here and not have to set_axis, but
         #  we would have to do shape gymnastics for ArrayManager compat
-        res_mgr = mgr.grouped_reduce(
-            arr_func, ignore_failures=numeric_only is lib.no_default
-        )
+        res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=False)
         res_mgr.set_axis(1, mgr.axes[1])
 
         if len(res_mgr) < orig_mgr_len:
@@ -1785,84 +1783,64 @@ def nunique(self, dropna: bool = True) -> DataFrame:
 
     @doc(
         _shared_docs["idxmax"],
-        numeric_only_default="True for axis=0, False for axis=1",
+        numeric_only_default="False",
     )
     def idxmax(
         self,
         axis: Axis = 0,
         skipna: bool = True,
-        numeric_only: bool | lib.NoDefault = lib.no_default,
+        numeric_only: bool = False,
     ) -> DataFrame:
         axis = DataFrame._get_axis_number(axis)
-        if numeric_only is lib.no_default:
-            # Cannot use self._resolve_numeric_only; we must pass None to
-            # DataFrame.idxmax for backwards compatibility
-            numeric_only_arg = None if axis == 0 else False
-        else:
-            numeric_only_arg = numeric_only
 
         def func(df):
-            with warnings.catch_warnings():
-                # Suppress numeric_only warnings here, will warn below
-                warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmax")
-                res = df._reduce(
-                    nanops.nanargmax,
-                    "argmax",
-                    axis=axis,
-                    skipna=skipna,
-                    numeric_only=numeric_only_arg,
-                )
-                indices = res._values
-                index = df._get_axis(axis)
-                result = [index[i] if i >= 0 else np.nan for i in indices]
-                return df._constructor_sliced(result, index=res.index)
+            res = df._reduce(
+                nanops.nanargmax,
+                "argmax",
+                axis=axis,
+                skipna=skipna,
+                numeric_only=numeric_only,
+            )
+            indices = res._values
+            index = df._get_axis(axis)
+            result = [index[i] if i >= 0 else np.nan for i in indices]
+            return df._constructor_sliced(result, index=res.index)
 
         func.__name__ = "idxmax"
         result = self._python_apply_general(
             func, self._obj_with_exclusions, not_indexed_same=True
         )
-        self._maybe_warn_numeric_only_depr("idxmax", result, numeric_only)
         return result
 
     @doc(
         _shared_docs["idxmin"],
-        numeric_only_default="True for axis=0, False for axis=1",
+        numeric_only_default="False",
     )
     def idxmin(
         self,
         axis: Axis = 0,
         skipna: bool = True,
-        numeric_only: bool | lib.NoDefault = lib.no_default,
+        numeric_only: bool = False,
     ) -> DataFrame:
         axis = DataFrame._get_axis_number(axis)
-        if numeric_only is lib.no_default:
-            # Cannot use self._resolve_numeric_only; we must pass None to
-            # DataFrame.idxmin for backwards compatibility
-            numeric_only_arg = None if axis == 0 else False
-        else:
-            numeric_only_arg = numeric_only
 
         def func(df):
-            with warnings.catch_warnings():
-                # Suppress numeric_only warnings here, will warn below
-                warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmin")
-                res = df._reduce(
-                    nanops.nanargmin,
-                    "argmin",
-                    axis=axis,
-                    skipna=skipna,
-                    numeric_only=numeric_only_arg,
-                )
-                indices = res._values
-                index = df._get_axis(axis)
-                result = [index[i] if i >= 0 else np.nan for i in indices]
-                return df._constructor_sliced(result, index=res.index)
+            res = df._reduce(
+                nanops.nanargmin,
+                "argmin",
+                axis=axis,
+                skipna=skipna,
+                numeric_only=numeric_only,
+            )
+            indices = res._values
+            index = df._get_axis(axis)
+            result = [index[i] if i >= 0 else np.nan for i in indices]
+            return df._constructor_sliced(result, index=res.index)
 
         func.__name__ = "idxmin"
         result = self._python_apply_general(
             func, self._obj_with_exclusions, not_indexed_same=True
         )
-        self._maybe_warn_numeric_only_depr("idxmin", result, numeric_only)
         return result
 
     boxplot = boxplot_frame_groupby

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -324,8 +324,12 @@ class providing the base-class of operations.
 Parameters
 ----------
 numeric_only : bool, default {no}
-    Include only float, int, boolean columns. If None, will attempt to use
-    everything, then use only numeric data.
+    Include only float, int, boolean columns.
+
+    .. versionchanged:: 2.0.0
+
+        numeric_only no longer accepts ``None``.
+
 min_count : int, default {mc}
     The required number of valid values to perform the operation. If fewer
     than ``min_count`` non-NA values are present the result will be NA.
@@ -1654,7 +1658,6 @@ def _agg_general(
                 alt=npfunc,
                 numeric_only=numeric_only,
                 min_count=min_count,
-                ignore_failures=numeric_only is lib.no_default,
             )
             return result.__finalize__(self.obj, method="groupby")
 
@@ -1705,7 +1708,6 @@ def _cython_agg_general(
         alt: Callable,
         numeric_only: bool | lib.NoDefault,
         min_count: int = -1,
-        ignore_failures: bool = True,
         **kwargs,
     ):
         # Note: we never get here with how="ohlc" for DataFrameGroupBy;
@@ -1749,7 +1751,7 @@ def array_func(values: ArrayLike) -> ArrayLike:
 
         # TypeError -> we may have an exception in trying to aggregate
         #  continue and exclude the block
-        new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures)
+        new_mgr = data.grouped_reduce(array_func, ignore_failures=False)
 
         if not is_ser and len(new_mgr) < orig_len:
             warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)
@@ -2054,8 +2056,11 @@ def mean(
         Parameters
         ----------
         numeric_only : bool, default True
-            Include only float, int, boolean columns. If None, will attempt to use
-            everything, then use only numeric data.
+            Include only float, int, boolean columns.
+
+            .. versionchanged:: 2.0.0
+
+                numeric_only no longer accepts ``None``.
 
         engine : str, default None
             * ``'cython'`` : Runs the operation through C-extensions from cython.
@@ -2123,7 +2128,6 @@ def mean(
                 "mean",
                 alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool),
                 numeric_only=numeric_only,
-                ignore_failures=numeric_only is lib.no_default,
             )
             return result.__finalize__(self.obj, method="groupby")
 
@@ -2139,8 +2143,11 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
         Parameters
         ----------
         numeric_only : bool, default True
-            Include only float, int, boolean columns. If None, will attempt to use
-            everything, then use only numeric data.
+            Include only float, int, boolean columns.
+
+            .. versionchanged:: 2.0.0
+
+                numeric_only no longer accepts ``None``.
 
         Returns
         -------
@@ -2153,7 +2160,6 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
             "median",
             alt=lambda x: Series(x).median(numeric_only=numeric_only_bool),
             numeric_only=numeric_only,
-            ignore_failures=numeric_only is lib.no_default,
         )
         return result.__finalize__(self.obj, method="groupby")
 
@@ -2287,7 +2293,6 @@ def var(
                 "var",
                 alt=lambda x: Series(x).var(ddof=ddof),
                 numeric_only=numeric_only,
-                ignore_failures=numeric_only is lib.no_default,
                 ddof=ddof,
             )
 
@@ -3286,8 +3291,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
         is_ser = obj.ndim == 1
         mgr = self._get_data_to_aggregate()
         data = mgr.get_numeric_data() if numeric_only_bool else mgr
-        ignore_failures = numeric_only_bool
-        res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures)
+        res_mgr = data.grouped_reduce(blk_func, ignore_failures=False)
 
         if (
             numeric_only is lib.no_default
@@ -3765,9 +3769,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
         if numeric_only_bool:
             mgr = mgr.get_numeric_data()
 
-        res_mgr = mgr.grouped_reduce(
-            blk_func, ignore_failures=numeric_only is lib.no_default
-        )
+        res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=False)
 
         if not is_ser and len(res_mgr.items) != orig_mgr_len:
             howstr = how.replace("group_", "")

diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
@@ -101,8 +101,9 @@ def test_cython_agg_nothing_to_agg():
 
     frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
 
-    with tm.assert_produces_warning(FutureWarning):
-        result = frame[["b"]].groupby(frame["a"]).mean()
+    with pytest.raises(TypeError, match="Could not convert"):
+        frame[["b"]].groupby(frame["a"]).mean()
+    result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
     expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates())
     tm.assert_frame_equal(result, expected)
 

diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -103,9 +103,7 @@ def test_basic():  # TODO: split this test
     gb = df.groupby("A", observed=False)
     exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True)
     expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)})
-    msg = "The default value of numeric_only"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = gb.sum()
+    result = gb.sum()
     tm.assert_frame_equal(result, expected)
 
     # GH 8623
@@ -857,12 +855,14 @@ def test_preserve_categorical_dtype():
         }
     )
     for col in ["C1", "C2"]:
-        msg = "The default value of numeric_only"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result1 = df.groupby(by=col, as_index=False, observed=False).mean()
-            result2 = (
-                df.groupby(by=col, as_index=True, observed=False).mean().reset_index()
-            )
+        result1 = df.groupby(by=col, as_index=False, observed=False).mean(
+            numeric_only=True
+        )
+        result2 = (
+            df.groupby(by=col, as_index=True, observed=False)
+            .mean(numeric_only=True)
+            .reset_index()
+        )
         expected = exp_full.reindex(columns=result1.columns)
         tm.assert_frame_equal(result1, expected)
         tm.assert_frame_equal(result2, expected)
@@ -1856,10 +1856,7 @@ def test_category_order_reducer(
         df = df.set_index(keys)
     args = get_groupby_method_args(reduction_func, df)
     gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
-    msg = "is deprecated and will be removed in a future version"
-    warn = FutureWarning if reduction_func == "mad" else None
-    with tm.assert_produces_warning(warn, match=msg):
-        op_result = getattr(gb, reduction_func)(*args)
+    op_result = getattr(gb, reduction_func)(*args)
     if as_index:
         result = op_result.index.get_level_values("a").categories
     else:
-Original file line number
+Diff line change
@@ Expand Up / @@ -671,12 +671,8 @@ class String: @@
             ["str", "string[python]"],
             [
                 "sum",
-                "prod",
                 "min",
                 "max",
-                "mean",
-                "median",
-                "var",
                 "first",
                 "last",
                 "any",
@@ Expand Down @@