Skip to content

Commit

Permalink
CLN: Cleanups in groupby due to numeric_only deprecations (#49761)
Browse files Browse the repository at this point in the history
* DEPR: Enforce deprecation of dropping columns when numeric_only=False in groupby / resample

* Change to TypeError

* Better error message

* WIP

* WIP

* CLN: Cleanups in groupby due to numeric_only deprecations

* revert

* Remove ops from groupby.String

* fixup

* fixup
  • Loading branch information
rhshadrach authored Nov 22, 2022
1 parent 025fbd0 commit 2f751ad
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 86 deletions.
4 changes: 0 additions & 4 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,12 +671,8 @@ class String:
["str", "string[python]"],
[
"sum",
"prod",
"min",
"max",
"mean",
"median",
"var",
"first",
"last",
"any",
Expand Down
78 changes: 28 additions & 50 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -863,7 +863,7 @@ def skew(
self,
axis: Axis | lib.NoDefault = lib.no_default,
skipna: bool = True,
numeric_only: bool | None = None,
numeric_only: bool = False,
**kwargs,
) -> Series:
result = self._op_via_apply(
Expand Down Expand Up @@ -1357,9 +1357,7 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:

# We could use `mgr.apply` here and not have to set_axis, but
# we would have to do shape gymnastics for ArrayManager compat
res_mgr = mgr.grouped_reduce(
arr_func, ignore_failures=numeric_only is lib.no_default

This comment has been minimized.

Copy link
@jbrockmendel

jbrockmendel Nov 28, 2022

Member

it looks like all remaining usages of grouped_reduce use ignore_failures=False, which if correct means 1) the keyword can be removed and 2) some of the warn_dropping_nuisance_columns_deprecated checks in this file can be removed

)
res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=False)
res_mgr.set_axis(1, mgr.axes[1])

if len(res_mgr) < orig_mgr_len:
Expand Down Expand Up @@ -1785,84 +1783,64 @@ def nunique(self, dropna: bool = True) -> DataFrame:

@doc(
_shared_docs["idxmax"],
numeric_only_default="True for axis=0, False for axis=1",
numeric_only_default="False",
)
def idxmax(
self,
axis: Axis = 0,
skipna: bool = True,
numeric_only: bool | lib.NoDefault = lib.no_default,
numeric_only: bool = False,
) -> DataFrame:
axis = DataFrame._get_axis_number(axis)
if numeric_only is lib.no_default:
# Cannot use self._resolve_numeric_only; we must pass None to
# DataFrame.idxmax for backwards compatibility
numeric_only_arg = None if axis == 0 else False
else:
numeric_only_arg = numeric_only

def func(df):
with warnings.catch_warnings():
# Suppress numeric_only warnings here, will warn below
warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmax")
res = df._reduce(
nanops.nanargmax,
"argmax",
axis=axis,
skipna=skipna,
numeric_only=numeric_only_arg,
)
indices = res._values
index = df._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)
res = df._reduce(
nanops.nanargmax,
"argmax",
axis=axis,
skipna=skipna,
numeric_only=numeric_only,
)
indices = res._values
index = df._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)

func.__name__ = "idxmax"
result = self._python_apply_general(
func, self._obj_with_exclusions, not_indexed_same=True
)
self._maybe_warn_numeric_only_depr("idxmax", result, numeric_only)
return result

@doc(
_shared_docs["idxmin"],
numeric_only_default="True for axis=0, False for axis=1",
numeric_only_default="False",
)
def idxmin(
self,
axis: Axis = 0,
skipna: bool = True,
numeric_only: bool | lib.NoDefault = lib.no_default,
numeric_only: bool = False,
) -> DataFrame:
axis = DataFrame._get_axis_number(axis)
if numeric_only is lib.no_default:
# Cannot use self._resolve_numeric_only; we must pass None to
# DataFrame.idxmin for backwards compatibility
numeric_only_arg = None if axis == 0 else False
else:
numeric_only_arg = numeric_only

def func(df):
with warnings.catch_warnings():
# Suppress numeric_only warnings here, will warn below
warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmin")
res = df._reduce(
nanops.nanargmin,
"argmin",
axis=axis,
skipna=skipna,
numeric_only=numeric_only_arg,
)
indices = res._values
index = df._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)
res = df._reduce(
nanops.nanargmin,
"argmin",
axis=axis,
skipna=skipna,
numeric_only=numeric_only,
)
indices = res._values
index = df._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)

func.__name__ = "idxmin"
result = self._python_apply_general(
func, self._obj_with_exclusions, not_indexed_same=True
)
self._maybe_warn_numeric_only_depr("idxmin", result, numeric_only)
return result

boxplot = boxplot_frame_groupby
Expand Down
36 changes: 19 additions & 17 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,12 @@ class providing the base-class of operations.
Parameters
----------
numeric_only : bool, default {no}
Include only float, int, boolean columns. If None, will attempt to use
everything, then use only numeric data.
Include only float, int, boolean columns.
.. versionchanged:: 2.0.0
numeric_only no longer accepts ``None``.
min_count : int, default {mc}
The required number of valid values to perform the operation. If fewer
than ``min_count`` non-NA values are present the result will be NA.
Expand Down Expand Up @@ -1654,7 +1658,6 @@ def _agg_general(
alt=npfunc,
numeric_only=numeric_only,
min_count=min_count,
ignore_failures=numeric_only is lib.no_default,
)
return result.__finalize__(self.obj, method="groupby")

Expand Down Expand Up @@ -1705,7 +1708,6 @@ def _cython_agg_general(
alt: Callable,
numeric_only: bool | lib.NoDefault,
min_count: int = -1,
ignore_failures: bool = True,
**kwargs,
):
# Note: we never get here with how="ohlc" for DataFrameGroupBy;
Expand Down Expand Up @@ -1749,7 +1751,7 @@ def array_func(values: ArrayLike) -> ArrayLike:

# TypeError -> we may have an exception in trying to aggregate
# continue and exclude the block
new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures)
new_mgr = data.grouped_reduce(array_func, ignore_failures=False)

if not is_ser and len(new_mgr) < orig_len:
warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)
Expand Down Expand Up @@ -2054,8 +2056,11 @@ def mean(
Parameters
----------
numeric_only : bool, default True
Include only float, int, boolean columns. If None, will attempt to use
everything, then use only numeric data.
Include only float, int, boolean columns.
.. versionchanged:: 2.0.0
numeric_only no longer accepts ``None``.
engine : str, default None
* ``'cython'`` : Runs the operation through C-extensions from cython.
Expand Down Expand Up @@ -2123,7 +2128,6 @@ def mean(
"mean",
alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool),
numeric_only=numeric_only,
ignore_failures=numeric_only is lib.no_default,
)
return result.__finalize__(self.obj, method="groupby")

Expand All @@ -2139,8 +2143,11 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
Parameters
----------
numeric_only : bool, default True
Include only float, int, boolean columns. If None, will attempt to use
everything, then use only numeric data.
Include only float, int, boolean columns.
.. versionchanged:: 2.0.0
numeric_only no longer accepts ``None``.
Returns
-------
Expand All @@ -2153,7 +2160,6 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
"median",
alt=lambda x: Series(x).median(numeric_only=numeric_only_bool),
numeric_only=numeric_only,
ignore_failures=numeric_only is lib.no_default,
)
return result.__finalize__(self.obj, method="groupby")

Expand Down Expand Up @@ -2287,7 +2293,6 @@ def var(
"var",
alt=lambda x: Series(x).var(ddof=ddof),
numeric_only=numeric_only,
ignore_failures=numeric_only is lib.no_default,
ddof=ddof,
)

Expand Down Expand Up @@ -3286,8 +3291,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
is_ser = obj.ndim == 1
mgr = self._get_data_to_aggregate()
data = mgr.get_numeric_data() if numeric_only_bool else mgr
ignore_failures = numeric_only_bool
res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures)
res_mgr = data.grouped_reduce(blk_func, ignore_failures=False)

if (
numeric_only is lib.no_default
Expand Down Expand Up @@ -3765,9 +3769,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
if numeric_only_bool:
mgr = mgr.get_numeric_data()

res_mgr = mgr.grouped_reduce(
blk_func, ignore_failures=numeric_only is lib.no_default
)
res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=False)

if not is_ser and len(res_mgr.items) != orig_mgr_len:
howstr = how.replace("group_", "")
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/groupby/aggregate/test_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,9 @@ def test_cython_agg_nothing_to_agg():

frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})

with tm.assert_produces_warning(FutureWarning):
result = frame[["b"]].groupby(frame["a"]).mean()
with pytest.raises(TypeError, match="Could not convert"):
frame[["b"]].groupby(frame["a"]).mean()
result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates())
tm.assert_frame_equal(result, expected)

Expand Down
23 changes: 10 additions & 13 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,7 @@ def test_basic(): # TODO: split this test
gb = df.groupby("A", observed=False)
exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True)
expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)})
msg = "The default value of numeric_only"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = gb.sum()
result = gb.sum()
tm.assert_frame_equal(result, expected)

# GH 8623
Expand Down Expand Up @@ -857,12 +855,14 @@ def test_preserve_categorical_dtype():
}
)
for col in ["C1", "C2"]:
msg = "The default value of numeric_only"
with tm.assert_produces_warning(FutureWarning, match=msg):
result1 = df.groupby(by=col, as_index=False, observed=False).mean()
result2 = (
df.groupby(by=col, as_index=True, observed=False).mean().reset_index()
)
result1 = df.groupby(by=col, as_index=False, observed=False).mean(
numeric_only=True
)
result2 = (
df.groupby(by=col, as_index=True, observed=False)
.mean(numeric_only=True)
.reset_index()
)
expected = exp_full.reindex(columns=result1.columns)
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, expected)
Expand Down Expand Up @@ -1856,10 +1856,7 @@ def test_category_order_reducer(
df = df.set_index(keys)
args = get_groupby_method_args(reduction_func, df)
gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
msg = "is deprecated and will be removed in a future version"
warn = FutureWarning if reduction_func == "mad" else None
with tm.assert_produces_warning(warn, match=msg):
op_result = getattr(gb, reduction_func)(*args)
op_result = getattr(gb, reduction_func)(*args)
if as_index:
result = op_result.index.get_level_values("a").categories
else:
Expand Down

0 comments on commit 2f751ad

Please sign in to comment.