From ac09649b30b33d59470f76ee4f6eb0a63392eca4 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 9 Sep 2021 12:03:18 -0700 Subject: [PATCH] Backport PR #43154: Updating _resolve_numeric_only function of GroupBy (#43481) Co-authored-by: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> --- doc/source/whatsnew/v1.3.3.rst | 1 + pandas/core/groupby/groupby.py | 8 +++ pandas/tests/groupby/aggregate/test_cython.py | 3 +- pandas/tests/groupby/test_groupby.py | 62 +++++++++++++++++++ 4 files changed, 73 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index 1acadc26d335b..c066d015f5e62 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -26,6 +26,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.__getitem__` raising error for slice of :class:`DatetimeIndex` when index is non monotonic (:issue:`43223`) - Fixed regression in :meth:`.Resampler.aggregate` when used after column selection would raise if ``func`` is a list of aggregation functions (:issue:`42905`) - Fixed regression in :meth:`DataFrame.corr` where Kendall correlation would produce incorrect results for columns with repeated values (:issue:`43401`) +- Fixed regression in :meth:`DataFrame.groupby` where aggregation on columns with object types dropped results on those columns (:issue:`42395`, :issue:`43108`) - Fixed regression in :meth:`Series.fillna` raising ``TypeError`` when filling ``float`` ``Series`` with list-like fill value having a dtype which couldn't cast lostlessly (like ``float32`` filled with ``float64``) (:issue:`43424`) - diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6e8fbae77b4f2..edc5d5c6903b4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1126,6 +1126,14 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: if self.obj.ndim == 2: # i.e. DataFrameGroupBy numeric_only = True + # GH#42395 GH#43108 GH#43154 + # Regression from 1.2.5 to 1.3 caused object columns to be dropped + obj = self._obj_with_exclusions + check = obj._get_numeric_data() + if len(obj.columns) and not len(check.columns) and not obj.empty: + numeric_only = False + # TODO: v1.4+ Add FutureWarning + else: numeric_only = False diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index a035c5500e2dc..694f843ec138f 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -97,7 +97,8 @@ def test_cython_agg_nothing_to_agg(): frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) - result = frame[["b"]].groupby(frame["a"]).mean() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = frame[["b"]].groupby(frame["a"]).mean() expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates()) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0181481b29c44..b67e97ae6c6ff 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -17,6 +17,7 @@ MultiIndex, RangeIndex, Series, + Timedelta, Timestamp, date_range, read_csv, @@ -2377,6 +2378,67 @@ def test_groupby_empty_multi_column(as_index, numeric_only): tm.assert_frame_equal(result, expected) +def test_groupby_aggregation_non_numeric_dtype(): + # GH #43108 + df = DataFrame( + [["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], columns=["MW", "v"] + ) + + expected = DataFrame( + { + "v": [[1, 1], [10, 20]], + }, + index=Index(["M", "W"], dtype="object", name="MW"), + ) + + gb = df.groupby(by=["MW"]) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_multi_non_numeric_dtype(): + # GH #42395 + df = DataFrame( + { + "x": [1, 0, 1, 1, 0], + "y": [Timedelta(i, "days") for i in range(1, 6)], + "z": [Timedelta(i * 10, "days") for i in range(1, 6)], + } + ) + + expected = DataFrame( + { + "y": [Timedelta(i, "days") for i in range(7, 9)], + "z": [Timedelta(i * 10, "days") for i in range(7, 9)], + }, + index=Index([0, 1], dtype="int64", name="x"), + ) + + gb = df.groupby(by=["x"]) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_numeric_with_non_numeric_dtype(): + # GH #43108 + df = DataFrame( + { + "x": [1, 0, 1, 1, 0], + "y": [Timedelta(i, "days") for i in range(1, 6)], + "z": list(range(1, 6)), + } + ) + + expected = DataFrame( + {"z": [7, 8]}, + index=Index([0, 1], dtype="int64", name="x"), + ) + + gb = df.groupby(by=["x"]) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + def test_groupby_filtered_df_std(): # GH 16174 dicts = [