Backport PR #43154: Updating _resolve_numeric_only function of GroupBy (

#43481) Co-authored-by: Prerana Chakraborty <[email protected]>
pandas-dev · Sep 9, 2021 · ac09649 · ac09649
1 parent 04328a7
commit ac09649
Show file tree

Hide file tree

Showing 4 changed files with 73 additions and 1 deletion.
diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst
@@ -26,6 +26,7 @@ Fixed regressions
 - Fixed regression in :meth:`DataFrame.__getitem__` raising error for slice of :class:`DatetimeIndex` when index is non monotonic (:issue:`43223`)
 - Fixed regression in :meth:`.Resampler.aggregate` when used after column selection would raise if ``func`` is a list of aggregation functions (:issue:`42905`)
 - Fixed regression in :meth:`DataFrame.corr` where Kendall correlation would produce incorrect results for columns with repeated values (:issue:`43401`)
+- Fixed regression in :meth:`DataFrame.groupby` where aggregation on columns with object types dropped results on those columns (:issue:`42395`, :issue:`43108`)
 - Fixed regression in :meth:`Series.fillna` raising ``TypeError`` when filling ``float`` ``Series`` with list-like fill value having a dtype which couldn't cast lostlessly (like ``float32`` filled with ``float64``) (:issue:`43424`)
 -
 

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1126,6 +1126,14 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool:
             if self.obj.ndim == 2:
                 # i.e. DataFrameGroupBy
                 numeric_only = True
+                # GH#42395 GH#43108 GH#43154
+                # Regression from 1.2.5 to 1.3 caused object columns to be dropped
+                obj = self._obj_with_exclusions
+                check = obj._get_numeric_data()
+                if len(obj.columns) and not len(check.columns) and not obj.empty:
+                    numeric_only = False
+                    # TODO: v1.4+ Add FutureWarning
+
             else:
                 numeric_only = False
 

diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
@@ -97,7 +97,8 @@ def test_cython_agg_nothing_to_agg():
 
     frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
 
-    result = frame[["b"]].groupby(frame["a"]).mean()
+    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+        result = frame[["b"]].groupby(frame["a"]).mean()
     expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates())
     tm.assert_frame_equal(result, expected)
 

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -17,6 +17,7 @@
     MultiIndex,
     RangeIndex,
     Series,
+    Timedelta,
     Timestamp,
     date_range,
     read_csv,
@@ -2377,6 +2378,67 @@ def test_groupby_empty_multi_column(as_index, numeric_only):
     tm.assert_frame_equal(result, expected)
 
 
+def test_groupby_aggregation_non_numeric_dtype():
+    # GH #43108
+    df = DataFrame(
+        [["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], columns=["MW", "v"]
+    )
+
+    expected = DataFrame(
+        {
+            "v": [[1, 1], [10, 20]],
+        },
+        index=Index(["M", "W"], dtype="object", name="MW"),
+    )
+
+    gb = df.groupby(by=["MW"])
+    result = gb.sum()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_aggregation_multi_non_numeric_dtype():
+    # GH #42395
+    df = DataFrame(
+        {
+            "x": [1, 0, 1, 1, 0],
+            "y": [Timedelta(i, "days") for i in range(1, 6)],
+            "z": [Timedelta(i * 10, "days") for i in range(1, 6)],
+        }
+    )
+
+    expected = DataFrame(
+        {
+            "y": [Timedelta(i, "days") for i in range(7, 9)],
+            "z": [Timedelta(i * 10, "days") for i in range(7, 9)],
+        },
+        index=Index([0, 1], dtype="int64", name="x"),
+    )
+
+    gb = df.groupby(by=["x"])
+    result = gb.sum()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_aggregation_numeric_with_non_numeric_dtype():
+    # GH #43108
+    df = DataFrame(
+        {
+            "x": [1, 0, 1, 1, 0],
+            "y": [Timedelta(i, "days") for i in range(1, 6)],
+            "z": list(range(1, 6)),
+        }
+    )
+
+    expected = DataFrame(
+        {"z": [7, 8]},
+        index=Index([0, 1], dtype="int64", name="x"),
+    )
+
+    gb = df.groupby(by=["x"])
+    result = gb.sum()
+    tm.assert_frame_equal(result, expected)
+
+
 def test_groupby_filtered_df_std():
     # GH 16174
     dicts = [