diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e08b78b98e642..7c42bb5a727ba 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2304,7 +2304,6 @@ def _groupby_op( ): if isinstance(self.dtype, StringDtype): if how in [ - "sum", "prod", "mean", "median", diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 6cf9cca341794..4835d808f2433 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2609,7 +2609,6 @@ def _groupby_op( if isinstance(self.dtype, StringDtype): # StringArray if op.how in [ - "sum", "prod", "mean", "median", diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index c1480f54163e0..bab8566a06dc2 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -6,6 +6,7 @@ is_bool_dtype, is_numeric_dtype, is_object_dtype, + is_string_dtype, ) import pandas as pd @@ -150,6 +151,7 @@ def test_in_numeric_groupby(self, data_for_grouping): is_numeric_dtype(dtype) or is_bool_dtype(dtype) or dtype.name == "decimal" + or is_string_dtype(dtype) or is_object_dtype(dtype) or dtype.kind == "m" # in particular duration[*][pyarrow] ): diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index d28eb227314c7..b937e7dcc8136 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -148,11 +148,11 @@ def test_cython_agg_return_dict(): def test_cython_fail_agg(): dr = bdate_range("1/1/2000", periods=50) - ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) + ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr) grouped = ts.groupby(lambda x: x.month) summed = grouped.sum() - expected = grouped.agg(np.sum) + expected = grouped.agg(np.sum).astype(object) tm.assert_series_equal(summed, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a434391983c01..ac7f305880878 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -709,8 +709,6 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string): grouped = df.groupby("A") no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median") - if using_infer_string: - no_drop_nuisance += ("sum",) if agg_function in no_drop_nuisance and not numeric_only: # Added numeric_only as part of GH#46560; these do not drop nuisance # columns when numeric_only is False @@ -1814,7 +1812,7 @@ def get_categorical_invalid_expected(): if op in ["prod", "sum", "skew"]: # ops that require more than just ordered-ness - if is_dt64 or is_cat or is_per or is_str: + if is_dt64 or is_cat or is_per or (is_str and op != "sum"): # GH#41291 # datetime64 -> prod and sum are invalid if is_dt64: diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index f3768ee3433b5..e915011875c60 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -182,7 +182,6 @@ def test_groupby_raises_string( if using_infer_string: if groupby_func in [ - "sum", "prod", "mean", "median", @@ -213,13 +212,7 @@ def test_groupby_raises_string( elif groupby_func in ["cummin", "cummax"]: msg = msg.replace("object", "str") elif groupby_func == "corrwith": - if df["d"].dtype.storage == "pyarrow": - msg = ( - "ArrowStringArrayNumpySemantics' with dtype str does not " - "support operation 'mean'" - ) - else: - msg = "Cannot perform reduction 'mean' with string dtype" + msg = "Cannot perform reduction 'mean' with string dtype" if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" @@ -273,17 +266,12 @@ def test_groupby_raises_string_np( }[groupby_func_np] if using_infer_string: - klass = TypeError - if df["d"].dtype.storage == "python": - msg = ( - f"Cannot perform reduction '{groupby_func_np.__name__}' " - "with string dtype" - ) - else: - msg = ( - "'ArrowStringArrayNumpySemantics' with dtype str does not " - f"support operation '{groupby_func_np.__name__}'" - ) + if groupby_func_np is np.mean: + klass = TypeError + msg = ( + f"Cannot perform reduction '{groupby_func_np.__name__}' " + "with string dtype" + ) _call_and_check(klass, msg, how, gb, groupby_func_np, ()) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 940c9e6700ea2..b7b80b5e427ff 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -912,11 +912,6 @@ def test_frame_downsample_method( msg = expected_data with pytest.raises(klass, match=msg): _ = func(**kwargs) - elif method == "sum" and using_infer_string and numeric_only is not True: - klass = TypeError - msg = f"dtype 'str' does not support operation '{method}'" - with pytest.raises(klass, match=msg): - _ = func(**kwargs) else: result = func(**kwargs) expected = DataFrame(expected_data, index=expected_index) @@ -968,10 +963,6 @@ def test_series_downsample_method( msg = "dtype 'str' does not support operation 'prod'" with pytest.raises(TypeError, match=msg): func(**kwargs) - elif method == "sum" and using_infer_string and numeric_only is not True: - msg = "dtype 'str' does not support operation 'sum'" - with pytest.raises(TypeError, match=msg): - func(**kwargs) else: result = func(**kwargs)