diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 74f9af5bf8447..d6aa13184f685 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -333,6 +333,31 @@ values as measured by ``np.allclose``. Now no such casting occurs. df.groupby('key').agg(lambda x: x.sum()) +``float`` result for :meth:`.GroupBy.mean`, :meth:`.GroupBy.median`, and :meth:`.GroupBy.var` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, these methods could result in different dtypes depending on the input values. +Now, these methods will always return a float dtype. (:issue:`41137`) + +.. ipython:: python + + df = pd.DataFrame({'a': [True], 'b': [1], 'c': [1.0]}) + +*pandas 1.2.x* + +.. code-block:: ipython + + In [5]: df.groupby(df.index).mean() + Out[5]: + a b c + 0 True 1 1.0 + +*pandas 1.3.0* + +.. ipython:: python + + df.groupby(df.index).mean() + Try operating inplace when setting values with ``loc`` and ``iloc`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4310d849d66e8..b27eb4bb8f325 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1600,12 +1600,12 @@ def mean(self, numeric_only: bool = True): Groupby two columns and return the mean of the remaining column. >>> df.groupby(['A', 'B']).mean() - C + C A B - 1 2.0 2 - 4.0 1 - 2 3.0 1 - 5.0 2 + 1 2.0 2.0 + 4.0 1.0 + 2 3.0 1.0 + 5.0 2.0 Groupby one column and return the mean of only particular column in the group. diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 877204f5bb2a2..1f79be3e20276 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -150,8 +150,8 @@ class Grouper: >>> df.groupby(pd.Grouper(key="Animal")).mean() Speed Animal - Falcon 200 - Parrot 10 + Falcon 200.0 + Parrot 10.0 Specify a resample operation on the column 'Publish date' diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b995a74c20a40..aff454c64a8fe 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -49,6 +49,7 @@ is_categorical_dtype, is_complex_dtype, is_datetime64_any_dtype, + is_float_dtype, is_integer_dtype, is_numeric_dtype, is_sparse, @@ -304,10 +305,13 @@ def _get_result_dtype(self, dtype: DtypeObj) -> DtypeObj: return np.dtype(np.int64) elif isinstance(dtype, (BooleanDtype, _IntegerDtype)): return Int64Dtype() - elif how in ["mean", "median", "var"] and isinstance( - dtype, (BooleanDtype, _IntegerDtype) - ): - return Float64Dtype() + elif how in ["mean", "median", "var"]: + if isinstance(dtype, (BooleanDtype, _IntegerDtype)): + return Float64Dtype() + elif is_float_dtype(dtype): + return dtype + elif is_numeric_dtype(dtype): + return np.dtype(np.float64) return dtype def uses_mask(self) -> bool: diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 9b2792f3b0aea..1a045fa33f487 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -25,7 +25,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): _, index = pd.factorize(data_for_grouping, sort=True) index = pd.Index(index, name="B") - expected = pd.Series([3, 1, 4], index=index, name="A") + expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A") if as_index: self.assert_series_equal(result, expected) else: @@ -54,7 +54,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping): _, index = pd.factorize(data_for_grouping, sort=False) index = pd.Index(index, name="B") - expected = pd.Series([1, 3, 4], index=index, name="A") + expected = pd.Series([1.0, 3.0, 4.0], index=index, name="A") self.assert_series_equal(result, expected) def test_groupby_extension_transform(self, data_for_grouping): diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 8a52734b27bc7..172137ff3a5a2 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -272,7 +272,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): _, index = pd.factorize(data_for_grouping, sort=True) index = pd.Index(index, name="B") - expected = pd.Series([3, 1], index=index, name="A") + expected = pd.Series([3.0, 1.0], index=index, name="A") if as_index: self.assert_series_equal(result, expected) else: @@ -301,7 +301,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping): _, index = pd.factorize(data_for_grouping, sort=False) index = pd.Index(index, name="B") - expected = pd.Series([1, 3], index=index, name="A") + expected = pd.Series([1.0, 3.0], index=index, name="A") self.assert_series_equal(result, expected) def test_groupby_extension_transform(self, data_for_grouping): diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 0a33a2bbe1d0a..18739320d90d3 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -512,7 +512,9 @@ def test_uint64_type_handling(dtype, how): expected = df.groupby("y").agg({"x": how}) df.x = df.x.astype(dtype) result = df.groupby("y").agg({"x": how}) - result.x = result.x.astype(np.int64) + if how not in ("mean", "median"): + # mean and median always result in floats + result.x = result.x.astype(np.int64) tm.assert_frame_equal(result, expected, check_exact=True) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index ded10ab11d5a8..4a8aabe41b754 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -196,9 +196,6 @@ def test_cython_agg_empty_buckets(op, targop, observed): g = df.groupby(pd.cut(df[0], grps), observed=observed) expected = g.agg(lambda x: targop(x)) - if observed and op not in ("min", "max"): - # TODO: GH 41137 - expected = expected.astype("int64") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 7349664614614..217aa0d7ede17 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -285,8 +285,6 @@ def test_apply(ordered): result = grouped.apply(lambda x: np.mean(x)) tm.assert_frame_equal(result, expected) - # we coerce back to ints - expected = expected.astype("int") result = grouped.mean() tm.assert_frame_equal(result, expected) @@ -371,7 +369,7 @@ def test_observed(observed, using_array_manager): result = groups_double_key.agg("mean") expected = DataFrame( { - "val": [10, 30, 20, 40], + "val": [10.0, 30.0, 20.0, 40.0], "cat": Categorical( ["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True ), @@ -418,7 +416,9 @@ def test_observed_codes_remap(observed): groups_double_key = df.groupby([values, "C2"], observed=observed) idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], names=["cat", "C2"]) - expected = DataFrame({"C1": [3, 3, 4, 5], "C3": [10, 100, 200, 34]}, index=idx) + expected = DataFrame( + {"C1": [3.0, 3.0, 4.0, 5.0], "C3": [10.0, 100.0, 200.0, 34.0]}, index=idx + ) if not observed: expected = cartesian_product_for_groupers( expected, [values.values, [1, 2, 3, 4]], ["cat", "C2"] @@ -1515,7 +1515,9 @@ def test_read_only_category_no_sort(): df = DataFrame( {"a": [1, 3, 5, 7], "b": Categorical([1, 1, 2, 2], categories=Index(cats))} ) - expected = DataFrame(data={"a": [2, 6]}, index=CategoricalIndex([1, 2], name="b")) + expected = DataFrame( + data={"a": [2.0, 6.0]}, index=CategoricalIndex([1, 2], name="b") + ) result = df.groupby("b", sort=False).mean() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d256b19dbb148..afa28866f0833 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1257,7 +1257,7 @@ def test_groupby_keys_same_size_as_index(): ) df = DataFrame([["A", 10], ["B", 15]], columns=["metric", "values"], index=index) result = df.groupby([Grouper(level=0, freq=freq), "metric"]).mean() - expected = df.set_index([df.index, "metric"]) + expected = df.set_index([df.index, "metric"]).astype(float) tm.assert_frame_equal(result, expected) @@ -1350,7 +1350,7 @@ def test_groupby_2d_malformed(): d["ones"] = [1, 1] d["label"] = ["l1", "l2"] tmp = d.groupby(["group"]).mean() - res_values = np.array([[0, 1], [0, 1]], dtype=np.int64) + res_values = np.array([[0.0, 1.0], [0.0, 1.0]]) tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"])) tm.assert_numpy_array_equal(tmp.values, res_values) @@ -2114,7 +2114,7 @@ def test_groupby_crash_on_nunique(axis): def test_groupby_list_level(): # GH 9790 - expected = DataFrame(np.arange(0, 9).reshape(3, 3)) + expected = DataFrame(np.arange(0, 9).reshape(3, 3), dtype=float) result = expected.groupby(level=[0]).mean() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index bbf78a9013731..4c482bafa6c9c 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -274,7 +274,7 @@ def test_to_csv_date_format(self): df_sec["B"] = 0 df_sec["C"] = 1 - expected_rows = ["A,B,C", "2013-01-01,0,1"] + expected_rows = ["A,B,C", "2013-01-01,0,1.0"] expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"]) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 1c7aa5c444da9..296182a6bdbea 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1064,7 +1064,7 @@ def test_nanosecond_resample_error(): result = r.agg("mean") exp_indx = date_range(start=pd.to_datetime(exp_start), periods=10, freq="100n") - exp = Series(range(len(exp_indx)), index=exp_indx) + exp = Series(range(len(exp_indx)), index=exp_indx, dtype=float) tm.assert_series_equal(result, exp) @@ -1636,15 +1636,15 @@ def test_resample_with_nat(): index_1s = DatetimeIndex( ["1970-01-01 00:00:00", "1970-01-01 00:00:01", "1970-01-01 00:00:02"] ) - frame_1s = DataFrame([3, 7, 11], index=index_1s) + frame_1s = DataFrame([3.0, 7.0, 11.0], index=index_1s) tm.assert_frame_equal(frame.resample("1s").mean(), frame_1s) index_2s = DatetimeIndex(["1970-01-01 00:00:00", "1970-01-01 00:00:02"]) - frame_2s = DataFrame([5, 11], index=index_2s) + frame_2s = DataFrame([5.0, 11.0], index=index_2s) tm.assert_frame_equal(frame.resample("2s").mean(), frame_2s) index_3s = DatetimeIndex(["1970-01-01 00:00:00"]) - frame_3s = DataFrame([7], index=index_3s) + frame_3s = DataFrame([7.0], index=index_3s) tm.assert_frame_equal(frame.resample("3s").mean(), frame_3s) tm.assert_frame_equal(frame.resample("60s").mean(), frame_3s) @@ -1687,7 +1687,7 @@ def f(data, add_arg): # Testing dataframe df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10)) - result = df.groupby("A").resample("D").agg(f, multiplier) + result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) expected = df.groupby("A").resample("D").mean().multiply(multiplier) # TODO: GH 41137 expected = expected.astype("float64") diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index e2b13f6a00677..a6491952375a4 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -269,7 +269,7 @@ def test_with_local_timezone_pytz(self): # Index is moved back a day with the timezone conversion from UTC to # Pacific expected_index = period_range(start=start, end=end, freq="D") - offsets.Day() - expected = Series(1, index=expected_index) + expected = Series(1.0, index=expected_index) tm.assert_series_equal(result, expected) def test_resample_with_pytz(self): @@ -279,7 +279,7 @@ def test_resample_with_pytz(self): ) result = s.resample("D").mean() expected = Series( - 2, + 2.0, index=pd.DatetimeIndex( ["2017-01-01", "2017-01-02"], tz="US/Eastern", freq="D" ), @@ -312,7 +312,7 @@ def test_with_local_timezone_dateutil(self): expected_index = ( period_range(start=start, end=end, freq="D", name="idx") - offsets.Day() ) - expected = Series(1, index=expected_index) + expected = Series(1.0, index=expected_index) tm.assert_series_equal(result, expected) def test_resample_nonexistent_time_bin_edge(self): @@ -777,8 +777,8 @@ def test_upsampling_ohlc(self, freq, period_mult, kind): "freq, expected_values", [ ("1s", [3, np.NaN, 7, 11]), - ("2s", [3, int((7 + 11) / 2)]), - ("3s", [int((3 + 7) / 2), 11]), + ("2s", [3, (7 + 11) / 2]), + ("3s", [(3 + 7) / 2, 11]), ], ) def test_resample_with_nat(self, periods, values, freq, expected_values): @@ -798,7 +798,7 @@ def test_resample_with_only_nat(self): pi = PeriodIndex([pd.NaT] * 3, freq="S") frame = DataFrame([2, 3, 5], index=pi, columns=["a"]) expected_index = PeriodIndex(data=[], freq=pi.freq) - expected = DataFrame(index=expected_index, columns=["a"], dtype="int64") + expected = DataFrame(index=expected_index, columns=["a"], dtype="float64") result = frame.resample("1s").mean() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index e127f69b12674..d55dbfca9ebdf 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -77,7 +77,7 @@ def test_resample_timedelta_idempotency(): index = timedelta_range("0", periods=9, freq="10L") series = Series(range(9), index=index) result = series.resample("10L").mean() - expected = series + expected = series.astype(float) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 2276281e3ecf8..63e277c1580af 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -240,13 +240,13 @@ def test_pivot_with_non_observable_dropna(self, dropna): categories=["low", "high"], ordered=True, ), - "B": range(5), + "B": [0.0, 1.0, 2.0, 3.0, 4.0], } ) result = df.pivot_table(index="A", values="B", dropna=dropna) expected = DataFrame( - {"B": [2, 3]}, + {"B": [2.0, 3.0]}, index=Index( Categorical.from_codes( [0, 1], categories=["low", "high"], ordered=True @@ -279,6 +279,8 @@ def test_pivot_with_non_observable_dropna(self, dropna): name="A", ), ) + if not dropna: + expected["B"] = expected["B"].astype(float) tm.assert_frame_equal(result, expected) @@ -287,6 +289,8 @@ def test_pivot_with_interval_index(self, interval_values, dropna): df = DataFrame({"A": interval_values, "B": 1}) result = df.pivot_table(index="A", values="B", dropna=dropna) expected = DataFrame({"B": 1}, index=Index(interval_values.unique(), name="A")) + if not dropna: + expected = expected.astype(float) tm.assert_frame_equal(result, expected) def test_pivot_with_interval_index_margins(self): @@ -388,10 +392,7 @@ def test_pivot_preserve_dtypes(self, columns, values): ) result = dict(df_res.dtypes) - expected = { - col: np.dtype("O") if col[0].startswith("b") else np.dtype("float64") - for col in df_res - } + expected = {col: np.dtype("float64") for col in df_res} assert result == expected def test_pivot_no_values(self): @@ -1711,8 +1712,13 @@ def test_pivot_table_margins_name_with_aggfunc_list(self): expected = DataFrame(table.values, index=ix, columns=cols) tm.assert_frame_equal(table, expected) - @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)") - def test_categorical_margins(self, observed): + def test_categorical_margins(self, observed, request): + if observed: + request.node.add_marker( + pytest.mark.xfail( + reason="GH#17035 (np.mean of ints is casted back to ints)" + ) + ) # GH 10989 df = DataFrame( {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} @@ -1725,8 +1731,13 @@ def test_categorical_margins(self, observed): table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) tm.assert_frame_equal(table, expected) - @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)") - def test_categorical_margins_category(self, observed): + def test_categorical_margins_category(self, observed, request): + if observed: + request.node.add_marker( + pytest.mark.xfail( + reason="GH#17035 (np.mean of ints is casted back to ints)" + ) + ) df = DataFrame( {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} )