Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: groupby sum, mean, var should always be floats #41139

Merged
merged 13 commits into from
May 21, 2021
Merged
25 changes: 25 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,31 @@ values as measured by ``np.allclose``. Now no such casting occurs.

df.groupby('key').agg(lambda x: x.sum())

``float`` result for :meth:`.GroupBy.mean`, :meth:`.GroupBy.median`, and :meth:`.GroupBy.var`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, these methods could return different dtypes depending on the input values.
Now, these methods always return a float dtype. (:issue:`41137`)

.. ipython:: python

df = pd.DataFrame({'a': [True], 'b': [1], 'c': [1.0]})

*pandas 1.2.x*

.. code-block:: ipython

In [5]: df.groupby(df.index).mean()
Out[5]:
a b c
0 True 1 1.0

*pandas 1.3.0*

.. ipython:: python

df.groupby(df.index).mean()

Try operating inplace when setting values with ``loc`` and ``iloc``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
10 changes: 5 additions & 5 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1600,12 +1600,12 @@ def mean(self, numeric_only: bool = True):
Groupby two columns and return the mean of the remaining column.

>>> df.groupby(['A', 'B']).mean()
C
C
A B
1 2.0 2
4.0 1
2 3.0 1
5.0 2
1 2.0 2.0
4.0 1.0
2 3.0 1.0
5.0 2.0

Groupby one column and return the mean of only particular column in
the group.
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,8 @@ class Grouper:
>>> df.groupby(pd.Grouper(key="Animal")).mean()
Speed
Animal
Falcon 200
Parrot 10
Falcon 200.0
Parrot 10.0

Specify a resample operation on the column 'Publish date'

Expand Down
12 changes: 8 additions & 4 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
is_categorical_dtype,
is_complex_dtype,
is_datetime64_any_dtype,
is_float_dtype,
is_integer_dtype,
is_numeric_dtype,
is_sparse,
Expand Down Expand Up @@ -304,10 +305,13 @@ def _get_result_dtype(self, dtype: DtypeObj) -> DtypeObj:
return np.dtype(np.int64)
elif isinstance(dtype, (BooleanDtype, _IntegerDtype)):
return Int64Dtype()
elif how in ["mean", "median", "var"] and isinstance(
dtype, (BooleanDtype, _IntegerDtype)
):
return Float64Dtype()
elif how in ["mean", "median", "var"]:
jreback marked this conversation as resolved.
Show resolved Hide resolved
if isinstance(dtype, (BooleanDtype, _IntegerDtype)):
return Float64Dtype()
elif is_float_dtype(dtype):
return dtype
elif is_numeric_dtype(dtype):
return np.dtype(np.float64)
return dtype

def uses_mask(self) -> bool:
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/extension/base/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping):
_, index = pd.factorize(data_for_grouping, sort=True)

index = pd.Index(index, name="B")
expected = pd.Series([3, 1, 4], index=index, name="A")
expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A")
if as_index:
self.assert_series_equal(result, expected)
else:
Expand Down Expand Up @@ -54,7 +54,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping):
_, index = pd.factorize(data_for_grouping, sort=False)

index = pd.Index(index, name="B")
expected = pd.Series([1, 3, 4], index=index, name="A")
expected = pd.Series([1.0, 3.0, 4.0], index=index, name="A")
self.assert_series_equal(result, expected)

def test_groupby_extension_transform(self, data_for_grouping):
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/extension/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping):
_, index = pd.factorize(data_for_grouping, sort=True)

index = pd.Index(index, name="B")
expected = pd.Series([3, 1], index=index, name="A")
expected = pd.Series([3.0, 1.0], index=index, name="A")
if as_index:
self.assert_series_equal(result, expected)
else:
Expand Down Expand Up @@ -301,7 +301,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping):
_, index = pd.factorize(data_for_grouping, sort=False)

index = pd.Index(index, name="B")
expected = pd.Series([1, 3], index=index, name="A")
expected = pd.Series([1.0, 3.0], index=index, name="A")
self.assert_series_equal(result, expected)

def test_groupby_extension_transform(self, data_for_grouping):
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,9 @@ def test_uint64_type_handling(dtype, how):
expected = df.groupby("y").agg({"x": how})
df.x = df.x.astype(dtype)
result = df.groupby("y").agg({"x": how})
result.x = result.x.astype(np.int64)
if how not in ("mean", "median"):
# mean and median always result in floats
result.x = result.x.astype(np.int64)
tm.assert_frame_equal(result, expected, check_exact=True)


Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/groupby/aggregate/test_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,6 @@ def test_cython_agg_empty_buckets(op, targop, observed):

g = df.groupby(pd.cut(df[0], grps), observed=observed)
expected = g.agg(lambda x: targop(x))
if observed and op not in ("min", "max"):
# TODO: GH 41137
expected = expected.astype("int64")
tm.assert_frame_equal(result, expected)


Expand Down
12 changes: 7 additions & 5 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,8 +285,6 @@ def test_apply(ordered):
result = grouped.apply(lambda x: np.mean(x))
tm.assert_frame_equal(result, expected)

# we coerce back to ints
expected = expected.astype("int")
result = grouped.mean()
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -371,7 +369,7 @@ def test_observed(observed, using_array_manager):
result = groups_double_key.agg("mean")
expected = DataFrame(
{
"val": [10, 30, 20, 40],
"val": [10.0, 30.0, 20.0, 40.0],
"cat": Categorical(
["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True
),
Expand Down Expand Up @@ -418,7 +416,9 @@ def test_observed_codes_remap(observed):
groups_double_key = df.groupby([values, "C2"], observed=observed)

idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], names=["cat", "C2"])
expected = DataFrame({"C1": [3, 3, 4, 5], "C3": [10, 100, 200, 34]}, index=idx)
expected = DataFrame(
{"C1": [3.0, 3.0, 4.0, 5.0], "C3": [10.0, 100.0, 200.0, 34.0]}, index=idx
)
if not observed:
expected = cartesian_product_for_groupers(
expected, [values.values, [1, 2, 3, 4]], ["cat", "C2"]
Expand Down Expand Up @@ -1515,7 +1515,9 @@ def test_read_only_category_no_sort():
df = DataFrame(
{"a": [1, 3, 5, 7], "b": Categorical([1, 1, 2, 2], categories=Index(cats))}
)
expected = DataFrame(data={"a": [2, 6]}, index=CategoricalIndex([1, 2], name="b"))
expected = DataFrame(
data={"a": [2.0, 6.0]}, index=CategoricalIndex([1, 2], name="b")
)
result = df.groupby("b", sort=False).mean()
tm.assert_frame_equal(result, expected)

Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1257,7 +1257,7 @@ def test_groupby_keys_same_size_as_index():
)
df = DataFrame([["A", 10], ["B", 15]], columns=["metric", "values"], index=index)
result = df.groupby([Grouper(level=0, freq=freq), "metric"]).mean()
expected = df.set_index([df.index, "metric"])
expected = df.set_index([df.index, "metric"]).astype(float)

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -1350,7 +1350,7 @@ def test_groupby_2d_malformed():
d["ones"] = [1, 1]
d["label"] = ["l1", "l2"]
tmp = d.groupby(["group"]).mean()
res_values = np.array([[0, 1], [0, 1]], dtype=np.int64)
res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
tm.assert_numpy_array_equal(tmp.values, res_values)

Expand Down Expand Up @@ -2114,7 +2114,7 @@ def test_groupby_crash_on_nunique(axis):

def test_groupby_list_level():
# GH 9790
expected = DataFrame(np.arange(0, 9).reshape(3, 3))
expected = DataFrame(np.arange(0, 9).reshape(3, 3), dtype=float)
result = expected.groupby(level=[0]).mean()
tm.assert_frame_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/formats/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ def test_to_csv_date_format(self):
df_sec["B"] = 0
df_sec["C"] = 1

expected_rows = ["A,B,C", "2013-01-01,0,1"]
expected_rows = ["A,B,C", "2013-01-01,0,1.0"]
expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)

df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"])
Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/resample/test_datetime_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1064,7 +1064,7 @@ def test_nanosecond_resample_error():
result = r.agg("mean")

exp_indx = date_range(start=pd.to_datetime(exp_start), periods=10, freq="100n")
exp = Series(range(len(exp_indx)), index=exp_indx)
exp = Series(range(len(exp_indx)), index=exp_indx, dtype=float)

tm.assert_series_equal(result, exp)

Expand Down Expand Up @@ -1636,15 +1636,15 @@ def test_resample_with_nat():
index_1s = DatetimeIndex(
["1970-01-01 00:00:00", "1970-01-01 00:00:01", "1970-01-01 00:00:02"]
)
frame_1s = DataFrame([3, 7, 11], index=index_1s)
frame_1s = DataFrame([3.0, 7.0, 11.0], index=index_1s)
tm.assert_frame_equal(frame.resample("1s").mean(), frame_1s)

index_2s = DatetimeIndex(["1970-01-01 00:00:00", "1970-01-01 00:00:02"])
frame_2s = DataFrame([5, 11], index=index_2s)
frame_2s = DataFrame([5.0, 11.0], index=index_2s)
tm.assert_frame_equal(frame.resample("2s").mean(), frame_2s)

index_3s = DatetimeIndex(["1970-01-01 00:00:00"])
frame_3s = DataFrame([7], index=index_3s)
frame_3s = DataFrame([7.0], index=index_3s)
tm.assert_frame_equal(frame.resample("3s").mean(), frame_3s)

tm.assert_frame_equal(frame.resample("60s").mean(), frame_3s)
Expand Down Expand Up @@ -1687,7 +1687,7 @@ def f(data, add_arg):

# Testing dataframe
df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10))
result = df.groupby("A").resample("D").agg(f, multiplier)
result = df.groupby("A").resample("D").agg(f, multiplier).astype(float)
expected = df.groupby("A").resample("D").mean().multiply(multiplier)
# TODO: GH 41137
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rhshadrach does this PR take care of the TODO comment here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes - thanks, I've opened #44374

expected = expected.astype("float64")
Expand Down
12 changes: 6 additions & 6 deletions pandas/tests/resample/test_period_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ def test_with_local_timezone_pytz(self):
# Index is moved back a day with the timezone conversion from UTC to
# Pacific
expected_index = period_range(start=start, end=end, freq="D") - offsets.Day()
expected = Series(1, index=expected_index)
expected = Series(1.0, index=expected_index)
tm.assert_series_equal(result, expected)

def test_resample_with_pytz(self):
Expand All @@ -279,7 +279,7 @@ def test_resample_with_pytz(self):
)
result = s.resample("D").mean()
expected = Series(
2,
2.0,
index=pd.DatetimeIndex(
["2017-01-01", "2017-01-02"], tz="US/Eastern", freq="D"
),
Expand Down Expand Up @@ -312,7 +312,7 @@ def test_with_local_timezone_dateutil(self):
expected_index = (
period_range(start=start, end=end, freq="D", name="idx") - offsets.Day()
)
expected = Series(1, index=expected_index)
expected = Series(1.0, index=expected_index)
tm.assert_series_equal(result, expected)

def test_resample_nonexistent_time_bin_edge(self):
Expand Down Expand Up @@ -777,8 +777,8 @@ def test_upsampling_ohlc(self, freq, period_mult, kind):
"freq, expected_values",
[
("1s", [3, np.NaN, 7, 11]),
("2s", [3, int((7 + 11) / 2)]),
("3s", [int((3 + 7) / 2), 11]),
("2s", [3, (7 + 11) / 2]),
("3s", [(3 + 7) / 2, 11]),
],
)
def test_resample_with_nat(self, periods, values, freq, expected_values):
Expand All @@ -798,7 +798,7 @@ def test_resample_with_only_nat(self):
pi = PeriodIndex([pd.NaT] * 3, freq="S")
frame = DataFrame([2, 3, 5], index=pi, columns=["a"])
expected_index = PeriodIndex(data=[], freq=pi.freq)
expected = DataFrame(index=expected_index, columns=["a"], dtype="int64")
expected = DataFrame(index=expected_index, columns=["a"], dtype="float64")
result = frame.resample("1s").mean()
tm.assert_frame_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/resample/test_timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def test_resample_timedelta_idempotency():
index = timedelta_range("0", periods=9, freq="10L")
series = Series(range(9), index=index)
result = series.resample("10L").mean()
expected = series
expected = series.astype(float)
tm.assert_series_equal(result, expected)


Expand Down
31 changes: 21 additions & 10 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,13 +240,13 @@ def test_pivot_with_non_observable_dropna(self, dropna):
categories=["low", "high"],
ordered=True,
),
"B": range(5),
"B": [0.0, 1.0, 2.0, 3.0, 4.0],
}
)

result = df.pivot_table(index="A", values="B", dropna=dropna)
expected = DataFrame(
{"B": [2, 3]},
{"B": [2.0, 3.0]},
index=Index(
Categorical.from_codes(
[0, 1], categories=["low", "high"], ordered=True
Expand Down Expand Up @@ -279,6 +279,8 @@ def test_pivot_with_non_observable_dropna(self, dropna):
name="A",
),
)
if not dropna:
expected["B"] = expected["B"].astype(float)

tm.assert_frame_equal(result, expected)

Expand All @@ -287,6 +289,8 @@ def test_pivot_with_interval_index(self, interval_values, dropna):
df = DataFrame({"A": interval_values, "B": 1})
result = df.pivot_table(index="A", values="B", dropna=dropna)
expected = DataFrame({"B": 1}, index=Index(interval_values.unique(), name="A"))
if not dropna:
expected = expected.astype(float)
tm.assert_frame_equal(result, expected)

def test_pivot_with_interval_index_margins(self):
Expand Down Expand Up @@ -388,10 +392,7 @@ def test_pivot_preserve_dtypes(self, columns, values):
)

result = dict(df_res.dtypes)
expected = {
col: np.dtype("O") if col[0].startswith("b") else np.dtype("float64")
for col in df_res
}
expected = {col: np.dtype("float64") for col in df_res}
assert result == expected

def test_pivot_no_values(self):
Expand Down Expand Up @@ -1711,8 +1712,13 @@ def test_pivot_table_margins_name_with_aggfunc_list(self):
expected = DataFrame(table.values, index=ix, columns=cols)
tm.assert_frame_equal(table, expected)

@pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)")
def test_categorical_margins(self, observed):
def test_categorical_margins(self, observed, request):
if observed:
request.node.add_marker(
pytest.mark.xfail(
reason="GH#17035 (np.mean of ints is casted back to ints)"
)
)
# GH 10989
df = DataFrame(
{"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2}
Expand All @@ -1725,8 +1731,13 @@ def test_categorical_margins(self, observed):
table = df.pivot_table("x", "y", "z", dropna=observed, margins=True)
tm.assert_frame_equal(table, expected)

@pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)")
def test_categorical_margins_category(self, observed):
def test_categorical_margins_category(self, observed, request):
if observed:
request.node.add_marker(
pytest.mark.xfail(
reason="GH#17035 (np.mean of ints is casted back to ints)"
)
)
df = DataFrame(
{"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2}
)
Expand Down