Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: GroupBy.apply() returns different results if a different GroupBy method is called first #35314

Merged
merged 42 commits into from
Aug 7, 2020
Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
e84a15d
GroupBy.apply() calls self._reset_group_selection at the start. Erran…
smithto1 Jul 14, 2020
2f764db
Merge remote-tracking branch 'upstream/master' into issue34656
smithto1 Jul 15, 2020
e122809
gb.apply() now resets group selection so it always returns grouping c…
smithto1 Jul 15, 2020
27b2694
test uses .drop() instead of selection
smithto1 Jul 15, 2020
0cca6df
wrote new tests
smithto1 Jul 15, 2020
2786eb5
rewrote test
smithto1 Jul 16, 2020
33cdf65
whatsnew
smithto1 Jul 16, 2020
6a65e2f
restore if-stat in test_transform
smithto1 Jul 16, 2020
9948d2f
amended test
smithto1 Jul 16, 2020
e4a132e
restored test
smithto1 Jul 16, 2020
4170ca6
restored test
smithto1 Jul 16, 2020
0ab1c8a
amended test
smithto1 Jul 16, 2020
f2a32f4
cleanup
smithto1 Jul 16, 2020
f5b674b
trailing comma
smithto1 Jul 16, 2020
7028756
fixed test_to_latex
smithto1 Jul 17, 2020
45abe63
added test to ensure .describe() keeps non-nuisance groupin columns
smithto1 Jul 17, 2020
063f0ea
minimized changes to exsiting tests
smithto1 Jul 17, 2020
7f0d192
add .describe() to whatsnew
smithto1 Jul 17, 2020
974da63
parametrize test over as_index=T/F
smithto1 Jul 17, 2020
b395e39
restored .describe to old behaviour
smithto1 Jul 18, 2020
2405504
Merge remote-tracking branch 'upstream/master' into issue34656
smithto1 Jul 18, 2020
682c93a
Merge remote-tracking branch 'upstream/master' into issue34656-temp
smithto1 Jul 18, 2020
67e9744
restoring test_function.py to master
smithto1 Jul 18, 2020
8f1b9c9
added comment
smithto1 Jul 18, 2020
804b0e0
Merge branch 'issue34656-temp' into issue34656
smithto1 Jul 18, 2020
f422b7d
fixed describe to work with duplicate cols
smithto1 Jul 26, 2020
b07a290
merge fixed
smithto1 Jul 26, 2020
6bec040
update comment
smithto1 Jul 26, 2020
8cdd4cd
context manager in agg_general
smithto1 Jul 27, 2020
abe8be3
remove hashed out line
smithto1 Jul 27, 2020
7112cf8
limited context manager in _make_wrapper
smithto1 Jul 27, 2020
0c8b144
removed unrelated test
smithto1 Jul 27, 2020
755c8f0
update comment
smithto1 Jul 27, 2020
8951a73
Merge remote-tracking branch 'upstream/master' into issue34656
smithto1 Aug 3, 2020
b61695b
whatsnew on v1.1.1
smithto1 Aug 3, 2020
673a35b
comment typo
smithto1 Aug 3, 2020
42f53dd
amend comment to restart tests
smithto1 Aug 3, 2020
18634a6
whatsnew to 1.2.0
smithto1 Aug 5, 2020
95553a1
resolve merge
smithto1 Aug 5, 2020
1a0aa44
remove line that can't be tiggered by test
smithto1 Aug 5, 2020
b09e41e
restart tests
smithto1 Aug 5, 2020
a7e264f
Merge remote-tracking branch 'upstream/master' into issue34656
smithto1 Aug 7, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1132,6 +1132,7 @@ Groupby/resample/rolling
- Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`)
- Bug in :meth:`DataFrame.groupby` raising an ``AttributeError`` when selecting a column and aggregating with ``as_index=False`` (:issue:`35246`).
- Bug in :meth:`DataFrameGroupBy.first` and :meth:`DataFrameGroupBy.last` that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`)
- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`)
smithto1 marked this conversation as resolved.
Show resolved Hide resolved

Reshaping
^^^^^^^^^
Expand Down
89 changes: 46 additions & 43 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -734,13 +734,12 @@ def pipe(self, func, *args, **kwargs):
def _make_wrapper(self, name):
assert name in self._apply_allowlist

self._set_group_selection()

# need to setup the selection
# as are not passed directly but in the grouper
f = getattr(self._obj_with_exclusions, name)
if not isinstance(f, types.MethodType):
return self.apply(lambda self: getattr(self, name))
with _group_selection_context(self):
# need to setup the selection
# as are not passed directly but in the grouper
f = getattr(self._obj_with_exclusions, name)
if not isinstance(f, types.MethodType):
return self.apply(lambda self: getattr(self, name))

f = getattr(type(self._obj_with_exclusions), name)
sig = inspect.signature(f)
Expand Down Expand Up @@ -990,28 +989,30 @@ def _agg_general(
alias: str,
npfunc: Callable,
):
self._set_group_selection()

# try a cython aggregation if we can
try:
return self._cython_agg_general(
how=alias, alt=npfunc, numeric_only=numeric_only, min_count=min_count,
)
except DataError:
pass
except NotImplementedError as err:
if "function is not implemented for this dtype" in str(
err
) or "category dtype not supported" in str(err):
# raised in _get_cython_function, in some cases can
# be trimmed by implementing cython funcs for more dtypes
with _group_selection_context(self):
# try a cython aggregation if we can
try:
return self._cython_agg_general(
how=alias,
alt=npfunc,
numeric_only=numeric_only,
min_count=min_count,
)
except DataError:
pass
else:
raise
except NotImplementedError as err:
if "function is not implemented for this dtype" in str(
err
) or "category dtype not supported" in str(err):
# raised in _get_cython_function, in some cases can
# be trimmed by implementing cython funcs for more dtypes
pass
else:
raise
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wow, codecov pointed that this is not covered, can you see if you can get a test to lands here (otherwise remove it)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could not find a way to reach this line from any public function call. AFAICS anything that would raise a NotImplementedError that comes under the else:, also trips a DataError which catches it before this line.

So I've gone with removing it.


# apply a non-cython aggregation
result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
return result
# apply a non-cython aggregation
result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
return result

def _cython_agg_general(
self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
Expand Down Expand Up @@ -1928,29 +1929,31 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra
nth_values = list(set(n))

nth_array = np.array(nth_values, dtype=np.intp)
self._set_group_selection()
with _group_selection_context(self):

mask_left = np.in1d(self._cumcount_array(), nth_array)
mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, -nth_array)
mask = mask_left | mask_right
mask_left = np.in1d(self._cumcount_array(), nth_array)
mask_right = np.in1d(
self._cumcount_array(ascending=False) + 1, -nth_array
)
mask = mask_left | mask_right

ids, _, _ = self.grouper.group_info
ids, _, _ = self.grouper.group_info

# Drop NA values in grouping
mask = mask & (ids != -1)
# Drop NA values in grouping
mask = mask & (ids != -1)

out = self._selected_obj[mask]
if not self.as_index:
return out
out = self._selected_obj[mask]
if not self.as_index:
return out

result_index = self.grouper.result_index
out.index = result_index[ids[mask]]
result_index = self.grouper.result_index
out.index = result_index[ids[mask]]

if not self.observed and isinstance(result_index, CategoricalIndex):
out = out.reindex(result_index)
if not self.observed and isinstance(result_index, CategoricalIndex):
out = out.reindex(result_index)

out = self._reindex_output(out)
return out.sort_index() if self.sort else out
out = self._reindex_output(out)
return out.sort_index() if self.sort else out

# dropna is truthy
if isinstance(n, valid_containers):
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/groupby/aggregate/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,13 +486,13 @@ def test_agg_timezone_round_trip():
assert ts == grouped.first()["B"].iloc[0]

# GH#27110 applying iloc should return a DataFrame
assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0]
assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]

ts = df["B"].iloc[2]
assert ts == grouped.last()["B"].iloc[0]

# GH#27110 applying iloc should return a DataFrame
assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0]
assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]


def test_sum_uint64_overflow():
Expand Down
29 changes: 29 additions & 0 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1014,3 +1014,32 @@ def test_apply_with_timezones_aware():
result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy())

tm.assert_frame_equal(result1, result2)


def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func):
# GH #34656
# GH #34271
df = DataFrame(
{
"a": [99, 99, 99, 88, 88, 88],
"b": [1, 2, 3, 4, 5, 6],
"c": [10, 20, 30, 40, 50, 60],
}
)

expected = pd.DataFrame(
{"a": [264, 297], "b": [15, 6], "c": [150, 60]},
index=pd.Index([88, 99], name="a"),
)

# Check output wehn no other methods are called before .apply()
smithto1 marked this conversation as resolved.
Show resolved Hide resolved
grp = df.groupby(by="a")
result = grp.apply(sum)
tm.assert_frame_equal(result, expected)

# Check output when another method is called before .apply()
grp = df.groupby(by="a")
args = {"nth": [0], "corrwith": [df]}.get(reduction_func, [])
_ = getattr(grp, reduction_func)(*args)
result = grp.apply(sum)
tm.assert_frame_equal(result, expected)
8 changes: 5 additions & 3 deletions pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,13 +191,15 @@ def test_grouper_creation_bug(self):
result = g.sum()
tm.assert_frame_equal(result, expected)

result = g.apply(lambda x: x.sum())
tm.assert_frame_equal(result, expected)

g = df.groupby(pd.Grouper(key="A", axis=0))
result = g.sum()
tm.assert_frame_equal(result, expected)

result = g.apply(lambda x: x.sum())
expected["A"] = [0, 2, 4]
expected = expected.loc[:, ["A", "B"]]
tm.assert_frame_equal(result, expected)

# GH14334
# pd.Grouper(key=...) may be passed in a list
df = DataFrame(
Expand Down