Skip to content

Commit

Permalink
COMPAT: Emit warning when groupby by a tuple (pandas-dev#18731)
Browse files Browse the repository at this point in the history
* COMPAT: Emit warning when groupby by a tuple

Closes pandas-dev#18314

* DOC: avoid future warning

* Cleanup, test unhashable

* PEP8

* Correct KeyError

* update

* xfail

* remove old comments

* pep8

* Fixups
  • Loading branch information
TomAugspurger authored Dec 18, 2017
1 parent 7a0ee19 commit b6a7cc9
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 2 deletions.
2 changes: 1 addition & 1 deletion doc/source/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1091,7 +1091,7 @@ You can also select multiple rows from each group by specifying multiple nth val
business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B')
df = pd.DataFrame(1, index=business_dates, columns=['a', 'b'])
# get the first, 4th, and last date index for each month
df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])
df.groupby([df.index.year, df.index.month]).nth([0, 3, -1])
Enumerate group items
~~~~~~~~~~~~~~~~~~~~~
Expand Down
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,10 @@ Deprecations
- ``Series.from_array`` and ``SparseSeries.from_array`` are deprecated. Use the normal constructor ``Series(..)`` and ``SparseSeries(..)`` instead (:issue:`18213`).
- ``DataFrame.as_matrix`` is deprecated. Use ``DataFrame.values`` instead (:issue:`18458`).
- ``Series.asobject``, ``DatetimeIndex.asobject``, ``PeriodIndex.asobject`` and ``TimeDeltaIndex.asobject`` have been deprecated. Use ``.astype(object)`` instead (:issue:`18572`)
- Grouping by a tuple of keys is deprecated and now emits a ``FutureWarning``.
In the future, a tuple passed to ``by`` will always refer to a single key
that is the actual tuple, instead of treating the tuple as multiple keys. To
retain the previous behavior, use a list instead of a tuple (:issue:`18314`).
- ``Series.valid`` is deprecated. Use :meth:`Series.dropna` instead (:issue:`18800`).

.. _whatsnew_0220.prior_deprecations:
Expand Down
23 changes: 22 additions & 1 deletion pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
is_bool_dtype,
is_scalar,
is_list_like,
is_hashable,
needs_i8_conversion,
_ensure_float64,
_ensure_platform_int,
Expand Down Expand Up @@ -2850,7 +2851,27 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
elif isinstance(key, BaseGrouper):
return key, [], obj

# Everything which is not a list is a key (including tuples):
# In the future, a tuple key will always mean an actual key,
# not an iterable of keys. In the meantime, we attempt to provide
# a warning. We can assume that the user wanted a list of keys when
# the key is not in the index. We just have to be careful with
# unhashable elements of `key`. Any unhashable element implies that
# they wanted a list of keys.
# https://github.com/pandas-dev/pandas/issues/18314
is_tuple = isinstance(key, tuple)
all_hashable = is_tuple and is_hashable(key)

if is_tuple:
if ((all_hashable and key not in obj and set(key).issubset(obj))
or not all_hashable):
# column names ('a', 'b') -> ['a', 'b']
# arrays like (a, b) -> [a, b]
msg = ("Interpreting tuple 'by' as a list of keys, rather than "
"a single key. Use 'by=[...]' instead of 'by=(...)'. In "
"the future, a tuple will always mean a single key.")
warnings.warn(msg, FutureWarning, stacklevel=5)
key = list(key)

if not isinstance(key, list):
keys = [key]
match_axis_length = False
Expand Down
32 changes: 32 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2727,6 +2727,38 @@ def test_empty_dataframe_groupby(self):

assert_frame_equal(result, expected)

def test_tuple_warns(self):
    """Tuple ``by`` warns only when it is not itself a column label."""
    # https://github.com/pandas-dev/pandas/issues/18314
    data = {('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2],
            'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]}
    frame = pd.DataFrame(data)
    by = ('a', 'b')

    # Without the ('a', 'b') column the tuple can only mean two keys,
    # so it is interpreted as a list and a FutureWarning is raised.
    with tm.assert_produces_warning(FutureWarning) as caught:
        frame[['a', 'b', 'c']].groupby(by).c.mean()

    first_message = str(caught[0].message)
    assert "Interpreting tuple 'by' as a list" in first_message

    # With the ('a', 'b') column present the tuple is a real key,
    # so no warning is expected.
    with tm.assert_produces_warning(None):
        frame.groupby(by).c.mean()

def test_tuple_warns_unhashable(self):
    """A tuple of array-like (unhashable) keys emits the FutureWarning."""
    # https://github.com/pandas-dev/pandas/issues/18314
    bdays = date_range(start='4/1/2014', end='6/30/2014', freq='B')
    frame = DataFrame(1, index=bdays, columns=['a', 'b'])
    idx = frame.index

    with tm.assert_produces_warning(FutureWarning) as caught:
        # The tuple holds index-derived arrays, not column labels.
        frame.groupby((idx.year, idx.month)).nth([0, 3, -1])

    first_message = str(caught[0].message)
    assert "Interpreting tuple 'by' as a list" in first_message

@pytest.mark.xfail(reason="GH-18798")
def test_tuple_correct_keyerror(self):
    """Grouping by a missing tuple key should raise KeyError naming it.

    Marked xfail: the expected KeyError message is tracked in GH-18798.
    """
    # https://github.com/pandas-dev/pandas/issues/18798
    columns = pd.MultiIndex.from_product([[1, 2], [3, 4]])
    df = pd.DataFrame(1, index=range(3), columns=columns)

    # The expected message is matched as a regular expression, so the
    # parentheses must be escaped; unescaped, "(7, 8)" is a capture group
    # that matches the bare text "7, 8" and does not check for the tuple.
    with tm.assert_raises_regex(KeyError, r"\(7, 8\)"):
        df.groupby((7, 8)).mean()


def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
tups = lmap(tuple, df[keys].values)
Expand Down

0 comments on commit b6a7cc9

Please sign in to comment.