From b53ef231ea2a98b4c805ef7fda92b0524db80fb8 Mon Sep 17 00:00:00 2001 From: Junya Hayashi Date: Thu, 1 Jan 2015 00:58:05 +0900 Subject: [PATCH 1/2] BUG: Fix not to reindex on non-Categorical groups (GH9049, GH9344) --- doc/source/whatsnew/v0.16.0.txt | 1 + pandas/core/groupby.py | 16 +++++++++------- pandas/tests/test_groupby.py | 25 +++++++++++++++++++++++++ vb_suite/groupby.py | 12 ++++++++++++ 4 files changed, 47 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 0234a0dab8e28..3e9dcde6113b8 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -267,3 +267,4 @@ Bug Fixes - ``SparseSeries`` and ``SparsePanel`` now accept zero argument constructors (same as their non-sparse counterparts) (:issue:`9272`). - Bug in ``read_csv`` with buffer overflows with certain malformed input files (:issue:`9205`) +- Bug in groupby MultiIndex with missing pair (:issue:`9049`, :issue:`9344`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 0a12484f9ab3a..fcaa5ad26b15d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1862,7 +1862,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.grouper = grouper.values # pre-computed - self._was_factor = False + self._grouping_type = None self._should_compress = True # we have a single grouper which may be a myriad of things, some of which are @@ -1887,7 +1887,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, level_values = index.levels[level].take(inds) self.grouper = level_values.map(self.grouper) else: - self._was_factor = True + self._grouping_type = "level" # all levels may not be observed labels, uniques = algos.factorize(inds, sort=True) @@ -1915,7 +1915,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, elif isinstance(self.grouper, Categorical): factor = self.grouper - self._was_factor = True + self._grouping_type = "categorical" # Is there any way to avoid this? self.grouper = np.asarray(factor) @@ -1988,8 +1988,9 @@ def group_index(self): return self._group_index def _make_labels(self): - if self._was_factor: # pragma: no cover - raise Exception('Should not call this method grouping by level') + if self._grouping_type in ("level", "categorical"): # pragma: no cover + raise Exception( + 'Should not call this method grouping by level or categorical') else: labels, uniques = algos.factorize(self.grouper, sort=self.sort) uniques = Index(uniques, name=self.name) @@ -3238,10 +3239,11 @@ def _reindex_output(self, result): return result elif len(groupings) == 1: return result - elif not any([ping._was_factor for ping in groupings]): + elif not any([ping._grouping_type == "categorical" + for ping in groupings]): return result - levels_list = [ ping._group_index for ping in groupings ] + levels_list = [ ping.group_index for ping in groupings ] index = MultiIndex.from_product(levels_list, names=self.grouper.names) d = { self.obj._get_axis_name(self.axis) : index, 'copy' : False } return result.reindex(**d).sortlevel(axis=self.axis) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index d1ab33e607f4d..8803a96fdf976 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3484,6 +3484,31 @@ def test_groupby_categorical_unequal_len(self): # len(bins) != len(series) here self.assertRaises(ValueError,lambda : series.groupby(bins).mean()) + def test_groupby_multiindex_missing_pair(self): + # GH9049 + df = DataFrame({'group1': ['a','a','a','b'], + 'group2': ['c','c','d','c'], + 'value': [1,1,1,5]}) + df = df.set_index(['group1', 'group2']) + df_grouped = df.groupby(level=['group1','group2'], sort=True) + + res = df_grouped.agg('sum') + idx = MultiIndex.from_tuples([('a','c'), ('a','d'), ('b','c')], names=['group1', 'group2']) + exp = DataFrame([[2], [1], [5]], index=idx, columns=['value']) + + tm.assert_frame_equal(res, exp) + + def test_groupby_levels_and_columns(self): + # GH9344, GH9049 + idx_names = ['x', 'y'] + idx = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names) + df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx) + + by_levels = df.groupby(level=idx_names).mean() + by_columns = df.reset_index().groupby(idx_names).mean() + + tm.assert_frame_equal(by_levels, by_columns) + def test_gb_apply_list_of_unequal_len_arrays(self): # GH1738 diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py index eb690df4870e8..73f5f19d6a626 100644 --- a/vb_suite/groupby.py +++ b/vb_suite/groupby.py @@ -390,6 +390,18 @@ def f(g): groupby_sum_booleans = Benchmark("df.groupby('ii').sum()", setup) + +#---------------------------------------------------------------------- +# multi-indexed group sum #9049 + +setup = common_setup + """ +N = 50 +df = DataFrame({'A': range(N) * 2, 'B': range(N*2), 'C': 1}).set_index(["A", "B"]) +""" + +groupby_sum_multiindex = Benchmark("df.groupby(level=[0, 1]).sum()", setup) + + #---------------------------------------------------------------------- # Transform testing From 3ce07eab12f267990c7e361a9ab351715cb3e6b8 Mon Sep 17 00:00:00 2001 From: Junya Hayashi Date: Tue, 20 Jan 2015 03:08:01 +0900 Subject: [PATCH 2/2] ENH: Refactor groupby for Categorical grouper --- pandas/core/groupby.py | 35 +++++++++-------------------------- pandas/tests/test_groupby.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index fcaa5ad26b15d..29bdbe93866ed 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1862,7 +1862,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.grouper = grouper.values # pre-computed - self._grouping_type = None self._should_compress = True # we have a single grouper which may be a myriad of things, some of which are @@ -1887,8 +1886,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, level_values = index.levels[level].take(inds) self.grouper = level_values.map(self.grouper) else: - self._grouping_type = "level" - # all levels may not be observed labels, uniques = algos.factorize(inds, sort=True) @@ -1913,17 +1910,10 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # a passed Categorical elif isinstance(self.grouper, Categorical): - - factor = self.grouper - self._grouping_type = "categorical" - - # Is there any way to avoid this? - self.grouper = np.asarray(factor) - - self._labels = factor.codes - self._group_index = factor.categories + self._labels = self.grouper.codes + self._group_index = self.grouper.categories if self.name is None: - self.name = factor.name + self.name = self.grouper.name # a passed Grouper like elif isinstance(self.grouper, Grouper): @@ -1936,8 +1926,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.name = grouper.name # no level passed - if not isinstance(self.grouper, (Series, Index, np.ndarray)): - if getattr(self.grouper,'ndim', 1) != 1: + if not isinstance(self.grouper, (Series, Index, Categorical, np.ndarray)): + if getattr(self.grouper, 'ndim', 1) != 1: t = self.name or str(type(self.grouper)) raise ValueError("Grouper for '%s' not 1-dimensional" % t) self.grouper = self.index.map(self.grouper) @@ -1988,22 +1978,15 @@ def group_index(self): return self._group_index def _make_labels(self): - if self._grouping_type in ("level", "categorical"): # pragma: no cover - raise Exception( - 'Should not call this method grouping by level or categorical') - else: + if self._labels is None or self._group_index is None: labels, uniques = algos.factorize(self.grouper, sort=self.sort) uniques = Index(uniques, name=self.name) self._labels = labels self._group_index = uniques - _groups = None - - @property + @cache_readonly def groups(self): - if self._groups is None: - self._groups = self.index.groupby(self.grouper) - return self._groups + return self.index.groupby(self.grouper) def _get_grouper(obj, key=None, axis=0, level=None, sort=True): """ @@ -3239,7 +3222,7 @@ def _reindex_output(self, result): return result elif len(groupings) == 1: return result - elif not any([ping._grouping_type == "categorical" + elif not any([isinstance(ping.grouper, Categorical) for ping in groupings]): return result diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 8803a96fdf976..1d309e2a6389f 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3297,6 +3297,34 @@ def test_groupby_categorical(self): expected.index.names = ['myfactor', None] assert_frame_equal(desc_result, expected) + def test_groupby_datetime_categorical(self): + # GH9049: ensure backward compatibility + levels = pd.date_range('2014-01-01', periods=4) + codes = np.random.randint(0, 4, size=100) + + cats = Categorical.from_codes(codes, levels, name='myfactor') + + data = DataFrame(np.random.randn(100, 4)) + + result = data.groupby(cats).mean() + + expected = data.groupby(np.asarray(cats)).mean() + expected = expected.reindex(levels) + expected.index.name = 'myfactor' + + assert_frame_equal(result, expected) + self.assertEqual(result.index.name, cats.name) + + grouped = data.groupby(cats) + desc_result = grouped.describe() + + idx = cats.codes.argsort() + ord_labels = np.asarray(cats).take(idx) + ord_data = data.take(idx) + expected = ord_data.groupby(ord_labels, sort=False).describe() + expected.index.names = ['myfactor', None] + assert_frame_equal(desc_result, expected) + def test_groupby_groups_datetimeindex(self): # #1430 from pandas.tseries.api import DatetimeIndex