From 19f6041d97e86ebdd8ad5bfe3afd2761037b3e45 Mon Sep 17 00:00:00 2001 From: Eric Kisslinger Date: Mon, 27 Nov 2017 09:45:13 -0800 Subject: [PATCH] BUG: Fix groupby over a CategoricalIndex in axis=1 closes GH18432 Add multi-index columns test to test_groupby_categorical_columns_index() Add whatsnew for GH18432 bug fix Fix ValueError text for GH18432 bug fix Update whatsnew text Use kwargs instead of positional format params Move test_groupby_categorical_columns_index() to pandas/tests/groupby/test_grouping.py Directly construct expected dataframe in test_groupby_categorical_index_and_columns() --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/core/groupby.py | 8 +++++--- pandas/tests/groupby/test_grouping.py | 25 ++++++++++++++++++++++++- 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index e307e605687bf..bebfd0ab50e90 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -137,6 +137,7 @@ Categorical - Error messages in the testing module have been improved when items have different ``CategoricalDtype`` (:issue:`18069`) - ``CategoricalIndex`` can now correctly take a ``pd.api.types.CategoricalDtype`` as its dtype (:issue:`18116`) - Bug in ``Categorical.unique()`` returning read-only ``codes`` array when all categories were ``NaN`` (:issue:`18051`) +- Bug in ``DataFrame.groupby(axis=1)`` with a ``CategoricalIndex`` (:issue:`18432`) String ^^^^^^ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 0e8368e5a4533..662a863c72325 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2859,9 +2859,11 @@ def is_in_obj(gpr): else: in_axis, name = False, None - if is_categorical_dtype(gpr) and len(gpr) != len(obj): - raise ValueError("Categorical dtype grouper must " - "have len(grouper) == len(data)") + if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: + raise ValueError( + ("Length of grouper ({len_gpr}) and axis ({len_axis})" + " must be same length" + .format(len_gpr=len(gpr), len_axis=obj.shape[axis]))) # create the Grouping # allow us to passing the actual Grouping as the gpr diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index cc422f2d1cdeb..8702062e9cd0a 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -6,7 +6,7 @@ from warnings import catch_warnings from pandas import (date_range, Timestamp, - Index, MultiIndex, DataFrame, Series) + Index, MultiIndex, DataFrame, Series, CategoricalIndex) from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal) from pandas.compat import lrange, long @@ -251,6 +251,29 @@ def test_groupby_levels_and_columns(self): by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64) tm.assert_frame_equal(by_levels, by_columns) + def test_groupby_categorical_index_and_columns(self): + # GH18432 + columns = ['A', 'B', 'A', 'B'] + categories = ['B', 'A'] + data = np.ones((5, 4), int) + cat_columns = CategoricalIndex(columns, + categories=categories, + ordered=True) + df = DataFrame(data=data, columns=cat_columns) + result = df.groupby(axis=1, level=0).sum() + expected_data = 2 * np.ones((5, 2), int) + expected_columns = CategoricalIndex(categories, + categories=categories, + ordered=True) + expected = DataFrame(data=expected_data, columns=expected_columns) + assert_frame_equal(result, expected) + + # test transposed version + df = DataFrame(data.T, index=cat_columns) + result = df.groupby(axis=0, level=0).sum() + expected = DataFrame(data=expected_data.T, index=expected_columns) + assert_frame_equal(result, expected) + def test_grouper_getting_correct_binner(self): # GH 10063