From 65762428c92b944358bbe4ab21ea2c41500e2ff8 Mon Sep 17 00:00:00 2001 From: HH Date: Tue, 10 Sep 2019 14:09:49 +0900 Subject: [PATCH] ENH: Groupby on multiindex with missing data in group keys raises IndexError (#20519) * If all index values in some level are NA, fill with NaN --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/indexes/multi.py | 5 ++++- pandas/tests/groupby/test_grouping.py | 32 +++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 4decc99087a9e4..4a1e874f0c8d7d 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -174,7 +174,7 @@ Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - -- +- ``IndexError`` would not raise if all index values in some index level are missing data (:issue:`20519`) - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) Reshaping diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b614952ba1e043..528ad8e66421ca 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1279,7 +1279,10 @@ def _get_grouper_for_level(self, mapper, level): # Remove unobserved levels from level_index level_index = level_index.take(uniques) - grouper = level_index.take(codes) + if len(level_index): + grouper = level_index.take(codes) + else: + grouper = level_index.take(codes,fill_value=True) return grouper, codes, level_index diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 403f5f11ee7686..0fbd87d0f15309 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -628,6 +628,38 @@ def test_groupby_empty(self): # check name assert s.groupby(s).grouper.names == ["name"] + def test_groupby_level_index_value_all_na(self): + # issue 20519 + df = pd.DataFrame([["x", np.nan, 1]], columns=["A", "B", "C"]).set_index( + ["A", "B"] + ) 
+ result = df.groupby(level=["A", "B"]).sum() + expected = DataFrame( + data=[], + index=MultiIndex( + levels=[["x"],[]], + codes=[[],[]], + names=["A","B"] + ), + columns=["C"] + ) + tm.assert_frame_equal(result, expected, check_index_type=False, check_dtype=False) + + df = pd.DataFrame( + [[None, None, "x", 2], [np.nan, "y", np.nan, 4]], columns=["A", "B", "C", "D"] + ).set_index(["A", "B", "C"]) + result = df.groupby(level=["A", "B", "C"]).sum() + expected = DataFrame( + data=[], + index=MultiIndex( + levels=[[], ["y"], ["x"]], + codes=[[], [], []], + names=["A", "B", "C"] + ), + columns=["D"] + ) + tm.assert_frame_equal(result, expected, check_index_type=False, check_dtype=False) + # get_group # --------------------------------