From 9c0d99bcf5e0ac7a0fbea513c2127f62cda691af Mon Sep 17 00:00:00 2001 From: OXPHOS Date: Mon, 14 Mar 2016 03:08:13 -0400 Subject: [PATCH 1/6] Modified _generate_marginal_results To fix bug #12577: Crosstab margins ignoring dropna --- pandas/tools/pivot.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index d699639c6c796..47ee9bc521f92 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -267,7 +267,7 @@ def _all_key(key): return (key, margins_name) + ('',) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows + values].groupby(rows).agg(aggfunc) + margin = data[data[cols].notnull().any(axis=1)][rows + values].groupby(rows).agg(aggfunc) cat_axis = 1 for key, piece in table.groupby(level=0, axis=cat_axis): @@ -304,7 +304,7 @@ def _all_key(key): margin_keys = table.columns if len(cols) > 0: - row_margin = data[cols + values].groupby(cols).agg(aggfunc) + row_margin = data[data[rows].notnull().any(axis=1)][cols + values].groupby(cols).agg(aggfunc) row_margin = row_margin.stack() # slight hack @@ -462,4 +462,4 @@ def _get_names(arrs, names, prefix='row'): if not isinstance(names, list): names = list(names) - return names + return names \ No newline at end of file From 7c2894936fd01a593b9925731bfb3d6a64384f2a Mon Sep 17 00:00:00 2001 From: Pan Deng Date: Mon, 14 Mar 2016 14:23:14 -0400 Subject: [PATCH 2/6] change in pivot_table-if margin if dropna, pass truncated data to _add_margin --- pandas/tools/pivot.py | 18 ++++++++++++------ pandas/tools/tests/test_pivot.py | 23 +++++++++++++++++++++++ 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 47ee9bc521f92..320d43b158a88 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -149,9 +149,15 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', table = table.fillna(value=fill_value, downcast='infer') if margins: - table = 
_add_margins(table, data, values, rows=index, - cols=columns, aggfunc=aggfunc, - margins_name=margins_name) + if dropna: + data_dropna = data[data.notnull().all(axis = 1)] + table = _add_margins(table, data_dropna, values, rows=index, + cols=columns, aggfunc=aggfunc, + margins_name=margins_name) + else: + table = _add_margins(table, data, values, rows=index, + cols=columns, aggfunc=aggfunc, + margins_name=margins_name) # discard the top level if values_passed and not values_multi and not table.empty: @@ -267,7 +273,7 @@ def _all_key(key): return (key, margins_name) + ('',) * (len(cols) - 1) if len(rows) > 0: - margin = data[data[cols].notnull().any(axis=1)][rows + values].groupby(rows).agg(aggfunc) + margin = data[rows + values].groupby(rows).agg(aggfunc) cat_axis = 1 for key, piece in table.groupby(level=0, axis=cat_axis): @@ -304,7 +310,7 @@ def _all_key(key): margin_keys = table.columns if len(cols) > 0: - row_margin = data[data[rows].notnull().any(axis=1)][cols + values].groupby(cols).agg(aggfunc) + row_margin = data[cols + values].groupby(cols).agg(aggfunc) row_margin = row_margin.stack() # slight hack @@ -462,4 +468,4 @@ def _get_names(arrs, names, prefix='row'): if not isinstance(names, list): names = list(names) - return names \ No newline at end of file + return names diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 994269d36cd85..6b1a3759cb7bd 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -935,6 +935,29 @@ def test_crosstab_no_overlap(self): expected = pd.DataFrame() tm.assert_frame_equal(actual, expected) + + def test_margin_ignore_dropna_bug(self): + # Bug#12577 + df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan], 'b': [3, 3, 4, 4, 4, 4]}) + actual = pd.crosstab(df.a,df.b, margins=True) + expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]]) + expected.index = Index([1.0, 2.0, 'All'], name = 'a') + expected.columns = Index([3, 4, 'All'], name = 'b') + 
tm.assert_frame_equal(actual, expected) + + df = DataFrame({'a':[1, np.nan, np.nan, np.nan, 2, np.nan], 'b':[3, np.nan, 4, 4, 4, 4]}) + actual = pd.crosstab(df.a,df.b, margins=True) + expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) + expected.index = Index([1.0, 2.0, 'All'], name = 'a') + expected.columns = Index([3.0, 4.0, 'All'], name = 'b') + tm.assert_frame_equal(actual, expected) + + df = DataFrame({'a':[1, np.nan, np.nan, np.nan, np.nan, 2], 'b':[3, 3, 4, 4, 4, 4]}) + actual = pd.crosstab(df.a,df.b, margins=True) + expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) + expected.index = Index([1.0, 2.0, 'All'], name = 'a') + expected.columns = Index([3, 4, 'All'], name = 'b') + tm.assert_frame_equal(actual, expected) if __name__ == '__main__': import nose From 0039da5c2f844ac0f4ce05de466517b22ced6041 Mon Sep 17 00:00:00 2001 From: Pan Deng Date: Mon, 14 Mar 2016 17:16:39 -0400 Subject: [PATCH 3/6] Documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit simplified code in pivot_table() updated what’s new entry updated comments in test_pivot.py --- doc/source/whatsnew/v0.18.1.txt | 1 + pandas/tools/pivot.py | 12 ++++-------- pandas/tools/tests/test_pivot.py | 14 ++++++++++---- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index dbe446f0a7b4f..05c6d27f9d094 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -89,3 +89,4 @@ Bug Fixes ~~~~~~~~~ - Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`) +- Bug in ``pivot_table`` when ``margins=True`` and ``dropna=True`` where nulls still contributed to margin count (:issue:`12577`) diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 320d43b158a88..578dd0ad0501a 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -150,14 +150,10
@@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', if margins: if dropna: - data_dropna = data[data.notnull().all(axis = 1)] - table = _add_margins(table, data_dropna, values, rows=index, - cols=columns, aggfunc=aggfunc, - margins_name=margins_name) - else: - table = _add_margins(table, data, values, rows=index, - cols=columns, aggfunc=aggfunc, - margins_name=margins_name) + data = data[data.notnull().all(axis = 1)] + table = _add_margins(table, data, values, rows=index, + cols=columns, aggfunc=aggfunc, + margins_name=margins_name) # discard the top level if values_passed and not values_multi and not table.empty: diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 6b1a3759cb7bd..527296b6758ac 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -937,22 +937,28 @@ def test_crosstab_no_overlap(self): tm.assert_frame_equal(actual, expected) def test_margin_ignore_dropna_bug(self): - # Bug#12577 - df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan], 'b': [3, 3, 4, 4, 4, 4]}) + # GH 12577 + # pivot_table counts null into margin ('All') + # when margins=true and dropna=true + + df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan], + 'b': [3, 3, 4, 4, 4, 4]}) actual = pd.crosstab(df.a,df.b, margins=True) expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]]) expected.index = Index([1.0, 2.0, 'All'], name = 'a') expected.columns = Index([3, 4, 'All'], name = 'b') tm.assert_frame_equal(actual, expected) - df = DataFrame({'a':[1, np.nan, np.nan, np.nan, 2, np.nan], 'b':[3, np.nan, 4, 4, 4, 4]}) + df = DataFrame({'a':[1, np.nan, np.nan, np.nan, 2, np.nan], + 'b':[3, np.nan, 4, 4, 4, 4]}) actual = pd.crosstab(df.a,df.b, margins=True) expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) expected.index = Index([1.0, 2.0, 'All'], name = 'a') expected.columns = Index([3.0, 4.0, 'All'], name = 'b') tm.assert_frame_equal(actual, expected) - df = DataFrame({'a':[1, np.nan, np.nan, 
np.nan, np.nan, 2], 'b':[3, 3, 4, 4, 4, 4]}) + df = DataFrame({'a':[1, np.nan, np.nan, np.nan, np.nan, 2], + 'b':[3, 3, 4, 4, 4, 4]}) actual = pd.crosstab(df.a,df.b, margins=True) expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) expected.index = Index([1.0, 2.0, 'All'], name = 'a') From 76fbacf4a6818ad10341d7189243b7156c431ed3 Mon Sep 17 00:00:00 2001 From: OXPHOS Date: Mon, 14 Mar 2016 22:21:33 -0400 Subject: [PATCH 4/6] Fix stylistic errors --- pandas/tools/pivot.py | 2 +- pandas/tools/tests/test_pivot.py | 38 ++++++++++++++++---------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 578dd0ad0501a..06b31b5d5dc30 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -150,7 +150,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', if margins: if dropna: - data = data[data.notnull().all(axis = 1)] + data = data[data.notnull().all(axis=1)] table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc, margins_name=margins_name) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 527296b6758ac..ea07cee9f7dd4 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -935,34 +935,34 @@ def test_crosstab_no_overlap(self): expected = pd.DataFrame() tm.assert_frame_equal(actual, expected) - + def test_margin_ignore_dropna_bug(self): # GH 12577 - # pivot_table counts null into margin ('All') + # pivot_table counts null into margin ('All') # when margins=true and dropna=true - + df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan], - 'b': [3, 3, 4, 4, 4, 4]}) - actual = pd.crosstab(df.a,df.b, margins=True) + 'b': [3, 3, 4, 4, 4, 4]}) + actual = pd.crosstab(df.a, df.b, margins=True) expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]]) - expected.index = Index([1.0, 2.0, 'All'], name = 'a') - expected.columns = Index([3, 4, 'All'], name = 'b') + expected.index = 
Index([1.0, 2.0, 'All'], name='a') + expected.columns = Index([3, 4, 'All'], name='b') tm.assert_frame_equal(actual, expected) - - df = DataFrame({'a':[1, np.nan, np.nan, np.nan, 2, np.nan], - 'b':[3, np.nan, 4, 4, 4, 4]}) - actual = pd.crosstab(df.a,df.b, margins=True) + + df = DataFrame({'a': [1, np.nan, np.nan, np.nan, 2, np.nan], + 'b': [3, np.nan, 4, 4, 4, 4]}) + actual = pd.crosstab(df.a, df.b, margins=True) expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) - expected.index = Index([1.0, 2.0, 'All'], name = 'a') - expected.columns = Index([3.0, 4.0, 'All'], name = 'b') + expected.index = Index([1.0, 2.0, 'All'], name='a') + expected.columns = Index([3.0, 4.0, 'All'], name='b') tm.assert_frame_equal(actual, expected) - - df = DataFrame({'a':[1, np.nan, np.nan, np.nan, np.nan, 2], - 'b':[3, 3, 4, 4, 4, 4]}) - actual = pd.crosstab(df.a,df.b, margins=True) + + df = DataFrame({'a': [1, np.nan, np.nan, np.nan, np.nan, 2], + 'b': [3, 3, 4, 4, 4, 4]}) + actual = pd.crosstab(df.a, df.b, margins=True) expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) - expected.index = Index([1.0, 2.0, 'All'], name = 'a') - expected.columns = Index([3, 4, 'All'], name = 'b') + expected.index = Index([1.0, 2.0, 'All'], name='a') + expected.columns = Index([3, 4, 'All'], name='b') tm.assert_frame_equal(actual, expected) if __name__ == '__main__': From 869a2538f88116cec6869ff6454900e951f5542a Mon Sep 17 00:00:00 2001 From: OXPHOS Date: Wed, 16 Mar 2016 00:55:20 -0400 Subject: [PATCH 5/6] Fixed margin=true, dropna=false error --- pandas/tools/pivot.py | 5 +++-- pandas/tools/tests/test_pivot.py | 30 +++++++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 06b31b5d5dc30..55fc4a1fd1b4b 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -183,8 +183,9 @@ def _add_margins(table, data, values, rows, cols, aggfunc, # could be passed a Series object with no 'columns' if 
hasattr(table, 'columns'): for level in table.columns.names[1:]: - if margins_name in table.columns.get_level_values(level): - raise ValueError(exception_msg) + if level is not None: + if margins_name in table.columns.get_level_values(level): + raise ValueError(exception_msg) if len(rows) > 1: key = (margins_name,) + ('',) * (len(rows) - 1) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index ea07cee9f7dd4..44f1de4754e55 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -943,7 +943,7 @@ def test_margin_ignore_dropna_bug(self): df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan], 'b': [3, 3, 4, 4, 4, 4]}) - actual = pd.crosstab(df.a, df.b, margins=True) + actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]]) expected.index = Index([1.0, 2.0, 'All'], name='a') expected.columns = Index([3, 4, 'All'], name='b') @@ -951,7 +951,7 @@ def test_margin_ignore_dropna_bug(self): df = DataFrame({'a': [1, np.nan, np.nan, np.nan, 2, np.nan], 'b': [3, np.nan, 4, 4, 4, 4]}) - actual = pd.crosstab(df.a, df.b, margins=True) + actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) expected.index = Index([1.0, 2.0, 'All'], name='a') expected.columns = Index([3.0, 4.0, 'All'], name='b') @@ -959,12 +959,36 @@ def test_margin_ignore_dropna_bug(self): df = DataFrame({'a': [1, np.nan, np.nan, np.nan, np.nan, 2], 'b': [3, 3, 4, 4, 4, 4]}) - actual = pd.crosstab(df.a, df.b, margins=True) + actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) expected.index = Index([1.0, 2.0, 'All'], name='a') expected.columns = Index([3, 4, 'All'], name='b') tm.assert_frame_equal(actual, expected) + df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan], + 'b': [3, 3, 4, 4, 4, 4]}) + actual = pd.crosstab(df.a, df.b, margins=True, dropna=False) + expected = 
pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]]) + expected.index = Index([1.0, 2.0, 'All'], name='a') + expected.columns = Index([3, 4, 'All']) + tm.assert_frame_equal(actual, expected) + + df = DataFrame({'a': [1, np.nan, np.nan, np.nan, 2, np.nan], + 'b': [3, np.nan, 4, 4, 4, 4]}) + actual = pd.crosstab(df.a, df.b, margins=True, dropna=False) + expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]]) + expected.index = Index([1.0, 2.0, 'All'], name='a') + expected.columns = Index([3.0, 4.0, 'All']) + tm.assert_frame_equal(actual, expected) + + df = DataFrame({'a': [1, np.nan, np.nan, np.nan, np.nan, 2], + 'b': [3, 3, 4, 4, 4, 4]}) + actual = pd.crosstab(df.a, df.b, margins=True, dropna=False) + expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [2, 4, 6]]) + expected.index = Index([1.0, 2.0, 'All'], name='a') + expected.columns = Index([3, 4, 'All']) + tm.assert_frame_equal(actual, expected) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From d3ed27abfaa185f528e611dc5f6ccf18b937b49a Mon Sep 17 00:00:00 2001 From: OXPHOS Date: Wed, 16 Mar 2016 01:17:56 -0400 Subject: [PATCH 6/6] Update what's new --- doc/source/whatsnew/v0.18.1.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 05c6d27f9d094..00d8d5c145915 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -90,3 +90,4 @@ Bug Fixes - Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`) - Bug in ``pivot_table`` when ``margins=True`` and ``dropna=True`` where nulls still contributed to margin count (:issue:`12577`) +- Bug in ``pivot_table`` when ``margins=True`` and ``dropna=False`` where a column level without a name raised a ``KeyError``