From 3106e4929c033a7bc95acd404bf1bf6ab4f1ae86 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Tue, 12 Jan 2021 14:23:53 +0300 Subject: [PATCH] FIX-#2543: fixed handling 'as_index' at groupby dictionary renaming aggregation (#2592) Signed-off-by: Dmitry Chigarev --- modin/backends/pandas/query_compiler.py | 7 ++- modin/pandas/groupby.py | 14 ++++++ modin/pandas/test/test_groupby.py | 62 +++++++++++++++++++++---- 3 files changed, 74 insertions(+), 9 deletions(-) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index bce56da3901..66f3fc3cd1b 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -2483,7 +2483,12 @@ def _groupby_dict_reduce( raise TypeError map_fns.append((new_col_name, groupby_reduce_functions[func][0])) - reduce_dict[(col, new_col_name)] = groupby_reduce_functions[func][1] + reduced_col_name = ( + (*col, new_col_name) + if isinstance(col, tuple) + else (col, new_col_name) + ) + reduce_dict[reduced_col_name] = groupby_reduce_functions[func][1] map_dict[col] = map_fns return GroupbyReduceFunction.register(map_dict, reduce_dict)( query_compiler=self, diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 2abeaa375b8..fdb08222ec1 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -425,6 +425,20 @@ def try_get_str_func(fn): ) if relabeling_required: + if not self._as_index: + nby_cols = len(result.columns) - len(new_columns) + order = np.concatenate([np.arange(nby_cols), order + nby_cols]) + by_cols = result.columns[:nby_cols] + new_columns = pandas.Index(new_columns) + if by_cols.nlevels != new_columns.nlevels: + by_cols = by_cols.remove_unused_levels() + empty_levels = [ + i + for i, level in enumerate(by_cols.levels) + if len(level) == 1 and level[0] == "" + ] + by_cols = by_cols.droplevel(empty_levels) + new_columns = by_cols.append(new_columns) result = result.iloc[:, order] 
result.columns = new_columns return result diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index b116ffadf69..ab500c24a09 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -25,6 +25,7 @@ test_data, test_data_values, modin_df_almost_equals_pandas, + generate_multiindex, ) pd.DEFAULT_NPARTITIONS = 4 @@ -1247,18 +1248,25 @@ def test_shift_freq(groupby_axis, shift_axis): ], "agg_dict": { "max": (list(test_data["int_data"].keys())[1], max), - "min": (list(test_data["int_data"].keys())[-1], min), + "min": (list(test_data["int_data"].keys())[-2], min), }, }, + pytest.param( + { + "by": [ + list(test_data["int_data"].keys())[0], + list(test_data["int_data"].keys())[-1], + ], + "agg_dict": { + "max": (list(test_data["int_data"].keys())[1], max), + "min": (list(test_data["int_data"].keys())[-1], min), + }, + }, + marks=pytest.mark.skip("See Modin issue #2542"), + ), ], ) -@pytest.mark.parametrize( - "as_index", - [ - True, - pytest.param(False, marks=pytest.mark.xfail(reason="See modin issue #2543")), - ], -) +@pytest.mark.parametrize("as_index", [True, False]) def test_agg_func_None_rename(by_and_agg_dict, as_index): modin_df, pandas_df = create_test_dfs(test_data["int_data"]) @@ -1271,6 +1279,44 @@ def test_agg_func_None_rename(by_and_agg_dict, as_index): df_equals(modin_result, pandas_result) +@pytest.mark.parametrize( + "as_index", + [ + True, + pytest.param( + False, + marks=pytest.mark.xfail_backends( + ["BaseOnPython"], reason="See Pandas issue #39103" + ), + ), + ], +) +@pytest.mark.parametrize("by_length", [1, 3]) +@pytest.mark.parametrize( + "agg_fns", + [["sum", "min", "max"], ["mean", "quantile"]], + ids=["reduction", "aggregation"], +) +def test_dict_agg_rename_mi_columns(as_index, by_length, agg_fns): + md_df, pd_df = create_test_dfs(test_data["int_data"]) + mi_columns = generate_multiindex(len(md_df.columns), nlevels=4) + + md_df.columns, pd_df.columns = mi_columns, mi_columns + + by = 
list(md_df.columns[:by_length]) + agg_cols = list(md_df.columns[by_length : by_length + 3]) + + agg_dict = { + f"custom-{i}" + str(agg_fns[i % len(agg_fns)]): (col, agg_fns[i % len(agg_fns)]) + for i, col in enumerate(agg_cols) + } + + md_res = md_df.groupby(by, as_index=as_index).agg(**agg_dict) + pd_res = pd_df.groupby(by, as_index=as_index).agg(**agg_dict) + + df_equals(md_res, pd_res) + + @pytest.mark.parametrize( "operation", [