From 931fb46d8ac2a31ea27439179b5ded337e78b86e Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Mon, 18 Sep 2023 14:24:46 -0700
Subject: [PATCH 1/2] PERF-#6583: Remove redundant index reassignment in query()

Signed-off-by: mvashishtha
---
 modin/pandas/dataframe.py               | 35 ++++++++++++++++++++++++-
 modin/pandas/test/dataframe/test_udf.py | 35 +++++++++++++++++++++++++
 modin/pandas/test/utils.py              |  3 +++
 3 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index 16dabeb4b42..98ef7a46f91 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -1597,7 +1597,40 @@ def quantile(
     # methods and fields we need to use pandas.DataFrame.query
     _AXIS_ORDERS = ["index", "columns"]
     _get_index_resolvers = pandas.DataFrame._get_index_resolvers
-    _get_axis_resolvers = pandas.DataFrame._get_axis_resolvers
+
+    def _get_axis_resolvers(self, axis: str) -> dict:
+        # forked from pandas so that a level's series is only reindexed by the full
+        # axis when the axis has more than one level.
+        # `axis` is "index" or "columns".
+        axis_index = getattr(self, axis)
+        d = {}
+        prefix = axis[0]
+
+        for i, name in enumerate(axis_index.names):
+            if name is not None:
+                key = level = name
+            else:
+                # prefix with 'i' or 'c' depending on the input axis
+                # e.g., you must do ilevel_0 for the 0th level of an unnamed
+                # multiindex
+                key = f"{prefix}level_{i}"
+                level = i
+
+            level_values = axis_index.get_level_values(level)
+            s = level_values.to_series()
+            if axis_index.nlevels > 1:
+                s.index = axis_index
+            d[key] = s
+
+        # put the index/columns itself in the dict, keeping a MultiIndex as is
+        if isinstance(axis_index, pandas.MultiIndex):
+            dindex = axis_index
+        else:
+            dindex = axis_index.to_series()
+
+        d[axis] = dindex
+        return d
+
     _get_cleaned_column_resolvers = pandas.DataFrame._get_cleaned_column_resolvers
 
     def query(self, expr, inplace=False, **kwargs):  # noqa: PR01, RT01, D200
diff --git a/modin/pandas/test/dataframe/test_udf.py b/modin/pandas/test/dataframe/test_udf.py
index 8f34786f290..e5ffdd1dfe0 100644
--- a/modin/pandas/test/dataframe/test_udf.py
+++ b/modin/pandas/test/dataframe/test_udf.py
@@ -446,6 +446,41 @@ def test_query(data, funcs, engine):
     df_equals(modin_result.dtypes, pandas_result.dtypes)
 
 
+def test_query_named_index():
+    eval_general(
+        *(df.set_index("col1") for df in create_test_dfs(test_data["int_data"])),
+        lambda df: df.query("col1 % 2 == 0 | col2 % 2 == 1"),
+        # work around https://github.com/modin-project/modin/issues/6016
+        raising_exceptions=Exception,
+    )
+
+
+def test_query_named_multiindex():
+    eval_general(
+        *(
+            df.set_index(["col1", "col2"])
+            for df in create_test_dfs(test_data["int_data"])
+        ),
+        lambda df: df.query("col1 % 2 == 1 | col2 % 2 == 1"),
+        # work around https://github.com/modin-project/modin/issues/6016
+        raising_exceptions=Exception,
+    )
+
+
+def test_query_multiindex_without_names():
+    def make_df(without_index):
+        new_df = without_index.set_index(["col1", "col2"])
+        new_df.index.names = [None, None]
+        return new_df
+
+    eval_general(
+        *(make_df(df) for df in create_test_dfs(test_data["int_data"])),
+        lambda df: df.query("ilevel_0 % 2 == 0 | ilevel_1 % 2 == 1 | col3 % 2 == 1"),
+        # work around https://github.com/modin-project/modin/issues/6016
+        raising_exceptions=Exception,
+    )
+
+
 def test_empty_query():
     modin_df = pd.DataFrame([1, 2, 3, 4, 5])
 
diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py
index b4d77554949..ab1251ccf4f 100644
--- a/modin/pandas/test/utils.py
+++ b/modin/pandas/test/utils.py
@@ -308,6 +308,9 @@
     "col3 > col4": "col3 > col4",
     "col1 == col2": "col1 == col2",
     "(col2 > col1) and (col1 < col3)": "(col2 > col1) and (col1 < col3)",
+    # this is how to query for values of an unnamed index per
+    # https://pandas.pydata.org/docs/user_guide/indexing.html#multiindex-query-syntax
+    "ilevel_0 % 2 == 1": "ilevel_0 % 2 == 1",
 }
 query_func_keys = list(query_func.keys())
 query_func_values = list(query_func.values())

From 6a264131e937931279b597a5ca2d0abe6836e15a Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Mon, 18 Sep 2023 15:57:48 -0700
Subject: [PATCH 2/2] Fix BaseOnPython tests

Signed-off-by: mvashishtha
---
 modin/pandas/test/dataframe/test_udf.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/modin/pandas/test/dataframe/test_udf.py b/modin/pandas/test/dataframe/test_udf.py
index e5ffdd1dfe0..2bd82aca804 100644
--- a/modin/pandas/test/dataframe/test_udf.py
+++ b/modin/pandas/test/dataframe/test_udf.py
@@ -449,35 +449,35 @@ def test_query(data, funcs, engine):
 def test_query_named_index():
     eval_general(
         *(df.set_index("col1") for df in create_test_dfs(test_data["int_data"])),
-        lambda df: df.query("col1 % 2 == 0 | col2 % 2 == 1"),
+        lambda df: df.query("col1 % 2 == 0 | col3 % 2 == 1"),
         # work around https://github.com/modin-project/modin/issues/6016
-        raising_exceptions=Exception,
+        raising_exceptions=(Exception,),
     )
 
 
 def test_query_named_multiindex():
     eval_general(
         *(
-            df.set_index(["col1", "col2"])
+            df.set_index(["col1", "col3"])
             for df in create_test_dfs(test_data["int_data"])
         ),
-        lambda df: df.query("col1 % 2 == 1 | col2 % 2 == 1"),
+        lambda df: df.query("col1 % 2 == 1 | col3 % 2 == 1"),
         # work around https://github.com/modin-project/modin/issues/6016
-        raising_exceptions=Exception,
+        raising_exceptions=(Exception,),
     )
 
 
 def test_query_multiindex_without_names():
     def make_df(without_index):
-        new_df = without_index.set_index(["col1", "col2"])
+        new_df = without_index.set_index(["col1", "col3"])
         new_df.index.names = [None, None]
         return new_df
 
     eval_general(
         *(make_df(df) for df in create_test_dfs(test_data["int_data"])),
-        lambda df: df.query("ilevel_0 % 2 == 0 | ilevel_1 % 2 == 1 | col3 % 2 == 1"),
+        lambda df: df.query("ilevel_0 % 2 == 0 | ilevel_1 % 2 == 1 | col4 % 2 == 1"),
         # work around https://github.com/modin-project/modin/issues/6016
-        raising_exceptions=Exception,
+        raising_exceptions=(Exception,),
     )
 
 