diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py
index ed16bedd481..17c45b0dac7 100644
--- a/modin/core/storage_formats/base/query_compiler.py
+++ b/modin/core/storage_formats/base/query_compiler.py
@@ -2271,6 +2271,27 @@ def nsmallest(self, n=5, columns=None, keep="first"):
             self, n=n, columns=columns, keep=keep
         )
 
+    @doc_utils.add_refer_to("DataFrame.query")
+    def rowwise_query(self, expr, **kwargs):
+        """
+        Query columns of the QueryCompiler with a boolean expression row-wise.
+
+        Parameters
+        ----------
+        expr : str
+            Row-wise boolean expression.
+        **kwargs : dict
+            Additional keyword arguments forwarded to the query implementation.
+
+        Returns
+        -------
+        BaseQueryCompiler
+            New QueryCompiler containing the rows where the boolean expression is satisfied.
+        """
+        raise NotImplementedError(
+            "Row-wise queries execution is not implemented for the selected backend."
+        )
+
     @doc_utils.add_refer_to("DataFrame.eval")
     def eval(self, expr, **kwargs):
         """
diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
index 08cda41c962..d8d00d5bb9f 100644
--- a/modin/core/storage_formats/pandas/query_compiler.py
+++ b/modin/core/storage_formats/pandas/query_compiler.py
@@ -18,6 +18,7 @@
 queries for the ``PandasDataframe``.
 """
 
+import ast
 import hashlib
 import re
 import warnings
@@ -3186,6 +3187,64 @@ def _list_like_func(self, func, axis, *args, **kwargs):
         )
         return self.__constructor__(new_modin_frame)
 
+    def rowwise_query(self, expr, **kwargs):
+        """
+        Query the columns of a ``PandasQueryCompiler`` with a boolean row-wise expression.
+
+        Basically, in row-wise expressions we only allow column names, constants
+        and other variables captured using the '@' symbol. No function/method
+        can be called inside such expressions.
+
+        Parameters
+        ----------
+        expr : str
+            Row-wise boolean expression.
+        **kwargs : dict
+            Arguments to pass to the ``pandas.DataFrame.query()``.
+
+        Returns
+        -------
+        PandasQueryCompiler
+
+        Raises
+        ------
+        NotImplementedError
+            In case the passed expression cannot be executed row-wise.
+        """
+        # Walk through the AST and verify it doesn't contain any nodes that
+        # prevent us from executing the query row-wise (we're basically
+        # looking for 'ast.Call')
+        try:
+            nodes = ast.parse(expr.replace("@", "")).body
+        except SyntaxError as err:
+            # pandas accepts syntax that is not valid Python source (e.g.
+            # backtick-quoted column names or leading whitespace); such queries
+            # cannot be verified here, so treat them as non row-wise
+            raise NotImplementedError("A non row-wise query was passed.") from err
+
+        while nodes:
+            node = nodes.pop()
+            if isinstance(node, ast.Expr):
+                node = node.value
+
+            if isinstance(node, ast.UnaryOp):
+                nodes.append(node.operand)
+            elif isinstance(node, ast.BinOp):
+                nodes.extend([node.left, node.right])
+            elif isinstance(node, ast.BoolOp):
+                nodes.extend(node.values)
+            elif isinstance(node, ast.Compare):
+                nodes.extend([node.left] + node.comparators)
+            elif not isinstance(node, (ast.Name, ast.Constant)):
+                # if we end up here then the expression is no longer simple
+                # enough to run it row-wise, so exiting
+                raise NotImplementedError("A non row-wise query was passed.")
+
+        def query_builder(df, **modin_internal_kwargs):
+            return df.query(expr, inplace=False, **kwargs, **modin_internal_kwargs)
+
+        return self.__constructor__(self._modin_frame.filter(1, query_builder))
+
     def _callable_func(self, func, axis, *args, **kwargs):
         """
         Apply passed function to each row/column.
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index 9ee9a6cf9e1..16dabeb4b42 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -1605,10 +1605,23 @@ def query(self, expr, inplace=False, **kwargs):  # noqa: PR01, RT01, D200
         Query the columns of a ``DataFrame`` with a boolean expression.
         """
         self._update_var_dicts_in_kwargs(expr, kwargs)
+        self._validate_eval_query(expr, **kwargs)
         inplace = validate_bool_kwarg(inplace, "inplace")
-        new_query_compiler = pandas.DataFrame.query(
-            self, expr, inplace=False, **kwargs
-        )._query_compiler
+        # HACK: this condition kind of breaks the idea of backend agnostic API as all queries
+        # _should_ work fine for all of the engines using `pandas.DataFrame.query(...)` approach.
+        # However, at this point we know that we can execute simple queries way more efficiently
+        # using the QC's API directly in case of pandas backend. Ideally, we have to make it work
+        # with the 'pandas.query' approach as well as the direct QC call does. But investigating
+        # and fixing the root cause of the perf difference appears to be much more complicated
+        # than putting this hack here. Hopefully, we'll get rid of it soon:
+        # https://github.com/modin-project/modin/issues/6499
+        try:
+            new_query_compiler = self._query_compiler.rowwise_query(expr, **kwargs)
+        except NotImplementedError:
+            # a non row-wise query was passed, falling back to pandas implementation
+            new_query_compiler = pandas.DataFrame.query(
+                self, expr, inplace=False, **kwargs
+            )._query_compiler
         return self._create_or_update_from_compiler(new_query_compiler, inplace)
 
     def rename(
diff --git a/modin/test/storage_formats/pandas/test_internals.py b/modin/test/storage_formats/pandas/test_internals.py
index 28258cbaf1a..8d4213015ba 100644
--- a/modin/test/storage_formats/pandas/test_internals.py
+++ b/modin/test/storage_formats/pandas/test_internals.py
@@ -1310,3 +1310,39 @@ def test_skip_set_columns():
     # of equality comparison, in this case the new columns should be set unconditionally,
     # meaning that the '_deferred_column' has to be True
     assert df._query_compiler._modin_frame._deferred_column
+
+
+def test_query_dispatching():
+    """
+    Test whether the logic of determining whether the passed query
+    can be performed row-wise works correctly in ``PandasQueryCompiler.rowwise_query()``.
+
+    The tested method raises a ``NotImplementedError`` if the query cannot be performed row-wise
+    and raises nothing if it can.
+    """
+    qc = pd.DataFrame(
+        {"a": [1], "b": [2], "c": [3], "d": [4], "e": [5]}
+    )._query_compiler
+
+    local_var = 10  # noqa: F841 (unused variable)
+
+    # these queries should be performed row-wise (so no exception)
+    qc.rowwise_query("a < 1")
+    qc.rowwise_query("a < b")
+    qc.rowwise_query("a < (b + @local_var) * c > 10")
+
+    # these queries cannot be performed row-wise (so they must raise an exception)
+    with pytest.raises(NotImplementedError):
+        qc.rowwise_query("a < b[0]")
+    with pytest.raises(NotImplementedError):
+        qc.rowwise_query("a < b.min()")
+    with pytest.raises(NotImplementedError):
+        qc.rowwise_query("a < (b + @local_var + (b - e.min())) * c > 10")
+    with pytest.raises(NotImplementedError):
+        qc.rowwise_query("a < b.size")
+    # pandas-specific syntax that is not valid Python must trigger the fallback
+    # instead of leaking a ``SyntaxError``
+    with pytest.raises(NotImplementedError):
+        qc.rowwise_query("`a` < 1")
+    with pytest.raises(NotImplementedError):
+        qc.rowwise_query(" a < 1")