Add eval method to Dataset (pydata#7163)

* Add `eval` method to Dataset This needs proper tests & docs, but would this be a good idea? Example in the docstring
dcherian · Dec 6, 2023 · 3d6ec7e · 3d6ec7e
1 parent 299abd6
commit 3d6ec7e
Show file tree

Hide file tree

Showing 4 changed files with 84 additions and 0 deletions.
diff --git a/doc/api.rst b/doc/api.rst
@@ -192,6 +192,7 @@ Computation
    Dataset.map_blocks
    Dataset.polyfit
    Dataset.curvefit
+   Dataset.eval
 
 Aggregation
 -----------

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -41,6 +41,9 @@ New Features
 - :py:meth:`~xarray.DataArray.rank` now operates on dask-backed arrays, assuming
   the core dim has exactly one chunk. (:pull:`8475`).
   By `Maximilian Roos <https://github.com/max-sixty>`_.
+- Add a :py:meth:`Dataset.eval` method, similar to the pandas' method of the
+  same name. (:pull:`7163`). This is currently marked as experimental and
+  doesn't yet support the ``numexpr`` engine.
 - :py:meth:`Dataset.drop_vars` & :py:meth:`DataArray.drop_vars` allow passing a
   callable, similar to :py:meth:`Dataset.where` & :py:meth:`Dataset.sortby` & others.
   (:pull:`8511`).

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -98,6 +98,7 @@
     Self,
     T_ChunkDim,
     T_Chunks,
+    T_DataArray,
     T_DataArrayOrSet,
     T_Dataset,
     ZarrWriteModes,
@@ -9554,6 +9555,68 @@ def argmax(self, dim: Hashable | None = None, **kwargs) -> Self:
                 "Dataset.argmin() with a sequence or ... for dim"
             )
 
+    def eval(
+        self,
+        statement: str,
+        *,
+        parser: QueryParserOptions = "pandas",
+    ) -> Self | T_DataArray:
+        """
+        Calculate an expression supplied as a string in the context of the dataset.
+
+        This is currently experimental; the API may change particularly around
+        assignments, which currently returnn a ``Dataset`` with the additional variable.
+        Currently only the ``python`` engine is supported, which has the same
+        performance as executing in python.
+
+        Parameters
+        ----------
+        statement : str
+            String containing the Python-like expression to evaluate.
+
+        Returns
+        -------
+        result : Dataset or DataArray, depending on whether ``statement`` contains an
+        assignment.
+
+        Examples
+        --------
+        >>> ds = xr.Dataset(
+        ...     {"a": ("x", np.arange(0, 5, 1)), "b": ("x", np.linspace(0, 1, 5))}
+        ... )
+        >>> ds
+        <xarray.Dataset>
+        Dimensions:  (x: 5)
+        Dimensions without coordinates: x
+        Data variables:
+            a        (x) int64 0 1 2 3 4
+            b        (x) float64 0.0 0.25 0.5 0.75 1.0
+
+        >>> ds.eval("a + b")
+        <xarray.DataArray (x: 5)>
+        array([0.  , 1.25, 2.5 , 3.75, 5.  ])
+        Dimensions without coordinates: x
+
+        >>> ds.eval("c = a + b")
+        <xarray.Dataset>
+        Dimensions:  (x: 5)
+        Dimensions without coordinates: x
+        Data variables:
+            a        (x) int64 0 1 2 3 4
+            b        (x) float64 0.0 0.25 0.5 0.75 1.0
+            c        (x) float64 0.0 1.25 2.5 3.75 5.0
+        """
+
+        return pd.eval(
+            statement,
+            resolvers=[self],
+            target=self,
+            parser=parser,
+            # Because numexpr returns a numpy array, using that engine results in
+            # different behavior. We'd be very open to a contribution handling this.
+            engine="python",
+        )
+
     def query(
         self,
         queries: Mapping[Any, Any] | None = None,

diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
@@ -6718,6 +6718,23 @@ def test_query(self, backend, engine, parser) -> None:
 # pytest tests — new tests should go here, rather than in the class.
 
 
+@pytest.mark.parametrize("parser", ["pandas", "python"])
+def test_eval(ds, parser) -> None:
+    """Currently much more minimal testing that `query` above, and much of the setup
+    isn't used. But the risks are fairly low — `query` shares much of the code, and
+    the method is currently experimental."""
+
+    actual = ds.eval("z1 + 5", parser=parser)
+    expect = ds["z1"] + 5
+    assert_identical(expect, actual)
+
+    # check pandas query syntax is supported
+    if parser == "pandas":
+        actual = ds.eval("(z1 > 5) and (z2 > 0)", parser=parser)
+        expect = (ds["z1"] > 5) & (ds["z2"] > 0)
+        assert_identical(expect, actual)
+
+
 @pytest.mark.parametrize("test_elements", ([1, 2], np.array([1, 2]), DataArray([1, 2])))
 def test_isin(test_elements, backend) -> None:
     expected = Dataset(