Merge branch 'main' into named-array-reductions

pydata · Oct 14, 2023 · e77bba6 · e77bba6
2 parents 48bd08a + 5876365
commit e77bba6
Show file tree

Hide file tree

Showing 6 changed files with 113 additions and 44 deletions.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -54,6 +54,7 @@ Deprecations
 
 Bug fixes
 ~~~~~~~~~
+
 - :py:meth:`DataArray.rename` & :py:meth:`Dataset.rename` would emit a warning
   when the operation was a no-op. (:issue:`8266`)
   By `Simon Hansen <https://github.com/hoxbro>`_.
@@ -64,6 +65,12 @@ Bug fixes
   (:issue:`8271`, :pull:`8272`). By `Spencer Clark
   <https://github.com/spencerkclark>`_.
 
+- Fix excess metadata requests when using a Zarr store. Prior to this, metadata
+  was re-read every time data was retrieved from the array, now metadata is retrieved only once
+  when they array is initialized.
+  (:issue:`8290`, :pull:`8297`).
+  By `Oliver McCormack <https://github.com/olimcc>`_.
+
 
 Documentation
 ~~~~~~~~~~~~~

diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
@@ -61,27 +61,29 @@ def encode_zarr_attr_value(value):
 
 
 class ZarrArrayWrapper(BackendArray):
-    __slots__ = ("datastore", "dtype", "shape", "variable_name")
+    __slots__ = ("datastore", "dtype", "shape", "variable_name", "_array")
 
     def __init__(self, variable_name, datastore):
         self.datastore = datastore
         self.variable_name = variable_name
 
-        array = self.get_array()
-        self.shape = array.shape
+        # some callers attempt to evaluate an array if an `array` property exists on the object.
+        # we prefix with _ to avoid this inference.
+        self._array = self.datastore.zarr_group[self.variable_name]
+        self.shape = self._array.shape
 
         # preserve vlen string object dtype (GH 7328)
-        if array.filters is not None and any(
-            [filt.codec_id == "vlen-utf8" for filt in array.filters]
+        if self._array.filters is not None and any(
+            [filt.codec_id == "vlen-utf8" for filt in self._array.filters]
         ):
             dtype = coding.strings.create_vlen_dtype(str)
         else:
-            dtype = array.dtype
+            dtype = self._array.dtype
 
         self.dtype = dtype
 
     def get_array(self):
-        return self.datastore.zarr_group[self.variable_name]
+        return self._array
 
     def _oindex(self, key):
         return self.get_array().oindex[key]

diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -2362,6 +2362,28 @@ def notnull(self, keep_attrs: bool | None = None):
             keep_attrs=keep_attrs,
         )
 
+    @property
+    def real(self):
+        """
+        The real part of the variable.
+
+        See Also
+        --------
+        numpy.ndarray.real
+        """
+        return self._replace(data=self.data.real)
+
+    @property
+    def imag(self):
+        """
+        The imaginary part of the variable.
+
+        See Also
+        --------
+        numpy.ndarray.imag
+        """
+        return self._replace(data=self.data.imag)
+
     def __array_wrap__(self, obj, context=None):
         return Variable(self.dims, obj)
 

diff --git a/xarray/namedarray/core.py b/xarray/namedarray/core.py
@@ -16,6 +16,7 @@
     Default,
     T_DuckArray,
     _default,
+    astype,
     is_chunked_duck_array,
     is_duck_array,
     is_duck_dask_array,
@@ -247,28 +248,6 @@ def data(self, data: T_DuckArray | np.typing.ArrayLike) -> None:
         self._check_shape(data)
         self._data = data
 
-    @property
-    def real(self) -> Self:
-        """
-        The real part of the NamedArray.
-
-        See Also
-        --------
-        numpy.ndarray.real
-        """
-        return self._replace(data=self.data.real)
-
-    @property
-    def imag(self) -> Self:
-        """
-        The imaginary part of the NamedArray.
-
-        See Also
-        --------
-        numpy.ndarray.imag
-        """
-        return self._replace(data=self.data.imag)
-
     def __dask_tokenize__(self) -> Hashable:
         # Use v.data, instead of v._data, in order to cope with the wrappers
         # around NetCDF and the like
@@ -614,7 +593,7 @@ def _as_sparse(
         except AttributeError as exc:
             raise ValueError(f"{sparse_format} is not a valid sparse format") from exc
 
-        data = as_sparse(self.data.astype(dtype), fill_value=fill_value)
+        data = as_sparse(astype(self.data, dtype), fill_value=fill_value)
         return self._replace(data=data)
 
     def _to_dense(self) -> Self:

diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py
@@ -4,6 +4,7 @@
 import sys
 from collections.abc import Hashable
 from enum import Enum
+from types import ModuleType
 from typing import TYPE_CHECKING, Any, Final, Protocol, TypeVar
 
 import numpy as np
@@ -15,9 +16,9 @@
         from typing_extensions import TypeGuard
 
     if sys.version_info >= (3, 11):
-        from typing import Self
+        pass
     else:
-        from typing_extensions import Self
+        pass
 
     try:
         from dask.array import Array as DaskArray
@@ -29,7 +30,7 @@
 
 # https://stackoverflow.com/questions/74633074/how-to-type-hint-a-generic-numpy-array
 T_DType_co = TypeVar("T_DType_co", bound=np.dtype[np.generic], covariant=True)
-# T_DType = TypeVar("T_DType", bound=np.dtype[np.generic])
+T_DType = TypeVar("T_DType", bound=np.dtype[np.generic])
 
 
 class _Array(Protocol[T_DType_co]):
@@ -41,17 +42,6 @@ def dtype(self) -> T_DType_co:
     def shape(self) -> tuple[int, ...]:
         ...
 
-    @property
-    def real(self) -> Self:
-        ...
-
-    @property
-    def imag(self) -> Self:
-        ...
-
-    def astype(self, dtype: np.typing.DTypeLike) -> Self:
-        ...
-
     # TODO: numpy doesn't use any inputs:
     # https://github.com/numpy/numpy/blob/v1.24.3/numpy/_typing/_array_like.py#L38
     def __array__(self) -> np.ndarray[Any, T_DType_co]:
@@ -161,3 +151,30 @@ def __dask_tokenize__(self) -> Hashable:
         from dask.base import normalize_token
 
         return normalize_token((type(self), self._value))  # type: ignore[no-any-return]
+
+
+# %% Array API functions
+def get_array_namespace(x: _Array[Any]) -> ModuleType:
+    if hasattr(x, "__array_namespace__"):
+        return x.__array_namespace__()  # type: ignore[no-any-return]
+    else:
+        return np
+
+
+def astype(x: _Array[Any], dtype: T_DType, /, *, copy: bool = True) -> _Array[T_DType]:
+    if hasattr(x, "__array_namespace__"):
+        xp = x.__array_namespace__()
+        return xp.astype(x, dtype, copy=copy)  # type: ignore[no-any-return]
+
+    # np.astype doesn't exist yet:
+    return x.astype(dtype, copy=copy)  # type: ignore[no-any-return, attr-defined]
+
+
+def imag(x: _Array[Any], /) -> _Array[Any]:
+    xp = get_array_namespace(x)
+    return xp.imag(x)  # type: ignore[no-any-return]
+
+
+def real(x: _Array[Any], /) -> _Array[Any]:
+    xp = get_array_namespace(x)
+    return xp.real(x)  # type: ignore[no-any-return]
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
@@ -19,6 +19,7 @@
 from os import listdir
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Final, cast
+from unittest.mock import patch
 
 import numpy as np
 import pandas as pd
@@ -2862,6 +2863,47 @@ def create_zarr_target(self):
             yield tmp
 
 
+@requires_zarr
+class TestZarrArrayWrapperCalls(TestZarrKVStoreV3):
+    def test_avoid_excess_metadata_calls(self) -> None:
+        """Test that chunk requests do not trigger redundant metadata requests.
+
+        This test targets logic in backends.zarr.ZarrArrayWrapper, asserting that calls
+        to retrieve chunk data after initialization do not trigger additional
+        metadata requests.
+
+        https://github.com/pydata/xarray/issues/8290
+        """
+
+        import zarr
+
+        ds = xr.Dataset(data_vars={"test": (("Z",), np.array([123]).reshape(1))})
+
+        # The call to retrieve metadata performs a group lookup. We patch Group.__getitem__
+        # so that we can inspect calls to this method - specifically count of calls.
+        # Use of side_effect means that calls are passed through to the original method
+        # rather than a mocked method.
+        Group = zarr.hierarchy.Group
+        with (
+            self.create_zarr_target() as store,
+            patch.object(
+                Group, "__getitem__", side_effect=Group.__getitem__, autospec=True
+            ) as mock,
+        ):
+            ds.to_zarr(store, mode="w")
+
+            # We expect this to request array metadata information, so call_count should be >= 1,
+            # At time of writing, 2 calls are made
+            xrds = xr.open_zarr(store)
+            call_count = mock.call_count
+            assert call_count > 0
+
+            # compute() requests array data, which should not trigger additional metadata requests
+            # we assert that the number of calls has not increased after fetchhing the array
+            xrds.test.compute(scheduler="sync")
+            assert mock.call_count == call_count
+
+
 @requires_zarr
 @requires_fsspec
 def test_zarr_storage_options() -> None: