From 47eec7fb1c948f385e8b6f9ad71d4d49e2148bcc Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Fri, 13 Oct 2023 17:58:06 +0200 Subject: [PATCH 1/2] Remove real, imag, astype methods from NamedArray (#8295) * remove imag / real / astype * Remove real, imag and astype * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add imag, real functions * Update variable.py * Update utils.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update utils.py --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- xarray/core/variable.py | 22 +++++++++++++++++++ xarray/namedarray/core.py | 25 ++------------------- xarray/namedarray/utils.py | 45 ++++++++++++++++++++++++++------------ 3 files changed, 55 insertions(+), 37 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index fa5523b1340..ec7957fbb0f 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -2400,6 +2400,28 @@ def notnull(self, keep_attrs: bool | None = None): keep_attrs=keep_attrs, ) + @property + def real(self): + """ + The real part of the variable. + + See Also + -------- + numpy.ndarray.real + """ + return self._replace(data=self.data.real) + + @property + def imag(self): + """ + The imaginary part of the variable. + + See Also + -------- + numpy.ndarray.imag + """ + return self._replace(data=self.data.imag) + def __array_wrap__(self, obj, context=None): return Variable(self.dims, obj) diff --git a/xarray/namedarray/core.py b/xarray/namedarray/core.py index ec3d8fa171b..92cc742e131 100644 --- a/xarray/namedarray/core.py +++ b/xarray/namedarray/core.py @@ -14,6 +14,7 @@ Default, T_DuckArray, _default, + astype, is_chunked_duck_array, is_duck_array, is_duck_dask_array, @@ -245,28 +246,6 @@ def data(self, data: T_DuckArray | np.typing.ArrayLike) -> None: self._check_shape(data) self._data = data - @property - def real(self) -> Self: - """ - The real part of the NamedArray. - - See Also - -------- - numpy.ndarray.real - """ - return self._replace(data=self.data.real) - - @property - def imag(self) -> Self: - """ - The imaginary part of the NamedArray. - - See Also - -------- - numpy.ndarray.imag - """ - return self._replace(data=self.data.imag) - def __dask_tokenize__(self) -> Hashable: # Use v.data, instead of v._data, in order to cope with the wrappers # around NetCDF and the like @@ -497,7 +476,7 @@ def _as_sparse( except AttributeError as exc: raise ValueError(f"{sparse_format} is not a valid sparse format") from exc - data = as_sparse(self.data.astype(dtype), fill_value=fill_value) + data = as_sparse(astype(self.data, dtype), fill_value=fill_value) return self._replace(data=data) def _to_dense(self) -> Self: diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 6f7658ea00b..8c598a9a3b3 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -4,6 +4,7 @@ import sys from collections.abc import Hashable from enum import Enum +from types import ModuleType from typing import TYPE_CHECKING, Any, Final, Protocol, TypeVar import numpy as np @@ -15,9 +16,9 @@ from typing_extensions import TypeGuard if sys.version_info >= (3, 11): - from typing import Self + pass else: - from typing_extensions import Self + pass try: from dask.array import Array as DaskArray @@ -29,7 +30,7 @@ # https://stackoverflow.com/questions/74633074/how-to-type-hint-a-generic-numpy-array T_DType_co = TypeVar("T_DType_co", bound=np.dtype[np.generic], covariant=True) -# T_DType = TypeVar("T_DType", bound=np.dtype[np.generic]) +T_DType = TypeVar("T_DType", bound=np.dtype[np.generic]) class _Array(Protocol[T_DType_co]): @@ -41,17 +42,6 @@ def dtype(self) -> T_DType_co: def shape(self) -> tuple[int, ...]: ... - @property - def real(self) -> Self: - ... - - @property - def imag(self) -> Self: - ... - - def astype(self, dtype: np.typing.DTypeLike) -> Self: - ... - # TODO: numpy doesn't use any inputs: # https://github.com/numpy/numpy/blob/v1.24.3/numpy/_typing/_array_like.py#L38 def __array__(self) -> np.ndarray[Any, T_DType_co]: @@ -161,3 +151,30 @@ def __dask_tokenize__(self) -> Hashable: from dask.base import normalize_token return normalize_token((type(self), self._value)) # type: ignore[no-any-return] + + +# %% Array API functions +def get_array_namespace(x: _Array[Any]) -> ModuleType: + if hasattr(x, "__array_namespace__"): + return x.__array_namespace__() # type: ignore[no-any-return] + else: + return np + + +def astype(x: _Array[Any], dtype: T_DType, /, *, copy: bool = True) -> _Array[T_DType]: + if hasattr(x, "__array_namespace__"): + xp = x.__array_namespace__() + return xp.astype(x, dtype, copy=copy) # type: ignore[no-any-return] + + # np.astype doesn't exist yet: + return x.astype(dtype, copy=copy) # type: ignore[no-any-return, attr-defined] + + +def imag(x: _Array[Any], /) -> _Array[Any]: + xp = get_array_namespace(x) + return xp.imag(x) # type: ignore[no-any-return] + + +def real(x: _Array[Any], /) -> _Array[Any]: + xp = get_array_namespace(x) + return xp.real(x) # type: ignore[no-any-return] From 58763657dfed5fd566f3391b1a693c8427efe6d4 Mon Sep 17 00:00:00 2001 From: olimcc Date: Fri, 13 Oct 2023 09:38:56 -0700 Subject: [PATCH 2/2] Avoid redundant metadata reads in `ZarrArrayWrapper` (#8297) * Avoid redundant metadata reads in `ZarrArrayWrapper` Modify ZarrArrayWrapper to avoid re-reading metadata each time data is read from the array. * Improve test documentation * Update xarray/tests/test_backends.py Co-authored-by: Joe Hamman * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add what's new entry --------- Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Co-authored-by: Joe Hamman Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/whats-new.rst | 7 ++++++ xarray/backends/zarr.py | 16 +++++++------ xarray/tests/test_backends.py | 42 +++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 7 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 40c50e158ad..4a57faaa82e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -54,6 +54,7 @@ Deprecations Bug fixes ~~~~~~~~~ + - :py:meth:`DataArray.rename` & :py:meth:`Dataset.rename` would emit a warning when the operation was a no-op. (:issue:`8266`) By `Simon Hansen `_. @@ -64,6 +65,12 @@ Bug fixes (:issue:`8271`, :pull:`8272`). By `Spencer Clark `_. +- Fix excess metadata requests when using a Zarr store. Prior to this, metadata + was re-read every time data was retrieved from the array, now metadata is retrieved only once + when they array is initialized. + (:issue:`8290`, :pull:`8297`). + By `Oliver McCormack `_. + Documentation ~~~~~~~~~~~~~ diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index f88523422bb..d6ad15f4f87 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -61,27 +61,29 @@ def encode_zarr_attr_value(value): class ZarrArrayWrapper(BackendArray): - __slots__ = ("datastore", "dtype", "shape", "variable_name") + __slots__ = ("datastore", "dtype", "shape", "variable_name", "_array") def __init__(self, variable_name, datastore): self.datastore = datastore self.variable_name = variable_name - array = self.get_array() - self.shape = array.shape + # some callers attempt to evaluate an array if an `array` property exists on the object. + # we prefix with _ to avoid this inference. + self._array = self.datastore.zarr_group[self.variable_name] + self.shape = self._array.shape # preserve vlen string object dtype (GH 7328) - if array.filters is not None and any( - [filt.codec_id == "vlen-utf8" for filt in array.filters] + if self._array.filters is not None and any( + [filt.codec_id == "vlen-utf8" for filt in self._array.filters] ): dtype = coding.strings.create_vlen_dtype(str) else: - dtype = array.dtype + dtype = self._array.dtype self.dtype = dtype def get_array(self): - return self.datastore.zarr_group[self.variable_name] + return self._array def _oindex(self, key): return self.get_array().oindex[key] diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 9ec67bf47dc..4c04841db8d 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -19,6 +19,7 @@ from os import listdir from pathlib import Path from typing import TYPE_CHECKING, Any, Final, cast +from unittest.mock import patch import numpy as np import pandas as pd @@ -2862,6 +2863,47 @@ def create_zarr_target(self): yield tmp +@requires_zarr +class TestZarrArrayWrapperCalls(TestZarrKVStoreV3): + def test_avoid_excess_metadata_calls(self) -> None: + """Test that chunk requests do not trigger redundant metadata requests. + + This test targets logic in backends.zarr.ZarrArrayWrapper, asserting that calls + to retrieve chunk data after initialization do not trigger additional + metadata requests. + + https://github.com/pydata/xarray/issues/8290 + """ + + import zarr + + ds = xr.Dataset(data_vars={"test": (("Z",), np.array([123]).reshape(1))}) + + # The call to retrieve metadata performs a group lookup. We patch Group.__getitem__ + # so that we can inspect calls to this method - specifically count of calls. + # Use of side_effect means that calls are passed through to the original method + # rather than a mocked method. + Group = zarr.hierarchy.Group + with ( + self.create_zarr_target() as store, + patch.object( + Group, "__getitem__", side_effect=Group.__getitem__, autospec=True + ) as mock, + ): + ds.to_zarr(store, mode="w") + + # We expect this to request array metadata information, so call_count should be >= 1, + # At time of writing, 2 calls are made + xrds = xr.open_zarr(store) + call_count = mock.call_count + assert call_count > 0 + + # compute() requests array data, which should not trigger additional metadata requests + # we assert that the number of calls has not increased after fetchhing the array + xrds.test.compute(scheduler="sync") + assert mock.call_count == call_count + + @requires_zarr @requires_fsspec def test_zarr_storage_options() -> None: