Skip to content

Commit

Permalink
Merge branch 'main' into named-array-reductions
Browse files Browse the repository at this point in the history
  • Loading branch information
andersy005 authored Oct 14, 2023
2 parents 48bd08a + 5876365 commit e77bba6
Show file tree
Hide file tree
Showing 6 changed files with 113 additions and 44 deletions.
7 changes: 7 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ Deprecations

Bug fixes
~~~~~~~~~

- :py:meth:`DataArray.rename` & :py:meth:`Dataset.rename` would emit a warning
when the operation was a no-op. (:issue:`8266`)
By `Simon Hansen <https://github.com/hoxbro>`_.
Expand All @@ -64,6 +65,12 @@ Bug fixes
(:issue:`8271`, :pull:`8272`). By `Spencer Clark
<https://github.com/spencerkclark>`_.

- Fix excess metadata requests when using a Zarr store. Prior to this, metadata
was re-read every time data was retrieved from the array, now metadata is retrieved only once
when they array is initialized.
(:issue:`8290`, :pull:`8297`).
By `Oliver McCormack <https://github.com/olimcc>`_.


Documentation
~~~~~~~~~~~~~
Expand Down
16 changes: 9 additions & 7 deletions xarray/backends/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,27 +61,29 @@ def encode_zarr_attr_value(value):


class ZarrArrayWrapper(BackendArray):
__slots__ = ("datastore", "dtype", "shape", "variable_name")
__slots__ = ("datastore", "dtype", "shape", "variable_name", "_array")

def __init__(self, variable_name, datastore):
self.datastore = datastore
self.variable_name = variable_name

array = self.get_array()
self.shape = array.shape
# some callers attempt to evaluate an array if an `array` property exists on the object.
# we prefix with _ to avoid this inference.
self._array = self.datastore.zarr_group[self.variable_name]
self.shape = self._array.shape

# preserve vlen string object dtype (GH 7328)
if array.filters is not None and any(
[filt.codec_id == "vlen-utf8" for filt in array.filters]
if self._array.filters is not None and any(
[filt.codec_id == "vlen-utf8" for filt in self._array.filters]
):
dtype = coding.strings.create_vlen_dtype(str)
else:
dtype = array.dtype
dtype = self._array.dtype

self.dtype = dtype

def get_array(self):
return self.datastore.zarr_group[self.variable_name]
return self._array

def _oindex(self, key):
return self.get_array().oindex[key]
Expand Down
22 changes: 22 additions & 0 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -2362,6 +2362,28 @@ def notnull(self, keep_attrs: bool | None = None):
keep_attrs=keep_attrs,
)

@property
def real(self):
"""
The real part of the variable.
See Also
--------
numpy.ndarray.real
"""
return self._replace(data=self.data.real)

@property
def imag(self):
"""
The imaginary part of the variable.
See Also
--------
numpy.ndarray.imag
"""
return self._replace(data=self.data.imag)

def __array_wrap__(self, obj, context=None):
return Variable(self.dims, obj)

Expand Down
25 changes: 2 additions & 23 deletions xarray/namedarray/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
Default,
T_DuckArray,
_default,
astype,
is_chunked_duck_array,
is_duck_array,
is_duck_dask_array,
Expand Down Expand Up @@ -247,28 +248,6 @@ def data(self, data: T_DuckArray | np.typing.ArrayLike) -> None:
self._check_shape(data)
self._data = data

@property
def real(self) -> Self:
"""
The real part of the NamedArray.
See Also
--------
numpy.ndarray.real
"""
return self._replace(data=self.data.real)

@property
def imag(self) -> Self:
"""
The imaginary part of the NamedArray.
See Also
--------
numpy.ndarray.imag
"""
return self._replace(data=self.data.imag)

def __dask_tokenize__(self) -> Hashable:
# Use v.data, instead of v._data, in order to cope with the wrappers
# around NetCDF and the like
Expand Down Expand Up @@ -614,7 +593,7 @@ def _as_sparse(
except AttributeError as exc:
raise ValueError(f"{sparse_format} is not a valid sparse format") from exc

data = as_sparse(self.data.astype(dtype), fill_value=fill_value)
data = as_sparse(astype(self.data, dtype), fill_value=fill_value)
return self._replace(data=data)

def _to_dense(self) -> Self:
Expand Down
45 changes: 31 additions & 14 deletions xarray/namedarray/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
from collections.abc import Hashable
from enum import Enum
from types import ModuleType
from typing import TYPE_CHECKING, Any, Final, Protocol, TypeVar

import numpy as np
Expand All @@ -15,9 +16,9 @@
from typing_extensions import TypeGuard

if sys.version_info >= (3, 11):
from typing import Self
pass
else:
from typing_extensions import Self
pass

try:
from dask.array import Array as DaskArray
Expand All @@ -29,7 +30,7 @@

# https://stackoverflow.com/questions/74633074/how-to-type-hint-a-generic-numpy-array
T_DType_co = TypeVar("T_DType_co", bound=np.dtype[np.generic], covariant=True)
# T_DType = TypeVar("T_DType", bound=np.dtype[np.generic])
T_DType = TypeVar("T_DType", bound=np.dtype[np.generic])


class _Array(Protocol[T_DType_co]):
Expand All @@ -41,17 +42,6 @@ def dtype(self) -> T_DType_co:
def shape(self) -> tuple[int, ...]:
...

@property
def real(self) -> Self:
...

@property
def imag(self) -> Self:
...

def astype(self, dtype: np.typing.DTypeLike) -> Self:
...

# TODO: numpy doesn't use any inputs:
# https://github.com/numpy/numpy/blob/v1.24.3/numpy/_typing/_array_like.py#L38
def __array__(self) -> np.ndarray[Any, T_DType_co]:
Expand Down Expand Up @@ -161,3 +151,30 @@ def __dask_tokenize__(self) -> Hashable:
from dask.base import normalize_token

return normalize_token((type(self), self._value)) # type: ignore[no-any-return]


# %% Array API functions
def get_array_namespace(x: _Array[Any]) -> ModuleType:
if hasattr(x, "__array_namespace__"):
return x.__array_namespace__() # type: ignore[no-any-return]
else:
return np


def astype(x: _Array[Any], dtype: T_DType, /, *, copy: bool = True) -> _Array[T_DType]:
if hasattr(x, "__array_namespace__"):
xp = x.__array_namespace__()
return xp.astype(x, dtype, copy=copy) # type: ignore[no-any-return]

# np.astype doesn't exist yet:
return x.astype(dtype, copy=copy) # type: ignore[no-any-return, attr-defined]


def imag(x: _Array[Any], /) -> _Array[Any]:
xp = get_array_namespace(x)
return xp.imag(x) # type: ignore[no-any-return]


def real(x: _Array[Any], /) -> _Array[Any]:
xp = get_array_namespace(x)
return xp.real(x) # type: ignore[no-any-return]
42 changes: 42 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from os import listdir
from pathlib import Path
from typing import TYPE_CHECKING, Any, Final, cast
from unittest.mock import patch

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -2862,6 +2863,47 @@ def create_zarr_target(self):
yield tmp


@requires_zarr
class TestZarrArrayWrapperCalls(TestZarrKVStoreV3):
def test_avoid_excess_metadata_calls(self) -> None:
"""Test that chunk requests do not trigger redundant metadata requests.
This test targets logic in backends.zarr.ZarrArrayWrapper, asserting that calls
to retrieve chunk data after initialization do not trigger additional
metadata requests.
https://github.com/pydata/xarray/issues/8290
"""

import zarr

ds = xr.Dataset(data_vars={"test": (("Z",), np.array([123]).reshape(1))})

# The call to retrieve metadata performs a group lookup. We patch Group.__getitem__
# so that we can inspect calls to this method - specifically count of calls.
# Use of side_effect means that calls are passed through to the original method
# rather than a mocked method.
Group = zarr.hierarchy.Group
with (
self.create_zarr_target() as store,
patch.object(
Group, "__getitem__", side_effect=Group.__getitem__, autospec=True
) as mock,
):
ds.to_zarr(store, mode="w")

# We expect this to request array metadata information, so call_count should be >= 1,
# At time of writing, 2 calls are made
xrds = xr.open_zarr(store)
call_count = mock.call_count
assert call_count > 0

# compute() requests array data, which should not trigger additional metadata requests
# we assert that the number of calls has not increased after fetchhing the array
xrds.test.compute(scheduler="sync")
assert mock.call_count == call_count


@requires_zarr
@requires_fsspec
def test_zarr_storage_options() -> None:
Expand Down

0 comments on commit e77bba6

Please sign in to comment.