Skip to content

Commit

Permalink
Avoid redundant metadata reads in ZarrArrayWrapper (#8297)
Browse files Browse the repository at this point in the history
* Avoid redundant metadata reads in `ZarrArrayWrapper`

Modify ZarrArrayWrapper to avoid re-reading metadata each time data
is read from the array.

* Improve test documentation

* Update xarray/tests/test_backends.py

Co-authored-by: Joe Hamman <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add what's new entry

---------

Co-authored-by: Maximilian Roos <[email protected]>
Co-authored-by: Joe Hamman <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
4 people authored Oct 13, 2023
1 parent 47eec7f commit 5876365
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 7 deletions.
7 changes: 7 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ Deprecations

Bug fixes
~~~~~~~~~

- :py:meth:`DataArray.rename` & :py:meth:`Dataset.rename` would emit a warning
when the operation was a no-op. (:issue:`8266`)
By `Simon Hansen <https://github.com/hoxbro>`_.
Expand All @@ -64,6 +65,12 @@ Bug fixes
(:issue:`8271`, :pull:`8272`). By `Spencer Clark
<https://github.com/spencerkclark>`_.

- Fix excess metadata requests when using a Zarr store. Prior to this, metadata
was re-read every time data was retrieved from the array, now metadata is retrieved only once
when they array is initialized.
(:issue:`8290`, :pull:`8297`).
By `Oliver McCormack <https://github.com/olimcc>`_.


Documentation
~~~~~~~~~~~~~
Expand Down
16 changes: 9 additions & 7 deletions xarray/backends/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,27 +61,29 @@ def encode_zarr_attr_value(value):


class ZarrArrayWrapper(BackendArray):
__slots__ = ("datastore", "dtype", "shape", "variable_name")
__slots__ = ("datastore", "dtype", "shape", "variable_name", "_array")

def __init__(self, variable_name, datastore):
self.datastore = datastore
self.variable_name = variable_name

array = self.get_array()
self.shape = array.shape
# some callers attempt to evaluate an array if an `array` property exists on the object.
# we prefix with _ to avoid this inference.
self._array = self.datastore.zarr_group[self.variable_name]
self.shape = self._array.shape

# preserve vlen string object dtype (GH 7328)
if array.filters is not None and any(
[filt.codec_id == "vlen-utf8" for filt in array.filters]
if self._array.filters is not None and any(
[filt.codec_id == "vlen-utf8" for filt in self._array.filters]
):
dtype = coding.strings.create_vlen_dtype(str)
else:
dtype = array.dtype
dtype = self._array.dtype

self.dtype = dtype

def get_array(self):
return self.datastore.zarr_group[self.variable_name]
return self._array

def _oindex(self, key):
return self.get_array().oindex[key]
Expand Down
42 changes: 42 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from os import listdir
from pathlib import Path
from typing import TYPE_CHECKING, Any, Final, cast
from unittest.mock import patch

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -2862,6 +2863,47 @@ def create_zarr_target(self):
yield tmp


@requires_zarr
class TestZarrArrayWrapperCalls(TestZarrKVStoreV3):
def test_avoid_excess_metadata_calls(self) -> None:
"""Test that chunk requests do not trigger redundant metadata requests.
This test targets logic in backends.zarr.ZarrArrayWrapper, asserting that calls
to retrieve chunk data after initialization do not trigger additional
metadata requests.
https://github.com/pydata/xarray/issues/8290
"""

import zarr

ds = xr.Dataset(data_vars={"test": (("Z",), np.array([123]).reshape(1))})

# The call to retrieve metadata performs a group lookup. We patch Group.__getitem__
# so that we can inspect calls to this method - specifically count of calls.
# Use of side_effect means that calls are passed through to the original method
# rather than a mocked method.
Group = zarr.hierarchy.Group
with (
self.create_zarr_target() as store,
patch.object(
Group, "__getitem__", side_effect=Group.__getitem__, autospec=True
) as mock,
):
ds.to_zarr(store, mode="w")

# We expect this to request array metadata information, so call_count should be >= 1,
# At time of writing, 2 calls are made
xrds = xr.open_zarr(store)
call_count = mock.call_count
assert call_count > 0

# compute() requests array data, which should not trigger additional metadata requests
# we assert that the number of calls has not increased after fetchhing the array
xrds.test.compute(scheduler="sync")
assert mock.call_count == call_count


@requires_zarr
@requires_fsspec
def test_zarr_storage_options() -> None:
Expand Down

0 comments on commit 5876365

Please sign in to comment.