From 58763657dfed5fd566f3391b1a693c8427efe6d4 Mon Sep 17 00:00:00 2001 From: olimcc Date: Fri, 13 Oct 2023 09:38:56 -0700 Subject: [PATCH] Avoid redundant metadata reads in `ZarrArrayWrapper` (#8297) * Avoid redundant metadata reads in `ZarrArrayWrapper` Modify ZarrArrayWrapper to avoid re-reading metadata each time data is read from the array. * Improve test documentation * Update xarray/tests/test_backends.py Co-authored-by: Joe Hamman * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add what's new entry --------- Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Co-authored-by: Joe Hamman Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/whats-new.rst | 7 ++++++ xarray/backends/zarr.py | 16 +++++++------ xarray/tests/test_backends.py | 42 +++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 7 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 40c50e158ad..4a57faaa82e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -54,6 +54,7 @@ Deprecations Bug fixes ~~~~~~~~~ + - :py:meth:`DataArray.rename` & :py:meth:`Dataset.rename` would emit a warning when the operation was a no-op. (:issue:`8266`) By `Simon Hansen `_. @@ -64,6 +65,12 @@ Bug fixes (:issue:`8271`, :pull:`8272`). By `Spencer Clark `_. +- Fix excess metadata requests when using a Zarr store. Prior to this, metadata + was re-read every time data was retrieved from the array, now metadata is retrieved only once + when they array is initialized. + (:issue:`8290`, :pull:`8297`). + By `Oliver McCormack `_. + Documentation ~~~~~~~~~~~~~ diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index f88523422bb..d6ad15f4f87 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -61,27 +61,29 @@ def encode_zarr_attr_value(value): class ZarrArrayWrapper(BackendArray): - __slots__ = ("datastore", "dtype", "shape", "variable_name") + __slots__ = ("datastore", "dtype", "shape", "variable_name", "_array") def __init__(self, variable_name, datastore): self.datastore = datastore self.variable_name = variable_name - array = self.get_array() - self.shape = array.shape + # some callers attempt to evaluate an array if an `array` property exists on the object. + # we prefix with _ to avoid this inference. + self._array = self.datastore.zarr_group[self.variable_name] + self.shape = self._array.shape # preserve vlen string object dtype (GH 7328) - if array.filters is not None and any( - [filt.codec_id == "vlen-utf8" for filt in array.filters] + if self._array.filters is not None and any( + [filt.codec_id == "vlen-utf8" for filt in self._array.filters] ): dtype = coding.strings.create_vlen_dtype(str) else: - dtype = array.dtype + dtype = self._array.dtype self.dtype = dtype def get_array(self): - return self.datastore.zarr_group[self.variable_name] + return self._array def _oindex(self, key): return self.get_array().oindex[key] diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 9ec67bf47dc..4c04841db8d 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -19,6 +19,7 @@ from os import listdir from pathlib import Path from typing import TYPE_CHECKING, Any, Final, cast +from unittest.mock import patch import numpy as np import pandas as pd @@ -2862,6 +2863,47 @@ def create_zarr_target(self): yield tmp +@requires_zarr +class TestZarrArrayWrapperCalls(TestZarrKVStoreV3): + def test_avoid_excess_metadata_calls(self) -> None: + """Test that chunk requests do not trigger redundant metadata requests. + + This test targets logic in backends.zarr.ZarrArrayWrapper, asserting that calls + to retrieve chunk data after initialization do not trigger additional + metadata requests. + + https://github.com/pydata/xarray/issues/8290 + """ + + import zarr + + ds = xr.Dataset(data_vars={"test": (("Z",), np.array([123]).reshape(1))}) + + # The call to retrieve metadata performs a group lookup. We patch Group.__getitem__ + # so that we can inspect calls to this method - specifically count of calls. + # Use of side_effect means that calls are passed through to the original method + # rather than a mocked method. + Group = zarr.hierarchy.Group + with ( + self.create_zarr_target() as store, + patch.object( + Group, "__getitem__", side_effect=Group.__getitem__, autospec=True + ) as mock, + ): + ds.to_zarr(store, mode="w") + + # We expect this to request array metadata information, so call_count should be >= 1, + # At time of writing, 2 calls are made + xrds = xr.open_zarr(store) + call_count = mock.call_count + assert call_count > 0 + + # compute() requests array data, which should not trigger additional metadata requests + # we assert that the number of calls has not increased after fetchhing the array + xrds.test.compute(scheduler="sync") + assert mock.call_count == call_count + + @requires_zarr @requires_fsspec def test_zarr_storage_options() -> None: