From 73d53d73dd1fc200cac54b1583828f4da0a354f9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 25 Aug 2024 13:32:24 -0500 Subject: [PATCH 01/62] Fixed MemoryStore.list_dir Ensures that nested children are listed properly. --- src/zarr/store/memory.py | 14 +++++++++++--- src/zarr/testing/store.py | 7 +++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/zarr/store/memory.py b/src/zarr/store/memory.py index 999d750755..d474f18b28 100644 --- a/src/zarr/store/memory.py +++ b/src/zarr/store/memory.py @@ -117,6 +117,14 @@ async def list_dir(self, prefix: str) -> AsyncGenerator[str, None]: for key in keys_unique: yield key else: - for key in self._store_dict: - if key.startswith(prefix + "/") and key != prefix: - yield key.removeprefix(prefix + "/").split("/")[0] + # Our dictionary doesn't contain directory markers, but we want to include + # a pseudo directory when there's a nested item and we're listing an + # intermediate level. + n = prefix.count("/") + 2 + keys_unique = { + "/".join(k.split("/", n)[:n]) + for k in self._store_dict + if k.startswith(prefix + "/") + } + for key in keys_unique: + yield key.removeprefix(prefix + "/").split("/")[0] diff --git a/src/zarr/testing/store.py b/src/zarr/testing/store.py index ebef4824f7..e263cb38fd 100644 --- a/src/zarr/testing/store.py +++ b/src/zarr/testing/store.py @@ -192,6 +192,13 @@ async def test_list_dir(self, store: S) -> None: assert [k async for k in store.list_dir("foo")] == [] await store.set("foo/zarr.json", Buffer.from_bytes(b"bar")) await store.set("foo/c/1", Buffer.from_bytes(b"\x01")) + await store.set("foo/c/d/1", Buffer.from_bytes(b"\x01")) + await store.set("foo/c/d/2", Buffer.from_bytes(b"\x01")) + await store.set("foo/c/d/3", Buffer.from_bytes(b"\x01")) + + keys_expected = ["foo"] + keys_observed = [k async for k in store.list_dir("")] + assert set(keys_observed) == set(keys_expected), keys_observed keys_expected = ["zarr.json", "c"] keys_observed = [k async for k in 
store.list_dir("foo")] From 90940a0e01366a7339bf4abae5caca3fdcb73e30 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 25 Aug 2024 13:39:16 -0500 Subject: [PATCH 02/62] fixup s3 --- src/zarr/store/remote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/store/remote.py b/src/zarr/store/remote.py index f5ea694b0a..83393e4dac 100644 --- a/src/zarr/store/remote.py +++ b/src/zarr/store/remote.py @@ -202,7 +202,7 @@ async def list_dir(self, prefix: str) -> AsyncGenerator[str, None]: except FileNotFoundError: return for onefile in (a.replace(prefix + "/", "") for a in allfiles): - yield onefile + yield onefile.removeprefix(self.path).removeprefix("/") async def list_prefix(self, prefix: str) -> AsyncGenerator[str, None]: for onefile in await self._fs._ls(prefix, detail=False): From 8ee89f4be622ca5450814eca56d6c1025992e970 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 25 Aug 2024 11:14:46 -0500 Subject: [PATCH 03/62] recursive Group.members This PR adds a recursive=True flag to Group.members, for recursively listing the members of some hierarhcy. This is useful for Consolidated Metadata, which needs to recursively inspect children. IMO, it's useful (and simple) enough to include in the public API. --- src/zarr/core/group.py | 53 ++++++++++++++++++++++++++++++++++-------- tests/v3/test_group.py | 52 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 91 insertions(+), 14 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 86d27e3a97..4becaf940b 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -424,20 +424,43 @@ async def update_attributes(self, new_attributes: dict[str, Any]) -> AsyncGroup: def __repr__(self) -> str: return f"" - async def nmembers(self) -> int: + async def nmembers(self, recursive: bool = False) -> int: + """ + Count the number of members in this group. 
+ + Parameters + ---------- + recursive : bool, default False + Whether to recursively count arrays and groups in child groups of + this Group. By default, just immediate child array and group members + are counted. + + Returns + ------- + count : int + """ # TODO: consider using aioitertools.builtins.sum for this # return await aioitertools.builtins.sum((1 async for _ in self.members()), start=0) n = 0 - async for _ in self.members(): + async for _ in self.members(recursive=recursive): n += 1 return n - async def members(self) -> AsyncGenerator[tuple[str, AsyncArray | AsyncGroup], None]: + async def members( + self, recursive: bool = False + ) -> AsyncGenerator[tuple[str, AsyncArray | AsyncGroup], None]: """ Returns an AsyncGenerator over the arrays and groups contained in this group. This method requires that `store_path.store` supports directory listing. The results are not guaranteed to be ordered. + + Parameters + ---------- + recursive : bool, default False + Whether to recursively include arrays and groups in child groups of + this Group. By default, just immediate child array and group members + are included. 
""" if not self.store_path.store.supports_listing: msg = ( @@ -456,7 +479,19 @@ async def members(self) -> AsyncGenerator[tuple[str, AsyncArray | AsyncGroup], N if key in _skip_keys: continue try: - yield (key, await self.getitem(key)) + obj = await self.getitem(key) + yield (key, obj) + + if ( + recursive + and hasattr(obj.metadata, "node_type") + and obj.metadata.node_type == "group" + ): + # the assert is just for mypy to know that `obj.metadata.node_type` + # implies an AsyncGroup, not an AsyncArray + assert isinstance(obj, AsyncGroup) + async for child_key, val in obj.members(recursive=recursive): + yield "/".join([key, child_key]), val except KeyError: # keyerror is raised when `key` names an object (in the object storage sense), # as opposed to a prefix, in the store under the prefix associated with this group @@ -628,17 +663,15 @@ def update_attributes(self, new_attributes: dict[str, Any]) -> Group: self._sync(self._async_group.update_attributes(new_attributes)) return self - @property - def nmembers(self) -> int: - return self._sync(self._async_group.nmembers()) + def nmembers(self, recursive: bool = False) -> int: + return self._sync(self._async_group.nmembers(recursive=recursive)) - @property - def members(self) -> tuple[tuple[str, Array | Group], ...]: + def members(self, recursive: bool = False) -> tuple[tuple[str, Array | Group], ...]: """ Return the sub-arrays and sub-groups of this group as a tuple of (name, array | group) pairs """ - _members = self._sync_iter(self._async_group.members()) + _members = self._sync_iter(self._async_group.members(recursive=recursive)) result = tuple(map(lambda kv: (kv[0], _parse_async_node(kv[1])), _members)) return result diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index 39921c26d8..efc53352fe 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -88,7 +88,7 @@ def test_group_members(store: MemoryStore | LocalStore, zarr_format: ZarrFormat) members_expected["subgroup"] = 
group.create_group("subgroup") # make a sub-sub-subgroup, to ensure that the children calculation doesn't go # too deep in the hierarchy - _ = members_expected["subgroup"].create_group("subsubgroup") # type: ignore + subsubgroup = members_expected["subgroup"].create_group("subsubgroup") # type: ignore members_expected["subarray"] = group.create_array( "subarray", shape=(100,), dtype="uint8", chunk_shape=(10,), exists_ok=True @@ -101,7 +101,13 @@ def test_group_members(store: MemoryStore | LocalStore, zarr_format: ZarrFormat) # this creates a directory with a random key in it # this should not show up as a member sync(store.set(f"{path}/extra_directory/extra_object-2", Buffer.from_bytes(b"000000"))) - members_observed = group.members + members_observed = group.members() + # members are not guaranteed to be ordered, so sort before comparing + assert sorted(dict(members_observed)) == sorted(members_expected) + + # recursive=True + members_observed = group.members(recursive=True) + members_expected["subgroup/subsubgroup"] = subsubgroup # members are not guaranteed to be ordered, so sort before comparing assert sorted(dict(members_observed)) == sorted(members_expected) @@ -349,7 +355,8 @@ def test_group_create_array( if method == "create_array": array = group.create_array(name="array", shape=shape, dtype=dtype, data=data) elif method == "array": - array = group.array(name="array", shape=shape, dtype=dtype, data=data) + with pytest.warns(DeprecationWarning): + array = group.array(name="array", shape=shape, dtype=dtype, data=data) else: raise AssertionError @@ -358,7 +365,7 @@ def test_group_create_array( with pytest.raises(ContainsArrayError): group.create_array(name="array", shape=shape, dtype=dtype, data=data) elif method == "array": - with pytest.raises(ContainsArrayError): + with pytest.raises(ContainsArrayError), pytest.warns(DeprecationWarning): group.array(name="array", shape=shape, dtype=dtype, data=data) assert array.shape == shape assert array.dtype == 
np.dtype(dtype) @@ -653,3 +660,40 @@ async def test_asyncgroup_update_attributes( agroup_new_attributes = await agroup.update_attributes(attributes_new) assert agroup_new_attributes.attrs == attributes_new + + +async def test_group_members_async(store: LocalStore | MemoryStore): + group = AsyncGroup( + GroupMetadata(), + store_path=StorePath(store=store, path="root"), + ) + a0 = await group.create_array("a0", (1,)) + g0 = await group.create_group("g0") + a1 = await g0.create_array("a1", (1,)) + g1 = await g0.create_group("g1") + a2 = await g1.create_array("a2", (1,)) + g2 = await g1.create_group("g2") + + # immediate children + children = sorted([x async for x in group.members()], key=lambda x: x[0]) + assert children == [ + ("a0", a0), + ("g0", g0), + ] + + nmembers = await group.nmembers() + assert nmembers == 2 + + all_children = sorted([x async for x in group.members(recursive=True)], key=lambda x: x[0]) + expected = [ + ("a0", a0), + ("g0", g0), + ("g0/a1", a1), + ("g0/g1", g1), + ("g0/g1/a2", a2), + ("g0/g1/g2", g2), + ] + assert all_children == expected + + nmembers = await group.nmembers(recursive=True) + assert nmembers == 6 From 65a8bd4253fbca1b809b4e97701fecdc86d1eaac Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 22 Aug 2024 21:06:20 -0500 Subject: [PATCH 04/62] Zarr-v3 Consolidated Metadata Implements the optional Consolidated Metadata feature of zarr-v3. 
--- src/zarr/api/asynchronous.py | 51 +++++++- src/zarr/codecs/crc32c_.py | 7 +- src/zarr/core/array.py | 8 +- src/zarr/core/common.py | 23 ++++ src/zarr/core/group.py | 71 ++++++++++- src/zarr/core/metadata.py | 23 +--- tests/v3/test_metadata.py | 227 +++++++++++++++++++++++++++++++++++ 7 files changed, 374 insertions(+), 36 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index ad89584b44..0dd2e76f02 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import dataclasses import warnings from collections.abc import Iterable from typing import Any, Literal, Union, cast @@ -12,8 +13,14 @@ from zarr.core.array import Array, AsyncArray from zarr.core.buffer import NDArrayLike from zarr.core.chunk_key_encodings import ChunkKeyEncoding -from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat -from zarr.core.group import AsyncGroup +from zarr.core.common import ( + JSON, + AccessModeLiteral, + ChunkCoords, + MemoryOrder, + ZarrFormat, +) +from zarr.core.group import AsyncGroup, ConsolidatedMetadata from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata from zarr.store import ( StoreLike, @@ -126,8 +133,38 @@ def _default_zarr_version() -> ZarrFormat: return 3 -async def consolidate_metadata(*args: Any, **kwargs: Any) -> AsyncGroup: - raise NotImplementedError +async def consolidate_metadata(store: StoreLike) -> AsyncGroup: + """ + Consolidate the metadata of all nodes in a hierarchy. + + Upon completion, the metadata of the root node in the Zarr hierarchy will be + updated to include all the metadata of child nodes. + + Parameters + ---------- + store: StoreLike + The store-like object whose metadata you wish to consolidate. + + Returns + ------- + group: AsyncGroup + The group, with the ``consolidated_metadata`` field set to include + the metadata of each child node. 
+ """ + group = await AsyncGroup.open(store) + members = dict([x async for x in group.members(recursive=True)]) + members_metadata = {} + + members_metadata = {k: v.metadata for k, v in members.items()} + + consolidated_metadata = ConsolidatedMetadata(metadata=members_metadata) + metadata = dataclasses.replace(group.metadata, consolidated_metadata=consolidated_metadata) + group = dataclasses.replace( + group, + metadata=metadata, + ) + await group._save_metadata() + return group async def copy(*args: Any, **kwargs: Any) -> tuple[int, int, int]: @@ -229,7 +266,7 @@ async def open( async def open_consolidated(*args: Any, **kwargs: Any) -> AsyncGroup: - raise NotImplementedError + return await open_group(*args, **kwargs) async def save( @@ -703,7 +740,9 @@ async def create( ) else: warnings.warn( - "dimension_separator is not yet implemented", RuntimeWarning, stacklevel=2 + "dimension_separator is not yet implemented", + RuntimeWarning, + stacklevel=2, ) if write_empty_chunks: warnings.warn("write_empty_chunks is not yet implemented", RuntimeWarning, stacklevel=2) diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py index d40b95ef0c..f9e9ce4d1a 100644 --- a/src/zarr/codecs/crc32c_.py +++ b/src/zarr/codecs/crc32c_.py @@ -1,9 +1,10 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import numpy as np +import typing_extensions from crc32c import crc32c from zarr.abc.codec import BytesBytesCodec @@ -37,7 +38,7 @@ async def _decode_single( crc32_bytes = data[-4:] inner_bytes = data[:-4] - computed_checksum = np.uint32(crc32c(inner_bytes)).tobytes() + computed_checksum = np.uint32(crc32c(cast(typing_extensions.Buffer, inner_bytes))).tobytes() stored_checksum = bytes(crc32_bytes) if computed_checksum != stored_checksum: raise ValueError( @@ -52,7 +53,7 @@ async def _encode_single( ) -> Buffer | None: data = chunk_bytes.as_numpy_array() # Calculate the checksum and 
"cast" it to a numpy array - checksum = np.array([crc32c(data)], dtype=np.uint32) + checksum = np.array([crc32c(cast(typing_extensions.Buffer, data))], dtype=np.uint32) # Append the checksum (as bytes) to the data return chunk_spec.prototype.buffer.from_array_like(np.append(data, checksum.view("b"))) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 3738b0859b..ce5dcd2b2e 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -372,9 +372,15 @@ async def open( else: # V3 arrays are comprised of a zarr.json object assert zarr_json_bytes is not None + zarr_metadata = json.loads(zarr_json_bytes.to_bytes()) + if zarr_metadata.get("node_type") != "array": + # This KeyError is load bearing for `open`. That currently tries + # to open the node as an `array` and then falls back to opening + # as a group. + raise KeyError return cls( store_path=store_path, - metadata=ArrayV3Metadata.from_dict(json.loads(zarr_json_bytes.to_bytes())), + metadata=ArrayV3Metadata.from_dict(zarr_metadata), ) @property diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index aaa30cfcb8..4f53c27434 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -16,6 +16,8 @@ overload, ) +import numcodecs + if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterator @@ -167,3 +169,24 @@ def parse_order(data: Any) -> Literal["C", "F"]: if data in ("C", "F"): return cast(Literal["C", "F"], data) raise ValueError(f"Expected one of ('C', 'F'), got {data} instead.") + + +def _json_convert(o: Any) -> Any: + if isinstance(o, np.dtype): + return str(o) + if np.isscalar(o): + # convert numpy scalar to python type, and pass + # python types through + out = getattr(o, "item", lambda: o)() + if isinstance(out, complex): + # python complex types are not JSON serializable, so we use the + # serialization defined in the zarr v3 spec + return [out.real, out.imag] + return out + if isinstance(o, Enum): + return o.name + # this serializes numcodecs 
compressors + # todo: implement to_dict for codecs + elif isinstance(o, numcodecs.abc.Codec): + config: dict[str, Any] = o.get_config() + return config diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 4becaf940b..126a8ba94c 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -25,8 +25,10 @@ ZGROUP_JSON, ChunkCoords, ZarrFormat, + _json_convert, ) from zarr.core.config import config +from zarr.core.metadata import ArrayMetadata, ArrayV3Metadata from zarr.core.sync import SyncMixin, sync from zarr.store import StoreLike, StorePath, make_store_path from zarr.store.common import ensure_no_existing_node @@ -77,10 +79,57 @@ def _parse_async_node(node: AsyncArray | AsyncGroup) -> Array | Group: raise TypeError(f"Unknown node type, got {type(node)}") +@dataclass(frozen=True) +class ConsolidatedMetadata: + metadata: dict[str, ArrayMetadata | GroupMetadata] + kind: Literal["inline"] = "inline" + must_understand: Literal[False] = False + + def to_dict(self) -> dict[str, JSON]: + return { + "kind": self.kind, + "must_understand": self.must_understand, + "metadata": {k: v.to_dict() for k, v in self.metadata.items()}, + } + + @classmethod + def from_dict(cls, data: dict[str, JSON]) -> ConsolidatedMetadata: + data = dict(data) + raw_metadata = data.get("metadata") + if not isinstance(raw_metadata, dict): + raise TypeError("Unexpected type for 'metadata'") + + elif not raw_metadata: + raise ValueError("Must specify metadata") + + metadata: dict[str, ArrayMetadata | GroupMetadata] + if raw_metadata: + metadata = {} + for k, v in raw_metadata.items(): + if not isinstance(v, dict): + raise TypeError(f"Invalid value for metadata items. 
key={k}, type={type(v)}") + + node_type = v.get("node_type", None) + if node_type == "group": + metadata[k] = GroupMetadata.from_dict(v) + elif node_type == "array": + metadata[k] = ArrayV3Metadata.from_dict(v) + else: + raise ValueError(f"Invalid node_type: '{node_type}'") + # assert data["kind"] == "inline" + if data["kind"] != "inline": + raise ValueError + + if data["must_understand"] is not False: + raise ValueError + return cls(metadata=metadata) + + @dataclass(frozen=True) class GroupMetadata(Metadata): attributes: dict[str, Any] = field(default_factory=dict) zarr_format: ZarrFormat = 3 + consolidated_metadata: ConsolidatedMetadata | None = None node_type: Literal["group"] = field(default="group", init=False) def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: @@ -88,7 +137,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: if self.zarr_format == 3: return { ZARR_JSON: prototype.buffer.from_bytes( - json.dumps(self.to_dict(), indent=json_indent).encode() + json.dumps(self.to_dict(), default=_json_convert, indent=json_indent).encode() ) } else: @@ -101,20 +150,33 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: ), } - def __init__(self, attributes: dict[str, Any] | None = None, zarr_format: ZarrFormat = 3): + def __init__( + self, + attributes: dict[str, Any] | None = None, + zarr_format: ZarrFormat = 3, + consolidated_metadata: ConsolidatedMetadata | None = None, + ): attributes_parsed = parse_attributes(attributes) zarr_format_parsed = parse_zarr_format(zarr_format) object.__setattr__(self, "attributes", attributes_parsed) object.__setattr__(self, "zarr_format", zarr_format_parsed) + object.__setattr__(self, "consolidated_metadata", consolidated_metadata) @classmethod def from_dict(cls, data: dict[str, Any]) -> GroupMetadata: + data = dict(data) assert data.pop("node_type", None) in ("group", None) + consolidated_metadata = data.pop("consolidated_metadata", None) + if 
consolidated_metadata: + data["consolidated_metadata"] = ConsolidatedMetadata.from_dict(consolidated_metadata) return cls(**data) def to_dict(self) -> dict[str, Any]: - return asdict(self) + result = asdict(replace(self, consolidated_metadata=None)) + if self.consolidated_metadata: + result["consolidated_metadata"] = self.consolidated_metadata.to_dict() + return result @dataclass(frozen=True) @@ -497,7 +559,8 @@ async def members( # as opposed to a prefix, in the store under the prefix associated with this group # in which case `key` cannot be the name of a sub-array or sub-group. logger.warning( - "Object at %s is not recognized as a component of a Zarr hierarchy.", key + "Object at %s is not recognized as a component of a Zarr hierarchy.", + key, ) async def contains(self, member: str) -> bool: diff --git a/src/zarr/core/metadata.py b/src/zarr/core/metadata.py index d541e43205..0f3c444d28 100644 --- a/src/zarr/core/metadata.py +++ b/src/zarr/core/metadata.py @@ -20,7 +20,6 @@ if TYPE_CHECKING: from typing_extensions import Self -import numcodecs.abc from zarr.core.array_spec import ArraySpec from zarr.core.common import ( @@ -30,6 +29,7 @@ ZATTRS_JSON, ChunkCoords, ZarrFormat, + _json_convert, parse_dtype, parse_named_configuration, parse_shapelike, @@ -252,27 +252,6 @@ def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: return self.chunk_key_encoding.encode_chunk_key(chunk_coords) def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: - def _json_convert(o: Any) -> Any: - if isinstance(o, np.dtype): - return str(o) - if np.isscalar(o): - # convert numpy scalar to python type, and pass - # python types through - out = getattr(o, "item", lambda: o)() - if isinstance(out, complex): - # python complex types are not JSON serializable, so we use the - # serialization defined in the zarr v3 spec - return [out.real, out.imag] - return out - if isinstance(o, Enum): - return o.name - # this serializes numcodecs compressors - # todo: implement 
to_dict for codecs - elif isinstance(o, numcodecs.abc.Codec): - config: dict[str, Any] = o.get_config() - return config - raise TypeError - json_indent = config.get("json_indent") return { ZARR_JSON: prototype.buffer.from_bytes( diff --git a/tests/v3/test_metadata.py b/tests/v3/test_metadata.py index e69de29bb2..dab5ecfde2 100644 --- a/tests/v3/test_metadata.py +++ b/tests/v3/test_metadata.py @@ -0,0 +1,227 @@ +import numpy as np + +import zarr.api.synchronous +from zarr.abc.store import Store +from zarr.api.asynchronous import ( + AsyncGroup, + consolidate_metadata, + group, + open, + open_consolidated, +) +from zarr.core.array import ArrayV3Metadata +from zarr.core.group import ConsolidatedMetadata, GroupMetadata + + +async def test_consolidated(memory_store: Store) -> None: + # TODO: Figure out desired keys in + # TODO: variety in the hierarchies + # More nesting + # arrays under arrays + # single array + # etc. + g = await group(store=memory_store, attributes={"foo": "bar"}) + await g.create_array(path="air", shape=(1, 2, 3)) + await g.create_array(path="lat", shape=(1,)) + await g.create_array(path="lon", shape=(2,)) + await g.create_array(path="time", shape=(3,)) + + child = await g.create_group("child", attributes={"key": "child"}) + await child.create_array("array", shape=(4, 4), attributes={"key": "child"}) + + grandchild = await child.create_group("grandchild", attributes={"key": "grandchild"}) + await grandchild.create_array("array", shape=(4, 4), attributes={"key": "grandchild"}) + + await consolidate_metadata(memory_store) + group2 = await AsyncGroup.open(memory_store) + + array_metadata = { + "attributes": {}, + "chunk_key_encoding": { + "configuration": {"separator": "/"}, + "name": "default", + }, + "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "data_type": np.dtype("float64"), + "fill_value": np.float64(0.0), + "node_type": "array", + # "shape": (1, 2, 3), + "zarr_format": 3, + } + + expected = GroupMetadata( + 
attributes={"foo": "bar"}, + consolidated_metadata=ConsolidatedMetadata( + kind="inline", + must_understand=False, + metadata={ + "air": ArrayV3Metadata.from_dict( + { + **{ + "shape": (1, 2, 3), + "chunk_grid": { + "configuration": {"chunk_shape": (1, 2, 3)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + "lat": ArrayV3Metadata.from_dict( + { + **{ + "shape": (1,), + "chunk_grid": { + "configuration": {"chunk_shape": (1,)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + "lon": ArrayV3Metadata.from_dict( + { + **{"shape": (2,)}, + "chunk_grid": { + "configuration": {"chunk_shape": (2,)}, + "name": "regular", + }, + **array_metadata, + } + ), + "time": ArrayV3Metadata.from_dict( + { + **{ + "shape": (3,), + "chunk_grid": { + "configuration": {"chunk_shape": (3,)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + "child": GroupMetadata(attributes={"key": "child"}), + "child/array": ArrayV3Metadata.from_dict( + { + **array_metadata, + **{ + "attributes": {"key": "child"}, + "shape": (4, 4), + "chunk_grid": { + "configuration": {"chunk_shape": (4, 4)}, + "name": "regular", + }, + }, + } + ), + "child/grandchild": GroupMetadata(attributes={"key": "grandchild"}), + "child/grandchild/array": ArrayV3Metadata.from_dict( + { + **array_metadata, + **{ + "attributes": {"key": "grandchild"}, + "shape": (4, 4), + "chunk_grid": { + "configuration": {"chunk_shape": (4, 4)}, + "name": "regular", + }, + }, + } + ), + }, + ), + ) + assert group2.metadata == expected + group3 = await open(store=memory_store) + assert group3.metadata == expected + + group4 = await open_consolidated(store=memory_store) + assert group4.metadata == expected + + +def test_consolidated_sync(memory_store): + g = zarr.api.synchronous.group(store=memory_store, attributes={"foo": "bar"}) + g.create_array(name="air", shape=(1, 2, 3)) + g.create_array(name="lat", shape=(1,)) + g.create_array(name="lon", shape=(2,)) + g.create_array(name="time", shape=(3,)) + + 
zarr.api.synchronous.consolidate_metadata(memory_store) + group2 = zarr.api.synchronous.Group.open(memory_store) + + array_metadata = { + "attributes": {}, + "chunk_key_encoding": { + "configuration": {"separator": "/"}, + "name": "default", + }, + "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "data_type": np.dtype("float64"), + "fill_value": np.float64(0.0), + "node_type": "array", + # "shape": (1, 2, 3), + "zarr_format": 3, + } + + expected = GroupMetadata( + attributes={"foo": "bar"}, + consolidated_metadata=ConsolidatedMetadata( + kind="inline", + must_understand=False, + metadata={ + "air": ArrayV3Metadata.from_dict( + { + **{ + "shape": (1, 2, 3), + "chunk_grid": { + "configuration": {"chunk_shape": (1, 2, 3)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + "lat": ArrayV3Metadata.from_dict( + { + **{ + "shape": (1,), + "chunk_grid": { + "configuration": {"chunk_shape": (1,)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + "lon": ArrayV3Metadata.from_dict( + { + **{"shape": (2,)}, + "chunk_grid": { + "configuration": {"chunk_shape": (2,)}, + "name": "regular", + }, + **array_metadata, + } + ), + "time": ArrayV3Metadata.from_dict( + { + **{ + "shape": (3,), + "chunk_grid": { + "configuration": {"chunk_shape": (3,)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + }, + ), + ) + assert group2.metadata == expected + group3 = zarr.api.synchronous.open(store=memory_store) + assert group3.metadata == expected + + group4 = zarr.api.synchronous.open_consolidated(store=memory_store) + assert group4.metadata == expected From 5a31390ba5fe11f503dd31fbfaf2b343c587aff7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 10 Sep 2024 13:32:28 -0500 Subject: [PATCH 05/62] fixup --- src/zarr/core/common.py | 9 ++++++++- tests/v3/test_metadata.py | 8 ++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index dbaa267ca4..cbfe5b8901 100644 
--- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -178,7 +178,14 @@ def _json_convert(o: Any) -> Any: if np.isscalar(o): # convert numpy scalar to python type, and pass # python types through - out = getattr(o, "item", lambda: o)() + if hasattr(o, "dtype") and o.dtype.kind == "M" and hasattr(o, "view"): + # https://github.com/zarr-developers/zarr-python/issues/2119 + # `.item()` on a datetime type might or might not return an + # integer, depending on the value. + # Explicitly cast to an int first, and then grab .item() + out = o.view("i8").item() + else: + out = getattr(o, "item", lambda: o)() if isinstance(out, complex): # python complex types are not JSON serializable, so we use the # serialization defined in the zarr v3 spec diff --git a/tests/v3/test_metadata.py b/tests/v3/test_metadata.py index dab5ecfde2..42c5a6a697 100644 --- a/tests/v3/test_metadata.py +++ b/tests/v3/test_metadata.py @@ -21,10 +21,10 @@ async def test_consolidated(memory_store: Store) -> None: # single array # etc. 
g = await group(store=memory_store, attributes={"foo": "bar"}) - await g.create_array(path="air", shape=(1, 2, 3)) - await g.create_array(path="lat", shape=(1,)) - await g.create_array(path="lon", shape=(2,)) - await g.create_array(path="time", shape=(3,)) + await g.create_array(name="air", shape=(1, 2, 3)) + await g.create_array(name="lat", shape=(1,)) + await g.create_array(name="lon", shape=(2,)) + await g.create_array(name="time", shape=(3,)) child = await g.create_group("child", attributes={"key": "child"}) await child.create_array("array", shape=(4, 4), attributes={"key": "child"}) From fc901eb5be0e4aa113cd913feb6a135223ba9bb0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 11 Sep 2024 14:11:40 -0500 Subject: [PATCH 06/62] read zarr-v2 consolidated metadata --- src/zarr/api/asynchronous.py | 6 +- src/zarr/core/common.py | 1 + src/zarr/core/group.py | 58 ++++++++++++- tests/v3/conftest.py | 9 +++ tests/v3/test_metadata/test_v2.py | 130 ++++++++++++++++++++++++++++-- 5 files changed, 194 insertions(+), 10 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 3891a973d1..b70deea7c3 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -271,6 +271,7 @@ async def open( async def open_consolidated(*args: Any, **kwargs: Any) -> AsyncGroup: + kwargs.setdefault("open_consolidated", True) return await open_group(*args, **kwargs) @@ -531,6 +532,7 @@ async def open_group( zarr_format: ZarrFormat | None = None, meta_array: Any | None = None, # not used attributes: dict[str, JSON] | None = None, + open_consolidated: bool = False, ) -> AsyncGroup: """Open a group using file-mode-like semantics. 
@@ -590,7 +592,9 @@ async def open_group( attributes = {} try: - return await AsyncGroup.open(store_path, zarr_format=zarr_format) + return await AsyncGroup.open( + store_path, zarr_format=zarr_format, open_consolidated=open_consolidated + ) except (KeyError, FileNotFoundError): return await AsyncGroup.create( store_path, zarr_format=zarr_format, exists_ok=True, attributes=attributes diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index cbfe5b8901..2e63edf49c 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -28,6 +28,7 @@ ZARRAY_JSON = ".zarray" ZGROUP_JSON = ".zgroup" ZATTRS_JSON = ".zattrs" +ZMETADATA_v2_JSON = ".zmetadata" BytesLike = bytes | bytearray | memoryview ShapeLike = tuple[int, ...] | int diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 268f25e149..4042696f19 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -3,6 +3,7 @@ import asyncio import json import logging +from collections import defaultdict from dataclasses import asdict, dataclass, field, replace from typing import TYPE_CHECKING, Literal, cast, overload @@ -28,7 +29,7 @@ parse_shapelike, ) from zarr.core.config import config -from zarr.core.metadata import ArrayMetadata, ArrayV3Metadata +from zarr.core.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata from zarr.core.sync import SyncMixin, sync from zarr.store import StoreLike, StorePath, make_store_path from zarr.store.common import ensure_no_existing_node @@ -126,7 +127,14 @@ def from_dict(cls, data: dict[str, JSON]) -> ConsolidatedMetadata: elif node_type == "array": metadata[k] = ArrayV3Metadata.from_dict(v) else: - raise ValueError(f"Invalid node_type: '{node_type}'") + # We either have V2 metadata, or invalid metadata + if "shape" in v: + # probably ArrayV2Metadata + metadata[k] = ArrayV2Metadata.from_dict(v) + else: + # probably v2 Group metadata + metadata[k] = GroupMetadata.from_dict(v) + # assert data["kind"] == "inline" if data["kind"] != "inline": 
raise ValueError @@ -226,15 +234,26 @@ async def open( cls, store: StoreLike, zarr_format: Literal[2, 3, None] = 3, + open_consolidated: bool = False, ) -> AsyncGroup: store_path = await make_store_path(store) if zarr_format == 2: - zgroup_bytes, zattrs_bytes = await asyncio.gather( - (store_path / ZGROUP_JSON).get(), (store_path / ZATTRS_JSON).get() + paths = [store_path / ZGROUP_JSON, store_path / ZATTRS_JSON] + if open_consolidated: + paths.append(store_path / ".zmetadata") # todo: configurable + + zgroup_bytes, zattrs_bytes, *rest = await asyncio.gather( + *[path.get() for path in paths] ) if zgroup_bytes is None: raise FileNotFoundError(store_path) + + if open_consolidated: + consolidated_metadata_bytes = rest[0] + if consolidated_metadata_bytes is None: + raise FileNotFoundError(paths[-1]) + elif zarr_format == 3: zarr_json_bytes = await (store_path / ZARR_JSON).get() if zarr_json_bytes is None: @@ -265,6 +284,37 @@ async def open( zgroup = json.loads(zgroup_bytes.to_bytes()) zattrs = json.loads(zattrs_bytes.to_bytes()) if zattrs_bytes is not None else {} group_metadata = {**zgroup, "attributes": zattrs} + + if open_consolidated: + # this *should* be defined. + assert consolidated_metadata_bytes is not None # already checked above + + v2_consolidated_metadata = json.loads(consolidated_metadata_bytes.to_bytes()) + v2_consolidated_metadata = v2_consolidated_metadata["metadata"] + # We already read zattrs and zgroup. Should we ignore these? 
+ v2_consolidated_metadata.pop(".zattrs") + v2_consolidated_metadata.pop(".zgroup") + + consolidated_metadata: defaultdict[str, dict[str, Any]] = defaultdict(dict) + + # keys like air/.zarray, air/.zattrs + for k, v in v2_consolidated_metadata.items(): + path, kind = k.rsplit("/.", 1) + + if kind == "zarray": + consolidated_metadata[path].update(v) + elif kind == "zattrs": + consolidated_metadata[path]["attributes"] = v + elif kind == "zgroup": + consolidated_metadata[path].update(v) + else: + raise ValueError(f"Invalid file type '{kind}' at path '{path}") + group_metadata["consolidated_metadata"] = { + "metadata": dict(consolidated_metadata), + "kind": "inline", + "must_understand": False, + } + else: # V3 groups are comprised of a zarr.json object assert zarr_json_bytes is not None diff --git a/tests/v3/conftest.py b/tests/v3/conftest.py index b1308f058f..0b9d26f92f 100644 --- a/tests/v3/conftest.py +++ b/tests/v3/conftest.py @@ -24,6 +24,9 @@ from zarr.core.common import ChunkCoords, MemoryOrder, ZarrFormat +HERE = pathlib.Path(__file__).parent + + async def parse_store( store: Literal["local", "memory", "remote"], path: str ) -> LocalStore | MemoryStore | RemoteStore: @@ -91,6 +94,12 @@ async def async_group(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> As return agroup +@pytest.fixture(scope="function") +async def complex_hierarchy() -> LocalStore: + root = HERE / "examples/complex-hierarchy.zarr" + return LocalStore(root=root) + + @pytest.fixture(params=["numpy", "cupy"]) def xp(request: pytest.FixtureRequest) -> Iterator[ModuleType]: """Fixture to parametrize over numpy-like libraries""" diff --git a/tests/v3/test_metadata/test_v2.py b/tests/v3/test_metadata/test_v2.py index 0053de08ca..afad93c29c 100644 --- a/tests/v3/test_metadata/test_v2.py +++ b/tests/v3/test_metadata/test_v2.py @@ -1,17 +1,22 @@ from __future__ import annotations +import json from typing import TYPE_CHECKING, Literal -if TYPE_CHECKING: - from typing import Any - - from 
zarr.abc.codec import Codec - +import numpy as np import pytest +import zarr.store from zarr.codecs import GzipCodec +from zarr.core.buffer import cpu +from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV2Metadata, parse_zarr_format_v2 +if TYPE_CHECKING: + from typing import Any + + from zarr.abc.codec import Codec + def test_parse_zarr_format_valid() -> None: assert parse_zarr_format_v2(2) == 2 @@ -70,3 +75,118 @@ def test_metadata_to_dict( observed.pop("dimension_separator") assert observed == expected + + +async def test_read_consolidated_metadata(): + # .zgroup, .zattrs, .metadata + zmetadata = { + "metadata": { + ".zattrs": { + "Conventions": "COARDS", + }, + ".zgroup": {"zarr_format": 2}, + "air/.zarray": { + "chunks": [730], + "compressor": {}, + "dtype": " Date: Thu, 12 Sep 2024 07:26:07 -0500 Subject: [PATCH 07/62] check writablem --- src/zarr/api/asynchronous.py | 2 ++ tests/v3/test_metadata/test_v3.py | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index b70deea7c3..83f881d554 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -157,6 +157,8 @@ async def consolidate_metadata(store: StoreLike) -> AsyncGroup: the metadata of each child node. 
""" group = await AsyncGroup.open(store) + group.store_path.store._check_writable() + members = dict([x async for x in group.members(max_depth=None)]) members_metadata = {} diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py index 23163083ae..44870234e8 100644 --- a/tests/v3/test_metadata/test_v3.py +++ b/tests/v3/test_metadata/test_v3.py @@ -480,3 +480,10 @@ def test_consolidated_sync(memory_store): group4 = zarr.api.synchronous.open_consolidated(store=memory_store) assert group4.metadata == expected + + +async def test_not_writable_raises(memory_store: zarr.store.MemoryStore) -> None: + await group(store=memory_store, attributes={"foo": "bar"}) + read_store = zarr.store.MemoryStore(store_dict=memory_store._store_dict) + with pytest.raises(ValueError, match="does not support writing"): + await consolidate_metadata(read_store) From 78af362fe887a516d6b82fcb002dd7eb10d76fc3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Sep 2024 07:50:57 -0500 Subject: [PATCH 08/62] Handle non-root paths --- src/zarr/api/asynchronous.py | 15 +- tests/v3/test_metadata.py | 236 ++++++++++++++++++++++++++++++ tests/v3/test_metadata/test_v3.py | 38 +++-- 3 files changed, 276 insertions(+), 13 deletions(-) create mode 100644 tests/v3/test_metadata.py diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 83f881d554..7ba667d69d 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -138,7 +138,7 @@ def _default_zarr_version() -> ZarrFormat: return 3 -async def consolidate_metadata(store: StoreLike) -> AsyncGroup: +async def consolidate_metadata(store: StoreLike, path: str | None = None) -> AsyncGroup: """ Consolidate the metadata of all nodes in a hierarchy. @@ -149,6 +149,12 @@ async def consolidate_metadata(store: StoreLike) -> AsyncGroup: ---------- store: StoreLike The store-like object whose metadata you wish to consolidate. 
+ path: str, optional + A path to a group in the store to consolidate at. Only children + below that group will be consolidated. + + By default, the root node is used so all the metadata in the + store is consolidated. Returns ------- @@ -156,7 +162,12 @@ async def consolidate_metadata(store: StoreLike) -> AsyncGroup: The group, with the ``consolidated_metadata`` field set to include the metadata of each child node. """ - group = await AsyncGroup.open(store) + store_path = await make_store_path(store) + + if path is not None: + store_path = store_path / path + + group = await AsyncGroup.open(store_path) group.store_path.store._check_writable() members = dict([x async for x in group.members(max_depth=None)]) diff --git a/tests/v3/test_metadata.py b/tests/v3/test_metadata.py new file mode 100644 index 0000000000..a0cc2b3b88 --- /dev/null +++ b/tests/v3/test_metadata.py @@ -0,0 +1,236 @@ +import numpy as np +import pytest + +import zarr.api.synchronous +import zarr.store +from zarr.abc.store import Store +from zarr.api.asynchronous import ( + AsyncGroup, + consolidate_metadata, + group, + open, + open_consolidated, +) +from zarr.core.array import ArrayV3Metadata +from zarr.core.group import ConsolidatedMetadata, GroupMetadata + + +async def test_consolidated(memory_store_with_hierarchy: Store) -> None: + # TODO: Figure out desired keys in + # TODO: variety in the hierarchies + # More nesting + # arrays under arrays + # single array + # etc. 
+ g = await group(store=memory_store_with_hierarchy, attributes={"foo": "bar"}) + await g.create_array(name="air", shape=(1, 2, 3)) + await g.create_array(name="lat", shape=(1,)) + await g.create_array(name="lon", shape=(2,)) + await g.create_array(name="time", shape=(3,)) + + child = await g.create_group("child", attributes={"key": "child"}) + await child.create_array("array", shape=(4, 4), attributes={"key": "child"}) + + grandchild = await child.create_group("grandchild", attributes={"key": "grandchild"}) + await grandchild.create_array("array", shape=(4, 4), attributes={"key": "grandchild"}) + + await consolidate_metadata(memory_store_with_hierarchy) + group2 = await AsyncGroup.open(memory_store_with_hierarchy) + + array_metadata = { + "attributes": {}, + "chunk_key_encoding": { + "configuration": {"separator": "/"}, + "name": "default", + }, + "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "data_type": np.dtype("float64"), + "fill_value": np.float64(0.0), + "node_type": "array", + # "shape": (1, 2, 3), + "zarr_format": 3, + } + + expected = GroupMetadata( + attributes={"foo": "bar"}, + consolidated_metadata=ConsolidatedMetadata( + kind="inline", + must_understand=False, + metadata={ + "air": ArrayV3Metadata.from_dict( + { + **{ + "shape": (1, 2, 3), + "chunk_grid": { + "configuration": {"chunk_shape": (1, 2, 3)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + "lat": ArrayV3Metadata.from_dict( + { + **{ + "shape": (1,), + "chunk_grid": { + "configuration": {"chunk_shape": (1,)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + "lon": ArrayV3Metadata.from_dict( + { + **{"shape": (2,)}, + "chunk_grid": { + "configuration": {"chunk_shape": (2,)}, + "name": "regular", + }, + **array_metadata, + } + ), + "time": ArrayV3Metadata.from_dict( + { + **{ + "shape": (3,), + "chunk_grid": { + "configuration": {"chunk_shape": (3,)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + "child": 
GroupMetadata(attributes={"key": "child"}), + "child/array": ArrayV3Metadata.from_dict( + { + **array_metadata, + **{ + "attributes": {"key": "child"}, + "shape": (4, 4), + "chunk_grid": { + "configuration": {"chunk_shape": (4, 4)}, + "name": "regular", + }, + }, + } + ), + "child/grandchild": GroupMetadata(attributes={"key": "grandchild"}), + "child/grandchild/array": ArrayV3Metadata.from_dict( + { + **array_metadata, + **{ + "attributes": {"key": "grandchild"}, + "shape": (4, 4), + "chunk_grid": { + "configuration": {"chunk_shape": (4, 4)}, + "name": "regular", + }, + }, + } + ), + }, + ), + ) + assert group2.metadata == expected + group3 = await open(store=memory_store_with_hierarchy) + assert group3.metadata == expected + + group4 = await open_consolidated(store=memory_store_with_hierarchy) + assert group4.metadata == expected + + +def test_consolidated_sync(memory_store_with_hierarchy): + g = zarr.api.synchronous.group(store=memory_store_with_hierarchy, attributes={"foo": "bar"}) + g.create_array(name="air", shape=(1, 2, 3)) + g.create_array(name="lat", shape=(1,)) + g.create_array(name="lon", shape=(2,)) + g.create_array(name="time", shape=(3,)) + + zarr.api.synchronous.consolidate_metadata(memory_store_with_hierarchy) + group2 = zarr.api.synchronous.Group.open(memory_store_with_hierarchy) + + array_metadata = { + "attributes": {}, + "chunk_key_encoding": { + "configuration": {"separator": "/"}, + "name": "default", + }, + "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "data_type": np.dtype("float64"), + "fill_value": np.float64(0.0), + "node_type": "array", + # "shape": (1, 2, 3), + "zarr_format": 3, + } + + expected = GroupMetadata( + attributes={"foo": "bar"}, + consolidated_metadata=ConsolidatedMetadata( + kind="inline", + must_understand=False, + metadata={ + "air": ArrayV3Metadata.from_dict( + { + **{ + "shape": (1, 2, 3), + "chunk_grid": { + "configuration": {"chunk_shape": (1, 2, 3)}, + "name": "regular", + }, + }, + 
**array_metadata, + } + ), + "lat": ArrayV3Metadata.from_dict( + { + **{ + "shape": (1,), + "chunk_grid": { + "configuration": {"chunk_shape": (1,)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + "lon": ArrayV3Metadata.from_dict( + { + **{"shape": (2,)}, + "chunk_grid": { + "configuration": {"chunk_shape": (2,)}, + "name": "regular", + }, + **array_metadata, + } + ), + "time": ArrayV3Metadata.from_dict( + { + **{ + "shape": (3,), + "chunk_grid": { + "configuration": {"chunk_shape": (3,)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + }, + ), + ) + assert group2.metadata == expected + group3 = zarr.api.synchronous.open(store=memory_store_with_hierarchy) + assert group3.metadata == expected + + group4 = zarr.api.synchronous.open_consolidated(store=memory_store_with_hierarchy) + assert group4.metadata == expected + + +async def test_not_writable_raises(memory_store_with_hierarchy: zarr.store.MemoryStore) -> None: + await group(store=memory_store_with_hierarchy, attributes={"foo": "bar"}) + read_store = zarr.store.MemoryStore(store_dict=memory_store_with_hierarchy._store_dict) + with pytest.raises(ValueError, match="writable"): + await consolidate_metadata(read_store) diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py index 44870234e8..c62e00ad1b 100644 --- a/tests/v3/test_metadata/test_v3.py +++ b/tests/v3/test_metadata/test_v3.py @@ -7,6 +7,7 @@ from zarr.codecs.bytes import BytesCodec from zarr.core.buffer import default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding +from zarr.store.common import StorePath if TYPE_CHECKING: from collections.abc import Sequence @@ -268,13 +269,8 @@ async def test_datetime_metadata(fill_value: int, precision: str) -> None: assert result["fill_value"] == fill_value -async def test_consolidated(memory_store: Store) -> None: - # TODO: Figure out desired keys in - # TODO: variety in the hierarchies - # More nesting - # 
arrays under arrays - # single array - # etc. +@pytest.fixture +async def memory_store_with_hierarchy(memory_store: Store) -> None: g = await group(store=memory_store, attributes={"foo": "bar"}) await g.create_array(name="air", shape=(1, 2, 3)) await g.create_array(name="lat", shape=(1,)) @@ -286,9 +282,18 @@ async def test_consolidated(memory_store: Store) -> None: grandchild = await child.create_group("grandchild", attributes={"key": "grandchild"}) await grandchild.create_array("array", shape=(4, 4), attributes={"key": "grandchild"}) + return memory_store - await consolidate_metadata(memory_store) - group2 = await AsyncGroup.open(memory_store) + +async def test_consolidated(memory_store_with_hierarchy: Store) -> None: + # TODO: Figure out desired keys in + # TODO: variety in the hierarchies + # More nesting + # arrays under arrays + # single array + # etc. + await consolidate_metadata(memory_store_with_hierarchy) + group2 = await AsyncGroup.open(memory_store_with_hierarchy) array_metadata = { "attributes": {}, @@ -388,10 +393,10 @@ async def test_consolidated(memory_store: Store) -> None: ), ) assert group2.metadata == expected - group3 = await open(store=memory_store) + group3 = await open(store=memory_store_with_hierarchy) assert group3.metadata == expected - group4 = await open_consolidated(store=memory_store) + group4 = await open_consolidated(store=memory_store_with_hierarchy) assert group4.metadata == expected @@ -487,3 +492,14 @@ async def test_not_writable_raises(memory_store: zarr.store.MemoryStore) -> None read_store = zarr.store.MemoryStore(store_dict=memory_store._store_dict) with pytest.raises(ValueError, match="does not support writing"): await consolidate_metadata(read_store) + + +async def test_non_root_node(memory_store_with_hierarchy: Store) -> None: + await consolidate_metadata(memory_store_with_hierarchy, path="child") + root = await AsyncGroup.open(memory_store_with_hierarchy) + child = await 
AsyncGroup.open(StorePath(memory_store_with_hierarchy) / "child") + + assert root.metadata.consolidated_metadata is None + assert child.metadata.consolidated_metadata is not None + assert "air" not in child.metadata.consolidated_metadata.metadata + assert "grandchild" in child.metadata.consolidated_metadata.metadata From 750668cd7d6251710f407604f68a7434e451d7aa Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Sep 2024 08:46:52 -0500 Subject: [PATCH 09/62] Some error handling --- src/zarr/core/group.py | 33 +++++++++++-------------------- tests/v3/test_metadata/test_v3.py | 30 ++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 4042696f19..feae0bb3de 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -82,15 +82,6 @@ def _parse_async_node(node: AsyncArray | AsyncGroup) -> Array | Group: raise TypeError(f"Unknown node type, got {type(node)}") -# What does ConsolidatedMetadata mean at non-root levels of a Store? -# Specifically, if a we have a Group at `/a/b` and cnosolidate -# all the metadata under it, should the keys in the metadata map -# be -# 1: /a/b/1, /a/b/2, ... -# 2: /1, /2, ... -# ? - - @dataclass(frozen=True) class ConsolidatedMetadata: metadata: dict[str, ArrayMetadata | GroupMetadata] @@ -107,16 +98,21 @@ def to_dict(self) -> dict[str, JSON]: @classmethod def from_dict(cls, data: dict[str, JSON]) -> ConsolidatedMetadata: data = dict(data) + + kind = data.get("kind") + if kind != "inline": + raise ValueError(f"Consolidated metadata kind='{kind}' is not supported.") + + # Do we care about the value of 'must_understand'? 
+ # if data["must_understand"] is not False: + # raise ValueError + raw_metadata = data.get("metadata") if not isinstance(raw_metadata, dict): - raise TypeError("Unexpected type for 'metadata'") - - elif not raw_metadata: - raise ValueError("Must specify metadata") + raise TypeError(f"Unexpected type for 'metadata': {type(raw_metadata)}") - metadata: dict[str, ArrayMetadata | GroupMetadata] + metadata: dict[str, ArrayMetadata | GroupMetadata] = {} if raw_metadata: - metadata = {} for k, v in raw_metadata.items(): if not isinstance(v, dict): raise TypeError(f"Invalid value for metadata items. key={k}, type={type(v)}") @@ -134,13 +130,6 @@ def from_dict(cls, data: dict[str, JSON]) -> ConsolidatedMetadata: else: # probably v2 Group metadata metadata[k] = GroupMetadata.from_dict(v) - - # assert data["kind"] == "inline" - if data["kind"] != "inline": - raise ValueError - - if data["must_understand"] is not False: - raise ValueError return cls(metadata=metadata) def _filter_prefix(self, key: str) -> ConsolidatedMetadata: diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py index c62e00ad1b..bab0340895 100644 --- a/tests/v3/test_metadata/test_v3.py +++ b/tests/v3/test_metadata/test_v3.py @@ -503,3 +503,33 @@ async def test_non_root_node(memory_store_with_hierarchy: Store) -> None: assert child.metadata.consolidated_metadata is not None assert "air" not in child.metadata.consolidated_metadata.metadata assert "grandchild" in child.metadata.consolidated_metadata.metadata + + +def test_consolidated_metadata_from_dict(): + data = {"must_understand": False} + + # missing kind + with pytest.raises(ValueError, match="kind='None'"): + ConsolidatedMetadata.from_dict(data) + + # invalid kind + data["kind"] = "invalid" + with pytest.raises(ValueError, match="kind='invalid'"): + ConsolidatedMetadata.from_dict(data) + + # missing metadata + data["kind"] = "inline" + + with pytest.raises(TypeError, match="Unexpected type for 'metadata'"): + 
ConsolidatedMetadata.from_dict(data) + + data["kind"] = "inline" + # empty is fine + data["metadata"] = {} + ConsolidatedMetadata.from_dict(data) + + # invalid metadata + data["metadata"]["a"] = {"node_type": "array", "zarr_format": 3} + + with pytest.raises(TypeError): + ConsolidatedMetadata.from_dict(data) From 63697ab0a82ad21e5a38540d3ed048f2e74c891d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Sep 2024 08:47:29 -0500 Subject: [PATCH 10/62] cleanup --- src/zarr/core/group.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index feae0bb3de..c77557595a 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -132,12 +132,6 @@ def from_dict(cls, data: dict[str, JSON]) -> ConsolidatedMetadata: metadata[k] = GroupMetadata.from_dict(v) return cls(metadata=metadata) - def _filter_prefix(self, key: str) -> ConsolidatedMetadata: - """ - Filter a - """ - return replace(self, metadata={k: v for k, v in self.metadata.items() if k.startswith(key)}) - @dataclass(frozen=True) class GroupMetadata(Metadata): From 5d79274914cd583475d9b9ae6b74286719f0dca3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Sep 2024 09:18:26 -0500 Subject: [PATCH 11/62] refactor open --- src/zarr/core/group.py | 112 +++++++++++++++++++++++++++-------------- 1 file changed, 74 insertions(+), 38 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index c77557595a..771fe5a6be 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -217,14 +217,36 @@ async def open( cls, store: StoreLike, zarr_format: Literal[2, 3, None] = 3, - open_consolidated: bool = False, + open_consolidated: bool | str = False, ) -> AsyncGroup: + """ + Open a new AsyncGroup + + Parameters + ---------- + store: StoreLike + zarr_format: {2, 3}, optional + open_consolidated: bool or str, default False + Whether to use consolidated metadata when opening a V2 Group. 
+ This parameter has no effect when ``zarr_format=3`` (the default), + where consolidated metadata is always used when present. + + By default, consolidated metadata is not used for ``zarr_format=2``, + matching the behavior of zarr-python 2.x. Pass ``open_consolidated=True`` + to use consolidated metadata with the default key ``.zmetadata``. + + Pass a string to use an alternative key in the store, which will be + be relative to the store or ``StorePath``. + """ store_path = await make_store_path(store) if zarr_format == 2: paths = [store_path / ZGROUP_JSON, store_path / ZATTRS_JSON] if open_consolidated: - paths.append(store_path / ".zmetadata") # todo: configurable + if open_consolidated is True: + open_consolidated = ".zmetadata" + + paths.append(store_path / open_consolidated) zgroup_bytes, zattrs_bytes, *rest = await asyncio.gather( *[path.get() for path in paths] @@ -262,47 +284,61 @@ async def open( raise ValueError(f"unexpected zarr_format: {zarr_format}") if zarr_format == 2: - # V2 groups are comprised of a .zgroup and .zattrs objects + # this is checked above, asserting here for mypy assert zgroup_bytes is not None - zgroup = json.loads(zgroup_bytes.to_bytes()) - zattrs = json.loads(zattrs_bytes.to_bytes()) if zattrs_bytes is not None else {} - group_metadata = {**zgroup, "attributes": zattrs} - - if open_consolidated: - # this *should* be defined. - assert consolidated_metadata_bytes is not None # already checked above - - v2_consolidated_metadata = json.loads(consolidated_metadata_bytes.to_bytes()) - v2_consolidated_metadata = v2_consolidated_metadata["metadata"] - # We already read zattrs and zgroup. Should we ignore these? 
- v2_consolidated_metadata.pop(".zattrs") - v2_consolidated_metadata.pop(".zgroup") - - consolidated_metadata: defaultdict[str, dict[str, Any]] = defaultdict(dict) - - # keys like air/.zarray, air/.zattrs - for k, v in v2_consolidated_metadata.items(): - path, kind = k.rsplit("/.", 1) - - if kind == "zarray": - consolidated_metadata[path].update(v) - elif kind == "zattrs": - consolidated_metadata[path]["attributes"] = v - elif kind == "zgroup": - consolidated_metadata[path].update(v) - else: - raise ValueError(f"Invalid file type '{kind}' at path '{path}") - group_metadata["consolidated_metadata"] = { - "metadata": dict(consolidated_metadata), - "kind": "inline", - "must_understand": False, - } - + return cls._from_bytes_v2( + store_path, zgroup_bytes, zattrs_bytes, consolidated_metadata_bytes + ) else: # V3 groups are comprised of a zarr.json object assert zarr_json_bytes is not None - group_metadata = json.loads(zarr_json_bytes.to_bytes()) + return cls._from_bytes_v3(store_path, zarr_json_bytes) + @classmethod + def _from_bytes_v2( + cls, + store_path: StorePath, + zgroup_bytes: Buffer, + zattrs_bytes: Buffer | None, + consolidated_metadata_bytes: Buffer | None, + ) -> AsyncGroup: + # V2 groups are comprised of a .zgroup and .zattrs objects + zgroup = json.loads(zgroup_bytes.to_bytes()) + zattrs = json.loads(zattrs_bytes.to_bytes()) if zattrs_bytes is not None else {} + group_metadata = {**zgroup, "attributes": zattrs} + + if consolidated_metadata_bytes is not None: + v2_consolidated_metadata = json.loads(consolidated_metadata_bytes.to_bytes()) + v2_consolidated_metadata = v2_consolidated_metadata["metadata"] + # We already read zattrs and zgroup. Should we ignore these? 
+ v2_consolidated_metadata.pop(".zattrs") + v2_consolidated_metadata.pop(".zgroup") + + consolidated_metadata: defaultdict[str, dict[str, Any]] = defaultdict(dict) + + # keys like air/.zarray, air/.zattrs + for k, v in v2_consolidated_metadata.items(): + path, kind = k.rsplit("/.", 1) + + if kind == "zarray": + consolidated_metadata[path].update(v) + elif kind == "zattrs": + consolidated_metadata[path]["attributes"] = v + elif kind == "zgroup": + consolidated_metadata[path].update(v) + else: + raise ValueError(f"Invalid file type '{kind}' at path '{path}") + group_metadata["consolidated_metadata"] = { + "metadata": dict(consolidated_metadata), + "kind": "inline", + "must_understand": False, + } + + return cls.from_dict(store_path, group_metadata) + + @classmethod + def _from_bytes_v3(cls, store_path: StorePath, zarr_json_bytes: Buffer) -> AsyncGroup: + group_metadata = json.loads(zarr_json_bytes.to_bytes()) return cls.from_dict(store_path, group_metadata) @classmethod From 0c67972a377d6ff8e2610ea3a8729688fa41f168 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Sep 2024 09:21:53 -0500 Subject: [PATCH 12/62] remove dupe file --- tests/v3/test_metadata.py | 236 -------------------------------------- 1 file changed, 236 deletions(-) delete mode 100644 tests/v3/test_metadata.py diff --git a/tests/v3/test_metadata.py b/tests/v3/test_metadata.py deleted file mode 100644 index a0cc2b3b88..0000000000 --- a/tests/v3/test_metadata.py +++ /dev/null @@ -1,236 +0,0 @@ -import numpy as np -import pytest - -import zarr.api.synchronous -import zarr.store -from zarr.abc.store import Store -from zarr.api.asynchronous import ( - AsyncGroup, - consolidate_metadata, - group, - open, - open_consolidated, -) -from zarr.core.array import ArrayV3Metadata -from zarr.core.group import ConsolidatedMetadata, GroupMetadata - - -async def test_consolidated(memory_store_with_hierarchy: Store) -> None: - # TODO: Figure out desired keys in - # TODO: variety in the hierarchies - # More 
nesting - # arrays under arrays - # single array - # etc. - g = await group(store=memory_store_with_hierarchy, attributes={"foo": "bar"}) - await g.create_array(name="air", shape=(1, 2, 3)) - await g.create_array(name="lat", shape=(1,)) - await g.create_array(name="lon", shape=(2,)) - await g.create_array(name="time", shape=(3,)) - - child = await g.create_group("child", attributes={"key": "child"}) - await child.create_array("array", shape=(4, 4), attributes={"key": "child"}) - - grandchild = await child.create_group("grandchild", attributes={"key": "grandchild"}) - await grandchild.create_array("array", shape=(4, 4), attributes={"key": "grandchild"}) - - await consolidate_metadata(memory_store_with_hierarchy) - group2 = await AsyncGroup.open(memory_store_with_hierarchy) - - array_metadata = { - "attributes": {}, - "chunk_key_encoding": { - "configuration": {"separator": "/"}, - "name": "default", - }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), - "data_type": np.dtype("float64"), - "fill_value": np.float64(0.0), - "node_type": "array", - # "shape": (1, 2, 3), - "zarr_format": 3, - } - - expected = GroupMetadata( - attributes={"foo": "bar"}, - consolidated_metadata=ConsolidatedMetadata( - kind="inline", - must_understand=False, - metadata={ - "air": ArrayV3Metadata.from_dict( - { - **{ - "shape": (1, 2, 3), - "chunk_grid": { - "configuration": {"chunk_shape": (1, 2, 3)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), - "lat": ArrayV3Metadata.from_dict( - { - **{ - "shape": (1,), - "chunk_grid": { - "configuration": {"chunk_shape": (1,)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), - "lon": ArrayV3Metadata.from_dict( - { - **{"shape": (2,)}, - "chunk_grid": { - "configuration": {"chunk_shape": (2,)}, - "name": "regular", - }, - **array_metadata, - } - ), - "time": ArrayV3Metadata.from_dict( - { - **{ - "shape": (3,), - "chunk_grid": { - "configuration": {"chunk_shape": (3,)}, - "name": "regular", - }, - }, - 
**array_metadata, - } - ), - "child": GroupMetadata(attributes={"key": "child"}), - "child/array": ArrayV3Metadata.from_dict( - { - **array_metadata, - **{ - "attributes": {"key": "child"}, - "shape": (4, 4), - "chunk_grid": { - "configuration": {"chunk_shape": (4, 4)}, - "name": "regular", - }, - }, - } - ), - "child/grandchild": GroupMetadata(attributes={"key": "grandchild"}), - "child/grandchild/array": ArrayV3Metadata.from_dict( - { - **array_metadata, - **{ - "attributes": {"key": "grandchild"}, - "shape": (4, 4), - "chunk_grid": { - "configuration": {"chunk_shape": (4, 4)}, - "name": "regular", - }, - }, - } - ), - }, - ), - ) - assert group2.metadata == expected - group3 = await open(store=memory_store_with_hierarchy) - assert group3.metadata == expected - - group4 = await open_consolidated(store=memory_store_with_hierarchy) - assert group4.metadata == expected - - -def test_consolidated_sync(memory_store_with_hierarchy): - g = zarr.api.synchronous.group(store=memory_store_with_hierarchy, attributes={"foo": "bar"}) - g.create_array(name="air", shape=(1, 2, 3)) - g.create_array(name="lat", shape=(1,)) - g.create_array(name="lon", shape=(2,)) - g.create_array(name="time", shape=(3,)) - - zarr.api.synchronous.consolidate_metadata(memory_store_with_hierarchy) - group2 = zarr.api.synchronous.Group.open(memory_store_with_hierarchy) - - array_metadata = { - "attributes": {}, - "chunk_key_encoding": { - "configuration": {"separator": "/"}, - "name": "default", - }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), - "data_type": np.dtype("float64"), - "fill_value": np.float64(0.0), - "node_type": "array", - # "shape": (1, 2, 3), - "zarr_format": 3, - } - - expected = GroupMetadata( - attributes={"foo": "bar"}, - consolidated_metadata=ConsolidatedMetadata( - kind="inline", - must_understand=False, - metadata={ - "air": ArrayV3Metadata.from_dict( - { - **{ - "shape": (1, 2, 3), - "chunk_grid": { - "configuration": {"chunk_shape": (1, 2, 3)}, - 
"name": "regular", - }, - }, - **array_metadata, - } - ), - "lat": ArrayV3Metadata.from_dict( - { - **{ - "shape": (1,), - "chunk_grid": { - "configuration": {"chunk_shape": (1,)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), - "lon": ArrayV3Metadata.from_dict( - { - **{"shape": (2,)}, - "chunk_grid": { - "configuration": {"chunk_shape": (2,)}, - "name": "regular", - }, - **array_metadata, - } - ), - "time": ArrayV3Metadata.from_dict( - { - **{ - "shape": (3,), - "chunk_grid": { - "configuration": {"chunk_shape": (3,)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), - }, - ), - ) - assert group2.metadata == expected - group3 = zarr.api.synchronous.open(store=memory_store_with_hierarchy) - assert group3.metadata == expected - - group4 = zarr.api.synchronous.open_consolidated(store=memory_store_with_hierarchy) - assert group4.metadata == expected - - -async def test_not_writable_raises(memory_store_with_hierarchy: zarr.store.MemoryStore) -> None: - await group(store=memory_store_with_hierarchy, attributes={"foo": "bar"}) - read_store = zarr.store.MemoryStore(store_dict=memory_store_with_hierarchy._store_dict) - with pytest.raises(ValueError, match="writable"): - await consolidate_metadata(read_store) From 657ad1e44e94f917b7c4d94580630cbb156660d0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Sep 2024 09:30:56 -0500 Subject: [PATCH 13/62] v2 getitem --- src/zarr/core/group.py | 78 +++++++++++++++++-------------- tests/v3/test_metadata/test_v2.py | 17 ++++++- 2 files changed, 57 insertions(+), 38 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 771fe5a6be..18bb56187a 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -362,42 +362,7 @@ async def getitem( # Consolidated metadata lets us avoid some I/O operations so try that first. 
if self.metadata.consolidated_metadata is not None: - try: - metadata = self.metadata.consolidated_metadata.metadata[key] - except KeyError as e: - # The Group Metadata has consolidated metadata, but the key - # isn't present. We trust this to mean that the key isn't in - # the hierarchy, and *don't* fall back to checking the store. - msg = f"'{key}' not found in consolidated metadata." - raise KeyError(msg) from e - - if metadata.zarr_format == 3: - assert isinstance(metadata, GroupMetadata | ArrayV3Metadata) - - if metadata.node_type == "group": - # To speed up nested access like - # group.getitem("a").getitem("b").getitem("array") - # we'll propagate the consolidated metadata to children. However, we need - # to remove the `store_path` prefix to ensure that `group.getitem("a")` - # has a different (shorter) prefix thanks to it having a longer root. - metadata = replace( - self.metadata, - consolidated_metadata=replace( - self.metadata.consolidated_metadata, - metadata={ - k.split(key, 1)[-1].lstrip("/"): v - for k, v in self.metadata.consolidated_metadata.metadata.items() - if k.startswith(key) and k != key - }, - ), - ) - return AsyncGroup(metadata=metadata, store_path=store_path) - else: - # TODO: order? - return AsyncArray(metadata=metadata, store_path=store_path) - else: - # v2 - raise NotImplementedError + return self._getitem_consolidated(store_path, key) # Note: # in zarr-python v2, we first check if `key` references an Array, else if `key` references @@ -448,6 +413,47 @@ async def getitem( else: raise ValueError(f"unexpected zarr_format: {self.metadata.zarr_format}") + def _getitem_consolidated(self, store_path: StorePath, key: str) -> AsyncArray | AsyncGroup: + # getitem, in the special case where we have consolidated metadata. + # Note that this is a regular def (non async) function. + # This shouldn't do any additional I/O. + + # the caller needs to verify this! 
+ assert self.metadata.consolidated_metadata is not None + + try: + metadata = self.metadata.consolidated_metadata.metadata[key] + except KeyError as e: + # The Group Metadata has consolidated metadata, but the key + # isn't present. We trust this to mean that the key isn't in + # the hierarchy, and *don't* fall back to checking the store. + msg = f"'{key}' not found in consolidated metadata." + raise KeyError(msg) from e + + assert isinstance(metadata, GroupMetadata | ArrayMetadata) + + # generic over v2 / v3 here, so we don't have `metadata.node_type`. Use isinstance. + if isinstance(metadata, GroupMetadata): + # To speed up nested access like + # group.getitem("a").getitem("b").getitem("array") + # we'll propagate the consolidated metadata to children. However, we need + # to remove the `store_path` prefix to ensure that `group.getitem("a")` + # has a different (shorter) prefix thanks to it having a longer root. + metadata = replace( + self.metadata, + consolidated_metadata=replace( + self.metadata.consolidated_metadata, + metadata={ + k.split(key, 1)[-1].lstrip("/"): v + for k, v in self.metadata.consolidated_metadata.metadata.items() + if k.startswith(key) and k != key + }, + ), + ) + return AsyncGroup(metadata=metadata, store_path=store_path) + else: + return AsyncArray(metadata=metadata, store_path=store_path) + async def delitem(self, key: str) -> None: store_path = self.store_path / key if self.metadata.zarr_format == 3: diff --git a/tests/v3/test_metadata/test_v2.py b/tests/v3/test_metadata/test_v2.py index afad93c29c..e5ea490605 100644 --- a/tests/v3/test_metadata/test_v2.py +++ b/tests/v3/test_metadata/test_v2.py @@ -6,6 +6,7 @@ import numpy as np import pytest +import zarr.api.asynchronous import zarr.store from zarr.codecs import GzipCodec from zarr.core.buffer import cpu @@ -77,8 +78,8 @@ def test_metadata_to_dict( assert observed == expected -async def test_read_consolidated_metadata(): - # .zgroup, .zattrs, .metadata +@pytest.fixture +async def 
v2_consolidated_metadata(memory_store: zarr.store.MemoryStore) -> zarr.store.MemoryStore: zmetadata = { "metadata": { ".zattrs": { @@ -150,7 +151,12 @@ async def test_read_consolidated_metadata(): await store.set( "nested/.zgroup", cpu.Buffer.from_bytes(json.dumps({"zarr_format": 2}).encode()) ) + return store + +async def test_read_consolidated_metadata(v2_consolidated_metadata: zarr.store.MemoryStore): + # .zgroup, .zattrs, .metadata + store = v2_consolidated_metadata group = zarr.open_consolidated(store=store, zarr_format=2) assert group.metadata.consolidated_metadata is not None expected = ConsolidatedMetadata( @@ -190,3 +196,10 @@ async def test_read_consolidated_metadata(): ) result = group.metadata.consolidated_metadata assert result == expected + + +async def test_getitem_consolidated(v2_consolidated_metadata): + store = v2_consolidated_metadata + group = await zarr.api.asynchronous.open_consolidated(store=store, zarr_format=2) + air = await group.getitem("air") + assert air.metadata.shape == (730,) From 511ff76e6d05fdf8cb1f4579d5bbc782e000be13 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Sep 2024 09:37:20 -0500 Subject: [PATCH 14/62] fixup --- src/zarr/core/group.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 18bb56187a..1d9c2165d8 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -258,16 +258,24 @@ async def open( consolidated_metadata_bytes = rest[0] if consolidated_metadata_bytes is None: raise FileNotFoundError(paths[-1]) + else: + consolidated_metadata_bytes = None elif zarr_format == 3: zarr_json_bytes = await (store_path / ZARR_JSON).get() if zarr_json_bytes is None: raise FileNotFoundError(store_path) elif zarr_format is None: - zarr_json_bytes, zgroup_bytes, zattrs_bytes = await asyncio.gather( + ( + zarr_json_bytes, + zgroup_bytes, + zattrs_bytes, + consolidated_metadata_bytes, + ) = await asyncio.gather( (store_path / 
ZARR_JSON).get(), (store_path / ZGROUP_JSON).get(), (store_path / ZATTRS_JSON).get(), + (store_path / str(open_consolidated)).get(), ) if zarr_json_bytes is not None and zgroup_bytes is not None: # TODO: revisit this exception type From b360eb49c505bbd2cd759023f06878cabd40e556 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Sep 2024 10:40:59 -0500 Subject: [PATCH 15/62] Optimzied members --- src/zarr/api/asynchronous.py | 6 ++++-- src/zarr/core/group.py | 42 ++++++++++++++++++++++++++++++++++-- tests/v3/test_group.py | 31 ++++++++++++++++++-------- 3 files changed, 66 insertions(+), 13 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 7ba667d69d..4d59c2ffca 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -138,7 +138,9 @@ def _default_zarr_version() -> ZarrFormat: return 3 -async def consolidate_metadata(store: StoreLike, path: str | None = None) -> AsyncGroup: +async def consolidate_metadata( + store: StoreLike, path: str | None = None, zarr_format: ZarrFormat = 3 +) -> AsyncGroup: """ Consolidate the metadata of all nodes in a hierarchy. 
@@ -167,7 +169,7 @@ async def consolidate_metadata(store: StoreLike, path: str | None = None) -> Asy if path is not None: store_path = store_path / path - group = await AsyncGroup.open(store_path) + group = await AsyncGroup.open(store_path, zarr_format=zarr_format) group.store_path.store._check_writable() members = dict([x async for x in group.members(max_depth=None)]) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 1d9c2165d8..24ac4e97e9 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -35,7 +35,7 @@ from zarr.store.common import ensure_no_existing_node if TYPE_CHECKING: - from collections.abc import AsyncGenerator, Iterable, Iterator + from collections.abc import AsyncGenerator, Generator, Iterable, Iterator from typing import Any from zarr.abc.codec import Codec @@ -832,7 +832,14 @@ async def members( async def _members( self, max_depth: int | None, current_depth: int ) -> AsyncGenerator[tuple[str, AsyncArray | AsyncGroup], None]: - # todo! + if self.metadata.consolidated_metadata is not None: + # we should be able to do members without any additional I/O + members = self._members_consolidated(max_depth, current_depth) + + for member in members: + yield member + return + if not self.store_path.store.supports_listing: msg = ( f"The store associated with this group ({type(self.store_path.store)}) " @@ -846,6 +853,11 @@ async def _members( # and scoped to specific zarr versions _skip_keys = ("zarr.json", ".zgroup", ".zattrs") + # hmm lots of I/O and logic interleaved here. + # We *could* have an async gen over self.metadata.consolidated_metadata.metadata.keys() + # and plug in here.l `getitem` will skip I/O. + # Kinda a shape to have all the asyncio task overhead though, when it isn't needed. 
+ async for key in self.store_path.store.list_dir(self.store_path.path): if key in _skip_keys: continue @@ -874,6 +886,32 @@ async def _members( key, ) + def _members_consolidated( + self, max_depth: int | None, current_depth: int + ) -> Generator[tuple[str, AsyncArray | AsyncGroup], None]: + # I think we know that current_depth is always 0? + # I guess we could have opened a non-consolidated store, hit a consolidated group, and then gotten here. + ... + # the caller must check this + assert self.metadata.consolidated_metadata is not None + + keys = self.metadata.consolidated_metadata.metadata.keys() + + # we kind of just want the top-level keys. + for key in keys: + # Why use self._getitem_consolidated instead of just consolidated_metadata.metadata.keys()? + # + # We want to ensure that groups yield from .members() *also* set consolidated + # metadata. + # This is a very subtle point that maybe indicates the API is wrong. But the tradeoff + # is duplicating the consolidated metadata for all children... + # Maybe... + obj = self._getitem_consolidated(self.store_path, key) + current_depth = key.count("/") + + if (max_depth is None) or (current_depth <= max_depth): + yield key, obj + async def contains(self, member: str) -> bool: # TODO: this can be made more efficient. try: diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index 86967eb7ed..35191b8312 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -74,18 +74,20 @@ def test_group_name_properties(store: LocalStore | MemoryStore, zarr_format: Zar assert bar.basename == "bar" -def test_group_members(store: MemoryStore | LocalStore, zarr_format: ZarrFormat) -> None: +@pytest.mark.parametrize("consolidated_metadata", [True, False]) +def test_group_members( + store: MemoryStore | LocalStore, zarr_format: ZarrFormat, consolidated_metadata: bool +) -> None: """ Test that `Group.members` returns correct values, i.e. the arrays and groups (explicit and implicit) contained in that group. 
""" path = "group" - agroup = AsyncGroup( - metadata=GroupMetadata(zarr_format=zarr_format), - store_path=StorePath(store=store, path=path), + group = Group.create( + store=store, + zarr_format=zarr_format, ) - group = Group(agroup) members_expected: dict[str, Array | Group] = {} members_expected["subgroup"] = group.create_group("subgroup") @@ -115,6 +117,11 @@ def test_group_members(store: MemoryStore | LocalStore, zarr_format: ZarrFormat) default_buffer_prototype().buffer.from_bytes(b"000000"), ) ) + + if consolidated_metadata: + zarr.consolidate_metadata(store=store, zarr_format=zarr_format) + group = zarr.open_consolidated(store=store, zarr_format=zarr_format) + members_observed = group.members() # members are not guaranteed to be ordered, so sort before comparing assert sorted(dict(members_observed)) == sorted(members_expected) @@ -692,10 +699,12 @@ async def test_asyncgroup_update_attributes( assert agroup_new_attributes.attrs == attributes_new -async def test_group_members_async(store: LocalStore | MemoryStore) -> None: - group = AsyncGroup( - GroupMetadata(), - store_path=StorePath(store=store, path="root"), +@pytest.mark.parametrize("consolidated_metadata", [True, False]) +async def test_group_members_async( + store: LocalStore | MemoryStore, consolidated_metadata: bool +) -> None: + group = await AsyncGroup.create( + store=store, ) a0 = await group.create_array("a0", shape=(1,)) g0 = await group.create_group("g0") @@ -738,6 +747,10 @@ async def test_group_members_async(store: LocalStore | MemoryStore) -> None: ] assert all_children == expected + if consolidated_metadata: + await zarr.api.asynchronous.consolidate_metadata(store=store) + group = await zarr.api.asynchronous.open_group(store=store) + nmembers = await group.nmembers(max_depth=None) assert nmembers == 6 From abcdbe6bdac583293dcfa0ddb81f66d33278ddd1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Sep 2024 16:48:54 -0500 Subject: [PATCH 16/62] Impl flatten --- src/zarr/core/group.py | 
127 +++++++++++++++++++---- tests/v3/test_group.py | 16 +-- tests/v3/test_metadata/test_v2.py | 44 +++++++- tests/v3/test_metadata/test_v3.py | 164 +++++++++++++++++++++++++----- 4 files changed, 294 insertions(+), 57 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 24ac4e97e9..84d7e95750 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import itertools import json import logging from collections import defaultdict @@ -84,6 +85,13 @@ def _parse_async_node(node: AsyncArray | AsyncGroup) -> Array | Group: @dataclass(frozen=True) class ConsolidatedMetadata: + """ + Consolidated Metadata for this Group. + + This stores the metadata of child nodes below this group. Any child groups + will have their consolidated metadata set appropriately. + """ + metadata: dict[str, ArrayMetadata | GroupMetadata] kind: Literal["inline"] = "inline" must_understand: Literal[False] = False @@ -130,8 +138,108 @@ def from_dict(cls, data: dict[str, JSON]) -> ConsolidatedMetadata: else: # probably v2 Group metadata metadata[k] = GroupMetadata.from_dict(v) + metadata = cls._flat_to_nested(metadata) + return cls(metadata=metadata) + @staticmethod + def _flat_to_nested( + metadata: dict[str, ArrayMetadata | GroupMetadata], + ) -> dict[str, ArrayMetadata | GroupMetadata]: + """ + Convert a flat metadata representation to a nested one. + + Notes + ----- + Flat metadata is used when persisting the consolidated metadata. The keys + include the full path, not just the node name. The key prefixes can be + used to determine which nodes are children of which other nodes. + + Nested metadata is used in-memory. The outermost level will only have the + *immediate* children of the Group. All nested child groups will be stored + under the consolidated metadata of their immediate parent. + """ + # We have a flat mapping from {k: v} where the keys include the *full* + # path segment. 
+ # + # We want a nested representation, so we need to group by Group, + # i.e. find all the children with the same prefix. + # keys = sorted(metadata, key=lambda x: (x.count("/"), x)) + + # ideally we build up a data structure that has {full_key: metadata} + metadata = dict(metadata) + + keys = sorted(metadata, key=lambda k: k.count("/")) + grouped = { + k: list(v) for k, v in itertools.groupby(keys, key=lambda k: k.rsplit("/", 1)[0]) + } + + # # we go top down and directly manipulate metadata. + for key, children_keys in grouped.items(): + # key is a key like "a", "a/b", "a/b/c" + # The basic idea is to find the immediate parent (so "", "a", or "a/b") + # and update that node's consolidated metadata to include the metadata + # in children_keys + *prefixes, name = key.split("/") + parent = metadata + + while prefixes: + # e.g. a/b/c has a parent "a/b". Walk through to get + # metadata["a"]["b"] + part = prefixes.pop() + # we can assume that parent[part] here is a group + # otherwise we wouldn't have a node with this `part` prefix. + # We can also assume that the parent node will have consolidated metadata. 
+ parent = parent[part].consolidated_metadata.metadata # type: ignore[union-attr] + + node = parent[name] + children_keys = list(children_keys) + + if isinstance(node, ArrayMetadata): + # These are already present, either thanks to being an array in the + # root, or by being collected as a child in the else clause + continue + else: + children_keys = list(children_keys) + # We pop from metadata, since we're *moving* this under group + children = { + child_key.split("/")[-1]: metadata.pop(child_key) + for child_key in children_keys + if child_key != key + } + parent[name] = replace( + node, consolidated_metadata=ConsolidatedMetadata(metadata=children) + ) + return metadata + + def flattened_metadata(self) -> dict[str, ArrayMetadata | GroupMetadata]: + metadata = {} + if self.metadata is None: + raise ValueError + + def flatten( + key: str, group: GroupMetadata | ArrayMetadata + ) -> dict[str, ArrayMetadata | GroupMetadata]: + children: dict[str, ArrayMetadata | GroupMetadata] = {} + if isinstance(group, ArrayMetadata): + children[key] = group + else: + children[key] = replace(group, consolidated_metadata=None) + if group.consolidated_metadata and group.consolidated_metadata.metadata: + for name, val in group.consolidated_metadata.metadata.items(): + full_key = "/".join([key, name]) + if isinstance(val, GroupMetadata): + children.update(flatten(full_key, val)) + else: + children[full_key] = val + + return children + + for k, v in self.metadata.items(): + metadata.update(flatten(k, v)) + + return metadata + @dataclass(frozen=True) class GroupMetadata(Metadata): @@ -438,26 +546,7 @@ def _getitem_consolidated(self, store_path: StorePath, key: str) -> AsyncArray | msg = f"'{key}' not found in consolidated metadata." raise KeyError(msg) from e - assert isinstance(metadata, GroupMetadata | ArrayMetadata) - - # generic over v2 / v3 here, so we don't have `metadata.node_type`. Use isinstance. 
if isinstance(metadata, GroupMetadata): - # To speed up nested access like - # group.getitem("a").getitem("b").getitem("array") - # we'll propagate the consolidated metadata to children. However, we need - # to remove the `store_path` prefix to ensure that `group.getitem("a")` - # has a different (shorter) prefix thanks to it having a longer root. - metadata = replace( - self.metadata, - consolidated_metadata=replace( - self.metadata.consolidated_metadata, - metadata={ - k.split(key, 1)[-1].lstrip("/"): v - for k, v in self.metadata.consolidated_metadata.metadata.items() - if k.startswith(key) and k != key - }, - ), - ) return AsyncGroup(metadata=metadata, store_path=store_path) else: return AsyncArray(metadata=metadata, store_path=store_path) diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index 35191b8312..f948c27dd7 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -876,25 +876,19 @@ async def test_group_getitem_consolidated(store: LocalStore | MemoryStore): "g1": GroupMetadata( attributes={}, zarr_format=3, - consolidated_metadata=None, - ), - "g1/g2": GroupMetadata( - attributes={}, - zarr_format=3, - consolidated_metadata=None, + consolidated_metadata=ConsolidatedMetadata( + metadata={"g2": GroupMetadata(attributes={}, zarr_format=3)} + ), ), } ) assert rg0.metadata.consolidated_metadata == expected - expected.metadata.pop("g1") - expected.metadata["g2"] = expected.metadata.pop("g1/g2") rg1 = await rg0.getitem("g1") - assert rg1.metadata.consolidated_metadata == expected + assert rg1.metadata.consolidated_metadata == expected.metadata["g1"].consolidated_metadata rg2 = await rg1.getitem("g2") - expected.metadata.clear() - assert rg2.metadata.consolidated_metadata == expected + assert rg2.metadata.consolidated_metadata is None async def test_group_delitem_consolidated(store: LocalStore | MemoryStore): diff --git a/tests/v3/test_metadata/test_v2.py b/tests/v3/test_metadata/test_v2.py index e5ea490605..6c893ee2ef 100644 --- 
a/tests/v3/test_metadata/test_v2.py +++ b/tests/v3/test_metadata/test_v2.py @@ -119,6 +119,19 @@ async def v2_consolidated_metadata(memory_store: zarr.store.MemoryStore) -> zarr }, "nested/.zattrs": {"key": "value"}, "nested/.zgroup": {"zarr_format": 2}, + "nested/array/.zarray": { + "chunks": [730], + "compressor": {}, + "dtype": " zarr await store.set( "nested/.zgroup", cpu.Buffer.from_bytes(json.dumps({"zarr_format": 2}).encode()) ) + await store.set( + "nested/array/.zarray", + cpu.Buffer.from_bytes(json.dumps(zmetadata["metadata"]["nested/array/.zarray"]).encode()), + ) + await store.set( + "nested/array/.zattrs", + cpu.Buffer.from_bytes(json.dumps(zmetadata["metadata"]["nested/array/.zattrs"]).encode()), + ) + return store @@ -189,7 +211,27 @@ async def test_read_consolidated_metadata(v2_consolidated_metadata: zarr.store.M dimension_separator=".", compressor={}, ), - "nested": GroupMetadata(attributes={"key": "value"}, zarr_format=2), + "nested": GroupMetadata( + attributes={"key": "value"}, + zarr_format=2, + consolidated_metadata=ConsolidatedMetadata( + metadata={ + "array": ArrayV2Metadata( + shape=(730,), + fill_value=0.0, + chunks=(730,), + attributes={ + "calendar": "standard", + }, + dtype=np.dtype("float32"), + order="C", + filters=None, + dimension_separator=".", + compressor={}, + ) + } + ), + ), }, kind="inline", must_understand=False, diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py index bab0340895..6ffd562654 100644 --- a/tests/v3/test_metadata/test_v3.py +++ b/tests/v3/test_metadata/test_v3.py @@ -361,37 +361,50 @@ async def test_consolidated(memory_store_with_hierarchy: Store) -> None: **array_metadata, } ), - "child": GroupMetadata(attributes={"key": "child"}), - "child/array": ArrayV3Metadata.from_dict( - { - **array_metadata, - **{ - "attributes": {"key": "child"}, - "shape": (4, 4), - "chunk_grid": { - "configuration": {"chunk_shape": (4, 4)}, - "name": "regular", - }, - }, - } - ), - "child/grandchild": 
GroupMetadata(attributes={"key": "grandchild"}), - "child/grandchild/array": ArrayV3Metadata.from_dict( - { - **array_metadata, - **{ - "attributes": {"key": "grandchild"}, - "shape": (4, 4), - "chunk_grid": { - "configuration": {"chunk_shape": (4, 4)}, - "name": "regular", - }, + "child": GroupMetadata( + attributes={"key": "child"}, + consolidated_metadata=ConsolidatedMetadata( + metadata={ + "array": ArrayV3Metadata.from_dict( + { + **array_metadata, + **{ + "attributes": {"key": "child"}, + "shape": (4, 4), + "chunk_grid": { + "configuration": {"chunk_shape": (4, 4)}, + "name": "regular", + }, + }, + } + ), + "grandchild": GroupMetadata( + attributes={"key": "grandchild"}, + consolidated_metadata=ConsolidatedMetadata( + metadata={ + "array": ArrayV3Metadata.from_dict( + { + **array_metadata, + **{ + "attributes": {"key": "grandchild"}, + "shape": (4, 4), + "chunk_grid": { + "configuration": {"chunk_shape": (4, 4)}, + "name": "regular", + }, + }, + } + ) + } + ), + ), }, - } + ), ), }, ), ) + assert group2.metadata == expected group3 = await open(store=memory_store_with_hierarchy) assert group3.metadata == expected @@ -533,3 +546,102 @@ def test_consolidated_metadata_from_dict(): with pytest.raises(TypeError): ConsolidatedMetadata.from_dict(data) + + +def test_flatten(): + array_metadata = { + "attributes": {}, + "chunk_key_encoding": { + "configuration": {"separator": "/"}, + "name": "default", + }, + "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "data_type": np.dtype("float64"), + "fill_value": np.float64(0.0), + "node_type": "array", + # "shape": (1, 2, 3), + "zarr_format": 3, + } + + metadata = ConsolidatedMetadata( + kind="inline", + must_understand=False, + metadata={ + "air": ArrayV3Metadata.from_dict( + { + **{ + "shape": (1, 2, 3), + "chunk_grid": { + "configuration": {"chunk_shape": (1, 2, 3)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + "lat": ArrayV3Metadata.from_dict( + { + **{ + "shape": (1,), + 
"chunk_grid": { + "configuration": {"chunk_shape": (1,)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + "child": GroupMetadata( + attributes={"key": "child"}, + consolidated_metadata=ConsolidatedMetadata( + metadata={ + "array": ArrayV3Metadata.from_dict( + { + **array_metadata, + **{ + "attributes": {"key": "child"}, + "shape": (4, 4), + "chunk_grid": { + "configuration": {"chunk_shape": (4, 4)}, + "name": "regular", + }, + }, + } + ), + "grandchild": GroupMetadata( + attributes={"key": "grandchild"}, + consolidated_metadata=ConsolidatedMetadata( + metadata={ + "array": ArrayV3Metadata.from_dict( + { + **array_metadata, + **{ + "attributes": {"key": "grandchild"}, + "shape": (4, 4), + "chunk_grid": { + "configuration": {"chunk_shape": (4, 4)}, + "name": "regular", + }, + }, + } + ) + } + ), + ), + }, + ), + ), + }, + ) + result = metadata.flattened_metadata() + expected = { + "air": metadata.metadata["air"], + "lat": metadata.metadata["lat"], + "child": GroupMetadata(attributes={"key": "child"}), + "child/array": metadata.metadata["child"].consolidated_metadata.metadata["array"], + "child/grandchild": GroupMetadata(attributes={"key": "grandchild"}), + "child/grandchild/array": metadata.metadata["child"] + .consolidated_metadata.metadata["grandchild"] + .consolidated_metadata.metadata["array"], + } + assert result == expected From b9bcfe813025dab135239bfb6f06443829c24a65 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 13 Sep 2024 08:55:15 -0500 Subject: [PATCH 17/62] Fixups --- src/zarr/core/group.py | 51 ++++++++++++++++++------------- tests/v3/test_group.py | 11 +++++-- tests/v3/test_metadata/test_v3.py | 2 +- 3 files changed, 38 insertions(+), 26 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 84d7e95750..d96ee751ff 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -186,7 +186,7 @@ def _flat_to_nested( while prefixes: # e.g. a/b/c has a parent "a/b". 
Walk through to get # metadata["a"]["b"] - part = prefixes.pop() + part = prefixes.pop(0) # we can assume that parent[part] here is a group # otherwise we wouldn't have a node with this `part` prefix. # We can also assume that the parent node will have consolidated metadata. @@ -212,7 +212,22 @@ def _flat_to_nested( ) return metadata + @property def flattened_metadata(self) -> dict[str, ArrayMetadata | GroupMetadata]: + """ + Return the flattened representation of Consolidated Metadata. + + The returned dictionary will have a key for each child node in the hierarchy + under this group. Under the default (nested) representation available through + ``self.metadata``, the dictionary only contains keys for immediate children. + + The keys of the dictionary will include the full path to a child node from + the current group, where segments are joined by ``//`. + + Examples + -------- + >>> cm = ConsolidatedMetadata() + """ metadata = {} if self.metadata is None: raise ValueError @@ -886,7 +901,7 @@ async def nmembers( count : int """ if self.metadata.consolidated_metadata is not None: - return len(self.metadata.consolidated_metadata.metadata) + return len(self.metadata.consolidated_metadata.flattened_metadata) # TODO: consider using aioitertools.builtins.sum for this # return await aioitertools.builtins.sum((1 async for _ in self.members()), start=0) n = 0 @@ -976,31 +991,23 @@ async def _members( ) def _members_consolidated( - self, max_depth: int | None, current_depth: int + self, max_depth: int | None, current_depth: int, prefix: str = "" ) -> Generator[tuple[str, AsyncArray | AsyncGroup], None]: - # I think we know that current_depth is always 0? - # I guess we could have opened a non-consolidated store, hit a consolidated group, and then gotten here. - ... 
- # the caller must check this - assert self.metadata.consolidated_metadata is not None - - keys = self.metadata.consolidated_metadata.metadata.keys() + consolidated_metadata = self.metadata.consolidated_metadata # we kind of just want the top-level keys. - for key in keys: - # Why use self._getitem_consolidated instead of just consolidated_metadata.metadata.keys()? - # - # We want to ensure that groups yield from .members() *also* set consolidated - # metadata. - # This is a very subtle point that maybe indicates the API is wrong. But the tradeoff - # is duplicating the consolidated metadata for all children... - # Maybe... - obj = self._getitem_consolidated(self.store_path, key) - current_depth = key.count("/") - - if (max_depth is None) or (current_depth <= max_depth): + if consolidated_metadata is not None: + for key in consolidated_metadata.metadata.keys(): + obj = self._getitem_consolidated(self.store_path, key) # Metadata -> Group/Array + # this is probably generally useful + key = "/".join([prefix, key]).lstrip("/") yield key, obj + if ((max_depth is None) or (current_depth < max_depth)) and isinstance( + obj, AsyncGroup + ): + yield from obj._members_consolidated(max_depth, current_depth + 1, prefix=key) + async def contains(self, member: str) -> bool: # TODO: this can be made more efficient. try: diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index f948c27dd7..a9e43d7279 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -82,6 +82,11 @@ def test_group_members( Test that `Group.members` returns correct values, i.e. the arrays and groups (explicit and implicit) contained in that group. 
""" + # group/ + # subgroup/ + # subsubgroup/ + # subsubsubgroup + # subarray path = "group" group = Group.create( @@ -916,10 +921,10 @@ async def test_group_delitem_consolidated(store: LocalStore | MemoryStore): await zarr.api.asynchronous.consolidate_metadata(store) - group = await zarr.api.asynchronous.open(store=store) - assert len(group.metadata.consolidated_metadata.metadata) == 8 + group = await zarr.api.asynchronous.open_consolidated(store=store) + assert len(group.metadata.consolidated_metadata.metadata) == 2 assert "g0" in group.metadata.consolidated_metadata.metadata await group.delitem("g0") - assert len(group.metadata.consolidated_metadata.metadata) == 7 + assert len(group.metadata.consolidated_metadata.metadata) == 1 assert "g0" not in group.metadata.consolidated_metadata.metadata diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py index 6ffd562654..31ab363b7f 100644 --- a/tests/v3/test_metadata/test_v3.py +++ b/tests/v3/test_metadata/test_v3.py @@ -633,7 +633,7 @@ def test_flatten(): ), }, ) - result = metadata.flattened_metadata() + result = metadata.flattened_metadata expected = { "air": metadata.metadata["air"], "lat": metadata.metadata["lat"], From 3575cdaab353bf58b6957f38a263ac95d378a8ca Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 13 Sep 2024 09:41:23 -0500 Subject: [PATCH 18/62] doc --- src/zarr/core/group.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index d96ee751ff..6d56594a33 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -222,11 +222,25 @@ def flattened_metadata(self) -> dict[str, ArrayMetadata | GroupMetadata]: ``self.metadata``, the dictionary only contains keys for immediate children. The keys of the dictionary will include the full path to a child node from - the current group, where segments are joined by ``//`. + the current group, where segments are joined by ``/``. 
Examples -------- - >>> cm = ConsolidatedMetadata() + >>> cm = ConsolidatedMetadata( + ... metadata={ + ... "group-0": GroupMetadata( + ... consolidated_metadata=ConsolidatedMetadata( + ... { + ... "group-0-0": GroupMetadata(), + ... } + ... ) + ... ), + ... "group-1": GroupMetadata(), + ... } + ... ) + {'group-0': GroupMetadata(attributes={}, zarr_format=3, consolidated_metadata=None, node_type='group'), + 'group-0/group-0-0': GroupMetadata(attributes={}, zarr_format=3, consolidated_metadata=None, node_type='group'), + 'group-1': GroupMetadata(attributes={}, zarr_format=3, consolidated_metadata=None, node_type='group')} """ metadata = {} if self.metadata is None: From 7b6bd17f9fa4971080af71be09815a2a3abfbba2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 13 Sep 2024 09:59:23 -0500 Subject: [PATCH 19/62] nest the tests --- tests/v3/test_metadata/test_v2.py | 327 +++++++++-------- tests/v3/test_metadata/test_v3.py | 580 +++++++++++++++--------------- 2 files changed, 457 insertions(+), 450 deletions(-) diff --git a/tests/v3/test_metadata/test_v2.py b/tests/v3/test_metadata/test_v2.py index 6c893ee2ef..025e0423a3 100644 --- a/tests/v3/test_metadata/test_v2.py +++ b/tests/v3/test_metadata/test_v2.py @@ -78,170 +78,179 @@ def test_metadata_to_dict( assert observed == expected -@pytest.fixture -async def v2_consolidated_metadata(memory_store: zarr.store.MemoryStore) -> zarr.store.MemoryStore: - zmetadata = { - "metadata": { - ".zattrs": { - "Conventions": "COARDS", - }, - ".zgroup": {"zarr_format": 2}, - "air/.zarray": { - "chunks": [730], - "compressor": {}, - "dtype": " zarr.store.MemoryStore: + zmetadata = { + "metadata": { + ".zattrs": { + "Conventions": "COARDS", + }, + ".zgroup": {"zarr_format": 2}, + "air/.zarray": { + "chunks": [730], + "compressor": {}, + "dtype": " None: return memory_store -async def test_consolidated(memory_store_with_hierarchy: Store) -> None: - # TODO: Figure out desired keys in - # TODO: variety in the hierarchies - # More 
nesting - # arrays under arrays - # single array - # etc. - await consolidate_metadata(memory_store_with_hierarchy) - group2 = await AsyncGroup.open(memory_store_with_hierarchy) - - array_metadata = { - "attributes": {}, - "chunk_key_encoding": { - "configuration": {"separator": "/"}, - "name": "default", - }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), - "data_type": np.dtype("float64"), - "fill_value": np.float64(0.0), - "node_type": "array", - # "shape": (1, 2, 3), - "zarr_format": 3, - } +class TestConsolidated: + async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: + # TODO: Figure out desired keys in + # TODO: variety in the hierarchies + # More nesting + # arrays under arrays + # single array + # etc. + await consolidate_metadata(memory_store_with_hierarchy) + group2 = await AsyncGroup.open(memory_store_with_hierarchy) + + array_metadata = { + "attributes": {}, + "chunk_key_encoding": { + "configuration": {"separator": "/"}, + "name": "default", + }, + "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "data_type": np.dtype("float64"), + "fill_value": np.float64(0.0), + "node_type": "array", + # "shape": (1, 2, 3), + "zarr_format": 3, + } + + expected = GroupMetadata( + attributes={"foo": "bar"}, + consolidated_metadata=ConsolidatedMetadata( + kind="inline", + must_understand=False, + metadata={ + "air": ArrayV3Metadata.from_dict( + { + **{ + "shape": (1, 2, 3), + "chunk_grid": { + "configuration": {"chunk_shape": (1, 2, 3)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + "lat": ArrayV3Metadata.from_dict( + { + **{ + "shape": (1,), + "chunk_grid": { + "configuration": {"chunk_shape": (1,)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + "lon": ArrayV3Metadata.from_dict( + { + **{"shape": (2,)}, + "chunk_grid": { + "configuration": {"chunk_shape": (2,)}, + "name": "regular", + }, + **array_metadata, + } + ), + "time": ArrayV3Metadata.from_dict( + { + **{ 
+ "shape": (3,), + "chunk_grid": { + "configuration": {"chunk_shape": (3,)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + "child": GroupMetadata( + attributes={"key": "child"}, + consolidated_metadata=ConsolidatedMetadata( + metadata={ + "array": ArrayV3Metadata.from_dict( + { + **array_metadata, + **{ + "attributes": {"key": "child"}, + "shape": (4, 4), + "chunk_grid": { + "configuration": {"chunk_shape": (4, 4)}, + "name": "regular", + }, + }, + } + ), + "grandchild": GroupMetadata( + attributes={"key": "grandchild"}, + consolidated_metadata=ConsolidatedMetadata( + metadata={ + "array": ArrayV3Metadata.from_dict( + { + **array_metadata, + **{ + "attributes": {"key": "grandchild"}, + "shape": (4, 4), + "chunk_grid": { + "configuration": { + "chunk_shape": (4, 4) + }, + "name": "regular", + }, + }, + } + ) + } + ), + ), + }, + ), + ), + }, + ), + ) + + assert group2.metadata == expected + group3 = await open(store=memory_store_with_hierarchy) + assert group3.metadata == expected + + group4 = await open_consolidated(store=memory_store_with_hierarchy) + assert group4.metadata == expected + + def test_consolidated_sync(self, memory_store): + g = zarr.api.synchronous.group(store=memory_store, attributes={"foo": "bar"}) + g.create_array(name="air", shape=(1, 2, 3)) + g.create_array(name="lat", shape=(1,)) + g.create_array(name="lon", shape=(2,)) + g.create_array(name="time", shape=(3,)) + + zarr.api.synchronous.consolidate_metadata(memory_store) + group2 = zarr.api.synchronous.Group.open(memory_store) + + array_metadata = { + "attributes": {}, + "chunk_key_encoding": { + "configuration": {"separator": "/"}, + "name": "default", + }, + "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "data_type": np.dtype("float64"), + "fill_value": np.float64(0.0), + "node_type": "array", + # "shape": (1, 2, 3), + "zarr_format": 3, + } + + expected = GroupMetadata( + attributes={"foo": "bar"}, + consolidated_metadata=ConsolidatedMetadata( + 
kind="inline", + must_understand=False, + metadata={ + "air": ArrayV3Metadata.from_dict( + { + **{ + "shape": (1, 2, 3), + "chunk_grid": { + "configuration": {"chunk_shape": (1, 2, 3)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + "lat": ArrayV3Metadata.from_dict( + { + **{ + "shape": (1,), + "chunk_grid": { + "configuration": {"chunk_shape": (1,)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + "lon": ArrayV3Metadata.from_dict( + { + **{"shape": (2,)}, + "chunk_grid": { + "configuration": {"chunk_shape": (2,)}, + "name": "regular", + }, + **array_metadata, + } + ), + "time": ArrayV3Metadata.from_dict( + { + **{ + "shape": (3,), + "chunk_grid": { + "configuration": {"chunk_shape": (3,)}, + "name": "regular", + }, + }, + **array_metadata, + } + ), + }, + ), + ) + assert group2.metadata == expected + group3 = zarr.api.synchronous.open(store=memory_store) + assert group3.metadata == expected + + group4 = zarr.api.synchronous.open_consolidated(store=memory_store) + assert group4.metadata == expected + + async def test_not_writable_raises(self, memory_store: zarr.store.MemoryStore) -> None: + await group(store=memory_store, attributes={"foo": "bar"}) + read_store = zarr.store.MemoryStore(store_dict=memory_store._store_dict) + with pytest.raises(ValueError, match="does not support writing"): + await consolidate_metadata(read_store) + + async def test_non_root_node(self, memory_store_with_hierarchy: Store) -> None: + await consolidate_metadata(memory_store_with_hierarchy, path="child") + root = await AsyncGroup.open(memory_store_with_hierarchy) + child = await AsyncGroup.open(StorePath(memory_store_with_hierarchy) / "child") + + assert root.metadata.consolidated_metadata is None + assert child.metadata.consolidated_metadata is not None + assert "air" not in child.metadata.consolidated_metadata.metadata + assert "grandchild" in child.metadata.consolidated_metadata.metadata + + def test_consolidated_metadata_from_dict(self): + data = 
{"must_understand": False} + + # missing kind + with pytest.raises(ValueError, match="kind='None'"): + ConsolidatedMetadata.from_dict(data) + + # invalid kind + data["kind"] = "invalid" + with pytest.raises(ValueError, match="kind='invalid'"): + ConsolidatedMetadata.from_dict(data) + + # missing metadata + data["kind"] = "inline" + + with pytest.raises(TypeError, match="Unexpected type for 'metadata'"): + ConsolidatedMetadata.from_dict(data) + + data["kind"] = "inline" + # empty is fine + data["metadata"] = {} + ConsolidatedMetadata.from_dict(data) + + # invalid metadata + data["metadata"]["a"] = {"node_type": "array", "zarr_format": 3} + + with pytest.raises(TypeError): + ConsolidatedMetadata.from_dict(data) - expected = GroupMetadata( - attributes={"foo": "bar"}, - consolidated_metadata=ConsolidatedMetadata( + def test_flatten(self): + array_metadata = { + "attributes": {}, + "chunk_key_encoding": { + "configuration": {"separator": "/"}, + "name": "default", + }, + "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "data_type": np.dtype("float64"), + "fill_value": np.float64(0.0), + "node_type": "array", + # "shape": (1, 2, 3), + "zarr_format": 3, + } + + metadata = ConsolidatedMetadata( kind="inline", must_understand=False, metadata={ @@ -339,28 +589,6 @@ async def test_consolidated(memory_store_with_hierarchy: Store) -> None: **array_metadata, } ), - "lon": ArrayV3Metadata.from_dict( - { - **{"shape": (2,)}, - "chunk_grid": { - "configuration": {"chunk_shape": (2,)}, - "name": "regular", - }, - **array_metadata, - } - ), - "time": ArrayV3Metadata.from_dict( - { - **{ - "shape": (3,), - "chunk_grid": { - "configuration": {"chunk_shape": (3,)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), "child": GroupMetadata( attributes={"key": "child"}, consolidated_metadata=ConsolidatedMetadata( @@ -402,246 +630,16 @@ async def test_consolidated(memory_store_with_hierarchy: Store) -> None: ), ), }, - ), - ) - - assert group2.metadata == 
expected - group3 = await open(store=memory_store_with_hierarchy) - assert group3.metadata == expected - - group4 = await open_consolidated(store=memory_store_with_hierarchy) - assert group4.metadata == expected - - -def test_consolidated_sync(memory_store): - g = zarr.api.synchronous.group(store=memory_store, attributes={"foo": "bar"}) - g.create_array(name="air", shape=(1, 2, 3)) - g.create_array(name="lat", shape=(1,)) - g.create_array(name="lon", shape=(2,)) - g.create_array(name="time", shape=(3,)) - - zarr.api.synchronous.consolidate_metadata(memory_store) - group2 = zarr.api.synchronous.Group.open(memory_store) - - array_metadata = { - "attributes": {}, - "chunk_key_encoding": { - "configuration": {"separator": "/"}, - "name": "default", - }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), - "data_type": np.dtype("float64"), - "fill_value": np.float64(0.0), - "node_type": "array", - # "shape": (1, 2, 3), - "zarr_format": 3, - } - - expected = GroupMetadata( - attributes={"foo": "bar"}, - consolidated_metadata=ConsolidatedMetadata( - kind="inline", - must_understand=False, - metadata={ - "air": ArrayV3Metadata.from_dict( - { - **{ - "shape": (1, 2, 3), - "chunk_grid": { - "configuration": {"chunk_shape": (1, 2, 3)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), - "lat": ArrayV3Metadata.from_dict( - { - **{ - "shape": (1,), - "chunk_grid": { - "configuration": {"chunk_shape": (1,)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), - "lon": ArrayV3Metadata.from_dict( - { - **{"shape": (2,)}, - "chunk_grid": { - "configuration": {"chunk_shape": (2,)}, - "name": "regular", - }, - **array_metadata, - } - ), - "time": ArrayV3Metadata.from_dict( - { - **{ - "shape": (3,), - "chunk_grid": { - "configuration": {"chunk_shape": (3,)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), - }, - ), - ) - assert group2.metadata == expected - group3 = zarr.api.synchronous.open(store=memory_store) - assert 
group3.metadata == expected - - group4 = zarr.api.synchronous.open_consolidated(store=memory_store) - assert group4.metadata == expected - - -async def test_not_writable_raises(memory_store: zarr.store.MemoryStore) -> None: - await group(store=memory_store, attributes={"foo": "bar"}) - read_store = zarr.store.MemoryStore(store_dict=memory_store._store_dict) - with pytest.raises(ValueError, match="does not support writing"): - await consolidate_metadata(read_store) - - -async def test_non_root_node(memory_store_with_hierarchy: Store) -> None: - await consolidate_metadata(memory_store_with_hierarchy, path="child") - root = await AsyncGroup.open(memory_store_with_hierarchy) - child = await AsyncGroup.open(StorePath(memory_store_with_hierarchy) / "child") - - assert root.metadata.consolidated_metadata is None - assert child.metadata.consolidated_metadata is not None - assert "air" not in child.metadata.consolidated_metadata.metadata - assert "grandchild" in child.metadata.consolidated_metadata.metadata - - -def test_consolidated_metadata_from_dict(): - data = {"must_understand": False} - - # missing kind - with pytest.raises(ValueError, match="kind='None'"): - ConsolidatedMetadata.from_dict(data) - - # invalid kind - data["kind"] = "invalid" - with pytest.raises(ValueError, match="kind='invalid'"): - ConsolidatedMetadata.from_dict(data) - - # missing metadata - data["kind"] = "inline" - - with pytest.raises(TypeError, match="Unexpected type for 'metadata'"): - ConsolidatedMetadata.from_dict(data) - - data["kind"] = "inline" - # empty is fine - data["metadata"] = {} - ConsolidatedMetadata.from_dict(data) - - # invalid metadata - data["metadata"]["a"] = {"node_type": "array", "zarr_format": 3} - - with pytest.raises(TypeError): - ConsolidatedMetadata.from_dict(data) - - -def test_flatten(): - array_metadata = { - "attributes": {}, - "chunk_key_encoding": { - "configuration": {"separator": "/"}, - "name": "default", - }, - "codecs": ({"configuration": {"endian": 
"little"}, "name": "bytes"},), - "data_type": np.dtype("float64"), - "fill_value": np.float64(0.0), - "node_type": "array", - # "shape": (1, 2, 3), - "zarr_format": 3, - } - - metadata = ConsolidatedMetadata( - kind="inline", - must_understand=False, - metadata={ - "air": ArrayV3Metadata.from_dict( - { - **{ - "shape": (1, 2, 3), - "chunk_grid": { - "configuration": {"chunk_shape": (1, 2, 3)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), - "lat": ArrayV3Metadata.from_dict( - { - **{ - "shape": (1,), - "chunk_grid": { - "configuration": {"chunk_shape": (1,)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), - "child": GroupMetadata( - attributes={"key": "child"}, - consolidated_metadata=ConsolidatedMetadata( - metadata={ - "array": ArrayV3Metadata.from_dict( - { - **array_metadata, - **{ - "attributes": {"key": "child"}, - "shape": (4, 4), - "chunk_grid": { - "configuration": {"chunk_shape": (4, 4)}, - "name": "regular", - }, - }, - } - ), - "grandchild": GroupMetadata( - attributes={"key": "grandchild"}, - consolidated_metadata=ConsolidatedMetadata( - metadata={ - "array": ArrayV3Metadata.from_dict( - { - **array_metadata, - **{ - "attributes": {"key": "grandchild"}, - "shape": (4, 4), - "chunk_grid": { - "configuration": {"chunk_shape": (4, 4)}, - "name": "regular", - }, - }, - } - ) - } - ), - ), - }, - ), - ), - }, - ) - result = metadata.flattened_metadata - expected = { - "air": metadata.metadata["air"], - "lat": metadata.metadata["lat"], - "child": GroupMetadata(attributes={"key": "child"}), - "child/array": metadata.metadata["child"].consolidated_metadata.metadata["array"], - "child/grandchild": GroupMetadata(attributes={"key": "grandchild"}), - "child/grandchild/array": metadata.metadata["child"] - .consolidated_metadata.metadata["grandchild"] - .consolidated_metadata.metadata["array"], - } - assert result == expected + ) + result = metadata.flattened_metadata + expected = { + "air": metadata.metadata["air"], + "lat": 
metadata.metadata["lat"], + "child": GroupMetadata(attributes={"key": "child"}), + "child/array": metadata.metadata["child"].consolidated_metadata.metadata["array"], + "child/grandchild": GroupMetadata(attributes={"key": "grandchild"}), + "child/grandchild/array": metadata.metadata["child"] + .consolidated_metadata.metadata["grandchild"] + .consolidated_metadata.metadata["array"], + } + assert result == expected From 500a91ef307a3eed9ff3cac3d940534981d169e8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 13 Sep 2024 10:04:18 -0500 Subject: [PATCH 20/62] fixup --- src/zarr/core/group.py | 6 +++--- tests/v3/test_metadata/test_v3.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 6d56594a33..086e97f9ac 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -123,7 +123,9 @@ def from_dict(cls, data: dict[str, JSON]) -> ConsolidatedMetadata: if raw_metadata: for k, v in raw_metadata.items(): if not isinstance(v, dict): - raise TypeError(f"Invalid value for metadata items. key={k}, type={type(v)}") + raise TypeError( + f"Invalid value for metadata items. 
key='{k}', type='{type(v).__name__}'" + ) node_type = v.get("node_type", None) if node_type == "group": @@ -243,8 +245,6 @@ def flattened_metadata(self) -> dict[str, ArrayMetadata | GroupMetadata]: 'group-1': GroupMetadata(attributes={}, zarr_format=3, consolidated_metadata=None, node_type='group')} """ metadata = {} - if self.metadata is None: - raise ValueError def flatten( key: str, group: GroupMetadata | ArrayMetadata diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py index de453bd6e8..8403738162 100644 --- a/tests/v3/test_metadata/test_v3.py +++ b/tests/v3/test_metadata/test_v3.py @@ -643,3 +643,15 @@ def test_flatten(self): .consolidated_metadata.metadata["array"], } assert result == expected + + def test_invalid_metadata_raises(self): + payload = { + "kind": "inline", + "must_understand": False, + "metadata": { + "foo": [1, 2, 3] # invalid + }, + } + + with pytest.raises(TypeError, match="key='foo', type='list'"): + ConsolidatedMetadata.from_dict(payload) From 22d501e8cf99f40bc77c0b17dd26d7af1807f27d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 13 Sep 2024 11:09:00 -0500 Subject: [PATCH 21/62] Fixups --- src/zarr/core/group.py | 12 ++++++--- tests/v3/conftest.py | 9 ------- tests/v3/test_group.py | 56 ++++++++++++++++++++++++++++++++++++++---- 3 files changed, 60 insertions(+), 17 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 086e97f9ac..376a0e96b1 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -575,6 +575,9 @@ def _getitem_consolidated(self, store_path: StorePath, key: str) -> AsyncArray | msg = f"'{key}' not found in consolidated metadata." 
raise KeyError(msg) from e + # update store_path to ensure that AsyncArray/Group.name is correct + store_path = StorePath(store=store_path.store, path=key) + if isinstance(metadata, GroupMetadata): return AsyncGroup(metadata=metadata, store_path=store_path) else: @@ -584,9 +587,6 @@ async def delitem(self, key: str) -> None: store_path = self.store_path / key if self.metadata.zarr_format == 3: await (store_path / ZARR_JSON).delete() - if self.metadata.consolidated_metadata: - self.metadata.consolidated_metadata.metadata.pop(key, None) - # This should probably synchronize? elif self.metadata.zarr_format == 2: await asyncio.gather( @@ -594,9 +594,15 @@ async def delitem(self, key: str) -> None: (store_path / ZARRAY_JSON).delete(), # TODO: missing_ok=False (store_path / ZATTRS_JSON).delete(), # TODO: missing_ok=True ) + else: raise ValueError(f"unexpected zarr_format: {self.metadata.zarr_format}") + if self.metadata.consolidated_metadata: + self.metadata.consolidated_metadata.metadata.pop(key, None) + # FIXME: This should probably rewrite all the consolidated metadata? + # Or do we expect users to reconsolidate? 
+ async def _save_metadata(self) -> None: to_save = self.metadata.to_buffer_dict(default_buffer_prototype()) awaitables = [set_or_delete(self.store_path / key, value) for key, value in to_save.items()] diff --git a/tests/v3/conftest.py b/tests/v3/conftest.py index 0b9d26f92f..b1308f058f 100644 --- a/tests/v3/conftest.py +++ b/tests/v3/conftest.py @@ -24,9 +24,6 @@ from zarr.core.common import ChunkCoords, MemoryOrder, ZarrFormat -HERE = pathlib.Path(__file__).parent - - async def parse_store( store: Literal["local", "memory", "remote"], path: str ) -> LocalStore | MemoryStore | RemoteStore: @@ -94,12 +91,6 @@ async def async_group(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> As return agroup -@pytest.fixture(scope="function") -async def complex_hierarchy() -> LocalStore: - root = HERE / "examples/complex-hierarchy.zarr" - return LocalStore(root=root) - - @pytest.fixture(params=["numpy", "cupy"]) def xp(request: pytest.FixtureRequest) -> Iterator[ModuleType]: """Fixture to parametrize over numpy-like libraries""" diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index a9e43d7279..ce6bca49a9 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -6,6 +6,7 @@ import pytest import zarr.api.asynchronous +import zarr.api.synchronous from zarr import Array, AsyncArray, AsyncGroup, Group from zarr.core.buffer import default_buffer_prototype from zarr.core.common import ZarrFormat @@ -244,7 +245,10 @@ def test_group_open( assert group_created_again.store_path == spath -def test_group_getitem(store: MemoryStore | LocalStore, zarr_format: ZarrFormat) -> None: +@pytest.mark.parametrize("consolidated", [True, False]) +def test_group_getitem( + store: MemoryStore | LocalStore, zarr_format: ZarrFormat, consolidated: bool +) -> None: """ Test the `Group.__getitem__` method. 
""" @@ -253,13 +257,19 @@ def test_group_getitem(store: MemoryStore | LocalStore, zarr_format: ZarrFormat) subgroup = group.create_group(name="subgroup") subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + if consolidated: + group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format) + assert group["subgroup"] == subgroup assert group["subarray"] == subarray with pytest.raises(KeyError): group["nope"] -def test_group_delitem(store: MemoryStore | LocalStore, zarr_format: ZarrFormat) -> None: +@pytest.mark.parametrize("consolidated", [True, False]) +def test_group_delitem( + store: MemoryStore | LocalStore, zarr_format: ZarrFormat, consolidated: bool +) -> None: """ Test the `Group.__delitem__` method. """ @@ -268,6 +278,9 @@ def test_group_delitem(store: MemoryStore | LocalStore, zarr_format: ZarrFormat) subgroup = group.create_group(name="subgroup") subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + if consolidated: + group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format) + assert group["subgroup"] == subgroup assert group["subarray"] == subarray @@ -319,22 +332,32 @@ def test_group_contains(store: MemoryStore | LocalStore, zarr_format: ZarrFormat assert "foo" in group -def test_group_subgroups(store: MemoryStore | LocalStore, zarr_format: ZarrFormat) -> None: +@pytest.mark.parametrize("consolidate", [True, False]) +def test_group_subgroups( + store: MemoryStore | LocalStore, zarr_format: ZarrFormat, consolidate: bool +) -> None: """ Test the behavior of `Group` methods for accessing subgroups, namely `Group.group_keys` and `Group.groups` """ group = Group.create(store, zarr_format=zarr_format) keys = ("foo", "bar") - subgroups_expected = tuple(group.create_group(k) for k in keys) + subgroups_expected = tuple(group.create_group(k, attributes={"key": k}) for k in keys) # create a sub-array as well _ = group.create_array("array", shape=(10,)) + + if 
consolidate: + group = zarr.api.synchronous.consolidate_metadata(store, zarr_format=zarr_format) + subgroups_observed = group.groups() assert set(group.group_keys()) == set(keys) assert len(subgroups_observed) == len(subgroups_expected) assert all(a in subgroups_observed for a in subgroups_expected) -def test_group_subarrays(store: MemoryStore | LocalStore, zarr_format: ZarrFormat) -> None: +@pytest.mark.parametrize("consolidate", [True, False]) +def test_group_subarrays( + store: MemoryStore | LocalStore, zarr_format: ZarrFormat, consolidate: bool +) -> None: """ Test the behavior of `Group` methods for accessing subgroups, namely `Group.group_keys` and `Group.groups` """ @@ -343,6 +366,10 @@ def test_group_subarrays(store: MemoryStore | LocalStore, zarr_format: ZarrForma subarrays_expected = tuple(group.create_array(k, shape=(10,)) for k in keys) # create a sub-group as well _ = group.create_group("group") + + if consolidate: + group = zarr.api.synchronous.consolidate_metadata(store, zarr_format=zarr_format) + subarrays_observed = group.arrays() assert set(group.array_keys()) == set(keys) assert len(subarrays_observed) == len(subarrays_expected) @@ -928,3 +955,22 @@ async def test_group_delitem_consolidated(store: LocalStore | MemoryStore): await group.delitem("g0") assert len(group.metadata.consolidated_metadata.metadata) == 1 assert "g0" not in group.metadata.consolidated_metadata.metadata + + +@pytest.mark.parametrize("consolidate", [True, False]) +def test_members_name(store: LocalStore | MemoryStore, consolidate: bool): + group = Group.create(store=store) + a = group.create_group(name="a") + a.create_array("array", shape=(1,)) + b = a.create_group(name="b") + b.create_array("array", shape=(1,)) + + if consolidate: + group = zarr.api.synchronous.consolidate_metadata(store) + + result = group["a"]["b"] + assert result.name == "/a/b" + + paths = sorted(x.name for _, x in group.members(max_depth=None)) + expected = ["/a", "/a/array", "/a/b", "/a/b/array"] + 
assert paths == expected From d6c6cc7520e68d8cf1aac0046e54493a31ced8ed Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 13 Sep 2024 11:40:19 -0500 Subject: [PATCH 22/62] fixup --- tests/v3/test_metadata/test_v3.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py index 27a3a70f29..c6e2552859 100644 --- a/tests/v3/test_metadata/test_v3.py +++ b/tests/v3/test_metadata/test_v3.py @@ -19,9 +19,10 @@ from zarr.core.buffer import default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.group import ConsolidatedMetadata, GroupMetadata -from zarr.core.metadata import ArrayV3Metadata, parse_dimension_names +from zarr.core.metadata import ArrayV3Metadata from zarr.core.metadata import parse_fill_value_v3 as parse_fill_value from zarr.core.metadata import parse_zarr_format_v3 as parse_zarr_format +from zarr.core.metadata.v3 import parse_dimension_names from zarr.store.common import StorePath if TYPE_CHECKING: From 6755fbc767e20514d989d98cd9841919d588706a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 13 Sep 2024 11:41:20 -0500 Subject: [PATCH 23/62] fixup --- tests/v3/test_metadata/test_v3.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py index c6e2552859..a3d56bdea6 100644 --- a/tests/v3/test_metadata/test_v3.py +++ b/tests/v3/test_metadata/test_v3.py @@ -20,9 +20,7 @@ from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV3Metadata -from zarr.core.metadata import parse_fill_value_v3 as parse_fill_value -from zarr.core.metadata import parse_zarr_format_v3 as parse_zarr_format -from zarr.core.metadata.v3 import parse_dimension_names +from zarr.core.metadata.v3 import 
parse_dimension_names, parse_fill_value, parse_zarr_format from zarr.store.common import StorePath if TYPE_CHECKING: From e406f86262cc2115997eeb57b5fb18bd475943f5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 13 Sep 2024 12:18:45 -0500 Subject: [PATCH 24/62] fixup --- src/zarr/core/group.py | 2 +- tests/v3/test_metadata/test_v2.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index c443aa3990..f174cde429 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -50,7 +50,7 @@ def parse_zarr_format(data: Any) -> ZarrFormat: if data in (2, 3): return cast(Literal[2, 3], data) - msg = msg = f"Invalid zarr_format. Expected one 2 or 3. Got {data}." + msg = msg = f"Invalid zarr_format. Expected one of 2 or 3. Got {data}." raise ValueError(msg) diff --git a/tests/v3/test_metadata/test_v2.py b/tests/v3/test_metadata/test_v2.py index 9cd0c7b149..a1fad26c06 100644 --- a/tests/v3/test_metadata/test_v2.py +++ b/tests/v3/test_metadata/test_v2.py @@ -10,8 +10,9 @@ import zarr.store from zarr.codecs import GzipCodec from zarr.core.buffer import cpu -from zarr.core.group import ConsolidatedMetadata, GroupMetadata, parse_zarr_format +from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV2Metadata +from zarr.core.metadata.v2 import parse_zarr_format if TYPE_CHECKING: from typing import Any From bdf15ad100a214d61bffe262486ed789c5a143e0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 13 Sep 2024 13:56:56 -0500 Subject: [PATCH 25/62] fixup --- tests/v3/test_group.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index 9076739f93..4a4a424e1a 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -17,6 +17,7 @@ from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.store import LocalStore, MemoryStore, StorePath from zarr.store.common 
import make_store_path +from zarr.store.zip import ZipStore from .conftest import parse_store @@ -912,7 +913,10 @@ async def test_group_getitem_consolidated(store: LocalStore | MemoryStore): assert rg2.metadata.consolidated_metadata is None -async def test_group_delitem_consolidated(store: LocalStore | MemoryStore): +async def test_group_delitem_consolidated(store: Store): + if isinstance(store, ZipStore): + raise pytest.skip("Not implemented") + root = await AsyncGroup.create(store=store) # Set up the test structure with # / From 18eb172e94e73754398b58bcd8e615101db65c3f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 16 Sep 2024 11:14:29 -0500 Subject: [PATCH 26/62] consistent open_consolidated handling --- src/zarr/api/asynchronous.py | 24 ++++- src/zarr/api/synchronous.py | 2 + src/zarr/core/group.py | 53 +++++---- tests/v3/test_group.py | 204 +++++++++++++++++++++-------------- 4 files changed, 180 insertions(+), 103 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index c12efe22cc..424d4b5f42 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -286,7 +286,9 @@ async def open( async def open_consolidated(*args: Any, **kwargs: Any) -> AsyncGroup: - kwargs.setdefault("open_consolidated", True) + """ + Alias for :func:`open_group`. + """ return await open_group(*args, **kwargs) @@ -547,7 +549,7 @@ async def open_group( zarr_format: ZarrFormat | None = None, meta_array: Any | None = None, # not used attributes: dict[str, JSON] | None = None, - open_consolidated: bool = False, + use_consolidated: bool | str | None = None, ) -> AsyncGroup: """Open a group using file-mode-like semantics. @@ -586,6 +588,22 @@ async def open_group( meta_array : array-like, optional An array instance to use for determining arrays to create and return to users. Use `numpy.empty(())` by default. + use_consolidated: bool or str, default None + Whether to use consolidated metadata. 
+ + By default, consolidated metadata is used if it's present in the + store (in the ``zarr.json`` for Zarr v3 and in the ``.zmetadata`` file + for Zarr v2). + + To explicitly require consolidated metadata, set ``use_consolidated=True``, + which will raise an exception if consolidated metadata is not found. + + To explicitly *not* use consolidated metadata, set ``use_consolidated=False``, + which will fall back to using the regular, non consolidated metadata. + + Zarr v2 allowed configuring the key storing the consolidated metadata + (``.zmetadata`` by default). Specify the custom key as ``use_consolidated`` + to load consolidated metadata from a non-default key. Returns ------- @@ -618,7 +636,7 @@ async def open_group( try: return await AsyncGroup.open( - store_path, zarr_format=zarr_format, open_consolidated=open_consolidated + store_path, zarr_format=zarr_format, use_consolidated=use_consolidated ) except (KeyError, FileNotFoundError): return await AsyncGroup.create( diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 93a33b8d3f..aee4cb4e3d 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -200,6 +200,7 @@ def open_group( zarr_version: ZarrFormat | None = None, # deprecated zarr_format: ZarrFormat | None = None, meta_array: Any | None = None, # not used in async api + use_consolidated: bool | str | None = None, ) -> Group: return Group( sync( @@ -214,6 +215,7 @@ def open_group( zarr_version=zarr_version, zarr_format=zarr_format, meta_array=meta_array, + use_consolidated=use_consolidated, ) ) ) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index f174cde429..eb5225a1c3 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -355,7 +355,7 @@ async def open( cls, store: StoreLike, zarr_format: Literal[2, 3, None] = 3, - open_consolidated: bool | str = False, + use_consolidated: bool | str | None = None, ) -> AsyncGroup: """ Open a new AsyncGroup @@ -364,27 +364,32 @@ async def open( 
---------- store: StoreLike zarr_format: {2, 3}, optional - open_consolidated: bool or str, default False - Whether to use consolidated metadata when opening a V2 Group. - This parameter has no effect when ``zarr_format=3`` (the default), - where consolidated metadata is always used when present. + use_consolidated: bool or str, default None + Whether to use consolidated metadata. - By default, consolidated metadata is not used for ``zarr_format=2``, - matching the behavior of zarr-python 2.x. Pass ``open_consolidated=True`` - to use consolidated metadata with the default key ``.zmetadata``. + By default, consolidated metadata is used if it's present in the + store (in the ``zarr.json`` for Zarr v3 and in the ``.zmetadata`` file + for Zarr v2). - Pass a string to use an alternative key in the store, which will be - be relative to the store or ``StorePath``. + To explicitly require consolidated metadata, set ``use_consolidated=True``, + which will raise an exception if consolidated metadata is not found. + + To explicitly *not* use consolidated metadata, set ``use_consolidated=False``, + which will fall back to using the regular, non consolidated metadata. + + Zarr v2 allowed configuring the key storing the consolidated metadata + (``.zmetadata`` by default). Specify the custom key as ``use_consolidated`` + to load consolidated metadata from a non-default key. 
""" store_path = await make_store_path(store) if zarr_format == 2: paths = [store_path / ZGROUP_JSON, store_path / ZATTRS_JSON] - if open_consolidated: - if open_consolidated is True: - open_consolidated = ".zmetadata" + if use_consolidated: + if use_consolidated is True: + use_consolidated = ".zmetadata" - paths.append(store_path / open_consolidated) + paths.append(store_path / use_consolidated) zgroup_bytes, zattrs_bytes, *rest = await asyncio.gather( *[path.get() for path in paths] @@ -392,7 +397,7 @@ async def open( if zgroup_bytes is None: raise FileNotFoundError(store_path) - if open_consolidated: + if use_consolidated: consolidated_metadata_bytes = rest[0] if consolidated_metadata_bytes is None: raise FileNotFoundError(paths[-1]) @@ -413,7 +418,7 @@ async def open( (store_path / ZARR_JSON).get(), (store_path / ZGROUP_JSON).get(), (store_path / ZATTRS_JSON).get(), - (store_path / str(open_consolidated)).get(), + (store_path / str(use_consolidated)).get(), ) if zarr_json_bytes is not None and zgroup_bytes is not None: # TODO: revisit this exception type @@ -438,7 +443,9 @@ async def open( else: # V3 groups are comprised of a zarr.json object assert zarr_json_bytes is not None - return cls._from_bytes_v3(store_path, zarr_json_bytes) + return cls._from_bytes_v3( + store_path, zarr_json_bytes, use_consolidated=bool(use_consolidated) + ) @classmethod def _from_bytes_v2( @@ -483,8 +490,18 @@ def _from_bytes_v2( return cls.from_dict(store_path, group_metadata) @classmethod - def _from_bytes_v3(cls, store_path: StorePath, zarr_json_bytes: Buffer) -> AsyncGroup: + def _from_bytes_v3( + cls, store_path: StorePath, zarr_json_bytes: Buffer, use_consolidated: bool + ) -> AsyncGroup: group_metadata = json.loads(zarr_json_bytes.to_bytes()) + if use_consolidated and group_metadata.get("consolidated_metadata") is None: + msg = f"Consolidated metadata requested with 'use_consolidated=True' but not found in '{store_path.path}'." 
+ # Use `FileNotFoundError` here to match the error from v2? + raise ValueError(msg) + + elif use_consolidated is False: + # Drop consolidated metadata if it's there. + group_metadata.pop("consolidated_metadata", None) return cls.from_dict(store_path, group_metadata) @classmethod diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index 4a4a424e1a..783472e24f 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -868,88 +868,6 @@ async def test_require_array(store: LocalStore | MemoryStore, zarr_format: ZarrF await root.require_array("bar", shape=(10,), dtype="int8") -async def test_group_getitem_consolidated(store: LocalStore | MemoryStore): - root = await AsyncGroup.create(store=store) - # Set up the test structure with - # / - # g0/ # group /g0 - # g1/ # group /g0/g1 - # g2/ # group /g0/g1/g2 - # x1/ # group /x0 - # x2/ # group /x0/x1 - # x3/ # group /x0/x1/x2 - - g0 = await root.create_group("g0") - g1 = await g0.create_group("g1") - await g1.create_group("g2") - - x0 = await root.create_group("x0") - x1 = await x0.create_group("x1") - await x1.create_group("x2") - - await zarr.api.asynchronous.consolidate_metadata(store) - - # On disk, we've consolidated all the metadata in the root zarr.json - group = await zarr.api.asynchronous.open(store=store) - rg0 = await group.getitem("g0") - - expected = ConsolidatedMetadata( - metadata={ - "g1": GroupMetadata( - attributes={}, - zarr_format=3, - consolidated_metadata=ConsolidatedMetadata( - metadata={"g2": GroupMetadata(attributes={}, zarr_format=3)} - ), - ), - } - ) - assert rg0.metadata.consolidated_metadata == expected - - rg1 = await rg0.getitem("g1") - assert rg1.metadata.consolidated_metadata == expected.metadata["g1"].consolidated_metadata - - rg2 = await rg1.getitem("g2") - assert rg2.metadata.consolidated_metadata is None - - -async def test_group_delitem_consolidated(store: Store): - if isinstance(store, ZipStore): - raise pytest.skip("Not implemented") - - root = await 
AsyncGroup.create(store=store) - # Set up the test structure with - # / - # g0/ # group /g0 - # g1/ # group /g0/g1 - # g2/ # group /g0/g1/g2 - # data # array - # x1/ # group /x0 - # x2/ # group /x0/x1 - # x3/ # group /x0/x1/x2 - # data # array - - g0 = await root.create_group("g0") - g1 = await g0.create_group("g1") - g2 = await g1.create_group("g2") - await g2.create_array("data", shape=(1,)) - - x0 = await root.create_group("x0") - x1 = await x0.create_group("x1") - x2 = await x1.create_group("x2") - await x2.create_array("data", shape=(1,)) - - await zarr.api.asynchronous.consolidate_metadata(store) - - group = await zarr.api.asynchronous.open_consolidated(store=store) - assert len(group.metadata.consolidated_metadata.metadata) == 2 - assert "g0" in group.metadata.consolidated_metadata.metadata - - await group.delitem("g0") - assert len(group.metadata.consolidated_metadata.metadata) == 1 - assert "g0" not in group.metadata.consolidated_metadata.metadata - - @pytest.mark.parametrize("consolidate", [True, False]) def test_members_name(store: LocalStore | MemoryStore, consolidate: bool): group = Group.create(store=store) @@ -977,3 +895,125 @@ async def test_open_mutable_mapping(): def test_open_mutable_mapping_sync(): group = open_group(store={}, mode="w") assert isinstance(group.store_path.store, MemoryStore) + + +class TestConsolidated: + async def test_group_getitem_consolidated(self, store: LocalStore | MemoryStore) -> None: + root = await AsyncGroup.create(store=store) + # Set up the test structure with + # / + # g0/ # group /g0 + # g1/ # group /g0/g1 + # g2/ # group /g0/g1/g2 + # x1/ # group /x0 + # x2/ # group /x0/x1 + # x3/ # group /x0/x1/x2 + + g0 = await root.create_group("g0") + g1 = await g0.create_group("g1") + await g1.create_group("g2") + + x0 = await root.create_group("x0") + x1 = await x0.create_group("x1") + await x1.create_group("x2") + + await zarr.api.asynchronous.consolidate_metadata(store) + + # On disk, we've consolidated all the metadata in 
the root zarr.json + group = await zarr.api.asynchronous.open(store=store) + rg0 = await group.getitem("g0") + + expected = ConsolidatedMetadata( + metadata={ + "g1": GroupMetadata( + attributes={}, + zarr_format=3, + consolidated_metadata=ConsolidatedMetadata( + metadata={"g2": GroupMetadata(attributes={}, zarr_format=3)} + ), + ), + } + ) + assert rg0.metadata.consolidated_metadata == expected + + rg1 = await rg0.getitem("g1") + assert rg1.metadata.consolidated_metadata == expected.metadata["g1"].consolidated_metadata + + rg2 = await rg1.getitem("g2") + assert rg2.metadata.consolidated_metadata is None + + async def test_group_delitem_consolidated(self, store: Store) -> None: + if isinstance(store, ZipStore): + raise pytest.skip("Not implemented") + + root = await AsyncGroup.create(store=store) + # Set up the test structure with + # / + # g0/ # group /g0 + # g1/ # group /g0/g1 + # g2/ # group /g0/g1/g2 + # data # array + # x1/ # group /x0 + # x2/ # group /x0/x1 + # x3/ # group /x0/x1/x2 + # data # array + + g0 = await root.create_group("g0") + g1 = await g0.create_group("g1") + g2 = await g1.create_group("g2") + await g2.create_array("data", shape=(1,)) + + x0 = await root.create_group("x0") + x1 = await x0.create_group("x1") + x2 = await x1.create_group("x2") + await x2.create_array("data", shape=(1,)) + + await zarr.api.asynchronous.consolidate_metadata(store) + + group = await zarr.api.asynchronous.open_consolidated(store=store) + assert len(group.metadata.consolidated_metadata.metadata) == 2 + assert "g0" in group.metadata.consolidated_metadata.metadata + + await group.delitem("g0") + assert len(group.metadata.consolidated_metadata.metadata) == 1 + assert "g0" not in group.metadata.consolidated_metadata.metadata + + def test_open_consolidated_raises(self, store: Store) -> None: + if isinstance(store, ZipStore): + raise pytest.skip("Not implemented") + + root = Group.create(store=store) + + # fine to be missing by default + zarr.open_group(store=store) + + 
with pytest.raises(ValueError, match="Consolidated metadata requested."): + zarr.open_group(store=store, use_consolidated=True) + + # Now create consolidated metadata... + root.create_group("g0") + zarr.consolidate_metadata(store) + + # and explicitly ignore it. + group = zarr.open_group(store=store, use_consolidated=False) + assert group.metadata.consolidated_metadata is None + + async def test_open_consolidated_raises_async(self, store: Store) -> None: + if isinstance(store, ZipStore): + raise pytest.skip("Not implemented") + + root = await AsyncGroup.create(store=store) + + # fine to be missing by default + await zarr.api.asynchronous.open_group(store=store) + + with pytest.raises(ValueError, match="Consolidated metadata requested."): + await zarr.api.asynchronous.open_group(store=store, use_consolidated=True) + + # Now create consolidated metadata... + await root.create_group("g0") + await zarr.api.asynchronous.consolidate_metadata(store) + + # and explicitly ignore it. + group = await zarr.api.asynchronous.open_group(store=store, use_consolidated=False) + assert group.metadata.consolidated_metadata is None From c11f1adc075272cf6ee2252a0f7be91d9e8be4dc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 16 Sep 2024 11:27:04 -0500 Subject: [PATCH 27/62] fixup --- src/zarr/core/group.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index eb5225a1c3..24096a844f 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -167,9 +167,8 @@ def _flat_to_nested( # # We want a nested representation, so we need to group by Group, # i.e. find all the children with the same prefix. 
- # keys = sorted(metadata, key=lambda x: (x.count("/"), x)) + # - # ideally we build up a data structure that has {full_key: metadata} metadata = dict(metadata) keys = sorted(metadata, key=lambda k: k.count("/")) @@ -442,9 +441,11 @@ async def open( ) else: # V3 groups are comprised of a zarr.json object + if not isinstance(use_consolidated, bool | None): + raise TypeError("use_consolidated must be a bool for Zarr V3.") assert zarr_json_bytes is not None return cls._from_bytes_v3( - store_path, zarr_json_bytes, use_consolidated=bool(use_consolidated) + store_path, zarr_json_bytes, use_consolidated=use_consolidated ) @classmethod @@ -491,7 +492,7 @@ def _from_bytes_v2( @classmethod def _from_bytes_v3( - cls, store_path: StorePath, zarr_json_bytes: Buffer, use_consolidated: bool + cls, store_path: StorePath, zarr_json_bytes: Buffer, use_consolidated: bool | None ) -> AsyncGroup: group_metadata = json.loads(zarr_json_bytes.to_bytes()) if use_consolidated and group_metadata.get("consolidated_metadata") is None: @@ -502,6 +503,7 @@ def _from_bytes_v3( elif use_consolidated is False: # Drop consolidated metadata if it's there. 
group_metadata.pop("consolidated_metadata", None) + return cls.from_dict(store_path, group_metadata) @classmethod From f6397f4c383867dc369cffb094f1641f338fe199 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 16 Sep 2024 11:37:04 -0500 Subject: [PATCH 28/62] make clear that flat_to_nested mutates --- src/zarr/core/group.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 24096a844f..29d1f3bef8 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -141,14 +141,15 @@ def from_dict(cls, data: dict[str, JSON]) -> ConsolidatedMetadata: else: # probably v2 Group metadata metadata[k] = GroupMetadata.from_dict(v) - metadata = cls._flat_to_nested(metadata) + + cls._flat_to_nested(metadata) return cls(metadata=metadata) @staticmethod def _flat_to_nested( metadata: dict[str, ArrayMetadata | GroupMetadata], - ) -> dict[str, ArrayMetadata | GroupMetadata]: + ) -> None: """ Convert a flat metadata representation to a nested one. @@ -168,15 +169,14 @@ def _flat_to_nested( # We want a nested representation, so we need to group by Group, # i.e. find all the children with the same prefix. # - - metadata = dict(metadata) + # metadata = dict(metadata) keys = sorted(metadata, key=lambda k: k.count("/")) grouped = { k: list(v) for k, v in itertools.groupby(keys, key=lambda k: k.rsplit("/", 1)[0]) } - # # we go top down and directly manipulate metadata. + # we go top down and directly manipulate metadata. for key, children_keys in grouped.items(): # key is a key like "a", "a/b", "a/b/c" # The basic idea is to find the immediate parent (so "", "a", or "a/b") @@ -191,7 +191,8 @@ def _flat_to_nested( part = prefixes.pop(0) # we can assume that parent[part] here is a group # otherwise we wouldn't have a node with this `part` prefix. - # We can also assume that the parent node will have consolidated metadata. 
+ # We can also assume that the parent node will have consolidated metadata, + # because we're walking top to bottom. parent = parent[part].consolidated_metadata.metadata # type: ignore[union-attr] node = parent[name] @@ -212,7 +213,6 @@ def _flat_to_nested( parent[name] = replace( node, consolidated_metadata=ConsolidatedMetadata(metadata=children) ) - return metadata @property def flattened_metadata(self) -> dict[str, ArrayMetadata | GroupMetadata]: From 123dc6091a6d2b0a9526e60d2887e090ef4849a4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 16 Sep 2024 18:22:07 -0500 Subject: [PATCH 29/62] fixup --- src/zarr/core/group.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 29d1f3bef8..f5e75d5261 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -382,13 +382,17 @@ async def open( """ store_path = await make_store_path(store) + if (zarr_format is None or zarr_format == 2) and use_consolidated in (None, True): + use_consolidated = ".zmetadata" + + elif zarr_format == 3 and not isinstance(use_consolidated, bool | None): + raise TypeError("use_consolidated must be a bool for Zarr V3.") + if zarr_format == 2: paths = [store_path / ZGROUP_JSON, store_path / ZATTRS_JSON] if use_consolidated: - if use_consolidated is True: - use_consolidated = ".zmetadata" - - paths.append(store_path / use_consolidated) + # we've cast to a str here, but mypy doesn't see that + paths.append(store_path / use_consolidated) # type: ignore[operator] zgroup_bytes, zattrs_bytes, *rest = await asyncio.gather( *[path.get() for path in paths] @@ -441,11 +445,12 @@ async def open( ) else: # V3 groups are comprised of a zarr.json object - if not isinstance(use_consolidated, bool | None): - raise TypeError("use_consolidated must be a bool for Zarr V3.") assert zarr_json_bytes is not None + # we've cast to a str here but mypy doesn't see that. 
return cls._from_bytes_v3( - store_path, zarr_json_bytes, use_consolidated=use_consolidated + store_path, + zarr_json_bytes, + use_consolidated=use_consolidated, # type: ignore[arg-type] ) @classmethod From 34c77209998ab30a771d0bbbb412c388937611b4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 16 Sep 2024 20:41:27 -0500 Subject: [PATCH 30/62] fixup --- src/zarr/core/group.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index f5e75d5261..71fa96bdb7 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -382,8 +382,9 @@ async def open( """ store_path = await make_store_path(store) - if (zarr_format is None or zarr_format == 2) and use_consolidated in (None, True): - use_consolidated = ".zmetadata" + consolidated_key = ".zmetadata" + if zarr_format == 2 or zarr_format is None and isinstance(use_consolidated, str): + consolidated_key = use_consolidated # type: ignore[assignment] elif zarr_format == 3 and not isinstance(use_consolidated, bool | None): raise TypeError("use_consolidated must be a bool for Zarr V3.") @@ -391,8 +392,7 @@ async def open( if zarr_format == 2: paths = [store_path / ZGROUP_JSON, store_path / ZATTRS_JSON] if use_consolidated: - # we've cast to a str here, but mypy doesn't see that - paths.append(store_path / use_consolidated) # type: ignore[operator] + paths.append(store_path / consolidated_key) zgroup_bytes, zattrs_bytes, *rest = await asyncio.gather( *[path.get() for path in paths] @@ -421,7 +421,7 @@ async def open( (store_path / ZARR_JSON).get(), (store_path / ZGROUP_JSON).get(), (store_path / ZATTRS_JSON).get(), - (store_path / str(use_consolidated)).get(), + (store_path / str(consolidated_key)).get(), ) if zarr_json_bytes is not None and zgroup_bytes is not None: # TODO: revisit this exception type @@ -446,11 +446,11 @@ async def open( else: # V3 groups are comprised of a zarr.json object assert zarr_json_bytes is not None - # we've 
cast to a str here but mypy doesn't see that. + use_consolidated = use_consolidated in (None, True) return cls._from_bytes_v3( store_path, zarr_json_bytes, - use_consolidated=use_consolidated, # type: ignore[arg-type] + use_consolidated=use_consolidated, ) @classmethod From 4db042b3cab2d949980dd762fe02586f55e1a8fa Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 16 Sep 2024 20:42:35 -0500 Subject: [PATCH 31/62] Fixup --- src/zarr/core/group.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 71fa96bdb7..b7ebfe43d9 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -386,9 +386,6 @@ async def open( if zarr_format == 2 or zarr_format is None and isinstance(use_consolidated, str): consolidated_key = use_consolidated # type: ignore[assignment] - elif zarr_format == 3 and not isinstance(use_consolidated, bool | None): - raise TypeError("use_consolidated must be a bool for Zarr V3.") - if zarr_format == 2: paths = [store_path / ZGROUP_JSON, store_path / ZATTRS_JSON] if use_consolidated: @@ -446,7 +443,9 @@ async def open( else: # V3 groups are comprised of a zarr.json object assert zarr_json_bytes is not None - use_consolidated = use_consolidated in (None, True) + if not isinstance(use_consolidated, bool | None): + raise TypeError("use_consolidated must be a bool for Zarr V3.") + return cls._from_bytes_v3( store_path, zarr_json_bytes, From a1f1ebbcbb97879a24dd347baf5104c860306ff5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 20 Sep 2024 10:46:25 -0500 Subject: [PATCH 32/62] fixup --- src/zarr/api/asynchronous.py | 1 + src/zarr/core/group.py | 27 +++++++++++++++++++++------ tests/v3/test_metadata/test_v3.py | 19 +++++++++++++++++++ 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index c11bf8729b..1a8fd31b29 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ 
-177,6 +177,7 @@ async def consolidate_metadata( members_metadata = {k: v.metadata for k, v in members.items()} + ConsolidatedMetadata._flat_to_nested(members_metadata) consolidated_metadata = ConsolidatedMetadata(metadata=members_metadata) metadata = dataclasses.replace(group.metadata, consolidated_metadata=consolidated_metadata) group = dataclasses.replace( diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 744498b049..29ddb626f7 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -102,7 +102,7 @@ def to_dict(self) -> dict[str, JSON]: return { "kind": self.kind, "must_understand": self.must_understand, - "metadata": {k: v.to_dict() for k, v in self.metadata.items()}, + "metadata": {k: v.to_dict() for k, v in self.flattened_metadata.items()}, } @classmethod @@ -165,10 +165,17 @@ def _flat_to_nested( under the consolidated metadata of their immediate parent. """ # We have a flat mapping from {k: v} where the keys include the *full* - # path segment. + # path segment: + # { + # "/a/b": { group_metadata }, + # "/a/b/array-0": { array_metadata }, + # "/a/b/array-1": { array_metadata }, + # } # - # We want a nested representation, so we need to group by Group, - # i.e. find all the children with the same prefix. + # We want to reorganize the metadata such that each Group contains the + # array metadata of its immediate children. + # In the example, the group at `/a/b` will have consolidated metadata + # for its children `array-0` and `array-1`. # # metadata = dict(metadata) @@ -972,6 +979,12 @@ async def members( ``max_depth=None`` to include all nodes, and some positive integer to consider children within that many levels of the root Group. + Returns + ------- + path: + A string giving the path to the target, relative to the Group ``self``. + value: AsyncArray or AsyncGroup + The AsyncArray or AsyncGroup that is a child of ``self``. """ if max_depth is not None and max_depth < 0: raise ValueError(f"max_depth must be None or >= 0. 
Got '{max_depth}' instead") @@ -1046,7 +1059,7 @@ def _members_consolidated( obj = self._getitem_consolidated(self.store_path, key) # Metadata -> Group/Array # this is probably generally useful key = "/".join([prefix, key]).lstrip("/") - yield key, obj + # yield key, obj if ((max_depth is None) or (current_depth < max_depth)) and isinstance( obj, AsyncGroup @@ -1063,7 +1076,9 @@ async def contains(self, member: str) -> bool: return True async def groups(self) -> AsyncGenerator[tuple[str, AsyncGroup], None]: - async for name, value in self.members(): + members = [x async for x in self.members()] + # async for name, value in self.members(): + for name, value in members: if isinstance(value, AsyncGroup): yield name, value diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py index a3d56bdea6..bd34265dc5 100644 --- a/tests/v3/test_metadata/test_v3.py +++ b/tests/v3/test_metadata/test_v3.py @@ -410,6 +410,25 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: group4 = await open_consolidated(store=memory_store_with_hierarchy) assert group4.metadata == expected + result_raw = json.loads( + ( + await memory_store_with_hierarchy.get( + "zarr.json", prototype=default_buffer_prototype() + ) + ).to_bytes() + )["consolidated_metadata"] + assert result_raw["kind"] == "inline" + assert sorted(result_raw["metadata"]) == [ + "air", + "child", + "child/array", + "child/grandchild", + "child/grandchild/array", + "lat", + "lon", + "time", + ] + def test_consolidated_sync(self, memory_store): g = zarr.api.synchronous.group(store=memory_store, attributes={"foo": "bar"}) g.create_array(name="air", shape=(1, 2, 3)) From 35a383289a6d64cbdeebbd23c6bdf595170130f7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 20 Sep 2024 12:08:25 -0500 Subject: [PATCH 33/62] fixup --- src/zarr/core/group.py | 2 +- tests/v3/test_metadata/test_consolidated.py | 37 +++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff 
--git a/src/zarr/core/group.py b/src/zarr/core/group.py index 29ddb626f7..0b399f5f33 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1059,7 +1059,7 @@ def _members_consolidated( obj = self._getitem_consolidated(self.store_path, key) # Metadata -> Group/Array # this is probably generally useful key = "/".join([prefix, key]).lstrip("/") - # yield key, obj + yield key, obj if ((max_depth is None) or (current_depth < max_depth)) and isinstance( obj, AsyncGroup diff --git a/tests/v3/test_metadata/test_consolidated.py b/tests/v3/test_metadata/test_consolidated.py index b5d12d65de..84667adfdb 100644 --- a/tests/v3/test_metadata/test_consolidated.py +++ b/tests/v3/test_metadata/test_consolidated.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json from typing import TYPE_CHECKING import numpy as np @@ -13,6 +14,7 @@ open, open_consolidated, ) +from zarr.core.buffer.core import default_buffer_prototype from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV3Metadata from zarr.store.common import StorePath @@ -21,6 +23,22 @@ from zarr.abc.store import Store +@pytest.fixture +async def memory_store_with_hierarchy(memory_store: Store) -> None: + g = await group(store=memory_store, attributes={"foo": "bar"}) + await g.create_array(name="air", shape=(1, 2, 3)) + await g.create_array(name="lat", shape=(1,)) + await g.create_array(name="lon", shape=(2,)) + await g.create_array(name="time", shape=(3,)) + + child = await g.create_group("child", attributes={"key": "child"}) + await child.create_array("array", shape=(4, 4), attributes={"key": "child"}) + + grandchild = await child.create_group("grandchild", attributes={"key": "grandchild"}) + await grandchild.create_array("array", shape=(4, 4), attributes={"key": "grandchild"}) + return memory_store + + class TestConsolidated: async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: # TODO: Figure out desired keys in @@ -151,6 
+169,25 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: group4 = await open_consolidated(store=memory_store_with_hierarchy) assert group4.metadata == expected + result_raw = json.loads( + ( + await memory_store_with_hierarchy.get( + "zarr.json", prototype=default_buffer_prototype() + ) + ).to_bytes() + )["consolidated_metadata"] + assert result_raw["kind"] == "inline" + assert sorted(result_raw["metadata"]) == [ + "air", + "child", + "child/array", + "child/grandchild", + "child/grandchild/array", + "lat", + "lon", + "time", + ] + def test_consolidated_sync(self, memory_store): g = zarr.api.synchronous.group(store=memory_store, attributes={"foo": "bar"}) g.create_array(name="air", shape=(1, 2, 3)) From d03f4bda6b9fcaf820d5bb60260c1c0f96e2060c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 20 Sep 2024 12:12:33 -0500 Subject: [PATCH 34/62] fixup --- tests/v3/test_group.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index 5ef0888c47..299a8b721b 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -884,20 +884,6 @@ async def test_require_group(store: LocalStore | MemoryStore, zarr_format: ZarrF await foo_group.require_group("bar") -async def test_create_dataset(store: Store, zarr_format: ZarrFormat) -> None: - root = await AsyncGroup.from_store(store=store, zarr_format=zarr_format) - with pytest.warns(DeprecationWarning): - foo = await root.create_dataset("foo", shape=(10,), dtype="uint8") - assert foo.shape == (10,) - - with pytest.raises(ContainsArrayError), pytest.warns(DeprecationWarning): - await root.create_dataset("foo", shape=(100,), dtype="int8") - - _ = await root.create_group("bar") - with pytest.raises(ContainsGroupError), pytest.warns(DeprecationWarning): - await root.create_dataset("bar", shape=(100,), dtype="int8") - - async def test_require_groups(store: LocalStore | MemoryStore, zarr_format: 
ZarrFormat) -> None: root = await AsyncGroup.from_store(store=store, zarr_format=zarr_format) # create foo group @@ -919,6 +905,20 @@ async def test_require_groups(store: LocalStore | MemoryStore, zarr_format: Zarr assert no_group == () +async def test_create_dataset(store: Store, zarr_format: ZarrFormat) -> None: + root = await AsyncGroup.from_store(store=store, zarr_format=zarr_format) + with pytest.warns(DeprecationWarning): + foo = await root.create_dataset("foo", shape=(10,), dtype="uint8") + assert foo.shape == (10,) + + with pytest.raises(ContainsArrayError), pytest.warns(DeprecationWarning): + await root.create_dataset("foo", shape=(100,), dtype="int8") + + _ = await root.create_group("bar") + with pytest.raises(ContainsGroupError), pytest.warns(DeprecationWarning): + await root.create_dataset("bar", shape=(100,), dtype="int8") + + async def test_require_array(store: Store, zarr_format: ZarrFormat) -> None: root = await AsyncGroup.from_store(store=store, zarr_format=zarr_format) foo1 = await root.require_array("foo", shape=(10,), dtype="i8", attributes={"foo": 101}) From cddd01fc775af99abb9f37080dd5a2709ef956e6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 20 Sep 2024 14:13:24 -0500 Subject: [PATCH 35/62] fixup --- src/zarr/api/asynchronous.py | 4 ++-- src/zarr/api/synchronous.py | 6 ++++-- src/zarr/core/group.py | 25 +++++++++++++------------ tests/v3/test_metadata/test_v2.py | 12 ++++++------ 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 156abe2b43..03f4b1f1ec 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -169,7 +169,7 @@ async def consolidate_metadata( if path is not None: store_path = store_path / path - group = await AsyncGroup.open(store_path, zarr_format=zarr_format) + group = await AsyncGroup.open(store_path, zarr_format=zarr_format, use_consolidated=False) group.store_path.store._check_writable() members = dict([x async 
for x in group.members(max_depth=None)]) @@ -290,7 +290,7 @@ async def open( return await open_group(store=store_path, zarr_format=zarr_format, mode=mode, **kwargs) -async def open_consolidated(*args: Any, **kwargs: Any) -> AsyncGroup: +async def open_consolidated(*args: Any, use_consolidated: bool = True, **kwargs: Any) -> AsyncGroup: """ Alias for :func:`open_group`. """ diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 4dcd3036ee..5aa314886f 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -90,8 +90,10 @@ def open( return Group(obj) -def open_consolidated(*args: Any, **kwargs: Any) -> Group: - return Group(sync(async_api.open_consolidated(*args, **kwargs))) +def open_consolidated(*args: Any, use_consolidated: bool = True, **kwargs: Any) -> Group: + return Group( + sync(async_api.open_consolidated(*args, use_consolidated=use_consolidated, **kwargs)) + ) def save( diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 2e9288d721..00efe2c721 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -391,12 +391,12 @@ async def open( store_path = await make_store_path(store) consolidated_key = ".zmetadata" - if zarr_format == 2 or zarr_format is None and isinstance(use_consolidated, str): - consolidated_key = use_consolidated # type: ignore[assignment] + if (zarr_format == 2 or zarr_format is None) and isinstance(use_consolidated, str): + consolidated_key = use_consolidated if zarr_format == 2: paths = [store_path / ZGROUP_JSON, store_path / ZATTRS_JSON] - if use_consolidated: + if use_consolidated or use_consolidated is None: paths.append(store_path / consolidated_key) zgroup_bytes, zattrs_bytes, *rest = await asyncio.gather( @@ -405,12 +405,14 @@ async def open( if zgroup_bytes is None: raise FileNotFoundError(store_path) - if use_consolidated: - consolidated_metadata_bytes = rest[0] - if consolidated_metadata_bytes is None: + if use_consolidated or use_consolidated is None: + 
maybe_consolidated_metadata_bytes = rest[0] + + if use_consolidated and maybe_consolidated_metadata_bytes is None: + # the user requested consolidated metadata, but it was missing raise FileNotFoundError(paths[-1]) else: - consolidated_metadata_bytes = None + maybe_consolidated_metadata_bytes = None elif zarr_format == 3: zarr_json_bytes = await (store_path / ZARR_JSON).get() @@ -421,7 +423,7 @@ async def open( zarr_json_bytes, zgroup_bytes, zattrs_bytes, - consolidated_metadata_bytes, + maybe_consolidated_metadata_bytes, ) = await asyncio.gather( (store_path / ZARR_JSON).get(), (store_path / ZGROUP_JSON).get(), @@ -446,7 +448,7 @@ async def open( # this is checked above, asserting here for mypy assert zgroup_bytes is not None return cls._from_bytes_v2( - store_path, zgroup_bytes, zattrs_bytes, consolidated_metadata_bytes + store_path, zgroup_bytes, zattrs_bytes, maybe_consolidated_metadata_bytes ) else: # V3 groups are comprised of a zarr.json object @@ -632,10 +634,9 @@ async def delitem(self, key: str) -> None: if self.metadata.consolidated_metadata: self.metadata.consolidated_metadata.metadata.pop(key, None) - # FIXME: This should probably rewrite all the consolidated metadata? - # Or do we expect users to reconsolidate? 
+ await self._save_metadata(drop_consolidated_metadata=True) - async def _save_metadata(self) -> None: + async def _save_metadata(self, drop_consolidated_metadata: bool = False) -> None: to_save = self.metadata.to_buffer_dict(default_buffer_prototype()) awaitables = [set_or_delete(self.store_path / key, value) for key, value in to_save.items()] await asyncio.gather(*awaitables) diff --git a/tests/v3/test_metadata/test_v2.py b/tests/v3/test_metadata/test_v2.py index 75ebce9f32..29d4f7e867 100644 --- a/tests/v3/test_metadata/test_v2.py +++ b/tests/v3/test_metadata/test_v2.py @@ -93,7 +93,7 @@ async def v2_consolidated_metadata( ".zgroup": {"zarr_format": 2}, "air/.zarray": { "chunks": [730], - "compressor": {}, + "compressor": None, "dtype": " Date: Fri, 20 Sep 2024 14:13:45 -0500 Subject: [PATCH 36/62] added docs --- docs/consolidated-metadata.rst | 74 ++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 docs/consolidated-metadata.rst diff --git a/docs/consolidated-metadata.rst b/docs/consolidated-metadata.rst new file mode 100644 index 0000000000..1a8ec2b946 --- /dev/null +++ b/docs/consolidated-metadata.rst @@ -0,0 +1,74 @@ +Consolidated Metadata +===================== + +zarr-python implements the `Consolidated Metadata_` extension to the Zarr Spec. +Consolidated metadata can reduce the time needed to load the metadata for an +entire hierarchy, especially when the metadata is being served over a network. +Consolidated metadata essentially stores all the metadata for a hierarchy in the +metadata of the root Group. + +Usage +----- + +If consolidated metadata is present in a Zarr Group's metadata then it is used +by default. The initial read to open the group will need to communicate with +the store (reading from a file for a :class:`zarr.store.LocalStore`, making a +network request for a :class:`zarr.store.RemoteStore`). 
After that, any subsequent +metadata reads get child Group or Array nodes will *not* require reads from the store. + +In Python, the consolidated metadata is available on the ``.consolidated_metadata`` +attribute of the Group. + +.. code-block:: python + + >>> import zarr + >>> store = zarr.store.MemoryStore({}, mode="w") + >>> group = zarr.open_group(store=store) + >>> group.create_array(shape=(1,), name="a") + >>> group.create_array(shape=(2, 2), name="b") + >>> group.create_array(shape=(3, 3, 3), name="c") + >>> zarr.consolidate_metadata(store) + +If we open that group, the Group's metadata has a :class:`zarr.ConsolidatedMetadata` +that can be used. + +.. code-block:: python + + >>> consolidated = zarr.open_group(store=store) + >>> consolidated.metadata.consolidated_metadata.metadata + {'b': ArrayV3Metadata(shape=(2, 2), fill_value=np.float64(0.0), ...), + 'a': ArrayV3Metadata(shape=(1,), fill_value=np.float64(0.0), ...), + 'c': ArrayV3Metadata(shape=(3, 3, 3), fill_value=np.float64(0.0), ...)} + +Operations on the group to get children automatically use the consolidated metadata. + +.. code-block:: python + + >>> consolidated["a"] # no read / HTTP request to the Store is required + + +With nested groups, the consolidated metadata is available on the children, recursively. + +... code-block:: python + + >>> child = group.create_group("child", attributes={"kind": "child"}) + >>> grandchild = child.create_group("child", attributes={"kind": "grandchild"}) + >>> consolidated = zarr.consolidate_metadata(store) + + >>> consolidated["child"].metadata.consolidated_metadata + ConsolidatedMetadata(metadata={'child': GroupMetadata(attributes={'kind': 'grandchild'}, zarr_format=3, )}, ...) + +Synchronization and Concurrency +------------------------------- + +Consolidated metadata is intended for read-heavy use cases on slowly changing +hierarchies. For hierarchies where new nodes are constantly being added, +removed, or modified, consolidated metadata may not be desirable. 
+ +1. It will add some overhead to each update operation, since the metadata + would need to be re-consolidated to keep it in sync with the store. +2. Readers using consolidated metadata will regularly see a "past" version + of the metadata, at the time they read the root node with its consolidated + metadata. + +.. _Consolidated Metadata: https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#consolidated-metadata \ No newline at end of file From 87b65f16371bc6dfdd710c87c11a7ec06660cb66 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 20 Sep 2024 14:34:18 -0500 Subject: [PATCH 37/62] fixup --- tests/v3/test_group.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index 299a8b721b..5d24bfec15 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -92,7 +92,7 @@ def test_group_members(store: Store, zarr_format: ZarrFormat, consolidated_metad # subarray path = "group" - group = Group.create( + group = Group.from_store( store=store, zarr_format=zarr_format, ) @@ -801,7 +801,7 @@ def test_serializable_sync_group(store: LocalStore, zarr_format: ZarrFormat) -> @pytest.mark.parametrize("consolidated_metadata", [True, False]) async def test_group_members_async(store: Store, consolidated_metadata: bool) -> None: - group = await AsyncGroup.create( + group = await AsyncGroup.from_store( store=store, ) a0 = await group.create_array("a0", shape=(1,)) From ee5d130d49032f38c36a62c073bef78ef950fcfd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2024 10:25:19 -0500 Subject: [PATCH 38/62] Ensure empty dict --- src/zarr/api/asynchronous.py | 13 +++-- src/zarr/core/group.py | 9 ++-- tests/v3/test_group.py | 39 ++++++++++++++ tests/v3/test_metadata/test_consolidated.py | 56 ++++++++++++++++++--- 4 files changed, 104 insertions(+), 13 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 03f4b1f1ec..95b59985c5 100644 --- 
a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -19,7 +19,7 @@ ZarrFormat, ) from zarr.core.config import config -from zarr.core.group import AsyncGroup, ConsolidatedMetadata +from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata from zarr.store import ( StoreLike, @@ -172,12 +172,17 @@ async def consolidate_metadata( group = await AsyncGroup.open(store_path, zarr_format=zarr_format, use_consolidated=False) group.store_path.store._check_writable() - members = dict([x async for x in group.members(max_depth=None)]) - members_metadata = {} + members_metadata = {k: v.metadata async for k, v in group.members(max_depth=None)} - members_metadata = {k: v.metadata for k, v in members.items()} + # While consolidating, we want to be explicit about when child groups + # are empty by inserting an empty dict for consolidated_metadata.metadata + for k, v in members_metadata.items(): + if isinstance(v, GroupMetadata) and v.consolidated_metadata is None: + v = dataclasses.replace(v, consolidated_metadata=ConsolidatedMetadata(metadata={})) + members_metadata[k] = v ConsolidatedMetadata._flat_to_nested(members_metadata) + consolidated_metadata = ConsolidatedMetadata(metadata=members_metadata) metadata = dataclasses.replace(group.metadata, consolidated_metadata=consolidated_metadata) group = dataclasses.replace( diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 00efe2c721..e9deeb6e43 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -261,15 +261,18 @@ def flatten( if isinstance(group, ArrayMetadata): children[key] = group else: - children[key] = replace(group, consolidated_metadata=None) - if group.consolidated_metadata and group.consolidated_metadata.metadata: + if group.consolidated_metadata and group.consolidated_metadata.metadata is not None: + children[key] = replace( + group, consolidated_metadata=ConsolidatedMetadata(metadata={}) + ) for 
name, val in group.consolidated_metadata.metadata.items(): full_key = "/".join([key, name]) if isinstance(val, GroupMetadata): children.update(flatten(full_key, val)) else: children[full_key] = val - + else: + children[key] = replace(group, consolidated_metadata=None) return children for k, v in self.metadata.items(): diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index 5d24bfec15..88a4851538 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -345,6 +345,45 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat if consolidate: group = zarr.consolidate_metadata(store) + object.__setattr__( + expected_group_values[0].metadata, + "consolidated_metadata", + ConsolidatedMetadata.from_dict( + { + "kind": "inline", + "metadata": { + "subarray": { + "attributes": {}, + "chunk_grid": { + "configuration": {"chunk_shape": (1,)}, + "name": "regular", + }, + "chunk_key_encoding": { + "configuration": {"separator": "/"}, + "name": "default", + }, + "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "data_type": "float64", + "fill_value": np.float64(0.0), + "node_type": "array", + "shape": (1,), + "zarr_format": 3, + }, + "subgroup": { + "attributes": {}, + "consolidated_metadata": { + "metadata": {}, + "kind": "inline", + "must_understand": False, + }, + "node_type": "group", + "zarr_format": 3, + }, + }, + "must_understand": False, + } + ), + ) result = sorted(group.groups(), key=lambda x: x[0]) assert result == expected_groups diff --git a/tests/v3/test_metadata/test_consolidated.py b/tests/v3/test_metadata/test_consolidated.py index 84667adfdb..56dab60a7b 100644 --- a/tests/v3/test_metadata/test_consolidated.py +++ b/tests/v3/test_metadata/test_consolidated.py @@ -36,6 +36,7 @@ async def memory_store_with_hierarchy(memory_store: Store) -> None: grandchild = await child.create_group("grandchild", attributes={"key": "grandchild"}) await grandchild.create_array("array", shape=(4, 4), 
attributes={"key": "grandchild"}) + await grandchild.create_group("empty_group", attributes={"key": "empty"}) return memory_store @@ -137,6 +138,13 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: attributes={"key": "grandchild"}, consolidated_metadata=ConsolidatedMetadata( metadata={ + # known to be empty child group + "empty_group": GroupMetadata( + consolidated_metadata=ConsolidatedMetadata( + metadata={} + ), + attributes={"key": "empty"}, + ), "array": ArrayV3Metadata.from_dict( { **array_metadata, @@ -151,7 +159,7 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: }, }, } - ) + ), } ), ), @@ -183,6 +191,7 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: "child/array", "child/grandchild", "child/grandchild/array", + "child/grandchild/empty_group", "lat", "lon", "time", @@ -408,12 +417,19 @@ def test_flatten(self): expected = { "air": metadata.metadata["air"], "lat": metadata.metadata["lat"], - "child": GroupMetadata(attributes={"key": "child"}), + "child": GroupMetadata( + attributes={"key": "child"}, consolidated_metadata=ConsolidatedMetadata(metadata={}) + ), "child/array": metadata.metadata["child"].consolidated_metadata.metadata["array"], - "child/grandchild": GroupMetadata(attributes={"key": "grandchild"}), - "child/grandchild/array": metadata.metadata["child"] - .consolidated_metadata.metadata["grandchild"] - .consolidated_metadata.metadata["array"], + "child/grandchild": GroupMetadata( + attributes={"key": "grandchild"}, + consolidated_metadata=ConsolidatedMetadata(metadata={}), + ), + "child/grandchild/array": ( + metadata.metadata["child"] + .consolidated_metadata.metadata["grandchild"] + .consolidated_metadata.metadata["array"] + ), } assert result == expected @@ -428,3 +444,31 @@ def test_invalid_metadata_raises(self): with pytest.raises(TypeError, match="key='foo', type='list'"): ConsolidatedMetadata.from_dict(payload) + + def test_to_dict_empty(self): 
+ meta = ConsolidatedMetadata( + metadata={ + "empty": GroupMetadata( + attributes={"key": "empty"}, + consolidated_metadata=ConsolidatedMetadata(metadata={}), + ) + } + ) + result = meta.to_dict() + expected = { + "kind": "inline", + "must_understand": False, + "metadata": { + "empty": { + "attributes": {"key": "empty"}, + "consolidated_metadata": { + "kind": "inline", + "must_understand": False, + "metadata": {}, + }, + "node_type": "group", + "zarr_format": 3, + } + }, + } + assert result == expected From af9788faf4df59dd359a25bf6a0bb96adb695140 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2024 10:38:27 -0500 Subject: [PATCH 39/62] fixed name --- src/zarr/core/group.py | 12 ++++++++---- tests/v3/test_group.py | 16 +++++++++++----- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index e9deeb6e43..bfbbad1acb 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -544,7 +544,7 @@ async def getitem( # Consolidated metadata lets us avoid some I/O operations so try that first. if self.metadata.consolidated_metadata is not None: - return self._getitem_consolidated(store_path, key) + return self._getitem_consolidated(store_path, key, prefix=self.name) # Note: # in zarr-python v2, we first check if `key` references an Array, else if `key` references @@ -595,7 +595,9 @@ async def getitem( else: raise ValueError(f"unexpected zarr_format: {self.metadata.zarr_format}") - def _getitem_consolidated(self, store_path: StorePath, key: str) -> AsyncArray | AsyncGroup: + def _getitem_consolidated( + self, store_path: StorePath, key: str, prefix: str + ) -> AsyncArray | AsyncGroup: # getitem, in the special case where we have consolidated metadata. # Note that this is a regular def (non async) function. # This shouldn't do any additional I/O. 
@@ -613,6 +615,7 @@ def _getitem_consolidated(self, store_path: StorePath, key: str) -> AsyncArray | raise KeyError(msg) from e # update store_path to ensure that AsyncArray/Group.name is correct + key = "/".join([prefix.lstrip("/"), key]) store_path = StorePath(store=store_path.store, path=key) if isinstance(metadata, GroupMetadata): @@ -1075,8 +1078,9 @@ def _members_consolidated( # we kind of just want the top-level keys. if consolidated_metadata is not None: for key in consolidated_metadata.metadata.keys(): - obj = self._getitem_consolidated(self.store_path, key) # Metadata -> Group/Array - # this is probably generally useful + obj = self._getitem_consolidated( + self.store_path, key, prefix=self.name + ) # Metadata -> Group/Array key = "/".join([prefix, key]).lstrip("/") yield key, obj diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index 88a4851538..eb62d9695e 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -254,6 +254,9 @@ def test_group_getitem(store: Store, zarr_format: ZarrFormat, consolidated: bool if consolidated: group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format) + object.__setattr__( + subgroup.metadata, "consolidated_metadata", ConsolidatedMetadata(metadata={}) + ) assert group["subgroup"] == subgroup assert group["subarray"] == subarray @@ -275,6 +278,9 @@ def test_group_delitem(store: Store, zarr_format: ZarrFormat, consolidated: bool if consolidated: group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format) + object.__setattr__( + subgroup.metadata, "consolidated_metadata", ConsolidatedMetadata(metadata={}) + ) assert group["subgroup"] == subgroup assert group["subarray"] == subarray @@ -985,7 +991,7 @@ async def test_require_array(store: Store, zarr_format: ZarrFormat) -> None: @pytest.mark.parametrize("consolidate", [True, False]) def test_members_name(store: Store, consolidate: bool): - group = Group.create(store=store) + group = 
Group.from_store(store=store) a = group.create_group(name="a") a.create_array("array", shape=(1,)) b = a.create_group(name="b") @@ -1014,7 +1020,7 @@ def test_open_mutable_mapping_sync(): class TestConsolidated: async def test_group_getitem_consolidated(self, store: Store) -> None: - root = await AsyncGroup.create(store=store) + root = await AsyncGroup.from_store(store=store) # Set up the test structure with # / # g0/ # group /g0 @@ -1061,7 +1067,7 @@ async def test_group_delitem_consolidated(self, store: Store) -> None: if isinstance(store, ZipStore): raise pytest.skip("Not implemented") - root = await AsyncGroup.create(store=store) + root = await AsyncGroup.from_store(store=store) # Set up the test structure with # / # g0/ # group /g0 @@ -1097,7 +1103,7 @@ def test_open_consolidated_raises(self, store: Store) -> None: if isinstance(store, ZipStore): raise pytest.skip("Not implemented") - root = Group.create(store=store) + root = Group.from_store(store=store) # fine to be missing by default zarr.open_group(store=store) @@ -1117,7 +1123,7 @@ async def test_open_consolidated_raises_async(self, store: Store) -> None: if isinstance(store, ZipStore): raise pytest.skip("Not implemented") - root = await AsyncGroup.create(store=store) + root = await AsyncGroup.from_store(store=store) # fine to be missing by default await zarr.api.asynchronous.open_group(store=store) From 5a08466944b6e7859619b7dfcf9048faac3eb73e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2024 10:46:45 -0500 Subject: [PATCH 40/62] fixup nested --- src/zarr/core/group.py | 4 +++- tests/v3/test_group.py | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index bfbbad1acb..ef53f3fbd7 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -615,7 +615,8 @@ def _getitem_consolidated( raise KeyError(msg) from e # update store_path to ensure that AsyncArray/Group.name is correct - key = "/".join([prefix.lstrip("/"), 
key]) + if prefix != "/": + key = "/".join([prefix.lstrip("/"), key]) store_path = StorePath(store=store_path.store, path=key) if isinstance(metadata, GroupMetadata): @@ -1082,6 +1083,7 @@ def _members_consolidated( self.store_path, key, prefix=self.name ) # Metadata -> Group/Array key = "/".join([prefix, key]).lstrip("/") + # breakpoint() yield key, obj if ((max_depth is None) or (current_depth < max_depth)) and isinstance( diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index eb62d9695e..dc8ac3141a 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -390,6 +390,11 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat } ), ) + object.__setattr__( + expected_group_values[1].metadata, + "consolidated_metadata", + ConsolidatedMetadata(metadata={}), + ) result = sorted(group.groups(), key=lambda x: x[0]) assert result == expected_groups From d236e5305c514907e9ea5aa9f1fbb886b835da1c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2024 11:19:51 -0500 Subject: [PATCH 41/62] removed dupe tests --- src/zarr/api/asynchronous.py | 4 +- src/zarr/core/group.py | 4 +- tests/v3/test_group.py | 93 +++++++++++++------- tests/v3/test_metadata/test_v3.py | 139 ------------------------------ 4 files changed, 68 insertions(+), 172 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 95b59985c5..fe87b4cdf1 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -139,7 +139,9 @@ def _default_zarr_version() -> ZarrFormat: async def consolidate_metadata( - store: StoreLike, path: str | None = None, zarr_format: ZarrFormat = 3 + store: StoreLike, + path: str | None = None, + zarr_format: ZarrFormat | None = None, ) -> AsyncGroup: """ Consolidate the metadata of all nodes in a hierarchy. 
diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index ef53f3fbd7..e9eecee483 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -132,8 +132,10 @@ def from_dict(cls, data: dict[str, JSON]) -> ConsolidatedMetadata: node_type = v.get("node_type", None) if node_type == "group": metadata[k] = GroupMetadata.from_dict(v) - elif node_type == "array": + elif node_type == "array" and v.get("zarr_format") == 3: metadata[k] = ArrayV3Metadata.from_dict(v) + elif node_type == "array": + metadata[k] = ArrayV2Metadata.from_dict(v) else: # We either have V2 metadata, or invalid metadata if "shape" in v: diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index dc8ac3141a..b61b35befd 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -351,41 +351,66 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat if consolidate: group = zarr.consolidate_metadata(store) + if zarr_format == 2: + metadata = { + "subarray": { + "attributes": {}, + "dtype": "float64", + "fill_value": np.float64(0.0), + "shape": (1,), + "chunks": (1,), + "order": "C", + "zarr_format": zarr_format, + }, + "subgroup": { + "attributes": {}, + "consolidated_metadata": { + "metadata": {}, + "kind": "inline", + "must_understand": False, + }, + "node_type": "group", + "zarr_format": zarr_format, + }, + } + else: + metadata = { + "subarray": { + "attributes": {}, + "chunk_grid": { + "configuration": {"chunk_shape": (1,)}, + "name": "regular", + }, + "chunk_key_encoding": { + "configuration": {"separator": "/"}, + "name": "default", + }, + "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "data_type": "float64", + "fill_value": np.float64(0.0), + "node_type": "array", + "shape": (1,), + "zarr_format": zarr_format, + }, + "subgroup": { + "attributes": {}, + "consolidated_metadata": { + "metadata": {}, + "kind": "inline", + "must_understand": False, + }, + "node_type": "group", + "zarr_format": zarr_format, + }, + } + 
object.__setattr__( expected_group_values[0].metadata, "consolidated_metadata", ConsolidatedMetadata.from_dict( { "kind": "inline", - "metadata": { - "subarray": { - "attributes": {}, - "chunk_grid": { - "configuration": {"chunk_shape": (1,)}, - "name": "regular", - }, - "chunk_key_encoding": { - "configuration": {"separator": "/"}, - "name": "default", - }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), - "data_type": "float64", - "fill_value": np.float64(0.0), - "node_type": "array", - "shape": (1,), - "zarr_format": 3, - }, - "subgroup": { - "attributes": {}, - "consolidated_metadata": { - "metadata": {}, - "kind": "inline", - "must_understand": False, - }, - "node_type": "group", - "zarr_format": 3, - }, - }, + "metadata": metadata, "must_understand": False, } ), @@ -1055,7 +1080,13 @@ async def test_group_getitem_consolidated(self, store: Store) -> None: attributes={}, zarr_format=3, consolidated_metadata=ConsolidatedMetadata( - metadata={"g2": GroupMetadata(attributes={}, zarr_format=3)} + metadata={ + "g2": GroupMetadata( + attributes={}, + zarr_format=3, + consolidated_metadata=ConsolidatedMetadata(metadata={}), + ) + } ), ), } @@ -1066,7 +1097,7 @@ async def test_group_getitem_consolidated(self, store: Store) -> None: assert rg1.metadata.consolidated_metadata == expected.metadata["g1"].consolidated_metadata rg2 = await rg1.getitem("g2") - assert rg2.metadata.consolidated_metadata is None + assert rg2.metadata.consolidated_metadata == ConsolidatedMetadata(metadata={}) async def test_group_delitem_consolidated(self, store: Store) -> None: if isinstance(store, ZipStore): diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py index bd34265dc5..16ddb27594 100644 --- a/tests/v3/test_metadata/test_v3.py +++ b/tests/v3/test_metadata/test_v3.py @@ -530,142 +530,3 @@ async def test_non_root_node(self, memory_store_with_hierarchy: Store) -> None: assert child.metadata.consolidated_metadata is not None assert "air" 
not in child.metadata.consolidated_metadata.metadata assert "grandchild" in child.metadata.consolidated_metadata.metadata - - def test_consolidated_metadata_from_dict(self): - data = {"must_understand": False} - - # missing kind - with pytest.raises(ValueError, match="kind='None'"): - ConsolidatedMetadata.from_dict(data) - - # invalid kind - data["kind"] = "invalid" - with pytest.raises(ValueError, match="kind='invalid'"): - ConsolidatedMetadata.from_dict(data) - - # missing metadata - data["kind"] = "inline" - - with pytest.raises(TypeError, match="Unexpected type for 'metadata'"): - ConsolidatedMetadata.from_dict(data) - - data["kind"] = "inline" - # empty is fine - data["metadata"] = {} - ConsolidatedMetadata.from_dict(data) - - # invalid metadata - data["metadata"]["a"] = {"node_type": "array", "zarr_format": 3} - - with pytest.raises(TypeError): - ConsolidatedMetadata.from_dict(data) - - def test_flatten(self): - array_metadata = { - "attributes": {}, - "chunk_key_encoding": { - "configuration": {"separator": "/"}, - "name": "default", - }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), - "data_type": np.dtype("float64"), - "fill_value": np.float64(0.0), - "node_type": "array", - # "shape": (1, 2, 3), - "zarr_format": 3, - } - - metadata = ConsolidatedMetadata( - kind="inline", - must_understand=False, - metadata={ - "air": ArrayV3Metadata.from_dict( - { - **{ - "shape": (1, 2, 3), - "chunk_grid": { - "configuration": {"chunk_shape": (1, 2, 3)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), - "lat": ArrayV3Metadata.from_dict( - { - **{ - "shape": (1,), - "chunk_grid": { - "configuration": {"chunk_shape": (1,)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), - "child": GroupMetadata( - attributes={"key": "child"}, - consolidated_metadata=ConsolidatedMetadata( - metadata={ - "array": ArrayV3Metadata.from_dict( - { - **array_metadata, - **{ - "attributes": {"key": "child"}, - "shape": (4, 4), - "chunk_grid": { 
- "configuration": {"chunk_shape": (4, 4)}, - "name": "regular", - }, - }, - } - ), - "grandchild": GroupMetadata( - attributes={"key": "grandchild"}, - consolidated_metadata=ConsolidatedMetadata( - metadata={ - "array": ArrayV3Metadata.from_dict( - { - **array_metadata, - **{ - "attributes": {"key": "grandchild"}, - "shape": (4, 4), - "chunk_grid": { - "configuration": {"chunk_shape": (4, 4)}, - "name": "regular", - }, - }, - } - ) - } - ), - ), - }, - ), - ), - }, - ) - result = metadata.flattened_metadata - expected = { - "air": metadata.metadata["air"], - "lat": metadata.metadata["lat"], - "child": GroupMetadata(attributes={"key": "child"}), - "child/array": metadata.metadata["child"].consolidated_metadata.metadata["array"], - "child/grandchild": GroupMetadata(attributes={"key": "grandchild"}), - "child/grandchild/array": metadata.metadata["child"] - .consolidated_metadata.metadata["grandchild"] - .consolidated_metadata.metadata["array"], - } - assert result == expected - - def test_invalid_metadata_raises(self): - payload = { - "kind": "inline", - "must_understand": False, - "metadata": { - "foo": [1, 2, 3] # invalid - }, - } - - with pytest.raises(TypeError, match="key='foo', type='list'"): - ConsolidatedMetadata.from_dict(payload) From 2824de6011e20a51106c2176a7d6e6960232aec6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2024 12:03:10 -0500 Subject: [PATCH 42/62] fixup --- tests/v3/test_metadata/test_v3.py | 279 ------------------------------ 1 file changed, 279 deletions(-) diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py index 16ddb27594..affa2318e7 100644 --- a/tests/v3/test_metadata/test_v3.py +++ b/tests/v3/test_metadata/test_v3.py @@ -7,28 +7,17 @@ import numpy as np import pytest -import zarr.api.synchronous -from zarr.api.asynchronous import ( - AsyncGroup, - consolidate_metadata, - group, - open, - open_consolidated, -) from zarr.codecs.bytes import BytesCodec from zarr.core.buffer import 
default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding -from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV3Metadata from zarr.core.metadata.v3 import parse_dimension_names, parse_fill_value, parse_zarr_format -from zarr.store.common import StorePath if TYPE_CHECKING: from collections.abc import Sequence from typing import Any from zarr.abc.codec import Codec - from zarr.abc.store import Store bool_dtypes = ("bool",) @@ -262,271 +251,3 @@ async def test_datetime_metadata(fill_value: int, precision: str) -> None: result = json.loads(d["zarr.json"].to_bytes()) assert result["fill_value"] == fill_value - - -@pytest.fixture -async def memory_store_with_hierarchy(memory_store: Store) -> None: - g = await group(store=memory_store, attributes={"foo": "bar"}) - await g.create_array(name="air", shape=(1, 2, 3)) - await g.create_array(name="lat", shape=(1,)) - await g.create_array(name="lon", shape=(2,)) - await g.create_array(name="time", shape=(3,)) - - child = await g.create_group("child", attributes={"key": "child"}) - await child.create_array("array", shape=(4, 4), attributes={"key": "child"}) - - grandchild = await child.create_group("grandchild", attributes={"key": "grandchild"}) - await grandchild.create_array("array", shape=(4, 4), attributes={"key": "grandchild"}) - return memory_store - - -class TestConsolidated: - async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: - # TODO: Figure out desired keys in - # TODO: variety in the hierarchies - # More nesting - # arrays under arrays - # single array - # etc. 
- await consolidate_metadata(memory_store_with_hierarchy) - group2 = await AsyncGroup.open(memory_store_with_hierarchy) - - array_metadata = { - "attributes": {}, - "chunk_key_encoding": { - "configuration": {"separator": "/"}, - "name": "default", - }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), - "data_type": np.dtype("float64"), - "fill_value": np.float64(0.0), - "node_type": "array", - # "shape": (1, 2, 3), - "zarr_format": 3, - } - - expected = GroupMetadata( - attributes={"foo": "bar"}, - consolidated_metadata=ConsolidatedMetadata( - kind="inline", - must_understand=False, - metadata={ - "air": ArrayV3Metadata.from_dict( - { - **{ - "shape": (1, 2, 3), - "chunk_grid": { - "configuration": {"chunk_shape": (1, 2, 3)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), - "lat": ArrayV3Metadata.from_dict( - { - **{ - "shape": (1,), - "chunk_grid": { - "configuration": {"chunk_shape": (1,)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), - "lon": ArrayV3Metadata.from_dict( - { - **{"shape": (2,)}, - "chunk_grid": { - "configuration": {"chunk_shape": (2,)}, - "name": "regular", - }, - **array_metadata, - } - ), - "time": ArrayV3Metadata.from_dict( - { - **{ - "shape": (3,), - "chunk_grid": { - "configuration": {"chunk_shape": (3,)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), - "child": GroupMetadata( - attributes={"key": "child"}, - consolidated_metadata=ConsolidatedMetadata( - metadata={ - "array": ArrayV3Metadata.from_dict( - { - **array_metadata, - **{ - "attributes": {"key": "child"}, - "shape": (4, 4), - "chunk_grid": { - "configuration": {"chunk_shape": (4, 4)}, - "name": "regular", - }, - }, - } - ), - "grandchild": GroupMetadata( - attributes={"key": "grandchild"}, - consolidated_metadata=ConsolidatedMetadata( - metadata={ - "array": ArrayV3Metadata.from_dict( - { - **array_metadata, - **{ - "attributes": {"key": "grandchild"}, - "shape": (4, 4), - "chunk_grid": { - "configuration": { - 
"chunk_shape": (4, 4) - }, - "name": "regular", - }, - }, - } - ) - } - ), - ), - }, - ), - ), - }, - ), - ) - - assert group2.metadata == expected - group3 = await open(store=memory_store_with_hierarchy) - assert group3.metadata == expected - - group4 = await open_consolidated(store=memory_store_with_hierarchy) - assert group4.metadata == expected - - result_raw = json.loads( - ( - await memory_store_with_hierarchy.get( - "zarr.json", prototype=default_buffer_prototype() - ) - ).to_bytes() - )["consolidated_metadata"] - assert result_raw["kind"] == "inline" - assert sorted(result_raw["metadata"]) == [ - "air", - "child", - "child/array", - "child/grandchild", - "child/grandchild/array", - "lat", - "lon", - "time", - ] - - def test_consolidated_sync(self, memory_store): - g = zarr.api.synchronous.group(store=memory_store, attributes={"foo": "bar"}) - g.create_array(name="air", shape=(1, 2, 3)) - g.create_array(name="lat", shape=(1,)) - g.create_array(name="lon", shape=(2,)) - g.create_array(name="time", shape=(3,)) - - zarr.api.synchronous.consolidate_metadata(memory_store) - group2 = zarr.api.synchronous.Group.open(memory_store) - - array_metadata = { - "attributes": {}, - "chunk_key_encoding": { - "configuration": {"separator": "/"}, - "name": "default", - }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), - "data_type": np.dtype("float64"), - "fill_value": np.float64(0.0), - "node_type": "array", - # "shape": (1, 2, 3), - "zarr_format": 3, - } - - expected = GroupMetadata( - attributes={"foo": "bar"}, - consolidated_metadata=ConsolidatedMetadata( - kind="inline", - must_understand=False, - metadata={ - "air": ArrayV3Metadata.from_dict( - { - **{ - "shape": (1, 2, 3), - "chunk_grid": { - "configuration": {"chunk_shape": (1, 2, 3)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), - "lat": ArrayV3Metadata.from_dict( - { - **{ - "shape": (1,), - "chunk_grid": { - "configuration": {"chunk_shape": (1,)}, - "name": "regular", - }, - 
}, - **array_metadata, - } - ), - "lon": ArrayV3Metadata.from_dict( - { - **{"shape": (2,)}, - "chunk_grid": { - "configuration": {"chunk_shape": (2,)}, - "name": "regular", - }, - **array_metadata, - } - ), - "time": ArrayV3Metadata.from_dict( - { - **{ - "shape": (3,), - "chunk_grid": { - "configuration": {"chunk_shape": (3,)}, - "name": "regular", - }, - }, - **array_metadata, - } - ), - }, - ), - ) - assert group2.metadata == expected - group3 = zarr.api.synchronous.open(store=memory_store) - assert group3.metadata == expected - - group4 = zarr.api.synchronous.open_consolidated(store=memory_store) - assert group4.metadata == expected - - async def test_not_writable_raises(self, memory_store: zarr.store.MemoryStore) -> None: - await group(store=memory_store, attributes={"foo": "bar"}) - read_store = zarr.store.MemoryStore(store_dict=memory_store._store_dict) - with pytest.raises(ValueError, match="does not support writing"): - await consolidate_metadata(read_store) - - async def test_non_root_node(self, memory_store_with_hierarchy: Store) -> None: - await consolidate_metadata(memory_store_with_hierarchy, path="child") - root = await AsyncGroup.open(memory_store_with_hierarchy) - child = await AsyncGroup.open(StorePath(memory_store_with_hierarchy) / "child") - - assert root.metadata.consolidated_metadata is None - assert child.metadata.consolidated_metadata is not None - assert "air" not in child.metadata.consolidated_metadata.metadata - assert "grandchild" in child.metadata.consolidated_metadata.metadata From 08a7682c2004458d005426db059dc7e818811f64 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2024 12:03:16 -0500 Subject: [PATCH 43/62] doc fix --- docs/{consolidated-metadata.rst => consolidated_metadata.rst} | 0 docs/index.rst | 1 + 2 files changed, 1 insertion(+) rename docs/{consolidated-metadata.rst => consolidated_metadata.rst} (100%) diff --git a/docs/consolidated-metadata.rst b/docs/consolidated_metadata.rst similarity index 100% rename 
from docs/consolidated-metadata.rst rename to docs/consolidated_metadata.rst diff --git a/docs/index.rst b/docs/index.rst index 92f9a3df18..b3e2855e98 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,6 +10,7 @@ Zarr-Python getting_started tutorial + consolidated_metadata api/index spec release From b8b5f518f796fabafa4bfa72f52c4ef42c4bfb46 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 24 Sep 2024 09:57:55 -0500 Subject: [PATCH 44/62] fixups --- src/zarr/core/common.py | 2 +- src/zarr/core/group.py | 3 ++- src/zarr/core/metadata/v3.py | 9 ++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 11d1bc32d4..6092fbdb9c 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -28,7 +28,7 @@ ZARRAY_JSON = ".zarray" ZGROUP_JSON = ".zgroup" ZATTRS_JSON = ".zattrs" -ZMETADATA_v2_JSON = ".zmetadata" +ZMETADATA_V2_JSON = ".zmetadata" ByteRangeRequest = tuple[int | None, int | None] BytesLike = bytes | bytearray | memoryview diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index e9eecee483..e7d0223766 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -24,6 +24,7 @@ ZARRAY_JSON, ZATTRS_JSON, ZGROUP_JSON, + ZMETADATA_V2_JSON, ChunkCoords, ShapeLike, ZarrFormat, @@ -395,7 +396,7 @@ async def open( """ store_path = await make_store_path(store) - consolidated_key = ".zmetadata" + consolidated_key = ZMETADATA_V2_JSON if (zarr_format == 2 or zarr_format is None) and isinstance(use_consolidated, str): consolidated_key = use_consolidated diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 392a622bac..7f8c57c14c 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -37,11 +37,6 @@ from zarr.core.metadata.common import ArrayMetadata, parse_attributes from zarr.registry import get_codec_class -# For type checking -_bool = bool - -__all__ = ["ArrayMetadata"] - def parse_zarr_format(data: object) -> 
Literal[3]: if data == 3: @@ -315,6 +310,10 @@ def parse_fill_value( return dtype.type(fill_value) # type: ignore[arg-type] +# For type checking +_bool = bool + + class DataType(Enum): bool = "bool" int8 = "int8" From ba4fb47a1a8b8893084b7b6ca221469e08a8cde7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 24 Sep 2024 10:14:48 -0500 Subject: [PATCH 45/62] fixup --- src/zarr/api/asynchronous.py | 4 ++-- src/zarr/core/group.py | 11 ++++++----- tests/v3/test_metadata/test_consolidated.py | 13 +++++++++++++ 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index fe87b4cdf1..ff760f2b13 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -299,9 +299,9 @@ async def open( async def open_consolidated(*args: Any, use_consolidated: bool = True, **kwargs: Any) -> AsyncGroup: """ - Alias for :func:`open_group`. + Alias for :func:`open_group` with ``use_consolidated=True``. """ - return await open_group(*args, **kwargs) + return await open_group(*args, use_consolidated=use_consolidated, **kwargs) async def save( diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index e7d0223766..fbdfd6ba47 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -397,6 +397,7 @@ async def open( store_path = await make_store_path(store) consolidated_key = ZMETADATA_V2_JSON + if (zarr_format == 2 or zarr_format is None) and isinstance(use_consolidated, str): consolidated_key = use_consolidated @@ -414,9 +415,6 @@ async def open( if use_consolidated or use_consolidated is None: maybe_consolidated_metadata_bytes = rest[0] - if use_consolidated and maybe_consolidated_metadata_bytes is None: - # the user requested consolidated metadata, but it was missing - raise FileNotFoundError(paths[-1]) else: maybe_consolidated_metadata_bytes = None @@ -453,6 +451,11 @@ async def open( if zarr_format == 2: # this is checked above, asserting here for mypy assert zgroup_bytes is not 
None + + if use_consolidated and maybe_consolidated_metadata_bytes is None: + # the user requested consolidated metadata, but it was missing + raise ValueError(consolidated_key) + return cls._from_bytes_v2( store_path, zgroup_bytes, zattrs_bytes, maybe_consolidated_metadata_bytes ) @@ -517,7 +520,6 @@ def _from_bytes_v3( group_metadata = json.loads(zarr_json_bytes.to_bytes()) if use_consolidated and group_metadata.get("consolidated_metadata") is None: msg = f"Consolidated metadata requested with 'use_consolidated=True' but not found in '{store_path.path}'." - # Use `FileNotFoundError` here to match the error from v2? raise ValueError(msg) elif use_consolidated is False: @@ -1086,7 +1088,6 @@ def _members_consolidated( self.store_path, key, prefix=self.name ) # Metadata -> Group/Array key = "/".join([prefix, key]).lstrip("/") - # breakpoint() yield key, obj if ((max_depth is None) or (current_depth < max_depth)) and isinstance( diff --git a/tests/v3/test_metadata/test_consolidated.py b/tests/v3/test_metadata/test_consolidated.py index 56dab60a7b..3fe36e7946 100644 --- a/tests/v3/test_metadata/test_consolidated.py +++ b/tests/v3/test_metadata/test_consolidated.py @@ -6,7 +6,9 @@ import numpy as np import pytest +import zarr.api.asynchronous import zarr.api.synchronous +import zarr.store from zarr.api.asynchronous import ( AsyncGroup, consolidate_metadata, @@ -21,6 +23,7 @@ if TYPE_CHECKING: from zarr.abc.store import Store + from zarr.core.common import ZarrFormat @pytest.fixture @@ -472,3 +475,13 @@ def test_to_dict_empty(self): }, } assert result == expected + + @pytest.mark.parametrize("zarr_format", [2, 3]) + async def test_open_consolidated_raises_async(self, zarr_format: ZarrFormat): + store = zarr.store.MemoryStore(mode="w") + await AsyncGroup.from_store(store, zarr_format=zarr_format) + with pytest.raises(ValueError): + await zarr.api.asynchronous.open_consolidated(store, zarr_format=zarr_format) + + with pytest.raises(ValueError): + await 
zarr.api.asynchronous.open_consolidated(store, zarr_format=None) From e6142d815497a0fe8b74bad108d67b9d0fa7ee5f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 24 Sep 2024 10:26:47 -0500 Subject: [PATCH 46/62] fixup --- tests/v3/test_metadata/test_consolidated.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/v3/test_metadata/test_consolidated.py b/tests/v3/test_metadata/test_consolidated.py index 3fe36e7946..a6f3f21fc4 100644 --- a/tests/v3/test_metadata/test_consolidated.py +++ b/tests/v3/test_metadata/test_consolidated.py @@ -61,7 +61,7 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: "name": "default", }, "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), - "data_type": np.dtype("float64"), + "data_type": "float64", "fill_value": np.float64(0.0), "node_type": "array", # "shape": (1, 2, 3), @@ -217,7 +217,7 @@ def test_consolidated_sync(self, memory_store): "name": "default", }, "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), - "data_type": np.dtype("float64"), + "data_type": "float64", "fill_value": np.float64(0.0), "node_type": "array", # "shape": (1, 2, 3), @@ -325,12 +325,6 @@ def test_consolidated_metadata_from_dict(self): data["metadata"] = {} ConsolidatedMetadata.from_dict(data) - # invalid metadata - data["metadata"]["a"] = {"node_type": "array", "zarr_format": 3} - - with pytest.raises(TypeError): - ConsolidatedMetadata.from_dict(data) - def test_flatten(self): array_metadata = { "attributes": {}, @@ -339,7 +333,7 @@ def test_flatten(self): "name": "default", }, "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), - "data_type": np.dtype("float64"), + "data_type": "float64", "fill_value": np.float64(0.0), "node_type": "array", # "shape": (1, 2, 3), From 8ad3738a9def78131bbc992b7070a4033513c84a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 24 Sep 2024 11:27:33 -0500 Subject: [PATCH 47/62] v2 writer --- 
src/zarr/core/group.py | 32 +++++++++++++++- tests/v3/test_metadata/test_consolidated.py | 42 +++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 9c1f410b7c..abda1cce37 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -331,7 +331,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: ) } else: - return { + items = { ZGROUP_JSON: prototype.buffer.from_bytes( json.dumps({"zarr_format": self.zarr_format}, indent=json_indent).encode() ), @@ -339,6 +339,36 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: json.dumps(self.attributes, indent=json_indent).encode() ), } + if self.consolidated_metadata: + d = { + ZGROUP_JSON: {"zarr_format": self.zarr_format}, + ZATTRS_JSON: self.attributes, + } + consolidated_metadata = self.consolidated_metadata.to_dict()["metadata"] + assert isinstance(consolidated_metadata, dict) + for k, v in consolidated_metadata.items(): + attrs = v.pop("attributes", None) + d[f"{k}/{ZATTRS_JSON}"] = attrs + if "shape" in v: + # it's an array + d[f"{k}/{ZARRAY_JSON}"] = v + else: + d[f"{k}/{ZGROUP_JSON}"] = { + "zarr_format": self.zarr_format, + "consolidated_metadata": { + "metadata": {}, + "must_understand": False, + "kind": "inline", + }, + } + + items[ZMETADATA_V2_JSON] = prototype.buffer.from_bytes( + json.dumps( + {"metadata": d, "zarr_consolidated_format": 1}, indent=json_indent + ).encode() + ) + + return items def __init__( self, diff --git a/tests/v3/test_metadata/test_consolidated.py b/tests/v3/test_metadata/test_consolidated.py index a6f3f21fc4..8828937a40 100644 --- a/tests/v3/test_metadata/test_consolidated.py +++ b/tests/v3/test_metadata/test_consolidated.py @@ -19,6 +19,7 @@ from zarr.core.buffer.core import default_buffer_prototype from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV3Metadata +from zarr.core.metadata.v2 import 
ArrayV2Metadata from zarr.store.common import StorePath if TYPE_CHECKING: @@ -479,3 +480,44 @@ async def test_open_consolidated_raises_async(self, zarr_format: ZarrFormat): with pytest.raises(ValueError): await zarr.api.asynchronous.open_consolidated(store, zarr_format=None) + + async def test_consolidated_metadata_v2(self): + store = zarr.store.MemoryStore(mode="w") + g = await AsyncGroup.from_store(store, attributes={"key": "root"}, zarr_format=2) + await g.create_array(name="a", shape=(1,), attributes={"key": "a"}) + g1 = await g.create_group(name="g1", attributes={"key": "g1"}) + await g1.create_group(name="g2", attributes={"key": "g2"}) + + await zarr.api.asynchronous.consolidate_metadata(store) + result = await zarr.api.asynchronous.open_consolidated(store, zarr_format=2) + + expected = GroupMetadata( + attributes={"key": "root"}, + zarr_format=2, + consolidated_metadata=ConsolidatedMetadata( + metadata={ + "a": ArrayV2Metadata( + shape=(1,), + dtype="float64", + attributes={"key": "a"}, + chunks=(1,), + fill_value=0.0, + order="C", + ), + "g1": GroupMetadata( + attributes={"key": "g1"}, + zarr_format=2, + consolidated_metadata=ConsolidatedMetadata( + metadata={ + "g2": GroupMetadata( + attributes={"key": "g2"}, + zarr_format=2, + consolidated_metadata=ConsolidatedMetadata(metadata={}), + ) + } + ), + ), + } + ), + ) + assert result.metadata == expected From fc9493355b893d8fff09a901144bbbfce1433497 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 24 Sep 2024 11:30:25 -0500 Subject: [PATCH 48/62] fixup --- src/zarr/core/group.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index abda1cce37..874ca095f3 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -364,7 +364,9 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: items[ZMETADATA_V2_JSON] = prototype.buffer.from_bytes( json.dumps( - {"metadata": d, "zarr_consolidated_format": 1}, 
indent=json_indent + {"metadata": d, "zarr_consolidated_format": 1}, + indent=json_indent, + default=_json_convert, ).encode() ) From 4bfad1ba4d37955ef09c11c07f4cba8b59b5508c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 28 Sep 2024 13:23:41 -0500 Subject: [PATCH 49/62] fixup --- src/zarr/core/array.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index e1de15c747..33da11cfaf 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -156,6 +156,12 @@ async def get_array_metadata( # V3 arrays are comprised of a zarr.json object assert zarr_json_bytes is not None metadata_dict = json.loads(zarr_json_bytes.to_bytes()) + + if metadata_dict.get("node_type") != "array": + # This KeyError is load bearing for `open`. That currently tries + # to open the node as an `array` and then falls back to opening + # as a group. + raise KeyError return metadata_dict From 8728440dfb9ece9b543fd2be2b1e63fe199ed5f0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 1 Oct 2024 11:34:11 -0500 Subject: [PATCH 50/62] path fix --- tests/v3/test_metadata/test_consolidated.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/v3/test_metadata/test_consolidated.py b/tests/v3/test_metadata/test_consolidated.py index 8828937a40..7d90e7ba11 100644 --- a/tests/v3/test_metadata/test_consolidated.py +++ b/tests/v3/test_metadata/test_consolidated.py @@ -8,7 +8,7 @@ import zarr.api.asynchronous import zarr.api.synchronous -import zarr.store +import zarr.storage from zarr.api.asynchronous import ( AsyncGroup, consolidate_metadata, @@ -20,7 +20,7 @@ from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV3Metadata from zarr.core.metadata.v2 import ArrayV2Metadata -from zarr.store.common import StorePath +from zarr.storage.common import StorePath if TYPE_CHECKING: from zarr.abc.store import Store @@ -287,9 +287,9 @@ def 
test_consolidated_sync(self, memory_store): group4 = zarr.api.synchronous.open_consolidated(store=memory_store) assert group4.metadata == expected - async def test_not_writable_raises(self, memory_store: zarr.store.MemoryStore) -> None: + async def test_not_writable_raises(self, memory_store: zarr.storage.MemoryStore) -> None: await group(store=memory_store, attributes={"foo": "bar"}) - read_store = zarr.store.MemoryStore(store_dict=memory_store._store_dict) + read_store = zarr.storage.MemoryStore(store_dict=memory_store._store_dict) with pytest.raises(ValueError, match="does not support writing"): await consolidate_metadata(read_store) @@ -473,7 +473,7 @@ def test_to_dict_empty(self): @pytest.mark.parametrize("zarr_format", [2, 3]) async def test_open_consolidated_raises_async(self, zarr_format: ZarrFormat): - store = zarr.store.MemoryStore(mode="w") + store = zarr.storage.MemoryStore(mode="w") await AsyncGroup.from_store(store, zarr_format=zarr_format) with pytest.raises(ValueError): await zarr.api.asynchronous.open_consolidated(store, zarr_format=zarr_format) @@ -482,7 +482,7 @@ async def test_open_consolidated_raises_async(self, zarr_format: ZarrFormat): await zarr.api.asynchronous.open_consolidated(store, zarr_format=None) async def test_consolidated_metadata_v2(self): - store = zarr.store.MemoryStore(mode="w") + store = zarr.storage.MemoryStore(mode="w") g = await AsyncGroup.from_store(store, attributes={"key": "root"}, zarr_format=2) await g.create_array(name="a", shape=(1,), attributes={"key": "a"}) g1 = await g.create_group(name="g1", attributes={"key": "g1"}) From 20c97a4048908d6d31be24d4de8aa1c9dccabeaf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 1 Oct 2024 12:15:38 -0500 Subject: [PATCH 51/62] Fixed v2 use_consolidated=False --- src/zarr/core/group.py | 5 ++++ tests/v3/test_metadata/test_consolidated.py | 31 +++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 
54b1bacacb..f7971ac786 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -520,6 +520,11 @@ async def open( # the user requested consolidated metadata, but it was missing raise ValueError(consolidated_key) + elif use_consolidated is False: + # the user explicitly opted out of consolidated_metadata. + # Discard anything we might have read. + maybe_consolidated_metadata_bytes = None + return cls._from_bytes_v2( store_path, zgroup_bytes, zattrs_bytes, maybe_consolidated_metadata_bytes ) diff --git a/tests/v3/test_metadata/test_consolidated.py b/tests/v3/test_metadata/test_consolidated.py index 7d90e7ba11..eb21eef3ac 100644 --- a/tests/v3/test_metadata/test_consolidated.py +++ b/tests/v3/test_metadata/test_consolidated.py @@ -521,3 +521,34 @@ async def test_consolidated_metadata_v2(self): ), ) assert result.metadata == expected + + @pytest.mark.parametrize("zarr_format", [2, 3]) + async def test_use_consolidated_false( + self, memory_store: zarr.storage.MemoryStore, zarr_format: ZarrFormat + ) -> None: + with zarr.config.set(default_zarr_version=zarr_format): + g = await group(store=memory_store, attributes={"foo": "bar"}) + await g.create_group(name="a") + + # test a stale read + await zarr.api.asynchronous.consolidate_metadata(memory_store) + await g.create_group(name="b") + + stale = await zarr.api.asynchronous.open_group(store=memory_store) + assert len([x async for x in stale.members()]) == 1 + assert stale.metadata.consolidated_metadata + assert list(stale.metadata.consolidated_metadata.metadata) == ["a"] + + # bypass stale data + good = await zarr.api.asynchronous.open_group( + store=memory_store, use_consolidated=False + ) + assert len([x async for x in good.members()]) == 2 + + # reconsolidate + await zarr.api.asynchronous.consolidate_metadata(memory_store) + + good = await zarr.api.asynchronous.open_group(store=memory_store) + assert len([x async for x in good.members()]) == 2 + assert good.metadata.consolidated_metadata + assert 
sorted(good.metadata.consolidated_metadata.metadata) == ["a", "b"] From f7e5b3fb5ef32b514c87de0a7d4766604498c72e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 1 Oct 2024 13:15:26 -0500 Subject: [PATCH 52/62] fixupg --- tests/v3/test_metadata/test_v2.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/v3/test_metadata/test_v2.py b/tests/v3/test_metadata/test_v2.py index 29d4f7e867..79bb93d714 100644 --- a/tests/v3/test_metadata/test_v2.py +++ b/tests/v3/test_metadata/test_v2.py @@ -7,7 +7,7 @@ import pytest import zarr.api.asynchronous -import zarr.store +import zarr.storage from zarr.core.buffer import cpu from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV2Metadata @@ -83,8 +83,8 @@ def test_metadata_to_dict( class TestConsolidated: @pytest.fixture async def v2_consolidated_metadata( - self, memory_store: zarr.store.MemoryStore - ) -> zarr.store.MemoryStore: + self, memory_store: zarr.storage.MemoryStore + ) -> zarr.storage.MemoryStore: zmetadata = { "metadata": { ".zattrs": { @@ -141,7 +141,7 @@ async def v2_consolidated_metadata( "zarr_consolidated_format": 1, } store_dict = {} - store = zarr.store.MemoryStore(store_dict=store_dict, mode="a") + store = zarr.storage.MemoryStore(store_dict=store_dict, mode="a") await store.set( ".zattrs", cpu.Buffer.from_bytes(json.dumps({"Conventions": "COARDS"}).encode()) ) @@ -187,7 +187,7 @@ async def v2_consolidated_metadata( return store async def test_read_consolidated_metadata( - self, v2_consolidated_metadata: zarr.store.MemoryStore + self, v2_consolidated_metadata: zarr.storage.MemoryStore ): # .zgroup, .zattrs, .metadata store = v2_consolidated_metadata From 483681b76a14a7d1927815ae1def7c4efa9f3282 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 9 Oct 2024 09:30:38 -0500 Subject: [PATCH 53/62] Special case object dtype Closes https://github.com/zarr-developers/zarr-python/issues/2315 --- src/zarr/core/array.py 
| 4 +++- src/zarr/core/common.py | 9 +++++++++ src/zarr/core/metadata/v2.py | 7 +------ src/zarr/core/metadata/v3.py | 4 ++++ tests/v3/test_array.py | 9 ++++++++- 5 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 9f5591ce1e..12a94c64ae 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -35,6 +35,7 @@ ShapeLike, ZarrFormat, concurrent_map, + parse_dtype, parse_shapelike, product, ) @@ -226,7 +227,8 @@ async def create( if chunks is not None and chunk_shape is not None: raise ValueError("Only one of chunk_shape or chunks can be provided.") - dtype = np.dtype(dtype) + dtype = parse_dtype(dtype) + # dtype = np.dtype(dtype) if chunks: _chunks = normalize_chunks(chunks, shape, dtype.itemsize) else: diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 80c743cc90..93485d1b73 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -16,6 +16,8 @@ overload, ) +import numpy as np + if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterator @@ -162,3 +164,10 @@ def parse_order(data: Any) -> Literal["C", "F"]: if data in ("C", "F"): return cast(Literal["C", "F"], data) raise ValueError(f"Expected one of ('C', 'F'), got {data} instead.") + + +def parse_dtype(dtype: Any) -> np.dtype[Any]: + if dtype is str or dtype == "str": + # special case as object + return np.dtype("object") + return np.dtype(dtype) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index ec44673f9d..5ac4f9b361 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -22,7 +22,7 @@ from zarr.core.array_spec import ArraySpec from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import parse_separator -from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, parse_shapelike +from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, parse_dtype, parse_shapelike from zarr.core.config import config, 
parse_indexing_order from zarr.core.metadata.common import ArrayMetadata, parse_attributes @@ -201,11 +201,6 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self: return replace(self, attributes=attributes) -def parse_dtype(data: npt.DTypeLike) -> np.dtype[Any]: - # todo: real validation - return np.dtype(data) - - def parse_zarr_format(data: object) -> Literal[2]: if data == 2: return 2 diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 47c6106bfe..d77646249d 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -504,6 +504,7 @@ class DataType(Enum): complex128 = "complex128" string = "string" bytes = "bytes" + object = "object" @property def byte_count(self) -> None | int: @@ -549,6 +550,7 @@ def to_numpy_shortname(self) -> str: DataType.float64: "f8", DataType.complex64: "c8", DataType.complex128: "c16", + DataType.object: "object", } return data_type_to_numpy[self] @@ -572,6 +574,8 @@ def from_numpy(cls, dtype: np.dtype[Any]) -> DataType: return DataType.string elif dtype.kind == "S": return DataType.bytes + elif dtype.kind == "O": + return DataType.object dtype_to_data_type = { "|b1": "bool", "bool": "bool", diff --git a/tests/v3/test_array.py b/tests/v3/test_array.py index 04adb2a224..fa15cd08d7 100644 --- a/tests/v3/test_array.py +++ b/tests/v3/test_array.py @@ -1,6 +1,6 @@ import pickle from itertools import accumulate -from typing import Literal +from typing import Any, Literal import numpy as np import pytest @@ -406,3 +406,10 @@ def test_vlen_errors() -> None: dtype=" None: + arr = zarr.create(shape=10, dtype=dtype, zarr_format=zarr_format) + assert arr.dtype.kind == "O" From 7e76e9e392c07fbf854a2a1892211bba9e86566b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 9 Oct 2024 10:08:13 -0500 Subject: [PATCH 54/62] fixup --- src/zarr/core/array.py | 2 +- src/zarr/core/common.py | 11 ++++++++--- src/zarr/core/metadata/v2.py | 4 ++-- src/zarr/core/metadata/v3.py | 4 ---- 
tests/v3/test_array.py | 9 +-------- tests/v3/test_codecs/test_vlen.py | 2 +- tests/v3/test_v2.py | 8 ++++++++ 7 files changed, 21 insertions(+), 19 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 12a94c64ae..a80e15e896 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -227,7 +227,7 @@ async def create( if chunks is not None and chunk_shape is not None: raise ValueError("Only one of chunk_shape or chunks can be provided.") - dtype = parse_dtype(dtype) + dtype = parse_dtype(dtype, zarr_format) # dtype = np.dtype(dtype) if chunks: _chunks = normalize_chunks(chunks, shape, dtype.itemsize) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 93485d1b73..d74b07ebce 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -18,6 +18,8 @@ import numpy as np +from zarr.core.strings import _STRING_DTYPE + if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterator @@ -166,8 +168,11 @@ def parse_order(data: Any) -> Literal["C", "F"]: raise ValueError(f"Expected one of ('C', 'F'), got {data} instead.") -def parse_dtype(dtype: Any) -> np.dtype[Any]: +def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: if dtype is str or dtype == "str": - # special case as object - return np.dtype("object") + if zarr_format == 2: + # special case as object + return np.dtype("object") + else: + return _STRING_DTYPE return np.dtype(dtype) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 5ac4f9b361..ddbf0aa94d 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -57,7 +57,7 @@ def __init__( Metadata for a Zarr version 2 array. 
""" shape_parsed = parse_shapelike(shape) - data_type_parsed = parse_dtype(dtype) + data_type_parsed = parse_dtype(dtype, zarr_format=2) chunks_parsed = parse_shapelike(chunks) compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) @@ -141,7 +141,7 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: _data = data.copy() # check that the zarr_format attribute is correct _ = parse_zarr_format(_data.pop("zarr_format")) - dtype = parse_dtype(_data["dtype"]) + dtype = parse_dtype(_data["dtype"], zarr_format=2) if dtype.kind in "SV": fill_value_encoded = _data.get("fill_value") diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index d77646249d..47c6106bfe 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -504,7 +504,6 @@ class DataType(Enum): complex128 = "complex128" string = "string" bytes = "bytes" - object = "object" @property def byte_count(self) -> None | int: @@ -550,7 +549,6 @@ def to_numpy_shortname(self) -> str: DataType.float64: "f8", DataType.complex64: "c8", DataType.complex128: "c16", - DataType.object: "object", } return data_type_to_numpy[self] @@ -574,8 +572,6 @@ def from_numpy(cls, dtype: np.dtype[Any]) -> DataType: return DataType.string elif dtype.kind == "S": return DataType.bytes - elif dtype.kind == "O": - return DataType.object dtype_to_data_type = { "|b1": "bool", "bool": "bool", diff --git a/tests/v3/test_array.py b/tests/v3/test_array.py index fa15cd08d7..04adb2a224 100644 --- a/tests/v3/test_array.py +++ b/tests/v3/test_array.py @@ -1,6 +1,6 @@ import pickle from itertools import accumulate -from typing import Any, Literal +from typing import Literal import numpy as np import pytest @@ -406,10 +406,3 @@ def test_vlen_errors() -> None: dtype=" None: - arr = zarr.create(shape=10, dtype=dtype, zarr_format=zarr_format) - assert arr.dtype.kind == "O" diff --git a/tests/v3/test_codecs/test_vlen.py b/tests/v3/test_codecs/test_vlen.py index 
c6f587931a..aaea5dab83 100644 --- a/tests/v3/test_codecs/test_vlen.py +++ b/tests/v3/test_codecs/test_vlen.py @@ -11,7 +11,7 @@ from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.storage.common import StorePath -numpy_str_dtypes: list[type | None] = [None, str, np.dtypes.StrDType] +numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType] expected_zarr_string_dtype: np.dtype[Any] if _NUMPY_SUPPORTS_VLEN_STRING: numpy_str_dtypes.append(np.dtypes.StringDType) diff --git a/tests/v3/test_v2.py b/tests/v3/test_v2.py index f488782d78..60e59e49bf 100644 --- a/tests/v3/test_v2.py +++ b/tests/v3/test_v2.py @@ -1,5 +1,6 @@ import json from collections.abc import Iterator +from typing import Any import numpy as np import pytest @@ -84,3 +85,10 @@ async def test_v2_encode_decode(dtype): data = zarr.open_array(store=store, path="foo")[:] expected = np.full((3,), b"X", dtype=dtype) np.testing.assert_equal(data, expected) + + +@pytest.mark.parametrize("dtype", [str, "str"]) +async def test_create_dtype_str(dtype: Any) -> None: + arr = zarr.create(shape=10, dtype=dtype, zarr_format=2) + assert arr.dtype.kind == "O" + assert arr.metadata.to_dict()["dtype"] == "|O" From cbffcbb80e1b2da87f7899ebdb3e88018d6b6bb6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 10 Oct 2024 06:50:24 -0500 Subject: [PATCH 55/62] docs --- docs/consolidated_metadata.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/consolidated_metadata.rst b/docs/consolidated_metadata.rst index 1a8ec2b946..480a855831 100644 --- a/docs/consolidated_metadata.rst +++ b/docs/consolidated_metadata.rst @@ -1,7 +1,7 @@ Consolidated Metadata ===================== -zarr-python implements the `Consolidated Metadata_` extension to the Zarr Spec. +Zarr-Python implements the `Consolidated Metadata_` extension to the Zarr Spec. 
Consolidated metadata can reduce the time needed to load the metadata for an entire hierarchy, especially when the metadata is being served over a network. Consolidated metadata essentially stores all the metadata for a hierarchy in the @@ -17,7 +17,7 @@ network request for a :class:`zarr.store.RemoteStore`). After that, any subseque metadata reads get child Group or Array nodes will *not* require reads from the store. In Python, the consolidated metadata is available on the ``.consolidated_metadata`` -attribute of the Group. +attribute of the ``GroupMetadata`` object. .. code-block:: python From 56d2704455608bb557dc06cf289d4d944a23e803 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 10 Oct 2024 07:22:52 -0500 Subject: [PATCH 56/62] pr review --- src/zarr/api/asynchronous.py | 7 +++++-- src/zarr/api/synchronous.py | 4 ++-- src/zarr/core/array.py | 11 +++++------ src/zarr/core/common.py | 14 -------------- src/zarr/core/metadata/v2.py | 10 +++++++--- tests/v3/test_codecs/test_vlen.py | 2 +- tests/v3/test_metadata/test_consolidated.py | 2 +- tests/v3/test_v2.py | 7 ------- 8 files changed, 21 insertions(+), 36 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 46a1068c13..a488c2a674 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -316,13 +316,16 @@ async def open( return await open_group(store=store_path, zarr_format=zarr_format, **kwargs) -async def open_consolidated(*args: Any, use_consolidated: bool = True, **kwargs: Any) -> AsyncGroup: +async def open_consolidated( + *args: Any, use_consolidated: Literal[True] = True, **kwargs: Any +) -> AsyncGroup: """ Alias for :func:`open_group` with ``use_consolidated=True``. """ if use_consolidated is not True: raise TypeError( - "'use_consolidated' must be 'True' in 'open_consolidated'. Use 'open' with 'use_consolidated=False' to bypass consolidated metadata." + "'use_consolidated' must be 'True' in 'open_consolidated'. 
Use 'open' with " + "'use_consolidated=False' to bypass consolidated metadata." ) return await open_group(*args, use_consolidated=use_consolidated, **kwargs) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 2b04493ae6..9dcd6fe2d5 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal import zarr.api.asynchronous as async_api from zarr._compat import _deprecate_positional_args @@ -90,7 +90,7 @@ def open( return Group(obj) -def open_consolidated(*args: Any, use_consolidated: bool = True, **kwargs: Any) -> Group: +def open_consolidated(*args: Any, use_consolidated: Literal[True] = True, **kwargs: Any) -> Group: return Group( sync(async_api.open_consolidated(*args, use_consolidated=use_consolidated, **kwargs)) ) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index f3a20678cc..6fd54253b2 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -35,7 +35,6 @@ ShapeLike, ZarrFormat, concurrent_map, - parse_dtype, parse_shapelike, product, ) @@ -68,7 +67,7 @@ from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import collect_aiterator, sync -from zarr.errors import MetadataValidationError +from zarr.errors import MetadataValidationError, NodeTypeValidationError from zarr.registry import get_pipeline_class from zarr.storage import StoreLike, make_store_path from zarr.storage.common import StorePath, ensure_no_existing_node @@ -160,11 +159,12 @@ async def get_array_metadata( assert zarr_json_bytes is not None metadata_dict = json.loads(zarr_json_bytes.to_bytes()) - if metadata_dict.get("node_type") != "array": + node_type = metadata_dict.get("node_type") + if node_type != "array": # This KeyError is load bearing for `open`. 
That currently tries # to open the node as an `array` and then falls back to opening # as a group. - raise KeyError + raise NodeTypeValidationError("node_type", "array", node_type) return metadata_dict @@ -234,8 +234,7 @@ async def create( if chunks is not None and chunk_shape is not None: raise ValueError("Only one of chunk_shape or chunks can be provided.") - dtype = parse_dtype(dtype, zarr_format) - # dtype = np.dtype(dtype) + dtype = np.dtype(dtype) if chunks: _chunks = normalize_chunks(chunks, shape, dtype.itemsize) else: diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 85d8bebbb3..110dea1c26 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -16,10 +16,6 @@ overload, ) -import numpy as np - -from zarr.core.strings import _STRING_DTYPE - if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterator @@ -167,13 +163,3 @@ def parse_order(data: Any) -> Literal["C", "F"]: if data in ("C", "F"): return cast(Literal["C", "F"], data) raise ValueError(f"Expected one of ('C', 'F'), got {data} instead.") - - -def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: - if dtype is str or dtype == "str": - if zarr_format == 2: - # special case as object - return np.dtype("object") - else: - return _STRING_DTYPE - return np.dtype(dtype) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 2bae6f2db1..5fada91d63 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -22,7 +22,7 @@ from zarr.core.array_spec import ArraySpec from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import parse_separator -from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, parse_dtype, parse_shapelike +from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, parse_shapelike from zarr.core.config import config, parse_indexing_order from zarr.core.metadata.common import ArrayMetadata, parse_attributes @@ -57,7 +57,7 @@ def __init__( Metadata for a Zarr 
version 2 array. """ shape_parsed = parse_shapelike(shape) - data_type_parsed = parse_dtype(dtype, zarr_format=2) + data_type_parsed = parse_dtype(dtype) chunks_parsed = parse_shapelike(chunks) compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) @@ -141,7 +141,7 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: _data = data.copy() # check that the zarr_format attribute is correct _ = parse_zarr_format(_data.pop("zarr_format")) - dtype = parse_dtype(_data["dtype"], zarr_format=2) + dtype = parse_dtype(_data["dtype"]) if dtype.kind in "SV": fill_value_encoded = _data.get("fill_value") @@ -201,6 +201,10 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self: return replace(self, attributes=attributes) +def parse_dtype(data: npt.DTypeLike) -> np.dtype[Any]: + return np.dtype(data) + + def parse_zarr_format(data: object) -> Literal[2]: if data == 2: return 2 diff --git a/tests/v3/test_codecs/test_vlen.py b/tests/v3/test_codecs/test_vlen.py index aaea5dab83..ca5ccb92fa 100644 --- a/tests/v3/test_codecs/test_vlen.py +++ b/tests/v3/test_codecs/test_vlen.py @@ -11,7 +11,7 @@ from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.storage.common import StorePath -numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType] +numpy_str_dtypes: list[type | str | None] = [None, str, np.dtypes.StrDType] expected_zarr_string_dtype: np.dtype[Any] if _NUMPY_SUPPORTS_VLEN_STRING: numpy_str_dtypes.append(np.dtypes.StringDType) diff --git a/tests/v3/test_metadata/test_consolidated.py b/tests/v3/test_metadata/test_consolidated.py index 12fcf56ccf..c0218602f6 100644 --- a/tests/v3/test_metadata/test_consolidated.py +++ b/tests/v3/test_metadata/test_consolidated.py @@ -511,7 +511,7 @@ async def test_consolidated_metadata_v2(self): dtype="float64", attributes={"key": "a"}, chunks=(1,), - fill_value=0.0, + fill_value=None, order="C", ), "g1": GroupMetadata( diff --git a/tests/v3/test_v2.py 
b/tests/v3/test_v2.py index 55142c6894..d981fbc893 100644 --- a/tests/v3/test_v2.py +++ b/tests/v3/test_v2.py @@ -106,10 +106,3 @@ async def test_v2_encode_decode(dtype): data = zarr.open_array(store=store, path="foo")[:] expected = np.full((3,), b"X", dtype=dtype) np.testing.assert_equal(data, expected) - - -@pytest.mark.parametrize("dtype", [str, "str"]) -async def test_create_dtype_str(dtype: Any) -> None: - arr = zarr.create(shape=10, dtype=dtype, zarr_format=2) - assert arr.dtype.kind == "O" - assert arr.metadata.to_dict()["dtype"] == "|O" From 8ade87d9480d8a554eefad82953d94b290b84a46 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 10 Oct 2024 07:27:27 -0500 Subject: [PATCH 57/62] must_understand --- src/zarr/core/group.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 1e1e10574b..255843d9f1 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -35,6 +35,7 @@ from zarr.core.config import config from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata from zarr.core.metadata.common import ArrayMetadata +from zarr.core.metadata.v3 import V3JsonEncoder from zarr.core.sync import SyncMixin, sync from zarr.errors import MetadataValidationError from zarr.storage import StoreLike, make_store_path @@ -148,10 +149,6 @@ def from_dict(cls, data: dict[str, JSON]) -> ConsolidatedMetadata: if kind != "inline": raise ValueError(f"Consolidated metadata kind='{kind}' is not supported.") - # Do we care about the value of 'must_understand'? 
- # if data["must_understand"] is not False: - # raise ValueError - raw_metadata = data.get("metadata") if not isinstance(raw_metadata, dict): raise TypeError(f"Unexpected type for 'metadata': {type(raw_metadata)}") @@ -329,7 +326,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: if self.zarr_format == 3: return { ZARR_JSON: prototype.buffer.from_bytes( - json.dumps(self.to_dict(), default=_json_convert, indent=json_indent).encode() + json.dumps(self.to_dict(), cls=V3JsonEncoder).encode() ) } else: @@ -367,8 +364,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: items[ZMETADATA_V2_JSON] = prototype.buffer.from_bytes( json.dumps( {"metadata": d, "zarr_consolidated_format": 1}, - indent=json_indent, - default=_json_convert, + cls=V3JsonEncoder, ).encode() ) From b5fb72153dfde336e269362fc1dc0a70d573768e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 10 Oct 2024 07:40:28 -0500 Subject: [PATCH 58/62] Updated from_dict checking --- src/zarr/core/common.py | 1 + src/zarr/core/group.py | 36 ++++++++++++++++++++----------- tests/v3/test_metadata/test_v3.py | 20 +++++++++++++++-- 3 files changed, 42 insertions(+), 15 deletions(-) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 110dea1c26..2762b4d2e6 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -32,6 +32,7 @@ ChunkCoords = tuple[int, ...] ChunkCoordsLike = Iterable[int] ZarrFormat = Literal[2, 3] +NodeType = Literal["array", "group"] JSON = None | str | int | float | Mapping[str, "JSON"] | tuple["JSON", ...] 
MemoryOrder = Literal["C", "F"] AccessModeLiteral = Literal["r", "r+", "a", "w", "w-"] diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 255843d9f1..1df02d01ed 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -7,7 +7,7 @@ from collections import defaultdict from dataclasses import asdict, dataclass, field, fields, replace from enum import Enum -from typing import TYPE_CHECKING, Literal, TypeVar, cast, overload +from typing import TYPE_CHECKING, Literal, TypeVar, assert_never, cast, overload import numcodecs.abc import numpy as np @@ -28,6 +28,7 @@ ZGROUP_JSON, ZMETADATA_V2_JSON, ChunkCoords, + NodeType, ShapeLike, ZarrFormat, parse_shapelike, @@ -57,10 +58,16 @@ def parse_zarr_format(data: Any) -> ZarrFormat: if data in (2, 3): return cast(Literal[2, 3], data) - msg = msg = f"Invalid zarr_format. Expected one of 2 or 3. Got {data}." + msg = f"Invalid zarr_format. Expected one of 2 or 3. Got {data}." raise ValueError(msg) +def parse_node_type(data: Any) -> NodeType: + if data in ("array", "group"): + return cast(Literal["array", "group"], data) + raise MetadataValidationError("node_type", "array or group", data) + + # todo: convert None to empty dict def parse_attributes(data: Any) -> dict[str, Any]: if data is None: @@ -161,21 +168,24 @@ def from_dict(cls, data: dict[str, JSON]) -> ConsolidatedMetadata: f"Invalid value for metadata items. key='{k}', type='{type(v).__name__}'" ) - node_type = v.get("node_type", None) - if node_type == "group": - metadata[k] = GroupMetadata.from_dict(v) - elif node_type == "array" and v.get("zarr_format") == 3: - metadata[k] = ArrayV3Metadata.from_dict(v) - elif node_type == "array": - metadata[k] = ArrayV2Metadata.from_dict(v) - else: - # We either have V2 metadata, or invalid metadata + # zarr_format is present in v2 and v3. 
+ zarr_format = parse_zarr_format(v["zarr_format"]) + + if zarr_format == 3: + node_type = parse_node_type(v.get("node_type", None)) + if node_type == "group": + metadata[k] = GroupMetadata.from_dict(v) + elif node_type == "array": + metadata[k] = ArrayV3Metadata.from_dict(v) + else: + assert_never(node_type) + elif zarr_format == 2: if "shape" in v: - # probably ArrayV2Metadata metadata[k] = ArrayV2Metadata.from_dict(v) else: - # probably v2 Group metadata metadata[k] = GroupMetadata.from_dict(v) + else: + assert_never(zarr_format) cls._flat_to_nested(metadata) diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py index 7d0e4cb81a..4e4ba23313 100644 --- a/tests/v3/test_metadata/test_v3.py +++ b/tests/v3/test_metadata/test_v3.py @@ -10,6 +10,7 @@ from zarr.codecs.bytes import BytesCodec from zarr.core.buffer import default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding +from zarr.core.group import parse_node_type from zarr.core.metadata.v3 import ( ArrayV3Metadata, DataType, @@ -18,6 +19,7 @@ parse_fill_value, parse_zarr_format, ) +from zarr.errors import MetadataValidationError if TYPE_CHECKING: from collections.abc import Sequence @@ -68,15 +70,29 @@ def test_parse_zarr_format_valid() -> None: assert parse_zarr_format(3) == 3 +def test_parse_node_type_valid() -> None: + assert parse_node_type("array") == "array" + assert parse_node_type("group") == "group" + + +@pytest.mark.parametrize("node_type", [None, 2, "other"]) +def test_parse_node_type_invalid(node_type: Any) -> None: + with pytest.raises( + MetadataValidationError, + match=f"Invalid value for 'node_type'. Expected 'array or group'. 
Got '{node_type}'.", +    ):  +        parse_node_type(node_type) + +  @pytest.mark.parametrize("data", [None, "group"]) -def test_parse_node_type_arrayinvalid(data: Any) -> None: +def test_parse_node_type_array_invalid(data: Any) -> None:     with pytest.raises(         ValueError, match=f"Invalid value for 'node_type'. Expected 'array'. Got '{data}'."     ):         parse_node_type_array(data)  -def test_parse_node_typevalid() -> None: +def test_parse_node_type_array_valid() -> None:     assert parse_node_type_array("array") == "array"   From d17f955b8b6d1f39f142895f817ff1faa19aea8b Mon Sep 17 00:00:00 2001 From: Tom Augspurger  Date: Thu, 10 Oct 2024 07:42:46 -0500 Subject: [PATCH 59/62] cleanup  ---  src/zarr/core/group.py | 6 +++---  1 file changed, 3 insertions(+), 3 deletions(-)  diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 1df02d01ed..7cf6b7bd64 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -550,7 +550,7 @@ async def open(             # V3 groups are comprised of a zarr.json object             assert zarr_json_bytes is not None             if not isinstance(use_consolidated, bool | None): -                raise TypeError("use_consolidated must be a bool for Zarr V3.") +                raise TypeError("use_consolidated must be a bool or None for Zarr V3.")              return cls._from_bytes_v3(                 store_path, @@ -1166,8 +1166,8 @@ async def _members(          # hmm lots of I/O and logic interleaved here.         # We *could* have an async gen over self.metadata.consolidated_metadata.metadata.keys() -        # and plug in here.l `getitem` will skip I/O. -        # Kinda a shape to have all the asyncio task overhead though, when it isn't needed. +        # and plug in here. `getitem` will skip I/O. +        # Kinda a shame to have all the asyncio task overhead though, when it isn't needed. 
async for key in self.store_path.store.list_dir(self.store_path.path): if key in _skip_keys: From 1d17140b40212131a342a7e7d565cc32d9a9068a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 10 Oct 2024 09:44:25 -0500 Subject: [PATCH 60/62] cleanup --- src/zarr/core/group.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 7cf6b7bd64..7f36c0d6d4 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1230,9 +1230,7 @@ async def contains(self, member: str) -> bool: return True async def groups(self) -> AsyncGenerator[tuple[str, AsyncGroup], None]: - members = [x async for x in self.members()] - # async for name, value in self.members(): - for name, value in members: + async for name, value in self.members(): if isinstance(value, AsyncGroup): yield name, value From 2b2e3da29a5b1bd304d0fd161804fc99466d4bd7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 10 Oct 2024 09:46:47 -0500 Subject: [PATCH 61/62] Fixed fill_value --- tests/v3/test_group.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index 9fd574143c..9b686a6618 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -414,6 +414,11 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat group.create_array(name=name, shape=(1,)) for name in expected_array_keys ] expected_arrays = list(zip(expected_array_keys, expected_array_values, strict=False)) + fill_value: float | None + if zarr_format == 2: + fill_value = None + else: + fill_value = np.float64(0.0) if consolidate: group = zarr.consolidate_metadata(store) @@ -422,7 +427,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "subarray": { "attributes": {}, "dtype": "float64", - "fill_value": np.float64(0.0), + "fill_value": fill_value, "shape": (1,), "chunks": (1,), "order": "C", @@ -453,7 +458,7 @@ def 
test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat }, "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), "data_type": "float64", - "fill_value": np.float64(0.0), + "fill_value": fill_value, "node_type": "array", "shape": (1,), "zarr_format": zarr_format, From c9229d19e4833b10c08fa32684ccfbb4c77e12f4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 10 Oct 2024 13:57:35 -0500 Subject: [PATCH 62/62] fixup --- src/zarr/api/asynchronous.py | 5 ++++- src/zarr/core/array.py | 11 ++++------- src/zarr/core/group.py | 32 -------------------------------- 3 files changed, 8 insertions(+), 40 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 0defac79a9..a6363bc185 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -162,6 +162,9 @@ async def consolidate_metadata( By default, the root node is used so all the metadata in the store is consolidated. + zarr_format : {2, 3, None}, optional + The zarr format of the hierarchy. By default the zarr format + is inferred. Returns ------- @@ -662,7 +665,7 @@ async def open_group( to users. Use `numpy.empty(())` by default. attributes : dict A dictionary of JSON-serializable values with user-defined attributes. - use_consolidated : bool or str, default None + use_consolidated : bool or str, default None Whether to use consolidated metadata. 
By default, consolidated metadata is used if it's present in the diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index f3b99a0b8f..ef4921d46f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -73,8 +73,9 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) +from zarr.core.metadata.v3 import parse_node_type_array from zarr.core.sync import collect_aiterator, sync -from zarr.errors import MetadataValidationError, NodeTypeValidationError +from zarr.errors import MetadataValidationError from zarr.registry import get_pipeline_class from zarr.storage import StoreLike, make_store_path from zarr.storage.common import StorePath, ensure_no_existing_node @@ -166,12 +167,8 @@ async def get_array_metadata( assert zarr_json_bytes is not None metadata_dict = json.loads(zarr_json_bytes.to_bytes()) - node_type = metadata_dict.get("node_type") - if node_type != "array": - # This KeyError is load bearing for `open`. That currently tries - # to open the node as an `array` and then falls back to opening - # as a group. 
- raise NodeTypeValidationError("node_type", "array", node_type) + parse_node_type_array(metadata_dict.get("node_type")) + return metadata_dict diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 1d19c3df2c..0b15e2f085 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -6,10 +6,8 @@ import logging from collections import defaultdict from dataclasses import asdict, dataclass, field, fields, replace -from enum import Enum from typing import TYPE_CHECKING, Literal, TypeVar, assert_never, cast, overload -import numcodecs.abc import numpy as np import numpy.typing as npt from typing_extensions import deprecated @@ -99,36 +97,6 @@ def _parse_async_node( raise TypeError(f"Unknown node type, got {type(node)}") -def _json_convert(o: object) -> Any: - if isinstance(o, np.dtype): - return str(o) - if np.isscalar(o): - out: Any - if hasattr(o, "dtype") and o.dtype.kind == "M" and hasattr(o, "view"): - # https://github.com/zarr-developers/zarr-python/issues/2119 - # `.item()` on a datetime type might or might not return an - # integer, depending on the value. - # Explicitly cast to an int first, and then grab .item() - out = o.view("i8").item() - else: - # convert numpy scalar to python type, and pass - # python types through - out = getattr(o, "item", lambda: o)() - if isinstance(out, complex): - # python complex types are not JSON serializable, so we use the - # serialization defined in the zarr v3 spec - return [out.real, out.imag] - return out - if isinstance(o, Enum): - return o.name - # this serializes numcodecs compressors - # todo: implement to_dict for codecs - elif isinstance(o, numcodecs.abc.Codec): - config: dict[str, Any] = o.get_config() - return config - raise TypeError - - @dataclass(frozen=True) class ConsolidatedMetadata: """