Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(fix): structured arrays for v2 #2681

Merged
merged 18 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/release-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ New features
Bug fixes
~~~~~~~~~
* Fixes ``order`` argument for Zarr format 2 arrays (:issue:`2679`).
* Backwards compatibility for Zarr format 2 structured arrays (:issue:`2134`).

* Fixes a bug that prevented reading Zarr format 2 data with consolidated metadata written using ``zarr-python`` version 2 (:issue:`2694`).

Expand Down
1 change: 1 addition & 0 deletions docs/user-guide/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ This is the current default configuration::
'level': 0}},
'v2_default_filters': {'bytes': [{'id': 'vlen-bytes'}],
'numeric': None,
'raw': None,
'string': [{'id': 'vlen-utf8'}]},
'v3_default_compressors': {'bytes': [{'configuration': {'checksum': False,
'level': 0},
Expand Down
4 changes: 3 additions & 1 deletion src/zarr/core/buffer/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,9 @@ def all_equal(self, other: Any, equal_nan: bool = True) -> bool:
# every single time we have to write data?
_data, other = np.broadcast_arrays(self._data, other)
return np.array_equal(
self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "USTO" else False
self._data,
other,
equal_nan=equal_nan if self._data.dtype.kind not in "USTOV" else False,
)

def fill(self, value: Any) -> None:
Expand Down
1 change: 1 addition & 0 deletions src/zarr/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def reset(self) -> None:
"numeric": None,
"string": [{"id": "vlen-utf8"}],
"bytes": [{"id": "vlen-bytes"}],
"raw": None,
},
"v3_default_filters": {"numeric": [], "string": [], "bytes": []},
"v3_default_serializer": {
Expand Down
15 changes: 13 additions & 2 deletions src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,14 @@ def to_dict(self) -> dict[str, JSON]:
zarray_dict["fill_value"] = fill_value

_ = zarray_dict.pop("dtype")
zarray_dict["dtype"] = self.dtype.str
dtype_json: JSON
# In the case of zarr v2, the simplest i.e., '|VXX' dtype is represented as a string
dtype_descr = self.dtype.descr
if self.dtype.kind == "V" and dtype_descr[0][0] != "" and len(dtype_descr) != 0:
dtype_json = tuple(self.dtype.descr)
else:
dtype_json = self.dtype.str
Comment on lines +197 to +202
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is my attempt to match the old behavior. I didn't look back at the old code yet, but if someone knows this to be wrong, would be great to know.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks right to me

zarray_dict["dtype"] = dtype_json

return zarray_dict

Expand All @@ -220,6 +227,8 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self:


def parse_dtype(data: npt.DTypeLike) -> np.dtype[Any]:
    """Coerce *data* into a :class:`numpy.dtype`.

    Structured dtypes round-tripped through JSON (``.zarray`` on disk) come
    back as nested lists like ``[["foo", "<i4"], ["bar", "<f8"]]``, but
    ``np.dtype`` only accepts the field entries as tuples, so normalize them
    first.  A ``list`` here is a valid ``_VoidDTypeLike`` spec.
    """
    if not isinstance(data, list):
        return np.dtype(data)
    fields = [tuple(field) for field in data]
    return np.dtype(fields)


Expand Down Expand Up @@ -376,8 +385,10 @@ def _default_filters(
dtype_key = "numeric"
elif dtype.kind in "U":
dtype_key = "string"
elif dtype.kind in "OSV":
elif dtype.kind in "OS":
dtype_key = "bytes"
elif dtype.kind == "V":
dtype_key = "raw"
else:
raise ValueError(f"Unsupported dtype kind {dtype.kind}")

Expand Down
1 change: 1 addition & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def test_config_defaults_set() -> None:
"numeric": None,
"string": [{"id": "vlen-utf8"}],
"bytes": [{"id": "vlen-bytes"}],
"raw": None,
},
"v3_default_filters": {"numeric": [], "string": [], "bytes": []},
"v3_default_serializer": {
Expand Down
41 changes: 35 additions & 6 deletions tests/test_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,15 @@ def test_codec_pipeline() -> None:
np.testing.assert_array_equal(result, expected)


@pytest.mark.parametrize("dtype", ["|S", "|V"])
async def test_v2_encode_decode(dtype):
@pytest.mark.parametrize(
("dtype", "expected_dtype", "fill_value", "fill_value_encoding"),
[
("|S", "|S0", b"X", "WA=="),
("|V", "|V0", b"X", "WA=="),
("|V10", "|V10", b"X", "WAAAAAAAAAAAAA=="),
],
)
async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_encoding) -> None:
with config.set(
{
"array.v2_default_filters.bytes": [{"id": "vlen-bytes"}],
Expand All @@ -97,7 +104,7 @@ async def test_v2_encode_decode(dtype):
shape=(3,),
chunks=(3,),
dtype=dtype,
fill_value=b"X",
fill_value=fill_value,
)

result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype())
Expand All @@ -107,9 +114,9 @@ async def test_v2_encode_decode(dtype):
expected = {
"chunks": [3],
"compressor": None,
"dtype": f"{dtype}0",
"fill_value": "WA==",
"filters": [{"id": "vlen-bytes"}],
"dtype": expected_dtype,
"fill_value": fill_value_encoding,
"filters": [{"id": "vlen-bytes"}] if dtype == "|S" else None,
"order": "C",
"shape": [3],
"zarr_format": 2,
Expand Down Expand Up @@ -263,3 +270,25 @@ def test_default_filters_and_compressor(dtype_expected: Any) -> None:
assert arr.metadata.compressor.codec_id == expected_compressor
if expected_filter is not None:
assert arr.metadata.filters[0].codec_id == expected_filter


@pytest.mark.parametrize("fill_value", [None, (b"", 0, 0.0)], ids=["no_fill", "fill"])
def test_structured_dtype_roundtrip(fill_value, tmp_path) -> None:
    """Round-trip a structured-dtype array through a Zarr format 2 store.

    Creates an on-disk v2 array with a structured dtype (optionally with a
    structured fill value), verifies the fill value is materialized before
    any write, writes data, re-opens the store, and checks the data is
    read back unchanged.
    """
    structured_dtype = np.dtype([("foo", "S3"), ("bar", "i4"), ("baz", "f8")])
    expected = np.array(
        [(b"aaa", 1, 4.2), (b"bbb", 2, 8.4), (b"ccc", 3, 12.6)],
        dtype=structured_dtype,
    )
    store_path = tmp_path / "data.zarr"
    created = zarr.create(
        shape=(3,),
        store=store_path,
        chunks=(2,),
        fill_value=fill_value,
        zarr_format=2,
        dtype=structured_dtype,
    )
    if fill_value is not None:
        # Before any chunks are written, every element reads as the fill value.
        filled = np.array([fill_value] * expected.shape[0], dtype=structured_dtype)
        assert (filled == created[:]).all()
    created[...] = expected
    # Re-open from disk so the metadata (dtype/fill_value) round-trip is exercised.
    reopened = zarr.open_array(store=store_path)
    assert (expected == reopened[:]).all()
Loading