Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(fix): structured arrays for v2 #2681

Merged
merged 18 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/release-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ New features
Bug fixes
~~~~~~~~~
* Fixes ``order`` argument for Zarr format 2 arrays (:issue:`2679`).
* Backwards compatibility for Zarr format 2 structured arrays (:issue:`2134`).

* Fixes a bug that prevented reading Zarr format 2 data with consolidated metadata written using ``zarr-python`` version 2 (:issue:`2694`).

Expand Down
1 change: 1 addition & 0 deletions docs/user-guide/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ This is the current default configuration::
'level': 0}},
'v2_default_filters': {'bytes': [{'id': 'vlen-bytes'}],
'numeric': None,
'raw': None,
'string': [{'id': 'vlen-utf8'}]},
'v3_default_compressors': {'bytes': [{'configuration': {'checksum': False,
'level': 0},
Expand Down
4 changes: 3 additions & 1 deletion src/zarr/core/buffer/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,9 @@ def all_equal(self, other: Any, equal_nan: bool = True) -> bool:
# every single time we have to write data?
_data, other = np.broadcast_arrays(self._data, other)
return np.array_equal(
self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "USTO" else False
self._data,
other,
equal_nan=equal_nan if self._data.dtype.kind not in "USTOV" else False,
)

def fill(self, value: Any) -> None:
Expand Down
1 change: 1 addition & 0 deletions src/zarr/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def reset(self) -> None:
"numeric": None,
"string": [{"id": "vlen-utf8"}],
"bytes": [{"id": "vlen-bytes"}],
"raw": None,
},
"v3_default_filters": {"numeric": [], "string": [], "bytes": []},
"v3_default_serializer": {
Expand Down
15 changes: 13 additions & 2 deletions src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,14 @@ def to_dict(self) -> dict[str, JSON]:
zarray_dict["fill_value"] = fill_value

_ = zarray_dict.pop("dtype")
zarray_dict["dtype"] = self.dtype.str
dtype_json: JSON
# In the case of zarr v2, the simplest i.e., '|VXX' dtype is represented as a string
dtype_descr = self.dtype.descr
if self.dtype.kind == "V" and dtype_descr[0][0] != "" and len(dtype_descr) != 0:
dtype_json = tuple(self.dtype.descr)
else:
dtype_json = self.dtype.str
Comment on lines +197 to +202
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is my attempt to match the old behavior. I didn't look back at the old code yet, but if someone knows this to be wrong, would be great to know.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks right to me

zarray_dict["dtype"] = dtype_json

return zarray_dict

Expand All @@ -220,6 +227,8 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self:


def parse_dtype(data: npt.DTypeLike) -> np.dtype[Any]:
    """Coerce *data* into a :class:`numpy.dtype`.

    Structured dtypes round-tripped through JSON (``.zarray`` on disk) come
    back as nested lists like ``[["foo", "<i4"], ["bar", "<f8"]]``, but
    ``np.dtype`` only accepts the field entries as tuples, so normalize them
    first.  A ``list`` here is a valid ``_VoidDTypeLike`` spec.
    """
    if not isinstance(data, list):
        return np.dtype(data)
    fields = [tuple(field) for field in data]
    return np.dtype(fields)


Expand Down Expand Up @@ -376,8 +385,10 @@ def _default_filters(
dtype_key = "numeric"
elif dtype.kind in "U":
dtype_key = "string"
elif dtype.kind in "OSV":
elif dtype.kind in "OS":
dtype_key = "bytes"
elif dtype.kind == "V":
dtype_key = "raw"
else:
raise ValueError(f"Unsupported dtype kind {dtype.kind}")

Expand Down
1 change: 1 addition & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def test_config_defaults_set() -> None:
"numeric": None,
"string": [{"id": "vlen-utf8"}],
"bytes": [{"id": "vlen-bytes"}],
"raw": None,
},
"v3_default_filters": {"numeric": [], "string": [], "bytes": []},
"v3_default_serializer": {
Expand Down
41 changes: 35 additions & 6 deletions tests/test_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,15 @@ def test_codec_pipeline() -> None:
np.testing.assert_array_equal(result, expected)


@pytest.mark.parametrize("dtype", ["|S", "|V"])
async def test_v2_encode_decode(dtype):
@pytest.mark.parametrize(
("dtype", "expected_dtype", "fill_value", "fill_value_encoding"),
[
("|S", "|S0", b"X", "WA=="),
("|V", "|V0", b"X", "WA=="),
("|V10", "|V10", b"X", "WAAAAAAAAAAAAA=="),
],
)
async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_encoding) -> None:
with config.set(
{
"array.v2_default_filters.bytes": [{"id": "vlen-bytes"}],
Expand All @@ -97,7 +104,7 @@ async def test_v2_encode_decode(dtype):
shape=(3,),
chunks=(3,),
dtype=dtype,
fill_value=b"X",
fill_value=fill_value,
)

result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype())
Expand All @@ -107,9 +114,9 @@ async def test_v2_encode_decode(dtype):
expected = {
"chunks": [3],
"compressor": None,
"dtype": f"{dtype}0",
"fill_value": "WA==",
"filters": [{"id": "vlen-bytes"}],
"dtype": expected_dtype,
"fill_value": fill_value_encoding,
"filters": [{"id": "vlen-bytes"}] if dtype == "|S" else None,
"order": "C",
"shape": [3],
"zarr_format": 2,
Expand Down Expand Up @@ -263,3 +270,25 @@ def test_default_filters_and_compressor(dtype_expected: Any) -> None:
assert arr.metadata.compressor.codec_id == expected_compressor
if expected_filter is not None:
assert arr.metadata.filters[0].codec_id == expected_filter


@pytest.mark.parametrize("fill_value", [None, (b"", 0, 0.0)], ids=["no_fill", "fill"])
def test_structured_dtype_roundtrip(fill_value, tmp_path) -> None:
    """Round-trip a structured-dtype array through a Zarr format 2 store.

    Creates an on-disk v2 array with a structured dtype (optionally with a
    structured fill value), verifies the fill value is materialized before
    any write, writes data, re-opens the store, and checks the data is
    read back unchanged.
    """
    structured_dtype = np.dtype([("foo", "S3"), ("bar", "i4"), ("baz", "f8")])
    expected = np.array(
        [(b"aaa", 1, 4.2), (b"bbb", 2, 8.4), (b"ccc", 3, 12.6)],
        dtype=structured_dtype,
    )
    store_path = tmp_path / "data.zarr"
    created = zarr.create(
        shape=(3,),
        store=store_path,
        chunks=(2,),
        fill_value=fill_value,
        zarr_format=2,
        dtype=structured_dtype,
    )
    if fill_value is not None:
        # Before any chunks are written, every element reads as the fill value.
        filled = np.array([fill_value] * expected.shape[0], dtype=structured_dtype)
        assert (filled == created[:]).all()
    created[...] = expected
    # Re-open from disk so the metadata (dtype/fill_value) round-trip is exercised.
    reopened = zarr.open_array(store=store_path)
    assert (expected == reopened[:]).all()
Loading