zarr-developers · rabernat · Oct 8, 2024 · Jul 14, 2024 · Sep 29, 2024 · Sep 29, 2024
diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py
@@ -7,6 +7,7 @@
 from zarr.codecs.pipeline import BatchedCodecPipeline
 from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
 from zarr.codecs.transpose import TransposeCodec
+from zarr.codecs.vlen_utf8 import VLenUTF8Codec
 from zarr.codecs.zstd import ZstdCodec
 
 __all__ = [
@@ -21,5 +22,6 @@
     "ShardingCodec",
     "ShardingCodecIndexLocation",
     "TransposeCodec",
+    "VLenUTF8Codec",
     "ZstdCodec",
 ]
diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+import numpy as np
+from numcodecs.vlen import VLenUTF8
+
+from zarr.abc.codec import ArrayBytesCodec
+from zarr.core.buffer import Buffer, NDBuffer
+from zarr.core.common import JSON, parse_named_configuration
+from zarr.registry import register_codec
+from zarr.strings import cast_to_string_dtype
+
+if TYPE_CHECKING:
+    from typing import Self
+
+    from zarr.core.array_spec import ArraySpec
+
+
+# VLenUTF8 takes no parameters, so a single module-level instance is shared by all codecs
+vlen_utf8_codec = VLenUTF8()
+
+
+@dataclass(frozen=True)
+class VLenUTF8Codec(ArrayBytesCodec):
+    """Variable-length UTF-8 string codec backed by ``numcodecs.vlen.VLenUTF8``."""
+
+    @classmethod
+    def from_dict(cls, data: dict[str, JSON]) -> Self:
+        """Construct the codec from its dict form; the configuration entry is optional."""
+        _, parsed = parse_named_configuration(data, "vlen-utf8", require_configuration=False)
+        return cls(**(parsed or {}))
+
+    def to_dict(self) -> dict[str, JSON]:
+        """Return the dict form of the codec, with an empty configuration."""
+        return {"name": "vlen-utf8", "configuration": {}}
+
+    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
+        """Return ``self`` unchanged; ``array_spec`` is not consulted."""
+        return self
+
+    async def _decode_single(
+        self,
+        chunk_bytes: Buffer,
+        chunk_spec: ArraySpec,
+    ) -> NDBuffer:
+        """Decode a chunk's bytes with the shared codec and wrap the result in an NDBuffer."""
+        assert isinstance(chunk_bytes, Buffer)
+
+        objects = vlen_utf8_codec.decode(chunk_bytes.as_array_like())
+        assert objects.dtype == np.object_
+        objects.shape = chunk_spec.shape
+        # values come straight out of the codec, so the cast is known to be safe: no warning
+        strings = cast_to_string_dtype(objects, safe=True)
+        return chunk_spec.prototype.nd_buffer.from_numpy_array(strings)
+
+    async def _encode_single(
+        self,
+        chunk_array: NDBuffer,
+        chunk_spec: ArraySpec,
+    ) -> Buffer | None:
+        """Encode a chunk's array with the shared codec and wrap the bytes in a Buffer."""
+        assert isinstance(chunk_array, NDBuffer)
+        make_buffer = chunk_spec.prototype.buffer.from_bytes
+        return make_buffer(vlen_utf8_codec.encode(chunk_array.as_numpy_array()))
+
+    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
+        """Not implemented for this codec; always raises ``NotImplementedError``."""
+        # what is input_byte_length for an object dtype?
+        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
+
+
+register_codec("vlen-utf8", VLenUTF8Codec)
diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py
@@ -313,8 +313,7 @@ class NDBuffer:
     """
 
     def __init__(self, array: NDArrayLike) -> None:
-        # assert array.ndim > 0
-        assert array.dtype != object
+        # assert array.dtype != object
         self._data = array
 
     @classmethod
@@ -467,9 +466,12 @@ def all_equal(self, other: Any, equal_nan: bool = True) -> bool:
             # Handle None fill_value for Zarr V2
             return False
         # use array_equal to obtain equal_nan=True functionality
+        # Note from Ryan: doesn't this lead to a huge amount of unnecessary memory allocation on every single chunk?
+        # Since fill-value is a scalar, isn't there a faster path than allocating a new array for fill value
+        # every single time we have to write data?
         _data, other = np.broadcast_arrays(self._data, other)
         return np.array_equal(
-            self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "US" else False
+            self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "USTO" else False
         )
 
     def fill(self, value: Any) -> None:

diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py
@@ -58,6 +58,7 @@ def reset(self) -> None:
                 "crc32c": "zarr.codecs.crc32c_.Crc32cCodec",
                 "sharding_indexed": "zarr.codecs.sharding.ShardingCodec",
                 "transpose": "zarr.codecs.transpose.TransposeCodec",
+                "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec",
             },
             "buffer": "zarr.core.buffer.cpu.Buffer",
             "ndbuffer": "zarr.core.buffer.cpu.NDBuffer",

diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py
@@ -6,8 +6,6 @@
 if TYPE_CHECKING:
     from typing import Self
 
-    import numpy.typing as npt
-
     from zarr.core.buffer import Buffer, BufferPrototype
     from zarr.core.chunk_grids import ChunkGrid
     from zarr.core.common import JSON, ChunkCoords
@@ -20,6 +18,7 @@
 
 import numcodecs.abc
 import numpy as np
+import numpy.typing as npt
 
 from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec
 from zarr.core.array_spec import ArraySpec
@@ -30,6 +29,7 @@
 from zarr.core.config import config
 from zarr.core.metadata.common import ArrayMetadata, parse_attributes
 from zarr.registry import get_codec_class
+from zarr.strings import STRING_DTYPE
 
 
 def parse_zarr_format(data: object) -> Literal[3]:
@@ -152,7 +152,7 @@ def _replace_special_floats(obj: object) -> Any:
 @dataclass(frozen=True, kw_only=True)
 class ArrayV3Metadata(ArrayMetadata):
     shape: ChunkCoords
-    data_type: np.dtype[Any]
+    data_type: DataType
     chunk_grid: ChunkGrid
     chunk_key_encoding: ChunkKeyEncoding
     fill_value: Any
@@ -167,7 +167,7 @@ def __init__(
         self,
         *,
         shape: Iterable[int],
-        data_type: npt.DTypeLike,
+        data_type: npt.DTypeLike | DataType,
         chunk_grid: dict[str, JSON] | ChunkGrid,
         chunk_key_encoding: dict[str, JSON] | ChunkKeyEncoding,
         fill_value: Any,
@@ -180,18 +180,18 @@ def __init__(
         Because the class is a frozen dataclass, we set attributes using object.__setattr__
         """
         shape_parsed = parse_shapelike(shape)
-        data_type_parsed = parse_dtype(data_type)
+        data_type_parsed = DataType.parse(data_type)
         chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid)
         chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding)
         dimension_names_parsed = parse_dimension_names(dimension_names)
-        fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed)
+        fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed.to_numpy_dtype())
         attributes_parsed = parse_attributes(attributes)
         codecs_parsed_partial = parse_codecs(codecs)
         storage_transformers_parsed = parse_storage_transformers(storage_transformers)
 
         array_spec = ArraySpec(
             shape=shape_parsed,
-            dtype=data_type_parsed,
+            dtype=data_type_parsed.to_numpy_dtype(),
             fill_value=fill_value_parsed,
             order="C",  # TODO: order is not needed here.
             prototype=default_buffer_prototype(),  # TODO: prototype is not needed here.
@@ -224,11 +224,14 @@ def _validate_metadata(self) -> None:
         if self.fill_value is None:
             raise ValueError("`fill_value` is required.")
         for codec in self.codecs:
-            codec.validate(shape=self.shape, dtype=self.data_type, chunk_grid=self.chunk_grid)
+            codec.validate(
+                shape=self.shape, dtype=self.data_type.to_numpy_dtype(), chunk_grid=self.chunk_grid
+            )
 
     @property
     def dtype(self) -> np.dtype[Any]:
-        return self.data_type
+        """Interpret Zarr dtype as NumPy dtype"""
+        return self.data_type.to_numpy_dtype()
 
     @property
     def ndim(self) -> int:
@@ -266,13 +269,13 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
         _ = parse_node_type_array(_data.pop("node_type"))
 
         # check that the data_type attribute is valid
-        _ = DataType(_data["data_type"])
+        data_type = DataType.parse(_data.pop("data_type"))
 
         # dimension_names key is optional, normalize missing to `None`
         _data["dimension_names"] = _data.pop("dimension_names", None)
         # attributes key is optional, normalize missing to `None`
         _data["attributes"] = _data.pop("attributes", None)
-        return cls(**_data)  # type: ignore[arg-type]
+        return cls(**_data, data_type=data_type)  # type: ignore[arg-type]
 
     def to_dict(self) -> dict[str, JSON]:
         out_dict = super().to_dict()
@@ -445,6 +448,7 @@ class DataType(Enum):
     float64 = "float64"
     complex64 = "complex64"
     complex128 = "complex128"
+    string = "string"
 
     @property
     def byte_count(self) -> int:
@@ -490,8 +494,16 @@ def to_numpy_shortname(self) -> str:
         }
         return data_type_to_numpy[self]
 
+    def to_numpy_dtype(self) -> np.dtype[Any]:
+        """Return the NumPy dtype used to represent this data type."""
+        if self == DataType.string:
+            return STRING_DTYPE
+        return np.dtype(self.to_numpy_shortname())
+
     @classmethod
-    def from_dtype(cls, dtype: np.dtype[Any]) -> DataType:
+    def from_numpy_dtype(cls, dtype: np.dtype[Any]) -> DataType:
+        if np.issubdtype(np.str_, dtype):
+            return DataType.string
         dtype_to_data_type = {
             "|b1": "bool",
             "bool": "bool",
@@ -511,16 +523,24 @@ def from_dtype(cls, dtype: np.dtype[Any]) -> DataType:
         }
         return DataType[dtype_to_data_type[dtype.str]]
 
-
-def parse_dtype(data: npt.DTypeLike) -> np.dtype[Any]:
-    try:
-        dtype = np.dtype(data)
-    except (ValueError, TypeError) as e:
-        raise ValueError(f"Invalid V3 data_type: {data}") from e
-    # check that this is a valid v3 data_type
-    try:
-        _ = DataType.from_dtype(dtype)
-    except KeyError as e:
-        raise ValueError(f"Invalid V3 data_type: {dtype}") from e
-
-    return dtype
+    @classmethod
+    def parse(cls, dtype: None | DataType | Any) -> DataType:
+        """Coerce ``dtype`` to a ``DataType``; ``None`` selects the default, float64."""
+        if dtype is None:
+            return DataType.float64
+        if isinstance(dtype, DataType):
+            return dtype
+        try:
+            return DataType(dtype)
+        except ValueError:
+            pass  # not an enum value; fall through and try it as a NumPy dtype
+        try:
+            dtype = np.dtype(dtype)
+        except (ValueError, TypeError) as e:
+            raise ValueError(f"Invalid V3 data_type: {dtype}") from e
+        # check that this is a valid v3 data_type
+        try:
+            data_type = DataType.from_numpy_dtype(dtype)
+        except KeyError as e:
+            raise ValueError(f"Invalid V3 data_type: {dtype}") from e
+        return data_type
diff --git a/src/zarr/strings.py b/src/zarr/strings.py
@@ -0,0 +1,62 @@
+from typing import Any
+from warnings import warn
+
+import numpy as np
+
+try:
+    STRING_DTYPE = np.dtype("T")
+    NUMPY_SUPPORTS_VLEN_STRING = True
+except TypeError:
+    STRING_DTYPE = np.dtype("object")
+    NUMPY_SUPPORTS_VLEN_STRING = False
+
+
+def cast_to_string_dtype(
+    data: np.ndarray[Any, np.dtype[Any]], safe: bool = False
+) -> np.ndarray[Any, np.dtype[Any]]:
+    """Return ``data`` as a string-typed array.
+
+    Parameters
+    ----------
+    data
+        Array to convert. Fixed-width (``np.str_``) and variable-length string arrays
+        are returned unchanged; object arrays are cast.
+    safe
+        Only used when NumPy has no variable-length string dtype: pass ``True`` to
+        suppress the warning emitted after casting an object array to ``np.str_``.
+
+    Returns
+    -------
+    numpy.ndarray
+        ``data`` itself, or a cast of it.
+
+    Raises
+    ------
+    ValueError
+        If ``data`` has any other dtype, or an object array cannot be cast to the
+        variable-length string dtype.
+    """
+    if np.issubdtype(data.dtype, np.str_):
+        return data
+    if data.dtype.kind == "T":
+        # np.issubdtype() does not report the variable-length StringDType as a subtype of
+        # np.str_, so accept it explicitly rather than falling through to the error below.
+        return data
+    if np.issubdtype(data.dtype, np.object_):
+        if NUMPY_SUPPORTS_VLEN_STRING:
+            try:
+                # cast to variable-length string dtype, fail if object contains non-string data
+                # mypy says "error: Unexpected keyword argument "coerce" for "StringDType"  [call-arg]"
+                return data.astype(np.dtypes.StringDType(coerce=False), copy=False)  # type: ignore[call-arg]
+            except ValueError as e:
+                raise ValueError("Cannot cast object dtype to string dtype") from e
+        else:
+            out = data.astype(np.str_)
+            if not safe:
+                warn(
+                    f"Casted object dtype to string dtype {out.dtype}. To avoid this warning, "
+                    "cast the data to a string dtype before passing to Zarr or upgrade to NumPy >= 2.",
+                    stacklevel=2,
+                )
+            return out
+    raise ValueError(f"Cannot cast dtype {data.dtype} to string dtype")
diff --git a/tests/v3/test_codecs/test_vlen.py b/tests/v3/test_codecs/test_vlen.py
@@ -0,0 +1,56 @@
+from typing import Any
+
+import numpy as np
+import pytest
+
+from zarr import Array
+from zarr.abc.store import Store
+from zarr.codecs import VLenUTF8Codec
+from zarr.core.metadata.v3 import ArrayV3Metadata, DataType
+from zarr.storage.common import StorePath
+from zarr.strings import NUMPY_SUPPORTS_VLEN_STRING
+
+numpy_str_dtypes: list[type | None] = [None, str, np.dtypes.StrDType]
+expected_zarr_string_dtype: np.dtype[Any]
+if NUMPY_SUPPORTS_VLEN_STRING:
+    numpy_str_dtypes.append(np.dtypes.StringDType)
+    expected_zarr_string_dtype = np.dtypes.StringDType()
+else:
+    expected_zarr_string_dtype = np.dtype("O")
+
+
+@pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"])
+@pytest.mark.parametrize("dtype", numpy_str_dtypes)
+async def test_vlen_string(store: Store, dtype: type | None) -> None:
+    """Round-trip string data through ``VLenUTF8Codec``.
+
+    Checks values, ``data_type`` metadata and dtype on the created array and again
+    after reopening it from the store.
+    """
+    strings = ["hello", "world", "this", "is", "a", "test"]
+    data = np.array(strings).reshape((2, 3))
+    if dtype is not None:
+        data = data.astype(dtype)
+
+    sp = StorePath(store, path="string")
+    a = Array.create(
+        sp,
+        shape=data.shape,
+        chunk_shape=data.shape,
+        dtype=data.dtype,
+        fill_value="",
+        codecs=[VLenUTF8Codec()],
+    )
+    assert isinstance(a.metadata, ArrayV3Metadata)  # needed for mypy
+
+    a[:, :] = data
+    assert np.array_equal(data, a[:, :])
+    assert a.metadata.data_type == DataType.string
+    assert a.dtype == expected_zarr_string_dtype
+
+    # test round trip
+    b = Array.open(sp)
+    assert isinstance(b.metadata, ArrayV3Metadata)  # needed for mypy
+    assert np.array_equal(data, b[:, :])
+    assert b.metadata.data_type == DataType.string
+    assert b.dtype == expected_zarr_string_dtype
diff --git a/tests/v3/test_config.py b/tests/v3/test_config.py
@@ -58,6 +58,7 @@ def test_config_defaults_set() -> None:
                 "crc32c": "zarr.codecs.crc32c_.Crc32cCodec",
                 "sharding_indexed": "zarr.codecs.sharding.ShardingCodec",
                 "transpose": "zarr.codecs.transpose.TransposeCodec",
+                "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec",
             },
         }
     ]