From 3717391088ce26c1277a95216cf6364cc1d65908 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 8 Oct 2024 20:17:11 -0400 Subject: [PATCH 1/4] wip --- xarray/backends/zarr.py | 53 +++++++++++++++++++++++++++-------- xarray/coding/variables.py | 1 + xarray/core/dtypes.py | 9 +++++- xarray/tests/test_backends.py | 3 +- 4 files changed, 53 insertions(+), 13 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 2f7fe8702b1..5aebb754409 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -495,6 +495,7 @@ class ZarrStore(AbstractWritableDataStore): "_safe_chunks", "_write_empty", "_close_store_on_close", + "_use_zarr_fill_value_as_mask" ) @classmethod @@ -513,10 +514,11 @@ def open_store( safe_chunks=True, stacklevel=2, zarr_version=None, + use_zarr_fill_value_as_mask=None, write_empty: bool | None = None, ): - zarr_group, consolidate_on_close, close_store_on_close = _get_open_params( + zarr_group, consolidate_on_close, close_store_on_close, use_zarr_fill_value_as_mask = _get_open_params( store=store, mode=mode, synchronizer=synchronizer, @@ -527,6 +529,7 @@ def open_store( storage_options=storage_options, stacklevel=stacklevel, zarr_version=zarr_version, + use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask ) group_paths = [node for node in _iter_zarr_groups(zarr_group, parent=group)] return { @@ -539,6 +542,7 @@ def open_store( safe_chunks, write_empty, close_store_on_close, + use_zarr_fill_value_as_mask ) for group in group_paths } @@ -559,10 +563,11 @@ def open_group( safe_chunks=True, stacklevel=2, zarr_version=None, + use_zarr_fill_value_as_mask=None, write_empty: bool | None = None, ): - zarr_group, consolidate_on_close, close_store_on_close = _get_open_params( + zarr_group, consolidate_on_close, close_store_on_close, use_zarr_fill_value_as_mask = _get_open_params( store=store, mode=mode, synchronizer=synchronizer, @@ -573,6 +578,7 @@ def open_group( storage_options=storage_options, stacklevel=stacklevel, zarr_version=zarr_version, + use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask ) return cls( @@ -584,6 +590,7 @@ def open_group( safe_chunks, write_empty, close_store_on_close, + use_zarr_fill_value_as_mask ) def __init__( @@ -596,6 +603,7 @@ def __init__( safe_chunks=True, write_empty: bool | None = None, close_store_on_close: bool = False, + use_zarr_fill_value_as_mask=None ): self.zarr_group = zarr_group self._read_only = self.zarr_group.read_only @@ -607,7 +615,8 @@ def __init__( self._write_region = write_region self._safe_chunks = safe_chunks self._write_empty = write_empty - self._close_store_on_close = close_store_on_close + self._close_store_on_close = close_store_on_close, + self._use_zarr_fill_value_as_mask = use_zarr_fill_value_as_mask @property def ds(self): @@ -650,10 +659,11 @@ def open_store_variable(self, name, zarr_array=None): } ) - # _FillValue needs to be in attributes, not encoding, so it will get - # picked up by decode_cf - if zarr_array.fill_value is not None: - attributes["_FillValue"] = zarr_array.fill_value + if self._use_zarr_fill_value_as_mask: + # Setting this attribute triggers CF decoding for missing values + # TODO: it feels a bit hacky to hijack CF decoding for this purpose + if zarr_array.fill_value is not None: + attributes["_FillValue"] = zarr_array.fill_value return Variable(dimensions, data, attributes, encoding) @@ -857,9 +867,12 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No dtype = v.dtype shape = v.shape - fill_value = attrs.pop("_FillValue", None) - if v.encoding == {"_FillValue": None} and fill_value is None: - v.encoding = {} + if self._use_zarr_fill_value_as_mask: + fill_value = attrs.pop("_FillValue", None) + if v.encoding == {"_FillValue": None} and fill_value is None: + v.encoding = {} + else: + fill_value=None zarr_array = None zarr_shape = None @@ -967,6 +980,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No pipeline = encoding.pop("codecs") encoding["codecs"] = pipeline + print(dtype, encoding) zarr_array = self.zarr_group.create( name, shape=shape, @@ -1084,6 +1098,7 @@ def open_zarr( decode_timedelta=None, use_cftime=None, zarr_version=None, + use_zarr_fill_value_as_mask=None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, **kwargs, @@ -1176,6 +1191,11 @@ def open_zarr( The desired zarr spec version to target (currently 2 or 3). The default of None will attempt to determine the zarr version from ``store`` when possible, otherwise defaulting to 2. + use_zarr_fill_value_as_mask : bool, optional + If True, use the zarr Array `fill_value` to mask the data, the same as done + for NetCDF data with `_FillValue` or `missing_value` attributes. If False, + the `fill_value` is ignored and the data are not masked. If None, this defaults + to True for `zarr_version=2` and False for `zarr_version=3`. chunked_array_type: str, optional Which chunked array type to coerce this datasets' arrays to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntryPoint` system. @@ -1247,6 +1267,7 @@ def open_zarr( decode_timedelta=decode_timedelta, use_cftime=use_cftime, zarr_version=zarr_version, + use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask, ) return ds @@ -1297,6 +1318,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti zarr_version=None, store=None, engine=None, + use_zarr_fill_value_as_mask=None ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) if not store: @@ -1311,6 +1333,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti storage_options=storage_options, stacklevel=stacklevel + 1, zarr_version=zarr_version, + use_zarr_fill_value_as_mask=None ) store_entrypoint = StoreBackendEntrypoint() @@ -1443,6 +1466,7 @@ def _get_open_params( storage_options, stacklevel, zarr_version, + use_zarr_fill_value_as_mask ): import zarr @@ -1505,7 +1529,14 @@ def _get_open_params( else: zarr_group = zarr.open_group(store, **open_kwargs) close_store_on_close = zarr_group.store is not store - return zarr_group, consolidate_on_close, close_store_on_close + if use_zarr_fill_value_as_mask is None: + if zarr_version == 3: + # for new data, we use a better default + use_zarr_fill_value_as_mask = False + else: + # this was the default for v2 and shold apply to most existing Zarr data + use_zarr_fill_value_as_mask = True + return zarr_group, consolidate_on_close, close_store_on_close, use_zarr_fill_value_as_mask BACKEND_ENTRYPOINTS["zarr"] = ("zarr", ZarrBackendEntrypoint) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 74916886026..4c235b3e42b 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -464,6 +464,7 @@ def decode(self, variable: Variable, name: T_Name = None): dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min else: if "scale_factor" not in attrs and "add_offset" not in attrs: + print(data.dtype) dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) else: dtype, decoded_fill_value = ( diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 7464c1e8a89..0d228e17a19 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -61,7 +61,13 @@ def maybe_promote(dtype: np.dtype) -> tuple[np.dtype, Any]: # N.B. these casting rules should match pandas dtype_: np.typing.DTypeLike fill_value: Any - if isdtype(dtype, "real floating"): + print("maybe_promote", dtype) + if np.issubdtype(dtype, np.dtypes.StringDType()): + # for now, we always promote string dtypes to object for consistency with existing behavior + # TODO: refactor this once we have a better way to handle numpy vlen-string dtypes + dtype_ = object + fill_value = np.nan + elif isdtype(dtype, "real floating"): dtype_ = dtype fill_value = np.nan elif np.issubdtype(dtype, np.timedelta64): @@ -196,6 +202,7 @@ def isdtype(dtype, kind: str | tuple[str, ...], xp=None) -> bool: Unlike xp.isdtype(), kind must be a string. """ + print(dtype, kind, xp) # TODO(shoyer): remove this wrapper when Xarray requires # numpy>=2 and pandas extensions arrays are implemented in # Xarray via the array API diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 184e1b1c63a..3cdd5baffd2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2242,7 +2242,7 @@ def create_store(self): def save(self, dataset, store_target, **kwargs): # type: ignore[override] if have_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: for k, v in dataset.variables.items(): - if v.dtype.kind in ("U", "O", "M", "b"): + if v.dtype.kind in ("M",): pytest.skip(reason=f"Unsupported dtype {v} for variable: {k}") return dataset.to_zarr(store=store_target, **kwargs, **self.version_kwargs) @@ -2265,6 +2265,7 @@ def roundtrip( with self.create_zarr_target() as store_target: self.save(data, store_target, **save_kwargs) with self.open(store_target, **open_kwargs) as ds: + assert False yield ds @pytest.mark.parametrize("consolidated", [False, True, None]) From 8d16bb291da4a10912030cf556b6255de0254e1e Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 8 Oct 2024 23:16:59 -0400 Subject: [PATCH 2/4] most Zarr tests passing --- xarray/backends/zarr.py | 54 ++++++++++++++++++++++++----------- xarray/coding/variables.py | 14 ++++++++- xarray/conventions.py | 3 ++ xarray/core/dtypes.py | 2 -- xarray/tests/test_backends.py | 8 +++--- 5 files changed, 58 insertions(+), 23 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 5aebb754409..58572f4998c 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -414,6 +414,7 @@ def _validate_datatypes_for_zarr_append(vname, existing_var, new_var): or np.issubdtype(new_var.dtype, np.datetime64) or np.issubdtype(new_var.dtype, np.bool_) or new_var.dtype == object + or (new_var.dtype.kind in ("S", "U") and existing_var.dtype == object) ): # We can skip dtype equality checks under two conditions: (1) if the var to append is # new to the dataset, because in this case there is no existing var to compare it to; @@ -495,7 +496,7 @@ class ZarrStore(AbstractWritableDataStore): "_safe_chunks", "_write_empty", "_close_store_on_close", - "_use_zarr_fill_value_as_mask" + "_use_zarr_fill_value_as_mask", ) @classmethod @@ -518,7 +519,12 @@ def open_store( write_empty: bool | None = None, ): - zarr_group, consolidate_on_close, close_store_on_close, use_zarr_fill_value_as_mask = _get_open_params( + ( + zarr_group, + consolidate_on_close, + close_store_on_close, + use_zarr_fill_value_as_mask, + ) = _get_open_params( store=store, mode=mode, synchronizer=synchronizer, @@ -529,7 +535,7 @@ def open_store( storage_options=storage_options, stacklevel=stacklevel, zarr_version=zarr_version, - use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask + use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask, ) group_paths = [node for node in _iter_zarr_groups(zarr_group, parent=group)] return { @@ -542,7 +548,7 @@ def open_store( safe_chunks, write_empty, close_store_on_close, - use_zarr_fill_value_as_mask + use_zarr_fill_value_as_mask, ) for group in group_paths } @@ -567,7 +573,12 @@ def open_group( write_empty: bool | None = None, ): - zarr_group, consolidate_on_close, close_store_on_close, use_zarr_fill_value_as_mask = _get_open_params( + ( + zarr_group, + consolidate_on_close, + close_store_on_close, + use_zarr_fill_value_as_mask, + ) = _get_open_params( store=store, mode=mode, synchronizer=synchronizer, @@ -578,7 +589,7 @@ def open_group( storage_options=storage_options, stacklevel=stacklevel, zarr_version=zarr_version, - use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask + use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask, ) return cls( @@ -590,7 +601,7 @@ def open_group( safe_chunks, write_empty, close_store_on_close, - use_zarr_fill_value_as_mask + use_zarr_fill_value_as_mask, ) def __init__( @@ -603,7 +614,7 @@ def __init__( safe_chunks=True, write_empty: bool | None = None, close_store_on_close: bool = False, - use_zarr_fill_value_as_mask=None + use_zarr_fill_value_as_mask=None, ): self.zarr_group = zarr_group self._read_only = self.zarr_group.read_only @@ -615,7 +626,7 @@ def __init__( self._write_region = write_region self._safe_chunks = safe_chunks self._write_empty = write_empty - self._close_store_on_close = close_store_on_close, + self._close_store_on_close = (close_store_on_close,) self._use_zarr_fill_value_as_mask = use_zarr_fill_value_as_mask @property @@ -872,7 +883,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No if v.encoding == {"_FillValue": None} and fill_value is None: v.encoding = {} else: - fill_value=None + fill_value = None zarr_array = None zarr_shape = None @@ -980,7 +991,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No pipeline = encoding.pop("codecs") encoding["codecs"] = pipeline - print(dtype, encoding) zarr_array = self.zarr_group.create( name, shape=shape, @@ -1318,7 +1328,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti zarr_version=None, store=None, engine=None, - use_zarr_fill_value_as_mask=None + use_zarr_fill_value_as_mask=None, ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) if not store: @@ -1333,7 +1343,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti storage_options=storage_options, stacklevel=stacklevel + 1, zarr_version=zarr_version, - use_zarr_fill_value_as_mask=None + use_zarr_fill_value_as_mask=None, ) store_entrypoint = StoreBackendEntrypoint() @@ -1466,7 +1476,7 @@ def _get_open_params( storage_options, stacklevel, zarr_version, - use_zarr_fill_value_as_mask + use_zarr_fill_value_as_mask, ): import zarr @@ -1529,14 +1539,26 @@ def _get_open_params( else: zarr_group = zarr.open_group(store, **open_kwargs) close_store_on_close = zarr_group.store is not store + + # we use this to determine how to handle fill_value + is_zarr_v3_format: bool + if _zarr_v3(): + is_zarr_v3_format = zarr_group.metadata.zarr_format == 3 + else: + is_zarr_v3_format = False if use_zarr_fill_value_as_mask is None: - if zarr_version == 3: + if is_zarr_v3_format: # for new data, we use a better default use_zarr_fill_value_as_mask = False else: # this was the default for v2 and shold apply to most existing Zarr data use_zarr_fill_value_as_mask = True - return zarr_group, consolidate_on_close, close_store_on_close, use_zarr_fill_value_as_mask + return ( + zarr_group, + consolidate_on_close, + close_store_on_close, + use_zarr_fill_value_as_mask, + ) BACKEND_ENTRYPOINTS["zarr"] = ("zarr", ZarrBackendEntrypoint) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 4c235b3e42b..3fbc175d3e3 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -464,7 +464,6 @@ def decode(self, variable: Variable, name: T_Name = None): dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min else: if "scale_factor" not in attrs and "add_offset" not in attrs: - print(data.dtype) dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) else: dtype, decoded_fill_value = ( @@ -708,6 +707,19 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: return variable +class Numpy2StringDTypeCoder(VariableCoder): + # Convert Numpy 2 StringDType arrays to object arrays for backwards compatibility + # TODO: remove this if / when we decide to allow StringDType arrays in Xarray + def encode(self): + raise NotImplementedError + + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + if variable.dtype.kind == "T": + return variable.astype(object) + else: + return variable + + class NativeEnumCoder(VariableCoder): """Encode Enum into variable dtype metadata.""" diff --git a/xarray/conventions.py b/xarray/conventions.py index 18a81938225..9f719b8497c 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -276,6 +276,9 @@ def decode_cf_variable( var = variables.ObjectVLenStringCoder().decode(var) original_dtype = var.dtype + if original_dtype.kind == "T": + var = variables.Numpy2StringDTypeCoder().decode(var) + if mask_and_scale: for coder in [ variables.CFMaskCoder(), diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 0d228e17a19..985eb3fa83f 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -61,7 +61,6 @@ def maybe_promote(dtype: np.dtype) -> tuple[np.dtype, Any]: # N.B. these casting rules should match pandas dtype_: np.typing.DTypeLike fill_value: Any - print("maybe_promote", dtype) if np.issubdtype(dtype, np.dtypes.StringDType()): # for now, we always promote string dtypes to object for consistency with existing behavior # TODO: refactor this once we have a better way to handle numpy vlen-string dtypes @@ -202,7 +201,6 @@ def isdtype(dtype, kind: str | tuple[str, ...], xp=None) -> bool: Unlike xp.isdtype(), kind must be a string. """ - print(dtype, kind, xp) # TODO(shoyer): remove this wrapper when Xarray requires # numpy>=2 and pandas extensions arrays are implemented in # Xarray via the array API diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3cdd5baffd2..7f3e00d872f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -558,6 +558,7 @@ def test_roundtrip_object_dtype(self) -> None: # This currently includes all netCDF files when encoding is not # explicitly set. # https://github.com/pydata/xarray/issues/1647 + # Also Zarr expected["bytes_nans"][-1] = b"" expected["strings_nans"][-1] = "" assert_identical(expected, actual) @@ -2265,7 +2266,6 @@ def roundtrip( with self.create_zarr_target() as store_target: self.save(data, store_target, **save_kwargs) with self.open(store_target, **open_kwargs) as ds: - assert False yield ds @pytest.mark.parametrize("consolidated", [False, True, None]) @@ -2745,12 +2745,12 @@ def test_check_encoding_is_consistent_after_append(self) -> None: with self.create_zarr_target() as store_target: import numcodecs - compressor = numcodecs.Blosc() - if have_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: + compressor = zarr.codecs.BloscCodec() encoding_key = "codecs" - encoding_value = [compressor] + encoding_value = [zarr.codecs.BytesCodec(), compressor] else: + compressor = numcodecs.Blosc() encoding_key = "compressor" encoding_value = compressor From bd978b04827b858a106a18b19e311caf1cf71abb Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 8 Oct 2024 23:29:18 -0400 Subject: [PATCH 3/4] unskip tests --- xarray/tests/test_backends.py | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7f3e00d872f..990fc840b23 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3164,38 +3164,6 @@ def test_chunked_cftime_datetime(self) -> None: assert original[name].chunks == actual_var.chunks assert original.chunks == actual.chunks - def test_write_store(self) -> None: - skip_if_zarr_format_3(reason="unsupported dtypes") - return super().test_write_store() - - def test_roundtrip_endian(self) -> None: - skip_if_zarr_format_3(reason="unsupported dtypes") - return super().test_roundtrip_endian() - - def test_roundtrip_bytes_with_fill_value(self) -> None: - skip_if_zarr_format_3(reason="unsupported dtypes") - return super().test_roundtrip_bytes_with_fill_value() - - def test_default_fill_value(self) -> None: - skip_if_zarr_format_3(reason="fill_value always written") - return super().test_default_fill_value() - - def test_explicitly_omit_fill_value(self) -> None: - skip_if_zarr_format_3(reason="fill_value always written") - return super().test_explicitly_omit_fill_value() - - def test_explicitly_omit_fill_value_via_encoding_kwarg(self) -> None: - skip_if_zarr_format_3(reason="fill_value always written") - return super().test_explicitly_omit_fill_value_via_encoding_kwarg() - - def test_explicitly_omit_fill_value_in_coord(self) -> None: - skip_if_zarr_format_3(reason="fill_value always written") - return super().test_explicitly_omit_fill_value_in_coord() - - def test_explicitly_omit_fill_value_in_coord_via_encoding_kwarg(self) -> None: - skip_if_zarr_format_3(reason="fill_value always written") - return super().test_explicitly_omit_fill_value_in_coord_via_encoding_kwarg() - @requires_zarr @pytest.mark.skipif(not have_zarr_v3, reason="requires zarr version 3") From 34c4c2430f9f0e3559b47f52d6077c536cb2386a Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 9 Oct 2024 10:54:46 -0400 Subject: [PATCH 4/4] add custom Zarr _FillValue encoding / decoding --- xarray/backends/zarr.py | 59 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 58572f4998c..f3d122ad394 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,8 +1,10 @@ from __future__ import annotations +import base64 import functools import json import os +import struct import warnings from collections.abc import Callable, Iterable from typing import TYPE_CHECKING, Any, Literal @@ -58,6 +60,49 @@ def _zarr_v3() -> bool: ZarrFormat = Literal[2, 3] +class FillValueCoder: + """Handle custom logic to safely encode and decode fill values in Zarr. + Possibly redundant with logic in xarray/coding/variables.py but needs to be + isolated from NetCDF-specific logic. + """ + + @classmethod + def encode(cls, value: int | float | str | bytes, dtype: np.dtype[Any]) -> Any: + if dtype.kind in "S": + # byte string + return base64.standard_b64encode(value).decode() + elif dtype.kind in "b": + # boolean + return bool(value) + elif dtype.kind in "iu": + # todo: do we want to check for decimals? + return int(value) + elif dtype.kind in "f": + return base64.standard_b64encode(struct.pack("