diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ca0e913f773..5bc4f256fdb 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -42,6 +42,11 @@ New Features with a dimension. Examples include output from finite volume models like FVCOM. (:issue:`2233`, :pull:`7989`) By `Deepak Cherian `_ and `Benoit Bovy `_. +- When outputting :py:class:`Dataset` objects as Zarr via :py:meth:`Dataset.to_zarr`, + user can now specify that chunks that will contain no valid data will not be written. + Originally, this could be done by specifying ``"write_empty_chunks": True`` in the + ``encoding`` parameter; however, this setting would not carry over when appending new + data to an existing dataset. (:issue:`8009`) Requires ``zarr>=2.11``. Breaking changes diff --git a/xarray/backends/api.py b/xarray/backends/api.py index d992d3999a3..e35d85a1e2f 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1520,6 +1520,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + write_empty_chunks: bool | None = None, chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> backends.ZarrStore: ... @@ -1543,6 +1544,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + write_empty_chunks: bool | None = None, chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> Delayed: ... @@ -1563,6 +1565,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + write_empty_chunks: bool | None = None, chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> backends.ZarrStore | Delayed: """This function creates an appropriate datastore for writing a dataset to @@ -1656,6 +1659,7 @@ def to_zarr( safe_chunks=safe_chunks, stacklevel=4, # for Dataset.to_zarr() zarr_version=zarr_version, + write_empty=write_empty_chunks, ) if mode in ["a", "r+"]: diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 214581bba27..f88523422bb 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -368,6 +368,7 @@ class ZarrStore(AbstractWritableDataStore): "_synchronizer", "_write_region", "_safe_chunks", + "_write_empty", ) @classmethod @@ -386,6 +387,7 @@ def open_group( safe_chunks=True, stacklevel=2, zarr_version=None, + write_empty: bool | None = None, ): import zarr @@ -457,6 +459,7 @@ def open_group( append_dim, write_region, safe_chunks, + write_empty, ) def __init__( @@ -467,6 +470,7 @@ def __init__( append_dim=None, write_region=None, safe_chunks=True, + write_empty: bool | None = None, ): self.zarr_group = zarr_group self._read_only = self.zarr_group.read_only @@ -477,6 +481,7 @@ def __init__( self._append_dim = append_dim self._write_region = write_region self._safe_chunks = safe_chunks + self._write_empty = write_empty @property def ds(self): @@ -648,6 +653,8 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No dimensions. """ + import zarr + for vn, v in variables.items(): name = _encode_variable_name(vn) check = vn in check_encoding_set @@ -665,7 +672,14 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No # TODO: if mode="a", consider overriding the existing variable # metadata. This would need some case work properly with region # and append_dim. - zarr_array = self.zarr_group[name] + if self._write_empty is not None: + zarr_array = zarr.open( + store=self.zarr_group.store, + path=f"{self.zarr_group.name}/{name}", + write_empty_chunks=self._write_empty, + ) + else: + zarr_array = self.zarr_group[name] else: # new variable encoding = extract_zarr_variable_encoding( @@ -679,8 +693,25 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No if coding.strings.check_vlen_dtype(dtype) == str: dtype = str + + if self._write_empty is not None: + if ( + "write_empty_chunks" in encoding + and encoding["write_empty_chunks"] != self._write_empty + ): + raise ValueError( + 'Differing "write_empty_chunks" values in encoding and parameters' + f'Got {encoding["write_empty_chunks"] = } and {self._write_empty = }' + ) + else: + encoding["write_empty_chunks"] = self._write_empty + zarr_array = self.zarr_group.create( - name, shape=shape, dtype=dtype, fill_value=fill_value, **encoding + name, + shape=shape, + dtype=dtype, + fill_value=fill_value, + **encoding, ) zarr_array = _put_attrs(zarr_array, encoded_attrs) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 2a0265f1420..bdf2d8babe1 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2280,6 +2280,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + write_empty_chunks: bool | None = None, chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> ZarrStore: ... @@ -2302,6 +2303,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + write_empty_chunks: bool | None = None, chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> Delayed: ... @@ -2321,6 +2323,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + write_empty_chunks: bool | None = None, chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> ZarrStore | Delayed: """Write dataset contents to a zarr group. @@ -2410,6 +2413,17 @@ def to_zarr( The desired zarr spec version to target (currently 2 or 3). The default of None will attempt to determine the zarr version from ``store`` when possible, otherwise defaulting to 2. + write_empty_chunks : bool or None, optional + If True, all chunks will be stored regardless of their + contents. If False, each chunk is compared to the array's fill value + prior to storing. If a chunk is uniformly equal to the fill value, then + that chunk is not be stored, and the store entry for that chunk's key + is deleted. This setting enables sparser storage, as only chunks with + non-fill-value data are stored, at the expense of overhead associated + with checking the data of each chunk. If None (default) fall back to + specification(s) in ``encoding`` or Zarr defaults. A ``ValueError`` + will be raised if the value of this (if not None) differs with + ``encoding``. chunkmanager_store_kwargs : dict, optional Additional keyword arguments passed on to the `ChunkManager.store` method used to store chunked arrays. For example for a dask array additional kwargs will be passed eventually to @@ -2459,6 +2473,7 @@ def to_zarr( region=region, safe_chunks=safe_chunks, zarr_version=zarr_version, + write_empty_chunks=write_empty_chunks, chunkmanager_store_kwargs=chunkmanager_store_kwargs, ) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index dad2d668ff8..d54e1004f08 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -16,6 +16,7 @@ from collections.abc import Iterator from contextlib import ExitStack from io import BytesIO +from os import listdir from pathlib import Path from typing import TYPE_CHECKING, Any, Final, cast @@ -2665,6 +2666,86 @@ def create_store(self): yield group +@requires_zarr +class TestZarrWriteEmpty(TestZarrDirectoryStore): + @contextlib.contextmanager + def temp_dir(self) -> Iterator[tuple[str, str]]: + with tempfile.TemporaryDirectory() as d: + store = os.path.join(d, "test.zarr") + yield d, store + + @contextlib.contextmanager + def roundtrip_dir( + self, + data, + store, + save_kwargs=None, + open_kwargs=None, + allow_cleanup_failure=False, + ) -> Iterator[Dataset]: + if save_kwargs is None: + save_kwargs = {} + if open_kwargs is None: + open_kwargs = {} + + data.to_zarr(store, **save_kwargs, **self.version_kwargs) + with xr.open_dataset( + store, engine="zarr", **open_kwargs, **self.version_kwargs + ) as ds: + yield ds + + @pytest.mark.parametrize("write_empty", [True, False]) + def test_write_empty(self, write_empty: bool) -> None: + if not write_empty: + expected = ["0.1.0", "1.1.0"] + else: + expected = [ + "0.0.0", + "0.0.1", + "0.1.0", + "0.1.1", + "1.0.0", + "1.0.1", + "1.1.0", + "1.1.1", + ] + + ds = xr.Dataset( + data_vars={ + "test": ( + ("Z", "Y", "X"), + np.array([np.nan, np.nan, 1.0, np.nan]).reshape((1, 2, 2)), + ) + } + ) + + if has_dask: + ds["test"] = ds["test"].chunk((1, 1, 1)) + encoding = None + else: + encoding = {"test": {"chunks": (1, 1, 1)}} + + with self.temp_dir() as (d, store): + ds.to_zarr( + store, + mode="w", + encoding=encoding, + write_empty_chunks=write_empty, + ) + + with self.roundtrip_dir( + ds, + store, + {"mode": "a", "append_dim": "Z", "write_empty_chunks": write_empty}, + ) as a_ds: + expected_ds = xr.concat([ds, ds], dim="Z") + + assert_identical(a_ds, expected_ds) + + ls = listdir(os.path.join(store, "test")) + assert set(expected) == set([file for file in ls if file[0] != "."]) + + class ZarrBaseV3(ZarrBase): zarr_version = 3