Skip to content

Commit

Permalink
Zarr : Allow setting write_empty_chunks (#8016)
Browse files Browse the repository at this point in the history
* Added parameter write_empty_chunks to Dataset.to_zarr

* Missed a couple things...

- Incorrectly set default values
- Need to set write_empty_chunks for new variables in group

* Update xarray/tests/test_backends.py

Co-authored-by: Deepak Cherian <[email protected]>

* Alternate chunking method if dask not present in test run

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Improvements

- Default value of None which will fall back to encoding specified behavior or Zarr defaults
- If param (!= None) and encoding setting disagree, a ValueError is raised
- Test case checks for compatible zarr version
- Documented minimum required Zarr version

* Bumped minimum zarr dependency version 2.10 -> 2.12

* Simplified test case

Zarr minimum dependency bump should make the version check no longer necessary

* Fixed table in docs

* Update xarray/backends/zarr.py

Co-authored-by: Illviljan <[email protected]>

* Update xarray/backends/zarr.py

Co-authored-by: Illviljan <[email protected]>

* Update xarray/tests/test_backends.py

Co-authored-by: Illviljan <[email protected]>

* Update xarray/tests/test_backends.py

Co-authored-by: Illviljan <[email protected]>

* Update xarray/backends/zarr.py

Co-authored-by: Illviljan <[email protected]>

* Fix typing

* Update xarray/core/dataset.py

Co-authored-by: Joe Hamman <[email protected]>

* Avoid setting of protected attribute in zarr Array object

---------

Co-authored-by: rileykk <[email protected]>
Co-authored-by: Deepak Cherian <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Illviljan <[email protected]>
Co-authored-by: Joe Hamman <[email protected]>
Co-authored-by: Joe Hamman <[email protected]>
  • Loading branch information
7 people authored Aug 4, 2023
1 parent fb6bbbe commit 0343761
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 2 deletions.
5 changes: 5 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ New Features
with a dimension. Examples include output from finite volume models like FVCOM.
(:issue:`2233`, :pull:`7989`)
By `Deepak Cherian <https://github.com/dcherian>`_ and `Benoit Bovy <https://github.com/benbovy>`_.
- When outputting :py:class:`Dataset` objects as Zarr via :py:meth:`Dataset.to_zarr`,
user can now specify that chunks that will contain no valid data will not be written.
Originally, this could be done by specifying ``"write_empty_chunks": True`` in the
``encoding`` parameter; however, this setting would not carry over when appending new
data to an existing dataset. (:issue:`8009`) Requires ``zarr>=2.11``.


Breaking changes
Expand Down
4 changes: 4 additions & 0 deletions xarray/backends/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -1520,6 +1520,7 @@ def to_zarr(
safe_chunks: bool = True,
storage_options: dict[str, str] | None = None,
zarr_version: int | None = None,
write_empty_chunks: bool | None = None,
chunkmanager_store_kwargs: dict[str, Any] | None = None,
) -> backends.ZarrStore:
...
Expand All @@ -1543,6 +1544,7 @@ def to_zarr(
safe_chunks: bool = True,
storage_options: dict[str, str] | None = None,
zarr_version: int | None = None,
write_empty_chunks: bool | None = None,
chunkmanager_store_kwargs: dict[str, Any] | None = None,
) -> Delayed:
...
Expand All @@ -1563,6 +1565,7 @@ def to_zarr(
safe_chunks: bool = True,
storage_options: dict[str, str] | None = None,
zarr_version: int | None = None,
write_empty_chunks: bool | None = None,
chunkmanager_store_kwargs: dict[str, Any] | None = None,
) -> backends.ZarrStore | Delayed:
"""This function creates an appropriate datastore for writing a dataset to
Expand Down Expand Up @@ -1656,6 +1659,7 @@ def to_zarr(
safe_chunks=safe_chunks,
stacklevel=4, # for Dataset.to_zarr()
zarr_version=zarr_version,
write_empty=write_empty_chunks,
)

if mode in ["a", "r+"]:
Expand Down
35 changes: 33 additions & 2 deletions xarray/backends/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,7 @@ class ZarrStore(AbstractWritableDataStore):
"_synchronizer",
"_write_region",
"_safe_chunks",
"_write_empty",
)

@classmethod
Expand All @@ -386,6 +387,7 @@ def open_group(
safe_chunks=True,
stacklevel=2,
zarr_version=None,
write_empty: bool | None = None,
):
import zarr

Expand Down Expand Up @@ -457,6 +459,7 @@ def open_group(
append_dim,
write_region,
safe_chunks,
write_empty,
)

def __init__(
Expand All @@ -467,6 +470,7 @@ def __init__(
append_dim=None,
write_region=None,
safe_chunks=True,
write_empty: bool | None = None,
):
self.zarr_group = zarr_group
self._read_only = self.zarr_group.read_only
Expand All @@ -477,6 +481,7 @@ def __init__(
self._append_dim = append_dim
self._write_region = write_region
self._safe_chunks = safe_chunks
self._write_empty = write_empty

@property
def ds(self):
Expand Down Expand Up @@ -648,6 +653,8 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
dimensions.
"""

import zarr

for vn, v in variables.items():
name = _encode_variable_name(vn)
check = vn in check_encoding_set
Expand All @@ -665,7 +672,14 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
# TODO: if mode="a", consider overriding the existing variable
# metadata. This would need some case work properly with region
# and append_dim.
zarr_array = self.zarr_group[name]
if self._write_empty is not None:
zarr_array = zarr.open(
store=self.zarr_group.store,
path=f"{self.zarr_group.name}/{name}",
write_empty_chunks=self._write_empty,
)
else:
zarr_array = self.zarr_group[name]
else:
# new variable
encoding = extract_zarr_variable_encoding(
Expand All @@ -679,8 +693,25 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No

if coding.strings.check_vlen_dtype(dtype) == str:
dtype = str

if self._write_empty is not None:
if (
"write_empty_chunks" in encoding
and encoding["write_empty_chunks"] != self._write_empty
):
raise ValueError(
'Differing "write_empty_chunks" values in encoding and parameters'
f'Got {encoding["write_empty_chunks"] = } and {self._write_empty = }'
)
else:
encoding["write_empty_chunks"] = self._write_empty

zarr_array = self.zarr_group.create(
name, shape=shape, dtype=dtype, fill_value=fill_value, **encoding
name,
shape=shape,
dtype=dtype,
fill_value=fill_value,
**encoding,
)
zarr_array = _put_attrs(zarr_array, encoded_attrs)

Expand Down
15 changes: 15 additions & 0 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2280,6 +2280,7 @@ def to_zarr(
safe_chunks: bool = True,
storage_options: dict[str, str] | None = None,
zarr_version: int | None = None,
write_empty_chunks: bool | None = None,
chunkmanager_store_kwargs: dict[str, Any] | None = None,
) -> ZarrStore:
...
Expand All @@ -2302,6 +2303,7 @@ def to_zarr(
safe_chunks: bool = True,
storage_options: dict[str, str] | None = None,
zarr_version: int | None = None,
write_empty_chunks: bool | None = None,
chunkmanager_store_kwargs: dict[str, Any] | None = None,
) -> Delayed:
...
Expand All @@ -2321,6 +2323,7 @@ def to_zarr(
safe_chunks: bool = True,
storage_options: dict[str, str] | None = None,
zarr_version: int | None = None,
write_empty_chunks: bool | None = None,
chunkmanager_store_kwargs: dict[str, Any] | None = None,
) -> ZarrStore | Delayed:
"""Write dataset contents to a zarr group.
Expand Down Expand Up @@ -2410,6 +2413,17 @@ def to_zarr(
The desired zarr spec version to target (currently 2 or 3). The
default of None will attempt to determine the zarr version from
``store`` when possible, otherwise defaulting to 2.
write_empty_chunks : bool or None, optional
If True, all chunks will be stored regardless of their
contents. If False, each chunk is compared to the array's fill value
prior to storing. If a chunk is uniformly equal to the fill value, then
that chunk is not be stored, and the store entry for that chunk's key
is deleted. This setting enables sparser storage, as only chunks with
non-fill-value data are stored, at the expense of overhead associated
with checking the data of each chunk. If None (default) fall back to
specification(s) in ``encoding`` or Zarr defaults. A ``ValueError``
will be raised if the value of this (if not None) differs with
``encoding``.
chunkmanager_store_kwargs : dict, optional
Additional keyword arguments passed on to the `ChunkManager.store` method used to store
chunked arrays. For example for a dask array additional kwargs will be passed eventually to
Expand Down Expand Up @@ -2459,6 +2473,7 @@ def to_zarr(
region=region,
safe_chunks=safe_chunks,
zarr_version=zarr_version,
write_empty_chunks=write_empty_chunks,
chunkmanager_store_kwargs=chunkmanager_store_kwargs,
)

Expand Down
81 changes: 81 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from collections.abc import Iterator
from contextlib import ExitStack
from io import BytesIO
from os import listdir
from pathlib import Path
from typing import TYPE_CHECKING, Any, Final, cast

Expand Down Expand Up @@ -2665,6 +2666,86 @@ def create_store(self):
yield group


@requires_zarr
class TestZarrWriteEmpty(TestZarrDirectoryStore):
@contextlib.contextmanager
def temp_dir(self) -> Iterator[tuple[str, str]]:
with tempfile.TemporaryDirectory() as d:
store = os.path.join(d, "test.zarr")
yield d, store

@contextlib.contextmanager
def roundtrip_dir(
self,
data,
store,
save_kwargs=None,
open_kwargs=None,
allow_cleanup_failure=False,
) -> Iterator[Dataset]:
if save_kwargs is None:
save_kwargs = {}
if open_kwargs is None:
open_kwargs = {}

data.to_zarr(store, **save_kwargs, **self.version_kwargs)
with xr.open_dataset(
store, engine="zarr", **open_kwargs, **self.version_kwargs
) as ds:
yield ds

@pytest.mark.parametrize("write_empty", [True, False])
def test_write_empty(self, write_empty: bool) -> None:
if not write_empty:
expected = ["0.1.0", "1.1.0"]
else:
expected = [
"0.0.0",
"0.0.1",
"0.1.0",
"0.1.1",
"1.0.0",
"1.0.1",
"1.1.0",
"1.1.1",
]

ds = xr.Dataset(
data_vars={
"test": (
("Z", "Y", "X"),
np.array([np.nan, np.nan, 1.0, np.nan]).reshape((1, 2, 2)),
)
}
)

if has_dask:
ds["test"] = ds["test"].chunk((1, 1, 1))
encoding = None
else:
encoding = {"test": {"chunks": (1, 1, 1)}}

with self.temp_dir() as (d, store):
ds.to_zarr(
store,
mode="w",
encoding=encoding,
write_empty_chunks=write_empty,
)

with self.roundtrip_dir(
ds,
store,
{"mode": "a", "append_dim": "Z", "write_empty_chunks": write_empty},
) as a_ds:
expected_ds = xr.concat([ds, ds], dim="Z")

assert_identical(a_ds, expected_ds)

ls = listdir(os.path.join(store, "test"))
assert set(expected) == set([file for file in ls if file[0] != "."])


class ZarrBaseV3(ZarrBase):
zarr_version = 3

Expand Down

0 comments on commit 0343761

Please sign in to comment.