diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index 4edf7b3c570..1aeb393f3af 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -876,17 +876,20 @@ and then calling ``to_zarr`` with ``compute=False`` to write only metadata ds.to_zarr(path, compute=False) Now, a Zarr store with the correct variable shapes and attributes exists that -can be filled out by subsequent calls to ``to_zarr``. The ``region`` provides a -mapping from dimension names to Python ``slice`` objects indicating where the -data should be written (in index space, not coordinate space), e.g., +can be filled out by subsequent calls to ``to_zarr``. ``region`` can be +specified as ``"auto"``, which opens the existing store and determines the +correct alignment of the new data with the existing coordinates, or as an +explicit mapping from dimension names to Python ``slice`` objects indicating +where the data should be written (in index space, not label space), e.g., .. ipython:: python # For convenience, we'll slice a single dataset, but in the real use-case # we would create them separately possibly even from separate processes. ds = xr.Dataset({"foo": ("x", np.arange(30))}) - ds.isel(x=slice(0, 10)).to_zarr(path, region={"x": slice(0, 10)}) - ds.isel(x=slice(10, 20)).to_zarr(path, region={"x": slice(10, 20)}) + # Any of the following region specifications are valid + ds.isel(x=slice(0, 10)).to_zarr(path, region="auto") + ds.isel(x=slice(10, 20)).to_zarr(path, region={"x": "auto"}) ds.isel(x=slice(20, 30)).to_zarr(path, region={"x": slice(20, 30)}) Concurrent writes with ``region`` are safe as long as they modify distinct diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c173504ebfd..5d0c30a1c2f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -26,6 +26,10 @@ New Features By `Deepak Cherian `_. (:issue:`7764`, :pull:`8373`). - Add ``DataArray.dt.total_seconds()`` method to match the Pandas API. (:pull:`8435`). By `Ben Mares `_. +- Allow passing ``region="auto"`` in :py:meth:`Dataset.to_zarr` to automatically infer the + region to write in the original store. Also implement automatic transpose when dimension + order does not match the original store. (:issue:`7702`, :issue:`8421`, :pull:`8434`). + By `Sam Levang `_. Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 84817745b0a..3e6d00a8059 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -27,6 +27,7 @@ _normalize_path, ) from xarray.backends.locks import _get_scheduler +from xarray.backends.zarr import open_zarr from xarray.core import indexing from xarray.core.combine import ( _infer_concat_order_from_positions, @@ -1443,10 +1444,63 @@ def save_mfdataset( ) -def _validate_region(ds, region): +def _auto_detect_region(ds_new, ds_orig, dim): + # Create a mapping array of coordinates to indices on the original array + coord = ds_orig[dim] + da_map = DataArray(np.arange(coord.size), coords={dim: coord}) + + try: + da_idxs = da_map.sel({dim: ds_new[dim]}) + except KeyError as e: + if "not all values found" in str(e): + raise KeyError( + f"Not all values of coordinate '{dim}' in the new array were" + " found in the original store. Writing to a zarr region slice" + " requires that no dimensions or metadata are changed by the write." + ) + else: + raise e + + if (da_idxs.diff(dim) != 1).any(): + raise ValueError( + f"The auto-detected region of coordinate '{dim}' for writing new data" + " to the original store had non-contiguous indices. Writing to a zarr" + " region slice requires that the new data constitute a contiguous subset" + " of the original store." + ) + + dim_slice = slice(da_idxs.values[0], da_idxs.values[-1] + 1) + + return dim_slice + + +def _auto_detect_regions(ds, region, open_kwargs): + ds_original = open_zarr(**open_kwargs) + for key, val in region.items(): + if val == "auto": + region[key] = _auto_detect_region(ds, ds_original, key) + return region + + +def _validate_and_autodetect_region( + ds, region, mode, open_kwargs +) -> tuple[dict[str, slice], bool]: + if region == "auto": + region = {dim: "auto" for dim in ds.dims} + if not isinstance(region, dict): raise TypeError(f"``region`` must be a dict, got {type(region)}") + if any(v == "auto" for v in region.values()): + region_was_autodetected = True + if mode != "r+": + raise ValueError( + f"``mode`` must be 'r+' when using ``region='auto'``, got {mode}" + ) + region = _auto_detect_regions(ds, region, open_kwargs) + else: + region_was_autodetected = False + for k, v in region.items(): if k not in ds.dims: raise ValueError( @@ -1478,6 +1532,8 @@ def _validate_region(ds, region): f".drop_vars({non_matching_vars!r})" ) + return region, region_was_autodetected + def _validate_datatypes_for_zarr_append(zstore, dataset): """If variable exists in the store, confirm dtype of the data to append is compatible with @@ -1529,7 +1585,7 @@ def to_zarr( compute: Literal[True] = True, consolidated: bool | None = None, append_dim: Hashable | None = None, - region: Mapping[str, slice] | None = None, + region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, @@ -1553,7 +1609,7 @@ def to_zarr( compute: Literal[False], consolidated: bool | None = None, append_dim: Hashable | None = None, - region: Mapping[str, slice] | None = None, + region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, @@ -1575,7 +1631,7 @@ def to_zarr( compute: bool = True, consolidated: bool | None = None, append_dim: Hashable | None = None, - region: Mapping[str, slice] | None = None, + region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, @@ -1640,7 +1696,20 @@ def to_zarr( _validate_dataset_names(dataset) if region is not None: - _validate_region(dataset, region) + open_kwargs = dict( + store=store, + synchronizer=synchronizer, + group=group, + consolidated=consolidated, + storage_options=storage_options, + zarr_version=zarr_version, + ) + region, region_was_autodetected = _validate_and_autodetect_region( + dataset, region, mode, open_kwargs + ) + # drop indices to avoid potential race condition with auto region + if region_was_autodetected: + dataset = dataset.drop_vars(dataset.indexes) if append_dim is not None and append_dim in region: raise ValueError( f"cannot list the same dimension in both ``append_dim`` and " diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 2b41fa5224e..6632e40cf6f 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -320,14 +320,19 @@ def encode_zarr_variable(var, needs_copy=True, name=None): return var -def _validate_existing_dims(var_name, new_var, existing_var, region, append_dim): +def _validate_and_transpose_existing_dims( + var_name, new_var, existing_var, region, append_dim +): if new_var.dims != existing_var.dims: - raise ValueError( - f"variable {var_name!r} already exists with different " - f"dimension names {existing_var.dims} != " - f"{new_var.dims}, but changing variable " - f"dimensions is not supported by to_zarr()." - ) + if set(existing_var.dims) == set(new_var.dims): + new_var = new_var.transpose(*existing_var.dims) + else: + raise ValueError( + f"variable {var_name!r} already exists with different " + f"dimension names {existing_var.dims} != " + f"{new_var.dims}, but changing variable " + f"dimensions is not supported by to_zarr()." + ) existing_sizes = {} for dim, size in existing_var.sizes.items(): @@ -344,9 +349,14 @@ def _validate_existing_dims(var_name, new_var, existing_var, region, append_dim) f"variable {var_name!r} already exists with different " f"dimension sizes: {existing_sizes} != {new_sizes}. " f"to_zarr() only supports changing dimension sizes when " - f"explicitly appending, but append_dim={append_dim!r}." + f"explicitly appending, but append_dim={append_dim!r}. " + f"If you are attempting to write to a subset of the " + f"existing store without changing dimension sizes, " + f"consider using the region argument in to_zarr()." ) + return new_var + def _put_attrs(zarr_obj, attrs): """Raise a more informative error message for invalid attrs.""" @@ -616,7 +626,7 @@ def store( for var_name in existing_variable_names: new_var = variables_encoded[var_name] existing_var = existing_vars[var_name] - _validate_existing_dims( + new_var = _validate_and_transpose_existing_dims( var_name, new_var, existing_var, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index c7f92b87d63..2e0bb7d1354 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2305,7 +2305,7 @@ def to_zarr( compute: Literal[True] = True, consolidated: bool | None = None, append_dim: Hashable | None = None, - region: Mapping[str, slice] | None = None, + region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, @@ -2328,7 +2328,7 @@ def to_zarr( compute: Literal[False], consolidated: bool | None = None, append_dim: Hashable | None = None, - region: Mapping[str, slice] | None = None, + region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, @@ -2349,7 +2349,7 @@ def to_zarr( compute: bool = True, consolidated: bool | None = None, append_dim: Hashable | None = None, - region: Mapping[str, slice] | None = None, + region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, @@ -2411,7 +2411,7 @@ def to_zarr( append_dim : hashable, optional If set, the dimension along which the data will be appended. All other dimensions on overridden variables must remain the same size. - region : dict, optional + region : dict or "auto", optional Optional mapping from dimension names to integer slices along dataset dimensions to indicate the region of existing zarr array(s) in which to write this dataset's data. For example, @@ -2419,6 +2419,12 @@ def to_zarr( that values should be written to the region ``0:1000`` along ``x`` and ``10000:11000`` along ``y``. + Can also specify ``"auto"``, in which case the existing store will be + opened and the region inferred by matching the new data's coordinates. + ``"auto"`` can be used as a single string, which will automatically infer + the region for all dimensions, or as dictionary values for specific + dimensions mixed together with explicit slices for other dimensions. + Two restrictions apply to the use of ``region``: - If ``region`` is set, _all_ variables in a dataset must have at diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a0823e9ec96..1c8a24770d7 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5210,3 +5210,198 @@ def test_raise_writing_to_nczarr(self, mode) -> None: def test_pickle_open_mfdataset_dataset(): ds = open_example_mfdataset(["bears.nc"]) assert_identical(ds, pickle.loads(pickle.dumps(ds))) + + +@requires_zarr +class TestZarrRegionAuto: + def test_zarr_region_auto_all(self, tmp_path): + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) + data = np.ones((5, 10)) + ds = xr.Dataset( + { + "test": xr.DataArray( + data, + dims=("x", "y"), + coords={"x": x, "y": y}, + ) + } + ) + ds.to_zarr(tmp_path / "test.zarr") + + ds_region = 1 + ds.isel(x=slice(2, 4), y=slice(6, 8)) + ds_region.to_zarr(tmp_path / "test.zarr", region="auto") + + ds_updated = xr.open_zarr(tmp_path / "test.zarr") + + expected = ds.copy() + expected["test"][2:4, 6:8] += 1 + assert_identical(ds_updated, expected) + + def test_zarr_region_auto_mixed(self, tmp_path): + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) + data = np.ones((5, 10)) + ds = xr.Dataset( + { + "test": xr.DataArray( + data, + dims=("x", "y"), + coords={"x": x, "y": y}, + ) + } + ) + ds.to_zarr(tmp_path / "test.zarr") + + ds_region = 1 + ds.isel(x=slice(2, 4), y=slice(6, 8)) + ds_region.to_zarr( + tmp_path / "test.zarr", region={"x": "auto", "y": slice(6, 8)} + ) + + ds_updated = xr.open_zarr(tmp_path / "test.zarr") + + expected = ds.copy() + expected["test"][2:4, 6:8] += 1 + assert_identical(ds_updated, expected) + + def test_zarr_region_auto_noncontiguous(self, tmp_path): + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) + data = np.ones((5, 10)) + ds = xr.Dataset( + { + "test": xr.DataArray( + data, + dims=("x", "y"), + coords={"x": x, "y": y}, + ) + } + ) + ds.to_zarr(tmp_path / "test.zarr") + + ds_region = 1 + ds.isel(x=[0, 2, 3], y=[5, 6]) + with pytest.raises(ValueError): + ds_region.to_zarr(tmp_path / "test.zarr", region={"x": "auto", "y": "auto"}) + + def test_zarr_region_auto_new_coord_vals(self, tmp_path): + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) + data = np.ones((5, 10)) + ds = xr.Dataset( + { + "test": xr.DataArray( + data, + dims=("x", "y"), + coords={"x": x, "y": y}, + ) + } + ) + ds.to_zarr(tmp_path / "test.zarr") + + x = np.arange(5, 55, 10) + y = np.arange(0, 20, 2) + data = np.ones((5, 10)) + ds = xr.Dataset( + { + "test": xr.DataArray( + data, + dims=("x", "y"), + coords={"x": x, "y": y}, + ) + } + ) + + ds_region = 1 + ds.isel(x=slice(2, 4), y=slice(6, 8)) + with pytest.raises(KeyError): + ds_region.to_zarr(tmp_path / "test.zarr", region={"x": "auto", "y": "auto"}) + + def test_zarr_region_index_write(self, tmp_path): + from xarray.backends.zarr import ZarrStore + + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) + data = np.ones((5, 10)) + ds = xr.Dataset( + { + "test": xr.DataArray( + data, + dims=("x", "y"), + coords={"x": x, "y": y}, + ) + } + ) + + ds_region = 1 + ds.isel(x=slice(2, 4), y=slice(6, 8)) + + ds.to_zarr(tmp_path / "test.zarr") + + with patch.object( + ZarrStore, + "set_variables", + side_effect=ZarrStore.set_variables, + autospec=True, + ) as mock: + ds_region.to_zarr(tmp_path / "test.zarr", region="auto", mode="r+") + + # should write the data vars but never the index vars with auto mode + for call in mock.call_args_list: + written_variables = call.args[1].keys() + assert "test" in written_variables + assert "x" not in written_variables + assert "y" not in written_variables + + def test_zarr_region_append(self, tmp_path): + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) + data = np.ones((5, 10)) + ds = xr.Dataset( + { + "test": xr.DataArray( + data, + dims=("x", "y"), + coords={"x": x, "y": y}, + ) + } + ) + ds.to_zarr(tmp_path / "test.zarr") + + x_new = np.arange(40, 70, 10) + data_new = np.ones((3, 10)) + ds_new = xr.Dataset( + { + "test": xr.DataArray( + data_new, + dims=("x", "y"), + coords={"x": x_new, "y": y}, + ) + } + ) + + # Don't allow auto region detection in append mode due to complexities in + # implementing the overlap logic and lack of safety with parallel writes + with pytest.raises(ValueError): + ds_new.to_zarr( + tmp_path / "test.zarr", mode="a", append_dim="x", region="auto" + ) + + +@requires_zarr +def test_zarr_region_transpose(tmp_path): + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) + data = np.ones((5, 10)) + ds = xr.Dataset( + { + "test": xr.DataArray( + data, + dims=("x", "y"), + coords={"x": x, "y": y}, + ) + } + ) + ds.to_zarr(tmp_path / "test.zarr") + + ds_region = 1 + ds.isel(x=[0], y=[0]).transpose() + ds_region.to_zarr( + tmp_path / "test.zarr", region={"x": slice(0, 1), "y": slice(0, 1)} + )