diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 739062231f6..e9088960616 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -26,6 +26,7 @@ New Features
   different collections of coordinates prior to assign them to a Dataset or
   DataArray (:pull:`8102`) at once.
   By `Benoît Bovy <https://github.com/benbovy>`_.
+- Provide `preferred_chunks` for data read from netcdf files (:issue:`1440`, :pull:`7948`)
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
@@ -63,6 +64,9 @@ Bug fixes
   special case ``NaT`` handling in :py:meth:`~core.accessor_dt.DatetimeAccessor.isocalendar()`
   (:issue:`7928`, :pull:`8084`).
   By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
+- Fix bug where :py:class:`DataArray` instances on the right-hand side
+  of :py:meth:`DataArray.__setitem__` lose dimension names.
+  (:issue:`7030`, :pull:`8067`) By `Darsh Ranjan <https://github.com/dranjan>`_.
 
 Documentation
 ~~~~~~~~~~~~~
diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py
index 59f6c362491..19748084625 100644
--- a/xarray/backends/h5netcdf_.py
+++ b/xarray/backends/h5netcdf_.py
@@ -198,6 +198,8 @@ def open_store_variable(self, name, var):
             "fletcher32": var.fletcher32,
             "shuffle": var.shuffle,
         }
+        if var.chunks:
+            encoding["preferred_chunks"] = dict(zip(var.dimensions, var.chunks))
         # Convert h5py-style compression options to NetCDF4-Python
         # style, if possible
         if var.compression == "gzip":
diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py
index 7cbfa5b5e4e..f21f15bf795 100644
--- a/xarray/backends/netCDF4_.py
+++ b/xarray/backends/netCDF4_.py
@@ -426,6 +426,7 @@ def open_store_variable(self, name, var):
         else:
             encoding["contiguous"] = False
             encoding["chunksizes"] = tuple(chunking)
+            encoding["preferred_chunks"] = dict(zip(var.dimensions, chunking))
         # TODO: figure out how to round-trip "endian-ness" without raising
         # warnings from netCDF4
         # encoding['endian'] = var.endian()
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index fd3ff60cb6c..2e0a3a7089d 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -856,6 +856,7 @@ def __setitem__(self, key: Any, value: Any) -> None:
             obj = self[key]
             if isinstance(value, DataArray):
                 assert_coordinate_consistent(value, obj.coords.variables)
+                value = value.variable
             # DataArray key -> Variable key
             key = {
                 k: v.variable if isinstance(v, DataArray) else v
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index e2ae34f94f2..4799b619efd 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -13,7 +13,7 @@
 import tempfile
 import uuid
 import warnings
-from collections.abc import Iterator
+from collections.abc import Generator, Iterator
 from contextlib import ExitStack
 from io import BytesIO
 from os import listdir
@@ -1536,6 +1536,83 @@ def test_keep_chunksizes_if_no_original_shape(self) -> None:
             ds["x"].encoding["chunksizes"], actual["x"].encoding["chunksizes"]
         )
 
+    def test_preferred_chunks_is_present(self) -> None:
+        ds = Dataset({"x": [1, 2, 3]})
+        chunksizes = (2,)
+        ds.variables["x"].encoding = {"chunksizes": chunksizes}
+
+        with self.roundtrip(ds) as actual:
+            assert actual["x"].encoding["preferred_chunks"] == {"x": 2}
+
+    @requires_dask
+    def test_auto_chunking_is_based_on_disk_chunk_sizes(self) -> None:
+        x_size = y_size = 1000
+        y_chunksize = y_size
+        x_chunksize = 10
+
+        with dask.config.set({"array.chunk-size": "100KiB"}):
+            with self.chunked_roundtrip(
+                (1, y_size, x_size),
+                (1, y_chunksize, x_chunksize),
+                open_kwargs={"chunks": "auto"},
+            ) as ds:
+                t_chunks, y_chunks, x_chunks = ds["image"].data.chunks
+                assert all(np.asanyarray(y_chunks) == y_chunksize)
+                # Check that the chunk size is a multiple of the file chunk size
+                assert all(np.asanyarray(x_chunks) % x_chunksize == 0)
+
+    @requires_dask
+    def test_base_chunking_uses_disk_chunk_sizes(self) -> None:
+        x_size = y_size = 1000
+        y_chunksize = y_size
+        x_chunksize = 10
+
+        with self.chunked_roundtrip(
+            (1, y_size, x_size),
+            (1, y_chunksize, x_chunksize),
+            open_kwargs={"chunks": {}},
+        ) as ds:
+            for chunksizes, expected in zip(
+                ds["image"].data.chunks, (1, y_chunksize, x_chunksize)
+            ):
+                assert all(np.asanyarray(chunksizes) == expected)
+
+    @contextlib.contextmanager
+    def chunked_roundtrip(
+        self,
+        array_shape: tuple[int, int, int],
+        chunk_sizes: tuple[int, int, int],
+        open_kwargs: dict[str, Any] | None = None,
+    ) -> Generator[Dataset, None, None]:
+        t_size, y_size, x_size = array_shape
+        t_chunksize, y_chunksize, x_chunksize = chunk_sizes
+
+        image = xr.DataArray(
+            np.arange(t_size * x_size * y_size, dtype=np.int16).reshape(
+                (t_size, y_size, x_size)
+            ),
+            dims=["t", "y", "x"],
+        )
+        image.encoding = {"chunksizes": (t_chunksize, y_chunksize, x_chunksize)}
+        dataset = xr.Dataset(dict(image=image))
+
+        with self.roundtrip(dataset, open_kwargs=open_kwargs) as ds:
+            yield ds
+
+    def test_preferred_chunks_are_disk_chunk_sizes(self) -> None:
+        x_size = y_size = 1000
+        y_chunksize = y_size
+        x_chunksize = 10
+
+        with self.chunked_roundtrip(
+            (1, y_size, x_size), (1, y_chunksize, x_chunksize)
+        ) as ds:
+            assert ds["image"].encoding["preferred_chunks"] == {
+                "t": 1,
+                "y": y_chunksize,
+                "x": x_chunksize,
+            }
+
     def test_encoding_chunksizes_unlimited(self) -> None:
         # regression test for GH1225
         ds = Dataset({"x": [1, 2, 3], "y": ("x", [2, 3, 4])})
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index 1f4d259d320..e8a4259e500 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -841,6 +841,27 @@ def get_data():
         )
         da[dict(x=ind)] = value  # should not raise
 
+    def test_setitem_vectorized(self) -> None:
+        # Regression test for GH:7030
+        # Positional indexing
+        v = xr.DataArray(np.r_[:120].reshape(2, 3, 4, 5), dims=["a", "b", "c", "d"])
+        b = xr.DataArray([[0, 0], [1, 0]], dims=["u", "v"])
+        c = xr.DataArray([[0, 1], [2, 3]], dims=["u", "v"])
+        w = xr.DataArray([-1, -2], dims=["u"])
+        index = dict(b=b, c=c)
+        v[index] = w
+        assert (v[index] == w).all()
+
+        # Indexing with coordinates
+        v = xr.DataArray(np.r_[:120].reshape(2, 3, 4, 5), dims=["a", "b", "c", "d"])
+        v.coords["b"] = [2, 4, 6]
+        b = xr.DataArray([[2, 2], [4, 2]], dims=["u", "v"])
+        c = xr.DataArray([[0, 1], [2, 3]], dims=["u", "v"])
+        w = xr.DataArray([-1, -2], dims=["u"])
+        index = dict(b=b, c=c)
+        v.loc[index] = w
+        assert (v.loc[index] == w).all()
+
     def test_contains(self) -> None:
         data_array = DataArray([1, 2])
         assert 1 in data_array
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index 882285ac8ec..89825ac0996 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -4202,6 +4202,29 @@ def test_setitem_align_new_indexes(self) -> None:
         )
         assert_identical(ds, expected)
 
+    def test_setitem_vectorized(self) -> None:
+        # Regression test for GH:7030
+        # Positional indexing
+        da = xr.DataArray(np.r_[:120].reshape(2, 3, 4, 5), dims=["a", "b", "c", "d"])
+        ds = xr.Dataset({"da": da})
+        b = xr.DataArray([[0, 0], [1, 0]], dims=["u", "v"])
+        c = xr.DataArray([[0, 1], [2, 3]], dims=["u", "v"])
+        w = xr.DataArray([-1, -2], dims=["u"])
+        index = dict(b=b, c=c)
+        ds[index] = xr.Dataset({"da": w})
+        assert (ds[index]["da"] == w).all()
+
+        # Indexing with coordinates
+        da = xr.DataArray(np.r_[:120].reshape(2, 3, 4, 5), dims=["a", "b", "c", "d"])
+        ds = xr.Dataset({"da": da})
+        ds.coords["b"] = [2, 4, 6]
+        b = xr.DataArray([[2, 2], [4, 2]], dims=["u", "v"])
+        c = xr.DataArray([[0, 1], [2, 3]], dims=["u", "v"])
+        w = xr.DataArray([-1, -2], dims=["u"])
+        index = dict(b=b, c=c)
+        ds.loc[index] = xr.Dataset({"da": w}, coords={"b": ds.coords["b"]})
+        assert (ds.loc[index]["da"] == w).all()
+
     @pytest.mark.parametrize("dtype", [str, bytes])
     def test_setitem_str_dtype(self, dtype) -> None:
        ds = xr.Dataset(coords={"x": np.array(["x", "y"], dtype=dtype)})
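
Note on the chunking changes: once `preferred_chunks` is recorded in a variable's encoding, `open_dataset(..., chunks={})` adopts the on-disk chunk sizes directly, and `chunks="auto"` picks dask chunks that are multiples of them, which is exactly what the new backend tests above assert. A minimal end-to-end sketch of that behavior, assuming netCDF4 and dask are installed (the file name and variable name here are illustrative, not part of the change):

    import numpy as np
    import xarray as xr

    # Write a variable chunked on disk as (1000, 10).
    image = xr.DataArray(np.zeros((1000, 1000), dtype=np.int16), dims=["y", "x"])
    image.encoding = {"chunksizes": (1000, 10)}
    xr.Dataset({"image": image}).to_netcdf("chunked.nc")

    # chunks={} now defers to the chunking stored in the file.
    ds = xr.open_dataset("chunked.nc", chunks={})
    print(ds["image"].encoding["preferred_chunks"])  # {'y': 1000, 'x': 10}
    print(ds["image"].data.chunksize)                # (1000, 10)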
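Note on the `__setitem__` fix: converting the right-hand side to a `Variable` (`value = value.variable`) keeps its dimension names through vectorized assignment; per the whats-new entry, they were previously lost (GH 7030). A reduced form of the regression test above:

    import numpy as np
    import xarray as xr

    v = xr.DataArray(np.arange(120).reshape(2, 3, 4, 5), dims=["a", "b", "c", "d"])
    b = xr.DataArray([[0, 0], [1, 0]], dims=["u", "v"])
    c = xr.DataArray([[0, 1], [2, 3]], dims=["u", "v"])
    w = xr.DataArray([-1, -2], dims=["u"])

    # w's "u" dimension must line up with the "u"/"v" dims introduced by the
    # vectorized indexers b and c; the fix preserves those names on the RHS.
    v[dict(b=b, c=c)] = w
    assert (v[dict(b=b, c=c)] == w).all()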