From 2fe78758dc47b2de329df6807a6c7dd2c293946c Mon Sep 17 00:00:00 2001 From: Mick Date: Thu, 29 Sep 2022 18:36:51 +0200 Subject: [PATCH] Fix deepcopy of Variables and DataArrays (#7089) * fix deepcopy IndexVariable and dataset encodings * add fix to whats-new * fix also attrs silently changing * fix typpo * fix breaking attrs change * remove xfail from copyattrs test * [skip-ci] fix merge conflict in whats-new --- doc/whats-new.rst | 48 ++++++++++++++++----------------- xarray/core/dataarray.py | 8 +++--- xarray/core/dataset.py | 9 ++++--- xarray/core/indexes.py | 2 +- xarray/core/variable.py | 49 +++++++++++++++++++--------------- xarray/tests/test_dataarray.py | 1 - 6 files changed, 59 insertions(+), 58 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 993a1db05f9..8b380954c82 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -42,70 +42,68 @@ Deprecations Bug fixes ~~~~~~~~~ -- Allow reading netcdf files where the 'units' attribute is a number(:pull:`7085`) +- Allow reading netcdf files where the 'units' attribute is a number. (:pull:`7085`) By `Ghislain Picard `_. -- Allow decoding of 0 sized datetimes(:issue:`1329`, :pull:`6882`) +- Allow decoding of 0 sized datetimes. (:issue:`1329`, :pull:`6882`) By `Deepak Cherian `_. -- Make sure DataArray.name is always a string when used as label for plotting. - (:issue:`6826`, :pull:`6832`) +- Make sure DataArray.name is always a string when used as label for plotting. (:issue:`6826`, :pull:`6832`) By `Jimmy Westling `_. -- :py:attr:`DataArray.nbytes` now uses the ``nbytes`` property of the underlying array if available. - (:pull:`6797`) +- :py:attr:`DataArray.nbytes` now uses the ``nbytes`` property of the underlying array if available. (:pull:`6797`) By `Max Jones `_. - Rely on the array backend for string formatting. (:pull:`6823`). By `Jimmy Westling `_. -- Fix incompatibility with numpy 1.20 (:issue:`6818`, :pull:`6821`) +- Fix incompatibility with numpy 1.20. (:issue:`6818`, :pull:`6821`) By `Michael Niklas `_. - Fix side effects on index coordinate metadata after aligning objects. (:issue:`6852`, :pull:`6857`) By `Benoît Bovy `_. -- Make FacetGrid.set_titles send kwargs correctly using `handle.udpate(kwargs)`. - (:issue:`6839`, :pull:`6843`) +- Make FacetGrid.set_titles send kwargs correctly using `handle.udpate(kwargs)`. (:issue:`6839`, :pull:`6843`) By `Oliver Lopez `_. -- Fix bug where index variables would be changed inplace (:issue:`6931`, :pull:`6938`) +- Fix bug where index variables would be changed inplace. (:issue:`6931`, :pull:`6938`) By `Michael Niklas `_. - Allow taking the mean over non-time dimensions of datasets containing - dask-backed cftime arrays (:issue:`5897`, :pull:`6950`). By `Spencer Clark - `_. -- Harmonize returned multi-indexed indexes when applying ``concat`` along new dimension (:issue:`6881`, :pull:`6889`) + dask-backed cftime arrays. (:issue:`5897`, :pull:`6950`) + By `Spencer Clark `_. +- Harmonize returned multi-indexed indexes when applying ``concat`` along new dimension. (:issue:`6881`, :pull:`6889`) By `Fabian Hofmann `_. - Fix step plots with ``hue`` arg. (:pull:`6944`) By `András Gunyhó `_. -- Avoid use of random numbers in `test_weighted.test_weighted_operations_nonequal_coords` (:issue:`6504`, :pull:`6961`). +- Avoid use of random numbers in `test_weighted.test_weighted_operations_nonequal_coords`. (:issue:`6504`, :pull:`6961`) By `Luke Conibear `_. 
- Fix multiple regression issues with :py:meth:`Dataset.set_index` and - :py:meth:`Dataset.reset_index` (:pull:`6992`) + :py:meth:`Dataset.reset_index`. (:pull:`6992`) By `Benoît Bovy `_. - Raise a ``UserWarning`` when renaming a coordinate or a dimension creates a non-indexed dimension coordinate, and suggest the user creating an index - either with ``swap_dims`` or ``set_index`` (:issue:`6607`, :pull:`6999`). By - `Benoît Bovy `_. -- Use ``keep_attrs=True`` in grouping and resampling operations by default (:issue:`7012`). + either with ``swap_dims`` or ``set_index``. (:issue:`6607`, :pull:`6999`) + By `Benoît Bovy `_. +- Use ``keep_attrs=True`` in grouping and resampling operations by default. (:issue:`7012`) This means :py:attr:`Dataset.attrs` and :py:attr:`DataArray.attrs` are now preserved by default. By `Deepak Cherian `_. -- ``Dataset.encoding['source']`` now exists when reading from a Path object (:issue:`5888`, :pull:`6974`) +- ``Dataset.encoding['source']`` now exists when reading from a Path object. (:issue:`5888`, :pull:`6974`) By `Thomas Coleman `_. - Better dtype consistency for ``rolling.mean()``. (:issue:`7062`, :pull:`7063`) By `Sam Levang `_. -- Allow writing NetCDF files including only dimensionless variables using the distributed or multiprocessing scheduler - (:issue:`7013`, :pull:`7040`). +- Allow writing NetCDF files including only dimensionless variables using the distributed or multiprocessing scheduler. (:issue:`7013`, :pull:`7040`) By `Francesco Nattino `_. -- Fix bug where subplot_kwargs were not working when plotting with figsize, size or aspect (:issue:`7078`, :pull:`7080`) +- Fix deepcopy of attrs and encoding of DataArrays and Variables. (:issue:`2835`, :pull:`7089`) + By `Michael Niklas `_. +- Fix bug where subplot_kwargs were not working when plotting with figsize, size or aspect. (:issue:`7078`, :pull:`7080`) By `Michael Niklas `_. Documentation ~~~~~~~~~~~~~ -- Update merge docstrings (:issue:`6935`, :pull:`7033`). +- Update merge docstrings. (:issue:`6935`, :pull:`7033`) By `Zach Moon `_. - Raise a more informative error when trying to open a non-existent zarr store. (:issue:`6484`, :pull:`7060`) By `Sam Levang `_. -- Added examples to docstrings for :py:meth:`DataArray.expand_dims`, :py:meth:`DataArray.drop_duplicates`, :py:meth:`DataArray.reset_coords`, :py:meth:`DataArray.equals`, :py:meth:`DataArray.identical`, :py:meth:`DataArray.broadcast_equals`, :py:meth:`DataArray.bfill`, :py:meth:`DataArray.ffill`, :py:meth:`DataArray.fillna`, :py:meth:`DataArray.dropna`, :py:meth:`DataArray.drop_isel`, :py:meth:`DataArray.drop_sel`, :py:meth:`DataArray.head`, :py:meth:`DataArray.tail`. (:issue:`5816`, :pull:`7088`). +- Added examples to docstrings for :py:meth:`DataArray.expand_dims`, :py:meth:`DataArray.drop_duplicates`, :py:meth:`DataArray.reset_coords`, :py:meth:`DataArray.equals`, :py:meth:`DataArray.identical`, :py:meth:`DataArray.broadcast_equals`, :py:meth:`DataArray.bfill`, :py:meth:`DataArray.ffill`, :py:meth:`DataArray.fillna`, :py:meth:`DataArray.dropna`, :py:meth:`DataArray.drop_isel`, :py:meth:`DataArray.drop_sel`, :py:meth:`DataArray.head`, :py:meth:`DataArray.tail`. (:issue:`5816`, :pull:`7088`) By `Patrick Naylor `_. - Add missing docstrings to various array properties. (:pull:`7090`) By `Tom Nicholas `_. Internal Changes ~~~~~~~~~~~~~~~~ -- Added test for DataArray attrs deepcopy recursion/nested attrs (:issue:`2835`). +- Added test for DataArray attrs deepcopy recursion/nested attrs. (:issue:`2835`, :pull:`7086`) By `Paul hockett `_. .. 
_whats-new.2022.06.0: diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index c4bde71e0a3..794984b7a1b 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -853,25 +853,23 @@ def loc(self) -> _LocIndexer: return _LocIndexer(self) @property - # Key type needs to be `Any` because of mypy#4167 def attrs(self) -> dict[Any, Any]: """Dictionary storing arbitrary metadata with this array.""" return self.variable.attrs @attrs.setter def attrs(self, value: Mapping[Any, Any]) -> None: - # Disable type checking to work around mypy bug - see mypy#4167 - self.variable.attrs = value # type: ignore[assignment] + self.variable.attrs = dict(value) @property - def encoding(self) -> dict[Hashable, Any]: + def encoding(self) -> dict[Any, Any]: """Dictionary of format-specific settings for how this array should be serialized.""" return self.variable.encoding @encoding.setter def encoding(self, value: Mapping[Any, Any]) -> None: - self.variable.encoding = value + self.variable.encoding = dict(value) @property def indexes(self) -> Indexes: diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 76b229481f3..03bead3f00a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -633,7 +633,7 @@ def variables(self) -> Frozen[Hashable, Variable]: return Frozen(self._variables) @property - def attrs(self) -> dict[Hashable, Any]: + def attrs(self) -> dict[Any, Any]: """Dictionary of global attributes on this dataset""" if self._attrs is None: self._attrs = {} @@ -644,7 +644,7 @@ def attrs(self, value: Mapping[Any, Any]) -> None: self._attrs = dict(value) @property - def encoding(self) -> dict[Hashable, Any]: + def encoding(self) -> dict[Any, Any]: """Dictionary of global encoding attributes on this dataset""" if self._encoding is None: self._encoding = {} @@ -1123,7 +1123,7 @@ def _overwrite_indexes( return replaced def copy( - self: T_Dataset, deep: bool = False, data: Mapping | None = None + self: T_Dataset, deep: bool = False, data: Mapping[Any, ArrayLike] | None = None ) -> T_Dataset: """Returns a copy of this dataset. 
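To make the intent of the typing changes above and of the `copy()` changes in the hunks that follow concrete, here is a minimal sketch of the behaviour the patch is aiming for. This is illustration only, not part of the patch; the attribute keys, encoding keys, and file names are invented:

```python
import copy

import xarray as xr

# With this fix applied, a deep copy no longer shares attrs/encoding
# dictionaries (including nested ones) with the original object (see #2835).
da = xr.DataArray([1, 2, 3], dims="x", attrs={"nested": {"units": "m"}})
da.encoding["source"] = "original.nc"

da2 = copy.deepcopy(da)
da2.attrs["nested"]["units"] = "km"  # mutate the copy only
da2.encoding["source"] = "copy.nc"

assert da.attrs["nested"]["units"] == "m"      # original attrs untouched
assert da.encoding["source"] == "original.nc"  # original encoding untouched

# Dataset.copy is expected to carry encoding over as an independent dict as well.
ds = da.to_dataset(name="a")
ds.encoding["unlimited_dims"] = {"x"}
ds2 = ds.copy(deep=True)
ds2.encoding["unlimited_dims"] = set()
assert ds.encoding["unlimited_dims"] == {"x"}
```

The key point is that `deep=True` now routes `attrs` and `encoding` through `copy.deepcopy`, while `deep=False` keeps shallow-copy semantics, mirroring `copy.copy`/`copy.deepcopy` on the containers themselves.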
@@ -1252,8 +1252,9 @@ def copy( variables[k] = v.copy(deep=deep, data=data.get(k)) attrs = copy.deepcopy(self._attrs) if deep else copy.copy(self._attrs) + encoding = copy.deepcopy(self._encoding) if deep else copy.copy(self._encoding) - return self._replace(variables, indexes=indexes, attrs=attrs) + return self._replace(variables, indexes=indexes, attrs=attrs, encoding=encoding) def as_numpy(self: T_Dataset) -> T_Dataset: """ diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index cc92e20d91a..afbae5f3baa 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -1107,7 +1107,7 @@ def dims(self) -> Mapping[Hashable, int]: return Frozen(self._dims) - def copy(self): + def copy(self) -> Indexes: return type(self)(dict(self._indexes), dict(self._variables)) def get_unique(self) -> list[T_PandasOrXarrayIndex]: diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 427ac4e9e36..0733d0d5236 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -894,7 +894,7 @@ def __setitem__(self, key, value): indexable[index_tuple] = value @property - def attrs(self) -> dict[Hashable, Any]: + def attrs(self) -> dict[Any, Any]: """Dictionary of local attributes on this variable.""" if self._attrs is None: self._attrs = {} @@ -905,7 +905,7 @@ def attrs(self, value: Mapping[Any, Any]) -> None: self._attrs = dict(value) @property - def encoding(self): + def encoding(self) -> dict[Any, Any]: """Dictionary of encodings on this variable.""" if self._encoding is None: self._encoding = {} @@ -918,7 +918,7 @@ def encoding(self, value): except ValueError: raise ValueError("encoding must be castable to a dictionary") - def copy(self, deep=True, data=None): + def copy(self, deep: bool = True, data: ArrayLike | None = None): """Returns a copy of this object. If `deep=True`, the data array is loaded into memory and copied onto @@ -929,7 +929,7 @@ def copy(self, deep=True, data=None): Parameters ---------- - deep : bool, optional + deep : bool, default: True Whether the data array is loaded into memory and copied onto the new object. Default is True. data : array_like, optional @@ -975,28 +975,29 @@ def copy(self, deep=True, data=None): pandas.DataFrame.copy """ if data is None: - data = self._data + ndata = self._data - if isinstance(data, indexing.MemoryCachedArray): + if isinstance(ndata, indexing.MemoryCachedArray): # don't share caching between copies - data = indexing.MemoryCachedArray(data.array) + ndata = indexing.MemoryCachedArray(ndata.array) if deep: - data = copy.deepcopy(data) + ndata = copy.deepcopy(ndata) else: - data = as_compatible_data(data) - if self.shape != data.shape: + ndata = as_compatible_data(data) + if self.shape != ndata.shape: raise ValueError( "Data shape {} must match shape of object {}".format( - data.shape, self.shape + ndata.shape, self.shape ) ) - # note: - # dims is already an immutable tuple - # attributes and encoding will be copied when the new Array is created - return self._replace(data=data) + attrs = copy.deepcopy(self._attrs) if deep else copy.copy(self._attrs) + encoding = copy.deepcopy(self._encoding) if deep else copy.copy(self._encoding) + + # note: dims is already an immutable tuple + return self._replace(data=ndata, attrs=attrs, encoding=encoding) def _replace( self: T_Variable, @@ -2877,7 +2878,7 @@ def concat( return cls(first_var.dims, data, attrs) - def copy(self, deep=True, data=None): + def copy(self, deep: bool = True, data: ArrayLike | None = None): """Returns a copy of this object. 
`deep` is ignored since data is stored in the form of @@ -2889,7 +2890,7 @@ def copy(self, deep=True, data=None): Parameters ---------- - deep : bool, optional + deep : bool, default: True Deep is ignored when data is given. Whether the data array is loaded into memory and copied onto the new object. Default is True. data : array_like, optional @@ -2902,16 +2903,20 @@ def copy(self, deep=True, data=None): data copied from original. """ if data is None: - data = self._data.copy(deep=deep) + ndata = self._data.copy(deep=deep) else: - data = as_compatible_data(data) - if self.shape != data.shape: + ndata = as_compatible_data(data) + if self.shape != ndata.shape: raise ValueError( "Data shape {} must match shape of object {}".format( - data.shape, self.shape + ndata.shape, self.shape ) ) - return self._replace(data=data) + + attrs = copy.deepcopy(self._attrs) if deep else copy.copy(self._attrs) + encoding = copy.deepcopy(self._encoding) if deep else copy.copy(self._encoding) + + return self._replace(data=ndata, attrs=attrs, encoding=encoding) def equals(self, other, equiv=None): # if equiv is specified, super up diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 8bb314c4f3c..2536ada1155 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -6458,7 +6458,6 @@ def test_delete_coords() -> None: assert set(a1.coords.keys()) == {"x"} -@pytest.mark.xfail def test_deepcopy_nested_attrs() -> None: """Check attrs deep copy, see :issue:`2835`""" da1 = xr.DataArray([[1, 2], [3, 4]], dims=("x", "y"), coords={"x": [10, 20]})
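As a companion to the re-enabled `test_deepcopy_nested_attrs`, here is a short sketch of how `Variable.copy` and `IndexVariable.copy` are expected to treat `attrs` and `encoding` after this change. It is assumed usage for illustration, not part of the patch, and the attribute/encoding values are made up:

```python
import xarray as xr

v = xr.Variable(
    "x", [1, 2, 3], attrs={"history": {"step": 1}}, encoding={"dtype": "int32"}
)

deep = v.copy(deep=True)
shallow = v.copy(deep=False)

# deep=True: attrs/encoding go through copy.deepcopy, so nested edits on the
# copy do not leak back into the original.
deep.attrs["history"]["step"] = 2
assert v.attrs["history"]["step"] == 1

# deep=False: only a shallow copy is made, so nested containers stay shared.
shallow.attrs["history"]["step"] = 3
assert v.attrs["history"]["step"] == 3

# IndexVariable.copy now follows the same rules, which is what allows the
# xfail marker on test_deepcopy_nested_attrs to be dropped.
iv = xr.IndexVariable("x", [10, 20], attrs={"nested": {"units": "m"}})
assert iv.copy(deep=True).attrs["nested"] is not iv.attrs["nested"]
```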