diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index c00fe1a9e67..aeb6b2217c3 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -1070,7 +1070,7 @@ def reset_coords(
             dataset[self.name] = self.variable
         return dataset
 
-    def __dask_tokenize__(self):
+    def __dask_tokenize__(self) -> object:
         from dask.base import normalize_token
 
         return normalize_token((type(self), self._variable, self._coords, self._name))
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 884e302b8be..e1fd9e025fb 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -694,7 +694,7 @@ def __init__(
             data_vars, coords
         )
 
-        self._attrs = dict(attrs) if attrs is not None else None
+        self._attrs = dict(attrs) if attrs else None
         self._close = None
         self._encoding = None
         self._variables = variables
@@ -739,7 +739,7 @@ def attrs(self) -> dict[Any, Any]:
 
     @attrs.setter
     def attrs(self, value: Mapping[Any, Any]) -> None:
-        self._attrs = dict(value)
+        self._attrs = dict(value) if value else None
 
     @property
     def encoding(self) -> dict[Any, Any]:
@@ -856,11 +856,11 @@ def load(self, **kwargs) -> Self:
 
         return self
 
-    def __dask_tokenize__(self):
+    def __dask_tokenize__(self) -> object:
         from dask.base import normalize_token
 
         return normalize_token(
-            (type(self), self._variables, self._coord_names, self._attrs)
+            (type(self), self._variables, self._coord_names, self._attrs or None)
         )
 
     def __dask_graph__(self):
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index cd0c022d702..315c46369bd 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -2592,11 +2592,13 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
         if not isinstance(self._data, PandasIndexingAdapter):
             self._data = PandasIndexingAdapter(self._data)
 
-    def __dask_tokenize__(self):
+    def __dask_tokenize__(self) -> object:
         from dask.base import normalize_token
 
         # Don't waste time converting pd.Index to np.ndarray
-        return normalize_token((type(self), self._dims, self._data.array, self._attrs))
+        return normalize_token(
+            (type(self), self._dims, self._data.array, self._attrs or None)
+        )
 
     def load(self):
         # data is already loaded into memory for IndexVariable
diff --git a/xarray/namedarray/core.py b/xarray/namedarray/core.py
index 29722690437..fd209bc273f 100644
--- a/xarray/namedarray/core.py
+++ b/xarray/namedarray/core.py
@@ -511,7 +511,7 @@ def attrs(self) -> dict[Any, Any]:
 
     @attrs.setter
     def attrs(self, value: Mapping[Any, Any]) -> None:
-        self._attrs = dict(value)
+        self._attrs = dict(value) if value else None
 
     def _check_shape(self, new_data: duckarray[Any, _DType_co]) -> None:
         if new_data.shape != self.shape:
@@ -570,13 +570,12 @@ def real(
             return real(self)
         return self._new(data=self._data.real)
 
-    def __dask_tokenize__(self) -> Hashable:
+    def __dask_tokenize__(self) -> object:
         # Use v.data, instead of v._data, in order to cope with the wrappers
         # around NetCDF and the like
         from dask.base import normalize_token
 
-        s, d, a, attrs = type(self), self._dims, self.data, self.attrs
-        return normalize_token((s, d, a, attrs))  # type: ignore[no-any-return]
+        return normalize_token((type(self), self._dims, self.data, self._attrs or None))
 
     def __dask_graph__(self) -> Graph | None:
         if is_duck_dask_array(self._data):
diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py
index 0326a6173cd..b82a80b546a 100644
--- a/xarray/namedarray/utils.py
+++ b/xarray/namedarray/utils.py
@@ -218,7 +218,7 @@ def __eq__(self, other: ReprObject | Any) -> bool:
     def __hash__(self) -> int:
         return hash((type(self), self._value))
 
-    def __dask_tokenize__(self) -> Hashable:
+    def __dask_tokenize__(self) -> object:
         from dask.base import normalize_token
 
-        return normalize_token((type(self), self._value))  # type: ignore[no-any-return]
+        return normalize_token((type(self), self._value))
diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py
index 07bf773cc88..517fc0c2d62 100644
--- a/xarray/tests/test_dask.py
+++ b/xarray/tests/test_dask.py
@@ -299,17 +299,6 @@ def test_persist(self):
         self.assertLazyAndAllClose(u + 1, v)
         self.assertLazyAndAllClose(u + 1, v2)
 
-    def test_tokenize_empty_attrs(self) -> None:
-        # Issue #6970
-        assert self.eager_var._attrs is None
-        expected = dask.base.tokenize(self.eager_var)
-        assert self.eager_var.attrs == self.eager_var._attrs == {}
-        assert (
-            expected
-            == dask.base.tokenize(self.eager_var)
-            == dask.base.tokenize(self.lazy_var.compute())
-        )
-
     @requires_pint
     def test_tokenize_duck_dask_array(self):
         import pint
@@ -1573,6 +1562,30 @@ def test_token_identical(obj, transform):
     )
 
 
+@pytest.mark.parametrize(
+    "obj",
+    [
+        make_ds(),  # Dataset
+        make_ds().variables["c2"],  # Variable
+        make_ds().variables["x"],  # IndexVariable
+    ],
+)
+def test_tokenize_empty_attrs(obj):
+    """Issues #6970 and #8788"""
+    obj.attrs = {}
+    assert obj._attrs is None
+    a = dask.base.tokenize(obj)
+
+    assert obj.attrs == {}
+    assert obj._attrs == {}  # attrs getter changed None to dict
+    b = dask.base.tokenize(obj)
+    assert a == b
+
+    obj2 = obj.copy()
+    c = dask.base.tokenize(obj2)
+    assert a == c
+
+
 def test_recursive_token():
     """Test that tokenization is invoked recursively, and doesn't just rely
     on the output of str()
diff --git a/xarray/tests/test_sparse.py b/xarray/tests/test_sparse.py
index 289149bdd6d..09c12818754 100644
--- a/xarray/tests/test_sparse.py
+++ b/xarray/tests/test_sparse.py
@@ -878,10 +878,6 @@ def test_dask_token():
     import dask
 
     s = sparse.COO.from_numpy(np.array([0, 0, 1, 2]))
-
-    # https://github.com/pydata/sparse/issues/300
-    s.__dask_tokenize__ = lambda: dask.base.normalize_token(s.__dict__)
-
     a = DataArray(s)
     t1 = dask.base.tokenize(a)
     t2 = dask.base.tokenize(a)
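
For reference, a minimal sketch (not part of the patch) of the behavior these changes target, based on the new test above: with the patch applied, an object whose attrs were never set (`_attrs is None`) and the same object after its `attrs` getter has materialized an empty dict should produce identical dask tokens. Assumes `xarray`, `numpy`, and `dask` are installed.

```python
# Sketch assuming the patch above is applied: unset attrs and empty attrs
# must tokenize identically (issues #6970 / #8788).
import dask.base
import numpy as np
import xarray as xr

var = xr.Variable(("x",), np.arange(4))
assert var._attrs is None          # attrs were never set
t1 = dask.base.tokenize(var)

assert var.attrs == {}             # the getter materializes an empty dict...
t2 = dask.base.tokenize(var)       # ...which should not change the token
assert t1 == t2
```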