diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 5d0c30a1c2f..5cfec4189b1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -72,6 +72,9 @@ Bug fixes - Fix to once again support date offset strings as input to the loffset parameter of resample and test this functionality (:pull:`8422`, :issue:`8399`). By `Katelyn FitzGerald `_. +- Fix a bug where :py:meth:`DataArray.to_dataset` silently drops a variable + if a coordinate with the same name already exists (:pull:`8433`, :issue:`7823`). + By `András Gunyhó `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 262fc407b6d..5e6feb8eda4 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -574,9 +574,24 @@ def subset(dim, label): array.attrs = {} return as_variable(array) - variables = {label: subset(dim, label) for label in self.get_index(dim)} - variables.update({k: v for k, v in self._coords.items() if k != dim}) + variables_from_split = { + label: subset(dim, label) for label in self.get_index(dim) + } coord_names = set(self._coords) - {dim} + + ambiguous_vars = set(variables_from_split) & coord_names + if ambiguous_vars: + rename_msg_fmt = ", ".join([f"{v}=..." for v in sorted(ambiguous_vars)]) + raise ValueError( + f"Splitting along the dimension {dim!r} would produce the variables " + f"{tuple(sorted(ambiguous_vars))} which are also existing coordinate " + f"variables. Use DataArray.rename({rename_msg_fmt}) or " + f"DataArray.assign_coords({dim}=...) to resolve this ambiguity." + ) + + variables = variables_from_split | { + k: v for k, v in self._coords.items() if k != dim + } indexes = filter_indexes_from_coords(self._indexes, coord_names) dataset = Dataset._construct_direct( variables, coord_names, indexes=indexes, attrs=self.attrs diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 0612f0a6ac6..44b9790f0b7 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3686,8 +3686,16 @@ def test_to_dataset_whole(self) -> None: actual = named.to_dataset("bar") def test_to_dataset_split(self) -> None: - array = DataArray([1, 2, 3], coords=[("x", list("abc"))], attrs={"a": 1}) - expected = Dataset({"a": 1, "b": 2, "c": 3}, attrs={"a": 1}) + array = DataArray( + [[1, 2], [3, 4], [5, 6]], + coords=[("x", list("abc")), ("y", [0.0, 0.1])], + attrs={"a": 1}, + ) + expected = Dataset( + {"a": ("y", [1, 2]), "b": ("y", [3, 4]), "c": ("y", [5, 6])}, + coords={"y": [0.0, 0.1]}, + attrs={"a": 1}, + ) actual = array.to_dataset("x") assert_identical(expected, actual) @@ -3715,6 +3723,51 @@ def test_to_dataset_retains_keys(self) -> None: assert_equal(array, result) + def test_to_dataset_coord_value_is_dim(self) -> None: + # github issue #7823 + + array = DataArray( + np.zeros((3, 3)), + coords={ + # 'a' is both a coordinate value and the name of a coordinate + "x": ["a", "b", "c"], + "a": [1, 2, 3], + }, + ) + + with pytest.raises( + ValueError, + match=( + re.escape("dimension 'x' would produce the variables ('a',)") + + ".*" + + re.escape("DataArray.rename(a=...) or DataArray.assign_coords(x=...)") + ), + ): + array.to_dataset("x") + + # test error message formatting when there are multiple ambiguous + # values/coordinates + array2 = DataArray( + np.zeros((3, 3, 2)), + coords={ + "x": ["a", "b", "c"], + "a": [1, 2, 3], + "b": [0.0, 0.1], + }, + ) + + with pytest.raises( + ValueError, + match=( + re.escape("dimension 'x' would produce the variables ('a', 'b')") + + ".*" + + re.escape( + "DataArray.rename(a=..., b=...) or DataArray.assign_coords(x=...)" + ) + ), + ): + array2.to_dataset("x") + def test__title_for_slice(self) -> None: array = DataArray( np.ones((4, 3, 2)),