From da647b06312bd93c3412ddd712bf7ecb52e3f28b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= <kai.muehlbauer@uni-bonn.de>
Date: Mon, 25 Sep 2023 13:30:33 +0200
Subject: [PATCH] decode variable with mismatched coordinate attribute (#8195)

* decode variable with mismatched coordinate attribute, warn/raise meaningful error

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update xarray/conventions.py

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>

* use set comparison as suggested by review

* use emit_user_level_warning for all occurrences

* fix typing and docstring

* fix typing and docstring

* silently ignore names of missing variables as per review

* only decode if there is at least one variable matching a coordinate

* fix typing

* update docstring

* Apply suggestions from code review

Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add review suggestion

Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>

* Fix test

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>
Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>
---
 doc/whats-new.rst                |  2 ++
 xarray/backends/api.py           |  6 ++++
 xarray/conventions.py            | 54 ++++++++++++++++----------------
 xarray/tests/test_conventions.py | 27 +++++++++++++++-
 4 files changed, 61 insertions(+), 28 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 4307c2829ca..c37a3213793 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -87,6 +87,8 @@ Bug fixes
   issues (:issue:`7817`, :issue:`7942`, :issue:`7790`, :issue:`6191`, :issue:`7096`,
   :issue:`1064`, :pull:`7827`).
   By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
+- Fixed a bug where inaccurate ``coordinates`` silently failed to decode variable (:issue:`1809`, :pull:`8195`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_
 - ``.rolling_exp`` functions no longer mistakenly lose non-dimensioned coords
   (:issue:`6528`, :pull:`8114`)
   By `Maximilian Roos <https://github.com/max-sixty>`_.
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 58a05aeddce..7ca4377e4cf 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -488,6 +488,9 @@ def open_dataset(
           as coordinate variables.
         - "all": Set variables referred to in  ``'grid_mapping'``, ``'bounds'`` and
           other attributes as coordinate variables.
+
+        Only existing variables can be set as coordinates. Missing variables
+        will be silently ignored.
     drop_variables: str or iterable of str, optional
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
@@ -691,6 +694,9 @@ def open_dataarray(
           as coordinate variables.
         - "all": Set variables referred to in  ``'grid_mapping'``, ``'bounds'`` and
           other attributes as coordinate variables.
+
+        Only existing variables can be set as coordinates. Missing variables
+        will be silently ignored.
     drop_variables: str or iterable of str, optional
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
diff --git a/xarray/conventions.py b/xarray/conventions.py
index 596831e270a..cf207f0c37a 100644
--- a/xarray/conventions.py
+++ b/xarray/conventions.py
@@ -1,9 +1,8 @@
 from __future__ import annotations
 
-import warnings
 from collections import defaultdict
 from collections.abc import Hashable, Iterable, Mapping, MutableMapping
-from typing import TYPE_CHECKING, Any, Union
+from typing import TYPE_CHECKING, Any, Literal, Union
 
 import numpy as np
 import pandas as pd
@@ -16,6 +15,7 @@
     contains_cftime_datetimes,
 )
 from xarray.core.pycompat import is_duck_dask_array
+from xarray.core.utils import emit_user_level_warning
 from xarray.core.variable import IndexVariable, Variable
 
 CF_RELATED_DATA = (
@@ -111,13 +111,13 @@ def ensure_dtype_not_object(var: Variable, name: T_Name = None) -> Variable:
             return var
 
         if is_duck_dask_array(data):
-            warnings.warn(
+            emit_user_level_warning(
                 f"variable {name} has data in the form of a dask array with "
                 "dtype=object, which means it is being loaded into memory "
                 "to determine a data type that can be safely stored on disk. "
                 "To avoid this, coerce this variable to a fixed-size dtype "
                 "with astype() before saving it.",
-                SerializationWarning,
+                category=SerializationWarning,
             )
             data = data.compute()
 
@@ -354,15 +354,14 @@ def _update_bounds_encoding(variables: T_Variables) -> None:
             and "bounds" in attrs
             and attrs["bounds"] in variables
         ):
-            warnings.warn(
-                "Variable '{0}' has datetime type and a "
-                "bounds variable but {0}.encoding does not have "
-                "units specified. The units encodings for '{0}' "
-                "and '{1}' will be determined independently "
+            emit_user_level_warning(
+                f"Variable {name:s} has datetime type and a "
+                f"bounds variable but {name:s}.encoding does not have "
+                f"units specified. The units encodings for {name:s} "
+                f"and {attrs['bounds']} will be determined independently "
                 "and may not be equal, counter to CF-conventions. "
                 "If this is a concern, specify a units encoding for "
-                "'{0}' before writing to a file.".format(name, attrs["bounds"]),
-                UserWarning,
+                f"{name:s} before writing to a file.",
             )
 
         if has_date_units and "bounds" in attrs:
@@ -379,7 +378,7 @@ def decode_cf_variables(
     concat_characters: bool = True,
     mask_and_scale: bool = True,
     decode_times: bool = True,
-    decode_coords: bool = True,
+    decode_coords: bool | Literal["coordinates", "all"] = True,
     drop_variables: T_DropVariables = None,
     use_cftime: bool | None = None,
     decode_timedelta: bool | None = None,
@@ -441,11 +440,14 @@ def stackable(dim: Hashable) -> bool:
         if decode_coords in [True, "coordinates", "all"]:
             var_attrs = new_vars[k].attrs
             if "coordinates" in var_attrs:
-                coord_str = var_attrs["coordinates"]
-                var_coord_names = coord_str.split()
-                if all(k in variables for k in var_coord_names):
-                    new_vars[k].encoding["coordinates"] = coord_str
-                    del var_attrs["coordinates"]
+                var_coord_names = [
+                    c for c in var_attrs["coordinates"].split() if c in variables
+                ]
+                # propagate as is
+                new_vars[k].encoding["coordinates"] = var_attrs["coordinates"]
+                del var_attrs["coordinates"]
+                # but only use as coordinate if existing
+                if var_coord_names:
                     coord_names.update(var_coord_names)
 
         if decode_coords == "all":
@@ -461,8 +463,8 @@ def stackable(dim: Hashable) -> bool:
                             for role_or_name in part.split()
                         ]
                         if len(roles_and_names) % 2 == 1:
-                            warnings.warn(
-                                f"Attribute {attr_name:s} malformed", stacklevel=5
+                            emit_user_level_warning(
+                                f"Attribute {attr_name:s} malformed"
                             )
                         var_names = roles_and_names[1::2]
                     if all(var_name in variables for var_name in var_names):
@@ -474,9 +476,8 @@ def stackable(dim: Hashable) -> bool:
                             for proj_name in var_names
                             if proj_name not in variables
                         ]
-                        warnings.warn(
+                        emit_user_level_warning(
                             f"Variable(s) referenced in {attr_name:s} not in variables: {referenced_vars_not_in_variables!s}",
-                            stacklevel=5,
                         )
                     del var_attrs[attr_name]
 
@@ -493,7 +494,7 @@ def decode_cf(
     concat_characters: bool = True,
     mask_and_scale: bool = True,
     decode_times: bool = True,
-    decode_coords: bool = True,
+    decode_coords: bool | Literal["coordinates", "all"] = True,
     drop_variables: T_DropVariables = None,
     use_cftime: bool | None = None,
     decode_timedelta: bool | None = None,
@@ -632,12 +633,11 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names):
 
     for name in list(non_dim_coord_names):
         if isinstance(name, str) and " " in name:
-            warnings.warn(
+            emit_user_level_warning(
                 f"coordinate {name!r} has a space in its name, which means it "
                 "cannot be marked as a coordinate on disk and will be "
                 "saved as a data variable instead",
-                SerializationWarning,
-                stacklevel=6,
+                category=SerializationWarning,
             )
             non_dim_coord_names.discard(name)
 
@@ -710,11 +710,11 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names):
     if global_coordinates:
         attributes = dict(attributes)
         if "coordinates" in attributes:
-            warnings.warn(
+            emit_user_level_warning(
                 f"cannot serialize global coordinates {global_coordinates!r} because the global "
                 f"attribute 'coordinates' already exists. This may prevent faithful roundtripping"
                 f"of xarray datasets",
-                SerializationWarning,
+                category=SerializationWarning,
             )
         else:
             attributes["coordinates"] = " ".join(sorted(map(str, global_coordinates)))
diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py
index 4dae7809be9..5157688b629 100644
--- a/xarray/tests/test_conventions.py
+++ b/xarray/tests/test_conventions.py
@@ -80,6 +80,28 @@ def test_decode_cf_with_conflicting_fill_missing_value() -> None:
     assert_identical(actual, expected)
 
 
+def test_decode_cf_variable_with_mismatched_coordinates() -> None:
+    # tests for decoding mismatched coordinates attributes
+    # see GH #1809
+    zeros1 = np.zeros((1, 5, 3))
+    orig = Dataset(
+        {
+            "XLONG": (["x", "y"], zeros1.squeeze(0), {}),
+            "XLAT": (["x", "y"], zeros1.squeeze(0), {}),
+            "foo": (["time", "x", "y"], zeros1, {"coordinates": "XTIME XLONG XLAT"}),
+            "time": ("time", [0.0], {"units": "hours since 2017-01-01"}),
+        }
+    )
+    decoded = conventions.decode_cf(orig, decode_coords=True)
+    assert decoded["foo"].encoding["coordinates"] == "XTIME XLONG XLAT"
+    assert list(decoded.coords.keys()) == ["XLONG", "XLAT", "time"]
+
+    decoded = conventions.decode_cf(orig, decode_coords=False)
+    assert "coordinates" not in decoded["foo"].encoding
+    assert decoded["foo"].attrs.get("coordinates") == "XTIME XLONG XLAT"
+    assert list(decoded.coords.keys()) == ["time"]
+
+
 @requires_cftime
 class TestEncodeCFVariable:
     def test_incompatible_attributes(self) -> None:
@@ -246,9 +268,12 @@ def test_dataset(self) -> None:
         assert_identical(expected, actual)
 
     def test_invalid_coordinates(self) -> None:
-        # regression test for GH308
+        # regression test for GH308, GH1809
         original = Dataset({"foo": ("t", [1, 2], {"coordinates": "invalid"})})
+        decoded = Dataset({"foo": ("t", [1, 2], {}, {"coordinates": "invalid"})})
         actual = conventions.decode_cf(original)
+        assert_identical(decoded, actual)
+        actual = conventions.decode_cf(original, decode_coords=False)
         assert_identical(original, actual)
 
     def test_decode_coordinates(self) -> None: