From da647b06312bd93c3412ddd712bf7ecb52e3f28b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 25 Sep 2023 13:30:33 +0200 Subject: [PATCH] decode variable with mismatched coordinate attribute (#8195) * decode variable with mismatched coordinate attribute, warn/raise meaningful error * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update xarray/conventions.py Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> * use set comparison as suggested by review * use emit_user_level_warning for all occurrences * fix typing and docstring * fix typing and docstring * silently ignore names of missing variables as per review * only decode if there is at least one variable matching a coordinate * fix typing * update docstring * Apply suggestions from code review Co-authored-by: Deepak Cherian * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add review suggestion Co-authored-by: Deepak Cherian * Fix test --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> Co-authored-by: Deepak Cherian --- doc/whats-new.rst | 2 ++ xarray/backends/api.py | 6 ++++ xarray/conventions.py | 54 ++++++++++++++++---------------- xarray/tests/test_conventions.py | 27 +++++++++++++++- 4 files changed, 61 insertions(+), 28 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4307c2829ca..c37a3213793 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -87,6 +87,8 @@ Bug fixes issues (:issue:`7817`, :issue:`7942`, :issue:`7790`, :issue:`6191`, :issue:`7096`, :issue:`1064`, :pull:`7827`). By `Kai Mühlbauer `_. +- Fixed a bug where inaccurate ``coordinates`` silently failed to decode variable (:issue:`1809`, :pull:`8195`). + By `Kai Mühlbauer `_ - ``.rolling_exp`` functions no longer mistakenly lose non-dimensioned coords (:issue:`6528`, :pull:`8114`) By `Maximilian Roos `_. diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 58a05aeddce..7ca4377e4cf 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -488,6 +488,9 @@ def open_dataset( as coordinate variables. - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and other attributes as coordinate variables. + + Only existing variables can be set as coordinates. Missing variables + will be silently ignored. drop_variables: str or iterable of str, optional A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or @@ -691,6 +694,9 @@ def open_dataarray( as coordinate variables. - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and other attributes as coordinate variables. + + Only existing variables can be set as coordinates. Missing variables + will be silently ignored. drop_variables: str or iterable of str, optional A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or diff --git a/xarray/conventions.py b/xarray/conventions.py index 596831e270a..cf207f0c37a 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -1,9 +1,8 @@ from __future__ import annotations -import warnings from collections import defaultdict from collections.abc import Hashable, Iterable, Mapping, MutableMapping -from typing import TYPE_CHECKING, Any, Union +from typing import TYPE_CHECKING, Any, Literal, Union import numpy as np import pandas as pd @@ -16,6 +15,7 @@ contains_cftime_datetimes, ) from xarray.core.pycompat import is_duck_dask_array +from xarray.core.utils import emit_user_level_warning from xarray.core.variable import IndexVariable, Variable CF_RELATED_DATA = ( @@ -111,13 +111,13 @@ def ensure_dtype_not_object(var: Variable, name: T_Name = None) -> Variable: return var if is_duck_dask_array(data): - warnings.warn( + emit_user_level_warning( f"variable {name} has data in the form of a dask array with " "dtype=object, which means it is being loaded into memory " "to determine a data type that can be safely stored on disk. " "To avoid this, coerce this variable to a fixed-size dtype " "with astype() before saving it.", - SerializationWarning, + category=SerializationWarning, ) data = data.compute() @@ -354,15 +354,14 @@ def _update_bounds_encoding(variables: T_Variables) -> None: and "bounds" in attrs and attrs["bounds"] in variables ): - warnings.warn( - "Variable '{0}' has datetime type and a " - "bounds variable but {0}.encoding does not have " - "units specified. The units encodings for '{0}' " - "and '{1}' will be determined independently " + emit_user_level_warning( + f"Variable {name:s} has datetime type and a " + f"bounds variable but {name:s}.encoding does not have " + f"units specified. The units encodings for {name:s} " + f"and {attrs['bounds']} will be determined independently " "and may not be equal, counter to CF-conventions. " "If this is a concern, specify a units encoding for " - "'{0}' before writing to a file.".format(name, attrs["bounds"]), - UserWarning, + f"{name:s} before writing to a file.", ) if has_date_units and "bounds" in attrs: @@ -379,7 +378,7 @@ def decode_cf_variables( concat_characters: bool = True, mask_and_scale: bool = True, decode_times: bool = True, - decode_coords: bool = True, + decode_coords: bool | Literal["coordinates", "all"] = True, drop_variables: T_DropVariables = None, use_cftime: bool | None = None, decode_timedelta: bool | None = None, @@ -441,11 +440,14 @@ def stackable(dim: Hashable) -> bool: if decode_coords in [True, "coordinates", "all"]: var_attrs = new_vars[k].attrs if "coordinates" in var_attrs: - coord_str = var_attrs["coordinates"] - var_coord_names = coord_str.split() - if all(k in variables for k in var_coord_names): - new_vars[k].encoding["coordinates"] = coord_str - del var_attrs["coordinates"] + var_coord_names = [ + c for c in var_attrs["coordinates"].split() if c in variables + ] + # propagate as is + new_vars[k].encoding["coordinates"] = var_attrs["coordinates"] + del var_attrs["coordinates"] + # but only use as coordinate if existing + if var_coord_names: coord_names.update(var_coord_names) if decode_coords == "all": @@ -461,8 +463,8 @@ def stackable(dim: Hashable) -> bool: for role_or_name in part.split() ] if len(roles_and_names) % 2 == 1: - warnings.warn( - f"Attribute {attr_name:s} malformed", stacklevel=5 + emit_user_level_warning( + f"Attribute {attr_name:s} malformed" ) var_names = roles_and_names[1::2] if all(var_name in variables for var_name in var_names): @@ -474,9 +476,8 @@ def stackable(dim: Hashable) -> bool: for proj_name in var_names if proj_name not in variables ] - warnings.warn( + emit_user_level_warning( f"Variable(s) referenced in {attr_name:s} not in variables: {referenced_vars_not_in_variables!s}", - stacklevel=5, ) del var_attrs[attr_name] @@ -493,7 +494,7 @@ def decode_cf( concat_characters: bool = True, mask_and_scale: bool = True, decode_times: bool = True, - decode_coords: bool = True, + decode_coords: bool | Literal["coordinates", "all"] = True, drop_variables: T_DropVariables = None, use_cftime: bool | None = None, decode_timedelta: bool | None = None, @@ -632,12 +633,11 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names): for name in list(non_dim_coord_names): if isinstance(name, str) and " " in name: - warnings.warn( + emit_user_level_warning( f"coordinate {name!r} has a space in its name, which means it " "cannot be marked as a coordinate on disk and will be " "saved as a data variable instead", - SerializationWarning, - stacklevel=6, + category=SerializationWarning, ) non_dim_coord_names.discard(name) @@ -710,11 +710,11 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names): if global_coordinates: attributes = dict(attributes) if "coordinates" in attributes: - warnings.warn( + emit_user_level_warning( f"cannot serialize global coordinates {global_coordinates!r} because the global " f"attribute 'coordinates' already exists. This may prevent faithful roundtripping" f"of xarray datasets", - SerializationWarning, + category=SerializationWarning, ) else: attributes["coordinates"] = " ".join(sorted(map(str, global_coordinates))) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 4dae7809be9..5157688b629 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -80,6 +80,28 @@ def test_decode_cf_with_conflicting_fill_missing_value() -> None: assert_identical(actual, expected) +def test_decode_cf_variable_with_mismatched_coordinates() -> None: + # tests for decoding mismatched coordinates attributes + # see GH #1809 + zeros1 = np.zeros((1, 5, 3)) + orig = Dataset( + { + "XLONG": (["x", "y"], zeros1.squeeze(0), {}), + "XLAT": (["x", "y"], zeros1.squeeze(0), {}), + "foo": (["time", "x", "y"], zeros1, {"coordinates": "XTIME XLONG XLAT"}), + "time": ("time", [0.0], {"units": "hours since 2017-01-01"}), + } + ) + decoded = conventions.decode_cf(orig, decode_coords=True) + assert decoded["foo"].encoding["coordinates"] == "XTIME XLONG XLAT" + assert list(decoded.coords.keys()) == ["XLONG", "XLAT", "time"] + + decoded = conventions.decode_cf(orig, decode_coords=False) + assert "coordinates" not in decoded["foo"].encoding + assert decoded["foo"].attrs.get("coordinates") == "XTIME XLONG XLAT" + assert list(decoded.coords.keys()) == ["time"] + + @requires_cftime class TestEncodeCFVariable: def test_incompatible_attributes(self) -> None: @@ -246,9 +268,12 @@ def test_dataset(self) -> None: assert_identical(expected, actual) def test_invalid_coordinates(self) -> None: - # regression test for GH308 + # regression test for GH308, GH1809 original = Dataset({"foo": ("t", [1, 2], {"coordinates": "invalid"})}) + decoded = Dataset({"foo": ("t", [1, 2], {}, {"coordinates": "invalid"})}) actual = conventions.decode_cf(original) + assert_identical(decoded, actual) + actual = conventions.decode_cf(original, decode_coords=False) assert_identical(original, actual) def test_decode_coordinates(self) -> None: