decode variable with mismatched coordinate attribute (#8195)
* decode variable with mismatched coordinate attribute, warn/raise meaningful error

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update xarray/conventions.py

Co-authored-by: Illviljan <[email protected]>

* use set comparison as suggested by review

* use emit_user_level_warning for all occurrences

* fix typing and docstring

* fix typing and docstring

* silently ignore names of missing variables as per review

* only decode if there is at least one variable matching a coordinate

* fix typing

* update docstring

* Apply suggestions from code review

Co-authored-by: Deepak Cherian <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add review suggestion

Co-authored-by: Deepak Cherian <[email protected]>

* Fix test

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Illviljan <[email protected]>
Co-authored-by: Deepak Cherian <[email protected]>
4 people authored Sep 25, 2023
1 parent bac90ab commit da647b0
Showing 4 changed files with 61 additions and 28 deletions.
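Before the per-file diffs, a short runnable sketch of the behavior this commit fixes, adapted from the new test added in xarray/tests/test_conventions.py below. The variable "foo" carries a ``coordinates`` attribute naming ``XTIME``, which does not exist in the dataset; previously the mismatch caused the whole attribute to be skipped silently during decoding:

import numpy as np
import xarray as xr

zeros = np.zeros((1, 5, 3))
orig = xr.Dataset(
    {
        "XLONG": (["x", "y"], zeros.squeeze(0)),
        "XLAT": (["x", "y"], zeros.squeeze(0)),
        # "XTIME" is referenced below but never defined in the dataset
        "foo": (["time", "x", "y"], zeros, {"coordinates": "XTIME XLONG XLAT"}),
        "time": ("time", [0.0], {"units": "hours since 2017-01-01"}),
    }
)

decoded = xr.decode_cf(orig, decode_coords=True)
# The attribute is propagated verbatim into encoding for round-tripping ...
assert decoded["foo"].encoding["coordinates"] == "XTIME XLONG XLAT"
# ... while only the names that actually exist are promoted to coordinates.
assert list(decoded.coords) == ["XLONG", "XLAT", "time"]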
doc/whats-new.rst (2 additions, 0 deletions)
@@ -87,6 +87,8 @@ Bug fixes
   issues (:issue:`7817`, :issue:`7942`, :issue:`7790`, :issue:`6191`, :issue:`7096`,
   :issue:`1064`, :pull:`7827`).
   By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
+- Fixed a bug where inaccurate ``coordinates`` silently failed to decode variable (:issue:`1809`, :pull:`8195`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_
 - ``.rolling_exp`` functions no longer mistakenly lose non-dimensioned coords
   (:issue:`6528`, :pull:`8114`)
   By `Maximilian Roos <https://github.com/max-sixty>`_.
xarray/backends/api.py (6 additions, 0 deletions)
@@ -488,6 +488,9 @@ def open_dataset(
           as coordinate variables.
         - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and
           other attributes as coordinate variables.
+
+          Only existing variables can be set as coordinates. Missing variables
+          will be silently ignored.
     drop_variables: str or iterable of str, optional
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
@@ -691,6 +694,9 @@ def open_dataarray(
           as coordinate variables.
         - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and
           other attributes as coordinate variables.
+
+          Only existing variables can be set as coordinates. Missing variables
+          will be silently ignored.
     drop_variables: str or iterable of str, optional
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
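An illustrative use of the two documented ``decode_coords`` modes; the file path below is a placeholder, not something from this PR:

import xarray as xr

path = "data.nc"  # hypothetical file path, for illustration only

# "coordinates" (what decode_coords=True does) decodes only the
# "coordinates" attribute of each variable.
ds = xr.open_dataset(path, decode_coords="coordinates")

# "all" additionally promotes variables referenced by "grid_mapping",
# "bounds" and similar CF attributes. In both modes, only variables that
# actually exist in the file can become coordinates; missing names are
# ignored (the conventions.py changes below warn on dangling references
# for the "all"-specific attributes).
ds_all = xr.open_dataset(path, decode_coords="all")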
xarray/conventions.py (27 additions, 27 deletions)
@@ -1,9 +1,8 @@
 from __future__ import annotations
 
-import warnings
 from collections import defaultdict
 from collections.abc import Hashable, Iterable, Mapping, MutableMapping
-from typing import TYPE_CHECKING, Any, Union
+from typing import TYPE_CHECKING, Any, Literal, Union
 
 import numpy as np
 import pandas as pd
@@ -16,6 +15,7 @@
     contains_cftime_datetimes,
 )
 from xarray.core.pycompat import is_duck_dask_array
+from xarray.core.utils import emit_user_level_warning
 from xarray.core.variable import IndexVariable, Variable
 
 CF_RELATED_DATA = (
@@ -111,13 +111,13 @@ def ensure_dtype_not_object(var: Variable, name: T_Name = None) -> Variable:
         return var
 
     if is_duck_dask_array(data):
-        warnings.warn(
+        emit_user_level_warning(
             f"variable {name} has data in the form of a dask array with "
             "dtype=object, which means it is being loaded into memory "
             "to determine a data type that can be safely stored on disk. "
             "To avoid this, coerce this variable to a fixed-size dtype "
             "with astype() before saving it.",
-            SerializationWarning,
+            category=SerializationWarning,
         )
         data = data.compute()

@@ -354,15 +354,14 @@ def _update_bounds_encoding(variables: T_Variables) -> None:
         and "bounds" in attrs
         and attrs["bounds"] in variables
     ):
-        warnings.warn(
-            "Variable '{0}' has datetime type and a "
-            "bounds variable but {0}.encoding does not have "
-            "units specified. The units encodings for '{0}' "
-            "and '{1}' will be determined independently "
+        emit_user_level_warning(
+            f"Variable {name:s} has datetime type and a "
+            f"bounds variable but {name:s}.encoding does not have "
+            f"units specified. The units encodings for {name:s} "
+            f"and {attrs['bounds']} will be determined independently "
             "and may not be equal, counter to CF-conventions. "
             "If this is a concern, specify a units encoding for "
-            "'{0}' before writing to a file.".format(name, attrs["bounds"]),
-            UserWarning,
+            f"{name:s} before writing to a file.",
         )
 
     if has_date_units and "bounds" in attrs:
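The rewritten warning above also tells users how to silence it; a hedged sketch of that advice (the dataset contents are illustrative):

import numpy as np
import xarray as xr

t0 = np.datetime64("2023-01-01")
ds = xr.Dataset(
    {"time_bnds": (("time", "bnds"), t0 + np.array([[0, 1], [1, 2]], dtype="timedelta64[D]"))},
    coords={"time": ("time", t0 + np.array([0, 1], dtype="timedelta64[D]"), {"bounds": "time_bnds"})},
)
# Pin the units encoding on the parent variable before writing so the
# bounds variable is encoded consistently and the warning is avoided.
ds["time"].encoding["units"] = "hours since 2023-01-01"
ds.to_netcdf("example.nc")  # placeholder output path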
@@ -379,7 +378,7 @@ def decode_cf_variables(
     concat_characters: bool = True,
     mask_and_scale: bool = True,
     decode_times: bool = True,
-    decode_coords: bool = True,
+    decode_coords: bool | Literal["coordinates", "all"] = True,
     drop_variables: T_DropVariables = None,
     use_cftime: bool | None = None,
     decode_timedelta: bool | None = None,
@@ -441,11 +440,14 @@ def stackable(dim: Hashable) -> bool:
         if decode_coords in [True, "coordinates", "all"]:
             var_attrs = new_vars[k].attrs
             if "coordinates" in var_attrs:
-                coord_str = var_attrs["coordinates"]
-                var_coord_names = coord_str.split()
-                if all(k in variables for k in var_coord_names):
-                    new_vars[k].encoding["coordinates"] = coord_str
-                    del var_attrs["coordinates"]
+                var_coord_names = [
+                    c for c in var_attrs["coordinates"].split() if c in variables
+                ]
+                # propagate as is
+                new_vars[k].encoding["coordinates"] = var_attrs["coordinates"]
+                del var_attrs["coordinates"]
+                # but only use as coordinate if existing
+                if var_coord_names:
                     coord_names.update(var_coord_names)
 
         if decode_coords == "all":
@@ -461,8 +463,8 @@ def stackable(dim: Hashable) -> bool:
                         for role_or_name in part.split()
                     ]
                     if len(roles_and_names) % 2 == 1:
-                        warnings.warn(
-                            f"Attribute {attr_name:s} malformed", stacklevel=5
+                        emit_user_level_warning(
+                            f"Attribute {attr_name:s} malformed"
                         )
                     var_names = roles_and_names[1::2]
                     if all(var_name in variables for var_name in var_names):
@@ -474,9 +476,8 @@ def stackable(dim: Hashable) -> bool:
                             for proj_name in var_names
                             if proj_name not in variables
                         ]
-                        warnings.warn(
+                        emit_user_level_warning(
                             f"Variable(s) referenced in {attr_name:s} not in variables: {referenced_vars_not_in_variables!s}",
-                            stacklevel=5,
                         )
                         del var_attrs[attr_name]
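The heart of the fix in this file is the ``coordinates`` handling two hunks above: keep the attribute intact for round-tripping, but only promote names that resolve to variables. Restated as a self-contained sketch (the helper name is illustrative, not an xarray internal):

from __future__ import annotations


def split_coordinate_attr(attrs: dict, variables: dict) -> tuple[str | None, list[str]]:
    # Pop the raw attribute so it can be propagated as-is via encoding.
    coord_str = attrs.pop("coordinates", None)
    if coord_str is None:
        return None, []
    # Only names that exist as variables are usable as coordinates;
    # anything else (the phantom "XTIME" in WRF files, say) is dropped.
    return coord_str, [c for c in coord_str.split() if c in variables]


variables = {"XLONG": ..., "XLAT": ..., "foo": ..., "time": ...}
attrs = {"coordinates": "XTIME XLONG XLAT"}
coord_str, usable = split_coordinate_attr(attrs, variables)
assert coord_str == "XTIME XLONG XLAT"
assert usable == ["XLONG", "XLAT"]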

@@ -493,7 +494,7 @@ def decode_cf(
     concat_characters: bool = True,
     mask_and_scale: bool = True,
     decode_times: bool = True,
-    decode_coords: bool = True,
+    decode_coords: bool | Literal["coordinates", "all"] = True,
     drop_variables: T_DropVariables = None,
     use_cftime: bool | None = None,
     decode_timedelta: bool | None = None,
@@ -632,12 +633,11 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names):
 
     for name in list(non_dim_coord_names):
         if isinstance(name, str) and " " in name:
-            warnings.warn(
+            emit_user_level_warning(
                 f"coordinate {name!r} has a space in its name, which means it "
                 "cannot be marked as a coordinate on disk and will be "
                 "saved as a data variable instead",
-                SerializationWarning,
-                stacklevel=6,
+                category=SerializationWarning,
             )
             non_dim_coord_names.discard(name)

@@ -710,11 +710,11 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names):
     if global_coordinates:
         attributes = dict(attributes)
         if "coordinates" in attributes:
-            warnings.warn(
+            emit_user_level_warning(
                 f"cannot serialize global coordinates {global_coordinates!r} because the global "
                 f"attribute 'coordinates' already exists. This may prevent faithful roundtripping"
                 f"of xarray datasets",
-                SerializationWarning,
+                category=SerializationWarning,
             )
         else:
             attributes["coordinates"] = " ".join(sorted(map(str, global_coordinates)))
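A pattern running through this whole file is the swap from ``warnings.warn`` with hand-maintained ``stacklevel`` arguments (the removed ``stacklevel=5`` and ``stacklevel=6``) to ``emit_user_level_warning`` from ``xarray.core.utils``, which computes the stack level itself so warnings point at user code. Roughly, such a helper does something like this (a hypothetical sketch, not xarray's actual implementation):

import inspect
import warnings


def emit_user_level_warning_sketch(message, category=None):
    # Walk outward from the caller until the first frame that is not part
    # of the library, so the warning is attributed to the user's call site
    # rather than to whichever internal helper raised it.
    stacklevel = 2
    for frame_info in inspect.stack()[1:]:
        if "xarray" not in frame_info.filename:
            break
        stacklevel += 1
    warnings.warn(message, category=category, stacklevel=stacklevel)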
xarray/tests/test_conventions.py (26 additions, 1 deletion)
@@ -80,6 +80,28 @@ def test_decode_cf_with_conflicting_fill_missing_value() -> None:
     assert_identical(actual, expected)
 
 
+def test_decode_cf_variable_with_mismatched_coordinates() -> None:
+    # tests for decoding mismatched coordinates attributes
+    # see GH #1809
+    zeros1 = np.zeros((1, 5, 3))
+    orig = Dataset(
+        {
+            "XLONG": (["x", "y"], zeros1.squeeze(0), {}),
+            "XLAT": (["x", "y"], zeros1.squeeze(0), {}),
+            "foo": (["time", "x", "y"], zeros1, {"coordinates": "XTIME XLONG XLAT"}),
+            "time": ("time", [0.0], {"units": "hours since 2017-01-01"}),
+        }
+    )
+    decoded = conventions.decode_cf(orig, decode_coords=True)
+    assert decoded["foo"].encoding["coordinates"] == "XTIME XLONG XLAT"
+    assert list(decoded.coords.keys()) == ["XLONG", "XLAT", "time"]
+
+    decoded = conventions.decode_cf(orig, decode_coords=False)
+    assert "coordinates" not in decoded["foo"].encoding
+    assert decoded["foo"].attrs.get("coordinates") == "XTIME XLONG XLAT"
+    assert list(decoded.coords.keys()) == ["time"]
+
+
 @requires_cftime
 class TestEncodeCFVariable:
     def test_incompatible_attributes(self) -> None:
@@ -246,9 +268,12 @@ def test_dataset(self) -> None:
         assert_identical(expected, actual)
 
     def test_invalid_coordinates(self) -> None:
-        # regression test for GH308
+        # regression test for GH308, GH1809
         original = Dataset({"foo": ("t", [1, 2], {"coordinates": "invalid"})})
+        decoded = Dataset({"foo": ("t", [1, 2], {}, {"coordinates": "invalid"})})
         actual = conventions.decode_cf(original)
+        assert_identical(decoded, actual)
+        actual = conventions.decode_cf(original, decode_coords=False)
         assert_identical(original, actual)
 
     def test_decode_coordinates(self) -> None:
