Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

decode variable with mismatched coordinate attribute #8195

Merged
merged 19 commits into from
Sep 25, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ Bug fixes
issues (:issue:`7817`, :issue:`7942`, :issue:`7790`, :issue:`6191`, :issue:`7096`,
:issue:`1064`, :pull:`7827`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
- Fixed a bug where mismatched ``coordinates`` silently failed to decode variable (:issue:`1809`, :pull:`8195`).
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_
- ``.rolling_exp`` functions no longer mistakenly lose non-dimensioned coords
(:issue:`6528`, :pull:`8114`)
By `Maximilian Roos <https://github.com/max-sixty>`_.
Expand Down
8 changes: 8 additions & 0 deletions xarray/backends/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,10 @@ def open_dataset(
as coordinate variables.
- "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and
other attributes as coordinate variables.

Only existing variables can be set as coordinates. Missing variables
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hope that this is short enough but explains how missing variables are treated. No native speaker here, so please suggest better wording/grammar.

will be silently ignored. If ``coordinates`` do not match any variable,
no decoding takes place.
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
drop_variables: str or iterable of str, optional
A variable or list of variables to exclude from being parsed from the
dataset. This may be useful to drop variables with problems or
Expand Down Expand Up @@ -691,6 +695,10 @@ def open_dataarray(
as coordinate variables.
- "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and
other attributes as coordinate variables.

Only existing variables can be set as coordinates. Missing variables
will be silently ignored. If ``coordinates`` do not match any variable,
no decoding takes place.
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
drop_variables: str or iterable of str, optional
A variable or list of variables to exclude from being parsed from the
dataset. This may be useful to drop variables with problems or
Expand Down
60 changes: 31 additions & 29 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from __future__ import annotations

import warnings
from collections import defaultdict
from collections.abc import Hashable, Iterable, Mapping, MutableMapping
from typing import TYPE_CHECKING, Any, Union
from typing import TYPE_CHECKING, Any, Literal, Union

import numpy as np
import pandas as pd
Expand All @@ -16,6 +15,7 @@
contains_cftime_datetimes,
)
from xarray.core.pycompat import is_duck_dask_array
from xarray.core.utils import emit_user_level_warning
from xarray.core.variable import IndexVariable, Variable

CF_RELATED_DATA = (
Expand Down Expand Up @@ -111,13 +111,13 @@ def ensure_dtype_not_object(var: Variable, name: T_Name = None) -> Variable:
return var

if is_duck_dask_array(data):
warnings.warn(
emit_user_level_warning(
f"variable {name} has data in the form of a dask array with "
"dtype=object, which means it is being loaded into memory "
"to determine a data type that can be safely stored on disk. "
"To avoid this, coerce this variable to a fixed-size dtype "
"with astype() before saving it.",
SerializationWarning,
category=SerializationWarning,
)
data = data.compute()

Expand Down Expand Up @@ -354,15 +354,14 @@ def _update_bounds_encoding(variables: T_Variables) -> None:
and "bounds" in attrs
and attrs["bounds"] in variables
):
warnings.warn(
"Variable '{0}' has datetime type and a "
"bounds variable but {0}.encoding does not have "
"units specified. The units encodings for '{0}' "
"and '{1}' will be determined independently "
emit_user_level_warning(
f"Variable {name:s} has datetime type and a "
f"bounds variable but {name:s}.encoding does not have "
f"units specified. The units encodings for {name:s} "
f"and {attrs['bounds']} will be determined independently "
"and may not be equal, counter to CF-conventions. "
"If this is a concern, specify a units encoding for "
"'{0}' before writing to a file.".format(name, attrs["bounds"]),
UserWarning,
f"{name:s} before writing to a file.",
)

if has_date_units and "bounds" in attrs:
Expand All @@ -379,7 +378,7 @@ def decode_cf_variables(
concat_characters: bool = True,
mask_and_scale: bool = True,
decode_times: bool = True,
decode_coords: bool = True,
decode_coords: bool | Literal["coordinates", "all"] = True,
drop_variables: T_DropVariables = None,
use_cftime: bool | None = None,
decode_timedelta: bool | None = None,
Expand Down Expand Up @@ -441,10 +440,12 @@ def stackable(dim: Hashable) -> bool:
if decode_coords in [True, "coordinates", "all"]:
var_attrs = new_vars[k].attrs
if "coordinates" in var_attrs:
coord_str = var_attrs["coordinates"]
var_coord_names = coord_str.split()
if all(k in variables for k in var_coord_names):
new_vars[k].encoding["coordinates"] = coord_str
var_coord_names = [
c for c in var_attrs["coordinates"].split() if c in variables
]
# only decode if there is at least one variable matching a coordinate
if var_coord_names:
new_vars[k].encoding["coordinates"] = " ".join(var_coord_names)
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
del var_attrs["coordinates"]
coord_names.update(var_coord_names)

Expand All @@ -461,8 +462,8 @@ def stackable(dim: Hashable) -> bool:
for role_or_name in part.split()
]
if len(roles_and_names) % 2 == 1:
warnings.warn(
f"Attribute {attr_name:s} malformed", stacklevel=5
emit_user_level_warning(
f"Attribute {attr_name:s} malformed"
)
var_names = roles_and_names[1::2]
if all(var_name in variables for var_name in var_names):
Expand All @@ -474,9 +475,8 @@ def stackable(dim: Hashable) -> bool:
for proj_name in var_names
if proj_name not in variables
]
warnings.warn(
emit_user_level_warning(
f"Variable(s) referenced in {attr_name:s} not in variables: {referenced_vars_not_in_variables!s}",
stacklevel=5,
)
del var_attrs[attr_name]

Expand All @@ -493,7 +493,7 @@ def decode_cf(
concat_characters: bool = True,
mask_and_scale: bool = True,
decode_times: bool = True,
decode_coords: bool = True,
decode_coords: bool | Literal["coordinates", "all"] = True,
drop_variables: T_DropVariables = None,
use_cftime: bool | None = None,
decode_timedelta: bool | None = None,
Expand All @@ -514,14 +514,17 @@ def decode_cf(
decode_times : bool, optional
Decode cf times (e.g., integers since "hours since 2000-01-01") to
np.datetime64.
decode_coords : bool or {"coordinates", "all"}, optional
decode_coords : bool or {"coordinates", "coordinates_strict", "all", "all_strict"}, optional
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
Controls which variables are set as coordinate variables:

- "coordinates" or True: Set variables referred to in the
- "coordinates"/"coordinates_strict" or True: Set variables referred to in the
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
``'coordinates'`` attribute of the datasets or individual variables
as coordinate variables.
- "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and
- "all"/"all_strict": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
other attributes as coordinate variables.
When using the "strict"-forms an error is raised, if ``coordinates``-items are
missing from the dataset, otherwise a warning is issued and only the
available ``coordinates`items are handled.
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
drop_variables : str or iterable, optional
A variable or list of variables to exclude from being parsed from the
dataset. This may be useful to drop variables with problems or
Expand Down Expand Up @@ -632,12 +635,11 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names):

for name in list(non_dim_coord_names):
if isinstance(name, str) and " " in name:
warnings.warn(
emit_user_level_warning(
f"coordinate {name!r} has a space in its name, which means it "
"cannot be marked as a coordinate on disk and will be "
"saved as a data variable instead",
SerializationWarning,
stacklevel=6,
category=SerializationWarning,
)
non_dim_coord_names.discard(name)

Expand Down Expand Up @@ -710,11 +712,11 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names):
if global_coordinates:
attributes = dict(attributes)
if "coordinates" in attributes:
warnings.warn(
emit_user_level_warning(
f"cannot serialize global coordinates {global_coordinates!r} because the global "
f"attribute 'coordinates' already exists. This may prevent faithful roundtripping"
f"of xarray datasets",
SerializationWarning,
category=SerializationWarning,
)
else:
attributes["coordinates"] = " ".join(sorted(map(str, global_coordinates)))
Expand Down
22 changes: 22 additions & 0 deletions xarray/tests/test_conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,28 @@ def test_decode_cf_with_conflicting_fill_missing_value() -> None:
assert_identical(actual, expected)


def test_decode_cf_variable_with_mismatched_coordinates() -> None:
# tests for decoding mismatched coordinates attributes
# see GH #1809
zeros1 = np.zeros((1, 5, 3))
orig = Dataset(
{
"XLONG": (["x", "y"], zeros1.squeeze(0), {}),
"XLAT": (["x", "y"], zeros1.squeeze(0), {}),
"foo": (["time", "x", "y"], zeros1, {"coordinates": "XTIME XLONG XLAT"}),
"time": ("time", [0.0], {"units": "hours since 2017-01-01"}),
}
)
decoded = conventions.decode_cf(orig, decode_coords=True)
assert decoded["foo"].encoding["coordinates"] == "XLONG XLAT"
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
assert list(decoded.coords.keys()) == ["XLONG", "XLAT", "time"]

decoded = conventions.decode_cf(orig, decode_coords=False)
assert decoded["foo"].encoding.get("coordinates") is None
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
assert decoded["foo"].attrs.get("coordinates") == "XTIME XLONG XLAT"
assert list(decoded.coords.keys()) == ["time"]


@requires_cftime
class TestEncodeCFVariable:
def test_incompatible_attributes(self) -> None:
Expand Down