From 7477fd10115ba89b86752ee967c858f6f0fe7f9b Mon Sep 17 00:00:00 2001 From: Mathijs Verhaegh Date: Tue, 16 Jul 2024 08:25:41 +0200 Subject: [PATCH] Per-variable specification of boolean parameters in open_dataset (#9218) * allow per-variable choice of mask_and_scale in open_dataset * simplify docstring datatype * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * dict -> Mapping in type annotation Co-authored-by: Michael Niklas * use typevar for _item_or_default annotation Otherwise you lose all typing when you use that because it returns Any. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * implement feature for 4 additional parameters * fix default value inconsistency * add what's new + None annotation * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * _item_or_default return type T | None * remove deault default value _item_or_default * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * docstring dtype naming --------- Co-authored-by: Mathijs Verhaegh Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Michael Niklas --- doc/whats-new.rst | 3 +++ xarray/backends/api.py | 34 ++++++++++++++++++++++------------ xarray/conventions.py | 34 ++++++++++++++++++++++------------ 3 files changed, 47 insertions(+), 24 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6a8e898c93c..4d4291e050e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,9 @@ v2024.06.1 (unreleased) New Features ~~~~~~~~~~~~ +- Allow per-variable specification of ``mask_and_scale``, ``decode_times``, ``decode_timedelta`` + ``use_cftime`` and ``concat_characters`` params in :py:func:`~xarray.open_dataset` (:pull:`9218`). + By `Mathijs Verhaegh `_. - Allow chunking for arrays with duplicated dimension names (:issue:`8759`, :pull:`9099`). By `Martin Raspaud `_. - Extract the source url from fsspec objects (:issue:`9142`, :pull:`8923`). diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 521bdf65e6a..ece60a2b161 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -398,11 +398,11 @@ def open_dataset( chunks: T_Chunks = None, cache: bool | None = None, decode_cf: bool | None = None, - mask_and_scale: bool | None = None, - decode_times: bool | None = None, - decode_timedelta: bool | None = None, - use_cftime: bool | None = None, - concat_characters: bool | None = None, + mask_and_scale: bool | Mapping[str, bool] | None = None, + decode_times: bool | Mapping[str, bool] | None = None, + decode_timedelta: bool | Mapping[str, bool] | None = None, + use_cftime: bool | Mapping[str, bool] | None = None, + concat_characters: bool | Mapping[str, bool] | None = None, decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, inline_array: bool = False, @@ -451,25 +451,31 @@ def open_dataset( decode_cf : bool, optional Whether to decode these variables, assuming they were saved according to CF conventions. - mask_and_scale : bool, optional + mask_and_scale : bool or dict-like, optional If True, replace array values equal to `_FillValue` with NA and scale values according to the formula `original_values * scale_factor + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are taken from variable attributes (if they exist). If the `_FillValue` or `missing_value` attribute contains multiple values a warning will be issued and all array values matching one of the multiple values will - be replaced by NA. This keyword may not be supported by all the backends. - decode_times : bool, optional + be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + decode_times : bool or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format into datetime objects. Otherwise, leave them encoded as numbers. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - decode_timedelta : bool, optional + decode_timedelta : bool or dict-like, optional If True, decode variables and coordinates with time units in {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} into timedelta objects. If False, leave them encoded as numbers. If None (default), assume the same value of decode_time. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - use_cftime: bool, optional + use_cftime: bool or dict-like, optional Only relevant if encoded dates come from a standard calendar (e.g. "gregorian", "proleptic_gregorian", "standard", or not specified). If None (default), attempt to decode times to @@ -478,12 +484,16 @@ def open_dataset( ``cftime.datetime`` objects, regardless of whether or not they can be represented using ``np.datetime64[ns]`` objects. If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible - raise an error. This keyword may not be supported by all the backends. - concat_characters : bool, optional + raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and removed) if they have no corresponding variable and if they are only used as the last dimension of character arrays. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. This keyword may not be supported by all the backends. decode_coords : bool or {"coordinates", "all"}, optional Controls which variables are set as coordinate variables: diff --git a/xarray/conventions.py b/xarray/conventions.py index 6eff45c5b2d..ff1256883ba 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -2,7 +2,7 @@ from collections import defaultdict from collections.abc import Hashable, Iterable, Mapping, MutableMapping -from typing import TYPE_CHECKING, Any, Literal, Union +from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union import numpy as np import pandas as pd @@ -384,16 +384,26 @@ def _update_bounds_encoding(variables: T_Variables) -> None: bounds_encoding.setdefault("calendar", encoding["calendar"]) +T = TypeVar("T") + + +def _item_or_default(obj: Mapping[Any, T] | T, key: Hashable, default: T) -> T: + """ + Return item by key if obj is mapping and key is present, else return default value. + """ + return obj.get(key, default) if isinstance(obj, Mapping) else obj + + def decode_cf_variables( variables: T_Variables, attributes: T_Attrs, - concat_characters: bool = True, - mask_and_scale: bool = True, - decode_times: bool = True, + concat_characters: bool | Mapping[str, bool] = True, + mask_and_scale: bool | Mapping[str, bool] = True, + decode_times: bool | Mapping[str, bool] = True, decode_coords: bool | Literal["coordinates", "all"] = True, drop_variables: T_DropVariables = None, - use_cftime: bool | None = None, - decode_timedelta: bool | None = None, + use_cftime: bool | Mapping[str, bool] | None = None, + decode_timedelta: bool | Mapping[str, bool] | None = None, ) -> tuple[T_Variables, T_Attrs, set[Hashable]]: """ Decode several CF encoded variables. @@ -431,7 +441,7 @@ def stackable(dim: Hashable) -> bool: if k in drop_variables: continue stack_char_dim = ( - concat_characters + _item_or_default(concat_characters, k, True) and v.dtype == "S1" and v.ndim > 0 and stackable(v.dims[-1]) @@ -440,12 +450,12 @@ def stackable(dim: Hashable) -> bool: new_vars[k] = decode_cf_variable( k, v, - concat_characters=concat_characters, - mask_and_scale=mask_and_scale, - decode_times=decode_times, + concat_characters=_item_or_default(concat_characters, k, True), + mask_and_scale=_item_or_default(mask_and_scale, k, True), + decode_times=_item_or_default(decode_times, k, True), stack_char_dim=stack_char_dim, - use_cftime=use_cftime, - decode_timedelta=decode_timedelta, + use_cftime=_item_or_default(use_cftime, k, None), + decode_timedelta=_item_or_default(decode_timedelta, k, None), ) except Exception as e: raise type(e)(f"Failed to decode variable {k!r}: {e}") from e