From 477cd5876ccf8d3afbf3bf6317aa1ec5881a016c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 6 Feb 2024 09:46:33 +0100 Subject: [PATCH 01/15] correctly encode/decode _FillValues --- xarray/coding/variables.py | 77 ++++++++++++++++++++++++++++------- xarray/tests/test_backends.py | 72 +++++++++++++++++--------------- 2 files changed, 102 insertions(+), 47 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index c3d57ad1903..120bff0b594 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -280,8 +280,10 @@ def encode(self, variable: Variable, name: T_Name = None): data = duck_array_ops.fillna(data, fill_value) if mv_exists: + # Use _FillValue if available to align missing_value to prevent issues + # when decoding # Ensure missing_value is cast to same dtype as data's - encoding["missing_value"] = dtype.type(mv) + encoding["missing_value"] = attrs.get("_FillValue", dtype.type(mv)) fill_value = pop_to(encoding, attrs, "missing_value", name=name) if not pd.isnull(fill_value) and not fv_exists: if is_time_like and data.dtype.kind in "iu": @@ -322,7 +324,13 @@ def decode(self, variable: Variable, name: T_Name = None): if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu": dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min else: - dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) + if "scale_factor" not in attrs and "add_offset" not in attrs: + dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) + else: + dtype, decoded_fill_value = ( + _choose_float_dtype(data.dtype, attrs), + np.nan, + ) if encoded_fill_values: transform = partial( @@ -347,23 +355,51 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp return data -def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]: +def _choose_float_dtype( + dtype: np.dtype, mapping: MutableMapping +) -> type[np.floating[Any]]: """Return a float dtype that can losslessly represent `dtype` values.""" - # Keep float32 as-is. Upcast half-precision to single-precision, + # check scale/offset first to derive wanted float dtype + # see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954 + scale_factor = mapping.get("scale_factor") + add_offset = mapping.get("add_offset") + if scale_factor is not None or add_offset is not None: + # get the type from scale_factor/add_offset to determine + # the needed floating point type + if scale_factor is not None: + scale_type = type(scale_factor) + if add_offset is not None: + offset_type = type(add_offset) + # CF conforming, both scale_factor and add-offset are given and + # of same floating point type (float32/64) + if ( + add_offset is not None + and scale_factor is not None + and offset_type == scale_type + and scale_type in [np.float32, np.float64] + ): + # in case of int32 -> we need upcast to float64 + # due to precision issues + if dtype.itemsize == 4 and np.issubdtype(dtype, np.integer): + return np.float64 + return np.dtype(scale_type).type + # Not CF conforming and add_offset given: + # A scale factor is entirely safe (vanishing into the mantissa), + # but a large integer offset could lead to loss of precision. + # Sensitivity analysis can be tricky, so we just use a float64 + # if there's any offset at all - better unoptimised than wrong! + if add_offset is not None: + return np.float64 + # return float32 in other cases where only scale_factor is given + return np.float32 + # If no scale_factor or add_offset is given, use some general rules. + # Keep float32 as-is. Upcast half-precision to single-precision, # because float16 is "intended for storage but not computation" if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating): return np.float32 # float32 can exactly represent all integers up to 24 bits if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer): - # A scale factor is entirely safe (vanishing into the mantissa), - # but a large integer offset could lead to loss of precision. - # Sensitivity analysis can be tricky, so we just use a float64 - # if there's any offset at all - better unoptimised than wrong! - if not has_offset: - return np.float32 - # For all other types and circumstances, we just use float64. - # (safe because eg. complex numbers are not supported in NetCDF) - return np.float64 + return np.float32 class CFScaleOffsetCoder(VariableCoder): @@ -377,7 +413,12 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: dims, data, attrs, encoding = unpack_for_encoding(variable) if "scale_factor" in encoding or "add_offset" in encoding: - dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) + # if we have a _FillValue/masked_value we do not want to cast now + # but leave that to CFMaskCoder + dtype = data.dtype + if "_FillValue" not in encoding and "missing_value" not in encoding: + dtype = _choose_float_dtype(data.dtype, encoding) + # but still we need a copy prevent changing original data data = duck_array_ops.astype(data, dtype=dtype, copy=True) if "add_offset" in encoding: data -= pop_to(encoding, attrs, "add_offset", name=name) @@ -393,11 +434,17 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: scale_factor = pop_to(attrs, encoding, "scale_factor", name=name) add_offset = pop_to(attrs, encoding, "add_offset", name=name) - dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) if np.ndim(scale_factor) > 0: scale_factor = np.asarray(scale_factor).item() if np.ndim(add_offset) > 0: add_offset = np.asarray(add_offset).item() + # if we have a _FillValue/masked_value we already have the wanted + # floating point dtype here (via CFMaskCoder), so no check is necessary + # only check in other cases + dtype = data.dtype + if "_FillValue" not in encoding and "missing_value" not in encoding: + dtype = _choose_float_dtype(dtype, encoding) + transform = partial( _scale_offset_decoding, scale_factor=scale_factor, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 0eaa7ee7361..5e2ae8e4804 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -142,96 +142,100 @@ def open_example_mfdataset(names, *args, **kwargs) -> Dataset: ) -def create_masked_and_scaled_data() -> Dataset: - x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32) +def create_masked_and_scaled_data(dtype: np.dtype) -> Dataset: + x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype) encoding = { "_FillValue": -1, - "add_offset": 10, - "scale_factor": np.float32(0.1), + "add_offset": dtype.type(10), + "scale_factor": dtype.type(0.1), "dtype": "i2", } return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_masked_and_scaled_data() -> Dataset: - attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": np.float32(0.1)} +def create_encoded_masked_and_scaled_data(dtype: np.dtype) -> Dataset: + attributes = { + "_FillValue": -1, + "add_offset": dtype.type(10), + "scale_factor": dtype.type(0.1), + } return Dataset( {"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)} ) -def create_unsigned_masked_scaled_data() -> Dataset: +def create_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset: encoding = { "_FillValue": 255, "_Unsigned": "true", "dtype": "i1", - "add_offset": 10, - "scale_factor": np.float32(0.1), + "add_offset": dtype.type(10), + "scale_factor": dtype.type(0.1), } - x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32) + x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype) return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_unsigned_masked_scaled_data() -> Dataset: +def create_encoded_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. attributes = { "_FillValue": -1, "_Unsigned": "true", - "add_offset": 10, - "scale_factor": np.float32(0.1), + "add_offset": dtype.type(10), + "scale_factor": dtype.type(0.1), } # Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([0, 1, 127, -128, -1], dtype="i1") return Dataset({"x": ("t", sb, attributes)}) -def create_bad_unsigned_masked_scaled_data() -> Dataset: +def create_bad_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset: encoding = { "_FillValue": 255, "_Unsigned": True, "dtype": "i1", - "add_offset": 10, - "scale_factor": np.float32(0.1), + "add_offset": dtype.type(10), + "scale_factor": dtype.type(0.1), } - x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32) + x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype) return Dataset({"x": ("t", x, {}, encoding)}) -def create_bad_encoded_unsigned_masked_scaled_data() -> Dataset: +def create_bad_encoded_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. attributes = { "_FillValue": -1, "_Unsigned": True, - "add_offset": 10, - "scale_factor": np.float32(0.1), + "add_offset": dtype.type(10), + "scale_factor": dtype.type(0.1), } # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([0, 1, 127, -128, -1], dtype="i1") return Dataset({"x": ("t", sb, attributes)}) -def create_signed_masked_scaled_data() -> Dataset: +def create_signed_masked_scaled_data(dtype: np.dtype) -> Dataset: encoding = { "_FillValue": -127, "_Unsigned": "false", "dtype": "i1", - "add_offset": 10, - "scale_factor": np.float32(0.1), + "add_offset": dtype.type(10), + "scale_factor": dtype.type(0.1), } - x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=np.float32) + x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype) return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_signed_masked_scaled_data() -> Dataset: +def create_encoded_signed_masked_scaled_data(dtype: np.dtype) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. attributes = { "_FillValue": -127, "_Unsigned": "false", - "add_offset": 10, - "scale_factor": np.float32(0.1), + "add_offset": dtype.type(10), + "scale_factor": dtype.type(0.1), } # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([-110, 1, 127, -127], dtype="i1") @@ -889,9 +893,10 @@ def test_roundtrip_empty_vlen_string_array(self) -> None: (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data), ], ) - def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None: - decoded = decoded_fn() - encoded = encoded_fn() + @pytest.mark.parametrize("dtype", [np.dtype("float64"), np.dtype("float32")]) + def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: + decoded = decoded_fn(dtype) + encoded = encoded_fn(dtype) with self.roundtrip(decoded) as actual: for k in decoded.variables: @@ -912,7 +917,7 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None: # make sure roundtrip encoding didn't change the # original dataset. - assert_allclose(encoded, encoded_fn(), decode_bytes=False) + assert_allclose(encoded, encoded_fn(dtype), decode_bytes=False) with self.roundtrip(encoded) as actual: for k in decoded.variables: @@ -1645,6 +1650,7 @@ def test_mask_and_scale(self) -> None: v.add_offset = 10 v.scale_factor = 0.1 v[:] = np.array([-1, -1, 0, 1, 2]) + dtype = type(v.scale_factor) # first make sure netCDF4 reads the masked and scaled data # correctly @@ -1653,11 +1659,13 @@ def test_mask_and_scale(self) -> None: [-1, -1, 10, 10.1, 10.2], mask=[True, True, False, False, False] ) actual = nc.variables["x"][:] + print(actual.dtype, expected.dtype) assert_array_equal(expected, actual) # now check xarray with open_dataset(tmp_file) as ds: - expected = create_masked_and_scaled_data() + expected = create_masked_and_scaled_data(np.dtype(dtype)) + print(ds.variables["x"].dtype, expected.variables["x"].dtype) assert_identical(expected, ds) def test_0dimensional_variable(self) -> None: From 792e9429c56cf55ed5cebfd05f50db14ea5bbb30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 6 Feb 2024 10:05:43 +0100 Subject: [PATCH 02/15] fix mypy --- xarray/coding/variables.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 120bff0b594..4ea7cb8c0e6 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -400,6 +400,9 @@ def _choose_float_dtype( # float32 can exactly represent all integers up to 24 bits if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer): return np.float32 + # For all other types and circumstances, we just use float64. + # (safe because eg. complex numbers are not supported in NetCDF) + return np.float64 class CFScaleOffsetCoder(VariableCoder): From f743fd1df3c92c47c6eecd5a9e1a2d6ebd3b14ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 6 Feb 2024 11:14:27 +0100 Subject: [PATCH 03/15] fix CFMaskCode test --- xarray/tests/test_coding.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index f7579c4b488..dce65dbe11c 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -51,9 +51,8 @@ def test_CFMaskCoder_encode_missing_fill_values_conflict(data, encoding) -> None assert encoded.dtype == encoded.attrs["missing_value"].dtype assert encoded.dtype == encoded.attrs["_FillValue"].dtype - with pytest.warns(variables.SerializationWarning): - roundtripped = decode_cf_variable("foo", encoded) - assert_identical(roundtripped, original) + roundtripped = decode_cf_variable("foo", encoded) + assert_identical(roundtripped, original) def test_CFMaskCoder_missing_value() -> None: From 551719a2cde48477a84a695c45b703eb2327dfa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 6 Feb 2024 12:57:31 +0100 Subject: [PATCH 04/15] fix scale/offset --- xarray/coding/variables.py | 4 ++-- xarray/tests/test_coding.py | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 4ea7cb8c0e6..7c1ad25b778 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -390,8 +390,8 @@ def _choose_float_dtype( # if there's any offset at all - better unoptimised than wrong! if add_offset is not None: return np.float64 - # return float32 in other cases where only scale_factor is given - return np.float32 + # return dtype depending on given scale_factor + return np.dtype(scale_type).type # If no scale_factor or add_offset is given, use some general rules. # Keep float32 as-is. Upcast half-precision to single-precision, # because float16 is "intended for storage but not computation" diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index dce65dbe11c..6d81d6f5dc8 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -95,16 +95,18 @@ def test_coder_roundtrip() -> None: @pytest.mark.parametrize("dtype", "u1 u2 i1 i2 f2 f4".split()) -def test_scaling_converts_to_float32(dtype) -> None: +@pytest.mark.parametrize("dtype2", "f4 f8".split()) +def test_scaling_converts_to_float(dtype: str, dtype2: str) -> None: + dt = np.dtype(dtype2) original = xr.Variable( - ("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=10) + ("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=dt.type(10)) ) coder = variables.CFScaleOffsetCoder() encoded = coder.encode(original) - assert encoded.dtype == np.float32 + assert encoded.dtype == dt roundtripped = coder.decode(encoded) assert_identical(original, roundtripped) - assert roundtripped.dtype == np.float32 + assert roundtripped.dtype == dt @pytest.mark.parametrize("scale_factor", (10, [10])) From 85730e2e618d1e563a20796cb6fc3d9b2f9427fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 6 Feb 2024 13:49:31 +0100 Subject: [PATCH 05/15] avert zarr issue --- xarray/tests/test_backends.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 5e2ae8e4804..e5346d5ed5a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -895,9 +895,10 @@ def test_roundtrip_empty_vlen_string_array(self) -> None: ) @pytest.mark.parametrize("dtype", [np.dtype("float64"), np.dtype("float32")]) def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: + if hasattr(self, "zarr_version") and dtype == np.float32: + pytest.skip("float32 will be treated as float64 in zarr") decoded = decoded_fn(dtype) encoded = encoded_fn(dtype) - with self.roundtrip(decoded) as actual: for k in decoded.variables: assert decoded.variables[k].dtype == actual.variables[k].dtype From 5fe874fc1791184f8ac602d7e2eef251cc957fc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 6 Feb 2024 15:47:03 +0100 Subject: [PATCH 06/15] add whats-new.rst entry --- doc/whats-new.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1b0f2f18efb..489d8d29bcd 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -66,6 +66,10 @@ Bug fixes lead to integer overflow or unsafe conversion from floating point to integer values (:issue:`8542`, :pull:`8575`). By `Spencer Clark `_. +- CF conform handling of `_FillValue`/`missing_value` and `dtype` in + `CFMaskCoder`/`CFScaleOffsetCoder` (:issue:`2304`, :issue:`5597`, + :issue:`7691`, :pull:`8713`, see also discussion in :pull:`7654`). + By `Kai Mühlbauer `_. Documentation ~~~~~~~~~~~~~ From f8488728dba64f2e0aada8842a904b7d618c99b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 7 Feb 2024 12:23:34 +0100 Subject: [PATCH 07/15] refactor fillvalue/missing value check to catch non-conforming values, apply review suggestions --- xarray/coding/variables.py | 133 +++++++++++++++++++--------------- xarray/tests/test_backends.py | 2 - 2 files changed, 76 insertions(+), 59 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 7c1ad25b778..136224f8a17 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -243,6 +243,44 @@ def _is_time_like(units): return any(tstr == units for tstr in time_strings) +def _check_fill_values(attrs, name, dtype): + """ "Check _FillValue and missing_value if available. + + Return dictionary with raw fill values and set with encoded fill values. + + Issue SerializationWarning if appropriate. + """ + raw_fill_dict = {} + [ + pop_to(attrs, raw_fill_dict, attr, name=name) + for attr in ("missing_value", "_FillValue") + ] + encoded_fill_values = set() + for k in list(raw_fill_dict): + v = raw_fill_dict[k] + kfill = {fv for fv in np.ravel(v) if not pd.isnull(fv)} + if not kfill and np.issubdtype(dtype, np.integer): + warnings.warn( + f"variable {name!r} has non-conforming {k!r} " + f"{v!r} defined, dropping {k!r} entirely.", + SerializationWarning, + stacklevel=3, + ) + del raw_fill_dict[k] + else: + encoded_fill_values |= kfill + + if len(encoded_fill_values) > 1: + warnings.warn( + f"variable {name!r} has multiple fill values " + f"{encoded_fill_values} defined, decoding all values to NaN.", + SerializationWarning, + stacklevel=3, + ) + + return raw_fill_dict, encoded_fill_values + + class CFMaskCoder(VariableCoder): """Mask or unmask fill values according to CF conventions.""" @@ -264,75 +302,56 @@ def encode(self, variable: Variable, name: T_Name = None): f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data." ) - # special case DateTime to properly handle NaT - is_time_like = _is_time_like(attrs.get("units")) - if fv_exists: # Ensure _FillValue is cast to same dtype as data's encoding["_FillValue"] = dtype.type(fv) fill_value = pop_to(encoding, attrs, "_FillValue", name=name) - if not pd.isnull(fill_value): - if is_time_like and data.dtype.kind in "iu": - data = duck_array_ops.where( - data != np.iinfo(np.int64).min, data, fill_value - ) - else: - data = duck_array_ops.fillna(data, fill_value) if mv_exists: - # Use _FillValue if available to align missing_value to prevent issues - # when decoding - # Ensure missing_value is cast to same dtype as data's + # try to use _FillValue, if it exists to align both values + # or use missing_value and ensure it's cast to same dtype as data's encoding["missing_value"] = attrs.get("_FillValue", dtype.type(mv)) fill_value = pop_to(encoding, attrs, "missing_value", name=name) - if not pd.isnull(fill_value) and not fv_exists: - if is_time_like and data.dtype.kind in "iu": - data = duck_array_ops.where( - data != np.iinfo(np.int64).min, data, fill_value - ) - else: - data = duck_array_ops.fillna(data, fill_value) + + # apply fillna + if not pd.isnull(fill_value): + # special case DateTime to properly handle NaT + if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu": + data = duck_array_ops.where( + data != np.iinfo(np.int64).min, data, fill_value + ) + else: + data = duck_array_ops.fillna(data, fill_value) return Variable(dims, data, attrs, encoding, fastpath=True) def decode(self, variable: Variable, name: T_Name = None): - dims, data, attrs, encoding = unpack_for_decoding(variable) + raw_fill_dict, encoded_fill_values = _check_fill_values( + variable.attrs, name, variable.dtype + ) - raw_fill_values = [ - pop_to(attrs, encoding, attr, name=name) - for attr in ("missing_value", "_FillValue") - ] - if raw_fill_values: - encoded_fill_values = { - fv - for option in raw_fill_values - for fv in np.ravel(option) - if not pd.isnull(fv) - } - - if len(encoded_fill_values) > 1: - warnings.warn( - "variable {!r} has multiple fill values {}, " - "decoding all values to NaN.".format(name, encoded_fill_values), - SerializationWarning, - stacklevel=3, - ) + if raw_fill_dict: + dims, data, attrs, encoding = unpack_for_decoding(variable) + [ + safe_setitem(encoding, attr, value, name=name) + for attr, value in raw_fill_dict.items() + ] - # special case DateTime to properly handle NaT - dtype: np.typing.DTypeLike - decoded_fill_value: Any - if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu": - dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min - else: - if "scale_factor" not in attrs and "add_offset" not in attrs: - dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) + if encoded_fill_values: + # special case DateTime to properly handle NaT + dtype: np.typing.DTypeLike + decoded_fill_value: Any + if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu": + dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min else: - dtype, decoded_fill_value = ( - _choose_float_dtype(data.dtype, attrs), - np.nan, - ) + if "scale_factor" not in attrs and "add_offset" not in attrs: + dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) + else: + dtype, decoded_fill_value = ( + _choose_float_dtype(data.dtype, attrs), + np.nan, + ) - if encoded_fill_values: transform = partial( _apply_mask, encoded_fill_values=encoded_fill_values, @@ -367,9 +386,9 @@ def _choose_float_dtype( # get the type from scale_factor/add_offset to determine # the needed floating point type if scale_factor is not None: - scale_type = type(scale_factor) + scale_type = np.dtype(type(scale_factor)) if add_offset is not None: - offset_type = type(add_offset) + offset_type = np.dtype(type(add_offset)) # CF conforming, both scale_factor and add-offset are given and # of same floating point type (float32/64) if ( @@ -382,7 +401,7 @@ def _choose_float_dtype( # due to precision issues if dtype.itemsize == 4 and np.issubdtype(dtype, np.integer): return np.float64 - return np.dtype(scale_type).type + return scale_type # Not CF conforming and add_offset given: # A scale factor is entirely safe (vanishing into the mantissa), # but a large integer offset could lead to loss of precision. @@ -391,7 +410,7 @@ def _choose_float_dtype( if add_offset is not None: return np.float64 # return dtype depending on given scale_factor - return np.dtype(scale_type).type + return scale_type # If no scale_factor or add_offset is given, use some general rules. # Keep float32 as-is. Upcast half-precision to single-precision, # because float16 is "intended for storage but not computation" diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index e5346d5ed5a..2251821d82a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1660,13 +1660,11 @@ def test_mask_and_scale(self) -> None: [-1, -1, 10, 10.1, 10.2], mask=[True, True, False, False, False] ) actual = nc.variables["x"][:] - print(actual.dtype, expected.dtype) assert_array_equal(expected, actual) # now check xarray with open_dataset(tmp_file) as ds: expected = create_masked_and_scaled_data(np.dtype(dtype)) - print(ds.variables["x"].dtype, expected.variables["x"].dtype) assert_identical(expected, ds) def test_0dimensional_variable(self) -> None: From 672e84e404d6242f119d39d60dd56f082ac2b5b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 7 Feb 2024 12:32:52 +0100 Subject: [PATCH 08/15] fix typing --- xarray/coding/variables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 136224f8a17..8bd08db92f0 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -401,7 +401,7 @@ def _choose_float_dtype( # due to precision issues if dtype.itemsize == 4 and np.issubdtype(dtype, np.integer): return np.float64 - return scale_type + return scale_type.type # Not CF conforming and add_offset given: # A scale factor is entirely safe (vanishing into the mantissa), # but a large integer offset could lead to loss of precision. @@ -410,7 +410,7 @@ def _choose_float_dtype( if add_offset is not None: return np.float64 # return dtype depending on given scale_factor - return scale_type + return scale_type.type # If no scale_factor or add_offset is given, use some general rules. # Keep float32 as-is. Upcast half-precision to single-precision, # because float16 is "intended for storage but not computation" From bff3a5d2db19413bef7ba4f817e95d275b074458 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 7 Feb 2024 12:52:06 +0100 Subject: [PATCH 09/15] suppress warning in doc --- doc/user-guide/indexing.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/user-guide/indexing.rst b/doc/user-guide/indexing.rst index 90b7cbaf2a9..fba9dd585ab 100644 --- a/doc/user-guide/indexing.rst +++ b/doc/user-guide/indexing.rst @@ -549,6 +549,7 @@ __ https://numpy.org/doc/stable/user/basics.indexing.html#assigning-values-to-in You can also assign values to all variables of a :py:class:`Dataset` at once: .. ipython:: python + :okwarning: ds_org = xr.tutorial.open_dataset("eraint_uvz").isel( latitude=slice(56, 59), longitude=slice(255, 258), level=0 From f90e7e0fe26ea5a3123b07581811f627be19b271 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 15 Mar 2024 08:20:58 +0100 Subject: [PATCH 10/15] FIX: silence SerializationWarnings --- xarray/tests/test_conventions.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 91a8e368de5..c83b74408c3 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -63,7 +63,25 @@ def test_decode_cf_with_conflicting_fill_missing_value() -> None: np.arange(10), {"units": "foobar", "missing_value": np.nan, "_FillValue": np.nan}, ) - actual = conventions.decode_cf_variable("t", var) + + expected_warnings = { + ( + SerializationWarning, + "variable 't' has non-conforming 'missing_value' nan defined, " + "dropping 'missing_value' entirely.", + ), + ( + SerializationWarning, + "variable 't' has non-conforming '_FillValue' nan defined, " + "dropping '_FillValue' entirely.", + ), + } + + with pytest.warns(Warning) as winfo: + actual = conventions.decode_cf_variable("t", var) + actual_warnings = {(warn.category, warn.message.args[0]) for warn in winfo} + assert actual_warnings == expected_warnings + assert_identical(actual, expected) var = Variable( @@ -75,7 +93,10 @@ def test_decode_cf_with_conflicting_fill_missing_value() -> None: "_FillValue": np.float32(np.nan), }, ) - actual = conventions.decode_cf_variable("t", var) + with pytest.warns(Warning) as winfo: + actual = conventions.decode_cf_variable("t", var) + actual_warnings = {(warn.category, warn.message.args[0]) for warn in winfo} + assert actual_warnings == expected_warnings assert_identical(actual, expected) @@ -122,6 +143,8 @@ def test_missing_fillvalue(self) -> None: with pytest.warns(Warning, match="floating point data as an integer"): conventions.encode_cf_variable(v) + conventions.encode_cf_variable(v) + def test_multidimensional_coordinates(self) -> None: # regression test for GH1763 # Set up test case with coordinates that have overlapping (but not From da2222e894e8508f2b9b1a0e69ddb24e62a57904 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 15 Mar 2024 08:38:42 +0100 Subject: [PATCH 11/15] FIX: silence mypy by casting to string early --- xarray/tests/test_conventions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index c83b74408c3..e4584b4d0c1 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -79,7 +79,7 @@ def test_decode_cf_with_conflicting_fill_missing_value() -> None: with pytest.warns(Warning) as winfo: actual = conventions.decode_cf_variable("t", var) - actual_warnings = {(warn.category, warn.message.args[0]) for warn in winfo} + actual_warnings = {(warn.category, str(warn.message)) for warn in winfo} assert actual_warnings == expected_warnings assert_identical(actual, expected) @@ -95,7 +95,7 @@ def test_decode_cf_with_conflicting_fill_missing_value() -> None: ) with pytest.warns(Warning) as winfo: actual = conventions.decode_cf_variable("t", var) - actual_warnings = {(warn.category, warn.message.args[0]) for warn in winfo} + actual_warnings = {(warn.category, str(warn.message)) for warn in winfo} assert actual_warnings == expected_warnings assert_identical(actual, expected) From f6fe9cb863251bc43d873f66836c21a4b6271c4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 15 Mar 2024 16:50:32 +0100 Subject: [PATCH 12/15] Update xarray/tests/test_conventions.py Co-authored-by: Deepak Cherian --- xarray/tests/test_conventions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index e4584b4d0c1..b46824c7321 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -143,7 +143,6 @@ def test_missing_fillvalue(self) -> None: with pytest.warns(Warning, match="floating point data as an integer"): conventions.encode_cf_variable(v) - conventions.encode_cf_variable(v) def test_multidimensional_coordinates(self) -> None: # regression test for GH1763 From 6b30298f5d61a5bebc11953c82f8736d408f9fa4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 15 Mar 2024 15:51:03 +0000 Subject: [PATCH 13/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_conventions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index b46824c7321..38161563029 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -143,7 +143,6 @@ def test_missing_fillvalue(self) -> None: with pytest.warns(Warning, match="floating point data as an integer"): conventions.encode_cf_variable(v) - def test_multidimensional_coordinates(self) -> None: # regression test for GH1763 # Set up test case with coordinates that have overlapping (but not From 1d1b1a0ad0f710aef7f4927895f94dab8cb7082d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 15 Mar 2024 17:06:08 +0100 Subject: [PATCH 14/15] Shorten test, add comment checking for two warnings --- xarray/tests/test_conventions.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 38161563029..a1293b4da93 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -64,23 +64,12 @@ def test_decode_cf_with_conflicting_fill_missing_value() -> None: {"units": "foobar", "missing_value": np.nan, "_FillValue": np.nan}, ) - expected_warnings = { - ( - SerializationWarning, - "variable 't' has non-conforming 'missing_value' nan defined, " - "dropping 'missing_value' entirely.", - ), - ( - SerializationWarning, - "variable 't' has non-conforming '_FillValue' nan defined, " - "dropping '_FillValue' entirely.", - ), - } - + # the following code issues two warnings, so we need to check for both with pytest.warns(Warning) as winfo: actual = conventions.decode_cf_variable("t", var) - actual_warnings = {(warn.category, str(warn.message)) for warn in winfo} - assert actual_warnings == expected_warnings + for aw in winfo: + assert aw.category == SerializationWarning + assert "non-conforming" in str(aw.message) assert_identical(actual, expected) @@ -93,10 +82,13 @@ def test_decode_cf_with_conflicting_fill_missing_value() -> None: "_FillValue": np.float32(np.nan), }, ) + + # the following code issues two warnings, so we need to check for both with pytest.warns(Warning) as winfo: actual = conventions.decode_cf_variable("t", var) - actual_warnings = {(warn.category, str(warn.message)) for warn in winfo} - assert actual_warnings == expected_warnings + for aw in winfo: + assert aw.category == SerializationWarning + assert "non-conforming" in str(aw.message) assert_identical(actual, expected) From 30985fa0ded7109ca55a1424a509d8c96c459317 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 15 Mar 2024 17:11:20 +0100 Subject: [PATCH 15/15] make test even shorter --- xarray/tests/test_conventions.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index a1293b4da93..fdfea3c3fe8 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -65,10 +65,9 @@ def test_decode_cf_with_conflicting_fill_missing_value() -> None: ) # the following code issues two warnings, so we need to check for both - with pytest.warns(Warning) as winfo: + with pytest.warns(SerializationWarning) as winfo: actual = conventions.decode_cf_variable("t", var) for aw in winfo: - assert aw.category == SerializationWarning assert "non-conforming" in str(aw.message) assert_identical(actual, expected) @@ -84,10 +83,9 @@ def test_decode_cf_with_conflicting_fill_missing_value() -> None: ) # the following code issues two warnings, so we need to check for both - with pytest.warns(Warning) as winfo: + with pytest.warns(SerializationWarning) as winfo: actual = conventions.decode_cf_variable("t", var) for aw in winfo: - assert aw.category == SerializationWarning assert "non-conforming" in str(aw.message) assert_identical(actual, expected)