From 1e020d3e6d5eed76366cdf3612e24fe424344df0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 25 Mar 2024 13:04:02 +0100 Subject: [PATCH 1/3] warn and return bytes undecoded in case of UnicodeDecodeError in h5netcdf-backend --- xarray/backends/h5netcdf_.py | 12 +++++++++--- xarray/tests/test_backends.py | 9 +++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index b7c1b2a5f03..32610b5886f 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -28,6 +28,7 @@ from xarray.core import indexing from xarray.core.utils import ( FrozenDict, + emit_user_level_warning, is_remote_uri, read_magic_number_from_file, try_read_magic_number_from_file_or_path, @@ -60,9 +61,14 @@ def _getitem(self, key): def maybe_decode_bytes(txt): if isinstance(txt, bytes): - return txt.decode("utf-8") - else: - return txt + try: + return txt.decode("utf-8") + except UnicodeDecodeError: + emit_user_level_warning( + "'utf-8' codec can't decode bytes, " "returning bytes undecoded.", + UnicodeWarning, + ) + return txt def _read_attributes(h5netcdf_var): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3fb137977e8..6f89d2e673b 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3560,6 +3560,15 @@ def test_dump_encodings_h5py(self) -> None: assert actual.x.encoding["compression"] == "lzf" assert actual.x.encoding["compression_opts"] is None + def test_decode_utf8_warning(self) -> None: + title = b"\xc3" + with create_tmp_file() as tmp_file: + with nc4.Dataset(tmp_file, "w") as f: + f.title = title + with pytest.warns(UnicodeWarning, match="returning bytes undecoded"): + ds = xr.load_dataset(tmp_file, engine="h5netcdf") + assert ds.title == title + @requires_h5netcdf @requires_netCDF4 From 9fd4f5813362de789df1e42cb210b231e17f77ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 25 Mar 
2024 13:09:21 +0100 Subject: [PATCH 2/3] add whats-new.rst entry --- doc/whats-new.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c1bfaba8756..eb17cb74a93 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -57,15 +57,17 @@ Bug fixes `CFMaskCoder`/`CFScaleOffsetCoder` (:issue:`2304`, :issue:`5597`, :issue:`7691`, :pull:`8713`, see also discussion in :pull:`7654`). By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_. -- do not cast `_FillValue`/`missing_value` in `CFMaskCoder` if `_Unsigned` is provided +- Do not cast `_FillValue`/`missing_value` in `CFMaskCoder` if `_Unsigned` is provided (:issue:`8844`, :pull:`8852`). - Adapt handling of copy keyword argument for numpy >= 2.0dev - (:issue:`8844`, :pull:`8851`, :pull:`8865``). + (:issue:`8844`, :pull:`8851`, :pull:`8865`). By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_. -- import trapz/trapezoid depending on numpy version. +- Import trapz/trapezoid depending on numpy version (:issue:`8844`, :pull:`8865`). By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_. - +- Warn and return bytes undecoded in case of UnicodeDecodeError in h5netcdf-backend + (:issue:`5563`, :pull:`8874`). + By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_. 
Documentation From e13b586d9ff075124345cd5a6a07daf5efed0196 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 25 Mar 2024 15:25:40 +0100 Subject: [PATCH 3/3] merge maybe_decode_bytes function into _read_attributes, add attribute and variable name to warning --- xarray/backends/h5netcdf_.py | 23 ++++++++++------------- xarray/tests/test_backends.py | 3 ++- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 32610b5886f..81ba37f6707 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -59,18 +59,6 @@ def _getitem(self, key): return array[key] -def maybe_decode_bytes(txt): - if isinstance(txt, bytes): - try: - return txt.decode("utf-8") - except UnicodeDecodeError: - emit_user_level_warning( - "'utf-8' codec can't decode bytes, " "returning bytes undecoded.", - UnicodeWarning, - ) - return txt - - def _read_attributes(h5netcdf_var): # GH451 # to ensure conventions decoding works properly on Python 3, decode all @@ -78,7 +66,16 @@ def _read_attributes(h5netcdf_var): attrs = {} for k, v in h5netcdf_var.attrs.items(): if k not in ["_FillValue", "missing_value"]: - v = maybe_decode_bytes(v) + if isinstance(v, bytes): + try: + v = v.decode("utf-8") + except UnicodeDecodeError: + emit_user_level_warning( + f"'utf-8' codec can't decode bytes for attribute " + f"{k!r} of h5netcdf object {h5netcdf_var.name!r}, " + f"returning bytes undecoded.", + UnicodeWarning, + ) attrs[k] = v return attrs diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 6f89d2e673b..1d69b3adc63 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3565,9 +3565,10 @@ def test_decode_utf8_warning(self) -> None: with create_tmp_file() as tmp_file: with nc4.Dataset(tmp_file, "w") as f: f.title = title - with pytest.warns(UnicodeWarning, match="returning bytes undecoded"): + with pytest.warns(UnicodeWarning, match="returning 
bytes undecoded") as w: ds = xr.load_dataset(tmp_file, engine="h5netcdf") assert ds.title == title + assert "attribute 'title' of h5netcdf object '/'" in str(w[0].message) @requires_h5netcdf