Add support for netCDF4.EnumType #8147

Merged
merged 45 commits on Jan 17, 2024
Changes from 33 commits
Commits (45)
f1bc33b
ENH: make a light refactoring
Sep 12, 2023
4da8938
dirty commit
Sep 13, 2023
ab53970
Clean
Sep 13, 2023
75e00c7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 13, 2023
e1d51e3
wip: fix tests
Sep 13, 2023
95e30b2
dirty
Sep 15, 2023
a3160c5
clean
Sep 15, 2023
59ef686
Remove dict from valid attrs type
Sep 15, 2023
d135be2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 15, 2023
8c12e50
Fix encoding
Sep 15, 2023
e8f4872
FIX: ordering of flags
Sep 15, 2023
f481e1f
FIX: encoding of the same enum twice (or more).
Sep 18, 2023
951ea32
DOC: Add note for Enum on to_netcdf
Sep 22, 2023
ec3c90a
ENH: Raise explicit error on invalid variable
Nov 8, 2023
dcc1254
Merge remote-tracking branch 'xarray-origin/main' into enh/add-enum-s…
Nov 8, 2023
55927f1
DOC: Update whats-new
Nov 8, 2023
9e9c62c
fix: move enum check
Nov 8, 2023
5f1bffc
FIX: unit test for min-all-deps requirements
Nov 9, 2023
5189c74
Merge remote-tracking branch 'origin/main' into enh/add-enum-support
Nov 10, 2023
2410c2e
ENH: Add enum discovery
Nov 10, 2023
4b966ba
ENH: Raise error instead of modifying dataset
Nov 10, 2023
9273a1d
Merge remote-tracking branch 'origin/main' into enh/add-enum-support
Dec 11, 2023
cbfadad
Merge remote-tracking branch 'origin/main' into enh/add-enum-support
Jan 5, 2024
ca043a7
FIX: pop unnecessary encoding
Jan 5, 2024
ee3dc00
Add Enum Coder
Jan 8, 2024
892b2b6
Merge branch 'main' into enh/add-enum-support
kmuehlbauer Jan 9, 2024
9ab1ad1
DOC: Update what's new
Jan 9, 2024
7219b99
FIX: Use EnumMeta instead of EnumType fo py<3.11
Jan 9, 2024
2aa119f
ENH: Improve error message
Jan 9, 2024
da43a10
Remove unnecessary test
Jan 9, 2024
096f021
Update enum Coder
Jan 9, 2024
26bb8ce
ENH: Update error handling of decoding
Jan 9, 2024
d21d73a
ENH: Avoid encoding enum to CF
Jan 9, 2024
81a4bec
ENH: encode netcdf4 enum within dtype
Jan 10, 2024
b114ccc
MAINT: Remove CF flag_* encoding
Jan 10, 2024
6376a13
Add assertion after roundtrip in enum tests
Jan 10, 2024
89a8751
add NativeEnumCoder, adapt tests
kmuehlbauer Jan 11, 2024
ac20a40
remove test-file
kmuehlbauer Jan 11, 2024
d515e0d
restructure datatype extraction
kmuehlbauer Jan 11, 2024
5c66563
use invalid_netcdf for h5netcdf tests
kmuehlbauer Jan 11, 2024
d62ac29
FIX: encoding typing
Jan 11, 2024
f834ede
Update xarray/backends/netCDF4_.py
kmuehlbauer Jan 14, 2024
9a3980a
Merge branch 'main' into enh/add-enum-support
kmuehlbauer Jan 15, 2024
2a3103f
Merge branch 'main' into enh/add-enum-support
kmuehlbauer Jan 16, 2024
f22046d
Merge branch 'main' into enh/add-enum-support
kmuehlbauer Jan 17, 2024
4 changes: 4 additions & 0 deletions doc/whats-new.rst
@@ -221,6 +221,10 @@ New Features

- Use `opt_einsum <https://optimized-einsum.readthedocs.io/en/stable/>`_ for :py:func:`xarray.dot` by default if installed.
By `Deepak Cherian <https://github.com/dcherian>`_. (:issue:`7764`, :pull:`8373`).
- Open netCDF4 enums and turn them into CF ``flag_meanings``/``flag_values`` attributes.
This also gives a special meaning to the ``'enum'`` attribute on DataArrays: when it is set, the netCDF4 backend
turns ``flag_meanings`` and ``flag_values`` back into a netCDF4 Enum named after the content of ``attrs["enum"]``.
By `Abel Aoun <https://github.com/bzah>`_. (:issue:`8144`, :pull:`8147`)
- Add ``DataArray.dt.total_seconds()`` method to match the Pandas API. (:pull:`8435`).
By `Ben Mares <https://github.com/maresb>`_.
- Allow passing ``region="auto"`` in :py:meth:`Dataset.to_zarr` to automatically infer the
Binary file added toto.nc
Binary file not shown.
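As a usage illustration of the whats-new entry above, here is a hedged sketch of the write/read path as the code stands at this commit. The file name, variable name, and enum members are made up, and the exact convention for attrs["enum"] (a name string vs. a Python Enum) was still being discussed in review.

    # Hedged sketch only: names and values are illustrative, and the attrs["enum"]
    # convention was still in flux in this PR. A Python Enum in attrs asks the
    # netCDF4 backend to create a matching netCDF4.EnumType when writing.
    from enum import Enum

    import numpy as np
    import xarray as xr

    CloudType = Enum("cloud_type_t", {"clear": 0, "cumulus": 1, "stratus": 2})

    da = xr.DataArray(
        np.array([0, 1, 2, 1], dtype="uint8"),
        dims="time",
        name="cloud_type",
        attrs={"enum": CloudType},
    )
    da.to_dataset().to_netcdf("clouds.nc", engine="netcdf4")

    # Reading the file back with this PR exposes the enum again on the variable.
    ds = xr.open_dataset("clouds.nc", engine="netcdf4")
    print(ds["cloud_type"].attrs.get("enum"))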
7 changes: 6 additions & 1 deletion xarray/backends/api.py
@@ -2,6 +2,7 @@

import os
from collections.abc import Hashable, Iterable, Mapping, MutableMapping, Sequence
from enum import EnumMeta
from functools import partial
from io import BytesIO
from numbers import Number
@@ -172,7 +173,7 @@ def _validate_attrs(dataset, invalid_netcdf=False):
`invalid_netcdf=True`.
"""

valid_types = (str, Number, np.ndarray, np.number, list, tuple)
valid_types = (str, Number, np.ndarray, np.number, list, tuple, EnumMeta)
if invalid_netcdf:
valid_types += (np.bool_,)

@@ -407,6 +408,7 @@ def open_dataset(
chunked_array_type: str | None = None,
from_array_kwargs: dict[str, Any] | None = None,
backend_kwargs: dict[str, Any] | None = None,
decode_enum: bool | None = None,
**kwargs,
) -> Dataset:
"""Open and decode a dataset from a file or file-like object.
@@ -512,6 +514,8 @@
backend_kwargs: dict
Additional keyword arguments passed on to the engine open function,
equivalent to `**kwargs`.
decode_enum: bool, optional
If True, decode CF flag_values and flag_meanings into a Python Enum.
**kwargs: dict
Additional keyword arguments passed on to the engine open function.
For example:
@@ -566,6 +570,7 @@
concat_characters=concat_characters,
use_cftime=use_cftime,
decode_coords=decode_coords,
decode_enum=decode_enum,
)

overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
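A minimal sketch of the new decode_enum switch threaded through open_dataset above; the file name is illustrative and the default value and exact behaviour were still under review in this PR.

    import xarray as xr

    # If True, CF flag_values/flag_meanings are decoded into a Python Enum placed
    # in the variable's attrs["enum"]; omit or pass False to keep the raw flags.
    ds = xr.open_dataset("clouds.nc", engine="netcdf4", decode_enum=True)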
60 changes: 46 additions & 14 deletions xarray/backends/netCDF4_.py
@@ -5,6 +5,7 @@
import os
from collections.abc import Iterable
from contextlib import suppress
from enum import Enum
from typing import TYPE_CHECKING, Any

import numpy as np
@@ -49,7 +50,6 @@
# string used by netCDF4.
_endian_lookup = {"=": "native", ">": "big", "<": "little", "|": "native"}


NETCDF4_PYTHON_LOCK = combine_locks([NETCDFC_LOCK, HDF5_LOCK])


@@ -234,13 +234,13 @@ def _force_native_endianness(var):


def _extract_nc4_variable_encoding(
variable,
variable: Variable,
raise_on_invalid=False,
lsd_okay=True,
h5py_okay=False,
backend="netCDF4",
unlimited_dims=None,
):
) -> dict[str, Any]:
if unlimited_dims is None:
unlimited_dims = ()

@@ -257,6 +257,7 @@ def _extract_nc4_variable_encoding(
"_FillValue",
"dtype",
"compression",
"enum",
"significant_digits",
"quantize_mode",
"blosc_shuffle",
@@ -308,7 +309,7 @@
return encoding


def _is_list_of_strings(value):
def _is_list_of_strings(value) -> bool:
arr = np.asarray(value)
return arr.dtype.kind in ["U", "S"] and arr.size > 1

@@ -414,10 +415,14 @@ def _acquire(self, needs_lock=True):
def ds(self):
return self._acquire()

def open_store_variable(self, name, var):
def open_store_variable(self, name: str, var):
import netCDF4

dimensions = var.dimensions
data = indexing.LazilyIndexedArray(NetCDF4ArrayWrapper(name, self))
attributes = {k: var.getncattr(k) for k in var.ncattrs()}
data = indexing.LazilyIndexedArray(NetCDF4ArrayWrapper(name, self))
if isinstance(var.datatype, netCDF4.EnumType):
attributes["enum"] = Enum(var.datatype.name, var.datatype.enum_dict)
_ensure_fill_value_valid(data, attributes)
# netCDF4 specific encoding; save _FillValue for later
encoding = {}
@@ -485,21 +490,20 @@ def encode_variable(self, variable):
return variable

def prepare_variable(
self, name, variable, check_encoding=False, unlimited_dims=None
self, name, variable: Variable, check_encoding=False, unlimited_dims=None
):
_ensure_no_forward_slash_in_name(name)

datatype = _get_datatype(
variable, self.format, raise_on_invalid_encoding=check_encoding
)
attrs = variable.attrs.copy()

fill_value = attrs.pop("_FillValue", None)

if attrs.get("enum"):
datatype = self._build_and_get_enum(name, attrs, variable.dtype)
else:
datatype = _get_datatype(
variable, self.format, raise_on_invalid_encoding=check_encoding
)
encoding = _extract_nc4_variable_encoding(
variable, raise_on_invalid=check_encoding, unlimited_dims=unlimited_dims
)

if name in self.ds.variables:
nc4_var = self.ds.variables[name]
else:
@@ -527,6 +531,32 @@ def prepare_variable(

return target, variable.data

def _build_and_get_enum(
self, var_name: str, attributes: dict, dtype: np.dtype
) -> object:
enum = attributes.pop("enum")
enum_dict = {e.name: e.value for e in enum}
enum_name = enum.__name__
if enum_name in self.ds.enumtypes:
datatype = self.ds.enumtypes[enum_name]
if datatype.enum_dict != enum_dict:
error_msg = (
f"Cannot save variable `{var_name}` because an enum"
f" `{enum_name}` already exists in the Dataset but have"
" a different definition. To fix this error, make sure"
" each variable have a unique name in `attrs['enum']`"
" or, if they should share same enum type, make sure"
" the enums are identical."
)
raise ValueError(error_msg)
else:
datatype = self.ds.createEnumType(
dtype,
enum_name,
enum_dict,
)
return datatype

def sync(self):
self.ds.sync()

@@ -597,6 +627,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti
persist=False,
lock=None,
autoclose=False,
decode_enum: bool | None = None,
) -> Dataset:
filename_or_obj = _normalize_path(filename_or_obj)
store = NetCDF4DataStore.open(
@@ -622,6 +653,7 @@
drop_variables=drop_variables,
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
decode_enum=decode_enum,
)
return ds

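For context on what open_store_variable and _build_and_get_enum interact with, here is a small sketch using the netCDF4-python API directly; the file, type, and member names are made up.

    import netCDF4
    import numpy as np

    with netCDF4.Dataset("enum_demo.nc", mode="w") as nc:
        # One EnumType per file/group, reused by any variable that needs it.
        cloud_t = nc.createEnumType(
            np.uint8, "cloud_type_t", {"clear": 0, "cumulus": 1, "stratus": 2}
        )
        nc.createDimension("time", 3)
        var = nc.createVariable("cloud_type", cloud_t, ("time",))
        var[:] = np.array([0, 2, 1], dtype=np.uint8)

    with netCDF4.Dataset("enum_demo.nc") as nc:
        dt = nc.variables["cloud_type"].datatype
        assert isinstance(dt, netCDF4.EnumType)
        print(dt.name, dt.enum_dict)  # what open_store_variable inspects
        print(nc.enumtypes)           # what _build_and_get_enum checks on write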
2 changes: 2 additions & 0 deletions xarray/backends/store.py
@@ -37,6 +37,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti
drop_variables: str | Iterable[str] | None = None,
use_cftime=None,
decode_timedelta=None,
decode_enum: bool | None = None,
) -> Dataset:
assert isinstance(filename_or_obj, AbstractDataStore)

@@ -53,6 +54,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti
drop_variables=drop_variables,
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
decode_enum=decode_enum,
)

ds = Dataset(vars, attrs=attrs)
28 changes: 27 additions & 1 deletion xarray/coding/variables.py
@@ -3,6 +3,7 @@

import warnings
from collections.abc import Hashable, MutableMapping
from enum import Enum
from functools import partial
from typing import TYPE_CHECKING, Any, Callable, Union

@@ -566,11 +567,36 @@ def decode(self):

class ObjectVLenStringCoder(VariableCoder):
def encode(self):
return NotImplementedError
raise NotImplementedError

def decode(self, variable: Variable, name: T_Name = None) -> Variable:
if variable.dtype == object and variable.encoding.get("dtype", False) == str:
variable = variable.astype(variable.encoding["dtype"])
return variable
else:
return variable


class EnumCoder(VariableCoder):
"""Decode CF flag_* to python Enum"""

def encode(self, variable: Variable, name: T_Name = None) -> Variable:
raise NotImplementedError

def decode(self, variable: Variable, name: T_Name = None) -> Variable:
"""From CF flag_* to python Enum"""
dims, data, attrs, encoding = unpack_for_decoding(variable)
if (
attrs.get("enum")
Review thread on this check:

Contributor: There is no enum attribute on-disk for plain CF-style enums. Testing for flag_meanings, flag_values should be sufficient. Problem: we would need to invent an enum name. If we can decode from flag_meanings, flag_values to a Python enum, it would be good to be able to roundtrip (implement encoding a Python enum back to flag_meanings, flag_values).

bzah (Contributor, author), Jan 9, 2024: But do we always want to decode CF flag_* into Python enums? Having this test also on attrs['enum'] would avoid decoding them when this is not wanted. As for the encoding, I previously had an EnumCoder.encode method that turns a Python Enum into CF flag_*, but encoders are called before the netCDF4_.open_store_variable method. So within open_store_variable I would need to decode the flag_* again and turn them into a netCDF4 enum, which defeats the goal of having a Python Enum, no?

Contributor:
> But do we always want to decode CF flag_* into Python enums? Having this test also on attrs['enum'] would avoid decoding them when this is not wanted.
No, only when decode_enum=True.

Contributor:
> As for the encoding, I previously had an EnumCoder.encode method that turns a Python Enum into CF flag_*, but encoders are called before the netCDF4_.open_store_variable method. So within open_store_variable I would need to decode the flag_* again and turn them into a netCDF4 enum, which defeats the goal of having a Python Enum, no?
From my point of view, there are two things here: encoding/decoding Python enum <-> CF flag_*, and being able to read/write the netCDF4 enum type. The current approach using attrs["enum"] to represent a netCDF4 enum in xarray is straightforward and works very well. We might just keep the CF stuff for another PR and get the netCDF4 enum backend feature in. @dcherian, do you have suggestions to move forward here?

Contributor: Agree with limiting to just the netCDF type, and skipping the flag_* stuff. Another thought I just had is whether we can use the dtype attribute instead. But we need two pieces of information: the Enum dict and the dtype of the values. Shall we stick the Enum dict in dtype and preserve the dtype of the array as the dtype on disk? Is there a use case for encoding the values as a different dtype at all?

bzah (Contributor, author): I believe we can store the dtype within the enum with the type argument: encoding["dtype"] = Enum(e_name, e_dict, type=data.dtype.type), but I don't know what the caveats of this approach are.
> We might just keep the CF stuff for another PR and get the netCDF4 enum backend feature in.
> Agree with limiting to just the netCDF type, and skipping the flag_* stuff.
Alright, sorry for all these back and forths. I will try to focus on the netCDF Enum encoding/decoding.

kmuehlbauer (Contributor), Jan 10, 2024: @bzah No need to apologize. :-) This stuff soon gets very complex, and finding the best approach takes multiple iterations. I've thought a while about @dcherian's comment above. We might be able to do something like this in open_store_variable:

    if isinstance(var.datatype, netCDF4.EnumType):
        dtype = np.dtype(dtype, metadata={'enum': var.datatype.enum_dict,
                                          'enum_name': var.datatype.name})

    encoding["dtype"] = dtype

This will use numpy's metadata to store the enum on the dtype. It is essentially the same as what h5py does (without the enum_name). We could even provide that dtype to the variable itself; it is not that problematic to create a Python enum from that dtype. In prepare_variable we would just have to do the same thing, analogous to your current datatype extraction, but now from variable.dtype. For that to work we would either need to implement EnumCoder.encode to add the metadata back to variable.dtype (it might have been stripped by processing), or fix NonStringCoder to do this (preferred, so we can reserve the EnumCoder for the flag_* stuff). By using that approach we get the dtype and the enum (in dtype.metadata). Update: fixed example code.

and attrs.get("flag_meanings")
and attrs.get("flag_values")
):
flag_meanings = attrs.pop("flag_meanings")
flag_meanings = flag_meanings.split(" ")
flag_values = attrs.pop("flag_values")
flag_values = [int(v) for v in flag_values.split(", ")]
enum_name = attrs.pop("enum")
enum_dict = {k: v for k, v in zip(flag_meanings, flag_values)}
attrs["enum"] = Enum(enum_name, enum_dict)
return Variable(dims, data, attrs, encoding, fastpath=True)
return variable
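Two hedged sketches to make the pieces above concrete. First, EnumCoder.decode as written here: a variable carrying an 'enum' name plus CF flag_meanings/flag_values (a space-separated and a comma-separated string respectively, which is how this version parses them) comes back with a Python Enum in attrs['enum']. Names and values are illustrative.

    from xarray import Variable
    from xarray.coding.variables import EnumCoder  # added by this PR

    var = Variable(
        ("time",),
        [0, 1, 2],
        attrs={
            "enum": "cloud_type_t",
            "flag_meanings": "clear cumulus stratus",
            "flag_values": "0, 1, 2",
        },
    )
    decoded = EnumCoder().decode(var, name="cloud_type")
    print(decoded.attrs["enum"])  # Python Enum rebuilt from the flag_* attributes

Second, the dtype.metadata idea floated in the review thread above, which stashes the mapping on the numpy dtype instead of in attrs (illustrative only; not necessarily what was merged):

    from enum import Enum

    import numpy as np

    enum_dict = {"clear": 0, "cumulus": 1, "stratus": 2}  # illustrative mapping
    dtype = np.dtype("u1", metadata={"enum": enum_dict, "enum_name": "cloud_type_t"})

    # The mapping travels with the dtype and can be turned back into a Python Enum.
    meta = dtype.metadata
    rebuilt = Enum(meta["enum_name"], dict(meta["enum"]))
    print(list(rebuilt))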
28 changes: 20 additions & 8 deletions xarray/conventions.py
@@ -48,10 +48,6 @@
T_DatasetOrAbstractstore = Union[Dataset, AbstractDataStore]


def _var_as_tuple(var: Variable) -> T_VarTuple:
return var.dims, var.data, var.attrs.copy(), var.encoding.copy()


def _infer_dtype(array, name=None):
"""Given an object array with no missing values, infer its dtype from all elements."""
if array.dtype.kind != "O":
@@ -111,7 +107,7 @@ def _copy_with_dtype(data, dtype: np.typing.DTypeLike):
def ensure_dtype_not_object(var: Variable, name: T_Name = None) -> Variable:
# TODO: move this from conventions to backends? (it's not CF related)
if var.dtype.kind == "O":
dims, data, attrs, encoding = _var_as_tuple(var)
dims, data, attrs, encoding = variables.unpack_for_encoding(var)

# leave vlen dtypes unchanged
if strings.check_vlen_dtype(data.dtype) is not None:
@@ -162,7 +158,7 @@ def encode_cf_variable(
var: Variable, needs_copy: bool = True, name: T_Name = None
) -> Variable:
"""
Converts an Variable into an Variable which follows some
Converts a Variable into a Variable which follows some
of the CF conventions:

- Nans are masked using _FillValue (or the deprecated missing_value)
@@ -212,6 +208,7 @@ def decode_cf_variable(
stack_char_dim: bool = True,
use_cftime: bool | None = None,
decode_timedelta: bool | None = None,
decode_enum: bool | None = None,
) -> Variable:
"""
Decodes a variable which may hold CF encoded information.
Expand Down Expand Up @@ -252,6 +249,8 @@ def decode_cf_variable(
represented using ``np.datetime64[ns]`` objects. If False, always
decode times to ``np.datetime64[ns]`` objects; if this is not possible
raise an error.
decode_enum: bool, optional
If True, turn the CF flag_values and flag_meanings into a Python Enum in `attrs['enum']`.

Returns
-------
@@ -295,6 +294,9 @@

var = variables.BooleanCoder().decode(var)

if decode_enum:
var = variables.EnumCoder().decode(var)

dimensions, data, attributes, encoding = variables.unpack_for_decoding(var)

encoding.setdefault("dtype", original_dtype)
@@ -393,6 +395,7 @@ def decode_cf_variables(
drop_variables: T_DropVariables = None,
use_cftime: bool | None = None,
decode_timedelta: bool | None = None,
decode_enum: bool | None = None,
) -> tuple[T_Variables, T_Attrs, set[Hashable]]:
"""
Decode several CF encoded variables.
@@ -445,9 +448,10 @@ def stackable(dim: Hashable) -> bool:
stack_char_dim=stack_char_dim,
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
decode_enum=decode_enum,
)
except Exception as e:
raise type(e)(f"Failed to decode variable {k!r}: {e}")
raise type(e)(f"Failed to decode variable {k!r}: {e}") from e
if decode_coords in [True, "coordinates", "all"]:
var_attrs = new_vars[k].attrs
if "coordinates" in var_attrs:
@@ -509,6 +513,7 @@ def decode_cf(
drop_variables: T_DropVariables = None,
use_cftime: bool | None = None,
decode_timedelta: bool | None = None,
decode_enum: bool = True,
) -> Dataset:
"""Decode the given Dataset or Datastore according to CF conventions into
a new Dataset.
@@ -587,6 +592,7 @@
drop_variables=drop_variables,
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
decode_enum=decode_enum,
)
ds = Dataset(vars, attrs=attrs)
ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars))
@@ -602,6 +608,7 @@ def cf_decoder(
concat_characters: bool = True,
mask_and_scale: bool = True,
decode_times: bool = True,
decode_enum: bool = True,
) -> tuple[T_Variables, T_Attrs]:
"""
Decode a set of CF encoded variables and attributes.
@@ -633,7 +640,12 @@
decode_cf_variable
"""
variables, attributes, _ = decode_cf_variables(
variables, attributes, concat_characters, mask_and_scale, decode_times
variables,
attributes,
concat_characters,
mask_and_scale,
decode_times,
decode_enum=decode_enum,
)
return variables, attributes

5 changes: 5 additions & 0 deletions xarray/core/dataarray.py
@@ -4069,6 +4069,11 @@ def to_netcdf(
name is the same as a coordinate name, then it is given the name
``"__xarray_dataarray_variable__"``.

[netCDF4 backend only] When the CF flag_values/flag_meanings attributes are
set for this DataArray, you can choose to replace these attributes with
a netCDF4 EnumType by adding a key/value pair to the attributes,
like: `da.attrs["enum"] = "enum_name"`.

See Also
--------
Dataset.to_netcdf