
Add Hazard classmethod for loading xarray Datasets #507

Merged

Changes from 17 commits (of 39 total)
4a9a520
Add Hazard classmethod for loading NetCDF file
peanutfun Jul 5, 2022
755f11e
Only display unknown coordinates in `from_raster_netcdf` error message
peanutfun Jul 6, 2022
525394b
Use CLIMADA utilities for converting datetime64
peanutfun Jul 6, 2022
17564d5
Read Hazard from datasets with varying dim/coord definitions
peanutfun Jul 6, 2022
cbc1dc5
Avoid redefining built-in `map`
peanutfun Jul 6, 2022
d90ca87
Add log messages to `Hazard.from_raster_netcdf`
peanutfun Jul 6, 2022
939968b
Extract dimension names from coordinates
peanutfun Jul 7, 2022
b110a94
Use lazy formatting for logger messages
peanutfun Jul 7, 2022
4283cf6
Merge branch 'develop' into 487-add-classmethod-to-hazard-for-reading…
peanutfun Jul 11, 2022
ab40d1a
Reorganize test_base_netcdf.py
peanutfun Jul 15, 2022
2a4283d
Consolidate tests for Hazard.intensity
peanutfun Jul 15, 2022
dac5dd5
Rename from_raster_netcdf to from_raster_xarray
peanutfun Jul 15, 2022
026a33f
Make `from_raster_xarray` read all Hazard data
peanutfun Jul 27, 2022
528b99e
Add more logger messages and update docstring
peanutfun Jul 27, 2022
d96a278
Remove 'fraction' parameter from 'from_raster_xarray'
peanutfun Jul 27, 2022
f817bca
Add examples for `from_raster_xarray`
peanutfun Jul 27, 2022
fb94fc6
Fix type hints in `from_raster_xarray`
peanutfun Jul 27, 2022
bda69a6
Apply suggestions from code review
peanutfun Jul 28, 2022
9c68a9e
Improve docstrings and simplify `from_raster_xarray`
peanutfun Jul 28, 2022
87033ea
Simplify `user_key` update
peanutfun Jul 28, 2022
abc9d32
Optimize storage in CSR matrix by setting NaNs to zero
peanutfun Jul 28, 2022
29b74f3
Make hazard type and unit required arguments
peanutfun Jul 29, 2022
7678d1f
Preprocess dict arguments to save indentation level
peanutfun Jul 29, 2022
c0fb24b
Let `to_csr_matrix` only take ndarrays as arguments
peanutfun Jul 29, 2022
b8c46a3
Do not hard-code coordinate keys
peanutfun Aug 3, 2022
7a9ea42
Remove superfluous newline in docstring
peanutfun Aug 3, 2022
b2eea56
Update docstring of `from_raster_xarray`
peanutfun Aug 4, 2022
1be9772
Update `Hazard.from_raster_xarray`
peanutfun Aug 5, 2022
ccec87f
Add CRS argument
peanutfun Aug 8, 2022
c19f9ea
Merge branch 'develop' into 487-add-classmethod-to-hazard-for-reading…
peanutfun Sep 29, 2022
07e8bf7
Fix bug where wrong Hazard attribute was set
peanutfun Oct 5, 2022
84b0795
Add test for crs parameter in from_raster_xarray
peanutfun Oct 6, 2022
a72112c
Update docstring of from_raster_xarray
peanutfun Oct 6, 2022
d777785
Add 'Test' prefix for test cases in test_base_xarray.py
peanutfun Oct 6, 2022
d36bcc2
Format test_base_xarray
peanutfun Oct 6, 2022
68c1459
Fix comment in from_raster_xarray
peanutfun Oct 6, 2022
2cffc9a
Merge branch 'develop' into 487-add-classmethod-to-hazard-for-reading…
emanuel-schmid Oct 7, 2022
1ff075d
Promote single-valued coordinates to dimensions
peanutfun Oct 14, 2022
1eea94c
Merge branch '487-add-classmethod-to-hazard-for-reading-raster-like-d…
peanutfun Oct 14, 2022
357 changes: 357 additions & 0 deletions climada/hazard/base.py
@@ -27,6 +27,7 @@
import logging
import pathlib
import warnings
from typing import Union, Optional, Callable, Dict

import geopandas as gpd
import h5py
@@ -38,6 +39,7 @@
from rasterio.features import rasterize
from rasterio.warp import reproject, Resampling, calculate_default_transform
from scipy import sparse
import xarray as xr

from climada.hazard.tag import Tag as TagHazard
from climada.hazard.centroids.centr import Centroids
@@ -369,6 +371,361 @@ def set_vector(self, *args, **kwargs):
"Use Hazard.from_vector instead.")
self.__dict__ = Hazard.from_vector(*args, **kwargs).__dict__

@classmethod
def from_raster_xarray(
cls,
data: Union[xr.Dataset, str],
*,
intensity: str = "intensity",
coordinate_vars: Optional[Dict[str, str]] = None,
data_vars: Optional[Dict[str, str]] = None,
hazard_type: str = "",
intensity_unit: str = "",
):
"""Read raster-like data from an xarray Dataset or a raster data file

This method reads data that can be interpreted using three coordinates for time,
latitude, and longitude. The data and the coordinates themselves may be organized
in arbitrary dimensions in the Dataset (e.g. three dimensions 'year', 'month',
'day' for the coordinate 'date' representing time). The three coordinates to be
read can be specified via the ``coordinate_vars`` parameter.

The only required data is the intensity. For all other data, this method can
supply sensible default values. By default, this method will try to find these
"optional" data in the Dataset and read them, or use the default values otherwise.
Users may specify the variables in the Dataset to be read for certain Hazard
object entries, or may indicate that the default values should be used although
the Dataset contains appropriate data. This behavior is controlled via the
``data_vars`` parameter.

If this method succeeds, it will always return a "consistent" Hazard object,
meaning that the object can be used in all CLIMADA operations without throwing
an error due to missing data or faulty data types.

Notes
-----
* Intensity and fraction data must be three-dimensional data that is interpreted
with the coordinates given by ``coordinate_vars`` (or the default). All other
data must be given in time coordinates only.
* To avoid confusion in the call signature, all parameters are keyword-only
arguments, except ``data``.
* The attributes ``Hazard.tag.haz_type`` and ``Hazard.unit`` currently cannot be
read from the Dataset. Use the method parameters to set these attributes or set
them on the resulting ``Hazard`` object yourself.
* This method **does not** read lazily. Single data arrays must fit into memory.

Parameters
----------
data : xarray.Dataset or str
The data to read from. May be an opened dataset or a path to a raster data
file, in which case the file is opened first. Works with any file format
supported by `xarray`.
intensity : str
Identifier of the `xarray.DataArray` containing the hazard intensity data.
coordinate_vars : dict(str, str)
Mapping from default coordinate names to coordinate names used in the data
to read. The default names are `time`, `longitude`, and `latitude`.
data_vars : dict(str, str)
Mapping from default variable names to variable names used in the data
to read. The default names are `fraction`, `frequency`, `event_id`,
`event_name`, and `date`. If these values are not set, the method tries
to load data from the default names. If this fails, the method uses default
values for each entry. If the values are set to empty strings (`""`), no data
is loaded and the default values are used exclusively. See examples for
details.

Default values are:
* `fraction`: 1.0 where intensity is not zero, else zero
* `frequency`: 1.0 for every event
* `event_id`: Consecutive integers starting at 1 and increasing with time
* `event_name`: The values of the `time` coordinate
* `date`: The ordinal representation of the `time` coordinate values
hazard_type : str
The type identifier of the hazard. Will be stored in ``hazard.tag.haz_type``.
intensity_unit : str
The physical units of the intensity. Will be stored in ``hazard.unit``.

Returns
-------
hazard : climada.Hazard
A hazard object created from the input data

Examples
--------
The use of this method is straightforward if the Dataset contains the data with
expected names.
>>> dset = xr.Dataset(
...     dict(
...         intensity=(
...             ["time", "latitude", "longitude"],
...             [[[0, 1, 2], [3, 4, 5]]],
...         )
...     ),
...     dict(
...         time=[datetime.datetime(2000, 1, 1)],
...         latitude=[0, 1],
...         longitude=[0, 1, 2],
...     ),
... )
>>> hazard = Hazard.from_raster_xarray(dset)

For non-default coordinate names, use the ``coordinate_vars`` argument.
>>> dset = xr.Dataset(
...     dict(
...         intensity=(
...             ["day", "lat", "longitude"],
...             [[[0, 1, 2], [3, 4, 5]]],
...         )
...     ),
...     dict(
...         day=[datetime.datetime(2000, 1, 1)],
...         lat=[0, 1],
...         longitude=[0, 1, 2],
...     ),
... )
>>> hazard = Hazard.from_raster_xarray(
...     dset, coordinate_vars=dict(time="day", latitude="lat")
... )

Coordinates can be different from the actual dataset dimensions. The following
loads the data with coordinates ``longitude`` and ``latitude`` (default names):
>>> dset = xr.Dataset(
...     dict(intensity=(["time", "y", "x"], [[[0, 1, 2], [3, 4, 5]]])),
...     dict(
...         time=[datetime.datetime(2000, 1, 1)],
...         y=[0, 1],
...         x=[0, 1, 2],
...         longitude=(["y", "x"], [[0.0, 0.1, 0.2], [0.0, 0.1, 0.2]]),
...         latitude=(["y", "x"], [[0.0, 0.0, 0.0], [0.1, 0.1, 0.1]]),
...     ),
... )
>>> hazard = Hazard.from_raster_xarray(dset)

Optional data is read from the dataset if the default keys are found. Users can
specify custom variables in the data, or that the default keys should be ignored,
with the ``data_vars`` argument.
>>> dset = xr.Dataset(
...     dict(
...         intensity=(
...             ["time", "latitude", "longitude"],
...             [[[0, 1, 2], [3, 4, 5]]],
...         ),
...         fraction=(
...             ["time", "latitude", "longitude"],
...             [[[0.0, 0.1, 0.2], [0.3, 0.4, 0.5]]],
...         ),
...         freq=(["time"], [0.4]),
...         event_id=(["time"], [4]),
...     ),
...     dict(
...         time=[datetime.datetime(2000, 1, 1)],
...         latitude=[0, 1],
...         longitude=[0, 1, 2],
...     ),
... )
>>> hazard = Hazard.from_raster_xarray(
...     dset,
...     data_vars=dict(
...         # Load frequency from the 'freq' array
...         frequency="freq",
...         # Ignore the 'event_id' array and use the default instead
...         event_id="",
...         # The 'fraction' array is loaded because it has the default name
...     ),
... )
>>> np.array_equal(hazard.frequency, [0.4]) and np.array_equal(
...     hazard.event_id, [1]
... )
True

"""
# If the data is a string, open the respective file
if not isinstance(data, xr.Dataset):
LOGGER.info("Loading Hazard from file: %s", data)
data = xr.open_dataset(data)
else:
LOGGER.info("Loading Hazard from xarray Dataset")

# Initialize Hazard object
hazard = cls(haz_type=hazard_type)
hazard.unit = intensity_unit

# Update coordinate identifiers
coords = {"time": "time", "longitude": "longitude", "latitude": "latitude"}
if coordinate_vars is not None:
unknown_coords = [co for co in coordinate_vars if co not in coords]
if unknown_coords:
raise ValueError(
f"Unknown coordinates passed: '{unknown_coords}'. Supported "
"coordinates are 'time', 'longitude', 'latitude'."
)
coords.update(coordinate_vars)

# Retrieve dimensions of coordinates
dims = dict(
time=data[coords["time"]].dims,
longitude=data[coords["longitude"]].dims,
latitude=data[coords["latitude"]].dims,
)

# Stack (vectorize) the entire dataset into 2D (time, lat/lon)
# NOTE: We want the set union of the dimensions, but Python 'set' does not
# preserve order. However, we want longitude to run faster than latitude.
# So we use 'dict' without values, as 'dict' preserves insertion order
# (dict keys behave like a set).
data = data.stack(
event=dims["time"],
lat_lon=dict.fromkeys(dims["latitude"] + dims["longitude"]),
)
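The ordered-union trick from the NOTE above can be checked in isolation. A minimal sketch (plain Python; the dimension names are made up for illustration):

```python
# dict.fromkeys acts as an ordered set: duplicates are dropped while
# first-seen insertion order is preserved (guaranteed since Python 3.7).
lat_dims = ("y", "x")  # hypothetical dims of the latitude coordinate
lon_dims = ("y", "x")  # hypothetical dims of the longitude coordinate

merged = list(dict.fromkeys(lat_dims + lon_dims))
print(merged)  # ['y', 'x'] -- a plain set() would not guarantee this order
```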

# Transform coordinates into centroids
hazard.centroids = Centroids.from_lat_lon(
data[coords["latitude"]].values, data[coords["longitude"]].values
)

def to_csr_matrix(array):
"""Store an array as sparse matrix, optimizing storage space"""
mat = sparse.csr_matrix(array)
mat.eliminate_zeros()
return mat
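What `eliminate_zeros` buys can be seen with a matrix that carries an explicitly stored zero; a standalone sketch using only scipy and numpy:

```python
import numpy as np
from scipy import sparse

# Build a 1x3 CSR matrix from COO-style triplets. The 0.0 entry is
# explicitly stored even though it contributes nothing.
data = np.array([1.0, 0.0, 2.0])
rows = np.array([0, 0, 0])
cols = np.array([0, 1, 2])
mat = sparse.csr_matrix((data, (rows, cols)), shape=(1, 3))
print(mat.nnz)  # 3 -- the stored zero still counts

mat.eliminate_zeros()  # drop explicitly stored zeros in place
print(mat.nnz)  # 2
```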

# Read the intensity data and flatten it in spatial dimensions
LOGGER.debug("Loading Hazard intensity from DataArray '%s'", intensity)
hazard.intensity = to_csr_matrix(data[intensity])

# Define accessors for xarray DataArrays
def default_accessor(x: xr.DataArray):
"""By default, use the numpy array representation of data"""
return x.values

def strict_positive_int_accessor(x: xr.DataArray, report_key: str):
"""Only allow positive, non-zero integers"""
values = x.values
if not np.issubdtype(values.dtype, np.integer):
raise TypeError(f"'{report_key}' data array must be integers")
if not np.all(values > 0):
raise ValueError(f"'{report_key}' data must be larger than zero")
return np.asarray(values, dtype=np.int64)
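The accessor above can be exercised standalone. This sketch uses a minimal stand-in for `xr.DataArray` (only the `.values` attribute is needed), so nothing beyond numpy is required; `FakeDataArray` is invented for illustration:

```python
import numpy as np

class FakeDataArray:
    """Minimal stand-in for xr.DataArray: only `.values` is used here."""
    def __init__(self, values):
        self.values = np.asarray(values)

def strict_positive_int_accessor(x, report_key: str):
    """Only allow positive, non-zero integers (mirrors the helper above)."""
    values = x.values
    if not np.issubdtype(values.dtype, np.integer):
        raise TypeError(f"'{report_key}' data array must be integers")
    if not np.all(values > 0):
        raise ValueError(f"'{report_key}' data must be larger than zero")
    return np.asarray(values, dtype=np.int64)

event_ids = strict_positive_int_accessor(FakeDataArray([4, 5]), "event_id")
print(event_ids.dtype)  # int64

try:
    strict_positive_int_accessor(FakeDataArray([0.5]), "event_id")
except TypeError as err:
    print(err)  # 'event_id' data array must be integers
```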

# Create a DataFrame storing access information for each of data_vars
num_events = data.sizes["event"]
keys = ["fraction", "frequency", "event_id", "event_name", "date"]
data_ident = pd.DataFrame(
data=dict(
# The attribute of the Hazard class where data is stored
hazard_attr=keys,
# The identifier and default key used in this method
identifier=keys,
# The keys assigned by the user
user_key=None,
# The default value for each attribute
default_value=[
to_csr_matrix(
xr.apply_ufunc(
lambda x: np.where(x != 0, 1, 0), data[intensity]
).values
),
np.ones(num_events),
np.array(range(num_events), dtype=int) + 1,
list(data[coords["time"]].values),
np.array(u_dt.datetime64_to_ordinal(data[coords["time"]].values)),
],
# The accessor for the data in the Dataset
accessor=[
lambda x: to_csr_matrix(default_accessor(x)),
default_accessor,
lambda x: strict_positive_int_accessor(x, "event_id"),
lambda x: list(default_accessor(x)), # list, not np.array
lambda x: strict_positive_int_accessor(x, "date"),
],
)
)
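The dispatch-table idea above — one DataFrame row per Hazard attribute, carrying a default value next to an accessor callable — reduces to the following sketch (attribute names and defaults here are illustrative, not the real ones):

```python
import numpy as np
import pandas as pd

# Each row pairs a target attribute with its default and its accessor.
table = pd.DataFrame(
    dict(
        hazard_attr=["frequency", "event_id"],
        default_value=[np.ones(3), np.arange(1, 4)],
        accessor=[
            lambda x: np.asarray(x, dtype=float),
            lambda x: np.asarray(x, dtype=np.int64),
        ],
    )
)

# Row-wise dispatch: apply each accessor to its default value.
for _, row in table.iterrows():
    result = row["accessor"](row["default_value"])
    print(row["hazard_attr"], result)
```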

# Update the keys from user settings
if data_vars is not None:
ident = data_ident["identifier"]
unknown_keys = [
key for key in data_vars.keys() if key not in ident.values
]
if unknown_keys:
raise ValueError(
f"Unknown data variables passed: '{unknown_keys}'. Supported "
f"data variables are {list(ident)}."
)

# Make 'identifier' an index for replacement, then make it a column again
data_ident = data_ident.set_index("identifier")
data_ident.loc[list(data_vars.keys()), "user_key"] = list(
data_vars.values()
)
data_ident.reset_index(inplace=True)

def read_or_default(ident: pd.Series):
"""Read data from a variable in the data or use default"""
key = ident["user_key"]
default_value = ident["default_value"]
accessor = ident["accessor"]
hazard_attr = ident["hazard_attr"]

# User does not want to read data
if ident["user_key"] == "":
LOGGER.debug(
"Using default values for Hazard.%s per user request", hazard_attr
)
return default_value

default_key = ident["identifier"]
if not pd.isna(key):
# Read key exclusively
LOGGER.debug(
"Reading data for Hazard.%s from DataArray '%s'", hazard_attr, key
)
val = accessor(data[key])
else:
# Try default key
try:
val = accessor(data[default_key])
LOGGER.debug(
"Reading data for Hazard.%s from DataArray '%s'",
hazard_attr,
default_key,
)
except KeyError:
LOGGER.debug(
"Using default values for Hazard.%s. No data found", hazard_attr
)
return default_value

def vshape(x):
"""Return a shape tuple for all data types that may occur"""
if isinstance(x, list):
return len(x)
elif isinstance(x, sparse.csr_matrix):
return x.get_shape()
else:
return x.shape

# Check size for read data
if not np.array_equal(vshape(val), vshape(default_value)):
raise RuntimeError(
f"Hazard {default_key} (data key: '{key if key else default_key}') "
f"must have shape {vshape(default_value)}, but shape is "
f"{vshape(val)}"
)

# Return the data
return val
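The fallback order implemented above (explicit user key, else default key, else default value) can be distilled into a sketch; `load_or_default` and the dict store are invented stand-ins, not CLIMADA API:

```python
def load_or_default(store, user_key, default_key, default_value):
    """Resolve a value with the precedence used above.

    user_key == ""  -> skip reading, use the default value
    user_key set    -> read it, and fail loudly if it is missing
    user_key unset  -> try the default key, fall back to the default value
    """
    if user_key == "":
        return default_value
    if user_key is not None:
        return store[user_key]  # KeyError propagates on purpose
    try:
        return store[default_key]
    except KeyError:
        return default_value

data = {"freq": [0.4]}
print(load_or_default(data, "freq", "frequency", [1.0]))  # [0.4]
print(load_or_default(data, None, "frequency", [1.0]))    # [1.0]
print(load_or_default(data, "", "frequency", [1.0]))      # [1.0]
```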

# Set the Hazard attributes
for _, ident in data_ident.iterrows():
setattr(hazard, ident["hazard_attr"], read_or_default(ident))

# Done!
LOGGER.debug("Hazard successfully loaded. Number of events: %i", num_events)
return hazard

@classmethod
def from_vector(cls, files_intensity, files_fraction=None, attrs=None,
inten_name=None, frac_name=None, dst_crs=None, haz_type=None):