From e8b25f40583fad5bcb3654bd7210777285be9c38 Mon Sep 17 00:00:00 2001 From: Lukas Riedel <34276446+peanutfun@users.noreply.github.com> Date: Fri, 14 Oct 2022 14:25:59 +0200 Subject: [PATCH] Add Hazard classmethod for loading xarray Datasets (#507) Add Hazard classmethod for loading xarray datasets The method loads a consistent Hazard object from a raster dataset that can be interpreted as 2D or 3D dataset with dimensions "time", "latitude", and "longitude". It supports loading any file type supported by xarray. * Add classmethod `Hazard.from_raster_xarray`. * Add two unit test cases. * Add docstring/documentation Co-authored-by: Chahan M. Kropf Co-authored-by: emanuel-schmid --- climada/hazard/base.py | 474 +++++++++++++++++++++++- climada/hazard/test/test_base_xarray.py | 463 +++++++++++++++++++++++ 2 files changed, 936 insertions(+), 1 deletion(-) create mode 100644 climada/hazard/test/test_base_xarray.py diff --git a/climada/hazard/base.py b/climada/hazard/base.py index 433bf089f..80a1bec6a 100644 --- a/climada/hazard/base.py +++ b/climada/hazard/base.py @@ -27,6 +27,7 @@ import logging import pathlib import warnings +from typing import Union, Optional, Callable, Dict, Any import geopandas as gpd import h5py @@ -38,6 +39,7 @@ from rasterio.features import rasterize from rasterio.warp import reproject, Resampling, calculate_default_transform from scipy import sparse +import xarray as xr from climada.hazard.tag import Tag as TagHazard from climada.hazard.centroids.centr import Centroids @@ -47,7 +49,7 @@ from climada import CONFIG import climada.util.hdf5_handler as u_hdf5 import climada.util.coordinates as u_coord -from climada.util.constants import ONE_LAT_KM, DEF_FREQ_UNIT +from climada.util.constants import ONE_LAT_KM, DEF_CRS, DEF_FREQ_UNIT from climada.util.coordinates import NEAREST_NEIGHBOR_THRESHOLD LOGGER = logging.getLogger(__name__) @@ -92,6 +94,12 @@ } """MATLAB variable names""" +DEF_COORDS = dict(event="time", longitude="longitude", latitude="latitude") +"""Default coordinates when reading Hazard data from an xarray Dataset""" + +DEF_DATA_VARS = ["fraction", "frequency", "event_id", "event_name", "date"] +"""Default keys for optional Hazard attributes when reading from an xarray Dataset""" + class Hazard(): """ @@ -392,6 +400,470 @@ def set_vector(self, *args, **kwargs): "Use Hazard.from_vector instead.") self.__dict__ = Hazard.from_vector(*args, **kwargs).__dict__ + @classmethod + def from_raster_xarray( + cls, + data: Union[xr.Dataset, str], + hazard_type: str, + intensity_unit: str, + *, + intensity: str = "intensity", + coordinate_vars: Optional[Dict[str, str]] = None, + data_vars: Optional[Dict[str, str]] = None, + crs: str = DEF_CRS, + ): + """Read raster-like data from an xarray Dataset or a raster data file + + This method reads data that can be interpreted using three coordinates for event, + latitude, and longitude. The data and the coordinates themselves may be organized + in arbitrary dimensions in the Dataset (e.g. three dimensions 'year', 'month', + 'day' for the coordinate 'event'). The three coordinates to be read can be + specified via the ``coordinate_vars`` parameter. See Notes and Examples if you + want to load single-event data that does not contain an event dimension. + + The only required data is the intensity. For all other data, this method can + supply sensible default values. By default, this method will try to find these + "optional" data in the Dataset and read it, or use the default values otherwise. + Users may specify the variables in the Dataset to be read for certain Hazard + object entries, or may indicate that the default values should be used although + the Dataset contains appropriate data. This behavior is controlled via the + ``data_vars`` parameter. + + If this method succeeds, it will always return a "consistent" Hazard object, + meaning that the object can be used in all CLIMADA operations without throwing + an error due to missing data or faulty data types. + + Notes + ----- + * Single-valued coordinates given by ``coordinate_vars``, that are not proper + dimensions of the data, are promoted to dimensions automatically. If one of the + three coordinates does not exist, use ``Dataset.expand_dims`` (see + https://docs.xarray.dev/en/stable/generated/xarray.Dataset.expand_dims.html + and Examples) before loading the Dataset as Hazard. + * To avoid confusion in the call signature, several parameters are keyword-only + arguments. + * The attributes ``Hazard.tag.haz_type`` and ``Hazard.unit`` currently cannot be + read from the Dataset. Use the method parameters to set these attributes. + * This method does not read coordinate system metadata. Use the ``crs`` parameter + to set a custom coordinate system identifier. + * This method **does not** read lazily. Single data arrays must fit into memory. + + Parameters + ---------- + data : xarray.Dataset or str + The data to read from. May be a opened dataset or a path to a raster data + file, in which case the file is opened first. Works with any file format + supported by ``xarray``. + hazard_type : str + The type identifier of the hazard. Will be stored directly in the hazard + object. + intensity_unit : str + The physical units of the intensity. Will be stored in the ``hazard.tag``. + intensity : str, optional + Identifier of the `xarray.DataArray` containing the hazard intensity data. + coordinate_vars : dict(str, str), optional + Mapping from default coordinate names to coordinate names used in the data + to read. The default is + ``dict(event="time", longitude="longitude", latitude="latitude")`` + data_vars : dict(str, str), optional + Mapping from default variable names to variable names used in the data + to read. The default names are ``fraction``, ``hazard_type``, ``frequency``, + ``event_name``, and ``event_id``. If these values are not set, the method + tries to load data from the default names. If this fails, the method uses + default values for each entry. If the values are set to empty strings + (``""``), no data is loaded and the default values are used exclusively. See + examples for details. + + Default values are: + * ``fraction``: 1.0 where intensity is not zero, else zero + * ``hazard_type``: Empty string + * ``frequency``: 1.0 for every event + * ``event_name``: String representation of the event time + * ``event_id``: Consecutive integers starting at 1 and increasing with time + crs : str, optional + Identifier for the coordinate reference system of the coordinates. Defaults + to ``EPSG:4326`` (WGS 84), defined by ``climada.util.constants.DEF_CRS``. + See https://pyproj4.github.io/pyproj/dev/api/crs/crs.html#pyproj.crs.CRS.from_user_input + for further information on how to specify the coordinate system. + + Returns + ------- + hazard : climada.Hazard + A hazard object created from the input data + + Examples + -------- + The use of this method is straightforward if the Dataset contains the data with + expected names. + >>> dset = xr.Dataset( + >>> dict( + >>> intensity=( + >>> ["time", "latitude", "longitude"], + >>> [[[0, 1, 2], [3, 4, 5]]], + >>> ) + >>> ), + >>> dict( + >>> time=[datetime.datetime(2000, 1, 1)], + >>> latitude=[0, 1], + >>> longitude=[0, 1, 2], + >>> ), + >>> ) + >>> hazard = Hazard.from_raster_xarray(dset, "", "") + + For non-default coordinate names, use the ``coordinate_vars`` argument. + >>> dset = xr.Dataset( + >>> dict( + >>> intensity=( + >>> ["day", "lat", "longitude"], + >>> [[[0, 1, 2], [3, 4, 5]]], + >>> ) + >>> ), + >>> dict( + >>> day=[datetime.datetime(2000, 1, 1)], + >>> lat=[0, 1], + >>> longitude=[0, 1, 2], + >>> ), + >>> ) + >>> hazard = Hazard.from_raster_xarray( + >>> dset, "", "", coordinate_vars=dict(event="day", latitude="lat") + >>> ) + + Coordinates can be different from the actual dataset dimensions. The following + loads the data with coordinates ``longitude`` and ``latitude`` (default names): + >>> dset = xr.Dataset( + >>> dict(intensity=(["time", "y", "x"], [[[0, 1, 2], [3, 4, 5]]])), + >>> dict( + >>> time=[datetime.datetime(2000, 1, 1)], + >>> y=[0, 1], + >>> x=[0, 1, 2], + >>> longitude=(["y", "x"], [[0.0, 0.1, 0.2], [0.0, 0.1, 0.2]]), + >>> latitude=(["y", "x"], [[0.0, 0.0, 0.0], [0.1, 0.1, 0.1]]), + >>> ), + >>> ) + >>> hazard = Hazard.from_raster_xarray(dset, "", "") + + Optional data is read from the dataset if the default keys are found. Users can + specify custom variables in the data, or that the default keys should be ignored, + with the ``data_vars`` argument. + >>> dset = xr.Dataset( + >>> dict( + >>> intensity=( + >>> ["time", "latitude", "longitude"], + >>> [[[0, 1, 2], [3, 4, 5]]], + >>> ), + >>> fraction=( + >>> ["time", "latitude", "longitude"], + >>> [[[0.0, 0.1, 0.2], [0.3, 0.4, 0.5]]], + >>> ), + >>> freq=(["time"], [0.4]), + >>> event_id=(["time"], [4]), + >>> ), + >>> dict( + >>> time=[datetime.datetime(2000, 1, 1)], + >>> latitude=[0, 1], + >>> longitude=[0, 1, 2], + >>> ), + >>> ) + >>> hazard = Hazard.from_raster_xarray( + >>> dset, + >>> "", + >>> "", + >>> data_vars=dict( + >>> # Load frequency from 'freq' array + >>> frequency="freq", + >>> # Ignore 'event_id' array and use default instead + >>> event_id="", + >>> # 'fraction' array is loaded because it has the default name + >>> ), + >>> ) + >>> np.array_equal(hazard.frequency, [0.4]) and np.array_equal( + >>> hazard.event_id, [1] + >>> ) + True + + If your read single-event data your dataset probably will not have a time + dimension. As long as a time *coordinate* exists, however, this method will + automatically promote it to a dataset dimension and load the data: + >>> dset = xr.Dataset( + >>> dict( + >>> intensity=( + >>> ["latitude", "longitude"], + >>> [[0, 1, 2], [3, 4, 5]], + >>> ) + >>> ), + >>> dict( + >>> time=[datetime.datetime(2000, 1, 1)], + >>> latitude=[0, 1], + >>> longitude=[0, 1, 2], + >>> ), + >>> ) + >>> hazard = Hazard.from_raster_xarray(dset, "", "") # Same as first example + + If one coordinate is missing altogehter, you must add it or expand the dimensions + before loading the dataset: + >>> dset = xr.Dataset( + >>> dict( + >>> intensity=( + >>> ["latitude", "longitude"], + >>> [[0, 1, 2], [3, 4, 5]], + >>> ) + >>> ), + >>> dict( + >>> latitude=[0, 1], + >>> longitude=[0, 1, 2], + >>> ), + >>> ) + >>> dset = dset.expand_dims(time=[numpy.datetime64("2000-01-01")]) + >>> hazard = Hazard.from_raster_xarray(dset, "", "") + """ + # If the data is a string, open the respective file + if not isinstance(data, xr.Dataset): + LOGGER.info("Loading Hazard from file: %s", data) + data = xr.open_dataset(data) + else: + LOGGER.info("Loading Hazard from xarray Dataset") + + # Initialize Hazard object + hazard = cls(haz_type=hazard_type) + hazard.units = intensity_unit + + # Update coordinate identifiers + coords = copy.deepcopy(DEF_COORDS) + coordinate_vars = coordinate_vars if coordinate_vars is not None else {} + unknown_coords = [co for co in coordinate_vars if co not in coords] + if unknown_coords: + raise ValueError( + f"Unknown coordinates passed: '{unknown_coords}'. Supported " + f"coordinates are {list(coords.keys())}." + ) + coords.update(coordinate_vars) + + # Retrieve dimensions of coordinates + try: + dims = dict( + event=data[coords["event"]].dims, + longitude=data[coords["longitude"]].dims, + latitude=data[coords["latitude"]].dims, + ) + # Handle KeyError for better error message + except KeyError as err: + key = err.args[0] + raise RuntimeError( + f"Dataset is missing dimension/coordinate: {key}. Dataset dimensions: " + f"{list(data.dims.keys())}" + ) from err + + # Try promoting single-value coordinates to dimensions + for key, val in dims.items(): + if not val: + coord = coords[key] + LOGGER.debug("Promoting Dataset coordinate '%s' to dimension", coord) + data = data.expand_dims(coord) + dims[key] = data[coord].dims + + # Stack (vectorize) the entire dataset into 2D (time, lat/lon) + # NOTE: We want the set union of the dimensions, but Python 'set' does not + # preserve order. However, we want longitude to run faster than latitude. + # So we use 'dict' without values, as 'dict' preserves insertion order + # (dict keys behave like a set). + data = data.stack( + event=dims["event"], + lat_lon=dict.fromkeys(dims["latitude"] + dims["longitude"]), + ) + + # Transform coordinates into centroids + hazard.centroids = Centroids.from_lat_lon( + data[coords["latitude"]].values, data[coords["longitude"]].values, crs=crs, + ) + + def to_csr_matrix(array: np.ndarray) -> sparse.csr_matrix: + """Store a numpy array as sparse matrix, optimizing storage space + + The CSR matrix stores NaNs explicitly, so we set them to zero. + """ + return sparse.csr_matrix(np.where(np.isnan(array), 0, array)) + + # Read the intensity data + LOGGER.debug("Loading Hazard intensity from DataArray '%s'", intensity) + hazard.intensity = to_csr_matrix(data[intensity]) + + # Define accessors for xarray DataArrays + def default_accessor(array: xr.DataArray) -> np.ndarray: + """Take a DataArray and return its numpy representation""" + return array.values + + def strict_positive_int_accessor(array: xr.DataArray) -> np.ndarray: + """Take a positive int DataArray and return its numpy representation + + Raises + ------ + TypeError + If the underlying data type is not integer + ValueError + If any value is zero or less + """ + if not np.issubdtype(array.dtype, np.integer): + raise TypeError(f"'{array.name}' data array must be integers") + if not (array > 0).all(): + raise ValueError(f"'{array.name}' data must be larger than zero") + return array.values + + # Create a DataFrame storing access information for each of data_vars + # NOTE: Each row will be passed as arguments to + # `load_from_xarray_or_return_default`, see its docstring for further + # explanation of the DataFrame columns / keywords. + num_events = data.sizes["event"] + data_ident = pd.DataFrame( + data=dict( + # The attribute of the Hazard class where the data will be stored + hazard_attr=DEF_DATA_VARS, + # The identifier and default key used in this method + default_key=DEF_DATA_VARS, + # The key assigned by the user + user_key=None, + # The default value for each attribute + default_value=[ + to_csr_matrix( + xr.apply_ufunc( + lambda x: np.where(x != 0, 1, 0), data[intensity] + ).values + ), + np.ones(num_events), + np.array(range(num_events), dtype=int) + 1, + list(data[coords["event"]].values), + np.array(u_dt.datetime64_to_ordinal(data[coords["event"]].values)), + ], + # The accessor for the data in the Dataset + accessor=[ + lambda x: to_csr_matrix(default_accessor(x)), + default_accessor, + strict_positive_int_accessor, + lambda x: list(default_accessor(x).flat), # list, not np.array + strict_positive_int_accessor, + ], + ) + ) + + # Check for unexpected keys + data_vars = data_vars if data_vars is not None else {} + default_keys = data_ident["default_key"] + unknown_keys = [ + key for key in data_vars.keys() if not default_keys.str.contains(key).any() + ] + if unknown_keys: + raise ValueError( + f"Unknown data variables passed: '{unknown_keys}'. Supported " + f"data variables are {list(default_keys)}." + ) + + # Update with keys provided by the user + # NOTE: Keys in 'default_keys' missing from 'data_vars' will be set to 'None' + # (which is exactly what we want) and the result is written into + # 'user_key'. 'default_keys' is not modified. + data_ident["user_key"] = default_keys.map(data_vars) + + def load_from_xarray_or_return_default( + user_key: Optional[str], + default_key: str, + hazard_attr: str, + accessor: Callable[[xr.DataArray], Any], + default_value: Any, + ) -> Any: + """Load data for a single Hazard attribute or return the default value + + Does the following based on the ``user_key``: + * If the key is an empty string, return the default value + * If the key is a non-empty string, load the data for that key and return it. + * If the key is ``None``, look for the``default_key`` in the data. If it + exists, return that data. If not, return the default value. + + Parameters + ---------- + user_key : str or None + The key set by the user to identify the DataArray to read data from. + default_key : str + The default key identifying the DataArray to read data from. + hazard_attr : str + The name of the attribute of ``Hazard`` where the data will be stored in. + accessor : Callable + A callable that takes the DataArray as argument and returns the data + structure that is required by the ``Hazard`` attribute. + default_value + The default value/array to return in case the data could not be found. + + Returns + ------- + The object that will be stored in the ``Hazard`` attribute ``hazard_attr``. + + Raises + ------ + KeyError + If ``user_key`` was a non-empty string but no such key was found in the + data + RuntimeError + If the data structure loaded has a different shape than the default data + structure + """ + # User does not want to read data + if user_key == "": + LOGGER.debug( + "Using default values for Hazard.%s per user request", hazard_attr + ) + return default_value + + if not pd.isna(user_key): + # Read key exclusively + LOGGER.debug( + "Reading data for Hazard.%s from DataArray '%s'", + hazard_attr, + user_key, + ) + val = accessor(data[user_key]) + else: + # Try default key + try: + val = accessor(data[default_key]) + LOGGER.debug( + "Reading data for Hazard.%s from DataArray '%s'", + hazard_attr, + default_key, + ) + except KeyError: + LOGGER.debug( + "Using default values for Hazard.%s. No data found", hazard_attr + ) + return default_value + + def vshape(array): + """Return a shape tuple for any array-like type we use""" + if isinstance(array, list): + return len(array) + if isinstance(array, sparse.csr_matrix): + return array.get_shape() + return array.shape + + # Check size for read data + if not np.array_equal(vshape(val), vshape(default_value)): + raise RuntimeError( + f"'{user_key if user_key else default_key}' must have shape " + f"{vshape(default_value)}, but shape is {vshape(val)}" + ) + + # Return the data + return val + + # Set the Hazard attributes + for _, ident in data_ident.iterrows(): + setattr( + hazard, + ident["hazard_attr"], + load_from_xarray_or_return_default(**ident), + ) + + # Done! + LOGGER.debug("Hazard successfully loaded. Number of events: %i", num_events) + return hazard + @classmethod def from_vector(cls, files_intensity, files_fraction=None, attrs=None, inten_name=None, frac_name=None, dst_crs=None, haz_type=None): diff --git a/climada/hazard/test/test_base_xarray.py b/climada/hazard/test/test_base_xarray.py new file mode 100644 index 000000000..7aa3610a2 --- /dev/null +++ b/climada/hazard/test/test_base_xarray.py @@ -0,0 +1,463 @@ +""" +This file is part of CLIMADA. + +Copyright (C) 2022 ETH Zurich, CLIMADA contributors listed in AUTHORS. + +CLIMADA is free software: you can redistribute it and/or modify it under the +terms of the GNU General Public License as published by the Free +Software Foundation, version 3. + +CLIMADA is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with CLIMADA. If not, see . + +--- + +Test xarray reading capabilities of Hazard base class. +""" + +import os +import unittest +import datetime as dt +import numpy as np +from scipy.sparse import csr_matrix + +import xarray as xr +from pyproj import CRS + +from climada.hazard.base import Hazard +from climada.util.constants import DEF_CRS + +from pathlib import Path + + +class TestReadDefaultNetCDF(unittest.TestCase): + """Test reading a NetCDF file where the coordinates to read match the dimensions""" + + def setUp(self): + """Write a simple NetCDF file to read""" + self.netcdf_path = Path.cwd() / "default.nc" + self.intensity = np.array([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]]) + self.time = np.array([dt.datetime(1999, 1, 1), dt.datetime(2000, 1, 1)]) + self.latitude = np.array([0, 1]) + self.longitude = np.array([0, 1, 2]) + dset = xr.Dataset( + { + "intensity": (["time", "latitude", "longitude"], self.intensity), + }, + dict(time=self.time, latitude=self.latitude, longitude=self.longitude), + ) + dset.to_netcdf(self.netcdf_path) + + def tearDown(self): + """Delete the NetCDF file""" + self.netcdf_path.unlink() + + def _assert_default(self, hazard): + """Assertions for the default hazard to be loaded""" + self._assert_default_types(hazard) + self._assert_default_values(hazard) + + def _assert_default_values(self, hazard): + """Check the values of the default hazard to be loaded""" + # Hazard data + self.assertEqual(hazard.tag.haz_type, "") + self.assertEqual(hazard.units, "") + np.testing.assert_array_equal(hazard.event_id, [1, 2]) + np.testing.assert_array_equal( + hazard.event_name, [np.datetime64(val) for val in self.time] + ) + np.testing.assert_array_equal( + hazard.date, [val.toordinal() for val in self.time] + ) + np.testing.assert_array_equal(hazard.frequency, np.ones(hazard.event_id.size)) + + # Centroids + np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1]) + np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2]) + self.assertEqual(hazard.centroids.geometry.crs, DEF_CRS) + + # Intensity data + np.testing.assert_array_equal( + hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]] + ) + + # Fraction default + np.testing.assert_array_equal( + hazard.fraction.toarray(), [[0, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]] + ) + + def _assert_default_types(self, hazard): + """Check types of all hazard attributes""" + self.assertIsInstance(hazard.units, str) + self.assertIsInstance(hazard.tag.haz_type, str) + self.assertIsInstance(hazard.event_id, np.ndarray) + self.assertIsInstance(hazard.event_name, list) + self.assertIsInstance(hazard.frequency, np.ndarray) + self.assertIsInstance(hazard.intensity, csr_matrix) + self.assertIsInstance(hazard.fraction, csr_matrix) + self.assertIsInstance(hazard.date, np.ndarray) + + def test_load_path(self): + """Load the data with path as argument""" + hazard = Hazard.from_raster_xarray(self.netcdf_path, "", "") + self._assert_default(hazard) + + # Check wrong paths + with self.assertRaises(FileNotFoundError) as cm: + Hazard.from_raster_xarray("file-does-not-exist.nc", "", "") + self.assertIn("file-does-not-exist.nc", str(cm.exception)) + with self.assertRaises(KeyError) as cm: + Hazard.from_raster_xarray( + self.netcdf_path, "", "", intensity="wrong-intensity-path" + ) + self.assertIn("wrong-intensity-path", str(cm.exception)) + + def test_load_dataset(self): + """Load the data from an opened dataset as argument""" + dataset = xr.open_dataset(self.netcdf_path) + hazard = Hazard.from_raster_xarray(dataset, "", "") + self._assert_default(hazard) + + def test_type_and_unit(self): + """Test passing a custom type and unit""" + hazard = Hazard.from_raster_xarray( + self.netcdf_path, hazard_type="TC", intensity_unit="m/s" + ) + self._assert_default_types(hazard) + self.assertEqual(hazard.tag.haz_type, "TC") + self.assertEqual(hazard.units, "m/s") + + def test_data_vars(self): + """Check handling of data variables""" + dataset = xr.open_dataset(self.netcdf_path) + size = dataset.sizes["time"] + + # Set optionals in the dataset + frequency = np.ones(size) * 1.5 + event_id = np.array(range(size), dtype=np.int64) + 3 + event_name = ["bla"] * size + date = np.array(range(size)) + 100 + dataset["event_id"] = event_id + dataset["event_name"] = event_name + dataset["date"] = date + + # Assign a proper coordinate for a change + dataset = dataset.assign_coords(dict(frequency=("time", frequency))) + + # Assign fraction + frac = xr.DataArray( + np.array([[[0, 0, 0], [0, 0, 0]], [[1, 1, 1], [1, 1, 1]]]), + dims=["time", "latitude", "longitude"], + coords=dict( + time=self.time, latitude=self.latitude, longitude=self.longitude + ), + ) + dataset["fraction"] = frac + + # Optionals should be read automatically + hazard = Hazard.from_raster_xarray(dataset, "", "") + self._assert_default_types(hazard) + np.testing.assert_array_equal(hazard.frequency, frequency) + np.testing.assert_array_equal(hazard.event_id, event_id) + np.testing.assert_array_equal(hazard.event_name, event_name) + np.testing.assert_array_equal(hazard.date, date) + np.testing.assert_array_equal( + hazard.fraction.toarray(), [[0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1]] + ) + + # Ignore keys (should be default values) + hazard = Hazard.from_raster_xarray( + dataset, + "", + "", + data_vars=dict( + frequency="", event_id="", event_name="", date="", fraction="" + ), + ) + self._assert_default(hazard) + + # Wrong key + with self.assertRaises(ValueError) as cm: + Hazard.from_raster_xarray( + dataset, "", "", data_vars=dict(wrong_key="stuff") + ) + self.assertIn( + "Unknown data variables passed: '['wrong_key']'.", str(cm.exception) + ) + + # Non-existent identifier + with self.assertRaises(KeyError) as cm: + Hazard.from_raster_xarray( + dataset, "", "", data_vars=dict(frequency="freqqqqq") + ) + self.assertIn("freqqqqq", str(cm.exception)) + + # Wrong data length + # NOTE: This also implicitly checks that 'frequency' is not read! + dataset["freq"] = np.array(range(size + 1), dtype=np.float64) + with self.assertRaises(RuntimeError) as cm: + Hazard.from_raster_xarray(dataset, "", "", data_vars=dict(frequency="freq")) + self.assertIn( + f"'freq' must have shape ({size},), but shape is ({size + 1},)", + str(cm.exception), + ) + + # Integer data assertions + for key in ("event_id", "date"): + dset = dataset.copy(deep=True) + dset[key] = np.array(range(size), dtype=np.float64) + 3.5 + with self.assertRaises(TypeError) as cm: + Hazard.from_raster_xarray(dset, "", "") + self.assertIn(f"'{key}' data array must be integers", str(cm.exception)) + dset[key] = np.linspace(0, 10, size, dtype=np.int64) + with self.assertRaises(ValueError) as cm: + Hazard.from_raster_xarray(dset, "", "") + self.assertIn(f"'{key}' data must be larger than zero", str(cm.exception)) + + def test_nan(self): + """Check handling of NaNs in original data""" + dataset = xr.open_dataset(self.netcdf_path) + intensity = xr.DataArray( + np.array([[[0, np.nan, 2], [3, 4, 5]], [[6, np.nan, 8], [9, 10, 11]]]), + dims=["time", "latitude", "longitude"], + coords=dict( + time=self.time, latitude=self.latitude, longitude=self.longitude + ), + ) + dataset["intensity"] = intensity + fraction = xr.DataArray( + np.array([[[0, 0, 0], [0, 0, 0]], [[1, np.nan, 1], [np.nan, 1, 1]]]), + dims=["time", "latitude", "longitude"], + coords=dict( + time=self.time, latitude=self.latitude, longitude=self.longitude + ), + ) + dataset["fraction"] = fraction + frequency = np.ones(dataset.sizes["time"]) + frequency[0] = np.nan + dataset["frequency"] = frequency + + # Load hazard + hazard = Hazard.from_raster_xarray(dataset, "", "") + self._assert_default_types(hazard) + + # NaNs are set to zero in sparse data + np.testing.assert_array_equal( + hazard.intensity.toarray(), + [[0, 0, 2, 3, 4, 5], [6, 0, 8, 9, 10, 11]], + ) + np.testing.assert_array_equal( + hazard.fraction.toarray(), + [[0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 1, 1]], + ) + + # NaNs are propagated in dense data + np.testing.assert_array_equal(hazard.frequency, frequency) + + def test_crs(self): + """Check if different CRS inputs are handled correctly""" + + def test_crs_from_input(crs_input): + crs = CRS.from_user_input(crs_input) + hazard = Hazard.from_raster_xarray(self.netcdf_path, "", "", crs=crs_input) + self.assertEqual(hazard.centroids.geometry.crs, crs) + + test_crs_from_input("EPSG:3857") + test_crs_from_input(3857) + test_crs_from_input("+proj=cea +lat_0=52.112866 +lon_0=5.150162 +units=m") + + def test_missing_dims(self): + """Test if missing coordinates are expanded and correct errors are thrown""" + # Drop time as dimension, but not as coordinate! + ds = xr.open_dataset(self.netcdf_path).isel(time=0).squeeze() + hazard = Hazard.from_raster_xarray(ds, "", "") + self._assert_default_types(hazard) + np.testing.assert_array_equal(hazard.event_name, [np.datetime64(self.time[0])]) + np.testing.assert_array_equal(hazard.date, [self.time[0].toordinal()]) + np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1]) + np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2]) + self.assertEqual(hazard.centroids.geometry.crs, DEF_CRS) + np.testing.assert_array_equal(hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5]]) + np.testing.assert_array_equal(hazard.fraction.toarray(), [[0, 1, 1, 1, 1, 1]]) + + # Now drop variable altogether, should raise an error + ds = ds.drop_vars("time") + with self.assertRaises(RuntimeError) as cm: + Hazard.from_raster_xarray(ds, "", "") + self.assertIn( + "Dataset is missing dimension/coordinate: time", str(cm.exception) + ) + + # Expand time again + ds = ds.expand_dims(time=[np.datetime64("2022-01-01")]) + hazard = Hazard.from_raster_xarray(ds, "", "") + self._assert_default_types(hazard) + np.testing.assert_array_equal(hazard.event_name, [np.datetime64("2022-01-01")]) + np.testing.assert_array_equal( + hazard.date, [dt.datetime(2022, 1, 1).toordinal()] + ) + + +class TestReadDimsCoordsNetCDF(unittest.TestCase): + """Checks for dimensions and coordinates with different names and shapes""" + + def setUp(self): + """Write a NetCDF file with many coordinates""" + self.netcdf_path = Path.cwd() / "coords.nc" + self.intensity = np.array([[[0, 1, 2], [3, 4, 5]]]) + self.fraction = np.array([[[0, 0, 0], [1, 1, 1]]]) + self.time = np.array([dt.datetime(2000, 1, 1)]) + self.x = np.array([0, 1, 2]) + self.y = np.array([0, 1]) + self.lon = np.array([1, 2, 3]) + self.lat = np.array([1, 2]) + self.years = np.array([dt.datetime(1999, 2, 2)]) + self.longitude = np.array([[10, 11, 12], [10, 11, 12]]) + self.latitude = np.array([[100, 100, 100], [200, 200, 200]]) + + dset = xr.Dataset( + { + "intensity": (["time", "y", "x"], self.intensity), + "fraction": (["time", "y", "x"], self.fraction), + }, + { + "time": self.time, + "x": self.x, + "y": self.y, + "lon": (["x"], self.lon), + "lat": (["y"], self.lat), + "years": (["time"], self.years), + "latitude": (["y", "x"], self.latitude), + "longitude": (["y", "x"], self.longitude), + }, + ) + dset.to_netcdf(self.netcdf_path) + + def tearDown(self): + """Delete the NetCDF file""" + self.netcdf_path.unlink() + + def _assert_intensity_fraction(self, hazard): + """Check if intensity and fraction data are read correctly""" + np.testing.assert_array_equal(hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5]]) + np.testing.assert_array_equal(hazard.fraction.toarray(), [[0, 0, 0, 1, 1, 1]]) + + def test_dimension_naming(self): + """Test if dimensions with different names can be read""" + hazard = Hazard.from_raster_xarray( + self.netcdf_path, + "", + "", + coordinate_vars=dict(latitude="y", longitude="x"), # 'time' stays default + ) + np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1]) + np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2]) + np.testing.assert_array_equal( + hazard.date, [val.toordinal() for val in self.time] + ) + self._assert_intensity_fraction(hazard) + + def test_coordinate_naming(self): + """Test if coordinates with different names than dimensions can be read""" + hazard = Hazard.from_raster_xarray( + self.netcdf_path, + "", + "", + coordinate_vars=dict(latitude="lat", longitude="lon", event="years"), + ) + np.testing.assert_array_equal(hazard.centroids.lat, [1, 1, 1, 2, 2, 2]) + np.testing.assert_array_equal(hazard.centroids.lon, [1, 2, 3, 1, 2, 3]) + np.testing.assert_array_equal( + hazard.date, [val.toordinal() for val in self.years] + ) + self._assert_intensity_fraction(hazard) + + def test_2D_coordinates(self): + """Test if read method correctly handles 2D coordinates""" + hazard = Hazard.from_raster_xarray( + self.netcdf_path, + "", + "", + coordinate_vars=dict(latitude="latitude", longitude="longitude"), + ) + np.testing.assert_array_equal( + hazard.centroids.lat, [100, 100, 100, 200, 200, 200] + ) + np.testing.assert_array_equal(hazard.centroids.lon, [10, 11, 12, 10, 11, 12]) + self._assert_intensity_fraction(hazard) + + def test_2D_time(self): + """Test if stacking multiple time dimensions works out""" + time = np.array( + [ + [dt.datetime(1999, 1, 1), dt.datetime(1999, 2, 1)], + [dt.datetime(2000, 1, 1), dt.datetime(2000, 2, 1)], + ] + ) + ds = xr.Dataset( + { + "intensity": ( + ["year", "month", "latitude", "longitude"], + [[[[1]], [[2]]], [[[3]], [[4]]]], + ), + "fraction": ( + ["year", "month", "latitude", "longitude"], + [[[[10]], [[20]]], [[[30]], [[40]]]], + ), + }, + { + "latitude": [1], + "longitude": [2], + "year": [1999, 2000], + "month": [1, 2], + "time": (["year", "month"], time), + }, + ) + hazard = Hazard.from_raster_xarray(ds, "", "") + + np.testing.assert_array_equal(hazard.intensity.toarray(), [[1], [2], [3], [4]]) + np.testing.assert_array_equal( + hazard.fraction.toarray(), [[10], [20], [30], [40]] + ) + np.testing.assert_array_equal(hazard.centroids.lat, [1]) + np.testing.assert_array_equal(hazard.centroids.lon, [2]) + np.testing.assert_array_equal( + hazard.date, [val.toordinal() for val in time.flat] + ) + + def test_errors(self): + """Check if expected errors are thrown""" + # Wrong coordinate key + with self.assertRaises(ValueError) as cm: + Hazard.from_raster_xarray( + self.netcdf_path, + "", + "", + coordinate_vars=dict(bar="latitude", longitude="baz"), + ) + self.assertIn("Unknown coordinates passed: '['bar']'.", str(cm.exception)) + + # Correctly specified, but the custom dimension does not exist + with self.assertRaises(RuntimeError) as cm: + Hazard.from_raster_xarray( + self.netcdf_path, + "", + "", + coordinate_vars=dict(latitude="lalalatitude"), + ) + self.assertIn( + "Dataset is missing dimension/coordinate: lalalatitude.", str(cm.exception) + ) + + +# Execute Tests +if __name__ == "__main__": + TESTS = unittest.TestLoader().loadTestsFromTestCase(TestReadDefaultNetCDF) + TESTS.addTests( + unittest.TestLoader().loadTestsFromTestCase(TestReadDimsCoordsNetCDF) + ) + unittest.TextTestRunner(verbosity=2).run(TESTS)