Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Hazard classmethod for loading xarray Datasets #507

Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
4a9a520
Add Hazard classmethod for loading NetCDF file
peanutfun Jul 5, 2022
755f11e
Only display unknown coordinates in `from_raster_netcdf` error message
peanutfun Jul 6, 2022
525394b
Use CLIMADA utilities for converting datetime64
peanutfun Jul 6, 2022
17564d5
Read Hazard from datasets with varying dim/coord definitions
peanutfun Jul 6, 2022
cbc1dc5
Avoid redefining built-in `map`
peanutfun Jul 6, 2022
d90ca87
Add log messages to `Hazard.from_raster_netcdf`
peanutfun Jul 6, 2022
939968b
Extract dimension names from coordinates
peanutfun Jul 7, 2022
b110a94
Use lazy formatting for logger messages
peanutfun Jul 7, 2022
4283cf6
Merge branch 'develop' into 487-add-classmethod-to-hazard-for-reading…
peanutfun Jul 11, 2022
ab40d1a
Reorganize test_base_netcdf.py
peanutfun Jul 15, 2022
2a4283d
Consolidate tests for Hazard.intensity
peanutfun Jul 15, 2022
dac5dd5
Rename from_raster_netcdf to from_raster_xarray
peanutfun Jul 15, 2022
026a33f
Make `from_raster_xarray` read all Hazard data
peanutfun Jul 27, 2022
528b99e
Add more logger messages and update docstring
peanutfun Jul 27, 2022
d96a278
Remove 'fraction' parameter from 'from_raster_xarray'
peanutfun Jul 27, 2022
f817bca
Add examples for `from_raster_xarray`
peanutfun Jul 27, 2022
fb94fc6
Fix type hints in `from_raster_xarray`
peanutfun Jul 27, 2022
bda69a6
Apply suggestions from code review
peanutfun Jul 28, 2022
9c68a9e
Improve docstrings and simplify `from_raster_xarray`
peanutfun Jul 28, 2022
87033ea
Simplify `user_key` update
peanutfun Jul 28, 2022
abc9d32
Optimize storage in CSR matrix by setting NaNs to zero
peanutfun Jul 28, 2022
29b74f3
Make hazard type and unit required arguments
peanutfun Jul 29, 2022
7678d1f
Preprocess dict arguments to save indentation level
peanutfun Jul 29, 2022
c0fb24b
Let `to_csr_matrix` only take ndarrays as arguments
peanutfun Jul 29, 2022
b8c46a3
Do not hard-code coordinate keys
peanutfun Aug 3, 2022
7a9ea42
Remove superfluous newline in docstring
peanutfun Aug 3, 2022
b2eea56
Update docstring of `from_raster_xarray`
peanutfun Aug 4, 2022
1be9772
Update `Hazard.from_raster_xarray`
peanutfun Aug 5, 2022
ccec87f
Add CRS argument
peanutfun Aug 8, 2022
c19f9ea
Merge branch 'develop' into 487-add-classmethod-to-hazard-for-reading…
peanutfun Sep 29, 2022
07e8bf7
Fix bug where wrong Hazard attribute was set
peanutfun Oct 5, 2022
84b0795
Add test for crs parameter in from_raster_xarray
peanutfun Oct 6, 2022
a72112c
Update docstring of from_raster_xarray
peanutfun Oct 6, 2022
d777785
Add 'Test' prefix for test cases in test_base_xarray.py
peanutfun Oct 6, 2022
d36bcc2
Format test_base_xarray
peanutfun Oct 6, 2022
68c1459
Fix comment in from_raster_xarray
peanutfun Oct 6, 2022
2cffc9a
Merge branch 'develop' into 487-add-classmethod-to-hazard-for-reading…
emanuel-schmid Oct 7, 2022
1ff075d
Promote single-valued coordinates to dimensions
peanutfun Oct 14, 2022
1eea94c
Merge branch '487-add-classmethod-to-hazard-for-reading-raster-like-d…
peanutfun Oct 14, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions climada/hazard/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import logging
import pathlib
import warnings
from typing import Union, Optional, Callable, Dict

import geopandas as gpd
import h5py
Expand All @@ -38,6 +39,7 @@
from rasterio.features import rasterize
from rasterio.warp import reproject, Resampling, calculate_default_transform
from scipy import sparse
import xarray as xr

from climada.hazard.tag import Tag as TagHazard
from climada.hazard.centroids.centr import Centroids
Expand Down Expand Up @@ -369,6 +371,120 @@ def set_vector(self, *args, **kwargs):
"Use Hazard.from_vector instead.")
self.__dict__ = Hazard.from_vector(*args, **kwargs).__dict__

@classmethod
def from_raster_netcdf(
cls,
data: Union[xr.Dataset, str],
*,
intensity: str = "intensity",
fraction: Union[str, Callable[[np.ndarray], np.ndarray]] = "fraction",
dimension_vars: Optional[Dict[str, str]] = None,
coordinate_vars: Optional[Dict[str, str]] = None,
):
"""Read raster-like data from a NetCDF file

Parameters
----------
data : xarray.Dataset or str
The NetCDF data to read from. May be a opened dataset or a path to a NetCDF
file, in which case the file is opened first.
intensity : str
Identifier of the `xarray.DataArray` containing the hazard intensity data.
fraction : str or Callable
peanutfun marked this conversation as resolved.
Show resolved Hide resolved
Identifier of the `xarray.DataArray` containing the hazard fraction data.
May be a callable, in which case the callable is applied on the respective
intensity value to yield a fraction.
dimension_vars : dict(str, str)
Mapping from default dimension names to dimension names used in the data
to read. The default dimensions are `time`, `longitude`, and `latitude`.
coordinate_vars : dict(str, str)
Mapping from default coordinate names to coordinate names used in the
data to read. The default coordinates are the settings for `time`,
`longitude`, and `latitude` in the `dimension_vars`, which themselves default
to `time`, `longitude`, and `latitude`. See Notes for details.

Notes
-----
This method is able to read *coordinates* that differ from the *dimensions* of
the evaluated dataset. If you are unfamiliar with the `xarray` nomenclature,
please consult https://docs.xarray.dev/en/stable/user-guide/terminology.html.
By default, it is assumed that the dimensions and coordinates of the dataset
are called `time`, `longitude`, and `latitude`, and the data is interpreted
accordingly. If the dimensions of the dataset are named differently, you may
specify the respective names via the `dimension_vars` mapping. If the dimension
names differ from the names of the coordinates you want to read as information
for time, latitude, and longitude, you may specify the coordinate names via the
`coordinate_vars` mapping. If the latter is not given, it is assumed that the
coordinates have the same names as the dimensions.

Returns
-------
hazard : climada.Hazard
A hazard object created from the input data

"""
# If the data is a string, open the respective file
if not isinstance(data, xr.Dataset):
data = xr.open_dataset(data)
hazard = cls() # TODO: Hazard type

def update_without_addition(map: dict, map_update: dict, error_keyword: str):
"""Update an existing map with another map

Throw an error if the other map contains keys that are not present in the
original map.
"""
unknown_keys = [key for key in map_update if key not in map]
if unknown_keys:
raise ValueError(
f"Unknown {error_keyword} passed: '{unknown_keys}'. Supported "
f"{error_keyword} are: '{list(map.keys())}'."
)
map.update(map_update)

# Update dimension identifiers
dims = {"time": "time", "longitude": "longitude", "latitude": "latitude"}
if dimension_vars is not None:
update_without_addition(dims, dimension_vars, "dimensions")

# Stack (vectorize) the entire dataset along the spatial dimensions
data = data.stack(lat_lon=[dims["latitude"], dims["longitude"]])

# Transform coordinates into centroids
# NOTE: We might use different coordinates here than the dimensions defining the
# dataset shape
coords = copy.deepcopy(dims)
if coordinate_vars is not None:
update_without_addition(coords, coordinate_vars, "coordinates")
hazard.centroids = Centroids.from_lat_lon(
data[coords["latitude"]].values, data[coords["longitude"]].values
)

# Read the intensity data and flatten it in spatial dimensions
hazard.intensity = sparse.csr_matrix(data[intensity])
hazard.intensity.eliminate_zeros()

# Use fraction data or apply callable
if isinstance(fraction, str):
fraction_arr = data[fraction]
elif isinstance(fraction, Callable):
fraction_arr = xr.apply_ufunc(fraction, data[intensity])
else:
raise TypeError("'fraction' parameter must be 'str' or Callable")
hazard.fraction = sparse.csr_matrix(fraction_arr)
hazard.fraction.eliminate_zeros()

# Fill hazard with required information
num_events = data[coords["time"]].size
hazard.event_id = np.array(range(num_events)) + 1 # event_id starts at 1
hazard.frequency = np.ones(num_events) # TODO: Optional read from file
hazard.event_name = list(data[coords["time"]].values)
peanutfun marked this conversation as resolved.
Show resolved Hide resolved
hazard.date = np.array(u_dt.datetime64_to_ordinal(data[coords["time"]].values))
# TODO: hazard.unit

# Done!
return hazard

@classmethod
def from_vector(cls, files_intensity, files_fraction=None, attrs=None,
inten_name=None, frac_name=None, dst_crs=None, haz_type=None):
Expand Down
261 changes: 261 additions & 0 deletions climada/hazard/test/test_base_netcdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
"""
This file is part of CLIMADA.

Copyright (C) 2022 ETH Zurich, CLIMADA contributors listed in AUTHORS.

CLIMADA is free software: you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free
Software Foundation, version 3.

CLIMADA is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with CLIMADA. If not, see <https://www.gnu.org/licenses/>.

---

Test NetCDF reading capabilities of Hazard base class.
"""

import os
import unittest
import datetime as dt
import numpy as np
from scipy.sparse import csr_matrix

import xarray as xr

from climada.hazard.base import Hazard

THIS_DIR = os.path.dirname(__file__)


class ReadDimsCoordsNetCDF(unittest.TestCase):
"""Checks for dimensions and coordinates with different names and shapes"""

def setUp(self):
"""Write a NetCDF file with many coordinates"""
self.netcdf_path = os.path.join(THIS_DIR, "coords.nc")
self.intensity = np.array([[[0, 1, 2], [3, 4, 5]]])
self.fraction = np.array([[[0, 0, 0], [1, 1, 1]]])
self.time = np.array([dt.datetime(2000, 1, 1)])
self.x = np.array([0, 1, 2])
self.y = np.array([0, 1])
self.lon = np.array([1, 2, 3])
self.lat = np.array([1, 2])
self.years = np.array([dt.datetime(1999, 2, 2)])
self.longitude = np.array([[10, 11, 12], [10, 11, 12]])
self.latitude = np.array([[100, 100, 100], [200, 200, 200]])

dset = xr.Dataset(
{
"intensity": (["time", "y", "x"], self.intensity),
"fraction": (["time", "y", "x"], self.fraction),
},
{
"time": self.time,
"x": self.x,
"y": self.y,
"lon": (["x"], self.lon),
"lat": (["y"], self.lat),
"years": (["time"], self.years),
"latitude": (["y", "x"], self.latitude),
"longitude": (["y", "x"], self.longitude),
},
)
dset.to_netcdf(self.netcdf_path)

def tearDown(self):
"""Delete the NetCDF file"""
os.remove(self.netcdf_path)

def _assert_intensity_fraction(self, hazard):
"""Check if intensity and fraction data are read correctly"""
np.testing.assert_array_equal(hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5]])
np.testing.assert_array_equal(hazard.fraction.toarray(), [[0, 0, 0, 1, 1, 1]])

def test_dimension_naming(self):
"""Test if dimensions with different names can be read"""
hazard = Hazard.from_raster_netcdf(
self.netcdf_path,
dimension_vars=dict(latitude="y", longitude="x"), # 'time' stays default
)
np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1])
np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2])
np.testing.assert_array_equal(
hazard.date, [val.toordinal() for val in self.time]
)
self._assert_intensity_fraction(hazard)

def test_coordinate_naming(self):
"""Test if coordinates with different names than dimensions can be read"""
hazard = Hazard.from_raster_netcdf(
self.netcdf_path,
dimension_vars=dict(latitude="y", longitude="x"),
coordinate_vars=dict(latitude="lat", longitude="lon", time="years"),
)
peanutfun marked this conversation as resolved.
Show resolved Hide resolved
np.testing.assert_array_equal(hazard.centroids.lat, [1, 1, 1, 2, 2, 2])
np.testing.assert_array_equal(hazard.centroids.lon, [1, 2, 3, 1, 2, 3])
np.testing.assert_array_equal(
hazard.date, [val.toordinal() for val in self.years]
)
self._assert_intensity_fraction(hazard)

def test_2D_coordinates(self):
"""Test if read method correctly handles 2D coordinates"""
hazard = Hazard.from_raster_netcdf(
self.netcdf_path,
dimension_vars=dict(latitude="y", longitude="x"),
coordinate_vars=dict(latitude="latitude", longitude="longitude"),
)
np.testing.assert_array_equal(
hazard.centroids.lat, [100, 100, 100, 200, 200, 200]
)
np.testing.assert_array_equal(hazard.centroids.lon, [10, 11, 12, 10, 11, 12])
self._assert_intensity_fraction(hazard)

def test_errors(self):
"""Check if expected errors are thrown"""
# Default, should throw because default dimension names don't work out
with self.assertRaises(KeyError):
Hazard.from_raster_netcdf(self.netcdf_path)

# Wrong dimension key
with self.assertRaises(ValueError) as cm:
Hazard.from_raster_netcdf(
self.netcdf_path,
dimension_vars=dict(foo="latitude", longitude="longitude"),
)
self.assertIn("Unknown dimensions passed: '['foo']'.", str(cm.exception))

# Correctly specified, but the custom dimension does not exist
with self.assertRaises(KeyError):
Hazard.from_raster_netcdf(
self.netcdf_path, dimension_vars=dict(time="year")
)

# Wrong coordinate key
with self.assertRaises(ValueError) as cm:
Hazard.from_raster_netcdf(
self.netcdf_path,
dimension_vars=dict(latitude="y", longitude="x"),
coordinate_vars=dict(bar="latitude", longitude="baz"),
)
self.assertIn("Unknown coordinates passed: '['bar']'.", str(cm.exception))

# Correctly specified, but the custom dimension does not exist
with self.assertRaises(KeyError) as cm:
Hazard.from_raster_netcdf(
self.netcdf_path,
dimension_vars=dict(latitude="y", longitude="x"),
coordinate_vars=dict(latitude="lalalatitude"),
)


class ReadDefaultNetCDF(unittest.TestCase):
def setUp(self):
"""Write a simple NetCDF file to read"""
self.netcdf_path = os.path.join(THIS_DIR, "default.nc")
self.intensity = np.array([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]])
self.fraction = np.array([[[0, 0, 0], [0, 0, 0]], [[1, 1, 1], [1, 1, 1]]])
self.time = np.array([dt.datetime(1999, 1, 1), dt.datetime(2000, 1, 1)])
self.latitude = np.array([0, 1])
self.longitude = np.array([0, 1, 2])
dset = xr.Dataset(
{
"intensity": (["time", "latitude", "longitude"], self.intensity),
"fraction": (["time", "latitude", "longitude"], self.fraction),
},
dict(time=self.time, latitude=self.latitude, longitude=self.longitude),
)
dset.to_netcdf(self.netcdf_path)

def tearDown(self):
"""Delete the NetCDF file"""
os.remove(self.netcdf_path)

def _assert_default(self, hazard):
"""Assertions for the default hazard to be loaded"""
# Hazard data
self.assertEqual(hazard.tag.haz_type, "")
self.assertIsInstance(hazard.event_id, np.ndarray)
np.testing.assert_array_equal(hazard.event_id, [1, 2])
self.assertIsInstance(hazard.event_name, list)
np.testing.assert_array_equal(
hazard.event_name, [np.datetime64(val) for val in self.time]
)
self.assertIsInstance(hazard.date, np.ndarray)
np.testing.assert_array_equal(
hazard.date, [val.toordinal() for val in self.time]
)

# Centroids
self.assertEqual(hazard.centroids.lat.size, 6)
self.assertEqual(hazard.centroids.lon.size, 6)
np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1])
np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2])

# Intensity data
self.assertIsInstance(hazard.intensity, csr_matrix)
np.testing.assert_array_equal(hazard.intensity.shape, [2, 6])
np.testing.assert_array_equal(hazard.intensity.toarray()[0], [0, 1, 2, 3, 4, 5])
np.testing.assert_array_equal(
hazard.intensity.toarray()[1], [6, 7, 8, 9, 10, 11]
)

# Fraction data
self.assertIsInstance(hazard.fraction, csr_matrix)
np.testing.assert_array_equal(hazard.fraction.shape, [2, 6])
np.testing.assert_array_equal(hazard.fraction.toarray()[0], [0, 0, 0, 0, 0, 0])
np.testing.assert_array_equal(hazard.fraction.toarray()[1], [1, 1, 1, 1, 1, 1])

def test_load_path(self):
"""Load the data with path as argument"""
hazard = Hazard.from_raster_netcdf(self.netcdf_path)
self._assert_default(hazard)

def test_load_dataset(self):
"""Load the data from an opened dataset as argument"""
dataset = xr.open_dataset(self.netcdf_path)
hazard = Hazard.from_raster_netcdf(dataset)
self._assert_default(hazard)

def test_fraction_callable(self):
"""Test creating a fraction from a callable"""
hazard = Hazard.from_raster_netcdf(
self.netcdf_path, fraction=lambda x: np.where(x > 1, 1, 0)
)
self.assertIsInstance(hazard.fraction, csr_matrix)
np.testing.assert_array_equal(hazard.fraction.shape, [2, 6])
np.testing.assert_array_equal(hazard.fraction.toarray()[0], [0, 0, 1, 1, 1, 1])
np.testing.assert_array_equal(hazard.fraction.toarray()[1], [1, 1, 1, 1, 1, 1])

def test_errors(self):
"""Check the errors thrown"""
# TODO: Maybe move to 'test_load_path'
# Wrong paths
with self.assertRaises(FileNotFoundError):
Hazard.from_raster_netcdf("file-does-not-exist.nc")
with self.assertRaises(KeyError):
Hazard.from_raster_netcdf(
self.netcdf_path, intensity="wrong-intensity-path"
)
with self.assertRaises(KeyError):
Hazard.from_raster_netcdf(self.netcdf_path, fraction="wrong-fraction-path")

# TODO: Maybe move to 'test_fraction_callable'
# Wrong type passed as fraction
with self.assertRaises(TypeError) as cm:
Hazard.from_raster_netcdf(self.netcdf_path, fraction=3)
self.assertIn(
"'fraction' parameter must be 'str' or Callable", str(cm.exception)
)


# Execute Tests
if __name__ == "__main__":
TESTS = unittest.TestLoader().loadTestsFromTestCase(ReadDefaultNetCDF)
TESTS.addTests(unittest.TestLoader().loadTestsFromTestCase(ReadDimsCoordsNetCDF))
unittest.TextTestRunner(verbosity=2).run(TESTS)