Ensure csr matrices are in "canonical format" before impact calculation #893

Merged · 15 commits · Jul 17, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -23,6 +23,7 @@ CLIMADA tutorials. [#872](https://github.com/CLIMADA-project/climada_python/pull
- Centroids complete overhaul. Most functions should be backward compatible. Internal data is stored in a geodataframe attribute. Rasters are now stored as points, and the meta attribute is removed. Several methods were deprecated or removed. [#787](https://github.com/CLIMADA-project/climada_python/pull/787)
- Improved error messages produced by `ImpactCalc.impact()` in case impact function in the exposures is not found in impf_set [#863](https://github.com/CLIMADA-project/climada_python/pull/863)
- Changed module structure: `climada.hazard.Hazard` has been split into the modules `base`, `io` and `plot` [#871](https://github.com/CLIMADA-project/climada_python/pull/871)
- Ensure `csr_matrix` objects stored in `climada.hazard.Hazard` have a consistent data format and store no explicit zeros [#893](https://github.com/CLIMADA-project/climada_python/pull/893)

peanutfun marked this conversation as resolved.
### Fixed

2 changes: 2 additions & 0 deletions climada/engine/impact_calc.py
@@ -61,6 +61,8 @@ def __init__(self,
self.exposures = exposures
self.impfset = impfset
self.hazard = hazard
self.hazard.check_matrices()

# exposures index to use for matrix reconstruction
self._orig_exp_idx = np.arange(self.exposures.gdf.shape[0])

10 changes: 10 additions & 0 deletions climada/engine/test/test_impact_calc.py
@@ -70,6 +70,16 @@ def test_init(self):
np.testing.assert_array_equal(HAZ.event_id, icalc.hazard.event_id)
np.testing.assert_array_equal(HAZ.event_name, icalc.hazard.event_name)

# Test check matrices
Member:

Suggested change
# Test check matrices
# Check fraction and intensity have the same shape
# Check that explicit zeroes are pruned
I am a bit confused why the pruning is tested here...

Member Author:
I can also check via a mock if check_matrices is called, but I recall strong sentiments against such tests 😄

Member:
I think my confusion comes from the fact that I am uneasy with having the impact class changing the hazard objects. See comment below.

hazard = deepcopy(HAZ)
hazard.intensity[0, hazard.intensity.indices[0]] = 0
hazard.fraction = sparse.csr_matrix(np.ones((1, 1)))
with self.assertRaisesRegex(
ValueError, "Intensity and fraction matrices must have the same shape"
):
ImpactCalc(ENT.exposures, ENT.impact_funcs, hazard)
self.assertEqual(hazard.intensity.nnz, HAZ.intensity.nnz - 1) # was pruned

def test_metrics(self):
"""Test methods to get impact metrics"""
mat = sparse.csr_matrix(np.array(
68 changes: 66 additions & 2 deletions climada/hazard/base.py
@@ -44,11 +44,28 @@
LOGGER = logging.getLogger(__name__)


class Hazard(HazardIO, HazardPlot):

[Jenkins - WCR / Pylint check warning on line 47 in climada/hazard/base.py: too-many-public-methods — LOW: Too many public methods (23/20)]
"""
Contains events of some hazard type defined at centroids. Loads from
files with format defined in FILE_EXT.

Attention
---------
This class uses instances of
`scipy.sparse.csr_matrix
<https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html>`_
to store :py:attr:`intensity` and :py:attr:`fraction`. This data types comes with
its particular pitfalls. Depending on how the objects are instantiated and modified,
a matrix might end up in a "non-canonical" state. In this state, its ``.data``
attribute does not necessarily represent the values apparent in the final matrix.
In particular, a "non-canonical" matrix may store "duplicates", i.e. multiple values
that map to the same matrix position. This is supported, and the default behavior is
to sum up these values. To avoid any inconsistencies, call :py:meth:`check_matrices`
once you have inserted all data. This will explicitly sum all values at the same matrix
Member:
Do we want to recommend checking that if the class does check it anyway? I am a bit sceptical with these check_object methods that we have in climada. Imho these should be handled in the init and that is it.

Member Author (@peanutfun, Jun 21, 2024):

If people modify the matrix after the initialization or assignment, we have no way of ensuring consistency within Hazard alone. (We later call check_matrices from ImpactCalc for that exact reason)

Member:

hmmm... not yet fully convinced. If the user messes up an object, which the user always can in Python, then it is their problem. I would rather have fewer checks at random places in the code that hide bad habits and save people from doing things they should not do, and instead force good habits by having checks in the inits.

Member Author:

I generally agree that we should only check if necessary. However, the MDR calculation operates on the data attribute. It is therefore essential that canonical format is ensured before starting the impact calculation.

If you really want to boil it down to a minimum, I would suggest removing the pruning from the Hazard setters and instead only calling check_matrices from ImpactCalc. But this might increase confusion again, because consistency is never ensured before starting ImpactCalc.

Another thing to keep in mind is that sum_duplicates is O(N^2) for a non-canonical matrix and a no-op O(1) for a canonical matrix, while eliminate_zeros is O(N), where N is the number of stored non-zero entries.

Member:

Good points, and thanks for looking up the time cost of the methods.

Ok, let's try it like this. Maybe we can keep an eye on this change in the separate efforts to optimize computation times for ImpactCalc.

Member Author:

In this case, I'll revert the properties and make intensity and fraction proper attributes again.

position and eliminate explicit zeros. This class will call
:py:func:`climada.util.checker.prune_csr_matrix` whenever a csr_matrix is assigned
to one of the aforementioned attributes.

Attributes
----------
haz_type : str
@@ -90,8 +107,8 @@
'centroids',
'event_id',
'frequency',
'intensity',
'fraction'
'_intensity',
'_fraction'
}
"""Name of the variables needed to compute the impact. Types: scalar, str,
list, 1dim np.array of size num_events, scipy.sparse matrix of shape
@@ -186,11 +203,58 @@
np.empty((0, 0))) # events x centroids
self.fraction = fraction if fraction is not None else sparse.csr_matrix(
self.intensity.shape) # events x centroids
self.check_matrices()

self.pool = pool
if self.pool:
LOGGER.info('Using %s CPUs.', self.pool.ncpus)

@property
def intensity(self) -> sparse.csr_matrix:
"""Hazard intensity matrix"""
return self._intensity

@intensity.setter
def intensity(self, value: sparse.csr_matrix):
"""Set intensity matrix to new value"""
self._intensity = value
u_check.prune_csr_matrix(self._intensity)

@property
def fraction(self) -> sparse.csr_matrix:
"""Hazard fraction matrix"""
return self._fraction

@fraction.setter
def fraction(self, value: sparse.csr_matrix):
"""Set fraction matrix to new value"""
self._fraction = value
u_check.prune_csr_matrix(self._fraction)

def check_matrices(self):
"""Ensure that matrices are consistently shaped and stored

See Also
--------
:py:func:`climada.util.checker.prune_csr_matrix`

Todo
-----
* Check consistency with centroids
peanutfun marked this conversation as resolved.

Raises
------
ValueError
If matrices are ill-formed or ill-shaped in relation to each other
"""
u_check.prune_csr_matrix(self.intensity)
u_check.prune_csr_matrix(self.fraction)
if self.fraction.nnz > 0:
if self.intensity.shape != self.fraction.shape:
raise ValueError(
"Intensity and fraction matrices must have the same shape"
)

@classmethod
def get_default(cls, attribute):
"""Get the Hazard type default for a given attribute.
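The "canonical format" behavior that the docstring above describes can be reproduced directly with scipy. The following sketch (illustrative only, not part of the PR's diff) builds a matrix with duplicate entries and an explicit zero, then applies the same two scipy calls the PR relies on:

```python
import numpy as np
from scipy import sparse

# Three data points map to position (0, 0); the fourth is an explicit zero
# at (0, 1). This is a valid but "non-canonical" csr_matrix.
data = [0, 1, 2, 0]
indices = [0, 0, 0, 1]
indptr = [0, 4, 4, 4]
matrix = sparse.csr_matrix((data, indices, indptr), shape=(3, 2))

assert not matrix.has_canonical_format
assert matrix.nnz == 4  # nnz counts stored entries, duplicates and zeros included

matrix.eliminate_zeros()   # drop explicitly stored zeros
matrix.sum_duplicates()    # sum entries sharing a position; marks matrix canonical

assert matrix.has_canonical_format
assert matrix.nnz == 1
assert matrix[0, 0] == 3   # the apparent value was 0 + 1 + 2 all along
```

Note that the apparent matrix values never change: the duplicates were always summed when reading, which is why only code that touches `.data` directly (such as the MDR calculation mentioned in the review thread) is affected.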
21 changes: 17 additions & 4 deletions climada/hazard/io.py
@@ -1004,12 +1004,18 @@ def write_hdf5(self, file_name, todense=False):
LOGGER.info('Writing %s', file_name)
with h5py.File(file_name, 'w') as hf_data:
str_dt = h5py.special_dtype(vlen=str)
for (var_name, var_val) in self.__dict__.items():
for var_name in self.__dict__:
if var_name == 'centroids':
# Centroids have their own write_hdf5 method,
# which is invoked at the end of this method (s.b.)
pass
elif isinstance(var_val, sparse.csr_matrix):
continue
# Prune private attributes
if var_name in self.vars_oblig:
var_name = var_name.lstrip("_")

var_val = getattr(self, var_name) # Also works with properties

if isinstance(var_val, sparse.csr_matrix):
if todense:
hf_data.create_dataset(var_name, data=var_val.toarray())
else:
@@ -1065,11 +1071,18 @@ def from_hdf5(cls, file_name):
haz = cls()
hazard_kwargs = dict()
with h5py.File(file_name, 'r') as hf_data:
for (var_name, var_val) in haz.__dict__.items():
for var_name in haz.__dict__:
# Prune private attributes
if var_name in haz.vars_oblig:
var_name = var_name.lstrip("_")

if var_name not in hf_data.keys():
continue
if var_name == 'centroids':
continue

var_val = getattr(haz, var_name) # Also works with properties

if isinstance(var_val, np.ndarray) and var_val.ndim == 1:
hazard_kwargs[var_name] = np.array(hf_data.get(var_name))
elif isinstance(var_val, sparse.csr_matrix):
85 changes: 79 additions & 6 deletions climada/hazard/test/test_base.py
@@ -124,18 +124,18 @@ def test_check_wrongFreq_fail(self):
def test_check_wrongInten_fail(self):
"""Wrong hazard definition"""
self.hazard.intensity = sparse.csr_matrix([[1, 2], [1, 2]])

with self.assertRaises(ValueError) as cm:
with self.assertRaisesRegex(
ValueError, "Invalid Hazard._intensity row size: 3 != 2."
):
self.hazard.check()
self.assertIn('Invalid Hazard.intensity row size: 3 != 2.', str(cm.exception))

def test_check_wrongFrac_fail(self):
"""Wrong hazard definition"""
self.hazard.fraction = sparse.csr_matrix([[1], [1], [1]])

with self.assertRaises(ValueError) as cm:
with self.assertRaisesRegex(
ValueError, "Invalid Hazard._fraction column size: 2 != 1."
):
self.hazard.check()
self.assertIn('Invalid Hazard.fraction column size: 2 != 1.', str(cm.exception))

def test_check_wrongEvName_fail(self):
"""Wrong hazard definition"""
@@ -212,6 +212,79 @@ def test_get_date_strings_pass(self):
self.assertEqual(haz.get_event_date()[560],
u_dt.date_to_str(haz.date[560]))

def test_matrix_update(self):
"""Check that the csr_matrix can be updated with element access"""
self.hazard.intensity[[0, 2], 1] = 10
np.testing.assert_array_equal(
self.hazard.intensity.toarray(), [[1, 10], [1, 2], [1, 10]]
)
self.hazard.fraction[:, 1] = 0
np.testing.assert_array_equal(
self.hazard.fraction.toarray(), [[1, 0], [1, 0], [1, 0]]
)

def test_matrix_consistency(self):
"""Check that the csr_matrix is brought into canonical format"""
# Non-canonical: First three data points will be summed onto the first matrix
# entry, the fourth will be an explicit zero entry
data = [0, 1, 2, 0]
indices = [0, 0, 0, 1]
indptr = [0, 4, 4, 4]
matrix = sparse.csr_matrix((data, indices, indptr), shape=(3, 2))

def check_canonical_matrix(mat):
self.assertTrue(mat.has_canonical_format)
self.assertEqual(mat.nnz, 1)

# Check canonical format when initializing
hazard_new = Hazard("TC", intensity=matrix.copy(), fraction=matrix.copy())
matrix_attrs = ("intensity", "fraction")
for attr in matrix_attrs:
with self.subTest(matrix=attr):
check_canonical_matrix(getattr(hazard_new, attr))

# Check conversion to canonical format when assigning
for attr in matrix_attrs:
with self.subTest(matrix=attr):
setattr(self.hazard, attr, matrix.copy())
check_canonical_matrix(getattr(self.hazard, attr))

def test_check_matrices(self):
"""Test the check_matrices method"""
# Check shapes
with self.assertRaisesRegex(
ValueError, "Intensity and fraction matrices must have the same shape"
):
Hazard(
"TC",
intensity=sparse.csr_matrix(np.ones((2, 2))),
fraction=sparse.csr_matrix(np.ones((2, 3))),
)

hazard = Hazard("TC")
hazard.fraction = sparse.csr_matrix(np.zeros((2, 2)))
hazard.check_matrices() # No error, fraction.nnz = 0
hazard.fraction = sparse.csr_matrix(np.ones((2, 2)))
with self.assertRaisesRegex(
ValueError, "Intensity and fraction matrices must have the same shape"
):
hazard.check_matrices()
hazard.intensity = sparse.csr_matrix(np.ones((2, 3)))
with self.assertRaisesRegex(
ValueError, "Intensity and fraction matrices must have the same shape"
):
hazard.check_matrices()

# Check that matrices are pruned
hazard.intensity[:] = 0
hazard.fraction = sparse.csr_matrix(([0], [0], [0, 1, 1]), shape=(2, 3))
hazard.check_matrices()
for attr in ("intensity", "fraction"):
with self.subTest(matrix=attr):
matrix = getattr(hazard, attr)
self.assertEqual(matrix.nnz, 0)
self.assertTrue(matrix.has_canonical_format)

class TestRemoveDupl(unittest.TestCase):
"""Test remove_duplicates method."""

36 changes: 35 additions & 1 deletion climada/util/checker.py
@@ -23,7 +23,8 @@
'size',
'shape',
'array_optional',
'array_default'
'array_default',
'prune_csr_matrix',
]

import logging
@@ -162,3 +163,36 @@ def array_default(exp_len, var, var_name, def_val):
else:
size(exp_len, var, var_name)
return res

def prune_csr_matrix(matrix: sparse.csr_matrix):
"""Ensure that the matrix is in the "canonical format".

Depending on how the matrix was instantiated or modified, it might be in a
"non-canonical" state. This only relates to its internal storage. In this state,
multiple values might be stored for a single "apparent" value in the matrix.
Also, the matrix might store zeros explicitly, which could be removed.
Calling this function verifies the matrix format and brings the matrix into the
"canonical state", if possible.

See Also
--------
`csr_matrix.has_canonical_format
<https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.has_canonical_format.html#scipy.sparse.csr_matrix.has_canonical_format>`_

Parameters
----------
matrix : csr_matrix
The matrix to check. It will be modified *inplace*. No apparent matrix values
will change.
peanutfun marked this conversation as resolved.

Raises
------
ValueError
If
`csr_matrix.check_format
<https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.check_format.html#scipy.sparse.csr_matrix.check_format>`_
fails
"""
matrix.check_format()
matrix.eliminate_zeros()
matrix.sum_duplicates()
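A minimal sketch of what the three scipy calls used by `prune_csr_matrix` do, including the `ValueError` documented in the Raises section (illustrative only; the `bad` matrix's `indptr` is deliberately corrupted to trigger the error):

```python
import numpy as np
from scipy import sparse

# A well-formed matrix with one explicitly stored zero at (0, 1)
mat = sparse.csr_matrix(([1.0, 0.0], [0, 1], [0, 2, 2]), shape=(2, 3))
mat.check_format()     # passes: index arrays are internally consistent
mat.eliminate_zeros()  # drops the stored 0.0
mat.sum_duplicates()   # no duplicates here, but guarantees canonical format
assert mat.nnz == 1
assert mat.has_canonical_format

# check_format raises ValueError when the internal arrays are inconsistent:
# here indptr claims 5 stored entries while data/indices are empty
bad = sparse.csr_matrix((3, 3))
bad.indptr = np.array([0, 5, 5, 5])
raised = False
try:
    bad.check_format()
except ValueError:
    raised = True
assert raised
```

Since `eliminate_zeros` runs before `sum_duplicates`, duplicates that sum to zero are not removed in a single pass; for the matrices produced by Hazard IO this appears to be acceptable, as a second call would prune them.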
22 changes: 22 additions & 0 deletions climada/util/test/test_checker.py
@@ -87,6 +87,28 @@ def test_check_optionals_fail(self):
u_check.check_optionals(dummy.__dict__, dummy.vars_opt, "DummyClass.", dummy.id.size)
self.assertIn('Invalid DummyClass.list size: 25 != 3.', str(cm.exception))

def test_prune_csr_matrix(self):
"""Check that csr matrices are brought into canonical format"""
# Non-canonical: First three data points will be summed onto the first matrix
# entry, the fourth will be an explicit zero entry
peanutfun marked this conversation as resolved.
data = [0, 1, 2, 0]
indices = [0, 0, 0, 1]
indptr = [0, 4, 4, 4]
matrix = sparse.csr_matrix((data, indices, indptr), shape=(3, 2))

# These checks just make sure that we understand how csr_matrix works
np.testing.assert_array_equal(matrix.data, data)
np.testing.assert_array_equal(matrix[0, [0, 1]].toarray(), [[3, 0]])
self.assertEqual(matrix.nnz, 4)
self.assertFalse(matrix.has_canonical_format)

# Now test our function
u_check.prune_csr_matrix(matrix)
self.assertTrue(matrix.has_canonical_format)
self.assertEqual(matrix[0, 0], 3)
np.testing.assert_array_equal(matrix.data, [3])
self.assertEqual(matrix.nnz, 1)

# Execute Tests
if __name__ == "__main__":
TESTS = unittest.TestLoader().loadTestsFromTestCase(TestChecks)