Refactor hindcast prediction module (#341)

* Allows for multiple references to be passed into HindcastEnsemble.verify()
* Accounts for the uninitialized reference in alignment
* Sets up framework to speed up hindcast bootstrapping
pangeo-data · Mar 23, 2020 · 8be0a89 · 8be0a89
1 parent 5370e6b
commit 8be0a89
Show file tree

Hide file tree

Showing 23 changed files with 967 additions and 989 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -2,12 +2,16 @@
 What's New
 ==========
 
-climpred v2.0.1 (2020-01-##)
+climpred v2.1.0 (2020-04-##)
 ============================
 
 New Features
 ------------
 
+- Union with observations only enforced if ``reference='persistence'``. (:pr:`341`)
+  `Riley X. Brady`_.
+- ``HindcastEnsemble.verify()`` now takes ``reference=...`` keyword. (:pr:`341`)
+  `Riley X. Brady`_.
 - compute_perfect_model now accepts inits as cftime and integer. Implemented cftime
   into bootstrap_uninit function, which requires a Leap or NoLeap calendar.
   (:pr:`332`) `Aaron Spring`_.
@@ -35,6 +39,10 @@ New Features
       should be based on the same set of verification dates. (:pr:`331`)
       `Riley X. Brady`_.
 
+Bug Fixes
+---------
+- Alignment options now account for differences in the historical time series if
+  ``reference='historical'``. (:pr:`341`) `Riley X. Brady`_.
 
 Internals/Minor Fixes
 ---------------------

diff --git a/ci/environment-dev-3.6.yml b/ci/environment-dev-3.6.yml
@@ -33,6 +33,9 @@ dependencies:
   - isort
   - pre-commit
   - pylint
+  # pytest 5.4.0 or greater currently breaks pytest-sugar. This looks like it will be
+  # fixed soon, at which point we can revert to the newest pytest.
+  # https://github.com/Teemu/pytest-sugar/issues/187
   - pytest<5.4.0
   - pytest-cov
   - pytest-sugar

diff --git a/climpred/alignment.py b/climpred/alignment.py
@@ -1,3 +1,4 @@
+import numpy as np
 import xarray as xr
 
 from .checks import is_in_list
@@ -6,7 +7,7 @@
 from .utils import get_multiple_lead_cftime_shift_args, shift_cftime_index
 
 
-def return_inits_and_verif_dates(forecast, verif, alignment):
+def return_inits_and_verif_dates(forecast, verif, alignment, reference=None, hist=None):
     """Returns initializations and verification dates for an arbitrary number of leads
     per a given alignment strategy.
 
@@ -31,22 +32,36 @@ def return_inits_and_verif_dates(forecast, verif, alignment):
         verif_dates (dict): Keys are the lead time integer, values are an
             ``xr.CFTimeIndex`` of verification dates.
     """
+    if isinstance(reference, str):
+        reference = [reference]
+    elif reference is None:
+        reference = []
+
     is_in_list(alignment, VALID_ALIGNMENTS, 'alignment')
     units = forecast['lead'].attrs['units']
     leads = forecast['lead'].values
+
     # `init` renamed to `time` in compute functions.
     all_inits = forecast['time']
     all_verifs = verif['time']
-    union_with_verifs = all_inits.isin(all_verifs)
+
+    # If aligning historical reference, need to account for potential differences in its
+    # temporal coverage. Note that the historical reference only aligns verification
+    # dates and doesn't care about inits.
+    if hist is not None:
+        all_verifs = np.sort(list(set(all_verifs.data) & set(hist['time'].data)))
+        all_verifs = xr.DataArray(all_verifs, dims=['time'], coords=[all_verifs])
 
     # Construct list of `n` offset over all leads.
     n, freq = get_multiple_lead_cftime_shift_args(units, leads)
     init_lead_matrix = _construct_init_lead_matrix(forecast, n, freq, leads)
-    # Currently enforce a union between `inits` and observations in the verification
-    # data. This is because persistence forecasts with the verification data need to
-    # have the same initializations for consistency. This behavior should be changed
-    # as alternative reference forecasts are introduced.
-    init_lead_matrix = init_lead_matrix.where(union_with_verifs, drop=True)
+
+    # Persistence requires that `inits` be restricted to dates that also exist in the
+    # verification data (an intersection, via `isin`), since the persistence forecast
+    # is based on a common set of initializations.
+    if 'persistence' in reference:
+        union_with_verifs = all_inits.isin(all_verifs)
+        init_lead_matrix = init_lead_matrix.where(union_with_verifs, drop=True)
     valid_inits = init_lead_matrix['time']
 
     if 'same_init' in alignment:

diff --git a/climpred/bootstrap.py b/climpred/bootstrap.py
@@ -14,7 +14,8 @@
 from .comparisons import ALL_COMPARISONS, COMPARISON_ALIASES, HINDCAST_COMPARISONS
 from .exceptions import KeywordError
 from .metrics import ALL_METRICS, METRIC_ALIASES
-from .prediction import compute_hindcast, compute_perfect_model, compute_persistence
+from .prediction import compute_hindcast, compute_perfect_model
+from .reference import compute_persistence
 from .stats import dpp, varweighted_mean_period
 from .utils import (
     _transpose_and_rechunk_to,
@@ -718,6 +719,15 @@ def bootstrap_hindcast(
             'resample over initializations.'
         )
 
+    # Kludge for now. Since we're computing persistence here, we need to restrict all
+    # products to the dates they share (the intersection of their time axes).
+    times = np.sort(
+        list(set(hind.init.data) & set(hist.time.data) & set(verif.time.data))
+    )
+    hind = hind.sel(init=times)
+    hist = hist.sel(time=times)
+    verif = verif.sel(time=times)
+
     return bootstrap_compute(
         hind,
         verif,