From a45d2bd33db9ce6ca5d4894dd2f9ff706aed7fd6 Mon Sep 17 00:00:00 2001
From: Jacob Mims <122570226+jtmims@users.noreply.github.com>
Date: Thu, 16 Jan 2025 08:46:18 -0600
Subject: [PATCH] add function to normalize all time coord units (#732)

* add function to normalize all time coord units

* add logic for all time unit types

* re-add whitespace

* typo
---
 src/preprocessor.py | 82 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/src/preprocessor.py b/src/preprocessor.py
index 629826157..62ca930d2 100644
--- a/src/preprocessor.py
+++ b/src/preprocessor.py
@@ -886,7 +886,11 @@ def crop_date_range(self, case_date_range: util.DateRange, xr_ds, time_coord) ->
                              decode_times=True,
                              use_cftime=True  # use cftime instead of np.datetime6
         )
-        cal = xr_ds[time_coord.name].attrs.get('calendar', 'noleap')
+        cal = 'noleap'
+        if 'calendar' in xr_ds[time_coord.name].attrs:
+            cal = xr_ds[time_coord.name].attrs['calendar']
+        elif 'calendar' in xr_ds[time_coord.name].encoding:
+            cal = xr_ds[time_coord.name].encoding['calendar']
 
         ds_date_time = xr_ds[time_coord.name].values
         ds_start_time = ds_date_time[0]
@@ -1012,6 +1016,81 @@ def check_group_daterange(self, df: pd.DataFrame, date_range: util.DateRange,
         # hit an exception; return empty DataFrame to signify failure
         return pd.DataFrame(columns=group_df.columns)
 
+    def normalize_time_units(self, subset_dict: dict, time_coord, log=_log) -> dict:
+        """ 
+        Some datasets will have the time units that are different in each individual file.
+        This function updates each time unit to rely on the earliest year grabbed in the
+        query stage.
+
+        This function assumes the time coord units attr will be of the form "{unit} since ????".
+        """
+
+        time_units = np.sort([subset_dict[f].time.units for f in list(subset_dict)])
+        tn = time_coord.name #abbreviate        
+
+        # assumes each dataset has the same calendar
+        cal = 'noleap'
+        if 'calendar' in subset_dict[list(subset_dict)[0]][tn].attrs:
+            cal = subset_dict[list(subset_dict)[0]][tn].attrs['calendar']
+        elif 'calendar' in subset_dict[list(subset_dict)[0]][tn].encoding:
+            cal = subset_dict[list(subset_dict)[0]][tn].encoding['calendar'] 
+ 
+        if len(set(time_units)) > 1: # check if each dataset has the different time coord units
+            # check if time coord units are in the form "{unit} since {date}"
+            # they can be different units as this function converts to the earliest case
+            if all(["since" in u for u in time_units]):
+                start_unit = time_units[0].split(" ")[0] 
+                start_str = " ".join(time_units[0].split(" ")[2:])
+                start_cft = dl.str_to_cftime(
+                                start_str.replace(" ","").replace(":", "").replace("-", ""),
+                                calendar=cal
+                            )
+                new_unit_str = f"{start_unit} since {start_str}"
+
+                # dictionary of how many seconds are in each time unit
+                seconds_in = {
+                    "seconds": 1.0,
+                    "minutes": 60.0,
+                    "hours": 3600.0,
+                    "days": 86400.0,
+                    "weeks": 604800.0, # these are rarer and vague cases (they could be problematic)
+                    "months": 2628000.0, # seconds in common year (365 days) / 12
+                    "years": 31536000.0 # common year (365 days)
+                }
+
+
+                for f in list(subset_dict):
+                    current_unit = subset_dict[f][time_coord.name].units.split(" ")[0].lower()
+                    current_str = " ".join(subset_dict[f][tn].units.split(" ")[2:])
+                    current_cft = dl.str_to_cftime(
+                                  current_str.replace(" ","").replace(":", "").replace("-", ""),
+                                  calendar=cal
+                                  )
+
+                    #TODO: add logic to add year values for different calendars
+        
+                    if current_cft > start_cft:
+                        # get difference between current files unit reference point and earliest found
+                        diff = ((current_cft-start_cft).total_seconds())/seconds_in[start_unit] 
+
+                        subset_dict[f].coords['time'] = subset_dict[f][tn].assign_attrs(
+                            units=new_unit_str
+                        )
+                        
+                        # convert current unit if it is not the same as the earliest reference
+                        if current_unit != start_unit:
+                             factor = seconds_in[current_unit]/seconds_in[start_unit]
+                        else:
+                            factor = 1.0
+                        
+                        # change the values in the dataset
+                        for i, v in enumerate(subset_dict[f][tn].values):
+                            subset_dict[f].coords[tn].values[i] = factor*v + diff
+            else:
+                raise AttributeError("Different units were found for time coord in each file. "
+                      "We were unable to normalize due to the units not being in '{unit} since ' format")
+        
+        return subset_dict
 
     def query_catalog(self,
                       case_dict: dict,
@@ -1124,6 +1203,7 @@ def query_catalog(self,
                 # tl;dr hic sunt dracones
                 var_xr = []
                 if not var.is_static:
+                    cat_subset_dict = self.normalize_time_units(cat_subset_dict, var.T)
                     time_sort_dict = {f: cat_subset_dict[f].time.values[0]
                                       for f in list(cat_subset_dict)}
                     time_sort_dict = dict(sorted(time_sort_dict.items(), key=lambda item: item[1]))