Robustify L0C

ltelab · Dec 20, 2024 · 51c8df4 · 51c8df4
1 parent 62e96ed
commit 51c8df4
Show file tree

Hide file tree

Showing 38 changed files with 1,740 additions and 673 deletions.
diff --git a/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_1.yml b/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_1.yml
@@ -36,7 +36,7 @@ firmware_version: ""
 sensor_beam_length: ""
 sensor_beam_width: ""
 sensor_nominal_width: ""
-measurement_interval: ""
+measurement_interval: 30
 calibration_sensitivity: ""
 calibration_certification_date: ""
 calibration_certification_url: ""

diff --git a/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_2.yml b/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_2.yml
@@ -36,7 +36,7 @@ firmware_version: ""
 sensor_beam_length: ""
 sensor_beam_width: ""
 sensor_nominal_width: ""
-measurement_interval: ""
+measurement_interval: 30
 calibration_sensitivity: ""
 calibration_certification_date: ""
 calibration_certification_url: ""

diff --git a/disdrodb/api/info.py b/disdrodb/api/info.py
@@ -19,11 +19,14 @@
 """Retrieve file information from DISDRODB products file names and filepaths."""
 
 import os
+from collections import defaultdict
 from pathlib import Path
 
 import numpy as np
 from trollsift import Parser
 
+from disdrodb.utils.time import acronym_to_seconds
+
 ####---------------------------------------------------------------------------
 ########################
 #### FNAME PATTERNS ####
@@ -32,7 +35,7 @@
     "{product:s}.{campaign_name:s}.{station_name:s}.s{start_time:%Y%m%d%H%M%S}.e{end_time:%Y%m%d%H%M%S}"
     ".{version:s}.{data_format:s}"
 )
-DISDRODB_FNAME_L2E_PATTERN = (
+DISDRODB_FNAME_L2E_PATTERN = (  # also L0C and L1 --> accumulation_acronym = sample_interval
     "{product:s}.{accumulation_acronym}.{campaign_name:s}.{station_name:s}.s{start_time:%Y%m%d%H%M%S}.e{end_time:%Y%m%d%H%M%S}"
     ".{version:s}.{data_format:s}"
 )
@@ -50,10 +53,10 @@
 
 def _parse_filename(filename):
     """Parse the filename with trollsift."""
-    if filename.startswith("L0") or filename.startswith("L1"):
+    if filename.startswith("L0A") or filename.startswith("L0B"):
         p = Parser(DISDRODB_FNAME_L0_PATTERN)
         info_dict = p.parse(filename)
-    elif filename.startswith("L2E"):
+    elif filename.startswith("L2E") or filename.startswith("L1") or filename.startswith("L0C"):
         p = Parser(DISDRODB_FNAME_L2E_PATTERN)
         info_dict = p.parse(filename)
     elif filename.startswith("L2M"):
@@ -71,6 +74,11 @@ def _get_info_from_filename(filename):
         info_dict = _parse_filename(filename)
     except ValueError:
         raise ValueError(f"{filename} can not be parsed. Report the issue.")
+
+    # Add additional information to info dictionary
+    if "accumulation_acronym" in info_dict:
+        info_dict["sample_interval"] = acronym_to_seconds(info_dict["accumulation_acronym"])
+
     # Return info dictionary
     return info_dict
 
@@ -152,6 +160,13 @@ def get_start_end_time_from_filepaths(filepaths):
     return np.array(list_start_time).astype("M8[s]"), np.array(list_end_time).astype("M8[s]")
 
 
+def get_sample_interval_from_filepaths(filepaths):
+    """Return the sample interval of the specified files."""
+    list_accumulation_acronym = get_key_from_filepaths(filepaths, key="accumulation_acronym")
+    list_sample_interval = [acronym_to_seconds(s) for s in list_accumulation_acronym]
+    return list_sample_interval
+
+
 ####--------------------------------------------------------------------------.
 ###################################
 #### DISDRODB Tree Components  ####
@@ -316,3 +331,136 @@ def infer_data_source_from_path(path: str) -> str:
 
 
 ####--------------------------------------------------------------------------.
+#######################
+#### Group utility ####
+#######################
+
+
+FILE_KEYS = [
+    "product",
+    "subproduct",
+    "campaign_name",
+    "station_name",
+    "start_time",
+    "end_time",
+    "data_format",
+    "accumulation_acronym",
+    "sample_interval",
+]
+
+
+TIME_KEYS = [
+    "year",
+    "month",
+    "month_name",
+    "quarter",
+    "season",
+    "day",
+    "doy",
+    "dow",
+    "hour",
+    "minute",
+    "second",
+]
+
+
+def check_groups(groups):
+    """Check groups validity."""
+    if not isinstance(groups, (str, list)):
+        raise TypeError("'groups' must be a list (or a string if a single group is specified.")
+    if isinstance(groups, str):
+        groups = [groups]
+    groups = np.array(groups)
+    valid_keys = FILE_KEYS + TIME_KEYS
+    invalid_keys = groups[np.isin(groups, valid_keys, invert=True)]
+    if len(invalid_keys) > 0:
+        raise ValueError(f"The following group keys are invalid: {invalid_keys}. Valid values are {valid_keys}.")
+    return groups.tolist()
+
+
+def get_season(time):
+    """Get season from `datetime.datetime` or `datetime.date` object."""
+    month = time.month
+    if month in [12, 1, 2]:
+        return "DJF"  # Winter (December, January, February)
+    if month in [3, 4, 5]:
+        return "MAM"  # Spring (March, April, May)
+    if month in [6, 7, 8]:
+        return "JJA"  # Summer (June, July, August)
+    return "SON"  # Autumn (September, October, November)
+
+
+def get_time_component(time, component):
+    """Get time component from `datetime.datetime` object."""
+    func_dict = {
+        "year": lambda time: time.year,
+        "month": lambda time: time.month,
+        "day": lambda time: time.day,
+        "doy": lambda time: time.timetuple().tm_yday,  # Day of year
+        "dow": lambda time: time.weekday(),  # Day of week (0=Monday, 6=Sunday)
+        "hour": lambda time: time.hour,
+        "minute": lambda time: time.minute,
+        "second": lambda time: time.second,
+        # Additional
+        "month_name": lambda time: time.strftime("%B"),  # Full month name
+        "quarter": lambda time: (time.month - 1) // 3 + 1,  # Quarter (1-4)
+        "season": lambda time: get_season(time),  # Season (DJF, MAM, JJA, SON)
+    }
+    return str(func_dict[component](time))
+
+
+def _get_groups_value(groups, filepath):
+    """Return the value associated to the groups keys.
+
+    If multiple keys are specified, the value returned is a string of format: ``<group_value_1>/<group_value_2>/...``
+
+    If a single key is specified and is ``start_time`` or ``end_time``, the function
+    returns a :py:class:`datetime.datetime` object.
+    """
+    single_key = len(groups) == 1
+    info_dict = get_info_from_filepath(filepath)
+    start_time = info_dict["start_time"]
+    list_key_values = []
+    for key in groups:
+        if key in TIME_KEYS:
+            list_key_values.append(get_time_component(start_time, component=key))
+        else:
+            value = info_dict.get(key, f"{key}=None")
+            list_key_values.append(value if single_key else str(value))
+    if single_key:
+        return list_key_values[0]
+    return "/".join(list_key_values)
+
+
+def group_filepaths(filepaths, groups=None):
+    """
+    Group filepaths in a dictionary if groups are specified.
+
+    Parameters
+    ----------
+    filepaths : list
+        List of filepaths.
+    groups : list or str
+        The group keys by which to group the filepaths.
+        Valid group keys are ``product``, ``subproduct``, ``campaign_name``, ``station_name``,
+        ``start_time``, ``end_time``, ``accumulation_acronym``, ``sample_interval``,
+        ``data_format``,
+        ``year``, ``month``, ``day``, ``doy``, ``dow``, ``hour``, ``minute``, ``second``,
+        ``month_name``, ``quarter``, ``season``.
+        The time components are extracted from ``start_time``.
+        If ``groups`` is ``None``, the input filepaths list is returned.
+        The default is ``None``.
+
+    Returns
+    -------
+    dict or list
+        Either a dictionary of format ``{<group_value>: <list_filepaths>}``
+        or the original input filepaths (if ``groups=None``).
+
+    """
+    if groups is None:
+        return filepaths
+    groups = check_groups(groups)
+    filepaths_dict = defaultdict(list)
+    _ = [filepaths_dict[_get_groups_value(groups, filepath)].append(filepath) for filepath in filepaths]
+    return dict(filepaths_dict)
diff --git a/disdrodb/api/io.py b/disdrodb/api/io.py
@@ -157,6 +157,7 @@ def _get_list_stations_with_data(product, campaign_dir):
     # Get stations directory
     list_stations_dir = _get_list_stations_dirs(product=product, campaign_dir=campaign_dir)
     # Count number of files within directory
+    # - TODO: checking for the existence of a single file would suffice (no need to count all files)
     list_nfiles_per_station = [count_files(station_dir, "*", recursive=True) for station_dir in list_stations_dir]
     # Keep only stations with at least one file
     stations_names = [os.path.basename(path) for n, path in zip(list_nfiles_per_station, list_stations_dir) if n >= 1]
@@ -181,7 +182,6 @@ def _get_campaign_stations(base_dir, product, data_source, campaign_name):
         data_source=data_source,
         campaign_name=campaign_name,
     )
-
     # Get list of stations with data and metadata
     list_stations_data = _get_list_stations_with_data(product=product, campaign_dir=campaign_dir)
     list_stations_metadata = _get_list_stations_with_metadata(campaign_dir)
@@ -387,7 +387,7 @@ def available_stations(
     raise_error_if_empty=False,
     base_dir=None,
 ):
-    """Return stations for which data are available on disk.
+    """Return stations for which data and metadata are available on disk.
 
     Raise an error if no stations are available.
     """
@@ -410,6 +410,7 @@ def available_stations(
     # If data_source is None, retrieve all stations
     if data_sources is None:
         list_info = _get_stations(base_dir=base_dir, product=product)
+
     ###-----------------------------------------------.
     ### Filter by data_sources
     else: