Skip to content

Commit

Permalink
Robustify L0C
Browse files Browse the repository at this point in the history
  • Loading branch information
ghiggi committed Dec 20, 2024
1 parent 62e96ed commit 51c8df4
Show file tree
Hide file tree
Showing 38 changed files with 1,740 additions and 673 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ firmware_version: ""
sensor_beam_length: ""
sensor_beam_width: ""
sensor_nominal_width: ""
measurement_interval: ""
measurement_interval: 30
calibration_sensitivity: ""
calibration_certification_date: ""
calibration_certification_url: ""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ firmware_version: ""
sensor_beam_length: ""
sensor_beam_width: ""
sensor_nominal_width: ""
measurement_interval: ""
measurement_interval: 30
calibration_sensitivity: ""
calibration_certification_date: ""
calibration_certification_url: ""
Expand Down
154 changes: 151 additions & 3 deletions disdrodb/api/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,14 @@
"""Retrieve file information from DISDRODB products file names and filepaths."""

import os
from collections import defaultdict
from pathlib import Path

import numpy as np
from trollsift import Parser

from disdrodb.utils.time import acronym_to_seconds

####---------------------------------------------------------------------------
########################
#### FNAME PATTERNS ####
Expand All @@ -32,7 +35,7 @@
"{product:s}.{campaign_name:s}.{station_name:s}.s{start_time:%Y%m%d%H%M%S}.e{end_time:%Y%m%d%H%M%S}"
".{version:s}.{data_format:s}"
)
DISDRODB_FNAME_L2E_PATTERN = (
DISDRODB_FNAME_L2E_PATTERN = ( # also L0C and L1 --> accumulation_acronym = sample_interval
"{product:s}.{accumulation_acronym}.{campaign_name:s}.{station_name:s}.s{start_time:%Y%m%d%H%M%S}.e{end_time:%Y%m%d%H%M%S}"
".{version:s}.{data_format:s}"
)
Expand All @@ -50,10 +53,10 @@

def _parse_filename(filename):
"""Parse the filename with trollsift."""
if filename.startswith("L0") or filename.startswith("L1"):
if filename.startswith("L0A") or filename.startswith("L0B"):
p = Parser(DISDRODB_FNAME_L0_PATTERN)
info_dict = p.parse(filename)
elif filename.startswith("L2E"):
elif filename.startswith("L2E") or filename.startswith("L1") or filename.startswith("L0C"):
p = Parser(DISDRODB_FNAME_L2E_PATTERN)
info_dict = p.parse(filename)
elif filename.startswith("L2M"):
Expand All @@ -71,6 +74,11 @@ def _get_info_from_filename(filename):
info_dict = _parse_filename(filename)
except ValueError:
raise ValueError(f"{filename} can not be parsed. Report the issue.")

# Add additional information to info dictionary
if "accumulation_acronym" in info_dict:
info_dict["sample_interval"] = acronym_to_seconds(info_dict["accumulation_acronym"])

# Return info dictionary
return info_dict

Expand Down Expand Up @@ -152,6 +160,13 @@ def get_start_end_time_from_filepaths(filepaths):
return np.array(list_start_time).astype("M8[s]"), np.array(list_end_time).astype("M8[s]")


def get_sample_interval_from_filepaths(filepaths):
    """Return the sample interval of the specified files.

    The ``accumulation_acronym`` of the files is retrieved with
    ``get_key_from_filepaths`` and each acronym is converted with
    ``acronym_to_seconds``.

    Parameters
    ----------
    filepaths
        Filepaths, passed unchanged to ``get_key_from_filepaths``.

    Returns
    -------
    list
        One ``acronym_to_seconds`` result per retrieved accumulation acronym.
    """
    acronyms = get_key_from_filepaths(filepaths, key="accumulation_acronym")
    return list(map(acronym_to_seconds, acronyms))


####--------------------------------------------------------------------------.
###################################
#### DISDRODB Tree Components ####
Expand Down Expand Up @@ -316,3 +331,136 @@ def infer_data_source_from_path(path: str) -> str:


####--------------------------------------------------------------------------.
#######################
#### Group utility ####
#######################


# Group keys whose value is looked up in the file info dictionary
# (see ``_get_groups_value``). Together with ``TIME_KEYS`` they form the
# set of keys accepted by ``check_groups``.
FILE_KEYS = [
"product",
"subproduct",
"campaign_name",
"station_name",
"start_time",
"end_time",
"data_format",
"accumulation_acronym",
"sample_interval",
]


# Group keys whose value is derived from the file ``start_time`` through
# ``get_time_component`` (see ``_get_groups_value``).
TIME_KEYS = [
"year",
"month",
"month_name",
"quarter",
"season",
"day",
"doy",
"dow",
"hour",
"minute",
"second",
]


def check_groups(groups):
    """Check groups validity.

    Parameters
    ----------
    groups : list or str
        The group key(s) to validate. A single key can be given as a string.

    Returns
    -------
    list
        The group keys as a new list (a single string is wrapped in a list).

    Raises
    ------
    TypeError
        If ``groups`` is neither a list nor a string.
    ValueError
        If any key is not listed in ``FILE_KEYS`` or ``TIME_KEYS``.
    """
    if not isinstance(groups, (str, list)):
        raise TypeError("'groups' must be a list (or a string if a single group is specified).")
    if isinstance(groups, str):
        groups = [groups]
    valid_keys = FILE_KEYS + TIME_KEYS
    # Plain membership test: no dtype coercion of the keys, and the invalid
    # keys are reported as a regular list, like the valid ones.
    invalid_keys = [key for key in groups if key not in valid_keys]
    if invalid_keys:
        raise ValueError(f"The following group keys are invalid: {invalid_keys}. Valid values are {valid_keys}.")
    return list(groups)


def get_season(time):
    """Return the meteorological season acronym of a date.

    Parameters
    ----------
    time : datetime.datetime or datetime.date
        Object exposing a ``month`` attribute.

    Returns
    -------
    str
        ``"DJF"`` (December-February), ``"MAM"`` (March-May),
        ``"JJA"`` (June-August) or ``"SON"`` (any other month).
    """
    season_by_month = {
        12: "DJF",
        1: "DJF",
        2: "DJF",
        3: "MAM",
        4: "MAM",
        5: "MAM",
        6: "JJA",
        7: "JJA",
        8: "JJA",
    }
    # Months not listed above (September, October, November) fall back to autumn.
    return season_by_month.get(time.month, "SON")


def get_time_component(time, component):
    """Return the requested time component of a ``datetime.datetime`` as a string.

    Parameters
    ----------
    time : datetime.datetime
        The time from which to extract the component.
    component : str
        One of ``year``, ``month``, ``day``, ``doy`` (day of year),
        ``dow`` (day of week, 0=Monday, 6=Sunday), ``hour``, ``minute``,
        ``second``, ``month_name`` (full month name), ``quarter`` (1-4)
        or ``season`` (DJF, MAM, JJA, SON).

    Returns
    -------
    str
        The string representation of the component value.

    Raises
    ------
    KeyError
        If ``component`` is not one of the supported names.
    """
    if component in ("year", "month", "day", "hour", "minute", "second"):
        # These components are plain attributes of the datetime object
        value = getattr(time, component)
    elif component == "doy":
        value = time.timetuple().tm_yday
    elif component == "dow":
        value = time.weekday()
    elif component == "month_name":
        value = time.strftime("%B")
    elif component == "quarter":
        value = (time.month - 1) // 3 + 1
    elif component == "season":
        value = get_season(time)
    else:
        raise KeyError(component)
    return str(value)


def _get_groups_value(groups, filepath):
    """Return the value associated to the groups keys.

    Keys listed in ``TIME_KEYS`` are computed from the file ``start_time``
    with ``get_time_component``. Any other key is looked up in the info
    dictionary returned by ``get_info_from_filepath``; a missing key yields
    the placeholder string ``"<key>=None"``.

    If multiple keys are specified, the value returned is a string of format:
    ``<group_value_1>/<group_value_2>/...``

    If a single key is specified, its value is returned as is (not joined).
    For ``start_time`` or ``end_time`` this is the object stored in the info
    dictionary (a :py:class:`datetime.datetime` object).
    """
    only_one_key = len(groups) == 1
    info = get_info_from_filepath(filepath)
    reference_time = info["start_time"]

    def _value_of(key):
        # Time components come from the start time, everything else from the file info
        if key in TIME_KEYS:
            return get_time_component(reference_time, component=key)
        return info.get(key, f"{key}=None")

    values = [_value_of(key) for key in groups]
    if only_one_key:
        return values[0]
    return "/".join(map(str, values))


def group_filepaths(filepaths, groups=None):
    """
    Group filepaths in a dictionary if groups are specified.

    Parameters
    ----------
    filepaths : list
        List of filepaths.
    groups: list or str
        The group keys by which to group the filepaths.
        Valid group keys are ``product``, ``subproduct``, ``campaign_name``, ``station_name``,
        ``start_time``, ``end_time``, ``accumulation_acronym``, ``sample_interval``,
        ``data_format``,
        ``year``, ``month``, ``day``, ``doy``, ``dow``, ``hour``, ``minute``, ``second``,
        ``month_name``, ``quarter``, ``season``.
        The time components are extracted from ``start_time`` !
        If groups is ``None`` returns the input filepaths list.
        The default is ``None``.

    Returns
    -------
    dict or list
        Either a dictionary of format ``{<group_value>: <list_filepaths>}``
        or the original input filepaths (if ``groups=None``).
    """
    if groups is None:
        return filepaths
    groups = check_groups(groups)
    filepaths_dict = defaultdict(list)
    # Explicit loop: the appends are side effects, so no throwaway list is built
    for filepath in filepaths:
        filepaths_dict[_get_groups_value(groups, filepath)].append(filepath)
    # Return a plain dict so that missing keys raise KeyError for the caller
    return dict(filepaths_dict)
5 changes: 3 additions & 2 deletions disdrodb/api/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ def _get_list_stations_with_data(product, campaign_dir):
# Get stations directory
list_stations_dir = _get_list_stations_dirs(product=product, campaign_dir=campaign_dir)
# Count number of files within directory
# - TODO: checking for the presence of a single file would be enough here (no need to count them all)
list_nfiles_per_station = [count_files(station_dir, "*", recursive=True) for station_dir in list_stations_dir]
# Keep only stations with at least one file
stations_names = [os.path.basename(path) for n, path in zip(list_nfiles_per_station, list_stations_dir) if n >= 1]
Expand All @@ -181,7 +182,6 @@ def _get_campaign_stations(base_dir, product, data_source, campaign_name):
data_source=data_source,
campaign_name=campaign_name,
)

# Get list of stations with data and metadata
list_stations_data = _get_list_stations_with_data(product=product, campaign_dir=campaign_dir)
list_stations_metadata = _get_list_stations_with_metadata(campaign_dir)
Expand Down Expand Up @@ -387,7 +387,7 @@ def available_stations(
raise_error_if_empty=False,
base_dir=None,
):
"""Return stations for which data are available on disk.
"""Return stations for which data and metadata are available on disk.
Raise an error if no stations are available.
"""
Expand All @@ -410,6 +410,7 @@ def available_stations(
# If data_sources is None, retrieve all stations
if data_sources is None:
list_info = _get_stations(base_dir=base_dir, product=product)

###-----------------------------------------------.
### Filter by data_sources
else:
Expand Down
Loading

0 comments on commit 51c8df4

Please sign in to comment.