Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Catalog strptime #272

Merged
merged 11 commits into from
Aug 13, 2019
57 changes: 39 additions & 18 deletions siphon/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,9 @@ class DatasetCollection(IndexableMapping):
default_regex = re.compile(r'(?P<year>\d{4})(?P<month>[01]\d)(?P<day>[0123]\d)_'
r'(?P<hour>[012]\d)(?P<minute>[0-5]\d)')

def _get_datasets_with_times(self, regex):
def _get_datasets_with_times(self, regex, strptime=None):
# Set the default regex if we don't have one
# If strptime is provided, pass the regex group named 'strptime' to strptime
if regex is None:
regex = self.default_regex
else:
Expand All @@ -59,19 +60,25 @@ def _get_datasets_with_times(self, regex):
if match:
found_date = True
date_parts = match.groupdict()
dt = datetime(int(date_parts.get('year', 0)), int(date_parts.get('month', 0)),
int(date_parts.get('day', 0)), int(date_parts.get('hour', 0)),
int(date_parts.get('minute', 0)),
int(date_parts.get('second', 0)),
int(date_parts.get('microsecond', 0)))
if strptime is not None:
date_str = date_parts.get('strptime', 0)
dt = datetime.strptime(date_str, strptime)
else:
dt = datetime(int(date_parts.get('year', 0)),
int(date_parts.get('month', 0)),
int(date_parts.get('day', 0)),
int(date_parts.get('hour', 0)),
int(date_parts.get('minute', 0)),
int(date_parts.get('second', 0)),
int(date_parts.get('microsecond', 0)))
yield dt, self[ds]

# If we never found any keys that match, we should let the user know that rather
# than have it be the same as if nothing matched filters
if not found_date:
raise ValueError('No datasets with times found.')

def filter_time_nearest(self, time, regex=None):
def filter_time_nearest(self, time, regex=None, strptime=None):
"""Filter keys for an item closest to the desired time.
dopplershift marked this conversation as resolved.
Show resolved Hide resolved

Loops over all keys in the collection and uses `regex` to extract and build
Expand All @@ -86,20 +93,27 @@ def filter_time_nearest(self, time, regex=None):
The desired time
regex : str, optional
The regular expression to use to extract date/time information from the key. If
given, this should contain named groups: 'year', 'month', 'day', 'hour', 'minute',
'second', and 'microsecond', as appropriate. When a match is found, any of those
groups missing from the pattern will be assigned a value of 0. The default pattern
looks for patterns like: 20171118_2356.
given, this should contain either
1. named groups: 'year', 'month', 'day', 'hour', 'minute', 'second',
and 'microsecond', as appropriate. When a match is found, any of those groups
missing from the pattern will be assigned a value of 0. The default pattern looks
for patterns like: 20171118_2356.
or
2. a group named 'strptime' (e.g., r'_s(?P<strptime>\d{13})' for GOES-16 data)
dopplershift marked this conversation as resolved.
Show resolved Hide resolved
to be parsed with strptime.
strptime : str, optional
The format string that corresponds to regex option (2) above. For example, GOES-16
data with a day-of-year ("Julian") date matching the regex above is parsed with
'%Y%j%H%M%S'.

Returns
-------
The value with a time closest to that desired

"""
return min(self._get_datasets_with_times(regex),
return min(self._get_datasets_with_times(regex, strptime),
key=lambda i: abs((i[0] - time).total_seconds()))[-1]

def filter_time_range(self, start, end, regex=None):
def filter_time_range(self, start, end, regex=None, strptime=None):
"""Filter keys for all items within the desired time range.
dopplershift marked this conversation as resolved.
Show resolved Hide resolved

Loops over all keys in the collection and uses `regex` to extract and build
Expand All @@ -115,17 +129,24 @@ def filter_time_range(self, start, end, regex=None):
The end of the desired time range, inclusive
regex : str, optional
The regular expression to use to extract date/time information from the key. If
given, this should contain named groups: 'year', 'month', 'day', 'hour', 'minute',
'second', and 'microsecond', as appropriate. When a match is found, any of those
groups missing from the pattern will be assigned a value of 0. The default pattern
looks for patterns like: 20171118_2356.
given, this should contain either
1. named groups: 'year', 'month', 'day', 'hour', 'minute', 'second',
and 'microsecond', as appropriate. When a match is found, any of those groups
missing from the pattern will be assigned a value of 0. The default pattern looks
for patterns like: 20171118_2356.
or
2. a group named 'strptime' (e.g., r'_s(?P<strptime>\d{13})' for GOES-16 data)
dopplershift marked this conversation as resolved.
Show resolved Hide resolved
to be parsed with strptime.
strptime : str, optional
The format string that corresponds to regex option (2) above. For example, GOES-16
data with a day-of-year ("Julian") date matching the regex above is parsed with
'%Y%j%H%M%S'.

Returns
-------
All values corresponding to times within the specified range

"""
return [item[-1] for item in self._get_datasets_with_times(regex)
return [item[-1] for item in self._get_datasets_with_times(regex, strptime)
if start <= item[0] <= end]

def __str__(self):
Expand Down
40 changes: 40 additions & 0 deletions siphon/tests/test_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,46 @@ def test_datasets_time_range():
'NAM_CONUS_20km_noaaport_20150529_0000.grib1']


@recorder.use_cassette('top_level_20km_rap_catalog')
def test_datasets_time_range_regex():
    """Check time-range filtering of catalog refs when the regex is passed explicitly."""
    # Same pattern as DatasetCollection.default_regex; supplying it by hand exercises
    # the caller-provided-regex path instead of the built-in default.
    pattern = (r'(?P<year>\d{4})(?P<month>[01]\d)(?P<day>[0123]\d)_'
               r'(?P<hour>[012]\d)(?P<minute>[0-5]\d)')
    window_start = datetime(2015, 5, 28, 0)
    window_end = datetime(2015, 5, 29, 0)

    catalog = TDSCatalog('http://thredds.ucar.edu/thredds/catalog/grib/NCEP/NAM/'
                         'CONUS_20km/noaaport/catalog.xml')
    matches = catalog.catalog_refs.filter_time_range(window_start, window_end,
                                                     regex=pattern)

    # Both end points of the window are expected in the result.
    expected = ['NAM_CONUS_20km_noaaport_20150528_0000.grib1',
                'NAM_CONUS_20km_noaaport_20150528_0600.grib1',
                'NAM_CONUS_20km_noaaport_20150528_1200.grib1',
                'NAM_CONUS_20km_noaaport_20150528_1800.grib1',
                'NAM_CONUS_20km_noaaport_20150529_0000.grib1']
    assert [ref.title for ref in matches] == expected


@recorder.use_cassette('top_level_20km_rap_catalog')
def test_datasets_time_range_strptime():
    """Check time-range filtering of catalog refs when dates are parsed via strptime."""
    # The regex isolates the timestamp in a group named 'strptime'; the format string
    # below describes how that captured text is turned into a datetime.
    pattern = r'noaaport_(?P<strptime>\d{8}_\d{4})'
    time_format = '%Y%m%d_%H%M'
    window_start = datetime(2015, 5, 28, 0)
    window_end = datetime(2015, 5, 29, 0)

    catalog = TDSCatalog('http://thredds.ucar.edu/thredds/catalog/grib/NCEP/NAM/'
                         'CONUS_20km/noaaport/catalog.xml')
    matches = catalog.catalog_refs.filter_time_range(window_start, window_end,
                                                     regex=pattern,
                                                     strptime=time_format)

    # Both end points of the window are expected in the result.
    expected = ['NAM_CONUS_20km_noaaport_20150528_0000.grib1',
                'NAM_CONUS_20km_noaaport_20150528_0600.grib1',
                'NAM_CONUS_20km_noaaport_20150528_1200.grib1',
                'NAM_CONUS_20km_noaaport_20150528_1800.grib1',
                'NAM_CONUS_20km_noaaport_20150529_0000.grib1']
    assert [ref.title for ref in matches] == expected


@recorder.use_cassette('top_level_20km_rap_catalog')
def test_datasets_time_range_raises():
"""Test getting datasets by time range using filenames."""
Expand Down