Skip to content

Commit

Permalink
filter by time
Browse files Browse the repository at this point in the history
  • Loading branch information
khider committed Jul 30, 2024
1 parent bce5fa5 commit b7c4ae6
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 1 deletion.
16 changes: 16 additions & 0 deletions pylipd/globals/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,22 @@
}
"""

# SPARQL query listing, for each dataset, the min/max values of every
# paleo-data measurement-table variable whose name matches the "year" or
# "age" patterns below. DISTINCT keeps one row per (dataset, min, max)
# combination; no pattern binds variables that the SELECT does not need, so
# rows are not multiplied by the number of variables in a table.
QUERY_FILTER_TIME = """
    SELECT DISTINCT ?dsname ?minage ?maxage WHERE {
        ?ds a le:Dataset .
        ?ds le:hasName ?dsname .
        ?ds le:hasPaleoData ?data .
        ?data le:hasMeasurementTable ?table .
        ?table le:hasVariable ?timevar .
        ?timevar le:hasName ?time_variableName .
        FILTER (regex(?time_variableName, "year.*") || regex(?time_variableName, "age.*")) .
        ?timevar le:hasMinValue ?minage .
        ?timevar le:hasMaxValue ?maxage .
    }
"""

QUERY_FILTER_VARIABLE_NAME = """
SELECT ?uri ?dsuri ?dsname ?tableuri ?id ?name WHERE {
?uri le:hasVariableId ?id .
Expand Down
75 changes: 74 additions & 1 deletion pylipd/lipd.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from pylipd.utils.json_to_rdf import JSONToRDF
from pylipd.utils.rdf_to_json import RDFToJSON

from .globals.queries import QUERY_BIBLIO, QUERY_DSID, QUERY_DSNAME, QUERY_ENSEMBLE_TABLE, QUERY_ENSEMBLE_TABLE_SHORT, QUERY_FILTER_ARCHIVE_TYPE, QUERY_FILTER_GEO, QUERY_VARIABLE, QUERY_VARIABLE_GRAPH, QUERY_UNIQUE_ARCHIVE_TYPE, QUERY_TIMESERIES_ESSENTIALS_CHRON, QUERY_TIMESERIES_ESSENTIALS_PALEO, QUERY_DISTINCT_VARIABLE, QUERY_DATASET_PROPERTIES, QUERY_VARIABLE_PROPERTIES, QUERY_MODEL_PROPERTIES, QUERY_LOCATION
from .globals.queries import QUERY_FILTER_TIME, QUERY_BIBLIO, QUERY_DSID, QUERY_DSNAME, QUERY_ENSEMBLE_TABLE, QUERY_ENSEMBLE_TABLE_SHORT, QUERY_FILTER_ARCHIVE_TYPE, QUERY_FILTER_GEO, QUERY_VARIABLE, QUERY_VARIABLE_GRAPH, QUERY_UNIQUE_ARCHIVE_TYPE, QUERY_TIMESERIES_ESSENTIALS_CHRON, QUERY_TIMESERIES_ESSENTIALS_PALEO, QUERY_DISTINCT_VARIABLE, QUERY_DATASET_PROPERTIES, QUERY_VARIABLE_PROPERTIES, QUERY_MODEL_PROPERTIES, QUERY_LOCATION
from .lipd_series import LiPDSeries
from .utils.multi_processing import multi_convert_to_rdf, multi_load_lipd
from .utils.rdf_graph import RDFGraph
Expand Down Expand Up @@ -1158,6 +1158,79 @@ def filter_by_archive_type(self, archiveType):
dsnames = [sanitizeId(row.dsname) for row in qres]
return self.get(dsnames)

def filter_by_time(self, timeBound, timeBoundType='any', recordLength=None):
    """
    Filter the records according to a specified time interval and the length of the record within that interval.

    Note that this function assumes that all records use the same time representation.
    If you are unsure about the time representation, you may need to use `.get_timeseries_essentials`.

    Parameters
    ----------
    timeBound : list
        Minimum and maximum age value to search for. If the two values are
        given in decreasing order they are swapped.

    timeBoundType : str, optional
        The type of querying to perform (case-insensitive). Possible values include: "any", "entire", and "entirely".

        - any: overlap any portion of matching datasets (default)
        - entirely: the bounds are entirely overlapped by matching datasets
        - entire: the entire matching dataset lies within the bounds (the dataset can be shorter than the bounds)

        The default is 'any'.

    recordLength : float, optional
        The minimum length the record needs to have while matching the timeBound criteria.
        For "any", this is the length of the portion of the record that falls inside the bounds;
        for "entire" and "entirely", it is the full length of the record.
        The default is None (no length requirement).

    Raises
    ------
    ValueError
        If timeBound does not contain exactly two values, or if
        timeBoundType is not one of "any", "entire", or "entirely".

    Returns
    -------
    pylipd.lipd.LiPD
        A new LiPD object that only contains datasets that have the specified time interval

    Examples
    --------
    pyLipd ships with existing datasets that can be loaded directly through the package. Let's load the Pages2k sample datasets using this method.

    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir('Pages2k')
        Lfiltered = lipd.filter_by_time(timeBound=[0,1800])
        Lfiltered.get_all_dataset_names()
    """
    # Validate the arguments before running the (potentially expensive) query.
    if timeBound is None or len(timeBound) != 2:
        raise ValueError("timeBound must contain exactly two values: [minimum, maximum]")
    lower, upper = min(timeBound), max(timeBound)

    timeBoundType = timeBoundType.lower()
    if timeBoundType not in ('any', 'entirely', 'entire'):
        raise ValueError("timeBoundType must be in ['any', 'entirely','entire']")

    __, df = self.query(QUERY_FILTER_TIME)
    min_age, max_age = df['minage'], df['maxage']

    if timeBoundType == 'entirely':
        # The record spans the whole requested interval.
        selected = (min_age <= lower) & (max_age >= upper)
        covered = np.abs(max_age - min_age)
    elif timeBoundType == 'entire':
        # The whole record lies inside the requested interval.
        selected = (min_age >= lower) & (max_age <= upper)
        covered = np.abs(max_age - min_age)
    else:
        # 'any': two intervals overlap only when each one starts before the
        # other ends, so both ends of the record have to be checked.
        selected = (min_age <= upper) & (max_age >= lower)
        # Length of the part of the record that falls inside the bounds.
        covered = np.minimum(max_age, upper) - np.maximum(min_age, lower)

    if recordLength is not None:
        selected = selected & (covered >= recordLength)

    # A dataset can contribute several rows (one per matching time variable);
    # keep each name once, in first-seen order.
    dsnames = list(dict.fromkeys(df[selected]['dsname']))
    return self.get(dsnames)


def get_datasets(self) -> 'list[Dataset]':
'''
Return datasets as instances of the Dataset class
Expand Down
12 changes: 12 additions & 0 deletions pylipd/tests/test_LiPD.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,18 @@ def test_archive_to(self,pages2k):
Lfiltered = D.filter_by_archive_type('marine sediment')
assert len(Lfiltered.get_all_archiveTypes())==1
assert Lfiltered.get_all_archiveTypes()[0] == 'Marine sediment'

@pytest.mark.parametrize(('timeBoundType', 'recordLength'),
                         [('any', None),
                          ('any', 500),
                          ('entire', None),
                          ('entire', 20),
                          ('entirely', None),
                          ('entirely', 100)
                          ])
def test_time_t0(self, timeBoundType, recordLength, pages2k):
    """Check filter_by_time for every bound type, with and without a record length."""
    D = pages2k
    Lfiltered = D.filter_by_time(timeBound=[0, 1800], timeBoundType=timeBoundType, recordLength=recordLength)
    # Filtering can only narrow the collection: every dataset that is
    # returned must already exist in the source object.
    assert set(Lfiltered.get_all_dataset_names()) <= set(D.get_all_dataset_names())


class TestGet():
Expand Down

0 comments on commit b7c4ae6

Please sign in to comment.