From b7c4ae604a188d3cfe32a80e585c050853344a14 Mon Sep 17 00:00:00 2001
From: khider <11758571+khider@users.noreply.github.com>
Date: Tue, 30 Jul 2024 10:31:47 -0700
Subject: [PATCH] filter by time

---
 pylipd/globals/queries.py | 16 +++++++++
 pylipd/lipd.py            | 75 ++++++++++++++++++++++++++++++++++++++-
 pylipd/tests/test_LiPD.py | 12 +++++++
 3 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/pylipd/globals/queries.py b/pylipd/globals/queries.py
index 7a99736..982de89 100644
--- a/pylipd/globals/queries.py
+++ b/pylipd/globals/queries.py
@@ -259,6 +259,22 @@
     }
 """
 
+# One row per dataset and paleo-table variable whose name contains "year" or "age": its min/max value.
+QUERY_FILTER_TIME = """
+    SELECT DISTINCT ?dsname ?minage ?maxage WHERE {
+        ?ds a le:Dataset .
+        ?ds le:hasName ?dsname .
+
+        ?ds le:hasPaleoData ?data .
+        ?data le:hasMeasurementTable ?table .
+        ?table le:hasVariable ?timevar .
+        ?timevar le:hasName ?time_variableName .
+        FILTER (regex(?time_variableName, "year.*") || regex(?time_variableName, "age.*")) .
+        ?timevar le:hasMinValue ?minage .
+        ?timevar le:hasMaxValue ?maxage .
+}
+"""
+
 QUERY_FILTER_VARIABLE_NAME = """
     SELECT ?uri ?dsuri ?dsname ?tableuri ?id ?name WHERE {
         ?uri le:hasVariableId ?id .
diff --git a/pylipd/lipd.py b/pylipd/lipd.py
index c833776..c00555b 100644
--- a/pylipd/lipd.py
+++ b/pylipd/lipd.py
@@ -20,7 +20,7 @@
 from pylipd.utils.json_to_rdf import JSONToRDF
 from pylipd.utils.rdf_to_json import RDFToJSON
 
-from .globals.queries import QUERY_BIBLIO, QUERY_DSID, QUERY_DSNAME, QUERY_ENSEMBLE_TABLE, QUERY_ENSEMBLE_TABLE_SHORT, QUERY_FILTER_ARCHIVE_TYPE, QUERY_FILTER_GEO, QUERY_VARIABLE, QUERY_VARIABLE_GRAPH, QUERY_UNIQUE_ARCHIVE_TYPE, QUERY_TIMESERIES_ESSENTIALS_CHRON, QUERY_TIMESERIES_ESSENTIALS_PALEO, QUERY_DISTINCT_VARIABLE, QUERY_DATASET_PROPERTIES, QUERY_VARIABLE_PROPERTIES, QUERY_MODEL_PROPERTIES, QUERY_LOCATION
+from .globals.queries import QUERY_FILTER_TIME, QUERY_BIBLIO, QUERY_DSID, QUERY_DSNAME, QUERY_ENSEMBLE_TABLE, QUERY_ENSEMBLE_TABLE_SHORT, QUERY_FILTER_ARCHIVE_TYPE, QUERY_FILTER_GEO, QUERY_VARIABLE, QUERY_VARIABLE_GRAPH, QUERY_UNIQUE_ARCHIVE_TYPE, QUERY_TIMESERIES_ESSENTIALS_CHRON, QUERY_TIMESERIES_ESSENTIALS_PALEO, QUERY_DISTINCT_VARIABLE, QUERY_DATASET_PROPERTIES, QUERY_VARIABLE_PROPERTIES, QUERY_MODEL_PROPERTIES, QUERY_LOCATION
 from .lipd_series import LiPDSeries
 from .utils.multi_processing import multi_convert_to_rdf, multi_load_lipd
 from .utils.rdf_graph import RDFGraph
@@ -1158,6 +1158,79 @@ def filter_by_archive_type(self, archiveType):
         dsnames = [sanitizeId(row.dsname) for row in qres]
         return self.get(dsnames)
 
+    def filter_by_time(self, timeBound, timeBoundType='any', recordLength=None):
+        """
+        Filter the datasets according to a time interval and, optionally, a minimum record length.
+
+        Note that this function assumes that all records use the same time representation.
+        If you are unsure about the time representation, you may need to use `.get_timeseries_essentials`.
+
+        Parameters
+        ----------
+        timeBound : list
+            Minimum and maximum time values to search for, in either order.
+        timeBoundType : str, optional
+            The type of querying to perform (case-insensitive). Possible values are:
+
+            - any: the record overlaps any portion of the interval (default)
+            - entirely: the record covers the whole interval
+            - entire: the whole record lies within the interval (it can be shorter than the interval)
+
+        recordLength : float, optional
+            Minimum length required: for "any", of the part of the record inside the interval;
+            for "entire" and "entirely", of the record itself. The default is None (no constraint).
+
+        Raises
+        ------
+        ValueError
+            If timeBound does not hold exactly two values or timeBoundType is not "any", "entire" or "entirely".
+
+        Returns
+        -------
+        pylipd.lipd.LiPD
+            A new LiPD object that only contains the datasets matching the criteria
+
+        Examples
+        --------
+        PyLiPD ships with existing datasets that can be loaded directly through the package. Let's load the Pages2k sample datasets using this method.
+
+        .. jupyter-execute::
+
+            from pylipd.utils.dataset import load_dir
+
+            lipd = load_dir('Pages2k')
+            Lfiltered = lipd.filter_by_time(timeBound=[0,1800])
+            Lfiltered.get_all_dataset_names()
+
+        """
+        if timeBound is None or len(timeBound) != 2:
+            raise ValueError("timeBound must contain exactly two values: [min, max]")
+        lo, hi = sorted(timeBound)
+        timeBoundType = timeBoundType.lower()
+        if timeBoundType not in ('any', 'entirely', 'entire'):
+            raise ValueError("timeBoundType must be in ['any', 'entirely','entire']")
+
+        __, df = self.query(QUERY_FILTER_TIME)
+        minage, maxage = df['minage'], df['maxage']
+        # Length checked against recordLength: the record's own span, except for 'any' (see below).
+        span = (maxage - minage).abs()
+        if timeBoundType == 'entirely':
+            keep = (minage <= lo) & (maxage >= hi)
+        elif timeBoundType == 'entire':
+            keep = (minage >= lo) & (maxage <= hi)
+        else:
+            # Two intervals overlap only when each one starts before the other ends.
+            keep = (minage <= hi) & (maxage >= lo)
+            span = maxage.clip(upper=hi) - minage.clip(lower=lo)
+
+        if recordLength is not None:
+            keep = keep & (span >= recordLength)
+
+        # Like the other filters: sanitize the names; list each dataset only once.
+        dsnames = list(dict.fromkeys(sanitizeId(name) for name in df.loc[keep, 'dsname']))
+        return self.get(dsnames)
+
+
     def get_datasets(self) -> 'list[Dataset]':
         '''
         Return datasets as instances of the Dataset class
diff --git a/pylipd/tests/test_LiPD.py b/pylipd/tests/test_LiPD.py
index 5de8d8e..a35b30a 100644
--- a/pylipd/tests/test_LiPD.py
+++ b/pylipd/tests/test_LiPD.py
@@ -98,6 +98,18 @@ def test_archive_to(self,pages2k):
         Lfiltered = D.filter_by_archive_type('marine sediment')
         assert len(Lfiltered.get_all_archiveTypes())==1
         assert Lfiltered.get_all_archiveTypes()[0] == 'Marine sediment'
+
+    @pytest.mark.parametrize(('timeBoundType', 'recordLength'),
+                             [('any', None),
+                              ('any', 500),
+                              ('entire', None),
+                              ('entire', 20),
+                              ('entirely', None),
+                              ('entirely', 100)])
+    def test_time_t0(self, timeBoundType, recordLength, pages2k):
+        Lfiltered = pages2k.filter_by_time(timeBound=[0, 1800], timeBoundType=timeBoundType,
+                                           recordLength=recordLength)
+        assert set(Lfiltered.get_all_dataset_names()) <= set(pages2k.get_all_dataset_names())
 
 
 class TestGet():