From 8c822570d77adbed08f1260ac07b7395441160bc Mon Sep 17 00:00:00 2001 From: khider <11758571+khider@users.noreply.github.com> Date: Mon, 29 Jul 2024 10:20:19 -0700 Subject: [PATCH 1/5] filter by resolution --- pylipd/globals/queries.py | 13 +++++++ pylipd/lipd_series.py | 60 ++++++++++++++++++++++++++++++--- pylipd/tests/test_LiPDSeries.py | 8 ++++- 3 files changed, 75 insertions(+), 6 deletions(-) diff --git a/pylipd/globals/queries.py b/pylipd/globals/queries.py index a609d3b..9196a49 100644 --- a/pylipd/globals/queries.py +++ b/pylipd/globals/queries.py @@ -282,6 +282,19 @@ } """ +QUERY_FILTER_VARIABLE_RESOLUTION = """ + SELECT ?uri ?dsuri ?dsname ?tableuri ?id ?v WHERE { + ?uri le:hasVariableId ?id . + ?uri le:hasResolution ?res . + ?res le:has[stat]Value ?v . + FILTER(?v<[value]) . + ?uri le:foundInDataset ?dsuri . + ?uri le:foundInDatasetName ?dsname . + ?uri le:foundInTable ?tableuri . + } +""" + + QUERY_TIMESERIES_ESSENTIALS_PALEO =""" PREFIX wgs84: SELECT ?dataSetName ?archiveType ?geo_meanLat ?geo_meanLon ?geo_meanElev diff --git a/pylipd/lipd_series.py b/pylipd/lipd_series.py index c7e6469..1791935 100644 --- a/pylipd/lipd_series.py +++ b/pylipd/lipd_series.py @@ -1,6 +1,5 @@ from tqdm import tqdm -from .globals.queries import QUERY_FILTER_VARIABLE_NAME, QUERY_VARIABLE, QUERY_DISTINCT_VARIABLE, QUERY_VARIABLE_ESSENTIALS, QUERY_DISTINCT_PROXY, QUERY_FILTER_VARIABLE_PROXY - +from .globals.queries import QUERY_FILTER_VARIABLE_NAME, QUERY_VARIABLE, QUERY_DISTINCT_VARIABLE, QUERY_VARIABLE_ESSENTIALS, QUERY_DISTINCT_PROXY, QUERY_FILTER_VARIABLE_PROXY, QUERY_FILTER_VARIABLE_RESOLUTION from .utils.multi_processing import multi_load_lipd_series from .utils.rdf_graph import RDFGraph @@ -254,10 +253,61 @@ def filter_by_proxy(self, proxy): varuris = [str(row.uri) for row in qres] dsuris = [*set([str(row.dsuri) for row in qres])] - #print(len(dsuris)) - rdfgraph = self.get(varuris) S = LiPDSeries(rdfgraph.graph) S.lipds = {k: self.lipds[k].copy() for k in 
dsuris} return S - \ No newline at end of file + + def filter_by_resolution(self, threshold, stats='Mean'): + ''' + Filters series to return a new LiPDSeries that only keeps variables that have a resolution less than the specified threshold. + + Parameters + ---------- + threshold : float + The maximum resolution to keep + stats : str, optional + Whether to use 'Mean', 'Median', 'Min' or 'Max' resolution. The default is 'Mean'. + + Raises + ------ + ValueError + If stats is not one of ['Mean', 'Median', 'Min', 'Max']. + + Returns + ------- + S : pylipd.lipd_series.LiPDSeries + A new LiPDSeries object that only contains the filtered variables + + Examples + -------- + + .. jupyter-execute:: + + from pylipd.utils.dataset import load_dir + lipd = load_dir('Pages2k') + S = lipd.to_lipd_series() + S_filtered = S.filter_by_resolution(10) + + ''' + + stats = stats.capitalize() #make sure that the first letter is capitalized + stats_allowed = ['Mean','Median', 'Min', 'Max'] #possible values + if stats not in stats_allowed: + raise ValueError("Stats must be ['Mean','Median', 'Min', 'Max']") + + threshold = float(threshold) # make sure this is a float or can be coerced into one + + query = QUERY_FILTER_VARIABLE_RESOLUTION + query = query.replace("[value]", str(threshold)) + query = query.replace("[stat]", stats) + + qres,q_df = self.query(query) + + varuris = [str(row.uri) for row in qres] + dsuris = [*set([str(row.dsuri) for row in qres])] + + rdfgraph = self.get(varuris) + S = LiPDSeries(rdfgraph.graph) + S.lipds = {k: self.lipds[k].copy() for k in dsuris} + return S \ No newline at end of file diff --git a/pylipd/tests/test_LiPDSeries.py b/pylipd/tests/test_LiPDSeries.py index 9e28d99..6bfdfb7 100644 --- a/pylipd/tests/test_LiPDSeries.py +++ b/pylipd/tests/test_LiPDSeries.py @@ -18,7 +18,7 @@ 4. after `pip install pytest-xdist`, one may execute "pytest -n 4" to test in parallel with number of workers specified by `-n` 5. 
for more details, see https://docs.pytest.org/en/stable/usage.html """ - +import pytest from pylipd.lipd_series import LiPDSeries class TestLoad(): @@ -64,6 +64,12 @@ def test_proxy_t0(self,pages2k): Sfiltered = S.filter_by_proxy('ring width') v = Sfiltered.get_all_proxy() assert len(v)==1 + + @pytest.mark.parametrize('stats',['Mean','Median','Min','Max']) + def test_resolution_t0(self,stats,pages2k): + D=pages2k + S = D.to_lipd_series() + Sfiltered = S.filter_by_resolution(threshold = 10,stats=stats) \ No newline at end of file From f0022a25e1532cf0fe77e7ee603ad9372d9e903e Mon Sep 17 00:00:00 2001 From: khider <11758571+khider@users.noreply.github.com> Date: Mon, 29 Jul 2024 10:38:09 -0700 Subject: [PATCH 2/5] get the properties attached to variable in LiPDSeries --- pylipd/globals/queries.py | 6 ++++++ pylipd/lipd_series.py | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/pylipd/globals/queries.py b/pylipd/globals/queries.py index 9196a49..7a99736 100644 --- a/pylipd/globals/queries.py +++ b/pylipd/globals/queries.py @@ -436,6 +436,12 @@ } """ +## At the LiPDSeries level + +QUERY_LiPDSERIES_PROPERTIES=""" + SELECT DISTINCT ?p WHERE { + ?uri ?p ?v .} + """ QUERY_DATASET_PROPERTIES=""" PREFIX le: diff --git a/pylipd/lipd_series.py b/pylipd/lipd_series.py index 1791935..8958800 100644 --- a/pylipd/lipd_series.py +++ b/pylipd/lipd_series.py @@ -1,5 +1,5 @@ from tqdm import tqdm -from .globals.queries import QUERY_FILTER_VARIABLE_NAME, QUERY_VARIABLE, QUERY_DISTINCT_VARIABLE, QUERY_VARIABLE_ESSENTIALS, QUERY_DISTINCT_PROXY, QUERY_FILTER_VARIABLE_PROXY, QUERY_FILTER_VARIABLE_RESOLUTION +from .globals.queries import QUERY_FILTER_VARIABLE_NAME, QUERY_VARIABLE, QUERY_DISTINCT_VARIABLE, QUERY_VARIABLE_ESSENTIALS, QUERY_DISTINCT_PROXY, QUERY_FILTER_VARIABLE_PROXY, QUERY_FILTER_VARIABLE_RESOLUTION, QUERY_LiPDSERIES_PROPERTIES from .utils.multi_processing import multi_load_lipd_series from .utils.rdf_graph import RDFGraph @@ 
-174,6 +174,37 @@ def get_timeseries_essentials(self): qres_df['values']=qres_df['values'].apply(lambda row : np.array(json.loads(row))) return qres_df + + def get_variable_properties(self): + """ + Get a list of all the property names associated with the variables in the series. Useful for writing custom queries + + Returns + ------- + clean_list : list + A list of unique variable properties + + Examples + -------- + + .. jupyter-execute:: + + from pylipd.utils.dataset import load_dir + + lipd = load_dir() + S = lipd.to_lipd_series() + l = S.get_variable_properties() + + print(l) + + + """ + + query_list = self.query(QUERY_LiPDSERIES_PROPERTIES)[1].iloc[:,0].values.tolist() + clean_list = [item.split("#")[-1] for item in query_list] + + return clean_list + def filter_by_name(self, name): ''' From bce5fa59047fa1d0c49055ef2d84fcdbe4662af7 Mon Sep 17 00:00:00 2001 From: khider <11758571+khider@users.noreply.github.com> Date: Mon, 29 Jul 2024 10:39:09 -0700 Subject: [PATCH 3/5] Update test_LiPDSeries.py --- pylipd/tests/test_LiPDSeries.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pylipd/tests/test_LiPDSeries.py b/pylipd/tests/test_LiPDSeries.py index 6bfdfb7..b0f7960 100644 --- a/pylipd/tests/test_LiPDSeries.py +++ b/pylipd/tests/test_LiPDSeries.py @@ -48,6 +48,11 @@ def test_proxy_t0(self, pages2k): D=pages2k S = D.to_lipd_series() names = S.get_all_proxy() + + def test_variable_t0(self,pages2k): + D=pages2k + S = D.to_lipd_series() + l = S.get_variable_properties() class TestFiler(): From b7c4ae604a188d3cfe32a80e585c050853344a14 Mon Sep 17 00:00:00 2001 From: khider <11758571+khider@users.noreply.github.com> Date: Tue, 30 Jul 2024 10:31:47 -0700 Subject: [PATCH 4/5] filter by time --- pylipd/globals/queries.py | 16 +++++++++ pylipd/lipd.py | 75 ++++++++++++++++++++++++++++++++++++++- pylipd/tests/test_LiPD.py | 12 +++++++ 3 files changed, 102 insertions(+), 1 deletion(-) diff --git a/pylipd/globals/queries.py b/pylipd/globals/queries.py index 7a99736..982de89 100644 --- 
a/pylipd/globals/queries.py +++ b/pylipd/globals/queries.py @@ -259,6 +259,22 @@ } """ +QUERY_FILTER_TIME = """ + SELECT ?dsname ?minage ?maxage WHERE { + ?ds a le:Dataset . + ?ds le:hasName ?dsname . + + ?ds le:hasPaleoData ?data . + ?data le:hasMeasurementTable ?table . + ?table le:hasVariable ?var . + ?table le:hasVariable ?timevar . + ?timevar le:hasName ?time_variableName . + FILTER (regex(?time_variableName, "year.*") || regex(?time_variableName, "age.*")) . + ?timevar le:hasMinValue ?minage . + ?timevar le:hasMaxValue ?maxage . +} +""" + QUERY_FILTER_VARIABLE_NAME = """ SELECT ?uri ?dsuri ?dsname ?tableuri ?id ?name WHERE { ?uri le:hasVariableId ?id . diff --git a/pylipd/lipd.py b/pylipd/lipd.py index c833776..c00555b 100644 --- a/pylipd/lipd.py +++ b/pylipd/lipd.py @@ -20,7 +20,7 @@ from pylipd.utils.json_to_rdf import JSONToRDF from pylipd.utils.rdf_to_json import RDFToJSON -from .globals.queries import QUERY_BIBLIO, QUERY_DSID, QUERY_DSNAME, QUERY_ENSEMBLE_TABLE, QUERY_ENSEMBLE_TABLE_SHORT, QUERY_FILTER_ARCHIVE_TYPE, QUERY_FILTER_GEO, QUERY_VARIABLE, QUERY_VARIABLE_GRAPH, QUERY_UNIQUE_ARCHIVE_TYPE, QUERY_TIMESERIES_ESSENTIALS_CHRON, QUERY_TIMESERIES_ESSENTIALS_PALEO, QUERY_DISTINCT_VARIABLE, QUERY_DATASET_PROPERTIES, QUERY_VARIABLE_PROPERTIES, QUERY_MODEL_PROPERTIES, QUERY_LOCATION +from .globals.queries import QUERY_FILTER_TIME, QUERY_BIBLIO, QUERY_DSID, QUERY_DSNAME, QUERY_ENSEMBLE_TABLE, QUERY_ENSEMBLE_TABLE_SHORT, QUERY_FILTER_ARCHIVE_TYPE, QUERY_FILTER_GEO, QUERY_VARIABLE, QUERY_VARIABLE_GRAPH, QUERY_UNIQUE_ARCHIVE_TYPE, QUERY_TIMESERIES_ESSENTIALS_CHRON, QUERY_TIMESERIES_ESSENTIALS_PALEO, QUERY_DISTINCT_VARIABLE, QUERY_DATASET_PROPERTIES, QUERY_VARIABLE_PROPERTIES, QUERY_MODEL_PROPERTIES, QUERY_LOCATION from .lipd_series import LiPDSeries from .utils.multi_processing import multi_convert_to_rdf, multi_load_lipd from .utils.rdf_graph import RDFGraph @@ -1158,6 +1158,79 @@ def filter_by_archive_type(self, archiveType): dsnames = 
[sanitizeId(row.dsname) for row in qres] return self.get(dsnames) + def filter_by_time(self,timeBound, timeBoundType = 'any', recordLength = None): + """ + Filter the records according to a specified time interval and the length of the record within that interval. Note that this function assumes that all records use the same time representation. + + If you are unsure about the time representation, you may need to use `.get_timeseries_essentials`. + + Parameters + ---------- + timeBound : list + Minimum and Maximum age value to search for. + timeBoundType : str, optional + The type of querying to perform. Possible values include: "any", "entire", and "entirely". + - any: Overlap any portions of matching datasets (default) + - entirely: are entirely overlapped by matching datasets + - entire: overlap entire matching datasets but dataset can be shorter than the bounds + The default is 'any'. + recordLength : float, optional + The minimum length the record needs to have while matching the timeBound criteria. The default is None. + + Raises + ------ + ValueError + If timeBoundType is not one of "any", "entire", or "entirely". + + Returns + ------- + pylipd.lipd.LiPD + A new LiPD object that only contains datasets that have the specified time interval + + Examples + -------- + PyLiPD ships with existing datasets that can be loaded directly through the package. Let's load the Pages2k sample datasets using this method. + + .. 
jupyter-execute:: + + from pylipd.utils.dataset import load_dir + + lipd = load_dir('Pages2k') + Lfiltered = lipd.filter_by_time(timeBound=[0,1800]) + Lfiltered.get_all_dataset_names() + + """ + + if timeBound and timeBound[0]>timeBound[1]: + timeBound = [timeBound[1],timeBound[0]] + + timeBoundType=timeBoundType.lower() + + query = QUERY_FILTER_TIME + __, df = self.query(query) + if recordLength is None: + if timeBoundType == 'entirely': + filter_df = df[(df['minage'] <= timeBound[0]) & (df['maxage'] >= timeBound[1])] + elif timeBoundType == 'entire': + filter_df = df[(df['minage'] >= timeBound[0]) & (df['maxage'] <= timeBound[1])] + elif timeBoundType == 'any': + filter_df = df[(df['minage'] <= timeBound[1]) & (df['maxage'] >= timeBound[0])] + else: + raise ValueError("timeBoundType must be in ['any', 'entirely','entire']") + else: + if timeBoundType == 'entirely': + filter_df = df[(df['minage'] <= timeBound[0]) & (df['maxage'] >= timeBound[1]) & (np.abs(df['maxage']-df['minage'])>=recordLength)] + elif timeBoundType == 'entire': + filter_df = df[(df['minage'] >= timeBound[0]) & (df['maxage'] <= timeBound[1]) & (np.abs(df['maxage']-df['minage'])>=recordLength)] + elif timeBoundType == 'any': + filter_df = df[(df['minage'] <= timeBound[1]) & (df['maxage'] >= timeBound[0]) & ((np.minimum(df['maxage'], timeBound[1]) - np.maximum(df['minage'], timeBound[0])) >= recordLength)] # length of the part of the record that falls inside timeBound + else: + raise ValueError("timeBoundType must be in ['any', 'entirely','entire']") + + dsnames = list(filter_df['dsname']) + return self.get(dsnames) + + def get_datasets(self) -> 'list[Dataset]': ''' Return datasets as instances of the Dataset class diff --git a/pylipd/tests/test_LiPD.py b/pylipd/tests/test_LiPD.py index 5de8d8e..a35b30a 100644 --- a/pylipd/tests/test_LiPD.py +++ b/pylipd/tests/test_LiPD.py @@ -98,6 +98,18 @@ def test_archive_to(self,pages2k): Lfiltered = D.filter_by_archive_type('marine sediment') assert len(Lfiltered.get_all_archiveTypes())==1 assert Lfiltered.get_all_archiveTypes()[0] == 'Marine sediment' + + @pytest.mark.parametrize(('timeBoundType', 'recordLength'), + [('any', None), + 
('any', 500), + ('entire', None), + ('entire', 20), + ('entirely',None), + ('entirely', 100) + ]) + def test_time_t0(self,timeBoundType,recordLength,pages2k): + D=pages2k + Lfiltered = D.filter_by_time(timeBound=[0,1800], timeBoundType=timeBoundType,recordLength=recordLength) class TestGet(): From 3b08da072874211c9a27a0e5f0537c2e23622e58 Mon Sep 17 00:00:00 2001 From: Deborah Khider Date: Mon, 19 Aug 2024 14:11:22 -0700 Subject: [PATCH 5/5] Update CITATION.cff --- CITATION.cff | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index f937fbd..162fd6a 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,4 +1,4 @@ -cff-version: 0.0.4 +cff-version: 0.0.5 message: "If you use this software, please cite it as below." authors: - family-names: "Ratnakar" @@ -8,7 +8,7 @@ authors: given-names: "Deborah" orcid: "https://orcid.org/0000-0001-7501-8430" title: "PyLiPD: a Python package for the manipulation of LiPD datasets" -version: v1.3.6 +version: v1.3.7 doi: 10.5281/zenodo.7951201 -date-released: 2023-06-19 +date-released: 2024-08-19 url: "https://github.com/LinkedEarth/pylipd"