From bf9e9a46609ad4ce4d74b6c266e51167fd7319c8 Mon Sep 17 00:00:00 2001
From: khider <11758571+khider@users.noreply.github.com>
Date: Fri, 26 Jul 2024 10:00:30 -0700
Subject: [PATCH 1/2] proxy queries

---
Review notes (text between "---" and the diffstat is ignored by git am):
 * QUERY_FILTER_VARIABLE_PROXY: the graph pattern now binds ?dsname, the name
   listed in its SELECT clause; it previously bound ?dataSetName, so the
   ?dsname column could never be populated.
 * filter_by_proxy / get_all_proxy docstrings: wording fixed ("specified
   proxy", "proxies").
 * test_convert_to_rdf_t0: "except Exception:" replaces the bare "except:",
   which would also swallow KeyboardInterrupt/SystemExit.
 * PREFIX le: / PREFIX wgs84: had lost their IRIs in this copy of the patch;
   they are restored from the published namespaces -- confirm against
   pylipd/globals/queries.py before applying.
 * Line layout and indentation were rebuilt from a whitespace-collapsed copy;
   hunk line counts match the headers, but the blob ids on the "index" lines
   were not recomputed.
 * version('pyleoclim') is left as committed here; PATCH 2/2 corrects it to
   'pylipd'.

 pylipd/__init__.py              |  4 +-
 pylipd/globals/queries.py       | 25 ++++++++++
 pylipd/lipd_series.py           | 87 +++++++++++++++++++++++++++++++--
 pylipd/tests/test_LiPD.py       |  7 ++-
 pylipd/tests/test_LiPDSeries.py | 15 +++++-
 5 files changed, 131 insertions(+), 7 deletions(-)

diff --git a/pylipd/__init__.py b/pylipd/__init__.py
index c541770..648503e 100644
--- a/pylipd/__init__.py
+++ b/pylipd/__init__.py
@@ -1,3 +1,5 @@
-__version__ = "1.3.5"
+from importlib.metadata import version
+__version__ = version('pyleoclim')
+
 
 from .utils import *
diff --git a/pylipd/globals/queries.py b/pylipd/globals/queries.py
index f3569bd..a609d3b 100644
--- a/pylipd/globals/queries.py
+++ b/pylipd/globals/queries.py
@@ -132,6 +132,19 @@
     }
 
 
+"""
+
+QUERY_DISTINCT_PROXY = """
+    PREFIX le: <http://linked.earth/ontology#>
+
+    SELECT distinct ?proxy
+    WHERE {
+        OPTIONAL{?uri le:hasProxy ?proxyObj .
+        ?proxyObj rdfs:label ?proxy .}
+        ?uri le:hasVariableId ?TSID
+    }
+
+
 """
 
 QUERY_VARIABLE = """
@@ -257,6 +270,18 @@
     }
 """
 
+QUERY_FILTER_VARIABLE_PROXY = """
+    SELECT ?uri ?dsuri ?dsname ?tableuri ?id ?proxy WHERE {
+        ?uri le:hasVariableId ?id .
+        ?uri le:hasProxy ?proxyObj .
+        ?proxyObj rdfs:label ?proxy .
+        FILTER regex(?proxy, "[proxy].*", "i") .
+        ?uri le:foundInDataset ?dsuri .
+        ?uri le:foundInDatasetName ?dsname .
+        ?uri le:foundInTable ?tableuri .
+    }
+"""
+
 QUERY_TIMESERIES_ESSENTIALS_PALEO ="""
     PREFIX wgs84: <http://www.w3.org/2003/01/geo/wgs84_pos#>
     SELECT ?dataSetName ?archiveType ?geo_meanLat ?geo_meanLon ?geo_meanElev
diff --git a/pylipd/lipd_series.py b/pylipd/lipd_series.py
index ed7e82d..c7e6469 100644
--- a/pylipd/lipd_series.py
+++ b/pylipd/lipd_series.py
@@ -1,5 +1,5 @@
 from tqdm import tqdm
-from .globals.queries import QUERY_FILTER_VARIABLE_NAME, QUERY_VARIABLE, QUERY_DISTINCT_VARIABLE, QUERY_VARIABLE_ESSENTIALS
+from .globals.queries import QUERY_FILTER_VARIABLE_NAME, QUERY_VARIABLE, QUERY_DISTINCT_VARIABLE, QUERY_VARIABLE_ESSENTIALS, QUERY_DISTINCT_PROXY, QUERY_FILTER_VARIABLE_PROXY
 from .utils.multi_processing import multi_load_lipd_series
 from .utils.rdf_graph import RDFGraph
 
@@ -95,7 +95,7 @@ def get_all_variables(self):
 
     def get_all_variable_names(self):
         """
-        Get a list of all possible distinct variableNames. Useful for filtering and qeurying.
+        Get a list of all possible distinct variableNames. Useful for filtering and querying.
 
         Returns
         -------
@@ -115,6 +115,30 @@ def get_all_variable_names(self):
         """
 
         return self.query(QUERY_DISTINCT_VARIABLE)[1].iloc[:,0].values.tolist()
+
+    def get_all_proxy(self):
+
+        """
+        Get a list of all possible proxies. Useful for filtering and querying.
+
+        Returns
+        -------
+        list
+            A list of unique proxies
+
+        Examples
+        --------
+
+        .. jupyter-execute::
+
+            from pylipd.utils.dataset import load_dir
+            lipd = load_dir('Pages2k')
+            S = lipd.to_lipd_series()
+            proxyName = S.get_all_proxy()
+            print(proxyName)
+        """
+
+        return self.query(QUERY_DISTINCT_PROXY)[1].iloc[:,0].values.tolist()
 
     def get_timeseries_essentials(self):
         '''This function returns information about each variable: `dataSetName`, `archiveType`, `name`, `values`, `units`, `TSID`, `proxy`.
@@ -167,6 +191,18 @@ def filter_by_name(self, name):
 
         pylipd.lipd_series.LiPDSeries
             A new LiPDSeries object that only contains variables that have the specified name (regex)
+
+        Examples
+        --------
+
+        .. jupyter-execute::
+
+            from pylipd.utils.dataset import load_datasets
+            lipd = load_datasets('ODP846.Lawrence.2006.lpd')
+            S = lipd.to_lipd_series()
+            sst = S.filter_by_name('sst')
+
+            print(sst.get_all_variable_names())
 
         '''
         query = QUERY_FILTER_VARIABLE_NAME
@@ -176,9 +212,52 @@ def filter_by_name(self, name):
         varuris = [str(row.uri) for row in qres]
         dsuris = [*set([str(row.dsuri) for row in qres])]
 
-        print(len(dsuris))
+        #print(len(dsuris))
+
+        rdfgraph = self.get(varuris)
+        S = LiPDSeries(rdfgraph.graph)
+        S.lipds = {k: self.lipds[k].copy() for k in dsuris}
+        return S
 
+    def filter_by_proxy(self, proxy):
+        '''
+        Filters series to return a new LiPDSeries that only keeps variables that have the specified proxy (regex)
+
+        Parameters
+        ----------
+
+        proxy : str
+            The name of the proxy to filter by
+
+        Returns
+        -------
+
+        pylipd.lipd_series.LiPDSeries
+            A new LiPDSeries object that only contains variables that have the specified proxy (regex)
+
+        Examples
+        --------
+
+        .. jupyter-execute::
+
+            from pylipd.utils.dataset import load_dir
+            lipd = load_dir('Pages2k')
+            S = lipd.to_lipd_series()
+            S_filtered = S.filter_by_proxy('ring width')
+            print(S_filtered.get_all_proxy())
+
+        '''
+        query = QUERY_FILTER_VARIABLE_PROXY
+        query = query.replace("[proxy]", proxy)
+
+        qres, qres_df = self.query(query)
+        varuris = [str(row.uri) for row in qres]
+        dsuris = [*set([str(row.dsuri) for row in qres])]
+
+        #print(len(dsuris))
+
         rdfgraph = self.get(varuris)
         S = LiPDSeries(rdfgraph.graph)
         S.lipds = {k: self.lipds[k].copy() for k in dsuris}
-        return S
\ No newline at end of file
+        return S
+    
\ No newline at end of file
diff --git a/pylipd/tests/test_LiPD.py b/pylipd/tests/test_LiPD.py
index 48acca6..f0adce3 100644
--- a/pylipd/tests/test_LiPD.py
+++ b/pylipd/tests/test_LiPD.py
@@ -24,6 +24,8 @@
 from pylipd.lipd import LiPD
 import urllib as urllib
 
+
+
 class TestLiPDLoad():
 
     def test_load_t0(self, odp846):
@@ -181,7 +183,10 @@ class TestRdf():
 
     def test_convert_to_rdf_t0(self):
         lipd = LiPD()
-        lipd.convert_lipd_dir_to_rdf("./examples/data/Pages2k", "all-lipd.nq")
+        try:
+            lipd.convert_lipd_dir_to_rdf("../data/Pages2k", "all-lipd.nq")
+        except Exception:
+            lipd.convert_lipd_dir_to_rdf("./examples/data/Pages2k", "all-lipd.nq")
 
 
         
\ No newline at end of file
diff --git a/pylipd/tests/test_LiPDSeries.py b/pylipd/tests/test_LiPDSeries.py
index 5219438..9e28d99 100644
--- a/pylipd/tests/test_LiPDSeries.py
+++ b/pylipd/tests/test_LiPDSeries.py
@@ -43,7 +43,12 @@ def test_timeseries_essentials_t0(self, pages2k):
         D=pages2k
         S = D.to_lipd_series()
         names = S.get_timeseries_essentials()
-        
+
+    def test_proxy_t0(self, pages2k):
+        D=pages2k
+        S = D.to_lipd_series()
+        names = S.get_all_proxy()
+        
 class TestFiler():
 
     def test_name_t0(self,pages2k):
@@ -52,5 +57,13 @@ def test_name_t0(self,pages2k):
         Sfiltered = S.filter_by_name('temperature')
         df=Sfiltered.get_timeseries_essentials()
         assert len(df.index)==11
+
+    def test_proxy_t0(self,pages2k):
+        D=pages2k
+        S = D.to_lipd_series()
+        Sfiltered = S.filter_by_proxy('ring width')
+        v = Sfiltered.get_all_proxy()
+        assert len(v)==1
+
 
         
\ No newline at end of file

From 564be332f8e8dd7f810cd62cfa105c4c18ca73cc Mon Sep 17 00:00:00 2001
From: Deborah Khider
Date: Fri, 26 Jul 2024 10:21:26 -0700
Subject: [PATCH 2/2] Update __init__.py

---
 pylipd/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pylipd/__init__.py b/pylipd/__init__.py
index 0f11b88..eaf8d20 100644
--- a/pylipd/__init__.py
+++ b/pylipd/__init__.py
@@ -1,5 +1,5 @@
 from importlib.metadata import version
-__version__ = version('pyleoclim')
+__version__ = version('pylipd')
 
 
 from .utils import *