From 8c822570d77adbed08f1260ac07b7395441160bc Mon Sep 17 00:00:00 2001
From: khider <11758571+khider@users.noreply.github.com>
Date: Mon, 29 Jul 2024 10:20:19 -0700
Subject: [PATCH] filter by resolution

---
 pylipd/globals/queries.py       | 15 ++++++++
 pylipd/lipd_series.py           | 65 ++++++++++++++++++++++++++++++---
 pylipd/tests/test_LiPDSeries.py | 14 +++++++-
 3 files changed, 88 insertions(+), 6 deletions(-)

diff --git a/pylipd/globals/queries.py b/pylipd/globals/queries.py
index a609d3b..9196a49 100644
--- a/pylipd/globals/queries.py
+++ b/pylipd/globals/queries.py
@@ -282,6 +282,21 @@
     }
 """
 
+# Variables whose resolution statistic is strictly below a threshold.
+# "[stat]" and "[value]" are placeholders that LiPDSeries.filter_by_resolution substitutes before running the query.
+QUERY_FILTER_VARIABLE_RESOLUTION = """
+    SELECT ?uri ?dsuri ?dsname ?tableuri ?id ?v WHERE {
+        ?uri le:hasVariableId ?id .
+        ?uri le:hasResolution ?res .
+        ?res le:has[stat]Value ?v .
+        FILTER(?v<[value]) .
+        ?uri le:foundInDataset ?dsuri .
+        ?uri le:foundInDatasetName ?dsname .
+        ?uri le:foundInTable ?tableuri .
+    }
+"""
+
+
 QUERY_TIMESERIES_ESSENTIALS_PALEO ="""
     PREFIX wgs84:
     SELECT ?dataSetName ?archiveType ?geo_meanLat ?geo_meanLon ?geo_meanElev
diff --git a/pylipd/lipd_series.py b/pylipd/lipd_series.py
index c7e6469..1791935 100644
--- a/pylipd/lipd_series.py
+++ b/pylipd/lipd_series.py
@@ -1,6 +1,5 @@
 from tqdm import tqdm
-from .globals.queries import QUERY_FILTER_VARIABLE_NAME, QUERY_VARIABLE, QUERY_DISTINCT_VARIABLE, QUERY_VARIABLE_ESSENTIALS, QUERY_DISTINCT_PROXY, QUERY_FILTER_VARIABLE_PROXY
-
+from .globals.queries import QUERY_FILTER_VARIABLE_NAME, QUERY_VARIABLE, QUERY_DISTINCT_VARIABLE, QUERY_VARIABLE_ESSENTIALS, QUERY_DISTINCT_PROXY, QUERY_FILTER_VARIABLE_PROXY, QUERY_FILTER_VARIABLE_RESOLUTION
 from .utils.multi_processing import multi_load_lipd_series
 from .utils.rdf_graph import RDFGraph
 
@@ -254,10 +253,66 @@ def filter_by_proxy(self, proxy):
         varuris = [str(row.uri) for row in qres]
         dsuris = [*set([str(row.dsuri) for row in qres])]
 
-        #print(len(dsuris))
-
         rdfgraph = self.get(varuris)
         S = LiPDSeries(rdfgraph.graph)
         S.lipds = {k: self.lipds[k].copy() for k in dsuris}
         return S
-    
\ No newline at end of file
+
+    def filter_by_resolution(self, threshold, stats='Mean'):
+        '''
+        Filters series to return a new LiPDSeries that only keeps variables whose resolution is strictly less than the specified threshold.
+
+        Parameters
+        ----------
+        threshold : float
+            The resolution cutoff. Anything that float() accepts is allowed, but the value must be finite.
+        stats : str, optional
+            Whether to use the 'Mean', 'Median', 'Min' or 'Max' resolution (letter case is ignored). The default is 'Mean'.
+
+        Raises
+        ------
+        ValueError
+            If stats is not one of ['Mean','Median', 'Min', 'Max'], or if threshold is NaN or infinite.
+
+        Returns
+        -------
+        S : pylipd.lipd_series.LiPDSeries
+            A new LiPDSeries object that only contains the filtered variables
+
+        Examples
+        --------
+
+        .. jupyter-execute::
+
+            from pylipd.utils.dataset import load_dir
+            lipd = load_dir('Pages2k')
+            S = lipd.to_lipd_series()
+            S_filtered = S.filter_by_resolution(10)
+
+        '''
+        from math import isfinite
+
+        stats = str(stats).capitalize() # normalize letter case, e.g. 'mean' or 'MEAN' -> 'Mean'
+        stats_allowed = ['Mean','Median', 'Min', 'Max'] #possible values
+        if stats not in stats_allowed:
+            raise ValueError("Stats must be ['Mean','Median', 'Min', 'Max']")
+
+        threshold = float(threshold) # fails early if the value cannot be coerced into a float
+        if not isfinite(threshold):
+            # str(nan) / str(inf) are not valid SPARQL numbers, so the FILTER clause would not parse
+            raise ValueError("threshold must be a finite number")
+
+        query = QUERY_FILTER_VARIABLE_RESOLUTION
+        query = query.replace("[value]", str(threshold))
+        query = query.replace("[stat]", stats)
+
+        qres, _ = self.query(query)
+        rows = list(qres) # materialize once; the rows are read twice below
+
+        varuris = [str(row.uri) for row in rows]
+        dsuris = list({str(row.dsuri) for row in rows})
+
+        rdfgraph = self.get(varuris)
+        S = LiPDSeries(rdfgraph.graph)
+        S.lipds = {k: self.lipds[k].copy() for k in dsuris}
+        return S
\ No newline at end of file
diff --git a/pylipd/tests/test_LiPDSeries.py b/pylipd/tests/test_LiPDSeries.py
index 9e28d99..6bfdfb7 100644
--- a/pylipd/tests/test_LiPDSeries.py
+++ b/pylipd/tests/test_LiPDSeries.py
@@ -18,7 +18,7 @@
 4. after `pip install pytest-xdist`, one may execute "pytest -n 4" to test in parallel with number of workers specified by `-n`
 5. for more details, see https://docs.pytest.org/en/stable/usage.html
 """
-
+import pytest
 from pylipd.lipd_series import LiPDSeries
 
 class TestLoad():
@@ -64,6 +64,18 @@ def test_proxy_t0(self,pages2k):
         Sfiltered = S.filter_by_proxy('ring width')
         v = Sfiltered.get_all_proxy()
         assert len(v)==1
+
 
 
 
+    @pytest.mark.parametrize('stats',['Mean','Median','Min','Max'])
+    def test_resolution_t0(self,stats,pages2k):
+        D=pages2k
+        S = D.to_lipd_series()
+        Sfiltered = S.filter_by_resolution(threshold = 10,stats=stats)
+        assert isinstance(Sfiltered, LiPDSeries)
+
+    def test_resolution_t1(self,pages2k):
+        S = pages2k.to_lipd_series()
+        with pytest.raises(ValueError):
+            S.filter_by_resolution(threshold = 10, stats='Mode')
\ No newline at end of file