Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
IKCAP committed Aug 20, 2024
2 parents 0220293 + 3b08da0 commit 4328d32
Show file tree
Hide file tree
Showing 6 changed files with 222 additions and 10 deletions.
6 changes: 3 additions & 3 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cff-version: 0.0.4
cff-version: 0.0.5
message: "If you use this software, please cite it as below."
authors:
- family-names: "Ratnakar"
Expand All @@ -8,7 +8,7 @@ authors:
given-names: "Deborah"
orcid: "https://orcid.org/0000-0001-7501-8430"
title: "PyLiPD: a Python package for the manipulation of LiPD datasets"
version: v1.3.6
version: v1.3.7
doi: 10.5281/zenodo.7951201
date-released: 2023-06-19
date-released: 2024-08-19
url: "https://github.com/LinkedEarth/pylipd"
35 changes: 35 additions & 0 deletions pylipd/globals/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,22 @@
}
"""

# SPARQL query: for every dataset, report the min/max value of each of its
# time axes. A variable counts as a time axis when its name matches the
# (case-sensitive) regexes "year.*" or "age.*".
# NOTE: the previous version also joined an unused `?var` variable
# (`?table le:hasVariable ?var`), which multiplied each result row by the
# number of variables in the table; that dead triple is removed here.
QUERY_FILTER_TIME = """
SELECT ?dsname ?minage ?maxage WHERE {
    ?ds a le:Dataset .
    ?ds le:hasName ?dsname .
    ?ds le:hasPaleoData ?data .
    ?data le:hasMeasurementTable ?table .
    ?table le:hasVariable ?timevar .
    ?timevar le:hasName ?time_variableName .
    FILTER (regex(?time_variableName, "year.*") || regex(?time_variableName, "age.*")) .
    ?timevar le:hasMinValue ?minage .
    ?timevar le:hasMaxValue ?maxage .
}
"""

QUERY_FILTER_VARIABLE_NAME = """
SELECT ?uri ?dsuri ?dsname ?tableuri ?id ?name WHERE {
?uri le:hasVariableId ?id .
Expand All @@ -282,6 +298,19 @@
}
"""

# SPARQL query template: find variables whose [stat] resolution value is
# strictly below [value]. The placeholders [stat] (Mean/Median/Min/Max) and
# [value] (a number) are substituted before the query is executed.
# FIX: the body previously bound `?dataSetName` while the SELECT clause
# projected `?dsname`, so the dataset-name column was always unbound; the
# triple now binds `?dsname` to match the projection.
QUERY_FILTER_VARIABLE_RESOLUTION = """
SELECT ?uri ?dsuri ?dsname ?tableuri ?id ?v WHERE {
    ?uri le:hasVariableId ?id .
    ?uri le:hasResolution ?res .
    ?res le:has[stat]Value ?v .
    FILTER(?v<[value]) .
    ?uri le:foundInDataset ?dsuri .
    ?uri le:foundInDatasetName ?dsname .
    ?uri le:foundInTable ?tableuri .
}
"""


QUERY_TIMESERIES_ESSENTIALS_PALEO ="""
PREFIX wgs84: <http://www.w3.org/2003/01/geo/wgs84_pos#>
SELECT ?dataSetName ?archiveType ?geo_meanLat ?geo_meanLon ?geo_meanElev
Expand Down Expand Up @@ -423,6 +452,12 @@
}
"""

## At the LiPDSeries level

# SPARQL query: matches every (subject, predicate, object) triple, so it
# returns all distinct predicate URIs present in the graph. Used to
# enumerate the property names available when writing custom queries.
QUERY_LiPDSERIES_PROPERTIES="""
SELECT DISTINCT ?p WHERE {
?uri ?p ?v .}
"""

QUERY_DATASET_PROPERTIES="""
PREFIX le: <http://linked.earth/ontology#>
Expand Down
75 changes: 74 additions & 1 deletion pylipd/lipd.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from pylipd.utils.json_to_rdf import JSONToRDF
from pylipd.utils.rdf_to_json import RDFToJSON

from .globals.queries import QUERY_BIBLIO, QUERY_DSID, QUERY_DSNAME, QUERY_ENSEMBLE_TABLE, QUERY_ENSEMBLE_TABLE_SHORT, QUERY_FILTER_ARCHIVE_TYPE, QUERY_FILTER_GEO, QUERY_VARIABLE, QUERY_VARIABLE_GRAPH, QUERY_UNIQUE_ARCHIVE_TYPE, QUERY_TIMESERIES_ESSENTIALS_CHRON, QUERY_TIMESERIES_ESSENTIALS_PALEO, QUERY_DISTINCT_VARIABLE, QUERY_DATASET_PROPERTIES, QUERY_VARIABLE_PROPERTIES, QUERY_MODEL_PROPERTIES, QUERY_LOCATION
from .globals.queries import QUERY_FILTER_TIME, QUERY_BIBLIO, QUERY_DSID, QUERY_DSNAME, QUERY_ENSEMBLE_TABLE, QUERY_ENSEMBLE_TABLE_SHORT, QUERY_FILTER_ARCHIVE_TYPE, QUERY_FILTER_GEO, QUERY_VARIABLE, QUERY_VARIABLE_GRAPH, QUERY_UNIQUE_ARCHIVE_TYPE, QUERY_TIMESERIES_ESSENTIALS_CHRON, QUERY_TIMESERIES_ESSENTIALS_PALEO, QUERY_DISTINCT_VARIABLE, QUERY_DATASET_PROPERTIES, QUERY_VARIABLE_PROPERTIES, QUERY_MODEL_PROPERTIES, QUERY_LOCATION
from .lipd_series import LiPDSeries
from .utils.multi_processing import multi_convert_to_rdf, multi_load_lipd
from .utils.rdf_graph import RDFGraph
Expand Down Expand Up @@ -1158,6 +1158,79 @@ def filter_by_archive_type(self, archiveType):
dsnames = [sanitizeId(row.dsname) for row in qres]
return self.get(dsnames)

def filter_by_time(self, timeBound, timeBoundType='any', recordLength=None):
    """Filter the records according to a specified time interval and the
    length of the record within that interval.

    Note that this function assumes that all records use the same time
    representation. If you are unsure about the time representation, you may
    need to use `.get_timeseries_essentials`.

    Parameters
    ----------
    timeBound : list
        Minimum and maximum age value to search for. Bounds given in
        descending order are swapped automatically.
    timeBoundType : str, optional
        The type of querying to perform. Possible values include: "any",
        "entire", and "entirely".

        - any: overlap any portion of matching datasets (default)
        - entirely: are entirely overlapped by matching datasets
        - entire: overlap entire matching datasets but dataset can be
          shorter than the bounds
    recordLength : float, optional
        The minimum length the record needs to have while matching the
        timeBound criteria. The default is None (no length constraint).

    Raises
    ------
    ValueError
        If timeBoundType is not one of "any", "entire" or "entirely".

    Returns
    -------
    pylipd.lipd.LiPD
        A new LiPD object that only contains datasets that have the
        specified time interval.

    Examples
    --------
    pyLipd ships with existing datasets that can be loaded directly through
    the package. Let's load the Pages2k sample datasets using this method.

    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir('Pages2k')
        Lfiltered = lipd.filter_by_time(timeBound=[0,1800])
        Lfiltered.get_all_dataset_names()
    """
    # Normalize the window so that timeBound[0] <= timeBound[1].
    if timeBound and timeBound[0] > timeBound[1]:
        timeBound = [timeBound[1], timeBound[0]]

    timeBoundType = timeBoundType.lower()

    _, df = self.query(QUERY_FILTER_TIME)

    # Interval predicate applied to each record's [minage, maxage] span.
    if timeBoundType == 'entirely':
        # The record spans the whole query window.
        mask = (df['minage'] <= timeBound[0]) & (df['maxage'] >= timeBound[1])
    elif timeBoundType == 'entire':
        # The record lies fully inside the query window.
        mask = (df['minage'] >= timeBound[0]) & (df['maxage'] <= timeBound[1])
    elif timeBoundType == 'any':
        # Any overlap: the record must start before the window ends AND end
        # after the window starts. (Previously only the upper bound was
        # checked, so records lying entirely before the window matched.)
        mask = (df['minage'] <= timeBound[1]) & (df['maxage'] >= timeBound[0])
    else:
        raise ValueError("timeBoundType must be in ['any', 'entirely','entire']")

    if recordLength is not None:
        if timeBoundType == 'any':
            # Length of the portion of the record that falls inside the
            # window. (Previously measured |minage - timeBound[1]|, which
            # overcounted records ending before the upper bound.)
            overlap = (np.minimum(df['maxage'], timeBound[1])
                       - np.maximum(df['minage'], timeBound[0]))
            mask = mask & (overlap >= recordLength)
        else:
            # For 'entire'/'entirely', the whole record length applies.
            mask = mask & (np.abs(df['maxage'] - df['minage']) >= recordLength)

    # Deduplicate while preserving order: the query may return one row per
    # matching time variable in a dataset.
    dsnames = list(dict.fromkeys(df[mask]['dsname']))
    return self.get(dsnames)


def get_datasets(self) -> 'list[Dataset]':
'''
Return datasets as instances of the Dataset class
Expand Down
91 changes: 86 additions & 5 deletions pylipd/lipd_series.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from tqdm import tqdm
from .globals.queries import QUERY_FILTER_VARIABLE_NAME, QUERY_VARIABLE, QUERY_DISTINCT_VARIABLE, QUERY_VARIABLE_ESSENTIALS, QUERY_DISTINCT_PROXY, QUERY_FILTER_VARIABLE_PROXY

from .globals.queries import QUERY_FILTER_VARIABLE_NAME, QUERY_VARIABLE, QUERY_DISTINCT_VARIABLE, QUERY_VARIABLE_ESSENTIALS, QUERY_DISTINCT_PROXY, QUERY_FILTER_VARIABLE_PROXY, QUERY_FILTER_VARIABLE_RESOLUTION, QUERY_LiPDSERIES_PROPERTIES
from .utils.multi_processing import multi_load_lipd_series
from .utils.rdf_graph import RDFGraph

Expand Down Expand Up @@ -175,6 +174,37 @@ def get_timeseries_essentials(self):
qres_df['values']=qres_df['values'].apply(lambda row : np.array(json.loads(row)))

return qres_df

def get_variable_properties(self):
    """Get a list of all the property names present in this series' graph.

    Useful as a starting point for writing custom SPARQL queries.

    Returns
    -------
    clean_list : list
        A list of unique property names, with the URI namespace prefix
        (everything up to the last '#') stripped.

    Examples
    --------
    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir()
        S = lipd.to_lipd_series()
        l = S.get_variable_properties()
        print(l)
    """
    # The second element of the query result is a DataFrame whose single
    # column holds the distinct predicate URIs of the graph.
    result_df = self.query(QUERY_LiPDSERIES_PROPERTIES)[1]
    property_uris = result_df.iloc[:, 0].values.tolist()
    # Keep only the fragment after the '#' separator of each URI.
    return [uri.split("#")[-1] for uri in property_uris]


def filter_by_name(self, name):
'''
Expand Down Expand Up @@ -254,10 +284,61 @@ def filter_by_proxy(self, proxy):
varuris = [str(row.uri) for row in qres]
dsuris = [*set([str(row.dsuri) for row in qres])]

#print(len(dsuris))

rdfgraph = self.get(varuris)
S = LiPDSeries(rdfgraph.graph)
S.lipds = {k: self.lipds[k].copy() for k in dsuris}
return S


def filter_by_resolution(self, threshold, stats='Mean'):
    '''Filter the series down to variables whose resolution is finer than a
    threshold.

    Returns a new LiPDSeries that only keeps variables whose chosen
    resolution statistic is less than `threshold`.

    Parameters
    ----------
    threshold : float
        The maximum resolution to keep.
    stats : str, optional
        Whether to use 'Mean', 'Median', 'Min' or 'Max' resolution.
        The default is 'Mean'.

    Raises
    ------
    ValueError
        If `stats` is not one of 'Mean', 'Median', 'Min' or 'Max'.

    Returns
    -------
    S : pylipd.lipd_series.LiPDSeries
        A new LiPDSeries object that only contains the filtered variables.

    Examples
    --------
    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir('Pages2k')
        S = lipd.to_lipd_series()
        S_filtered = S.filter_by_resolution(10)
    '''
    # Normalize capitalization so e.g. 'mean' is accepted.
    stat_key = stats.capitalize()
    if stat_key not in ('Mean', 'Median', 'Min', 'Max'):
        raise ValueError("Stats must be ['Mean','Median', 'Min', 'Max']")

    # Coerce early so a non-numeric threshold fails before querying.
    max_resolution = float(threshold)

    # Substitute the statistic and threshold into the query template.
    resolution_query = (QUERY_FILTER_VARIABLE_RESOLUTION
                        .replace("[value]", str(max_resolution))
                        .replace("[stat]", stat_key))

    qres, _ = self.query(resolution_query)

    variable_uris = []
    dataset_uris = set()
    for row in qres:
        variable_uris.append(str(row.uri))
        dataset_uris.add(str(row.dsuri))

    # Build a new series restricted to the matching variables, carrying
    # over copies of the LiPD objects for the datasets they belong to.
    filtered_graph = self.get(variable_uris)
    S = LiPDSeries(filtered_graph.graph)
    S.lipds = {uri: self.lipds[uri].copy() for uri in dataset_uris}
    return S
12 changes: 12 additions & 0 deletions pylipd/tests/test_LiPD.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,18 @@ def test_archive_to(self,pages2k):
Lfiltered = D.filter_by_archive_type('marine sediment')
assert len(Lfiltered.get_all_archiveTypes())==1
assert Lfiltered.get_all_archiveTypes()[0] == 'Marine sediment'

@pytest.mark.parametrize(('timeBoundType', 'recordLength'),
                         [('any', None),
                          ('any', 500),
                          ('entire', None),
                          ('entire', 20),
                          ('entirely', None),
                          ('entirely', 100)
                          ])
def test_time_t0(self, timeBoundType, recordLength, pages2k):
    # Smoke test: filter_by_time must run without raising for every supported
    # timeBoundType, both with and without a recordLength constraint.
    # NOTE(review): nothing is asserted on the filtered result — this only
    # verifies the call completes; consider asserting on the dataset names.
    D = pages2k
    Lfiltered = D.filter_by_time(timeBound=[0, 1800], timeBoundType=timeBoundType, recordLength=recordLength)


class TestGet():
Expand Down
13 changes: 12 additions & 1 deletion pylipd/tests/test_LiPDSeries.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
4. after `pip install pytest-xdist`, one may execute "pytest -n 4" to test in parallel with number of workers specified by `-n`
5. for more details, see https://docs.pytest.org/en/stable/usage.html
"""

import pytest
from pylipd.lipd_series import LiPDSeries

class TestLoad():
Expand Down Expand Up @@ -48,6 +48,11 @@ def test_proxy_t0(self, pages2k):
D=pages2k
S = D.to_lipd_series()
names = S.get_all_proxy()

def test_variable_t0(self, pages2k):
    # Smoke test: get_variable_properties on a LiPDSeries built from the
    # pages2k fixture should run without raising.
    # NOTE(review): the returned list `l` is not asserted on — errors only.
    D = pages2k
    S = D.to_lipd_series()
    l = S.get_variable_properties()

class TestFiler():

Expand All @@ -64,6 +69,12 @@ def test_proxy_t0(self,pages2k):
Sfiltered = S.filter_by_proxy('ring width')
v = Sfiltered.get_all_proxy()
assert len(v)==1

@pytest.mark.parametrize('stats', ['Mean', 'Median', 'Min', 'Max'])
def test_resolution_t0(self, stats, pages2k):
    # Smoke test: filter_by_resolution must accept each supported resolution
    # statistic without raising.
    # NOTE(review): the filtered series is not asserted on — errors only.
    D = pages2k
    S = D.to_lipd_series()
    Sfiltered = S.filter_by_resolution(threshold=10, stats=stats)



0 comments on commit 4328d32

Please sign in to comment.