Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
IKCAP committed Aug 20, 2024
2 parents 0220293 + 3b08da0 commit 4328d32
Show file tree
Hide file tree
Showing 6 changed files with 222 additions and 10 deletions.
6 changes: 3 additions & 3 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cff-version: 0.0.4
cff-version: 0.0.5
message: "If you use this software, please cite it as below."
authors:
- family-names: "Ratnakar"
Expand All @@ -8,7 +8,7 @@ authors:
given-names: "Deborah"
orcid: "https://orcid.org/0000-0001-7501-8430"
title: "PyLiPD: a Python package for the manipulation of LiPD datasets"
version: v1.3.6
version: v1.3.7
doi: 10.5281/zenodo.7951201
date-released: 2023-06-19
date-released: 2024-08-19
url: "https://github.com/LinkedEarth/pylipd"
35 changes: 35 additions & 0 deletions pylipd/globals/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,22 @@
}
"""

# SPARQL query: for every dataset, report the min/max value of each of its
# time axes. A variable counts as a time axis when its name matches the
# (case-sensitive) regexes "year.*" or "age.*".
# NOTE: the previous version also joined an unused `?var` variable
# (`?table le:hasVariable ?var`), which multiplied each result row by the
# number of variables in the table; that dead triple is removed here.
QUERY_FILTER_TIME = """
SELECT ?dsname ?minage ?maxage WHERE {
    ?ds a le:Dataset .
    ?ds le:hasName ?dsname .
    ?ds le:hasPaleoData ?data .
    ?data le:hasMeasurementTable ?table .
    ?table le:hasVariable ?timevar .
    ?timevar le:hasName ?time_variableName .
    FILTER (regex(?time_variableName, "year.*") || regex(?time_variableName, "age.*")) .
    ?timevar le:hasMinValue ?minage .
    ?timevar le:hasMaxValue ?maxage .
}
"""

QUERY_FILTER_VARIABLE_NAME = """
SELECT ?uri ?dsuri ?dsname ?tableuri ?id ?name WHERE {
?uri le:hasVariableId ?id .
Expand All @@ -282,6 +298,19 @@
}
"""

# SPARQL query template: find variables whose [stat] resolution value is
# strictly below [value]. The placeholders [stat] (Mean/Median/Min/Max) and
# [value] (a number) are substituted before the query is executed.
# FIX: the body previously bound `?dataSetName` while the SELECT clause
# projected `?dsname`, so the dataset-name column was always unbound; the
# triple now binds `?dsname` to match the projection.
QUERY_FILTER_VARIABLE_RESOLUTION = """
SELECT ?uri ?dsuri ?dsname ?tableuri ?id ?v WHERE {
    ?uri le:hasVariableId ?id .
    ?uri le:hasResolution ?res .
    ?res le:has[stat]Value ?v .
    FILTER(?v<[value]) .
    ?uri le:foundInDataset ?dsuri .
    ?uri le:foundInDatasetName ?dsname .
    ?uri le:foundInTable ?tableuri .
}
"""


QUERY_TIMESERIES_ESSENTIALS_PALEO ="""
PREFIX wgs84: <http://www.w3.org/2003/01/geo/wgs84_pos#>
SELECT ?dataSetName ?archiveType ?geo_meanLat ?geo_meanLon ?geo_meanElev
Expand Down Expand Up @@ -423,6 +452,12 @@
}
"""

## At the LiPDSeries level

# SPARQL query: matches every (subject, predicate, object) triple, so it
# returns all distinct predicate URIs present in the graph. Used to
# enumerate the property names available when writing custom queries.
QUERY_LiPDSERIES_PROPERTIES="""
SELECT DISTINCT ?p WHERE {
?uri ?p ?v .}
"""

QUERY_DATASET_PROPERTIES="""
PREFIX le: <http://linked.earth/ontology#>
Expand Down
75 changes: 74 additions & 1 deletion pylipd/lipd.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from pylipd.utils.json_to_rdf import JSONToRDF
from pylipd.utils.rdf_to_json import RDFToJSON

from .globals.queries import QUERY_BIBLIO, QUERY_DSID, QUERY_DSNAME, QUERY_ENSEMBLE_TABLE, QUERY_ENSEMBLE_TABLE_SHORT, QUERY_FILTER_ARCHIVE_TYPE, QUERY_FILTER_GEO, QUERY_VARIABLE, QUERY_VARIABLE_GRAPH, QUERY_UNIQUE_ARCHIVE_TYPE, QUERY_TIMESERIES_ESSENTIALS_CHRON, QUERY_TIMESERIES_ESSENTIALS_PALEO, QUERY_DISTINCT_VARIABLE, QUERY_DATASET_PROPERTIES, QUERY_VARIABLE_PROPERTIES, QUERY_MODEL_PROPERTIES, QUERY_LOCATION
from .globals.queries import QUERY_FILTER_TIME, QUERY_BIBLIO, QUERY_DSID, QUERY_DSNAME, QUERY_ENSEMBLE_TABLE, QUERY_ENSEMBLE_TABLE_SHORT, QUERY_FILTER_ARCHIVE_TYPE, QUERY_FILTER_GEO, QUERY_VARIABLE, QUERY_VARIABLE_GRAPH, QUERY_UNIQUE_ARCHIVE_TYPE, QUERY_TIMESERIES_ESSENTIALS_CHRON, QUERY_TIMESERIES_ESSENTIALS_PALEO, QUERY_DISTINCT_VARIABLE, QUERY_DATASET_PROPERTIES, QUERY_VARIABLE_PROPERTIES, QUERY_MODEL_PROPERTIES, QUERY_LOCATION
from .lipd_series import LiPDSeries
from .utils.multi_processing import multi_convert_to_rdf, multi_load_lipd
from .utils.rdf_graph import RDFGraph
Expand Down Expand Up @@ -1158,6 +1158,79 @@ def filter_by_archive_type(self, archiveType):
dsnames = [sanitizeId(row.dsname) for row in qres]
return self.get(dsnames)

def filter_by_time(self, timeBound, timeBoundType='any', recordLength=None):
    """Filter the records according to a specified time interval and the
    length of the record within that interval.

    Note that this function assumes that all records use the same time
    representation. If you are unsure about the time representation, you may
    need to use `.get_timeseries_essentials`.

    Parameters
    ----------
    timeBound : list
        Minimum and maximum age value to search for. Bounds given in
        descending order are swapped automatically.
    timeBoundType : str, optional
        The type of querying to perform. Possible values include: "any",
        "entire", and "entirely".

        - any: overlap any portion of matching datasets (default)
        - entirely: are entirely overlapped by matching datasets
        - entire: overlap entire matching datasets but dataset can be
          shorter than the bounds
    recordLength : float, optional
        The minimum length the record needs to have while matching the
        timeBound criteria. The default is None (no length constraint).

    Raises
    ------
    ValueError
        If timeBoundType is not one of "any", "entire" or "entirely".

    Returns
    -------
    pylipd.lipd.LiPD
        A new LiPD object that only contains datasets that have the
        specified time interval.

    Examples
    --------
    pyLipd ships with existing datasets that can be loaded directly through
    the package. Let's load the Pages2k sample datasets using this method.

    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir('Pages2k')
        Lfiltered = lipd.filter_by_time(timeBound=[0,1800])
        Lfiltered.get_all_dataset_names()
    """
    # Normalize the window so that timeBound[0] <= timeBound[1].
    if timeBound and timeBound[0] > timeBound[1]:
        timeBound = [timeBound[1], timeBound[0]]

    timeBoundType = timeBoundType.lower()

    _, df = self.query(QUERY_FILTER_TIME)

    # Interval predicate applied to each record's [minage, maxage] span.
    if timeBoundType == 'entirely':
        # The record spans the whole query window.
        mask = (df['minage'] <= timeBound[0]) & (df['maxage'] >= timeBound[1])
    elif timeBoundType == 'entire':
        # The record lies fully inside the query window.
        mask = (df['minage'] >= timeBound[0]) & (df['maxage'] <= timeBound[1])
    elif timeBoundType == 'any':
        # Any overlap: the record must start before the window ends AND end
        # after the window starts. (Previously only the upper bound was
        # checked, so records lying entirely before the window matched.)
        mask = (df['minage'] <= timeBound[1]) & (df['maxage'] >= timeBound[0])
    else:
        raise ValueError("timeBoundType must be in ['any', 'entirely','entire']")

    if recordLength is not None:
        if timeBoundType == 'any':
            # Length of the portion of the record that falls inside the
            # window. (Previously measured |minage - timeBound[1]|, which
            # overcounted records ending before the upper bound.)
            overlap = (np.minimum(df['maxage'], timeBound[1])
                       - np.maximum(df['minage'], timeBound[0]))
            mask = mask & (overlap >= recordLength)
        else:
            # For 'entire'/'entirely', the whole record length applies.
            mask = mask & (np.abs(df['maxage'] - df['minage']) >= recordLength)

    # Deduplicate while preserving order: the query may return one row per
    # matching time variable in a dataset.
    dsnames = list(dict.fromkeys(df[mask]['dsname']))
    return self.get(dsnames)


def get_datasets(self) -> 'list[Dataset]':
'''
Return datasets as instances of the Dataset class
Expand Down
91 changes: 86 additions & 5 deletions pylipd/lipd_series.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from tqdm import tqdm
from .globals.queries import QUERY_FILTER_VARIABLE_NAME, QUERY_VARIABLE, QUERY_DISTINCT_VARIABLE, QUERY_VARIABLE_ESSENTIALS, QUERY_DISTINCT_PROXY, QUERY_FILTER_VARIABLE_PROXY

from .globals.queries import QUERY_FILTER_VARIABLE_NAME, QUERY_VARIABLE, QUERY_DISTINCT_VARIABLE, QUERY_VARIABLE_ESSENTIALS, QUERY_DISTINCT_PROXY, QUERY_FILTER_VARIABLE_PROXY, QUERY_FILTER_VARIABLE_RESOLUTION, QUERY_LiPDSERIES_PROPERTIES
from .utils.multi_processing import multi_load_lipd_series
from .utils.rdf_graph import RDFGraph

Expand Down Expand Up @@ -175,6 +174,37 @@ def get_timeseries_essentials(self):
qres_df['values']=qres_df['values'].apply(lambda row : np.array(json.loads(row)))

return qres_df

def get_variable_properties(self):
    """Get a list of all the property names present in this series' graph.

    Useful as a starting point for writing custom SPARQL queries.

    Returns
    -------
    clean_list : list
        A list of unique property names, with the URI namespace prefix
        (everything up to the last '#') stripped.

    Examples
    --------
    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir()
        S = lipd.to_lipd_series()
        l = S.get_variable_properties()
        print(l)
    """
    # The second element of the query result is a DataFrame whose single
    # column holds the distinct predicate URIs of the graph.
    result_df = self.query(QUERY_LiPDSERIES_PROPERTIES)[1]
    property_uris = result_df.iloc[:, 0].values.tolist()
    # Keep only the fragment after the '#' separator of each URI.
    return [uri.split("#")[-1] for uri in property_uris]


def filter_by_name(self, name):
'''
Expand Down Expand Up @@ -254,10 +284,61 @@ def filter_by_proxy(self, proxy):
varuris = [str(row.uri) for row in qres]
dsuris = [*set([str(row.dsuri) for row in qres])]

#print(len(dsuris))

rdfgraph = self.get(varuris)
S = LiPDSeries(rdfgraph.graph)
S.lipds = {k: self.lipds[k].copy() for k in dsuris}
return S


def filter_by_resolution(self, threshold, stats='Mean'):
    '''Filter the series down to variables whose resolution is finer than a
    threshold.

    Returns a new LiPDSeries that only keeps variables whose chosen
    resolution statistic is less than `threshold`.

    Parameters
    ----------
    threshold : float
        The maximum resolution to keep.
    stats : str, optional
        Whether to use 'Mean', 'Median', 'Min' or 'Max' resolution.
        The default is 'Mean'.

    Raises
    ------
    ValueError
        If `stats` is not one of 'Mean', 'Median', 'Min' or 'Max'.

    Returns
    -------
    S : pylipd.lipd_series.LiPDSeries
        A new LiPDSeries object that only contains the filtered variables.

    Examples
    --------
    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir('Pages2k')
        S = lipd.to_lipd_series()
        S_filtered = S.filter_by_resolution(10)
    '''
    # Normalize capitalization so e.g. 'mean' is accepted.
    stat_key = stats.capitalize()
    if stat_key not in ('Mean', 'Median', 'Min', 'Max'):
        raise ValueError("Stats must be ['Mean','Median', 'Min', 'Max']")

    # Coerce early so a non-numeric threshold fails before querying.
    max_resolution = float(threshold)

    # Substitute the statistic and threshold into the query template.
    resolution_query = (QUERY_FILTER_VARIABLE_RESOLUTION
                        .replace("[value]", str(max_resolution))
                        .replace("[stat]", stat_key))

    qres, _ = self.query(resolution_query)

    variable_uris = []
    dataset_uris = set()
    for row in qres:
        variable_uris.append(str(row.uri))
        dataset_uris.add(str(row.dsuri))

    # Build a new series restricted to the matching variables, carrying
    # over copies of the LiPD objects for the datasets they belong to.
    filtered_graph = self.get(variable_uris)
    S = LiPDSeries(filtered_graph.graph)
    S.lipds = {uri: self.lipds[uri].copy() for uri in dataset_uris}
    return S
12 changes: 12 additions & 0 deletions pylipd/tests/test_LiPD.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,18 @@ def test_archive_to(self,pages2k):
Lfiltered = D.filter_by_archive_type('marine sediment')
assert len(Lfiltered.get_all_archiveTypes())==1
assert Lfiltered.get_all_archiveTypes()[0] == 'Marine sediment'

@pytest.mark.parametrize(('timeBoundType', 'recordLength'),
                         [('any', None),
                          ('any', 500),
                          ('entire', None),
                          ('entire', 20),
                          ('entirely', None),
                          ('entirely', 100)
                          ])
def test_time_t0(self, timeBoundType, recordLength, pages2k):
    # Smoke test: filter_by_time must run without raising for every supported
    # timeBoundType, both with and without a recordLength constraint.
    # NOTE(review): nothing is asserted on the filtered result — this only
    # verifies the call completes; consider asserting on the dataset names.
    D = pages2k
    Lfiltered = D.filter_by_time(timeBound=[0, 1800], timeBoundType=timeBoundType, recordLength=recordLength)


class TestGet():
Expand Down
13 changes: 12 additions & 1 deletion pylipd/tests/test_LiPDSeries.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
4. after `pip install pytest-xdist`, one may execute "pytest -n 4" to test in parallel with number of workers specified by `-n`
5. for more details, see https://docs.pytest.org/en/stable/usage.html
"""

import pytest
from pylipd.lipd_series import LiPDSeries

class TestLoad():
Expand Down Expand Up @@ -48,6 +48,11 @@ def test_proxy_t0(self, pages2k):
D=pages2k
S = D.to_lipd_series()
names = S.get_all_proxy()

def test_variable_t0(self, pages2k):
    # Smoke test: get_variable_properties on a LiPDSeries built from the
    # pages2k fixture should run without raising.
    # NOTE(review): the returned list `l` is not asserted on — errors only.
    D = pages2k
    S = D.to_lipd_series()
    l = S.get_variable_properties()

class TestFiler():

Expand All @@ -64,6 +69,12 @@ def test_proxy_t0(self,pages2k):
Sfiltered = S.filter_by_proxy('ring width')
v = Sfiltered.get_all_proxy()
assert len(v)==1

@pytest.mark.parametrize('stats', ['Mean', 'Median', 'Min', 'Max'])
def test_resolution_t0(self, stats, pages2k):
    # Smoke test: filter_by_resolution must accept each supported resolution
    # statistic without raising.
    # NOTE(review): the filtered series is not asserted on — errors only.
    D = pages2k
    S = D.to_lipd_series()
    Sfiltered = S.filter_by_resolution(threshold=10, stats=stats)



0 comments on commit 4328d32

Please sign in to comment.