filter by time

LinkedEarth · Jul 30, 2024 · b7c4ae6 · b7c4ae6
1 parent bce5fa5
commit b7c4ae6
Show file tree

Hide file tree

Showing 3 changed files with 102 additions and 1 deletion.
diff --git a/pylipd/globals/queries.py b/pylipd/globals/queries.py
@@ -259,6 +259,22 @@
     }
 """
 
+# Time extent of the paleo measurement tables: one row per distinct
+# (dataset name, minimum, maximum) taken from every variable whose name
+# matches "year" or "age".
+# NOTE(review): the regexes are unanchored and case-sensitive, so a name such
+# as "percentage" also matches while "Age" or "Year" do not -- confirm that
+# this is the intended definition of a time variable.
+QUERY_FILTER_TIME = """
+    SELECT DISTINCT ?dsname ?minage ?maxage WHERE {
+        ?ds a le:Dataset .
+        ?ds le:hasName ?dsname .
+
+        ?ds le:hasPaleoData ?data .
+        ?data le:hasMeasurementTable ?table .
+        ?table le:hasVariable ?timevar .
+        ?timevar le:hasName ?time_variableName .
+        FILTER (regex(?time_variableName, "year.*") || regex(?time_variableName, "age.*")) .
+        ?timevar le:hasMinValue ?minage .
+        ?timevar le:hasMaxValue ?maxage .
+    }
+"""
+
 QUERY_FILTER_VARIABLE_NAME = """
     SELECT ?uri ?dsuri ?dsname ?tableuri ?id ?name WHERE {
         ?uri le:hasVariableId ?id .

diff --git a/pylipd/lipd.py b/pylipd/lipd.py
@@ -20,7 +20,7 @@
 from pylipd.utils.json_to_rdf import JSONToRDF
 from pylipd.utils.rdf_to_json import RDFToJSON
 
-from .globals.queries import QUERY_BIBLIO, QUERY_DSID, QUERY_DSNAME, QUERY_ENSEMBLE_TABLE, QUERY_ENSEMBLE_TABLE_SHORT, QUERY_FILTER_ARCHIVE_TYPE, QUERY_FILTER_GEO, QUERY_VARIABLE, QUERY_VARIABLE_GRAPH, QUERY_UNIQUE_ARCHIVE_TYPE, QUERY_TIMESERIES_ESSENTIALS_CHRON, QUERY_TIMESERIES_ESSENTIALS_PALEO, QUERY_DISTINCT_VARIABLE, QUERY_DATASET_PROPERTIES, QUERY_VARIABLE_PROPERTIES, QUERY_MODEL_PROPERTIES, QUERY_LOCATION
+from .globals.queries import QUERY_FILTER_TIME, QUERY_BIBLIO, QUERY_DSID, QUERY_DSNAME, QUERY_ENSEMBLE_TABLE, QUERY_ENSEMBLE_TABLE_SHORT, QUERY_FILTER_ARCHIVE_TYPE, QUERY_FILTER_GEO, QUERY_VARIABLE, QUERY_VARIABLE_GRAPH, QUERY_UNIQUE_ARCHIVE_TYPE, QUERY_TIMESERIES_ESSENTIALS_CHRON, QUERY_TIMESERIES_ESSENTIALS_PALEO, QUERY_DISTINCT_VARIABLE, QUERY_DATASET_PROPERTIES, QUERY_VARIABLE_PROPERTIES, QUERY_MODEL_PROPERTIES, QUERY_LOCATION
 from .lipd_series import LiPDSeries
 from .utils.multi_processing import multi_convert_to_rdf, multi_load_lipd
 from .utils.rdf_graph import RDFGraph
@@ -1158,6 +1158,79 @@ def filter_by_archive_type(self, archiveType):
         dsnames = [sanitizeId(row.dsname) for row in qres]
         return self.get(dsnames)
 
+    def filter_by_time(self, timeBound, timeBoundType='any', recordLength=None):
+        """
+        Filter the datasets according to a time interval and, optionally, a minimum record length.
+
+        The time extent of each record is taken from the minimum and maximum values returned by
+        ``QUERY_FILTER_TIME``. Note that this function assumes that all records use the same
+        time representation.
+
+        If you are unsure about the time representation, you may need to use `.get_timeseries_essentials`.
+
+        Parameters
+        ----------
+        timeBound : list
+            Minimum and maximum time values to search for. The two values are swapped when
+            they are given in decreasing order.
+        timeBoundType : str, optional
+            The type of matching to perform (case-insensitive). Possible values include: "any", "entire", and "entirely".
+            - any: the record overlaps any portion of the interval (default)
+            - entirely: the record covers the entire interval
+            - entire: the entire record lies within the interval (the record can be shorter than the interval)
+            The default is 'any'.
+        recordLength : float, optional
+            The minimum length the record needs to have while matching the timeBound criteria.
+            For "any" the length is measured on the part of the record that falls inside the
+            interval; for "entire" and "entirely" it is the full span of the record.
+            The default is None (no constraint on the length).
+
+        Raises
+        ------
+        ValueError
+            If timeBoundType is not one of "any", "entire" or "entirely", or if timeBound
+            does not provide a minimum and a maximum value.
+
+        Returns
+        -------
+        pylipd.lipd.LiPD
+            A new LiPD object that only contains datasets that have the specified time interval
+
+        Examples
+        --------
+        pyLipd ships with existing datasets that can be loaded directly through the package. Let's load the Pages2k sample datasets using this method.
+
+        .. jupyter-execute::
+
+            from pylipd.utils.dataset import load_dir
+
+            lipd = load_dir('Pages2k')
+            Lfiltered = lipd.filter_by_time(timeBound=[0,1800])
+            Lfiltered.get_all_dataset_names()
+
+        """
+
+        # Validate the arguments before running the (potentially expensive) query.
+        timeBoundType = timeBoundType.lower()
+        if timeBoundType not in ('any', 'entirely', 'entire'):
+            raise ValueError("timeBoundType must be in ['any', 'entirely','entire']")
+
+        if timeBound is None or len(timeBound) < 2:
+            raise ValueError("timeBound must contain a minimum and a maximum value")
+        lower, upper = timeBound[0], timeBound[1]
+        if lower > upper:
+            lower, upper = upper, lower
+
+        __, df = self.query(QUERY_FILTER_TIME)
+
+        dsnames = []
+        if not df.empty:
+            minage, maxage = df['minage'], df['maxage']
+
+            if timeBoundType == 'entirely':
+                # The record must span the whole interval.
+                mask = (minage <= lower) & (maxage >= upper)
+                length = (maxage - minage).abs()
+            elif timeBoundType == 'entire':
+                # The whole record must sit inside the interval.
+                mask = (minage >= lower) & (maxage <= upper)
+                length = (maxage - minage).abs()
+            else:
+                # 'any': two ranges overlap only if each one starts before the other ends.
+                mask = (minage <= upper) & (maxage >= lower)
+                # Length of the part of the record that lies inside the interval.
+                length = maxage.clip(upper=upper) - minage.clip(lower=lower)
+
+            if recordLength is not None:
+                mask = mask & (length >= recordLength)
+
+            # A dataset can contribute several rows (one per time variable/table):
+            # keep each name once, in order, and sanitize it the same way
+            # filter_by_archive_type does before handing the names to get().
+            dsnames = list(dict.fromkeys(sanitizeId(name) for name in df.loc[mask, 'dsname']))
+
+        return self.get(dsnames)
+
+
     def get_datasets(self) -> 'list[Dataset]':
         '''
         Return datasets as instances of the Dataset class

diff --git a/pylipd/tests/test_LiPD.py b/pylipd/tests/test_LiPD.py
@@ -98,6 +98,18 @@ def test_archive_to(self,pages2k):
         Lfiltered = D.filter_by_archive_type('marine sediment')
         assert len(Lfiltered.get_all_archiveTypes())==1
         assert Lfiltered.get_all_archiveTypes()[0] == 'Marine sediment'
+
+    @pytest.mark.parametrize(('timeBoundType', 'recordLength'),
+                             [('any', None),
+                              ('any', 500),
+                              ('entire', None),
+                              ('entire', 20),
+                              ('entirely', None),
+                              ('entirely', 100)
+                              ])
+    def test_time_t0(self, timeBoundType, recordLength, pages2k):
+        # Every combination of matching mode and length constraint must run and
+        # can only ever return datasets that were present in the input.
+        D = pages2k
+        Lfiltered = D.filter_by_time(timeBound=[0, 1800], timeBoundType=timeBoundType, recordLength=recordLength)
+        assert set(Lfiltered.get_all_dataset_names()) <= set(D.get_all_dataset_names())
+
+    def test_time_t1(self, pages2k):
+        # An unknown matching mode is rejected with a ValueError.
+        D = pages2k
+        with pytest.raises(ValueError):
+            D.filter_by_time(timeBound=[0, 1800], timeBoundType='unknown')
 
 
 class TestGet():