Skip to content

Commit

Permalink
Add get_ranges query function
Browse files Browse the repository at this point in the history
This function returns a DataFrame containing the distinct ranges of
contiguous data. For example,

>>> cc.querying.get_ranges(session, "01deg_jra55v13_ryf9091", "u", "1 daily")
                 start                  end
0  1950-01-01 00:00:00  1950-02-01 00:00:00
1  1950-02-01 00:00:00  1971-01-01 00:00:00
2  2086-01-01 00:00:00  2100-10-01 00:00:00
3  2100-10-01 00:00:00  2101-01-01 00:00:00
4  2170-01-01 00:00:00  2180-01-01 00:00:00

Curiously, this example highlights an inconsistency in the data for
this experiment, but it also clearly shows the gaps where no daily
velocity data is available.
  • Loading branch information
angus-g committed Apr 22, 2024
1 parent a9f5fc9 commit bd3bdf7
Showing 1 changed file with 46 additions and 0 deletions.
46 changes: 46 additions & 0 deletions cosima_cookbook/querying.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
import logging
import os.path
import pandas as pd
import sqlalchemy as sa
from sqlalchemy import func, distinct, or_
from sqlalchemy.orm import aliased
from sqlalchemy.sql.expression import cast
from sqlalchemy.sql.selectable import subquery
import warnings
import xarray as xr
Expand Down Expand Up @@ -272,6 +274,50 @@ def get_frequencies(session, experiment=None):
return pd.DataFrame(q, columns=[c["name"] for c in q.column_descriptions])


def get_ranges(session, experiment, variable, frequency, cellmethods=None):
    """Return the distinct ranges of contiguous data for a variable.

    Queries the database for all files in ``experiment`` containing
    ``variable`` at the given ``frequency``, and collapses them into
    contiguous runs: consecutive files where one file's ``time_start``
    equals the previous file's ``time_end``. Each run is reported as one
    row of the result.

    Parameters
    ----------
    session
        Database session to query (as returned by the cookbook's
        session-creation helpers).
    experiment : str
        Experiment name to match against ``NCExperiment.experiment``.
    variable : str
        Variable name to match against ``NCVar.variable``.
    frequency : str
        Output frequency to match against ``NCFile.frequency``
        (e.g. "1 daily").
    cellmethods
        NOTE(review): accepted but never used in any filter below —
        confirm whether a cell-methods filter was intended here.

    Returns
    -------
    pandas.DataFrame
        Two columns, "start" and "end", one row per contiguous range,
        ordered by start time.
    """
    # first, we query for the files with a flag indicating that the current row is not
    # contiguous with its predecessor
    # (flag is True when this file's time_start differs from the previous file's
    # time_end; the lag() default of "" makes the first row always flagged)
    flag_q = (
        session.query(
            NCFile.time_start,
            NCFile.time_end,
            (
                NCFile.time_start
                != func.lag(NCFile.time_end, 1, "").over(order_by=NCFile.time_start)
            ).label("flag"),
        )
        .join(NCFile.experiment)
        .join(NCFile.ncvars)
        .join(NCVar.variable)
        .filter(NCExperiment.experiment == experiment)
        .filter(NCFile.frequency == frequency)
        .filter(NCVar.variable == variable)
        .order_by(NCFile.time_start)
    ).subquery()

    # now, by summing over the flag (as an integer), we get a column that allows us to group
    # on consecutive files: the running sum only increments at a discontinuity,
    # so every file in the same contiguous run shares the same group number
    group_q = session.query(
        flag_q,
        func.sum(cast(flag_q.c.flag, sa.Integer))
        .over(order_by=flag_q.c.time_start)
        .label("grp"),
    ).subquery()

    # we just need the smallest start time and largest end time out of each group
    # to get its extent
    q = (
        session.query(
            func.min(group_q.c.time_start),
            func.max(group_q.c.time_end),
        )
        .group_by(group_q.c.grp)
        .order_by(group_q.c.time_start)
    )

    return pd.DataFrame(q, columns=["start", "end"])


def getvar(
expt,
variable,
Expand Down

0 comments on commit bd3bdf7

Please sign in to comment.