Skip to content

Commit

Permalink
Add get_ranges query function
Browse files Browse the repository at this point in the history
This function returns a DataFrame containing the distinct ranges of
contiguous data. For example,

>>> cc.querying.get_ranges(session, "01deg_jra55v13_ryf9091", "u", "1 daily")
                 start                  end
0  1950-01-01 00:00:00  1950-02-01 00:00:00
1  1950-02-01 00:00:00  1971-01-01 00:00:00
2  2086-01-01 00:00:00  2100-10-01 00:00:00
3  2100-10-01 00:00:00  2101-01-01 00:00:00
4  2170-01-01 00:00:00  2180-01-01 00:00:00

Curiously, this example highlights an inconsistency in the data for
this experiment, but it also clearly shows the gaps where no daily
velocity data is available.
  • Loading branch information
angus-g committed Apr 22, 2024
1 parent a9f5fc9 commit bd3bdf7
Showing 1 changed file with 46 additions and 0 deletions.
46 changes: 46 additions & 0 deletions cosima_cookbook/querying.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
import logging
import os.path
import pandas as pd
import sqlalchemy as sa
from sqlalchemy import func, distinct, or_
from sqlalchemy.orm import aliased
from sqlalchemy.sql.expression import cast
from sqlalchemy.sql.selectable import subquery
import warnings
import xarray as xr
Expand Down Expand Up @@ -272,6 +274,50 @@ def get_frequencies(session, experiment=None):
return pd.DataFrame(q, columns=[c["name"] for c in q.column_descriptions])


def get_ranges(session, experiment, variable, frequency, cellmethods=None):
    """Return the distinct ranges of contiguous data for a variable.

    Queries the database for all files in ``experiment`` containing
    ``variable`` at the given ``frequency``, and collapses them into
    contiguous runs: consecutive files where one file's ``time_start``
    equals the previous file's ``time_end``. Each run is reported as one
    row of the result.

    Parameters
    ----------
    session
        Database session to query (as returned by the cookbook's
        session-creation helpers).
    experiment : str
        Experiment name to match against ``NCExperiment.experiment``.
    variable : str
        Variable name to match against ``NCVar.variable``.
    frequency : str
        Output frequency to match against ``NCFile.frequency``
        (e.g. "1 daily").
    cellmethods
        NOTE(review): accepted but never used in any filter below —
        confirm whether a cell-methods filter was intended here.

    Returns
    -------
    pandas.DataFrame
        Two columns, "start" and "end", one row per contiguous range,
        ordered by start time.
    """
    # first, we query for the files with a flag indicating that the current row is not
    # contiguous with its predecessor
    # (flag is True when this file's time_start differs from the previous file's
    # time_end; the lag() default of "" makes the first row always flagged)
    flag_q = (
        session.query(
            NCFile.time_start,
            NCFile.time_end,
            (
                NCFile.time_start
                != func.lag(NCFile.time_end, 1, "").over(order_by=NCFile.time_start)
            ).label("flag"),
        )
        .join(NCFile.experiment)
        .join(NCFile.ncvars)
        .join(NCVar.variable)
        .filter(NCExperiment.experiment == experiment)
        .filter(NCFile.frequency == frequency)
        .filter(NCVar.variable == variable)
        .order_by(NCFile.time_start)
    ).subquery()

    # now, by summing over the flag (as an integer), we get a column that allows us to group
    # on consecutive files: the running sum only increments at a discontinuity,
    # so every file in the same contiguous run shares the same group number
    group_q = session.query(
        flag_q,
        func.sum(cast(flag_q.c.flag, sa.Integer))
        .over(order_by=flag_q.c.time_start)
        .label("grp"),
    ).subquery()

    # we just need the smallest start time and largest end time out of each group
    # to get its extent
    q = (
        session.query(
            func.min(group_q.c.time_start),
            func.max(group_q.c.time_end),
        )
        .group_by(group_q.c.grp)
        .order_by(group_q.c.time_start)
    )

    return pd.DataFrame(q, columns=["start", "end"])


def getvar(
expt,
variable,
Expand Down

0 comments on commit bd3bdf7

Please sign in to comment.