Add "as_of" columns to oracle output target data #300

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
2 changes: 2 additions & 0 deletions src/.ruff.toml
@@ -0,0 +1,2 @@
[format]
quote-style = "double"
63 changes: 57 additions & 6 deletions src/README.md
@@ -4,8 +4,7 @@
> **The `src/` directory contains scripts used by hub administrators** and automated jobs.
> Hub participants and modelers: turn back now. There is nothing for you here but misery.

Details of these scripts can be found below. All of these scripts assume that your working directory is the `src/` directory. To ensure stability, the R scripts manage their dependencies with [the renv R package](https://rstudio.github.io/renv/). The `variant-hub-admin.Rproj` file allows you to open the `src/` folder as an independent R project from the root of this hub.

The scripts are designed to be run by scheduled GitHub workflows on a Linux-based runner
(_i.e._, they have not been tested in a Windows environment).
@@ -40,20 +39,19 @@ then appended to the hub's existing `hub-config/tasks.json` file.

To run the script manually (RStudio users):


1. Open `src/make_round_config.R` in RStudio _OR_ open the `src/variant-nowcast-hub.Rproj` project in RStudio.
2. If prompted by `renv` that some of the packages in `renv.lock` are not installed:

```r
renv::restore()
```

3. Run the make_round_config script:

```r
source("make_round_config.R")
```


To run the script manually (without RStudio):

1. Open an R session and set the working directory to the repo's `src` directory.
@@ -62,13 +60,19 @@ To run the script manually (without RStudio):
```r
renv::restore()
```
3. Run the make_round_config script:

```r
source("make_round_config.R")
```

## Post round-submission scripts

After a modeling round closes for submissions (Wednesdays at 8 PM US Eastern),
the [`run-post-submission-jobs.yaml` GitHub workflow](https://github.com/reichlab/variant-nowcast-hub/blob/main/.github/workflows/run-post-submission-jobs.yaml) runs two of the scripts in this directory.

### get_location_date_counts.py

For each location used by this hub, `get_location_date_counts.py` generates a daily count of SARS-CoV-2 genome sequences collected.
The output includes counts for each of the 31 days prior to the latest round's nowcast date (_i.e._, the round_id)
@@ -90,3 +94,50 @@ To run the script manually:

```bash
uv run src/get_location_date_counts.py
```
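
For orientation, here is a minimal sketch of the counting logic, assuming the Nextstrain metadata exposes `location` and `date` (collection date) columns; those column names and the lazy-frame input are assumptions, not taken from the script itself:

```python
import polars as pl
from datetime import date, timedelta


def daily_counts(metadata: pl.LazyFrame, nowcast_date: date) -> pl.DataFrame:
    """Count collected sequences per location per day, 31 days before nowcast_date."""
    window_start = nowcast_date - timedelta(days=31)
    return (
        metadata.filter(
            # left-closed window: the 31 days strictly prior to the nowcast date
            pl.col("date").is_between(window_start, nowcast_date, closed="left")
        )
        .group_by("location", "date")
        .agg(pl.len().alias("count"))
        .sort("location", "date")
        .collect()
    )
```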
```

### get_target_data.py

`get_target_data.py` generates a set of oracle output and timeseries target
data files. The script is a small command-line interface (CLI) with the
following options:

```sh
➜ uv run --with-requirements src/requirements.txt python src/get_target_data.py --help
Usage: get_target_data.py [OPTIONS]

Options:
  --nowcast-date [%Y-%m-%d]       The modeling round nowcast date (i.e.,
                                  round_id) (YYYY-MM-DD).  [required]
  --sequence-as-of [%Y-%m-%d]     Get counts based on the last available
                                  Nextstrain sequence metadata on or prior to
                                  this UTC date (YYYY-MM-DD). Default is the
                                  nowcast date + 90 days.
  --tree-as-of [%Y-%m-%d]         Use this UTC date to retrieve the reference
                                  tree used for clade assignment (YYYY-MM-DD).
                                  Defaults to created_at in the round's
                                  modeled-clades file.
  --collection-min-date [%Y-%m-%d]
                                  Assign clades to sequences collected on or
                                  after this UTC date (YYYY-MM-DD). Default is
                                  the nowcast date minus 90 days.
  --collection-max-date [%Y-%m-%d]
                                  Assign clades to sequences collected on or
                                  before this UTC date (YYYY-MM-DD). Default
                                  is the nowcast date plus 10 days.
  --target-data-dir TEXT          Path object to the directory where the
                                  target data will be saved. Default is the
                                  hub's target-data directory. Specify '.' to
                                  save target data to the current working
                                  directory.
```
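
All of the as-of defaults are offsets from the nowcast date. As a minimal sketch of those relationships (the real CLI derives them via click callbacks; the date below is illustrative):

```python
from datetime import date, timedelta

nowcast_date = date(2024, 10, 9)  # the round_id

# Defaults as documented in the help text above
sequence_as_of = nowcast_date + timedelta(days=90)       # metadata snapshot date
collection_min_date = nowcast_date - timedelta(days=90)  # earliest collection date to clade-assign
collection_max_date = nowcast_date + timedelta(days=10)  # latest collection date to clade-assign
```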

To run the script manually:

1. Make sure that `uv` is installed on your machine:

```bash
brew install uv
```

(see [`uv` documentation](https://docs.astral.sh/uv/getting-started/installation/#installing-uv) for a full list of installation options)

2. From the root of the repo, run the following command:

```bash
uv run src/get_target_data.py --nowcast-date=2024-10-09
```
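
Optionally, spot-check that the new `as_of` provenance columns landed in the oracle output. A hypothetical snippet (the parquet path is illustrative, not the script's actual output layout):

```python
import polars as pl

# Path is an assumption for illustration only
oracle = pl.read_parquet("target-data/oracle-output/*.parquet")
print(oracle.select("nowcast_date", "sequence_as_of", "tree_as_of").head())
```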
37 changes: 29 additions & 8 deletions src/get_target_data.py
@@ -22,25 +22,25 @@
# requires-python = ">=3.12,<3.13"
# dependencies = [
# "click",
# "cladetime@git+https://github.com/reichlab/cladetime",
# "cladetime",
# "polars>=1.17.1,<1.18.0",
# "pyarrow>=18.1.0,<19.0.0",
# ]
# ///

import json
import logging
import sys
from datetime import date, datetime, timedelta, timezone
from pathlib import Path

import click
import polars as pl
import pyarrow as pa # type: ignore
import pyarrow.dataset as ds # type: ignore
import pyarrow.parquet as pq # type: ignore
from click import Context, Option
from click.testing import CliRunner

from cladetime import Clade, CladeTime, sequence # type: ignore

@@ -135,6 +135,7 @@ def set_collection_max_date(ctx, param, value):
value = value.replace(hour=23, minute=59, second=59, tzinfo=timezone.utc)
return value


def set_target_data_dir(ctx, param, value):
"""Set the target_data_dir default value to the hub's target-data directory."""
if value is None:
@@ -152,7 +153,7 @@ def set_target_data_dir(ctx, param, value):
"--nowcast-date",
type=click.DateTime(formats=["%Y-%m-%d"]),
required=True,
help="The modeling round nowcast date (i.e., round_id) (YYYY-MM-DD). The tree as of date is set to this reference date minus two days.",
help="The modeling round nowcast date (i.e., round_id) (YYYY-MM-DD).",
)
@click.option(
"--sequence-as-of",
@@ -176,7 +177,7 @@ def set_target_data_dir(ctx, param, value):
required=False,
default=None,
callback=normalize_date,
help="Assign clades to sequences collected on or after this UTC date (YYYY-MM-DD). Default is the nowcast date minus 31 days.",
help="Assign clades to sequences collected on or after this UTC date (YYYY-MM-DD). Default is the nowcast date minus 90 days.",
)
@click.option(
"--collection-max-date",
@@ -195,7 +196,7 @@ def set_target_data_dir(ctx, param, value):
help=(
"Path object to the directory where the target data will be saved. Default is the hub's target-data directory. "
"Specify '.' to save target data to the current working directory."
),
)
def main(
nowcast_date: datetime,
@@ -360,7 +361,13 @@ def create_target_data(
pl.col("target_date")
>= datetime.fromisoformat(nowcast_string) - timedelta(days=31)
)
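        # Record provenance: the sequence snapshot date and the reference
        # tree date used to produce these oracle values.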
.with_columns(
pl.lit(nowcast_string).alias("nowcast_date"),
pl.lit(sequence_as_of_string).alias("sequence_as_of"),
pl.lit(assignments.meta["tree_as_of"].strftime("%Y-%m-%d")).alias(
"tree_as_of"
),
)
.rename({"observation": "oracle_value"})
)

@@ -429,6 +436,8 @@ def write_target_data(
("clade", pa.string()),
("oracle_value", pa.float64()),
("nowcast_date", pa.date32()),
("sequence_as_of", pa.date32()),
("tree_as_of", pa.date32()),
]
)
oracle_arrow = oracle_arrow.cast(oracle_schema)
@@ -582,7 +591,15 @@ def test_target_data():

oracle = oracle.collect()
expected_oracle_cols = set(
["nowcast_date", "location", "target_date", "clade", "oracle_value"]
[
"nowcast_date",
"location",
"target_date",
"clade",
"oracle_value",
"sequence_as_of",
"tree_as_of",
]
)
assert set(oracle.columns) == expected_oracle_cols
assert oracle.height == ts.height
@@ -689,6 +706,8 @@ def test_target_data_integration(caplog, tmp_path):
assert oracle_schema_dict.get("target_date") is date
assert oracle_schema_dict.get("clade") is str
assert oracle_schema_dict.get("oracle_value") is float
assert oracle_schema_dict.get("sequence_as_of") is date
assert oracle_schema_dict.get("tree_as_of") is date

# check data types when reading target data with Arrow
ts_arrow = ds.dataset(str(ts_path), format="parquet")
@@ -708,3 +727,5 @@
assert oracle_schema.field("clade").type == pa.string()
assert oracle_schema.field("oracle_value").type == pa.float64()
assert oracle_schema.field("target_date").type == pa.date32()
assert oracle_schema.field("sequence_as_of").type == pa.date32()
assert oracle_schema.field("tree_as_of").type == pa.date32()