From 4d17fe0f991b400a588ca816aa82cee377f9549b Mon Sep 17 00:00:00 2001
From: Ben Hearsum <ben@mozilla.com>
Date: Fri, 19 Jan 2024 09:09:17 -0500
Subject: [PATCH] Add the ability to run starting from a specific task (fixes
 #227)

A couple of example runs with this:
* https://firefox-ci-tc.services.mozilla.com/tasks/groups/Sc5vusi4TUKLmD3CsZ46kg uses https://firefox-ci-tc.services.mozilla.com/tasks/groups/JjNp3KcyTUObUtOA9BgK5g as its `previous-group-id` with `start-stage: train-backwards` and `target-stage: train-teacher`.
* https://firefox-ci-tc.services.mozilla.com/tasks/groups/GTujkfDHRcODvlRSfsqJYA uses the above group as its `previous-group-id` with `start-stage: train-teacher` and `target-stage: all`. Note that it ended up depending on tasks from both the above group and the one that it was based on, and ended up running `train-teacher` and everything after it.

Big thanks to @gabrielBusta for suggesting this implementation!
---
 docs/task-cluster.md                          | 14 +++
 taskcluster/config.yml                        | 39 ++++++++
 taskcluster/requirements.in                   |  2 +-
 taskcluster/requirements.txt                  | 10 +--
 .../translations_taskgraph/actions/train.py   | 89 +++++++++++--------
 5 files changed, 110 insertions(+), 44 deletions(-)

diff --git a/docs/task-cluster.md b/docs/task-cluster.md
index ab1773017..3a5ebac72 100644
--- a/docs/task-cluster.md
+++ b/docs/task-cluster.md
@@ -108,6 +108,20 @@ tasks:
             stage: merge-corpus
 ```
 
+## Running only later parts of the pipeline
+
+When hacking on later parts of the pipeline it can often be useful to re-use earlier runs of the pipeline, even if those runs were done with different training parameters. This requires bypassing the usual caching mechanisms of Taskgraph and forcing it to replace earlier tasks with ones we provide. To do this, run a training action as usual, but also provide the `start-stage` and `previous_group_ids` parameters. For example:
+
+```
+start-stage: train-student
+target-stage: all
+previous_group_ids: ["SsGpi3TGShaDT-h93fHL-g"]
+```
+
+...will run `train-student` and all tasks _after_ it. All tasks upstream of `train-student` will be replaced with the tasks of the same name from the `SsGpi3TGShaDT-h93fHL-g` task group, or tasks that are upstream from one of those tasks. It is important that you provide a task group id that contains the task or tasks from the `start-stage` you've given, otherwise Taskgraph will be unable to correctly find the upstream tasks you want to re-use.
+
+Note: This feature should _never_ be used for production training, as it completely bypasses all caching mechanisms, and you will most likely end up with invalid or useless models.
+
 ## Interactive Tasks
 
 Taskcluster allows authorized users to run so-called [interactive tasks](https://docs.taskcluster.net/docs/reference/workers/docker-worker/features#feature-interactive). These tasks allow users to gain a shell in the same environment that a pipeline step runs in. This can often be useful for quicker debugging or testing of ideas.
diff --git a/taskcluster/config.yml b/taskcluster/config.yml
index 75f693305..9d066d469 100644
--- a/taskcluster/config.yml
+++ b/taskcluster/config.yml
@@ -15,6 +15,45 @@ taskgraph:
         firefox_translations_training:
             name: "firefox-translations-training"
 
+# The list of valid stages that can be used with `target-stage` and `start-stage`.
+# These get attached to tasks in `kinds`.
+valid-stages:
+    - clean-corpus
+    - clean-mono
+    - bicleaner
+    - bicleaner-model
+    - merge-corpus
+    - merge-devset
+    - merge-mono
+    - train-vocab
+    - train-backwards
+    - evaluate-backwards
+    - split-corpus
+    - split-mono
+    - translate-mono-trg
+    - collect-mono-trg
+    - train-teacher
+    - evaluate-teacher
+    - evaluate-finetuned-teacher
+    - translate-corpus
+    - extract-best
+    - collect-corpus
+    - translate-mono-src
+    - collect-mono-src
+    - merge-translated
+    - score
+    - cefilter
+    - alignments
+    - train-student
+    - evaluate-student
+    - finetune-student
+    - evaluate-finetuned-student
+    - quantize
+    - evaluate-quantized
+    - export
+    - evaluate-teacher-ensemble
+    - all
+
 workers:
     aliases:
         # Use for quick tasks that don't require GPUs, eg: linting, tests
diff --git a/taskcluster/requirements.in b/taskcluster/requirements.in
index 95c2f0291..12af9a734 100644
--- a/taskcluster/requirements.in
+++ b/taskcluster/requirements.in
@@ -1 +1 @@
-taskcluster-taskgraph>=7
+taskcluster-taskgraph>=7.1.2
diff --git a/taskcluster/requirements.txt b/taskcluster/requirements.txt
index c0be2b94a..6282cf81c 100644
--- a/taskcluster/requirements.txt
+++ b/taskcluster/requirements.txt
@@ -2,7 +2,7 @@
 # This file is autogenerated by pip-compile with Python 3.11
 # by the following command:
 #
-#    pip-compile --generate-hashes requirements.in
+#    pip-compile --generate-hashes taskcluster/requirements.in
 #
 appdirs==1.4.4 \
     --hash=sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41 \
@@ -302,10 +302,10 @@ slugid==2.0.0 \
     --hash=sha256:a950d98b72691178bdd4d6c52743c4a2aa039207cf7a97d71060a111ff9ba297 \
     --hash=sha256:aec8b0e01c4ad32e38e12d609eab3ec912fd129aaf6b2ded0199b56a5f8fd67c
     # via taskcluster-taskgraph
-taskcluster-taskgraph==7.1.0 \
-    --hash=sha256:7274765492eb3a5bc60901a7e8b0e7b0a9907921357c9dd66099541500a716a3 \
-    --hash=sha256:d794850df1aa8b0bb2250e8e9c0ad92917f6147de80d50d5f28c6c6713c7912d
-    # via -r requirements.in
+taskcluster-taskgraph==7.1.2 \
+    --hash=sha256:b7c341b2530cf86f6ee6c61ccdffd744e8590ef222c8f6477af7164d06aeda0f \
+    --hash=sha256:d343343cb9585438331e3f6771cf417e3631c33c667b10420183df06d8e9febf
+    # via -r taskcluster/requirements.in
 taskcluster-urls==13.0.1 \
     --hash=sha256:5e25e7e6818e8877178b175ff43d2e6548afad72694aa125f404a7329ece0973 \
     --hash=sha256:b25e122ecec249c4299ac7b20b08db76e3e2025bdaeb699a9d444556de5fd367 \
diff --git a/taskcluster/translations_taskgraph/actions/train.py b/taskcluster/translations_taskgraph/actions/train.py
index 0d786d3db..a1f91dc08 100644
--- a/taskcluster/translations_taskgraph/actions/train.py
+++ b/taskcluster/translations_taskgraph/actions/train.py
@@ -5,6 +5,8 @@
 from taskgraph.actions.registry import register_callback_action
 from taskgraph.decision import taskgraph_decision
 from taskgraph.parameters import Parameters
+from taskgraph.taskgraph import TaskGraph
+from taskgraph.util.taskcluster import get_ancestors, get_artifact
 
 from translations_taskgraph.parameters import get_defaults
 
@@ -54,49 +56,34 @@ def validate_pretrained_models(params):
     schema=lambda graph_config: {
         "type": "object",
         "properties": {
+            "previous_group_ids": {
+                "type": "array",
+                "description": """Optional: an array of taskIds of decision or action
+tasks from the previous group(s) to use to populate our `previous_group_kinds`.
+Tasks specified here will be used as long as their label matches a needed task, and that
+task is upstream of `start-stage`. (That is to say: even if a task from one of these groups
+has a cache digest that doesn't match what the downstream task wants, it will still be used. This
+can be used for quick iteration of functionality where the quality of the outputs is not important.)""",
+                "items": {
+                    "type": "string",
+                },
+            },
+            "start-stage": {
+                "type": "string",
+                "description": """The stage of the pipeline to begin at, provided replacements
+can be found for tasks upstream of this stage. Usually used in conjunction with `previous_group_ids`
+which allows for specifying task group ids to fetch existing tasks from.""",
+                "default": "",
+                # We need to allow for no stage to be specified, in addition to all of the
+                # valid stages.
+                "enum": graph_config["valid-stages"] + [""],
+            },
             "target-stage": {
                 "type": "string",
                 "description": """The stage of the pipeline to run until
 (any stages this choice depends on will be automatically included).""",
                 "default": defaults["target-stage"],
-                # TODO: this should probably be specified in ci/config.yml
-                "enum": [
-                    "clean-corpus",
-                    "clean-mono",
-                    "bicleaner",
-                    "bicleaner-model",
-                    "merge-corpus",
-                    "merge-devset",
-                    "merge-mono",
-                    "train-vocab",
-                    "train-backwards",
-                    "evaluate-backwards",
-                    "split-corpus",
-                    "split-mono",
-                    "translate-mono-trg",
-                    "collect-mono-trg",
-                    "train-teacher",
-                    "evaluate-teacher",
-                    "evaluate-finetuned-teacher",
-                    "translate-corpus",
-                    "extract-best",
-                    "collect-corpus",
-                    "translate-mono-src",
-                    "collect-mono-src",
-                    "merge-translated",
-                    "score",
-                    "cefilter",
-                    "alignments",
-                    "train-student",
-                    "evaluate-student",
-                    "finetune-student",
-                    "evaluate-finetuned-student",
-                    "quantize",
-                    "evaluate-quantized",
-                    "export",
-                    "evaluate-teacher-ensemble",
-                    "all",
-                ],
+                "enum": graph_config["valid-stages"],
             },
             "experiment": {
                 "type": "object",
@@ -343,6 +330,32 @@ def train_action(parameters, graph_config, input, task_group_id, task_id):
 
     parameters = dict(parameters)
 
+    start_stage = input.pop("start-stage", None)
+    if start_stage:
+        previous_group_ids = input.get("previous_group_ids")
+
+        # First, we create one big graph out of all of the tasks from the specified group IDs.
+        label_to_task_id = {}
+        combined_full_task_graph = {}
+        for graph_id in previous_group_ids:
+            label_to_task_id = get_artifact(graph_id, "public/label-to-taskid.json")
+            full_task_graph = get_artifact(graph_id, "public/full-task-graph.json")
+            combined_full_task_graph.update(full_task_graph)
+        _, combined_full_task_graph = TaskGraph.from_json(combined_full_task_graph)
+
+        # Next, we find the task id(s) corresponding to the tasks that match the stage
+        # we want to start at.
+        start_task_ids = []
+        for label, task in combined_full_task_graph.tasks.items():
+            if task.attributes.get("stage") == start_stage:
+                start_task_ids.append(label_to_task_id[label])
+
+        # Finally, we walk up the graph from our starting point and add any tasks found
+        # as `existing_tasks`. These map task labels (eg: train-backwards-ru-en) to
+        # task ids, and will be used instead of scheduling new tasks for any tasks with
+        # an identical name.
+        parameters["existing_tasks"] = get_ancestors(start_task_ids)
+
     parameters["target_tasks_method"] = "train-target-tasks"
     parameters["optimize_target_tasks"] = True
     parameters["tasks_for"] = "action"