From 335989fcf073d06d4b030ca3a4b1228aa858cf38 Mon Sep 17 00:00:00 2001
From: Ben Hearsum <ben@mozilla.com>
Date: Fri, 19 Jan 2024 09:09:17 -0500
Subject: [PATCH] Add the ability to run starting from a specific task (fixes
 #227)

The doc updates included here provide an overview of how to use this feature. Here are a few examples from my own testing:
* An initial run with `target-stage: train-teacher` and no `start-stage`: https://firefox-ci-tc.services.mozilla.com/tasks/groups/SsGpi3TGShaDT-h93fHL-g - which ran everything up to `train-teacher` (except some tasks that it managed to find in the caches)
* A run with `target-stage: train-teacher`, `start-stage: train-backwards`, and `previous_group_ids` set to the group id above - which scheduled everything from `train-backwards` to `train-teacher`. https://firefox-ci-tc.services.mozilla.com/tasks/groups/ERLyIsbaRXK8gEVWFUAp3Q
* A run the same as the above, except with `start-stage: train-student` - which scheduled everything past `train-teacher`. (We explicitly asked for `train-student`, but because the original run didn't include anything past `train-teacher` it also had to provide things like `alignments`.) https://firefox-ci-tc.services.mozilla.com/tasks/groups/KhG4FN-yQOeBsyeoC8e-MQ

(There are some failures there - because I overwrote that branch as the tasks were running...but the important thing is that all of the correct tasks were scheduled.)

Big thanks to @gabrielBusta for suggesting this implementation!
---
 docs/task-cluster.md                          |  14 +++
 taskcluster/config.yml                        |  38 +++++++
 .../translations_taskgraph/actions/train.py   | 104 +++++++++++-------
 3 files changed, 119 insertions(+), 37 deletions(-)

diff --git a/docs/task-cluster.md b/docs/task-cluster.md
index c6e21a81c..50bdf0f71 100644
--- a/docs/task-cluster.md
+++ b/docs/task-cluster.md
@@ -97,6 +97,20 @@ tasks:
             stage: merge-corpus
 ```
 
+## Running only later parts of the pipeline
+
+When hacking on later parts of the pipeline it can often be useful to re-use earlier runs of the pipeline, even if those runs were done with different training parameters. This requires bypassing the usual caching mechanisms of Taskgraph and forcing it to replace earlier tasks with ones we provide. To do this, run a training action as usual, but also provide the `start-stage` and `previous_group_ids` parameters. For example:
+
+```
+start-stage: train-student
+target-stage: all
+previous_group_ids: ["SsGpi3TGShaDT-h93fHL-g"]
+```
+
+...will run `train-student` and all tasks _after_ it. All tasks upstream of `train-student` will be replaced with the tasks of the same name from the `SsGpi3TGShaDT-h93fHL-g` task group. (If there are required tasks that do not exist in any of the provided groups, they will be created - so you must ensure that any tasks you want to skip exist in one of the previous groups.)
+
+Note: This feature should _never_ be used for production training, as it completely bypasses all caching mechanisms, and you will most likely end up with invalid or useless models.
+
 ## Interactive Tasks
 
 Taskcluster allows authorized users to run so-called [interactive tasks](https://docs.taskcluster.net/docs/reference/workers/docker-worker/features#feature-interactive). These tasks allow users to gain a shell in the same environment that a pipeline step runs in. This can often be useful for quicker debugging or testing of ideas.
diff --git a/taskcluster/config.yml b/taskcluster/config.yml
index ea648d977..52e993932 100644
--- a/taskcluster/config.yml
+++ b/taskcluster/config.yml
@@ -15,6 +15,44 @@ taskgraph:
         firefox_translations_training:
             name: "firefox-translations-training"
 
+# The list of valid stages that can be used with `target-stage` and `start-stage`.
+# These get attached to tasks in `kinds`.
+valid-stages:
+    - clean-corpus
+    - clean-mono
+    - bicleaner
+    - merge-corpus
+    - merge-devset
+    - merge-mono
+    - train-vocab
+    - train-backwards
+    - evaluate-backwards
+    - split-corpus
+    - split-mono
+    - translate-mono-trg
+    - collect-mono-trg
+    - train-teacher
+    - evaluate-teacher
+    - evaluate-finetuned-teacher
+    - translate-corpus
+    - extract-best
+    - collect-corpus
+    - translate-mono-src
+    - collect-mono-src
+    - merge-translated
+    - score
+    - cefilter
+    - alignments
+    - train-student
+    - evaluate-student
+    - finetune-student
+    - evaluate-finetuned-student
+    - quantize
+    - evaluate-quantized
+    - export
+    - evaluate-teacher-ensemble
+    - all
+
 workers:
     aliases:
         # Use for quick tasks that don't require GPUs, eg: linting, tests
diff --git a/taskcluster/translations_taskgraph/actions/train.py b/taskcluster/translations_taskgraph/actions/train.py
index b0cf98237..e906d0ef9 100644
--- a/taskcluster/translations_taskgraph/actions/train.py
+++ b/taskcluster/translations_taskgraph/actions/train.py
@@ -5,6 +5,12 @@
 from taskgraph.actions.registry import register_callback_action
 from taskgraph.decision import taskgraph_decision
 from taskgraph.parameters import Parameters
+from taskgraph.taskgraph import TaskGraph
+from taskgraph.util.taskcluster import get_artifact
+from taskgraph.util.taskgraph import (
+    find_decision_task,
+    find_existing_tasks_from_previous_kinds,
+)
 
 from translations_taskgraph.parameters import get_defaults
 
@@ -54,48 +60,34 @@ def validate_pretrained_models(params):
     schema=lambda graph_config: {
         "type": "object",
         "properties": {
+            "previous_group_ids": {
+                "type": "array",
+                "description": """Optional: an array of taskIds of decision or action
+tasks from the previous group(s) to use to populate our `previous_group_kinds`.
+Tasks specified here will be used as long as their label matches a needed task, and that
+task is upstream of `start-stage`. (That is to say: even if a task from one of these groups
+has a cache digest that doesn't match what the downstream task wants, it will still be used. This
+can be used for quick iteration of functionality where the quality of the outputs is not important.)""",
+                "items": {
+                    "type": "string",
+                },
+            },
+            "start-stage": {
+                "type": "string",
+                "description": """The stage of the pipeline to begin at, provided replacements
+can be found for tasks upstream of this stage. Usually used in conjunction with `previous_group_ids`
+which allows for specifying task group ids to fetch existing tasks from.""",
+                "default": "",
+                # We need to allow for no stage to be specified, in addition to all of the
+                # valid stages.
+                "enum": graph_config["valid-stages"] + [""],
+            },
             "target-stage": {
                 "type": "string",
                 "description": """The stage of the pipeline to run until
 (any stages this choice depends on will be automatically included).""",
                 "default": defaults["target-stage"],
-                # TODO: this should probably be specified in ci/config.yml
-                "enum": [
-                    "clean-corpus",
-                    "clean-mono",
-                    "bicleaner",
-                    "merge-corpus",
-                    "merge-devset",
-                    "merge-mono",
-                    "train-vocab",
-                    "train-backwards",
-                    "evaluate-backwards",
-                    "split-corpus",
-                    "split-mono",
-                    "translate-mono-trg",
-                    "collect-mono-trg",
-                    "train-teacher",
-                    "evaluate-teacher",
-                    "evaluate-finetuned-teacher",
-                    "translate-corpus",
-                    "extract-best",
-                    "collect-corpus",
-                    "translate-mono-src",
-                    "collect-mono-src",
-                    "merge-translated",
-                    "score",
-                    "cefilter",
-                    "alignments",
-                    "train-student",
-                    "evaluate-student",
-                    "finetune-student",
-                    "evaluate-finetuned-student",
-                    "quantize",
-                    "evaluate-quantized",
-                    "export",
-                    "evaluate-teacher-ensemble",
-                    "all",
-                ],
+                "enum": graph_config["valid-stages"],
             },
             "experiment": {
                 "type": "object",
@@ -342,6 +334,44 @@ def train_action(parameters, graph_config, input, task_group_id, task_id):
 
     parameters = dict(parameters)
 
+    # Building up existing_tasks is largely cribbed from existing release promotion actions, eg:
+    # https://github.com/mozilla-releng/mozilla-taskgraph/blob/main/src/mozilla_taskgraph/actions/release_promotion.py
+    start_stage = input.pop("start-stage", None)
+    if start_stage:
+        previous_group_ids = input.get("previous_group_ids")
+        if not previous_group_ids:
+            previous_group_ids = [find_decision_task(parameters, graph_config)]
+
+        # First, we create one big graph out of all of the tasks from the specified group IDs.
+        combined_full_task_graph = {}
+        for graph_id in previous_group_ids:
+            full_task_graph = get_artifact(graph_id, "public/full-task-graph.json")
+            combined_full_task_graph.update(full_task_graph)
+        _, combined_full_task_graph = TaskGraph.from_json(combined_full_task_graph)
+
+        # Next, we find the graph node(s) corresponding to the tasks that match the stage
+        # we want to start at.
+        start_nodes = set()
+        for task in combined_full_task_graph.tasks.values():
+            if task.attributes.get("stage") == start_stage:
+                start_nodes.add(task.label)
+
+        # Grab the names of the `kinds` for all of the tasks that are at or downstream of
+        # a `start-stage` task.
+        rebuild_kinds = set()
+        for label in combined_full_task_graph.graph.transitive_closure(
+            start_nodes, reverse=True
+        ).nodes:
+            task = combined_full_task_graph[label]
+            rebuild_kinds.add(task.kind)
+
+        # Finally, use all of the data we've gathered to replace as many tasks as we can
+        # with existing ones. Anything upstream of all of the `start-stage` tasks will
+        # get replaced if a matching task label is found in `combined_full_task_graph`.
+        parameters["existing_tasks"] = find_existing_tasks_from_previous_kinds(
+            combined_full_task_graph, previous_group_ids, rebuild_kinds
+        )
+
     parameters["target_tasks_method"] = "train-target-tasks"
     parameters["optimize_target_tasks"] = True
     parameters["tasks_for"] = "action"