Add the ability to run starting from a specific task (fixes #227)

The doc updates included here provide an overview of how to use this feature. Here are a few examples from my own testing: * An initial run with `target-stage: train-teacher` and no `start-stage`: https://firefox-ci-tc.services.mozilla.com/tasks/groups/SsGpi3TGShaDT-h93fHL-g - which ran everything up to `train-teacher` (except some tasks that it managed to find in the caches) * A run with `target-stage: train-teacher`, `start-stage: train-backwards`, and `previous_group_ids` set to the group id above - which scheduled everything from `train-backwards` to `train-teacher`. https://firefox-ci-tc.services.mozilla.com/tasks/groups/ERLyIsbaRXK8gEVWFUAp3Q * A run the same as the above, except with `start-stage: train-student` - which scheduled everything past `train-teacher`. (We explicitly asked for `train-student`, but because the original run didn't include anything past `train-teacher` it also had to provide things like `alignments`.) https://firefox-ci-tc.services.mozilla.com/tasks/groups/KhG4FN-yQOeBsyeoC8e-MQ (There are some failures there - because I overwrote that branch as the tasks were running...but the important thing is that all of the correct tasks were scheduled.) Big thanks to @gabrielBusta for suggesting this implementation!
mozilla · Jan 29, 2024 · cc835ca · cc835ca
1 parent 437ceac
commit cc835ca
Show file tree

Hide file tree

Showing 3 changed files with 104 additions and 38 deletions.
diff --git a/docs/task-cluster.md b/docs/task-cluster.md
@@ -108,6 +108,20 @@ tasks:
             stage: merge-corpus
 ```
 
+## Running only later parts of the pipeline
+
+When hacking on later parts of the pipeline it can often be useful to re-use earlier runs of the pipeline, even if those runs were done with different training parameters. This requires bypassing the usual caching mechanisms of Taskgraph and forcing it to replace earlier tasks with ones we provide. To do so, run a training action as usual, but also provide the `start-stage` and `previous_group_ids` parameters. For example:
+
+```
+start-stage: train-student
+target-stage: all
+previous_group_ids: ["SsGpi3TGShaDT-h93fHL-g"]
+```
+
+...will run `train-student` and all tasks _after_ it. All tasks upstream of `train-student` will be replaced with the tasks of the same name from the `SsGpi3TGShaDT-h93fHL-g` task group. (If there are required tasks that do not exist in any of the provided groups, they will be created - so you must ensure that any tasks you want to skip exist in one of the previous groups.)
+
+Note: This feature should _never_ be used for production training, as it completely bypasses all caching mechanisms, and you will most likely end up with invalid or useless models.
+
 ## Interactive Tasks
 
 Taskcluster allows authorized users to run so-called [interactive tasks](https://docs.taskcluster.net/docs/reference/workers/docker-worker/features#feature-interactive). These tasks allow users to gain a shell in the same environment that a pipeline step runs in. This can often be useful for quicker debugging or testing of ideas.

diff --git a/taskcluster/config.yml b/taskcluster/config.yml
@@ -15,6 +15,45 @@ taskgraph:
         firefox_translations_training:
             name: "firefox-translations-training"
 
+# The list of valid stages that can be used with `target-stage` and `start-stage`.
+# These get attached to tasks in `kinds`.
+valid-stages:
+    - clean-corpus
+    - clean-mono
+    - bicleaner
+    - bicleaner-model
+    - merge-corpus
+    - merge-devset
+    - merge-mono
+    - train-vocab
+    - train-backwards
+    - evaluate-backwards
+    - split-corpus
+    - split-mono
+    - translate-mono-trg
+    - collect-mono-trg
+    - train-teacher
+    - evaluate-teacher
+    - evaluate-finetuned-teacher
+    - translate-corpus
+    - extract-best
+    - collect-corpus
+    - translate-mono-src
+    - collect-mono-src
+    - merge-translated
+    - score
+    - cefilter
+    - alignments
+    - train-student
+    - evaluate-student
+    - finetune-student
+    - evaluate-finetuned-student
+    - quantize
+    - evaluate-quantized
+    - export
+    - evaluate-teacher-ensemble
+    - all
+
 workers:
     aliases:
         # Use for quick tasks that don't require GPUs, eg: linting, tests

diff --git a/taskcluster/translations_taskgraph/actions/train.py b/taskcluster/translations_taskgraph/actions/train.py
@@ -5,6 +5,12 @@
 from taskgraph.actions.registry import register_callback_action
 from taskgraph.decision import taskgraph_decision
 from taskgraph.parameters import Parameters
+from taskgraph.taskgraph import TaskGraph
+from taskgraph.util.taskcluster import get_artifact, get_recursive_dependencies
+from taskgraph.util.taskgraph import (
+    find_decision_task,
+    find_existing_tasks_from_previous_kinds,
+)
 
 from translations_taskgraph.parameters import get_defaults
 
@@ -54,49 +60,34 @@ def validate_pretrained_models(params):
     schema=lambda graph_config: {
         "type": "object",
         "properties": {
+            "previous_group_ids": {
+                "type": "array",
+                "description": """Optional: an array of taskIds of decision or action
+tasks from the previous group(s) to use to populate our `previous_group_kinds`.
+Tasks specified here will be used as long as their label matches a needed task, and that
+task is upstream of `start-stage`. (That is to say: even if a task from one of these groups
+has a cache digest that doesn't match what the downstream task wants, it will still be used. This
+can be used for quick iteration of functionality where the quality of the outputs is not important.)""",
+                "items": {
+                    "type": "string",
+                },
+            },
+            "start-stage": {
+                "type": "string",
+                "description": """The stage of the pipeline to begin at, provided replacements
+can be found for tasks upstream of this stage. Usually used in conjunction with `previous_group_ids`
+which allows for specifying task group ids to fetch existing tasks from.""",
+                "default": "",
+                # We need to allow for no stage to be specified, in addition to all of the
+                # valid stages.
+                "enum": graph_config["valid-stages"] + [""],
+            },
             "target-stage": {
                 "type": "string",
                 "description": """The stage of the pipeline to run until
 (any stages this choice depends on will be automatically included).""",
                 "default": defaults["target-stage"],
-                # TODO: this should probably be specified in ci/config.yml
-                "enum": [
-                    "clean-corpus",
-                    "clean-mono",
-                    "bicleaner",
-                    "bicleaner-model",
-                    "merge-corpus",
-                    "merge-devset",
-                    "merge-mono",
-                    "train-vocab",
-                    "train-backwards",
-                    "evaluate-backwards",
-                    "split-corpus",
-                    "split-mono",
-                    "translate-mono-trg",
-                    "collect-mono-trg",
-                    "train-teacher",
-                    "evaluate-teacher",
-                    "evaluate-finetuned-teacher",
-                    "translate-corpus",
-                    "extract-best",
-                    "collect-corpus",
-                    "translate-mono-src",
-                    "collect-mono-src",
-                    "merge-translated",
-                    "score",
-                    "cefilter",
-                    "alignments",
-                    "train-student",
-                    "evaluate-student",
-                    "finetune-student",
-                    "evaluate-finetuned-student",
-                    "quantize",
-                    "evaluate-quantized",
-                    "export",
-                    "evaluate-teacher-ensemble",
-                    "all",
-                ],
+                "enum": graph_config["valid-stages"],
             },
             "experiment": {
                 "type": "object",
@@ -343,6 +334,28 @@ def train_action(parameters, graph_config, input, task_group_id, task_id):
 
     parameters = dict(parameters)
 
+    # `start-stage` is removed from the action input here; an absent or empty value
+    # means "no start stage", in which case nothing below runs.
+    start_stage = input.pop("start-stage", None)
+    if start_stage:
+        previous_group_ids = input.get("previous_group_ids")
+        if not previous_group_ids:
+            # No groups were supplied: fall back to the single id returned by
+            # `find_decision_task` for the current parameters.
+            previous_group_ids = [find_decision_task(parameters, graph_config)]
+
+        # First, we create one big graph out of all of the tasks from the specified group IDs.
+        # Groups are merged in the order given, so when two groups contain the same key the
+        # entry from the later group wins.
+        combined_full_task_graph = {}
+        for graph_id in previous_group_ids:
+            full_task_graph = get_artifact(graph_id, "public/full-task-graph.json")
+            combined_full_task_graph.update(full_task_graph)
+        _, combined_full_task_graph = TaskGraph.from_json(combined_full_task_graph)
+
+        # Next, we find the task id(s) corresponding to the tasks that match the stage
+        # we want to start at. The loop variable is deliberately not called `task_id`,
+        # so that it does not shadow this function's `task_id` parameter.
+        start_task_ids = [
+            start_task_id
+            for start_task_id, task in combined_full_task_graph.tasks.items()
+            if task.attributes.get("stage") == start_stage
+        ]
+
+        # NOTE(review): if no task carries the requested stage, `start_task_ids` is empty
+        # here - confirm that `get_recursive_dependencies` handles an empty list.
+        parameters["existing_tasks"] = get_recursive_dependencies(start_task_ids)
+
     parameters["target_tasks_method"] = "train-target-tasks"
     parameters["optimize_target_tasks"] = True
     parameters["tasks_for"] = "action"