Add the ability to run starting from a specific task (fixes #227)

A couple of example runs with this: * https://firefox-ci-tc.services.mozilla.com/tasks/groups/Sc5vusi4TUKLmD3CsZ46kg uses https://firefox-ci-tc.services.mozilla.com/tasks/groups/JjNp3KcyTUObUtOA9BgK5g as its `previous-group-id` with `start-stage: train-backwards` and `target-stage: train-teacher`. * https://firefox-ci-tc.services.mozilla.com/tasks/groups/GTujkfDHRcODvlRSfsqJYA uses the above group as its `previous-group-id` with `start-stage: train-teacher` and `target-stage: all`. Note that it ended up depending on tasks from both the above group and the one that it was based on, and ended up running `train-teacher` and everything after it. Big thanks to @gabrielBusta for suggesting this implementation!
mozilla · Jan 31, 2024 · 179ac8d · 179ac8d
1 parent 437ceac
commit 179ac8d
Show file tree

Hide file tree

Showing 5 changed files with 110 additions and 44 deletions.
diff --git a/docs/task-cluster.md b/docs/task-cluster.md
@@ -108,6 +108,20 @@ tasks:
             stage: merge-corpus
 ```
 
+## Running only later parts of the pipeline
+
+When hacking on later parts of the pipeline it can often be useful to re-use earlier runs of the pipeline, even if those runs were done with different training parameters. This requires bypassing the usual caching mechanisms of Taskgraph and forcing it to replace earlier tasks with ones we provide. To do so, run a training action as usual, but also provide the `start-stage` and `previous_group_ids` parameters. For example:
+
+```
+start-stage: train-student
+target-stage: all
+previous_group_ids: ["SsGpi3TGShaDT-h93fHL-g"]
+```
+
+...will run `train-student` and all tasks _after_ it. All tasks upstream of `train-student` will be replaced with the tasks of the same name from the `SsGpi3TGShaDT-h93fHL-g` task group, or tasks that are upstream from one of those tasks. It is important that you provide a task group id that contains the task or tasks from the `start-stage` you've given, otherwise Taskgraph will be unable to correctly find the upstream tasks you want to re-use.
+
+Note: This feature should _never_ be used for production training, as it completely bypasses all caching mechanisms, and you will most likely end up with invalid or useless models.
+
 ## Interactive Tasks
 
 Taskcluster allows authorized users to run so-called [interactive tasks](https://docs.taskcluster.net/docs/reference/workers/docker-worker/features#feature-interactive). These tasks allow users to gain a shell in the same environment that a pipeline step runs in. This can often be useful for quicker debugging or testing of ideas.

diff --git a/taskcluster/config.yml b/taskcluster/config.yml
@@ -15,6 +15,45 @@ taskgraph:
         firefox_translations_training:
             name: "firefox-translations-training"
 
+# The list of valid stages that can be used with `target-stage` and `start-stage`.
+# These get attached to tasks in `kinds`.
+valid-stages:
+    - clean-corpus
+    - clean-mono
+    - bicleaner
+    - bicleaner-model
+    - merge-corpus
+    - merge-devset
+    - merge-mono
+    - train-vocab
+    - train-backwards
+    - evaluate-backwards
+    - split-corpus
+    - split-mono
+    - translate-mono-trg
+    - collect-mono-trg
+    - train-teacher
+    - evaluate-teacher
+    - evaluate-finetuned-teacher
+    - translate-corpus
+    - extract-best
+    - collect-corpus
+    - translate-mono-src
+    - collect-mono-src
+    - merge-translated
+    - score
+    - cefilter
+    - alignments
+    - train-student
+    - evaluate-student
+    - finetune-student
+    - evaluate-finetuned-student
+    - quantize
+    - evaluate-quantized
+    - export
+    - evaluate-teacher-ensemble
+    - all
+
 workers:
     aliases:
         # Use for quick tasks that don't require GPUs, eg: linting, tests

diff --git a/taskcluster/requirements.in b/taskcluster/requirements.in
@@ -1 +1 @@
-taskcluster-taskgraph>=7
+taskcluster-taskgraph>=7.1.2
diff --git a/taskcluster/requirements.txt b/taskcluster/requirements.txt
@@ -2,7 +2,7 @@
 # This file is autogenerated by pip-compile with Python 3.11
 # by the following command:
 #
-#    pip-compile --generate-hashes requirements.in
+#    pip-compile --generate-hashes taskcluster/requirements.in
 #
 appdirs==1.4.4 \
     --hash=sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41 \
@@ -302,10 +302,10 @@ slugid==2.0.0 \
     --hash=sha256:a950d98b72691178bdd4d6c52743c4a2aa039207cf7a97d71060a111ff9ba297 \
     --hash=sha256:aec8b0e01c4ad32e38e12d609eab3ec912fd129aaf6b2ded0199b56a5f8fd67c
     # via taskcluster-taskgraph
-taskcluster-taskgraph==7.1.0 \
-    --hash=sha256:7274765492eb3a5bc60901a7e8b0e7b0a9907921357c9dd66099541500a716a3 \
-    --hash=sha256:d794850df1aa8b0bb2250e8e9c0ad92917f6147de80d50d5f28c6c6713c7912d
-    # via -r requirements.in
+taskcluster-taskgraph==7.1.2 \
+    --hash=sha256:b7c341b2530cf86f6ee6c61ccdffd744e8590ef222c8f6477af7164d06aeda0f \
+    --hash=sha256:d343343cb9585438331e3f6771cf417e3631c33c667b10420183df06d8e9febf
+    # via -r taskcluster/requirements.in
 taskcluster-urls==13.0.1 \
     --hash=sha256:5e25e7e6818e8877178b175ff43d2e6548afad72694aa125f404a7329ece0973 \
     --hash=sha256:b25e122ecec249c4299ac7b20b08db76e3e2025bdaeb699a9d444556de5fd367 \

diff --git a/taskcluster/translations_taskgraph/actions/train.py b/taskcluster/translations_taskgraph/actions/train.py
@@ -5,6 +5,8 @@
 from taskgraph.actions.registry import register_callback_action
 from taskgraph.decision import taskgraph_decision
 from taskgraph.parameters import Parameters
+from taskgraph.taskgraph import TaskGraph
+from taskgraph.util.taskcluster import get_recursive_dependencies, get_artifact
 
 from translations_taskgraph.parameters import get_defaults
 
@@ -54,49 +56,34 @@ def validate_pretrained_models(params):
     schema=lambda graph_config: {
         "type": "object",
         "properties": {
+            "previous_group_ids": {
+                "type": "array",
+                "description": """Optional: an array of taskIds of decision or action
+tasks from the previous group(s) to use to populate our `previous_group_kinds`.
+Tasks specified here will be used as long as their label matches a needed task, and that
+task is upstream of `start-stage`. (That is to say: even if a task from one of these groups
+has a cache digest that doesn't match what the downstream task wants, it will still be used. This
+can be used for quick iteration of functionality where the quality of the outputs is not important.)""",
+                "items": {
+                    "type": "string",
+                },
+            },
+            "start-stage": {
+                "type": "string",
+                "description": """The stage of the pipeline to begin at, provided replacements
+can be found for tasks upstream of this stage. Usually used in conjunction with `previous_group_ids`
+which allows for specifying task group ids to fetch existing tasks from.""",
+                "default": "",
+                # We need to allow for no stage to be specified, in addition to all of the
+                # valid stages.
+                "enum": graph_config["valid-stages"] + [""],
+            },
             "target-stage": {
                 "type": "string",
                 "description": """The stage of the pipeline to run until
 (any stages this choice depends on will be automatically included).""",
                 "default": defaults["target-stage"],
-                # TODO: this should probably be specified in ci/config.yml
-                "enum": [
-                    "clean-corpus",
-                    "clean-mono",
-                    "bicleaner",
-                    "bicleaner-model",
-                    "merge-corpus",
-                    "merge-devset",
-                    "merge-mono",
-                    "train-vocab",
-                    "train-backwards",
-                    "evaluate-backwards",
-                    "split-corpus",
-                    "split-mono",
-                    "translate-mono-trg",
-                    "collect-mono-trg",
-                    "train-teacher",
-                    "evaluate-teacher",
-                    "evaluate-finetuned-teacher",
-                    "translate-corpus",
-                    "extract-best",
-                    "collect-corpus",
-                    "translate-mono-src",
-                    "collect-mono-src",
-                    "merge-translated",
-                    "score",
-                    "cefilter",
-                    "alignments",
-                    "train-student",
-                    "evaluate-student",
-                    "finetune-student",
-                    "evaluate-finetuned-student",
-                    "quantize",
-                    "evaluate-quantized",
-                    "export",
-                    "evaluate-teacher-ensemble",
-                    "all",
-                ],
+                "enum": graph_config["valid-stages"],
             },
             "experiment": {
                 "type": "object",
@@ -343,6 +330,32 @@ def train_action(parameters, graph_config, input, task_group_id, task_id):
 
     parameters = dict(parameters)
 
+    start_stage = input.pop("start-stage", None)
+    if start_stage:
+        previous_group_ids = input.get("previous_group_ids")
+
+        # First, we create one big graph out of all of the tasks from the specified group IDs.
+        label_to_task_id = {}
+        combined_full_task_graph = {}
+        for graph_id in previous_group_ids:
+            label_to_task_id = get_artifact(graph_id, "public/label-to-taskid.json")
+            full_task_graph = get_artifact(graph_id, "public/full-task-graph.json")
+            combined_full_task_graph.update(full_task_graph)
+        _, combined_full_task_graph = TaskGraph.from_json(combined_full_task_graph)
+
+        # Next, we find the task id(s) corresponding to the tasks that match the stage
+        # we want to start at.
+        start_task_ids = []
+        for label, task in combined_full_task_graph.tasks.items():
+            if task.attributes.get("stage") == start_stage:
+                start_task_ids.append(label_to_task_id[label])
+
+        # Finally, we walk up the graph from our starting point and add any tasks found
+        # as `existing_tasks`. These map task labels (eg: train-backwards-ru-en) to
+        # task ids, and will be used instead of scheduling new tasks for any tasks with
+        # an identical name.
+        parameters["existing_tasks"] = get_recursive_dependencies(start_task_ids)
+
     parameters["target_tasks_method"] = "train-target-tasks"
     parameters["optimize_target_tasks"] = True
     parameters["tasks_for"] = "action"
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		taskcluster-taskgraph>=7
		taskcluster-taskgraph>=7.1.2