From cba711216434691e0d1e1a5d2c39f198bb79ef68 Mon Sep 17 00:00:00 2001
From: Bastien Abadie <bastien@nextcairn.com>
Date: Thu, 2 May 2024 21:12:47 +0200
Subject: [PATCH 01/14] Configure evaluation tasks

---
 taskcluster/kinds/evaluate-quantized/kind.yml   | 16 ++++++++++++++++
 .../kinds/evaluate-teacher-ensemble/kind.yml    | 17 +++++++++++++++++
 taskcluster/kinds/evaluate/kind.yml             | 16 ++++++++++++++++
 3 files changed, 49 insertions(+)

diff --git a/taskcluster/kinds/evaluate-quantized/kind.yml b/taskcluster/kinds/evaluate-quantized/kind.yml
index 6118b4e41..4c3cb9e8d 100644
--- a/taskcluster/kinds/evaluate-quantized/kind.yml
+++ b/taskcluster/kinds/evaluate-quantized/kind.yml
@@ -45,9 +45,11 @@ tasks:
             substitution-fields:
                 - run.command
                 - fetches
+                - worker.env
             from-parameters:
                 src_locale: training_config.experiment.src
                 trg_locale: training_config.experiment.trg
+                wandb_publication: training_config.wandb-publication
 
         worker-type: b-gpu
         worker:
@@ -60,6 +62,19 @@ tasks:
                 # This is a separate environment variable so tests can override it.
                 BMT_MARIAN: $MOZ_FETCHES_DIR
 
+                # Weight & Biases trigger
+                WANDB_PUBLICATION: "{wandb_publication}"
+
+                # Weight & Biases publication token is stored in that secret
+                TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases
+
+            # Taskcluster proxy is required to read secrets
+            taskcluster-proxy: true
+
+        # The task needs to be able to read that secret to publish on Weight & Biases
+        scopes:
+          - secrets:get:project/translations/level-1/weights-and-biases
+
         # Don't run unless explicitly scheduled
         run-on-tasks-for: []
 
@@ -71,6 +86,7 @@ tasks:
                 - >-
                     pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
                     export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
+                    pip install $VCS_PATH/tracking &&
                     export PATH=$PATH:~/.local/bin &&
                     zstd --rm -d $MOZ_FETCHES_DIR/lex.s2t.pruned.zst &&
                     $VCS_PATH/pipeline/eval/eval.py
diff --git a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
index 8976d1f78..7924ef432 100644
--- a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
+++ b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
@@ -43,10 +43,13 @@ tasks:
             substitution-fields:
                 - fetches
                 - run.command
+                - worker.env
             from-parameters:
                 best_model: training_config.experiment.best-model
                 src_locale: training_config.experiment.src
                 trg_locale: training_config.experiment.trg
+                wandb_publication: training_config.wandb-publication
+
         worker-type: b-gpu
         worker:
             artifacts:
@@ -58,6 +61,19 @@ tasks:
                 # This is a separate environment variable so tests can override it.
                 MARIAN: $MOZ_FETCHES_DIR
 
+                # Weight & Biases trigger
+                WANDB_PUBLICATION: "{wandb_publication}"
+
+                # Weight & Biases publication token is stored in that secret
+                TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases
+
+            # Taskcluster proxy is required to read secrets
+            taskcluster-proxy: true
+
+        # The task needs to be able to read that secret to publish on Weight & Biases
+        scopes:
+          - secrets:get:project/translations/level-1/weights-and-biases
+
         # Don't run unless explicitly scheduled
         run-on-tasks-for: []
 
@@ -75,6 +91,7 @@ tasks:
                 - >-
                     pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
                     export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
+                    pip install $VCS_PATH/tracking &&
                     export PATH=$PATH:~/.local/bin &&
                     sed -i -e "s,- .*fetches,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
                     sed -i -e "s,- .*artifacts,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
diff --git a/taskcluster/kinds/evaluate/kind.yml b/taskcluster/kinds/evaluate/kind.yml
index d2809983a..cf80e81ef 100644
--- a/taskcluster/kinds/evaluate/kind.yml
+++ b/taskcluster/kinds/evaluate/kind.yml
@@ -41,11 +41,13 @@ task-defaults:
     task-context:
         substitution-fields:
             - run.command
+            - worker.env
         from-parameters:
             best_model: training_config.experiment.best-model
             src_locale: training_config.experiment.src
             trg_locale: training_config.experiment.trg
             split_chunks: training_config.experiment.teacher-ensemble
+            wandb_publication: training_config.wandb-publication
     worker-type: b-gpu
     worker:
         artifacts:
@@ -57,6 +59,19 @@ task-defaults:
             # This is a separate environment variable so tests can override it.
             MARIAN: $MOZ_FETCHES_DIR
 
+            # Weight & Biases trigger
+            WANDB_PUBLICATION: "{wandb_publication}"
+
+            # Weight & Biases publication token is stored in that secret
+            TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases
+
+        # Taskcluster proxy is required to read secrets
+        taskcluster-proxy: true
+
+    # The task needs to be able to read that secret to publish on Weight & Biases
+    scopes:
+      - secrets:get:project/translations/level-1/weights-and-biases
+
     # Don't run unless explicitly scheduled
     run-on-tasks-for: []
 
@@ -73,6 +88,7 @@ task-defaults:
             - -c
             - >-
                 pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
+                pip install $VCS_PATH/tracking &&
                 export PATH=$PATH:~/.local/bin &&
                 export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
                 sed -i -e "s,- .*fetches,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&

From 08b7175c73c29834debfcaeabe6488c25d8ab5e8 Mon Sep 17 00:00:00 2001
From: Bastien Abadie <bastien@nextcairn.com>
Date: Thu, 2 May 2024 21:13:08 +0200
Subject: [PATCH 02/14] Extract w&b code into module

---
 pipeline/eval/eval.py                         |  23 +++
 .../translations_parser/cli/taskcluster.py    | 109 +++----------
 tracking/translations_parser/wandb.py         | 149 ++++++++++++++++++
 3 files changed, 190 insertions(+), 91 deletions(-)
 create mode 100644 tracking/translations_parser/wandb.py

diff --git a/pipeline/eval/eval.py b/pipeline/eval/eval.py
index febca10d9..57b097d86 100755
--- a/pipeline/eval/eval.py
+++ b/pipeline/eval/eval.py
@@ -54,6 +54,13 @@
 from pipeline.common.logging import get_logger
 
 logger = get_logger("eval")
+try:
+    from translations_parser.wandb import add_wandb_arguments, get_wandb_publisher
+
+    WANDB_AVAILABLE = True
+except ImportError as e:
+    print(f"Failed to import tracking module: {e}")
+    WANDB_AVAILABLE = False
 
 
 def run_bash_oneliner(command: str):
@@ -136,6 +143,11 @@ def main(args_list: Optional[list[str]] = None) -> None:
     parser.add_argument(
         "--model_variant", type=str, help="The model variant to use, (gpu, cpu, quantized)"
     )
+
+    # Add Weight & Biases CLI args when module is loaded
+    if WANDB_AVAILABLE:
+        add_wandb_arguments(parser)
+
     args = parser.parse_args(args_list)
 
     src = args.src
@@ -329,6 +341,17 @@ def main(args_list: Optional[list[str]] = None) -> None:
     with open(metrics_file, "w") as file:
         file.write(f"{bleu_details['score']}\n" f"{chrf_details['score']}\n" f"{comet_score}\n")
 
+    if WANDB_AVAILABLE:
+        wandb = get_wandb_publisher(  # noqa
+            project_name=args.wandb_project,
+            group_name=args.wandb_group,
+            run_name=args.wandb_run_name,
+            taskcluster_secret=args.taskcluster_secret,
+            artifacts=args.wandb_artifacts,
+            publication=args.wandb_publication,
+        )
+        # TODO: publish scores
+
 
 if __name__ == "__main__":
     main()
diff --git a/tracking/translations_parser/cli/taskcluster.py b/tracking/translations_parser/cli/taskcluster.py
index fa5909333..9158f9d55 100644
--- a/tracking/translations_parser/cli/taskcluster.py
+++ b/tracking/translations_parser/cli/taskcluster.py
@@ -20,10 +20,10 @@
 from io import TextIOWrapper
 from pathlib import Path
 
-import taskcluster
 from translations_parser.parser import TrainingParser, logger
-from translations_parser.publishers import CSVExport, Publisher, WandB
-from translations_parser.utils import build_task_name, taskcluster_log_filter
+from translations_parser.publishers import CSVExport, Publisher
+from translations_parser.utils import taskcluster_log_filter
+from translations_parser.wandb import add_wandb_arguments, get_wandb_publisher
 
 
 def get_args() -> argparse.Namespace:
@@ -99,59 +99,11 @@ def get_args() -> argparse.Namespace:
         dest="loglevel",
         const=logging.DEBUG,
     )
-    return parser.parse_args()
-
-
-def get_wandb_token(secret_name):
-    """
-    Retrieve the Weight & Biases token from Taskcluster secret
-    """
-    secrets = taskcluster.Secrets({"rootUrl": os.environ["TASKCLUSTER_PROXY_URL"]})
-
-    try:
-        wandb_secret = secrets.get(secret_name)
-        return wandb_secret["secret"]["token"]
-    except Exception as e:
-        raise Exception(
-            f"Weight & Biases secret API Key retrieved from Taskcluster is malformed: {e}"
-        )
 
+    # Extend parser with Weight & Biases CLI args
+    add_wandb_arguments(parser)
 
-def get_wandb_names():
-    """
-    Find the various names needed to publish on Weight & Biases using
-    the taskcluster task & group payloads
-    """
-    task_id = os.environ.get("TASK_ID")
-    if not task_id:
-        raise Exception("Weight & Biases name detection can only run in taskcluster")
-
-    # Load task & group definition
-    # CI task groups do not expose any configuration, so we must use default values
-    queue = taskcluster.Queue({"rootUrl": os.environ["TASKCLUSTER_PROXY_URL"]})
-    task = queue.task(task_id)
-    _, task_name = build_task_name(task)
-    group_id = task["taskGroupId"]
-    task_group = queue.task(group_id)
-    config = task_group.get("extra", {}).get("action", {}).get("context", {}).get("input")
-    if config is None:
-        logger.warn(
-            f"Experiment configuration missing on {group_id} @ extra/action/context/input, fallback to CI values"
-        )
-        experiment = {
-            "src": "ru",
-            "trg": "en",
-            "name": "ci",
-        }
-    else:
-        experiment = config["experiment"]
-
-    # Build project, group and run names
-    return (
-        f'{experiment["src"]}-{experiment["trg"]}',
-        f'{experiment["name"]}_{group_id}',
-        task_name,
-    )
+    return parser.parse_args()
 
 
 def boot() -> None:
@@ -171,44 +123,19 @@ def boot() -> None:
         with args.input_file.open("r") as f:
             lines = (line.strip() for line in f.readlines())
 
-    # Load secret from Taskcluster and auto-configure naming
-    if args.taskcluster_secret:
-        assert os.environ.get(
-            "TASKCLUSTER_PROXY_URL"
-        ), "When using `--taskcluster-secret`, `TASKCLUSTER_PROXY_URL` environment variable must be set too."
-
-        # Weight and Biases client use environment variable to read the token
-        os.environ.setdefault("WANDB_API_KEY", get_wandb_token(args.taskcluster_secret))
-
-        project_name, group_name, run_name = get_wandb_names()
-    else:
-        # Fallback to CLI args for names
-        project_name = args.wandb_project
-        group_name = args.wandb_group
-        run_name = args.wandb_run_name
-
-    # Enable publication on weight and biases when project is set
-    # But prevent running when explicitly disabled by operator
+    # Build publisher output, CSV is always enabled, Weight & Biases upon operator choice
     publishers: list[Publisher] = [CSVExport(output_dir=args.output_dir)]
-    if not args.wandb_publication:
-        logger.info(
-            "Skip weight & biases publication as requested by operator through WANDB_PUBLICATION"
-        )
-    elif not project_name:
-        logger.info("Skip weight & biases publication as project name is not set")
-    else:
-        publishers.append(
-            WandB(
-                project=project_name,
-                group=group_name,
-                name=run_name,
-                artifacts=args.wandb_artifacts,
-                tags=args.tags,
-                config={
-                    "logs_file": args.input_file,
-                },
-            )
-        )
+    wandb_publisher = get_wandb_publisher(
+        project_name=args.wandb_project,
+        group_name=args.wandb_group,
+        run_name=args.wandb_run_name,
+        taskcluster_secret=args.taskcluster_secret,
+        logs_file=args.input_file,
+        artifacts=args.wandb_artifacts,
+        publication=args.wandb_publication,
+    )
+    if wandb_publisher:
+        publishers.append(wandb_publisher)
 
     # Use log filtering when using non-stream (for uploading past experiments)
     log_filter = taskcluster_log_filter if not args.from_stream else None
diff --git a/tracking/translations_parser/wandb.py b/tracking/translations_parser/wandb.py
new file mode 100644
index 000000000..92058c84a
--- /dev/null
+++ b/tracking/translations_parser/wandb.py
@@ -0,0 +1,149 @@
+import os
+from pathlib import Path
+
+import taskcluster
+from translations_parser.parser import logger
+from translations_parser.publishers import WandB
+from translations_parser.utils import build_task_name
+
+
+def add_wandb_arguments(parser):
+    parser.add_argument(
+        "--wandb-project",
+        help="Publish the training run to a Weight & Biases project.",
+        default=None,
+    )
+    parser.add_argument(
+        "--wandb-artifacts",
+        help="Directory containing training artifacts to publish on Weight & Biases.",
+        type=Path,
+        default=None,
+    )
+    parser.add_argument(
+        "--wandb-group",
+        help="Add the training run to a Weight & Biases group e.g. by language pair or experiment.",
+        default=None,
+    )
+    parser.add_argument(
+        "--wandb-run-name",
+        help="Use a custom name for the Weight & Biases run.",
+        default=None,
+    )
+    parser.add_argument(
+        "--wandb-publication",
+        action="store_true",
+        help="Trigger publication on Weight & Biases. Disabled by default. Can be set though env variable WANDB_PUBLICATION=true|false",
+        default=os.environ.get("WANDB_PUBLICATION", "false").lower() == "true",
+    )
+    parser.add_argument(
+        "--taskcluster-secret",
+        help="Taskcluster secret name used to store the Weight & Biases secret API Key.",
+        type=str,
+        default=os.environ.get("TASKCLUSTER_SECRET"),
+    )
+    parser.add_argument(
+        "--tags",
+        help="List of tags to use on Weight & Biases publication",
+        type=str,
+        default=["taskcluster"],
+        nargs="+",
+    )
+
+
+def get_wandb_token(secret_name):
+    """
+    Retrieve the Weight & Biases token from Taskcluster secret
+    """
+    secrets = taskcluster.Secrets({"rootUrl": os.environ["TASKCLUSTER_PROXY_URL"]})
+
+    try:
+        wandb_secret = secrets.get(secret_name)
+        return wandb_secret["secret"]["token"]
+    except Exception as e:
+        raise Exception(
+            f"Weight & Biases secret API Key retrieved from Taskcluster is malformed: {e}"
+        )
+
+
+def get_wandb_names():
+    """
+    Find the various names needed to publish on Weight & Biases using
+    the taskcluster task & group payloads
+    """
+    task_id = os.environ.get("TASK_ID")
+    if not task_id:
+        raise Exception("Weight & Biases name detection can only run in taskcluster")
+
+    # Load task & group definition
+    # CI task groups do not expose any configuration, so we must use default values
+    queue = taskcluster.Queue({"rootUrl": os.environ["TASKCLUSTER_PROXY_URL"]})
+    task = queue.task(task_id)
+    _, task_name = build_task_name(task)
+    group_id = task["taskGroupId"]
+    task_group = queue.task(group_id)
+    config = task_group.get("extra", {}).get("action", {}).get("context", {}).get("input")
+    if config is None:
+        logger.warn(
+            f"Experiment configuration missing on {group_id} @ extra/action/context/input, fallback to CI values"
+        )
+        experiment = {
+            "src": "ru",
+            "trg": "en",
+            "name": "ci",
+        }
+    else:
+        experiment = config["experiment"]
+
+    # Build project, group and run names
+    return (
+        f'{experiment["src"]}-{experiment["trg"]}',
+        f'{experiment["name"]}_{group_id}',
+        task_name,
+    )
+
+
+def get_wandb_publisher(
+    project_name=None,
+    group_name=None,
+    run_name=None,
+    taskcluster_secret=None,
+    artifacts=[],
+    tags=[],
+    logs_file=None,
+    publication=False,
+):
+    # Load secret from Taskcluster and auto-configure naming
+    if taskcluster_secret:
+        assert os.environ.get(
+            "TASKCLUSTER_PROXY_URL"
+        ), "When using `--taskcluster-secret`, `TASKCLUSTER_PROXY_URL` environment variable must be set too."
+
+        # Weight and Biases client use environment variable to read the token
+        os.environ.setdefault("WANDB_API_KEY", get_wandb_token(taskcluster_secret))
+
+        project_name, group_name, run_name = get_wandb_names()
+
+    # Enable publication on weight and biases when project is set
+    # But prevent running when explicitly disabled by operator
+    if not publication:
+        logger.info(
+            "Skip weight & biases publication as requested by operator through WANDB_PUBLICATION"
+        )
+        return
+    if not project_name:
+        logger.info("Skip weight & biases publication as project name is not set")
+        return
+
+    # Build optional configuration with log file
+    config = {}
+    if logs_file:
+        config["logs_file"] = logs_file
+
+    return WandB(
+        project=project_name,
+        group=group_name,
+        name=run_name,
+        artifacts=artifacts,
+        tags=tags,
+        config=config,
+    )

From bfc4e0b54f4069d25c3d4fd8dcd5b8df992345fc Mon Sep 17 00:00:00 2001
From: Bastien Abadie <abadie@teklia.com>
Date: Tue, 7 May 2024 08:46:42 +0200
Subject: [PATCH 03/14] Do not check taskcluwter when publication is disabled

---
 tracking/translations_parser/wandb.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tracking/translations_parser/wandb.py b/tracking/translations_parser/wandb.py
index 92058c84a..93e948a0a 100644
--- a/tracking/translations_parser/wandb.py
+++ b/tracking/translations_parser/wandb.py
@@ -112,6 +112,12 @@ def get_wandb_publisher(
     logs_file=None,
     publication=False,
 ):
+    if not publication:
+        logger.info(
+            "Skip weight & biases publication as requested by operator through WANDB_PUBLICATION"
+        )
+        return
+
     # Load secret from Taskcluster and auto-configure naming
     if taskcluster_secret:
         assert os.environ.get(
@@ -125,11 +131,6 @@ def get_wandb_publisher(
 
     # Enable publication on weight and biases when project is set
     # But prevent running when explicitly disabled by operator
-    if not publication:
-        logger.info(
-            "Skip weight & biases publication as requested by operator through WANDB_PUBLICATION"
-        )
-        return
     if not project_name:
         logger.info("Skip weight & biases publication as project name is not set")
         return

From f9c163eb1d1bf705e1dd5fde23138fdacd531160 Mon Sep 17 00:00:00 2001
From: Valentin Rigal <rigal@teklia.com>
Date: Mon, 13 May 2024 11:47:42 +0200
Subject: [PATCH 04/14] Publish evaluation metrics to W&B

---
 pipeline/eval/eval.py                         | 28 ++++++++++++++++++-
 taskcluster/kinds/evaluate-quantized/kind.yml |  1 +
 .../kinds/evaluate-teacher-ensemble/kind.yml  |  1 +
 taskcluster/kinds/evaluate/kind.yml           |  1 +
 4 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/pipeline/eval/eval.py b/pipeline/eval/eval.py
index 57b097d86..236a46977 100755
--- a/pipeline/eval/eval.py
+++ b/pipeline/eval/eval.py
@@ -55,6 +55,8 @@
 
 logger = get_logger("eval")
 try:
+    from translations_parser.data import Metric
+    from translations_parser.utils import parse_tag
     from translations_parser.wandb import add_wandb_arguments, get_wandb_publisher
 
     WANDB_AVAILABLE = True
@@ -143,6 +145,11 @@ def main(args_list: Optional[list[str]] = None) -> None:
     parser.add_argument(
         "--model_variant", type=str, help="The model variant to use, (gpu, cpu, quantized)"
     )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        help="The name of the dataset (e.g. 'flores-aug-inline-noise_devtest')",
+    )
 
     # Add Weight & Biases CLI args when module is loaded
     if WANDB_AVAILABLE:
@@ -350,7 +357,26 @@ def main(args_list: Optional[list[str]] = None) -> None:
             artifacts=args.wandb_artifacts,
             publication=args.wandb_publication,
         )
-        # TODO: publish scores
+        try:
+            # Build a valid task tag from the base dataset to detect importer, dataset aud augmentation.
+            # Set "teacher" as the default value for the tag, as the real model is already detected above.
+            _, importer, dataset, augmentation = parse_tag(f"eval_teacher_{args.dataset}")
+        except ValueError:
+            print(
+                "Metric could not be published to W&B because the dataset could not be parsed: {args.dataset}"
+            )
+        else:
+            wandb.handle_metrics(
+                metrics=[
+                    Metric(
+                        importer=importer,
+                        dataset=dataset,
+                        augmentation=augmentation,
+                        chrf=chrf_details["score"],
+                        bleu_detok=bleu_details["score"],
+                    )
+                ]
+            )
 
 
 if __name__ == "__main__":
diff --git a/taskcluster/kinds/evaluate-quantized/kind.yml b/taskcluster/kinds/evaluate-quantized/kind.yml
index 4c3cb9e8d..e837ef309 100644
--- a/taskcluster/kinds/evaluate-quantized/kind.yml
+++ b/taskcluster/kinds/evaluate-quantized/kind.yml
@@ -103,6 +103,7 @@ tasks:
                     --marian            "$BMT_MARIAN"
                     --gpus              "$GPUS"
                     --model_variant     quantized
+                    --dataset          {dataset}
 
         dependencies:
             dataset: dataset-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale}
diff --git a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
index 7924ef432..05b08e31a 100644
--- a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
+++ b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
@@ -108,6 +108,7 @@ tasks:
                     --workspace        "$WORKSPACE"
                     --gpus             "$GPUS"
                     --model_variant    gpu
+                    --dataset      {dataset}
 
         # Run after the teacher ensemble models are trained.
         from-deps:
diff --git a/taskcluster/kinds/evaluate/kind.yml b/taskcluster/kinds/evaluate/kind.yml
index cf80e81ef..a4437bdc7 100644
--- a/taskcluster/kinds/evaluate/kind.yml
+++ b/taskcluster/kinds/evaluate/kind.yml
@@ -105,6 +105,7 @@ task-defaults:
                 --workspace         "$WORKSPACE"
                 --gpus              "$GPUS"
                 --model_variant     gpu
+                --dataset           {dataset}
 
 tasks:
     backward-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale}:

From 9009544958e4c95f4cbaf39c9e50790b638d1851 Mon Sep 17 00:00:00 2001
From: Valentin Rigal <rigal@teklia.com>
Date: Tue, 14 May 2024 13:12:14 +0200
Subject: [PATCH 05/14] Fix running eval tracking on CI

---
 taskcluster/kinds/evaluate-quantized/kind.yml        | 5 +++--
 taskcluster/kinds/evaluate-teacher-ensemble/kind.yml | 5 +++--
 taskcluster/kinds/evaluate/kind.yml                  | 5 +++--
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/taskcluster/kinds/evaluate-quantized/kind.yml b/taskcluster/kinds/evaluate-quantized/kind.yml
index e837ef309..d216b15d7 100644
--- a/taskcluster/kinds/evaluate-quantized/kind.yml
+++ b/taskcluster/kinds/evaluate-quantized/kind.yml
@@ -84,10 +84,11 @@ tasks:
                 - bash
                 - -c
                 - >-
-                    pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
+                    export PATH=$PATH:~/.local/bin &&
                     export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
+                    pip install --upgrade pip
+                    pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
                     pip install $VCS_PATH/tracking &&
-                    export PATH=$PATH:~/.local/bin &&
                     zstd --rm -d $MOZ_FETCHES_DIR/lex.s2t.pruned.zst &&
                     $VCS_PATH/pipeline/eval/eval.py
                     --src               {src_locale}
diff --git a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
index 05b08e31a..0f7361a67 100644
--- a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
+++ b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
@@ -89,10 +89,11 @@ tasks:
                 - bash
                 - -c
                 - >-
-                    pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
+                    export PATH=$PATH:~/.local/bin &&
                     export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
+                    pip install --upgrade pip &&
+                    pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
                     pip install $VCS_PATH/tracking &&
-                    export PATH=$PATH:~/.local/bin &&
                     sed -i -e "s,- .*fetches,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
                     sed -i -e "s,- .*artifacts,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
                     $VCS_PATH/pipeline/eval/eval.py
diff --git a/taskcluster/kinds/evaluate/kind.yml b/taskcluster/kinds/evaluate/kind.yml
index a4437bdc7..096bf5f32 100644
--- a/taskcluster/kinds/evaluate/kind.yml
+++ b/taskcluster/kinds/evaluate/kind.yml
@@ -87,10 +87,11 @@ task-defaults:
             - bash
             - -c
             - >-
-                pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
-                pip install $VCS_PATH/tracking &&
                 export PATH=$PATH:~/.local/bin &&
                 export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
+                pip install --upgrade pip &&
+                pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
+                pip install $VCS_PATH/tracking &&
                 sed -i -e "s,- .*fetches,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
                 sed -i -e "s,- .*artifacts,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
                 $VCS_PATH/pipeline/eval/eval.py

From eb7d6db098d41cba9ede7a3148fa22c9ed3bc148 Mon Sep 17 00:00:00 2001
From: Valentin Rigal <rigal@teklia.com>
Date: Wed, 15 May 2024 16:22:15 +0200
Subject: [PATCH 06/14] Use args.wandb_run_name instead of default teacher

---
 pipeline/eval/eval.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pipeline/eval/eval.py b/pipeline/eval/eval.py
index 236a46977..9ac74b277 100755
--- a/pipeline/eval/eval.py
+++ b/pipeline/eval/eval.py
@@ -358,12 +358,13 @@ def main(args_list: Optional[list[str]] = None) -> None:
             publication=args.wandb_publication,
         )
         try:
-            # Build a valid task tag from the base dataset to detect importer, dataset aud augmentation.
-            # Set "teacher" as the default value for the tag, as the real model is already detected above.
-            _, importer, dataset, augmentation = parse_tag(f"eval_teacher_{args.dataset}")
+            _, importer, dataset, augmentation = parse_tag(
+                f"eval_{args.wandb_run_name}_{args.dataset}"
+            )
         except ValueError:
             print(
-                "Metric could not be published to W&B because the dataset could not be parsed: {args.dataset}"
+                "Metric could not be published to W&B because the dataset could not be parsed from tag "
+                f"'eval_{args.wandb_run_name}_{args.dataset}'."
             )
         else:
             wandb.handle_metrics(

From 0819fcc8db600a942dbac7c0fc9f8ddf154d163b Mon Sep 17 00:00:00 2001
From: Valentin Rigal <rigal@teklia.com>
Date: Fri, 17 May 2024 13:56:26 +0200
Subject: [PATCH 07/14] Remove duplicated arguments

---
 .../translations_parser/cli/taskcluster.py    | 40 -------------------
 1 file changed, 40 deletions(-)

diff --git a/tracking/translations_parser/cli/taskcluster.py b/tracking/translations_parser/cli/taskcluster.py
index 9158f9d55..59d374b68 100644
--- a/tracking/translations_parser/cli/taskcluster.py
+++ b/tracking/translations_parser/cli/taskcluster.py
@@ -51,46 +51,6 @@ def get_args() -> argparse.Namespace:
         type=Path,
         default=Path(__file__).parent.parent / "output",
     )
-    parser.add_argument(
-        "--wandb-project",
-        help="Publish the training run to a Weight & Biases project.",
-        default=None,
-    )
-    parser.add_argument(
-        "--wandb-artifacts",
-        help="Directory containing training artifacts to publish on Weight & Biases.",
-        type=Path,
-        default=None,
-    )
-    parser.add_argument(
-        "--wandb-group",
-        help="Add the training run to a Weight & Biases group e.g. by language pair or experiment.",
-        default=None,
-    )
-    parser.add_argument(
-        "--wandb-run-name",
-        help="Use a custom name for the Weight & Biases run.",
-        default=None,
-    )
-    parser.add_argument(
-        "--wandb-publication",
-        action="store_true",
-        help="Trigger publication on Weight & Biases. Disabled by default. Can be set though env variable WANDB_PUBLICATION=True|False",
-        default=os.environ.get("WANDB_PUBLICATION", "false").lower() == "true",
-    )
-    parser.add_argument(
-        "--taskcluster-secret",
-        help="Taskcluster secret name used to store the Weight & Biases secret API Key.",
-        type=str,
-        default=os.environ.get("TASKCLUSTER_SECRET"),
-    )
-    parser.add_argument(
-        "--tags",
-        help="List of tags to use on Weight & Biases publication",
-        type=str,
-        default=["taskcluster"],
-        nargs="+",
-    )
     parser.add_argument(
         "--verbose",
         "-v",

From 990943d831c5881ae636d3648667d23650a7517d Mon Sep 17 00:00:00 2001
From: Valentin Rigal <rigal@teklia.com>
Date: Mon, 20 May 2024 20:53:36 +0200
Subject: [PATCH 08/14] Retrieve dataset from Taskcluster directly

---
 pipeline/eval/eval.py                         | 37 +++++--------------
 taskcluster/kinds/evaluate-quantized/kind.yml |  1 -
 .../kinds/evaluate-teacher-ensemble/kind.yml  |  1 -
 taskcluster/kinds/evaluate/kind.yml           |  1 -
 tracking/translations_parser/utils.py         | 27 ++++++++++++++
 5 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/pipeline/eval/eval.py b/pipeline/eval/eval.py
index 9ac74b277..bc8e832c4 100755
--- a/pipeline/eval/eval.py
+++ b/pipeline/eval/eval.py
@@ -55,8 +55,7 @@
 
 logger = get_logger("eval")
 try:
-    from translations_parser.data import Metric
-    from translations_parser.utils import parse_tag
+    from translations_parser.utils import metric_from_tc_context
     from translations_parser.wandb import add_wandb_arguments, get_wandb_publisher
 
     WANDB_AVAILABLE = True
@@ -145,11 +144,6 @@ def main(args_list: Optional[list[str]] = None) -> None:
     parser.add_argument(
         "--model_variant", type=str, help="The model variant to use, (gpu, cpu, quantized)"
     )
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        help="The name of the dataset (e.g. 'flores-aug-inline-noise_devtest')",
-    )
 
     # Add Weight & Biases CLI args when module is loaded
     if WANDB_AVAILABLE:
@@ -357,27 +351,14 @@ def main(args_list: Optional[list[str]] = None) -> None:
             artifacts=args.wandb_artifacts,
             publication=args.wandb_publication,
         )
-        try:
-            _, importer, dataset, augmentation = parse_tag(
-                f"eval_{args.wandb_run_name}_{args.dataset}"
-            )
-        except ValueError:
-            print(
-                "Metric could not be published to W&B because the dataset could not be parsed from tag "
-                f"'eval_{args.wandb_run_name}_{args.dataset}'."
-            )
-        else:
-            wandb.handle_metrics(
-                metrics=[
-                    Metric(
-                        importer=importer,
-                        dataset=dataset,
-                        augmentation=augmentation,
-                        chrf=chrf_details["score"],
-                        bleu_detok=bleu_details["score"],
-                    )
-                ]
-            )
+        wandb.handle_metrics(
+            metrics=[
+                metric_from_tc_context(
+                    chrf=chrf_details["score"],
+                    bleu_detok=bleu_details["score"],
+                )
+            ]
+        )
 
 
 if __name__ == "__main__":
diff --git a/taskcluster/kinds/evaluate-quantized/kind.yml b/taskcluster/kinds/evaluate-quantized/kind.yml
index d216b15d7..e133d2b10 100644
--- a/taskcluster/kinds/evaluate-quantized/kind.yml
+++ b/taskcluster/kinds/evaluate-quantized/kind.yml
@@ -104,7 +104,6 @@ tasks:
                     --marian            "$BMT_MARIAN"
                     --gpus              "$GPUS"
                     --model_variant     quantized
-                    --dataset          {dataset}
 
         dependencies:
             dataset: dataset-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale}
diff --git a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
index 0f7361a67..08ff1b465 100644
--- a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
+++ b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
@@ -109,7 +109,6 @@ tasks:
                     --workspace        "$WORKSPACE"
                     --gpus             "$GPUS"
                     --model_variant    gpu
-                    --dataset      {dataset}
 
         # Run after the teacher ensemble models are trained.
         from-deps:
diff --git a/taskcluster/kinds/evaluate/kind.yml b/taskcluster/kinds/evaluate/kind.yml
index 096bf5f32..1d6f3378a 100644
--- a/taskcluster/kinds/evaluate/kind.yml
+++ b/taskcluster/kinds/evaluate/kind.yml
@@ -106,7 +106,6 @@ task-defaults:
                 --workspace         "$WORKSPACE"
                 --gpus              "$GPUS"
                 --model_variant     gpu
-                --dataset           {dataset}
 
 tasks:
     backward-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale}:
diff --git a/tracking/translations_parser/utils.py b/tracking/translations_parser/utils.py
index 58581cffb..75d00a18e 100644
--- a/tracking/translations_parser/utils.py
+++ b/tracking/translations_parser/utils.py
@@ -1,9 +1,12 @@
 import logging
+import os
 import re
 from collections.abc import Sequence
 from datetime import datetime
 from typing import NamedTuple, Optional
 
+import taskcluster
+
 logger = logging.getLogger(__name__)
 
 # Keywords used to split eval filenames into model and dataset
@@ -149,3 +152,27 @@ def build_task_name(task: dict):
     prefix = task["tags"]["kind"].split("-")[0]
     label = parse_task_label(task["tags"]["label"])
     return prefix, label.model
+
+
+def metric_from_tc_context(chrf: float, bleu: float):
+    """
+    Find the various names needed to build a metric directly from a Taskcluster task
+    """
+    from translations_parser.data import Metric
+
+    task_id = os.environ.get("TASK_ID")
+    if not task_id:
+        raise Exception("Evaluation metric can only be build in taskcluster")
+
+    # CI task groups do not expose any configuration, so we must use default values
+    queue = taskcluster.Queue({"rootUrl": os.environ["TASKCLUSTER_PROXY_URL"]})
+    task = queue.task(task_id)
+    parsed = parse_task_label(task["tags"]["label"])
+
+    return Metric(
+        importer=parsed.importer,
+        dataset=parsed.dataset,
+        augmentation=parsed.augmentation,
+        chrf=chrf,
+        bleu_detok=bleu,
+    )

From 7c86b899ae60c11dc521456cefda8cf63ba036e3 Mon Sep 17 00:00:00 2001
From: Valentin Rigal <rigal@teklia.com>
Date: Tue, 21 May 2024 08:29:19 +0200
Subject: [PATCH 09/14] Add missing calls to publisher and logging

---
 pipeline/eval/eval.py                         | 14 ++++++--------
 taskcluster/kinds/evaluate-quantized/kind.yml |  2 +-
 tracking/translations_parser/publishers.py    | 17 ++++++++---------
 3 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/pipeline/eval/eval.py b/pipeline/eval/eval.py
index bc8e832c4..de43422f9 100755
--- a/pipeline/eval/eval.py
+++ b/pipeline/eval/eval.py
@@ -351,14 +351,12 @@ def main(args_list: Optional[list[str]] = None) -> None:
             artifacts=args.wandb_artifacts,
             publication=args.wandb_publication,
         )
-        wandb.handle_metrics(
-            metrics=[
-                metric_from_tc_context(
-                    chrf=chrf_details["score"],
-                    bleu_detok=bleu_details["score"],
-                )
-            ]
-        )
+        logger.info("Initializing Weight & Biases client")
+        wandb.open()
+        logger.info(f"Publishing metrics to Weight & Biases ({wandb.extra_kwargs})")
+        metric = metric_from_tc_context(chrf=chrf_details["score"], bleu=bleu_details["score"])
+        wandb.handle_metrics(metrics=[metric])
+        wandb.close()
 
 
 if __name__ == "__main__":
diff --git a/taskcluster/kinds/evaluate-quantized/kind.yml b/taskcluster/kinds/evaluate-quantized/kind.yml
index e133d2b10..ce0c30341 100644
--- a/taskcluster/kinds/evaluate-quantized/kind.yml
+++ b/taskcluster/kinds/evaluate-quantized/kind.yml
@@ -86,7 +86,7 @@ tasks:
                 - >-
                     export PATH=$PATH:~/.local/bin &&
                     export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
-                    pip install --upgrade pip
+                    pip install --upgrade pip &&
                     pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
                     pip install $VCS_PATH/tracking &&
                     zstd --rm -d $MOZ_FETCHES_DIR/lex.s2t.pruned.zst &&
diff --git a/tracking/translations_parser/publishers.py b/tracking/translations_parser/publishers.py
index b3f03a338..5112819ea 100644
--- a/tracking/translations_parser/publishers.py
+++ b/tracking/translations_parser/publishers.py
@@ -92,11 +92,9 @@ def __init__(
         self.parser: TrainingParser | None = None
         self.wandb: wandb.sdk.wandb_run.Run | wandb.sdk.lib.disabled.RunDisabled | None = None
 
-    def open(self, parser) -> None:
-        if parser is None or self.parser is not None:
-            return
+    def open(self, parser=None) -> None:
         self.parser = parser
-        config = parser.config
+        config = getattr(parser, "config", {})
         config.update(self.extra_kwargs.pop("config", {}))
 
         try:
@@ -188,12 +186,13 @@ def handle_metrics(self, metrics: Sequence[Metric]) -> None:
             )
 
     def close(self) -> None:
-        if self.wandb is None or self.parser is None:
+        if self.wandb is None:
             return
-        # Store runtime logs as the main log artifact
-        # This will be overwritten in case an unhandled exception occurs
-        with (Path(self.wandb.dir) / "output.log").open("w") as f:
-            f.write(self.parser.logs_str)
+        if self.parser is not None:
+            # Store runtime logs as the main log artifact
+            # This will be overwritten in case an unhandled exception occurs
+            with (Path(self.wandb.dir) / "output.log").open("w") as f:
+                f.write(self.parser.logs_str)
 
         # Publish artifacts
         if self.artifacts:

From d3d5ec3d37167f0b82f2a3b6dcea421cdbe56795 Mon Sep 17 00:00:00 2001
From: Valentin Rigal <rigal@teklia.com>
Date: Tue, 21 May 2024 11:15:00 +0200
Subject: [PATCH 10/14] Allow publishing metrics as a table on existing runs
 (i.e. previous trainings)

---
 pipeline/eval/eval.py                      | 3 ++-
 tracking/translations_parser/publishers.py | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/pipeline/eval/eval.py b/pipeline/eval/eval.py
index de43422f9..a8d4f7897 100755
--- a/pipeline/eval/eval.py
+++ b/pipeline/eval/eval.py
@@ -352,7 +352,8 @@ def main(args_list: Optional[list[str]] = None) -> None:
             publication=args.wandb_publication,
         )
         logger.info("Initializing Weight & Biases client")
-        wandb.open()
+        # Allow publishing metrics as a table on existing runs (i.e. previous trainings)
+        wandb.open(resume=True)
         logger.info(f"Publishing metrics to Weight & Biases ({wandb.extra_kwargs})")
         metric = metric_from_tc_context(chrf=chrf_details["score"], bleu=bleu_details["score"])
         wandb.handle_metrics(metrics=[metric])
diff --git a/tracking/translations_parser/publishers.py b/tracking/translations_parser/publishers.py
index 5112819ea..20807d9aa 100644
--- a/tracking/translations_parser/publishers.py
+++ b/tracking/translations_parser/publishers.py
@@ -92,7 +92,7 @@ def __init__(
         self.parser: TrainingParser | None = None
         self.wandb: wandb.sdk.wandb_run.Run | wandb.sdk.lib.disabled.RunDisabled | None = None
 
-    def open(self, parser=None) -> None:
+    def open(self, parser=None, resume: bool = False) -> None:
         self.parser = parser
         config = getattr(parser, "config", {})
         config.update(self.extra_kwargs.pop("config", {}))
@@ -119,7 +119,7 @@ def open(self, parser=None) -> None:
             elif len(existing_runs) == 1:
                 run = existing_runs[0]
                 # Avoid overriding an existing run on a first training, this should not happen
-                if int(os.environ.get("RUN_ID", 0)) < 1:
+                if resume is False and int(os.environ.get("RUN_ID", 0)) < 1:
                     logger.warning(
                         f"A W&B run already exists with name '{name}': {run}. No data will be published."
                     )

From 782a8d772e80b10e3368f47f313bd57855392361 Mon Sep 17 00:00:00 2001
From: Valentin Rigal <rigal@teklia.com>
Date: Wed, 22 May 2024 12:50:15 +0200
Subject: [PATCH 11/14] Update regex to parse labels ending with '-1'

---
 tests/test_tracking_utils.py          | 4 ++++
 tracking/translations_parser/utils.py | 7 ++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/tests/test_tracking_utils.py b/tests/test_tracking_utils.py
index 82cc139dc..5761e624b 100644
--- a/tests/test_tracking_utils.py
+++ b/tests/test_tracking_utils.py
@@ -76,6 +76,10 @@
             "evaluate-student-sacrebleu-wmt19-ast-en",
             ("student", "sacrebleu", "wmt19", None),
         ),
+        (
+            "evaluate-teacher-flores-devtest-ru-en-1",
+            ("teacher-1", "flores", "devtest", None),
+        ),
     ],
 )
 def test_parse_task_label(task_label, parsed_values):
diff --git a/tracking/translations_parser/utils.py b/tracking/translations_parser/utils.py
index 75d00a18e..2aa6ab5e2 100644
--- a/tracking/translations_parser/utils.py
+++ b/tracking/translations_parser/utils.py
@@ -83,6 +83,11 @@
     #   evaluate-quantized-mtdata_aug-mix_Neulab-tedtalks_eng-lit-lt-en
     #                                     ^^^^^^^^^^^^^^^^^^^^^^^
     r"_?(?P<dataset>[-\w\d_]*?(-[a-z]{3}-[a-z]{3})?)?"
+    #
+    # Match language (project) suffix.
+    # evaluate-teacher-flores-devtest-ru-en-1
+    #                                 ^^^^^
+    #
     r"-?(?P<lang>[a-z]{2,3}-[a-z]{2,3})?"
     #
     # Match the task chunking, for instance:
@@ -90,7 +95,7 @@
     #                                            ^
     #   evaluate-teacher-flores-flores_aug-title_devtest-lt-en-1_2
     #                                                          ^
-    r"-?((?P<task_suffix>\d+)(\/|_)\d+)?"
+    r"(-(?P<task_suffix>\d+)([\/|_]\d+)?)?"
     #
     r"$"
 )

From 468f6feb2dd8d0a9803e52a8f3ff808b6bf46330 Mon Sep 17 00:00:00 2001
From: Valentin Rigal <rigal@teklia.com>
Date: Wed, 22 May 2024 13:00:50 +0200
Subject: [PATCH 12/14] Generic support for train/eval different naming

---
 .../translations_parser/cli/taskcluster_group.py    | 13 +++----------
 tracking/translations_parser/utils.py               |  7 ++++++-
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/tracking/translations_parser/cli/taskcluster_group.py b/tracking/translations_parser/cli/taskcluster_group.py
index 6204b2465..8edfa59d9 100644
--- a/tracking/translations_parser/cli/taskcluster_group.py
+++ b/tracking/translations_parser/cli/taskcluster_group.py
@@ -8,6 +8,7 @@
 
 import argparse
 import logging
+import re
 import tempfile
 from collections import defaultdict
 from pathlib import Path
@@ -20,8 +21,9 @@
 from translations_parser.data import Metric
 from translations_parser.parser import TrainingParser, logger
 from translations_parser.publishers import WandB
-from translations_parser.utils import MULTIPLE_TRAIN_SUFFIX, build_task_name, parse_task_label
+from translations_parser.utils import build_task_name, parse_task_label
 
+MULTIPLE_TRAIN_SUFFIX = re.compile(r"(-\d+)/\d+$")
 KIND_TAG_TARGET = ("train", "finetune")
 queue = taskcluster.Queue({"rootUrl": "https://firefox-ci-tc.services.mozilla.com"})
 
@@ -230,15 +232,6 @@ def publish_task_group(group_id: str, override: bool = False) -> None:
             except ValueError:
                 continue
 
-            if eval_label and (re_match := MULTIPLE_TRAIN_SUFFIX.search(eval_label)):
-                (suffix,) = re_match.groups()
-                model_name += suffix
-
-            # Training task may be named differently from the evaluation tasks, use training name by default
-            model_name = model_name.replace("finetuned", "finetune")
-            if model_name == "backward":
-                model_name = "backwards"
-
             # Evaluation tasks must be a dependency of the run and match its name
             if (
                 training_task["status"]["taskId"] in eval_task["task"]["dependencies"]
diff --git a/tracking/translations_parser/utils.py b/tracking/translations_parser/utils.py
index 2aa6ab5e2..ebfeab75c 100644
--- a/tracking/translations_parser/utils.py
+++ b/tracking/translations_parser/utils.py
@@ -99,7 +99,6 @@
     #
     r"$"
 )
-MULTIPLE_TRAIN_SUFFIX = re.compile(r"(-\d+)/\d+$")
 
 
 class ParsedTaskLabel(NamedTuple):
@@ -122,6 +121,12 @@ def parse_task_label(task_label: str) -> ParsedTaskLabel:
         raise ValueError(task_label)
     groups = match.groupdict()
     model = groups["model"]
+
+    # Naming may be inconsistent between train and evaluation tasks
+    model = model.replace("finetuned", "finetune")
+    if model == "backward":
+        model = "backwards"
+
     suffix = groups.get("suffix") or groups.get("task_suffix")
     if not suffix and model == "teacher":
         # Keep the index on teacher runs for compatibility with legacy models

From 044d93b5939b50114f27c4101dcb6c993c3dfdfd Mon Sep 17 00:00:00 2001
From: Valentin Rigal <rigal@teklia.com>
Date: Wed, 22 May 2024 15:54:00 +0200
Subject: [PATCH 13/14] Update tests

---
 tests/test_tracking_cli.py   | 4 ++--
 tests/test_tracking_utils.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_tracking_cli.py b/tests/test_tracking_cli.py
index f9188010a..d8ce40d19 100644
--- a/tests/test_tracking_cli.py
+++ b/tests/test_tracking_cli.py
@@ -123,9 +123,9 @@ def test_experiments_marian_1_10(wandb_mock, getargs_mock, caplog, samples_dir,
             ),
             (logging.INFO, "Found 2 quantized metrics from speed folder"),
             (logging.INFO, "Found 16 metrics from task logs"),
-            (logging.INFO, "Creating missing run backward with associated metrics"),
+            (logging.INFO, "Creating missing run backwards with associated metrics"),
             (logging.INFO, "Creating missing run quantized with associated metrics"),
-            (logging.INFO, "Creating missing run student-finetuned with associated metrics"),
+            (logging.INFO, "Creating missing run student-finetune with associated metrics"),
             (logging.INFO, "Creating missing run teacher-base-0 with associated metrics"),
             (logging.INFO, "Creating missing run teacher-base-1 with associated metrics"),
             (logging.INFO, "Creating missing run teacher-ensemble with associated metrics"),
diff --git a/tests/test_tracking_utils.py b/tests/test_tracking_utils.py
index 5761e624b..79f2be668 100644
--- a/tests/test_tracking_utils.py
+++ b/tests/test_tracking_utils.py
@@ -33,7 +33,7 @@
         ),
         (
             "eval_student-finetuned_flores_devtest",
-            ("student-finetuned", "flores", "devtest", None),
+            ("student-finetune", "flores", "devtest", None),
         ),
         (
             "eval_teacher-base0_flores_devtest",
@@ -65,7 +65,7 @@
         ),
         (
             "evaluate-backward-url-gcp_pytest-dataset_a0017e-en-ru",
-            ("backward", "url", "gcp_pytest-dataset_a0017e", None),
+            ("backwards", "url", "gcp_pytest-dataset_a0017e", None),
         ),
         (
             "train-teacher-ast-en-1",

From 53a012ce571f8343f90dc4ef45d5f067facd59d6 Mon Sep 17 00:00:00 2001
From: Valentin Rigal <rigal@teklia.com>
Date: Wed, 22 May 2024 17:52:47 +0200
Subject: [PATCH 14/14] Support disabled publication

---
 pipeline/eval/eval.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/pipeline/eval/eval.py b/pipeline/eval/eval.py
index a8d4f7897..31c5e43d7 100755
--- a/pipeline/eval/eval.py
+++ b/pipeline/eval/eval.py
@@ -351,13 +351,14 @@ def main(args_list: Optional[list[str]] = None) -> None:
             artifacts=args.wandb_artifacts,
             publication=args.wandb_publication,
         )
-        logger.info("Initializing Weight & Biases client")
-        # Allow publishing metrics as a table on existing runs (i.e. previous trainings)
-        wandb.open(resume=True)
-        logger.info(f"Publishing metrics to Weight & Biases ({wandb.extra_kwargs})")
-        metric = metric_from_tc_context(chrf=chrf_details["score"], bleu=bleu_details["score"])
-        wandb.handle_metrics(metrics=[metric])
-        wandb.close()
+        if wandb:
+            logger.info("Initializing Weight & Biases client")
+            # Allow publishing metrics as a table on existing runs (i.e. previous trainings)
+            wandb.open(resume=True)
+            logger.info(f"Publishing metrics to Weight & Biases ({wandb.extra_kwargs})")
+            metric = metric_from_tc_context(chrf=chrf_details["score"], bleu=bleu_details["score"])
+            wandb.handle_metrics(metrics=[metric])
+            wandb.close()
 
 
 if __name__ == "__main__":