From cba711216434691e0d1e1a5d2c39f198bb79ef68 Mon Sep 17 00:00:00 2001 From: Bastien Abadie Date: Thu, 2 May 2024 21:12:47 +0200 Subject: [PATCH 01/14] Configure evaluation tasks --- taskcluster/kinds/evaluate-quantized/kind.yml | 16 ++++++++++++++++ .../kinds/evaluate-teacher-ensemble/kind.yml | 17 +++++++++++++++++ taskcluster/kinds/evaluate/kind.yml | 16 ++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/taskcluster/kinds/evaluate-quantized/kind.yml b/taskcluster/kinds/evaluate-quantized/kind.yml index 6118b4e41..4c3cb9e8d 100644 --- a/taskcluster/kinds/evaluate-quantized/kind.yml +++ b/taskcluster/kinds/evaluate-quantized/kind.yml @@ -45,9 +45,11 @@ tasks: substitution-fields: - run.command - fetches + - worker.env from-parameters: src_locale: training_config.experiment.src trg_locale: training_config.experiment.trg + wandb_publication: training_config.wandb-publication worker-type: b-gpu worker: @@ -60,6 +62,19 @@ tasks: # This is a separate environment variable so tests can override it. BMT_MARIAN: $MOZ_FETCHES_DIR + # Weight & Biases trigger + WANDB_PUBLICATION: "{wandb_publication}" + + # Weight & Biases publication token is stored in that secret + TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases + + # Taskcluster proxy is required to read secrets + taskcluster-proxy: true + + # The task needs to be able to read that secret to publish on Weight & Biases + scopes: + - secrets:get:project/translations/level-1/weights-and-biases + # Don't run unless explicitly scheduled run-on-tasks-for: [] @@ -71,6 +86,7 @@ tasks: - >- pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt && export PYTHONPATH=$PYTHONPATH:$VCS_PATH && + pip install $VCS_PATH/tracking && export PATH=$PATH:~/.local/bin && zstd --rm -d $MOZ_FETCHES_DIR/lex.s2t.pruned.zst && $VCS_PATH/pipeline/eval/eval.py diff --git a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml index 8976d1f78..7924ef432 100644 --- a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml +++ b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml @@ -43,10 +43,13 @@ tasks: substitution-fields: - fetches - run.command + - worker.env from-parameters: best_model: training_config.experiment.best-model src_locale: training_config.experiment.src trg_locale: training_config.experiment.trg + wandb_publication: training_config.wandb-publication + worker-type: b-gpu worker: artifacts: @@ -58,6 +61,19 @@ tasks: # This is a separate environment variable so tests can override it. MARIAN: $MOZ_FETCHES_DIR + # Weight & Biases trigger + WANDB_PUBLICATION: "{wandb_publication}" + + # Weight & Biases publication token is stored in that secret + TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases + + # Taskcluster proxy is required to read secrets + taskcluster-proxy: true + + # The task needs to be able to read that secret to publish on Weight & Biases + scopes: + - secrets:get:project/translations/level-1/weights-and-biases + # Don't run unless explicitly scheduled run-on-tasks-for: [] @@ -75,6 +91,7 @@ tasks: - >- pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt && export PYTHONPATH=$PYTHONPATH:$VCS_PATH && + pip install $VCS_PATH/tracking && export PATH=$PATH:~/.local/bin && sed -i -e "s,- .*fetches,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml && sed -i -e "s,- .*artifacts,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml && diff --git a/taskcluster/kinds/evaluate/kind.yml b/taskcluster/kinds/evaluate/kind.yml index d2809983a..cf80e81ef 100644 --- a/taskcluster/kinds/evaluate/kind.yml +++ b/taskcluster/kinds/evaluate/kind.yml @@ -41,11 +41,13 @@ task-defaults: task-context: substitution-fields: - run.command + - worker.env from-parameters: best_model: training_config.experiment.best-model src_locale: training_config.experiment.src trg_locale: training_config.experiment.trg split_chunks: training_config.experiment.teacher-ensemble + wandb_publication: training_config.wandb-publication worker-type: b-gpu worker: artifacts: @@ -57,6 +59,19 @@ task-defaults: # This is a separate environment variable so tests can override it. MARIAN: $MOZ_FETCHES_DIR + # Weight & Biases trigger + WANDB_PUBLICATION: "{wandb_publication}" + + # Weight & Biases publication token is stored in that secret + TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases + + # Taskcluster proxy is required to read secrets + taskcluster-proxy: true + + # The task needs to be able to read that secret to publish on Weight & Biases + scopes: + - secrets:get:project/translations/level-1/weights-and-biases + # Don't run unless explicitly scheduled run-on-tasks-for: [] @@ -73,6 +88,7 @@ task-defaults: - -c - >- pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt && + pip install $VCS_PATH/tracking && export PATH=$PATH:~/.local/bin && export PYTHONPATH=$PYTHONPATH:$VCS_PATH && sed -i -e "s,- .*fetches,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml && From 08b7175c73c29834debfcaeabe6488c25d8ab5e8 Mon Sep 17 00:00:00 2001 From: Bastien Abadie Date: Thu, 2 May 2024 21:13:08 +0200 Subject: [PATCH 02/14] Extract w&b code into module --- pipeline/eval/eval.py | 23 +++ .../translations_parser/cli/taskcluster.py | 109 +++---------- tracking/translations_parser/wandb.py | 149 ++++++++++++++++++ 3 files changed, 190 insertions(+), 91 deletions(-) create mode 100644 tracking/translations_parser/wandb.py diff --git a/pipeline/eval/eval.py b/pipeline/eval/eval.py index febca10d9..57b097d86 100755 --- a/pipeline/eval/eval.py +++ b/pipeline/eval/eval.py @@ -54,6 +54,13 @@ from pipeline.common.logging import get_logger logger = get_logger("eval") +try: + from translations_parser.wandb import add_wandb_arguments, get_wandb_publisher + + WANDB_AVAILABLE = True +except ImportError as e: + print(f"Failed to import tracking module: {e}") + WANDB_AVAILABLE = False def run_bash_oneliner(command: str): @@ -136,6 +143,11 @@ def main(args_list: Optional[list[str]] = None) -> None: parser.add_argument( "--model_variant", type=str, help="The model variant to use, (gpu, cpu, quantized)" ) + + # Add Weight & Biases CLI args when module is loaded + if WANDB_AVAILABLE: + add_wandb_arguments(parser) + args = parser.parse_args(args_list) src = args.src @@ -329,6 +341,17 @@ def main(args_list: Optional[list[str]] = None) -> None: with open(metrics_file, "w") as file: file.write(f"{bleu_details['score']}\n" f"{chrf_details['score']}\n" f"{comet_score}\n") + if WANDB_AVAILABLE: + wandb = get_wandb_publisher( # noqa + project_name=args.wandb_project, + group_name=args.wandb_group, + run_name=args.wandb_run_name, + taskcluster_secret=args.taskcluster_secret, + artifacts=args.wandb_artifacts, + publication=args.wandb_publication, + ) + # TODO: publish scores + if __name__ == "__main__": main() diff --git a/tracking/translations_parser/cli/taskcluster.py b/tracking/translations_parser/cli/taskcluster.py index fa5909333..9158f9d55 100644 --- a/tracking/translations_parser/cli/taskcluster.py +++ b/tracking/translations_parser/cli/taskcluster.py @@ -20,10 +20,10 @@ from io import TextIOWrapper from pathlib import Path -import taskcluster from translations_parser.parser import TrainingParser, logger -from translations_parser.publishers import CSVExport, Publisher, WandB -from translations_parser.utils import build_task_name, taskcluster_log_filter +from translations_parser.publishers import CSVExport, Publisher +from translations_parser.utils import taskcluster_log_filter +from translations_parser.wandb import add_wandb_arguments, get_wandb_publisher def get_args() -> argparse.Namespace: @@ -99,59 +99,11 @@ def get_args() -> argparse.Namespace: dest="loglevel", const=logging.DEBUG, ) - return parser.parse_args() - - -def get_wandb_token(secret_name): - """ - Retrieve the Weight & Biases token from Taskcluster secret - """ - secrets = taskcluster.Secrets({"rootUrl": os.environ["TASKCLUSTER_PROXY_URL"]}) - - try: - wandb_secret = secrets.get(secret_name) - return wandb_secret["secret"]["token"] - except Exception as e: - raise Exception( - f"Weight & Biases secret API Key retrieved from Taskcluster is malformed: {e}" - ) + # Extend parser with Weight & Biases CLI args + add_wandb_arguments(parser) -def get_wandb_names(): - """ - Find the various names needed to publish on Weight & Biases using - the taskcluster task & group payloads - """ - task_id = os.environ.get("TASK_ID") - if not task_id: - raise Exception("Weight & Biases name detection can only run in taskcluster") - - # Load task & group definition - # CI task groups do not expose any configuration, so we must use default values - queue = taskcluster.Queue({"rootUrl": os.environ["TASKCLUSTER_PROXY_URL"]}) - task = queue.task(task_id) - _, task_name = build_task_name(task) - group_id = task["taskGroupId"] - task_group = queue.task(group_id) - config = task_group.get("extra", {}).get("action", {}).get("context", {}).get("input") - if config is None: - logger.warn( - f"Experiment configuration missing on {group_id} @ extra/action/context/input, fallback to CI values" - ) - experiment = { - "src": "ru", - "trg": "en", - "name": "ci", - } - else: - experiment = config["experiment"] - - # Build project, group and run names - return ( - f'{experiment["src"]}-{experiment["trg"]}', - f'{experiment["name"]}_{group_id}', - task_name, - ) + return parser.parse_args() def boot() -> None: @@ -171,44 +123,19 @@ def boot() -> None: with args.input_file.open("r") as f: lines = (line.strip() for line in f.readlines()) - # Load secret from Taskcluster and auto-configure naming - if args.taskcluster_secret: - assert os.environ.get( - "TASKCLUSTER_PROXY_URL" - ), "When using `--taskcluster-secret`, `TASKCLUSTER_PROXY_URL` environment variable must be set too." - - # Weight and Biases client use environment variable to read the token - os.environ.setdefault("WANDB_API_KEY", get_wandb_token(args.taskcluster_secret)) - - project_name, group_name, run_name = get_wandb_names() - else: - # Fallback to CLI args for names - project_name = args.wandb_project - group_name = args.wandb_group - run_name = args.wandb_run_name - - # Enable publication on weight and biases when project is set - # But prevent running when explicitly disabled by operator + # Build publisher output, CSV is always enabled, Weight & Biases upon operator choice publishers: list[Publisher] = [CSVExport(output_dir=args.output_dir)] - if not args.wandb_publication: - logger.info( - "Skip weight & biases publication as requested by operator through WANDB_PUBLICATION" - ) - elif not project_name: - logger.info("Skip weight & biases publication as project name is not set") - else: - publishers.append( - WandB( - project=project_name, - group=group_name, - name=run_name, - artifacts=args.wandb_artifacts, - tags=args.tags, - config={ - "logs_file": args.input_file, - }, - ) - ) + wandb_publisher = get_wandb_publisher( + project_name=args.wandb_project, + group_name=args.wandb_group, + run_name=args.wandb_run_name, + taskcluster_secret=args.taskcluster_secret, + logs_file=args.input_file, + artifacts=args.wandb_artifacts, + publication=args.wandb_publication, + ) + if wandb_publisher: + publishers.append(wandb_publisher) # Use log filtering when using non-stream (for uploading past experiments) log_filter = taskcluster_log_filter if not args.from_stream else None diff --git a/tracking/translations_parser/wandb.py b/tracking/translations_parser/wandb.py new file mode 100644 index 000000000..92058c84a --- /dev/null +++ b/tracking/translations_parser/wandb.py @@ -0,0 +1,149 @@ +import os +from pathlib import Path + +import taskcluster +from translations_parser.parser import logger +from translations_parser.publishers import WandB +from translations_parser.utils import build_task_name + + +def add_wandb_arguments(parser): + parser.add_argument( + "--wandb-project", + help="Publish the training run to a Weight & Biases project.", + default=None, + ) + parser.add_argument( + "--wandb-artifacts", + help="Directory containing training artifacts to publish on Weight & Biases.", + type=Path, + default=None, + ) + parser.add_argument( + "--wandb-group", + help="Add the training run to a Weight & Biases group e.g. by language pair or experiment.", + default=None, + ) + parser.add_argument( + "--wandb-run-name", + help="Use a custom name for the Weight & Biases run.", + default=None, + ) + parser.add_argument( + "--wandb-publication", + action="store_true", + help="Trigger publication on Weight & Biases. Disabled by default. Can be set though env variable WANDB_PUBLICATION=true|false", + default=os.environ.get("WANDB_PUBLICATION", "false").lower() == "true", + ) + parser.add_argument( + "--taskcluster-secret", + help="Taskcluster secret name used to store the Weight & Biases secret API Key.", + type=str, + default=os.environ.get("TASKCLUSTER_SECRET"), + ) + parser.add_argument( + "--tags", + help="List of tags to use on Weight & Biases publication", + type=str, + default=["taskcluster"], + nargs="+", + ) + + +def get_wandb_token(secret_name): + """ + Retrieve the Weight & Biases token from Taskcluster secret + """ + secrets = taskcluster.Secrets({"rootUrl": os.environ["TASKCLUSTER_PROXY_URL"]}) + + try: + wandb_secret = secrets.get(secret_name) + return wandb_secret["secret"]["token"] + except Exception as e: + raise Exception( + f"Weight & Biases secret API Key retrieved from Taskcluster is malformed: {e}" + ) + + +def get_wandb_names(): + """ + Find the various names needed to publish on Weight & Biases using + the taskcluster task & group payloads + """ + task_id = os.environ.get("TASK_ID") + if not task_id: + raise Exception("Weight & Biases name detection can only run in taskcluster") + + # Load task & group definition + # CI task groups do not expose any configuration, so we must use default values + queue = taskcluster.Queue({"rootUrl": os.environ["TASKCLUSTER_PROXY_URL"]}) + task = queue.task(task_id) + _, task_name = build_task_name(task) + group_id = task["taskGroupId"] + task_group = queue.task(group_id) + config = task_group.get("extra", {}).get("action", {}).get("context", {}).get("input") + if config is None: + logger.warn( + f"Experiment configuration missing on {group_id} @ extra/action/context/input, fallback to CI values" + ) + experiment = { + "src": "ru", + "trg": "en", + "name": "ci", + } + else: + experiment = config["experiment"] + + # Build project, group and run names + return ( + f'{experiment["src"]}-{experiment["trg"]}', + f'{experiment["name"]}_{group_id}', + task_name, + ) + + +def get_wandb_publisher( + project_name=None, + group_name=None, + run_name=None, + taskcluster_secret=None, + artifacts=[], + tags=[], + logs_file=None, + publication=False, +): + # Load secret from Taskcluster and auto-configure naming + if taskcluster_secret: + assert os.environ.get( + "TASKCLUSTER_PROXY_URL" + ), "When using `--taskcluster-secret`, `TASKCLUSTER_PROXY_URL` environment variable must be set too." + + # Weight and Biases client use environment variable to read the token + os.environ.setdefault("WANDB_API_KEY", get_wandb_token(taskcluster_secret)) + + project_name, group_name, run_name = get_wandb_names() + + # Enable publication on weight and biases when project is set + # But prevent running when explicitly disabled by operator + if not publication: + logger.info( + "Skip weight & biases publication as requested by operator through WANDB_PUBLICATION" + ) + return + if not project_name: + logger.info("Skip weight & biases publication as project name is not set") + return + + # Build optional configuration with log file + config = {} + if logs_file: + config["logs_file"] = logs_file + + return WandB( + project=project_name, + group=group_name, + name=run_name, + artifacts=artifacts, + tags=tags, + config=config, + ) From bfc4e0b54f4069d25c3d4fd8dcd5b8df992345fc Mon Sep 17 00:00:00 2001 From: Bastien Abadie Date: Tue, 7 May 2024 08:46:42 +0200 Subject: [PATCH 03/14] Do not check taskcluwter when publication is disabled --- tracking/translations_parser/wandb.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tracking/translations_parser/wandb.py b/tracking/translations_parser/wandb.py index 92058c84a..93e948a0a 100644 --- a/tracking/translations_parser/wandb.py +++ b/tracking/translations_parser/wandb.py @@ -112,6 +112,12 @@ def get_wandb_publisher( logs_file=None, publication=False, ): + if not publication: + logger.info( + "Skip weight & biases publication as requested by operator through WANDB_PUBLICATION" + ) + return + # Load secret from Taskcluster and auto-configure naming if taskcluster_secret: assert os.environ.get( @@ -125,11 +131,6 @@ def get_wandb_publisher( # Enable publication on weight and biases when project is set # But prevent running when explicitly disabled by operator - if not publication: - logger.info( - "Skip weight & biases publication as requested by operator through WANDB_PUBLICATION" - ) - return if not project_name: logger.info("Skip weight & biases publication as project name is not set") return From f9c163eb1d1bf705e1dd5fde23138fdacd531160 Mon Sep 17 00:00:00 2001 From: Valentin Rigal Date: Mon, 13 May 2024 11:47:42 +0200 Subject: [PATCH 04/14] Publish evaluation metrics to W&B --- pipeline/eval/eval.py | 28 ++++++++++++++++++- taskcluster/kinds/evaluate-quantized/kind.yml | 1 + .../kinds/evaluate-teacher-ensemble/kind.yml | 1 + taskcluster/kinds/evaluate/kind.yml | 1 + 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/pipeline/eval/eval.py b/pipeline/eval/eval.py index 57b097d86..236a46977 100755 --- a/pipeline/eval/eval.py +++ b/pipeline/eval/eval.py @@ -55,6 +55,8 @@ logger = get_logger("eval") try: + from translations_parser.data import Metric + from translations_parser.utils import parse_tag from translations_parser.wandb import add_wandb_arguments, get_wandb_publisher WANDB_AVAILABLE = True @@ -143,6 +145,11 @@ def main(args_list: Optional[list[str]] = None) -> None: parser.add_argument( "--model_variant", type=str, help="The model variant to use, (gpu, cpu, quantized)" ) + parser.add_argument( + "--dataset", + type=str, + help="The name of the dataset (e.g. 'flores-aug-inline-noise_devtest')", + ) # Add Weight & Biases CLI args when module is loaded if WANDB_AVAILABLE: @@ -350,7 +357,26 @@ def main(args_list: Optional[list[str]] = None) -> None: artifacts=args.wandb_artifacts, publication=args.wandb_publication, ) - # TODO: publish scores + try: + # Build a valid task tag from the base dataset to detect importer, dataset aud augmentation. + # Set "teacher" as the default value for the tag, as the real model is already detected above. + _, importer, dataset, augmentation = parse_tag(f"eval_teacher_{args.dataset}") + except ValueError: + print( + "Metric could not be published to W&B because the dataset could not be parsed: {args.dataset}" + ) + else: + wandb.handle_metrics( + metrics=[ + Metric( + importer=importer, + dataset=dataset, + augmentation=augmentation, + chrf=chrf_details["score"], + bleu_detok=bleu_details["score"], + ) + ] + ) if __name__ == "__main__": diff --git a/taskcluster/kinds/evaluate-quantized/kind.yml b/taskcluster/kinds/evaluate-quantized/kind.yml index 4c3cb9e8d..e837ef309 100644 --- a/taskcluster/kinds/evaluate-quantized/kind.yml +++ b/taskcluster/kinds/evaluate-quantized/kind.yml @@ -103,6 +103,7 @@ tasks: --marian "$BMT_MARIAN" --gpus "$GPUS" --model_variant quantized + --dataset {dataset} dependencies: dataset: dataset-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale} diff --git a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml index 7924ef432..05b08e31a 100644 --- a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml +++ b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml @@ -108,6 +108,7 @@ tasks: --workspace "$WORKSPACE" --gpus "$GPUS" --model_variant gpu + --dataset {dataset} # Run after the teacher ensemble models are trained. from-deps: diff --git a/taskcluster/kinds/evaluate/kind.yml b/taskcluster/kinds/evaluate/kind.yml index cf80e81ef..a4437bdc7 100644 --- a/taskcluster/kinds/evaluate/kind.yml +++ b/taskcluster/kinds/evaluate/kind.yml @@ -105,6 +105,7 @@ task-defaults: --workspace "$WORKSPACE" --gpus "$GPUS" --model_variant gpu + --dataset {dataset} tasks: backward-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale}: From 9009544958e4c95f4cbaf39c9e50790b638d1851 Mon Sep 17 00:00:00 2001 From: Valentin Rigal Date: Tue, 14 May 2024 13:12:14 +0200 Subject: [PATCH 05/14] Fix running eval tracking on CI --- taskcluster/kinds/evaluate-quantized/kind.yml | 5 +++-- taskcluster/kinds/evaluate-teacher-ensemble/kind.yml | 5 +++-- taskcluster/kinds/evaluate/kind.yml | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/taskcluster/kinds/evaluate-quantized/kind.yml b/taskcluster/kinds/evaluate-quantized/kind.yml index e837ef309..d216b15d7 100644 --- a/taskcluster/kinds/evaluate-quantized/kind.yml +++ b/taskcluster/kinds/evaluate-quantized/kind.yml @@ -84,10 +84,11 @@ tasks: - bash - -c - >- - pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt && + export PATH=$PATH:~/.local/bin && export PYTHONPATH=$PYTHONPATH:$VCS_PATH && + pip install --upgrade pip + pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt && pip install $VCS_PATH/tracking && - export PATH=$PATH:~/.local/bin && zstd --rm -d $MOZ_FETCHES_DIR/lex.s2t.pruned.zst && $VCS_PATH/pipeline/eval/eval.py --src {src_locale} diff --git a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml index 05b08e31a..0f7361a67 100644 --- a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml +++ b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml @@ -89,10 +89,11 @@ tasks: - bash - -c - >- - pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt && + export PATH=$PATH:~/.local/bin && export PYTHONPATH=$PYTHONPATH:$VCS_PATH && + pip install --upgrade pip && + pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt && pip install $VCS_PATH/tracking && - export PATH=$PATH:~/.local/bin && sed -i -e "s,- .*fetches,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml && sed -i -e "s,- .*artifacts,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml && $VCS_PATH/pipeline/eval/eval.py diff --git a/taskcluster/kinds/evaluate/kind.yml b/taskcluster/kinds/evaluate/kind.yml index a4437bdc7..096bf5f32 100644 --- a/taskcluster/kinds/evaluate/kind.yml +++ b/taskcluster/kinds/evaluate/kind.yml @@ -87,10 +87,11 @@ task-defaults: - bash - -c - >- - pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt && - pip install $VCS_PATH/tracking && export PATH=$PATH:~/.local/bin && export PYTHONPATH=$PYTHONPATH:$VCS_PATH && + pip install --upgrade pip && + pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt && + pip install $VCS_PATH/tracking && sed -i -e "s,- .*fetches,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml && sed -i -e "s,- .*artifacts,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml && $VCS_PATH/pipeline/eval/eval.py From eb7d6db098d41cba9ede7a3148fa22c9ed3bc148 Mon Sep 17 00:00:00 2001 From: Valentin Rigal Date: Wed, 15 May 2024 16:22:15 +0200 Subject: [PATCH 06/14] Use args.wandb_run_name instead of default teacher --- pipeline/eval/eval.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pipeline/eval/eval.py b/pipeline/eval/eval.py index 236a46977..9ac74b277 100755 --- a/pipeline/eval/eval.py +++ b/pipeline/eval/eval.py @@ -358,12 +358,13 @@ def main(args_list: Optional[list[str]] = None) -> None: publication=args.wandb_publication, ) try: - # Build a valid task tag from the base dataset to detect importer, dataset aud augmentation. - # Set "teacher" as the default value for the tag, as the real model is already detected above. - _, importer, dataset, augmentation = parse_tag(f"eval_teacher_{args.dataset}") + _, importer, dataset, augmentation = parse_tag( + f"eval_{args.wandb_run_name}_{args.dataset}" + ) except ValueError: print( - "Metric could not be published to W&B because the dataset could not be parsed: {args.dataset}" + "Metric could not be published to W&B because the dataset could not be parsed from tag " + f"'eval_{args.wandb_run_name}_{args.dataset}'." ) else: wandb.handle_metrics( From 0819fcc8db600a942dbac7c0fc9f8ddf154d163b Mon Sep 17 00:00:00 2001 From: Valentin Rigal Date: Fri, 17 May 2024 13:56:26 +0200 Subject: [PATCH 07/14] Remove duplicated arguments --- .../translations_parser/cli/taskcluster.py | 40 ------------------- 1 file changed, 40 deletions(-) diff --git a/tracking/translations_parser/cli/taskcluster.py b/tracking/translations_parser/cli/taskcluster.py index 9158f9d55..59d374b68 100644 --- a/tracking/translations_parser/cli/taskcluster.py +++ b/tracking/translations_parser/cli/taskcluster.py @@ -51,46 +51,6 @@ def get_args() -> argparse.Namespace: type=Path, default=Path(__file__).parent.parent / "output", ) - parser.add_argument( - "--wandb-project", - help="Publish the training run to a Weight & Biases project.", - default=None, - ) - parser.add_argument( - "--wandb-artifacts", - help="Directory containing training artifacts to publish on Weight & Biases.", - type=Path, - default=None, - ) - parser.add_argument( - "--wandb-group", - help="Add the training run to a Weight & Biases group e.g. by language pair or experiment.", - default=None, - ) - parser.add_argument( - "--wandb-run-name", - help="Use a custom name for the Weight & Biases run.", - default=None, - ) - parser.add_argument( - "--wandb-publication", - action="store_true", - help="Trigger publication on Weight & Biases. Disabled by default. Can be set though env variable WANDB_PUBLICATION=True|False", - default=os.environ.get("WANDB_PUBLICATION", "false").lower() == "true", - ) - parser.add_argument( - "--taskcluster-secret", - help="Taskcluster secret name used to store the Weight & Biases secret API Key.", - type=str, - default=os.environ.get("TASKCLUSTER_SECRET"), - ) - parser.add_argument( - "--tags", - help="List of tags to use on Weight & Biases publication", - type=str, - default=["taskcluster"], - nargs="+", - ) parser.add_argument( "--verbose", "-v", From 990943d831c5881ae636d3648667d23650a7517d Mon Sep 17 00:00:00 2001 From: Valentin Rigal Date: Mon, 20 May 2024 20:53:36 +0200 Subject: [PATCH 08/14] Retrieve dataset from Taskcluster directly --- pipeline/eval/eval.py | 37 +++++-------------- taskcluster/kinds/evaluate-quantized/kind.yml | 1 - .../kinds/evaluate-teacher-ensemble/kind.yml | 1 - taskcluster/kinds/evaluate/kind.yml | 1 - tracking/translations_parser/utils.py | 27 ++++++++++++++ 5 files changed, 36 insertions(+), 31 deletions(-) diff --git a/pipeline/eval/eval.py b/pipeline/eval/eval.py index 9ac74b277..bc8e832c4 100755 --- a/pipeline/eval/eval.py +++ b/pipeline/eval/eval.py @@ -55,8 +55,7 @@ logger = get_logger("eval") try: - from translations_parser.data import Metric - from translations_parser.utils import parse_tag + from translations_parser.utils import metric_from_tc_context from translations_parser.wandb import add_wandb_arguments, get_wandb_publisher WANDB_AVAILABLE = True @@ -145,11 +144,6 @@ def main(args_list: Optional[list[str]] = None) -> None: parser.add_argument( "--model_variant", type=str, help="The model variant to use, (gpu, cpu, quantized)" ) - parser.add_argument( - "--dataset", - type=str, - help="The name of the dataset (e.g. 'flores-aug-inline-noise_devtest')", - ) # Add Weight & Biases CLI args when module is loaded if WANDB_AVAILABLE: @@ -357,27 +351,14 @@ def main(args_list: Optional[list[str]] = None) -> None: artifacts=args.wandb_artifacts, publication=args.wandb_publication, ) - try: - _, importer, dataset, augmentation = parse_tag( - f"eval_{args.wandb_run_name}_{args.dataset}" - ) - except ValueError: - print( - "Metric could not be published to W&B because the dataset could not be parsed from tag " - f"'eval_{args.wandb_run_name}_{args.dataset}'." - ) - else: - wandb.handle_metrics( - metrics=[ - Metric( - importer=importer, - dataset=dataset, - augmentation=augmentation, - chrf=chrf_details["score"], - bleu_detok=bleu_details["score"], - ) - ] - ) + wandb.handle_metrics( + metrics=[ + metric_from_tc_context( + chrf=chrf_details["score"], + bleu_detok=bleu_details["score"], + ) + ] + ) if __name__ == "__main__": diff --git a/taskcluster/kinds/evaluate-quantized/kind.yml b/taskcluster/kinds/evaluate-quantized/kind.yml index d216b15d7..e133d2b10 100644 --- a/taskcluster/kinds/evaluate-quantized/kind.yml +++ b/taskcluster/kinds/evaluate-quantized/kind.yml @@ -104,7 +104,6 @@ tasks: --marian "$BMT_MARIAN" --gpus "$GPUS" --model_variant quantized - --dataset {dataset} dependencies: dataset: dataset-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale} diff --git a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml index 0f7361a67..08ff1b465 100644 --- a/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml +++ b/taskcluster/kinds/evaluate-teacher-ensemble/kind.yml @@ -109,7 +109,6 @@ tasks: --workspace "$WORKSPACE" --gpus "$GPUS" --model_variant gpu - --dataset {dataset} # Run after the teacher ensemble models are trained. from-deps: diff --git a/taskcluster/kinds/evaluate/kind.yml b/taskcluster/kinds/evaluate/kind.yml index 096bf5f32..1d6f3378a 100644 --- a/taskcluster/kinds/evaluate/kind.yml +++ b/taskcluster/kinds/evaluate/kind.yml @@ -106,7 +106,6 @@ task-defaults: --workspace "$WORKSPACE" --gpus "$GPUS" --model_variant gpu - --dataset {dataset} tasks: backward-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale}: diff --git a/tracking/translations_parser/utils.py b/tracking/translations_parser/utils.py index 58581cffb..75d00a18e 100644 --- a/tracking/translations_parser/utils.py +++ b/tracking/translations_parser/utils.py @@ -1,9 +1,12 @@ import logging +import os import re from collections.abc import Sequence from datetime import datetime from typing import NamedTuple, Optional +import taskcluster + logger = logging.getLogger(__name__) # Keywords used to split eval filenames into model and dataset @@ -149,3 +152,27 @@ def build_task_name(task: dict): prefix = task["tags"]["kind"].split("-")[0] label = parse_task_label(task["tags"]["label"]) return prefix, label.model + + +def metric_from_tc_context(chrf: float, bleu: float): + """ + Find the various names needed to build a metric directly from a Taskcluster task + """ + from translations_parser.data import Metric + + task_id = os.environ.get("TASK_ID") + if not task_id: + raise Exception("Evaluation metric can only be build in taskcluster") + + # CI task groups do not expose any configuration, so we must use default values + queue = taskcluster.Queue({"rootUrl": os.environ["TASKCLUSTER_PROXY_URL"]}) + task = queue.task(task_id) + parsed = parse_task_label(task["tags"]["label"]) + + return Metric( + importer=parsed.importer, + dataset=parsed.dataset, + augmentation=parsed.augmentation, + chrf=chrf, + bleu_detok=bleu, + ) From 7c86b899ae60c11dc521456cefda8cf63ba036e3 Mon Sep 17 00:00:00 2001 From: Valentin Rigal Date: Tue, 21 May 2024 08:29:19 +0200 Subject: [PATCH 09/14] Add missing calls to publisher and logging --- pipeline/eval/eval.py | 14 ++++++-------- taskcluster/kinds/evaluate-quantized/kind.yml | 2 +- tracking/translations_parser/publishers.py | 17 ++++++++--------- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/pipeline/eval/eval.py b/pipeline/eval/eval.py index bc8e832c4..de43422f9 100755 --- a/pipeline/eval/eval.py +++ b/pipeline/eval/eval.py @@ -351,14 +351,12 @@ def main(args_list: Optional[list[str]] = None) -> None: artifacts=args.wandb_artifacts, publication=args.wandb_publication, ) - wandb.handle_metrics( - metrics=[ - metric_from_tc_context( - chrf=chrf_details["score"], - bleu_detok=bleu_details["score"], - ) - ] - ) + logger.info("Initializing Weight & Biases client") + wandb.open() + logger.info(f"Publishing metrics to Weight & Biases ({wandb.extra_kwargs})") + metric = metric_from_tc_context(chrf=chrf_details["score"], bleu=bleu_details["score"]) + wandb.handle_metrics(metrics=[metric]) + wandb.close() if __name__ == "__main__": diff --git a/taskcluster/kinds/evaluate-quantized/kind.yml b/taskcluster/kinds/evaluate-quantized/kind.yml index e133d2b10..ce0c30341 100644 --- a/taskcluster/kinds/evaluate-quantized/kind.yml +++ b/taskcluster/kinds/evaluate-quantized/kind.yml @@ -86,7 +86,7 @@ tasks: - >- export PATH=$PATH:~/.local/bin && export PYTHONPATH=$PYTHONPATH:$VCS_PATH && - pip install --upgrade pip + pip install --upgrade pip && pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt && pip install $VCS_PATH/tracking && zstd --rm -d $MOZ_FETCHES_DIR/lex.s2t.pruned.zst && diff --git a/tracking/translations_parser/publishers.py b/tracking/translations_parser/publishers.py index b3f03a338..5112819ea 100644 --- a/tracking/translations_parser/publishers.py +++ b/tracking/translations_parser/publishers.py @@ -92,11 +92,9 @@ def __init__( self.parser: TrainingParser | None = None self.wandb: wandb.sdk.wandb_run.Run | wandb.sdk.lib.disabled.RunDisabled | None = None - def open(self, parser) -> None: - if parser is None or self.parser is not None: - return + def open(self, parser=None) -> None: self.parser = parser - config = parser.config + config = getattr(parser, "config", {}) config.update(self.extra_kwargs.pop("config", {})) try: @@ -188,12 +186,13 @@ def handle_metrics(self, metrics: Sequence[Metric]) -> None: ) def close(self) -> None: - if self.wandb is None or self.parser is None: + if self.wandb is None: return - # Store runtime logs as the main log artifact - # This will be overwritten in case an unhandled exception occurs - with (Path(self.wandb.dir) / "output.log").open("w") as f: - f.write(self.parser.logs_str) + if self.parser is not None: + # Store runtime logs as the main log artifact + # This will be overwritten in case an unhandled exception occurs + with (Path(self.wandb.dir) / "output.log").open("w") as f: + f.write(self.parser.logs_str) # Publish artifacts if self.artifacts: From d3d5ec3d37167f0b82f2a3b6dcea421cdbe56795 Mon Sep 17 00:00:00 2001 From: Valentin Rigal Date: Tue, 21 May 2024 11:15:00 +0200 Subject: [PATCH 10/14] Allow publishing metrics as a table on existing runs (i.e. previous trainings) --- pipeline/eval/eval.py | 3 ++- tracking/translations_parser/publishers.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pipeline/eval/eval.py b/pipeline/eval/eval.py index de43422f9..a8d4f7897 100755 --- a/pipeline/eval/eval.py +++ b/pipeline/eval/eval.py @@ -352,7 +352,8 @@ def main(args_list: Optional[list[str]] = None) -> None: publication=args.wandb_publication, ) logger.info("Initializing Weight & Biases client") - wandb.open() + # Allow publishing metrics as a table on existing runs (i.e. previous trainings) + wandb.open(resume=True) logger.info(f"Publishing metrics to Weight & Biases ({wandb.extra_kwargs})") metric = metric_from_tc_context(chrf=chrf_details["score"], bleu=bleu_details["score"]) wandb.handle_metrics(metrics=[metric]) diff --git a/tracking/translations_parser/publishers.py b/tracking/translations_parser/publishers.py index 5112819ea..20807d9aa 100644 --- a/tracking/translations_parser/publishers.py +++ b/tracking/translations_parser/publishers.py @@ -92,7 +92,7 @@ def __init__( self.parser: TrainingParser | None = None self.wandb: wandb.sdk.wandb_run.Run | wandb.sdk.lib.disabled.RunDisabled | None = None - def open(self, parser=None) -> None: + def open(self, parser=None, resume: bool = False) -> None: self.parser = parser config = getattr(parser, "config", {}) config.update(self.extra_kwargs.pop("config", {})) @@ -119,7 +119,7 @@ def open(self, parser=None) -> None: elif len(existing_runs) == 1: run = existing_runs[0] # Avoid overriding an existing run on a first training, this should not happen - if int(os.environ.get("RUN_ID", 0)) < 1: + if resume is False and int(os.environ.get("RUN_ID", 0)) < 1: logger.warning( f"A W&B run already exists with name '{name}': {run}. No data will be published." ) From 782a8d772e80b10e3368f47f313bd57855392361 Mon Sep 17 00:00:00 2001 From: Valentin Rigal Date: Wed, 22 May 2024 12:50:15 +0200 Subject: [PATCH 11/14] Update regex to parse labels ending with '-1' --- tests/test_tracking_utils.py | 4 ++++ tracking/translations_parser/utils.py | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/test_tracking_utils.py b/tests/test_tracking_utils.py index 82cc139dc..5761e624b 100644 --- a/tests/test_tracking_utils.py +++ b/tests/test_tracking_utils.py @@ -76,6 +76,10 @@ "evaluate-student-sacrebleu-wmt19-ast-en", ("student", "sacrebleu", "wmt19", None), ), + ( + "evaluate-teacher-flores-devtest-ru-en-1", + ("teacher-1", "flores", "devtest", None), + ), ], ) def test_parse_task_label(task_label, parsed_values): diff --git a/tracking/translations_parser/utils.py b/tracking/translations_parser/utils.py index 75d00a18e..2aa6ab5e2 100644 --- a/tracking/translations_parser/utils.py +++ b/tracking/translations_parser/utils.py @@ -83,6 +83,11 @@ # evaluate-quantized-mtdata_aug-mix_Neulab-tedtalks_eng-lit-lt-en # ^^^^^^^^^^^^^^^^^^^^^^^ r"_?(?P[-\w\d_]*?(-[a-z]{3}-[a-z]{3})?)?" + # + # Match language (project) suffix. + # evaluate-teacher-flores-devtest-ru-en-1 + # ^^^^^ + # r"-?(?P[a-z]{2,3}-[a-z]{2,3})?" # # Match the task chunking, for instance: @@ -90,7 +95,7 @@ # ^ # evaluate-teacher-flores-flores_aug-title_devtest-lt-en-1_2 # ^ - r"-?((?P\d+)(\/|_)\d+)?" + r"(-(?P\d+)([\/|_]\d+)?)?" # r"$" ) From 468f6feb2dd8d0a9803e52a8f3ff808b6bf46330 Mon Sep 17 00:00:00 2001 From: Valentin Rigal Date: Wed, 22 May 2024 13:00:50 +0200 Subject: [PATCH 12/14] Generic support for train/eval different naming --- .../translations_parser/cli/taskcluster_group.py | 13 +++---------- tracking/translations_parser/utils.py | 7 ++++++- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/tracking/translations_parser/cli/taskcluster_group.py b/tracking/translations_parser/cli/taskcluster_group.py index 6204b2465..8edfa59d9 100644 --- a/tracking/translations_parser/cli/taskcluster_group.py +++ b/tracking/translations_parser/cli/taskcluster_group.py @@ -8,6 +8,7 @@ import argparse import logging +import re import tempfile from collections import defaultdict from pathlib import Path @@ -20,8 +21,9 @@ from translations_parser.data import Metric from translations_parser.parser import TrainingParser, logger from translations_parser.publishers import WandB -from translations_parser.utils import MULTIPLE_TRAIN_SUFFIX, build_task_name, parse_task_label +from translations_parser.utils import build_task_name, parse_task_label +MULTIPLE_TRAIN_SUFFIX = re.compile(r"(-\d+)/\d+$") KIND_TAG_TARGET = ("train", "finetune") queue = taskcluster.Queue({"rootUrl": "https://firefox-ci-tc.services.mozilla.com"}) @@ -230,15 +232,6 @@ def publish_task_group(group_id: str, override: bool = False) -> None: except ValueError: continue - if eval_label and (re_match := MULTIPLE_TRAIN_SUFFIX.search(eval_label)): - (suffix,) = re_match.groups() - model_name += suffix - - # Training task may be named differently from the evaluation tasks, use training name by default - model_name = model_name.replace("finetuned", "finetune") - if model_name == "backward": - model_name = "backwards" - # Evaluation tasks must be a dependency of the run and match its name if ( training_task["status"]["taskId"] in eval_task["task"]["dependencies"] diff --git a/tracking/translations_parser/utils.py b/tracking/translations_parser/utils.py index 2aa6ab5e2..ebfeab75c 100644 --- a/tracking/translations_parser/utils.py +++ b/tracking/translations_parser/utils.py @@ -99,7 +99,6 @@ # r"$" ) -MULTIPLE_TRAIN_SUFFIX = re.compile(r"(-\d+)/\d+$") class ParsedTaskLabel(NamedTuple): @@ -122,6 +121,12 @@ def parse_task_label(task_label: str) -> ParsedTaskLabel: raise ValueError(task_label) groups = match.groupdict() model = groups["model"] + + # Naming may be inconsistent between train and evaluation tasks + model = model.replace("finetuned", "finetune") + if model == "backward": + model = "backwards" + suffix = groups.get("suffix") or groups.get("task_suffix") if not suffix and model == "teacher": # Keep the index on teacher runs for compatibility with legacy models From 044d93b5939b50114f27c4101dcb6c993c3dfdfd Mon Sep 17 00:00:00 2001 From: Valentin Rigal Date: Wed, 22 May 2024 15:54:00 +0200 Subject: [PATCH 13/14] Update tests --- tests/test_tracking_cli.py | 4 ++-- tests/test_tracking_utils.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_tracking_cli.py b/tests/test_tracking_cli.py index f9188010a..d8ce40d19 100644 --- a/tests/test_tracking_cli.py +++ b/tests/test_tracking_cli.py @@ -123,9 +123,9 @@ def test_experiments_marian_1_10(wandb_mock, getargs_mock, caplog, samples_dir, ), (logging.INFO, "Found 2 quantized metrics from speed folder"), (logging.INFO, "Found 16 metrics from task logs"), - (logging.INFO, "Creating missing run backward with associated metrics"), + (logging.INFO, "Creating missing run backwards with associated metrics"), (logging.INFO, "Creating missing run quantized with associated metrics"), - (logging.INFO, "Creating missing run student-finetuned with associated metrics"), + (logging.INFO, "Creating missing run student-finetune with associated metrics"), (logging.INFO, "Creating missing run teacher-base-0 with associated metrics"), (logging.INFO, "Creating missing run teacher-base-1 with associated metrics"), (logging.INFO, "Creating missing run teacher-ensemble with associated metrics"), diff --git a/tests/test_tracking_utils.py b/tests/test_tracking_utils.py index 5761e624b..79f2be668 100644 --- a/tests/test_tracking_utils.py +++ b/tests/test_tracking_utils.py @@ -33,7 +33,7 @@ ), ( "eval_student-finetuned_flores_devtest", - ("student-finetuned", "flores", "devtest", None), + ("student-finetune", "flores", "devtest", None), ), ( "eval_teacher-base0_flores_devtest", @@ -65,7 +65,7 @@ ), ( "evaluate-backward-url-gcp_pytest-dataset_a0017e-en-ru", - ("backward", "url", "gcp_pytest-dataset_a0017e", None), + ("backwards", "url", "gcp_pytest-dataset_a0017e", None), ), ( "train-teacher-ast-en-1", From 53a012ce571f8343f90dc4ef45d5f067facd59d6 Mon Sep 17 00:00:00 2001 From: Valentin Rigal Date: Wed, 22 May 2024 17:52:47 +0200 Subject: [PATCH 14/14] Support disabled publication --- pipeline/eval/eval.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pipeline/eval/eval.py b/pipeline/eval/eval.py index a8d4f7897..31c5e43d7 100755 --- a/pipeline/eval/eval.py +++ b/pipeline/eval/eval.py @@ -351,13 +351,14 @@ def main(args_list: Optional[list[str]] = None) -> None: artifacts=args.wandb_artifacts, publication=args.wandb_publication, ) - logger.info("Initializing Weight & Biases client") - # Allow publishing metrics as a table on existing runs (i.e. previous trainings) - wandb.open(resume=True) - logger.info(f"Publishing metrics to Weight & Biases ({wandb.extra_kwargs})") - metric = metric_from_tc_context(chrf=chrf_details["score"], bleu=bleu_details["score"]) - wandb.handle_metrics(metrics=[metric]) - wandb.close() + if wandb: + logger.info("Initializing Weight & Biases client") + # Allow publishing metrics as a table on existing runs (i.e. previous trainings) + wandb.open(resume=True) + logger.info(f"Publishing metrics to Weight & Biases ({wandb.extra_kwargs})") + metric = metric_from_tc_context(chrf=chrf_details["score"], bleu=bleu_details["score"]) + wandb.handle_metrics(metrics=[metric]) + wandb.close() if __name__ == "__main__":