Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Publish evaluation metrics #598

Merged
merged 15 commits into from
May 22, 2024
Merged
31 changes: 31 additions & 0 deletions pipeline/eval/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@
from pipeline.common.logging import get_logger

logger = get_logger("eval")
# Weights & Biases publishing is optional: it relies on the tracking package
# (translations_parser), which may not be installed. If the import fails, the
# error is printed and WANDB_AVAILABLE is set to False; the code below checks
# this flag before adding the W&B CLI arguments or publishing metrics.
try:
from translations_parser.utils import metric_from_tc_context
from translations_parser.wandb import add_wandb_arguments, get_wandb_publisher

WANDB_AVAILABLE = True
except ImportError as e:
print(f"Failed to import tracking module: {e}")
WANDB_AVAILABLE = False


def run_bash_oneliner(command: str):
Expand Down Expand Up @@ -136,6 +144,11 @@ def main(args_list: Optional[list[str]] = None) -> None:
parser.add_argument(
"--model_variant", type=str, help="The model variant to use, (gpu, cpu, quantized)"
)

# Add Weights & Biases CLI args when the tracking module is loaded
if WANDB_AVAILABLE:
add_wandb_arguments(parser)
vrigal marked this conversation as resolved.
Show resolved Hide resolved

args = parser.parse_args(args_list)

src = args.src
Expand Down Expand Up @@ -329,6 +342,24 @@ def main(args_list: Optional[list[str]] = None) -> None:
with open(metrics_file, "w") as file:
file.write(f"{bleu_details['score']}\n" f"{chrf_details['score']}\n" f"{comet_score}\n")

if WANDB_AVAILABLE:
wandb = get_wandb_publisher( # noqa
project_name=args.wandb_project,
group_name=args.wandb_group,
run_name=args.wandb_run_name,
taskcluster_secret=args.taskcluster_secret,
artifacts=args.wandb_artifacts,
publication=args.wandb_publication,
)
if wandb:
logger.info("Initializing Weight & Biases client")
# Allow publishing metrics as a table on existing runs (i.e. previous trainings)
wandb.open(resume=True)
logger.info(f"Publishing metrics to Weight & Biases ({wandb.extra_kwargs})")
metric = metric_from_tc_context(chrf=chrf_details["score"], bleu=bleu_details["score"])
wandb.handle_metrics(metrics=[metric])
wandb.close()


if __name__ == "__main__":
main()
21 changes: 19 additions & 2 deletions taskcluster/kinds/evaluate-quantized/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,11 @@ tasks:
substitution-fields:
- run.command
- fetches
- worker.env
from-parameters:
src_locale: training_config.experiment.src
trg_locale: training_config.experiment.trg
wandb_publication: training_config.wandb-publication

worker-type: b-gpu
worker:
Expand All @@ -60,6 +62,19 @@ tasks:
# This is a separate environment variable so tests can override it.
BMT_MARIAN: $MOZ_FETCHES_DIR

# Weights & Biases trigger
WANDB_PUBLICATION: "{wandb_publication}"

# The Weights & Biases publication token is stored in this secret
TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases

# Taskcluster proxy is required to read secrets
taskcluster-proxy: true

# The task needs to be able to read this secret to publish to Weights & Biases
scopes:
- secrets:get:project/translations/level-1/weights-and-biases

# Don't run unless explicitly scheduled
run-on-tasks-for: []

Expand All @@ -69,9 +84,11 @@ tasks:
- bash
- -c
- >-
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
export PATH=$PATH:~/.local/bin &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
pip install --upgrade pip &&
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
pip install $VCS_PATH/tracking &&
zstd --rm -d $MOZ_FETCHES_DIR/lex.s2t.pruned.zst &&
$VCS_PATH/pipeline/eval/eval.py
--src {src_locale}
Expand Down
22 changes: 20 additions & 2 deletions taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,13 @@ tasks:
substitution-fields:
- fetches
- run.command
- worker.env
from-parameters:
best_model: training_config.experiment.best-model
src_locale: training_config.experiment.src
trg_locale: training_config.experiment.trg
wandb_publication: training_config.wandb-publication

worker-type: b-gpu
worker:
artifacts:
Expand All @@ -58,6 +61,19 @@ tasks:
# This is a separate environment variable so tests can override it.
MARIAN: $MOZ_FETCHES_DIR

# Weights & Biases trigger
WANDB_PUBLICATION: "{wandb_publication}"

# The Weights & Biases publication token is stored in this secret
TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases

# Taskcluster proxy is required to read secrets
taskcluster-proxy: true

# The task needs to be able to read this secret to publish to Weights & Biases
scopes:
- secrets:get:project/translations/level-1/weights-and-biases

# Don't run unless explicitly scheduled
run-on-tasks-for: []

Expand All @@ -73,9 +89,11 @@ tasks:
- bash
- -c
- >-
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
export PATH=$PATH:~/.local/bin &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
pip install --upgrade pip &&
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
pip install $VCS_PATH/tracking &&
sed -i -e "s,- .*fetches,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
sed -i -e "s,- .*artifacts,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
$VCS_PATH/pipeline/eval/eval.py
Expand Down
19 changes: 18 additions & 1 deletion taskcluster/kinds/evaluate/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,13 @@ task-defaults:
task-context:
substitution-fields:
- run.command
- worker.env
from-parameters:
best_model: training_config.experiment.best-model
src_locale: training_config.experiment.src
trg_locale: training_config.experiment.trg
split_chunks: training_config.experiment.teacher-ensemble
wandb_publication: training_config.wandb-publication
worker-type: b-gpu
worker:
artifacts:
Expand All @@ -57,6 +59,19 @@ task-defaults:
# This is a separate environment variable so tests can override it.
MARIAN: $MOZ_FETCHES_DIR

# Weights & Biases trigger
WANDB_PUBLICATION: "{wandb_publication}"

# The Weights & Biases publication token is stored in this secret
TASKCLUSTER_SECRET: project/translations/level-1/weights-and-biases

# Taskcluster proxy is required to read secrets
taskcluster-proxy: true

# The task needs to be able to read this secret to publish to Weights & Biases
scopes:
- secrets:get:project/translations/level-1/weights-and-biases

# Don't run unless explicitly scheduled
run-on-tasks-for: []

Expand All @@ -72,9 +87,11 @@ task-defaults:
- bash
- -c
- >-
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
export PATH=$PATH:~/.local/bin &&
export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
pip install --upgrade pip &&
pip install -r $VCS_PATH/pipeline/eval/requirements/eval.txt &&
pip install $VCS_PATH/tracking &&
sed -i -e "s,- .*fetches,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
sed -i -e "s,- .*artifacts,- $MOZ_FETCHES_DIR," $TASK_WORKDIR/fetches/*.yml &&
$VCS_PATH/pipeline/eval/eval.py
Expand Down
4 changes: 2 additions & 2 deletions tests/test_tracking_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,9 @@ def test_experiments_marian_1_10(wandb_mock, getargs_mock, caplog, samples_dir,
),
(logging.INFO, "Found 2 quantized metrics from speed folder"),
(logging.INFO, "Found 16 metrics from task logs"),
(logging.INFO, "Creating missing run backward with associated metrics"),
(logging.INFO, "Creating missing run backwards with associated metrics"),
(logging.INFO, "Creating missing run quantized with associated metrics"),
(logging.INFO, "Creating missing run student-finetuned with associated metrics"),
(logging.INFO, "Creating missing run student-finetune with associated metrics"),
(logging.INFO, "Creating missing run teacher-base-0 with associated metrics"),
(logging.INFO, "Creating missing run teacher-base-1 with associated metrics"),
(logging.INFO, "Creating missing run teacher-ensemble with associated metrics"),
Expand Down
8 changes: 6 additions & 2 deletions tests/test_tracking_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
),
(
"eval_student-finetuned_flores_devtest",
("student-finetuned", "flores", "devtest", None),
("student-finetune", "flores", "devtest", None),
),
(
"eval_teacher-base0_flores_devtest",
Expand Down Expand Up @@ -65,7 +65,7 @@
),
(
"evaluate-backward-url-gcp_pytest-dataset_a0017e-en-ru",
("backward", "url", "gcp_pytest-dataset_a0017e", None),
("backwards", "url", "gcp_pytest-dataset_a0017e", None),
),
(
"train-teacher-ast-en-1",
Expand All @@ -76,6 +76,10 @@
"evaluate-student-sacrebleu-wmt19-ast-en",
("student", "sacrebleu", "wmt19", None),
),
(
"evaluate-teacher-flores-devtest-ru-en-1",
("teacher-1", "flores", "devtest", None),
),
],
)
def test_parse_task_label(task_label, parsed_values):
Expand Down
Loading