Remove try catch in eval.py; make model_gauntlet optional in eval.py #434

Merged
merged 5 commits on Jul 7, 2023
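
This PR drops the per-model try/except in `scripts/eval/eval.py` and makes the `model_gauntlet` config key optional: when the key is absent, both `model_gauntlet` and `model_gauntlet_callback` are set to `None` and the gauntlet-specific reporting is skipped. Below is a minimal, self-contained sketch of that control flow; `DummyGauntlet`, `resolve_gauntlet`, and `run_eval` are hypothetical stand-ins for illustration, not code from the repository.

```python
# Sketch only: illustrates the two behaviors this PR introduces.
# `DummyGauntlet`, `resolve_gauntlet`, and `run_eval` are hypothetical
# stand-ins, not objects from llm-foundry.
from types import SimpleNamespace
from typing import Optional, Tuple


class DummyGauntlet:
    """Stand-in for the real ModelGauntlet callback."""

    def __init__(self, categories: list):
        self.categories = categories


def resolve_gauntlet(
        cfg: SimpleNamespace) -> Tuple[Optional[dict], Optional[DummyGauntlet]]:
    # model_gauntlet is optional: build the callback only when the key exists.
    if hasattr(cfg, 'model_gauntlet'):
        gauntlet = cfg.model_gauntlet  # already a parsed dict in this sketch
        return gauntlet, DummyGauntlet(gauntlet.get('categories', []))
    # New in this PR: the callback is explicitly None as well, so later code
    # can simply check `if model_gauntlet_callback is not None`.
    return None, None


def run_eval(cfg: SimpleNamespace) -> None:
    for model_cfg in cfg.models:
        # No try/except here anymore: a failing model aborts the run
        # instead of being logged and skipped.
        gauntlet, callback = resolve_gauntlet(cfg)
        print(f"{model_cfg['model_name']}: gauntlet "
              f"{'enabled' if callback is not None else 'disabled'}")


if __name__ == '__main__':
    run_eval(SimpleNamespace(models=[{'model_name': 'mpt-7b'}]))
```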
63 changes: 26 additions & 37 deletions scripts/eval/eval.py
@@ -5,7 +5,6 @@
import re
import sys
import time
import traceback
from typing import List

import pandas as pd
@@ -46,7 +45,6 @@ def evaluate_model(model_cfg, run_name, model_gauntlet_df):
evaluators, logger_keys = build_icl_evaluators(cfg.icl_tasks, tokenizer,
cfg.max_seq_len,
cfg.device_eval_batch_size)

if hasattr(cfg, 'model_gauntlet'):
if isinstance(cfg.model_gauntlet, str):
with open(cfg.model_gauntlet, 'r') as icl_f:
@@ -61,6 +59,7 @@ def evaluate_model(model_cfg, run_name, model_gauntlet_df):
model_gauntlet_callback = ModelGauntlet(**model_gauntlet)
else:
model_gauntlet = None
model_gauntlet_callback = None

composer_model = load_model(model_cfg.model, tokenizer,
cfg.get('num_retries', 3))
@@ -119,58 +118,48 @@ def main(cfg):
model_gauntlet_df = None
models_df = None
for model_cfg in cfg.models:
(in_memory_logger, logger_keys, model_gauntlet_callback, model_gauntlet,
model_gauntlet_df) = evaluate_model(model_cfg, cfg.run_name,
model_gauntlet_df)

try:
(in_memory_logger, logger_keys, model_gauntlet_callback,
model_gauntlet,
model_gauntlet_df) = evaluate_model(model_cfg, cfg.run_name,
model_gauntlet_df)

if model_gauntlet_callback is not None:
composite_scores = model_gauntlet_callback.eval_end(
None, in_memory_logger)

benchmark_to_taxonomy = {}
benchmark_to_taxonomy = {}
if model_gauntlet is not None:
for t in model_gauntlet.categories:
for b in t.benchmarks:
benchmark_to_taxonomy[b.name] = t.name

model_results = calculate_markdown_results(logger_keys,
in_memory_logger.data,
benchmark_to_taxonomy,
model_cfg.model_name)
model_results = calculate_markdown_results(logger_keys,
in_memory_logger.data,
benchmark_to_taxonomy,
model_cfg.model_name)

if models_df is None:
models_df = model_results
else:
models_df = pd.concat([models_df, model_results],
ignore_index=True)
if models_df is None:
models_df = model_results
else:
models_df = pd.concat([models_df, model_results], ignore_index=True)

if model_gauntlet_df is not None and model_gauntlet is not None and model_gauntlet_df is not None:
row = {'model_name': model_cfg['model_name']}

row.update({
t.name: composite_scores[f'metrics/model_gauntlet/{t.name}']
for t in model_gauntlet.categories
})
row.update({
'average': composite_scores[f'metrics/model_gauntlet/average']
})
model_gauntlet_df = pd.concat(
[model_gauntlet_df, pd.DataFrame([row])], ignore_index=True)

if model_gauntlet_df is not None:
model_gauntlet_df = pd.concat(
[model_gauntlet_df, pd.DataFrame([row])], ignore_index=True)

print(f'Printing gauntlet results for all models')
print(
model_gauntlet_df.sort_values(
'average', ascending=False).to_markdown(index=False))
print(f'Printing complete results for all models')
print(models_df.to_markdown(index=False))
except Exception as e:
print(f'Printing gauntlet results for all models')
print(
f'Got exception: {str(e)} while evaluating {model_cfg}. Traceback:',
flush=True)
traceback.print_exc() # print the exception to stdout
print('\nContinuing to next model.\n', flush=True)
model_gauntlet_df.sort_values(
'average', ascending=False).to_markdown(index=False))
print(f'Printing complete results for all models')
print(models_df.to_markdown(index=False))


def calculate_markdown_results(logger_keys, logger_data, benchmark_to_taxonomy,
@@ -210,7 +199,7 @@ def calculate_markdown_results(logger_keys, logger_data, benchmark_to_taxonomy,
subscores = results[num_shot][benchmark][metric]
if len(subscores) == 1:
row = {
'Category': benchmark_to_taxonomy[benchmark],
'Category': benchmark_to_taxonomy.get(benchmark, ""),
'Benchmark': benchmark,
'Subtask': None,
'Accuracy': subscores[0]['val'],
@@ -221,7 +210,7 @@ def calculate_markdown_results(logger_keys, logger_data, benchmark_to_taxonomy,
else:
row = {
'Category':
benchmark_to_taxonomy[benchmark],
benchmark_to_taxonomy.get(benchmark, ""),
'Benchmark':
benchmark,
'Subtask':
@@ -236,7 +225,7 @@ def calculate_markdown_results(logger_keys, logger_data, benchmark_to_taxonomy,
df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
for sub in subscores:
row = {
'Category': benchmark_to_taxonomy[benchmark],
'Category': benchmark_to_taxonomy.get(benchmark, ""),
'Benchmark': None,
'Subtask': sub['subcat'],
'Accuracy': sub['val'],
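
The `calculate_markdown_results` hunks above switch `benchmark_to_taxonomy[benchmark]` to `benchmark_to_taxonomy.get(benchmark, "")`, since the taxonomy dict stays empty when no gauntlet is configured. A small illustration of the difference; the benchmark name is made up:

```python
# Illustration only: benchmark_to_taxonomy is empty when cfg has no
# model_gauntlet, so plain indexing would raise KeyError for every row.
benchmark_to_taxonomy = {}
benchmark = 'lambada_openai'  # hypothetical benchmark name

category = benchmark_to_taxonomy.get(benchmark, '')  # '' instead of KeyError
print(repr(category))  # -> ''
```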