Remove try catch in eval.py; make model_gauntlet optional in eval.py (#434)

* fix bugs when no model gauntlet is provided

* fix bugs when no model gauntlet is provided

* fix bugs when no model gauntlet is provided

* fix bugs when no model gauntlet is provided
bmosaicml authored Jul 7, 2023
1 parent ef350d9 commit 62e2fea
Showing 1 changed file with 37 additions and 42 deletions.
79 changes: 37 additions & 42 deletions scripts/eval/eval.py
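
For orientation, here is a minimal, self-contained sketch of the control flow the diff below moves to: no try/except around the per-model loop (a failing model now raises immediately instead of being skipped), and gauntlet-specific work guarded by a None check so a config without `model_gauntlet` still produces the per-benchmark results table. The helper names and scores are illustrative stand-ins, not the real eval.py functions.

```python
# Illustrative stand-ins only -- not the real evaluate_model / ModelGauntlet.
from typing import Dict, List, Optional


def evaluate(model_cfg: Dict) -> Dict[str, float]:
    # Stand-in for evaluate_model(): per-benchmark accuracies for one model.
    return {'benchmark_a': 0.61, 'benchmark_b': 0.47}


def gauntlet_average(scores: Dict[str, float]) -> float:
    # Stand-in for the ModelGauntlet callback's composite score.
    return sum(scores.values()) / len(scores)


def run_all(models: List[Dict], model_gauntlet: Optional[Dict]) -> None:
    for model_cfg in models:
        scores = evaluate(model_cfg)  # any exception now propagates (no catch)

        if model_gauntlet is not None:  # gauntlet section only when configured
            print(model_cfg['model_name'], 'gauntlet average:',
                  gauntlet_average(scores))

        print(model_cfg['model_name'], 'results:', scores)  # always printed


run_all([{'model_name': 'demo-model'}], model_gauntlet=None)
```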
@@ -5,7 +5,6 @@
import re
import sys
import time
import traceback
from typing import List

import pandas as pd
@@ -46,7 +45,6 @@ def evaluate_model(model_cfg, run_name, model_gauntlet_df):
evaluators, logger_keys = build_icl_evaluators(cfg.icl_tasks, tokenizer,
cfg.max_seq_len,
cfg.device_eval_batch_size)

if hasattr(cfg, 'model_gauntlet'):
if isinstance(cfg.model_gauntlet, str):
with open(cfg.model_gauntlet, 'r') as icl_f:
@@ -61,6 +59,7 @@ def evaluate_model(model_cfg, run_name, model_gauntlet_df):
model_gauntlet_callback = ModelGauntlet(**model_gauntlet)
else:
model_gauntlet = None
model_gauntlet_callback = None

composer_model = load_model(model_cfg.model, tokenizer,
cfg.get('num_retries', 3))
@@ -119,58 +118,48 @@ def main(cfg):
model_gauntlet_df = None
models_df = None
for model_cfg in cfg.models:
(in_memory_logger, logger_keys, model_gauntlet_callback, model_gauntlet,
model_gauntlet_df) = evaluate_model(model_cfg, cfg.run_name,
model_gauntlet_df)

try:
(in_memory_logger, logger_keys, model_gauntlet_callback,
model_gauntlet,
model_gauntlet_df) = evaluate_model(model_cfg, cfg.run_name,
model_gauntlet_df)

if model_gauntlet_callback is not None:
composite_scores = model_gauntlet_callback.eval_end(
None, in_memory_logger)

benchmark_to_taxonomy = {}
benchmark_to_taxonomy = {}
if model_gauntlet is not None:
for t in model_gauntlet.categories:
for b in t.benchmarks:
benchmark_to_taxonomy[b.name] = t.name

model_results = calculate_markdown_results(logger_keys,
in_memory_logger.data,
benchmark_to_taxonomy,
model_cfg.model_name)
model_results = calculate_markdown_results(logger_keys,
in_memory_logger.data,
benchmark_to_taxonomy,
model_cfg.model_name)

if models_df is None:
models_df = model_results
else:
models_df = pd.concat([models_df, model_results],
ignore_index=True)
if models_df is None:
models_df = model_results
else:
models_df = pd.concat([models_df, model_results], ignore_index=True)

if model_gauntlet_df is not None and model_gauntlet is not None and model_gauntlet_df is not None:
row = {'model_name': model_cfg['model_name']}

row.update({
t.name: composite_scores[f'metrics/model_gauntlet/{t.name}']
for t in model_gauntlet.categories
})
row.update({
'average': composite_scores[f'metrics/model_gauntlet/average']
})
model_gauntlet_df = pd.concat(
[model_gauntlet_df, pd.DataFrame([row])], ignore_index=True)

if model_gauntlet_df is not None:
model_gauntlet_df = pd.concat(
[model_gauntlet_df, pd.DataFrame([row])], ignore_index=True)

print(f'Printing gauntlet results for all models')
print(
model_gauntlet_df.sort_values(
'average', ascending=False).to_markdown(index=False))
print(f'Printing complete results for all models')
print(models_df.to_markdown(index=False))
except Exception as e:
print(f'Printing gauntlet results for all models')
print(
f'Got exception: {str(e)} while evaluating {model_cfg}. Traceback:',
flush=True)
traceback.print_exc() # print the exception to stdout
print('\nContinuing to next model.\n', flush=True)
model_gauntlet_df.sort_values(
'average', ascending=False).to_markdown(index=False))
print(f'Printing complete results for all models')
print(models_df.to_markdown(index=False))


def calculate_markdown_results(logger_keys, logger_data, benchmark_to_taxonomy,
@@ -210,7 +199,7 @@ def calculate_markdown_results(logger_keys, logger_data, benchmark_to_taxonomy,
subscores = results[num_shot][benchmark][metric]
if len(subscores) == 1:
row = {
'Category': benchmark_to_taxonomy[benchmark],
'Category': benchmark_to_taxonomy.get(benchmark, ''),
'Benchmark': benchmark,
'Subtask': None,
'Accuracy': subscores[0]['val'],
@@ -221,7 +210,7 @@ def calculate_markdown_results(logger_keys, logger_data, benchmark_to_taxonomy,
else:
row = {
'Category':
benchmark_to_taxonomy[benchmark],
benchmark_to_taxonomy.get(benchmark, ''),
'Benchmark':
benchmark,
'Subtask':
@@ -236,12 +225,18 @@ def calculate_markdown_results(logger_keys, logger_data, benchmark_to_taxonomy,
df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
for sub in subscores:
row = {
'Category': benchmark_to_taxonomy[benchmark],
'Benchmark': None,
'Subtask': sub['subcat'],
'Accuracy': sub['val'],
'Number few shot': num_shot,
'Model': model_name
'Category':
benchmark_to_taxonomy.get(benchmark, ''),
'Benchmark':
None,
'Subtask':
sub['subcat'],
'Accuracy':
sub['val'],
'Number few shot':
num_shot,
'Model':
model_name
}
df = pd.concat([df, pd.DataFrame([row])],
ignore_index=True)
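
The hunks inside calculate_markdown_results all make the same small change (plus formatter reflow): the 'Category' lookup switches from benchmark_to_taxonomy[benchmark] to benchmark_to_taxonomy.get(benchmark, ''), because the taxonomy dict stays empty when no gauntlet is configured. A tiny illustration of why the default matters; the benchmark name here is made up:

```python
# With no model_gauntlet in the config, benchmark_to_taxonomy is left empty,
# so a plain dict[key] lookup would raise KeyError for every benchmark.
benchmark_to_taxonomy = {}

benchmark = 'example_benchmark'  # made-up name for illustration
row = {
    'Category': benchmark_to_taxonomy.get(benchmark, ''),  # '' instead of KeyError
    'Benchmark': benchmark,
}
print(row)  # {'Category': '', 'Benchmark': 'example_benchmark'}
```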