Remove try catch in eval.py; make model_gauntlet optional in eval.py (#434)

* fix bugs when no model gauntlet is provided

* fix bugs when no model gauntlet is provided

* fix bugs when no model gauntlet is provided

* fix bugs when no model gauntlet is provided
bmosaicml authored Jul 7, 2023
1 parent ef350d9 commit 62e2fea
Showing 1 changed file with 37 additions and 42 deletions.
79 changes: 37 additions & 42 deletions scripts/eval/eval.py
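
For orientation, here is a minimal, self-contained sketch of the control flow the diff below moves to: no try/except around the per-model loop (a failing model now raises immediately instead of being skipped), and gauntlet-specific work guarded by a None check so a config without `model_gauntlet` still produces the per-benchmark results table. The helper names and scores are illustrative stand-ins, not the real eval.py functions.

```python
# Illustrative stand-ins only -- not the real evaluate_model / ModelGauntlet.
from typing import Dict, List, Optional


def evaluate(model_cfg: Dict) -> Dict[str, float]:
    # Stand-in for evaluate_model(): per-benchmark accuracies for one model.
    return {'benchmark_a': 0.61, 'benchmark_b': 0.47}


def gauntlet_average(scores: Dict[str, float]) -> float:
    # Stand-in for the ModelGauntlet callback's composite score.
    return sum(scores.values()) / len(scores)


def run_all(models: List[Dict], model_gauntlet: Optional[Dict]) -> None:
    for model_cfg in models:
        scores = evaluate(model_cfg)  # any exception now propagates (no catch)

        if model_gauntlet is not None:  # gauntlet section only when configured
            print(model_cfg['model_name'], 'gauntlet average:',
                  gauntlet_average(scores))

        print(model_cfg['model_name'], 'results:', scores)  # always printed


run_all([{'model_name': 'demo-model'}], model_gauntlet=None)
```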
@@ -5,7 +5,6 @@
import re
import sys
import time
import traceback
from typing import List

import pandas as pd
@@ -46,7 +45,6 @@ def evaluate_model(model_cfg, run_name, model_gauntlet_df):
evaluators, logger_keys = build_icl_evaluators(cfg.icl_tasks, tokenizer,
cfg.max_seq_len,
cfg.device_eval_batch_size)

if hasattr(cfg, 'model_gauntlet'):
if isinstance(cfg.model_gauntlet, str):
with open(cfg.model_gauntlet, 'r') as icl_f:
@@ -61,6 +59,7 @@ def evaluate_model(model_cfg, run_name, model_gauntlet_df):
model_gauntlet_callback = ModelGauntlet(**model_gauntlet)
else:
model_gauntlet = None
model_gauntlet_callback = None

composer_model = load_model(model_cfg.model, tokenizer,
cfg.get('num_retries', 3))
@@ -119,58 +118,48 @@ def main(cfg):
model_gauntlet_df = None
models_df = None
for model_cfg in cfg.models:
(in_memory_logger, logger_keys, model_gauntlet_callback, model_gauntlet,
model_gauntlet_df) = evaluate_model(model_cfg, cfg.run_name,
model_gauntlet_df)

try:
(in_memory_logger, logger_keys, model_gauntlet_callback,
model_gauntlet,
model_gauntlet_df) = evaluate_model(model_cfg, cfg.run_name,
model_gauntlet_df)

if model_gauntlet_callback is not None:
composite_scores = model_gauntlet_callback.eval_end(
None, in_memory_logger)

benchmark_to_taxonomy = {}
benchmark_to_taxonomy = {}
if model_gauntlet is not None:
for t in model_gauntlet.categories:
for b in t.benchmarks:
benchmark_to_taxonomy[b.name] = t.name

model_results = calculate_markdown_results(logger_keys,
in_memory_logger.data,
benchmark_to_taxonomy,
model_cfg.model_name)
model_results = calculate_markdown_results(logger_keys,
in_memory_logger.data,
benchmark_to_taxonomy,
model_cfg.model_name)

if models_df is None:
models_df = model_results
else:
models_df = pd.concat([models_df, model_results],
ignore_index=True)
if models_df is None:
models_df = model_results
else:
models_df = pd.concat([models_df, model_results], ignore_index=True)

if model_gauntlet_df is not None and model_gauntlet is not None and model_gauntlet_df is not None:
row = {'model_name': model_cfg['model_name']}

row.update({
t.name: composite_scores[f'metrics/model_gauntlet/{t.name}']
for t in model_gauntlet.categories
})
row.update({
'average': composite_scores[f'metrics/model_gauntlet/average']
})
model_gauntlet_df = pd.concat(
[model_gauntlet_df, pd.DataFrame([row])], ignore_index=True)

if model_gauntlet_df is not None:
model_gauntlet_df = pd.concat(
[model_gauntlet_df, pd.DataFrame([row])], ignore_index=True)

print(f'Printing gauntlet results for all models')
print(
model_gauntlet_df.sort_values(
'average', ascending=False).to_markdown(index=False))
print(f'Printing complete results for all models')
print(models_df.to_markdown(index=False))
except Exception as e:
print(f'Printing gauntlet results for all models')
print(
f'Got exception: {str(e)} while evaluating {model_cfg}. Traceback:',
flush=True)
traceback.print_exc() # print the exception to stdout
print('\nContinuing to next model.\n', flush=True)
model_gauntlet_df.sort_values(
'average', ascending=False).to_markdown(index=False))
print(f'Printing complete results for all models')
print(models_df.to_markdown(index=False))


def calculate_markdown_results(logger_keys, logger_data, benchmark_to_taxonomy,
@@ -210,7 +199,7 @@ def calculate_markdown_results(logger_keys, logger_data, benchmark_to_taxonomy,
subscores = results[num_shot][benchmark][metric]
if len(subscores) == 1:
row = {
'Category': benchmark_to_taxonomy[benchmark],
'Category': benchmark_to_taxonomy.get(benchmark, ''),
'Benchmark': benchmark,
'Subtask': None,
'Accuracy': subscores[0]['val'],
@@ -221,7 +210,7 @@ def calculate_markdown_results(logger_keys, logger_data, benchmark_to_taxonomy,
else:
row = {
'Category':
benchmark_to_taxonomy[benchmark],
benchmark_to_taxonomy.get(benchmark, ''),
'Benchmark':
benchmark,
'Subtask':
@@ -236,12 +225,18 @@ def calculate_markdown_results(logger_keys, logger_data, benchmark_to_taxonomy,
df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
for sub in subscores:
row = {
'Category': benchmark_to_taxonomy[benchmark],
'Benchmark': None,
'Subtask': sub['subcat'],
'Accuracy': sub['val'],
'Number few shot': num_shot,
'Model': model_name
'Category':
benchmark_to_taxonomy.get(benchmark, ''),
'Benchmark':
None,
'Subtask':
sub['subcat'],
'Accuracy':
sub['val'],
'Number few shot':
num_shot,
'Model':
model_name
}
df = pd.concat([df, pd.DataFrame([row])],
ignore_index=True)
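
The hunks inside calculate_markdown_results all make the same small change (plus formatter reflow): the 'Category' lookup switches from benchmark_to_taxonomy[benchmark] to benchmark_to_taxonomy.get(benchmark, ''), because the taxonomy dict stays empty when no gauntlet is configured. A tiny illustration of why the default matters; the benchmark name here is made up:

```python
# With no model_gauntlet in the config, benchmark_to_taxonomy is left empty,
# so a plain dict[key] lookup would raise KeyError for every benchmark.
benchmark_to_taxonomy = {}

benchmark = 'example_benchmark'  # made-up name for illustration
row = {
    'Category': benchmark_to_taxonomy.get(benchmark, ''),  # '' instead of KeyError
    'Benchmark': benchmark,
}
print(row)  # {'Category': '', 'Benchmark': 'example_benchmark'}
```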