diff --git a/llmfoundry/callbacks/eval_gauntlet_callback.py b/llmfoundry/callbacks/eval_gauntlet_callback.py
index 78ccbb529b..7281a8d1fc 100644
--- a/llmfoundry/callbacks/eval_gauntlet_callback.py
+++ b/llmfoundry/callbacks/eval_gauntlet_callback.py
@@ -22,6 +22,32 @@ class Weighting(Enum):
     LOG_SAMPLE_SZ = 3
 
 
+def calculate_named_averages(average_names: Dict[str, list],
+                             category_scores: Dict[str, float]):
+    """Calculates the named averages based on the raw category scores.
+
+    For each named average, take a simple average of all the category scores associated with that named average.
+
+    Args:
+        average_names (dict[str, list]): Maps each named average to the list of categories whose scores it should consist of.
+        category_scores (dict[str, float]): Contains the raw scores corresponding to each category.
+    """
+    average_scores = {}
+    for avg_name, category_list in average_names.items():
+        composite_subset = {
+            category: score
+            for category, score in category_scores.items()
+            if category in category_list
+        }
+        if len(composite_subset.values()) > 0:
+            average_scores[avg_name] = sum(composite_subset.values()) / len(
+                composite_subset.values())
+        else:
+            average_scores[avg_name] = 0
+
+    return average_scores
+
+
 class EvalGauntlet(Callback):
     """The EvalGauntlet aggregates ICL eval results.
 
@@ -31,7 +57,7 @@ class EvalGauntlet(Callback):
     Args:
         logger_keys (list): These are the exact keys that the individual benchmark metrics will be logged
             under in the logger after eval
-        tasks (dict): This contains the list of categories, as well as the subtasks within them, the
+        categories (dict): This contains the list of categories, as well as the subtasks within them, the
            random baseline accuracy of each subtask, and the number of fewshot examples
            used for the task. See `llmfoundry/scripts/eval/yamls/eval_gauntlet.yaml` to see the structure.
         weighting (Weighting): The weighting scheme used to balance different tasks within each category.
@@ -43,6 +69,7 @@ class EvalGauntlet(Callback):
         rescale_accuracy (bool): Flag determining whether to rescale the accuracy on each benchmark
             by (1-random_baseline_accuracy) before aggregating. Using this ensures that all benchmarks max out at 1.0.
         benchmark_sizes (Optional[dict]): Optional data on benchmark sizes, used when not relying on equal weighting.
+        averages (Optional[dict]): Optional dictionary specifying a mapping from average names to the lists of categories used to produce each named average.
     """
 
     def __init__(self,
@@ -51,7 +78,8 @@ def __init__(self,
                  weighting: str = 'EQUAL',
                  subtract_random_baseline: bool = True,
                  rescale_accuracy: bool = True,
-                 benchmark_sizes: Optional[dict] = None):
+                 benchmark_sizes: Optional[dict] = None,
+                 averages: Optional[dict] = None):
         if isinstance(logger_keys, dict):
             raise ValueError(
                 'logger_keys now requires a list type as input, not a dict')
@@ -66,13 +94,12 @@ def __init__(self,
             )
 
         self.categories = categories
+        self.category_names = [conf.get('name') for conf in self.categories]
         self.weighting = Weighting[weighting]
         self.subtract_random_baseline = subtract_random_baseline
         self.rescale_accuracy = rescale_accuracy
         self.logger_keys = logger_keys
-
         for category in self.categories:
-
             for benchmark in category['benchmarks']:
                 bench_name = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"
@@ -95,7 +122,20 @@ def __init__(self,
                     assert weight is not None
                 benchmark['weighting'] = weight
 
-    def compute_averages(self, state: State) -> Dict[str, float]:
+        self.averages = {}
+        if averages is not None:
+            self.averages = averages
+        else:
+            # if no averages spec provided, simply average everything
+            self.averages['default_average'] = self.category_names
+
+        for avg_name in self.averages:
+            if avg_name in self.category_names:
+                raise ValueError(
+                    f'Found average name `{avg_name}` used as category name. Average names and category names must be non-overlapping.'
+                )
+
+    def extract_metrics_from_state(self, state: State) -> Dict[str, float]:
         results = {}
 
         for key in self.logger_keys:
@@ -121,23 +161,22 @@ def compute_averages(self, state: State) -> Dict[str, float]:
         return {k: sum(v) / len(v) for k, v in results.items()}
 
     def eval_after_all(self, state: State, logger: Logger) -> Dict[str, float]:
-        new_metrics = self.compute_averages(state)
-        if len(new_metrics) == 0:
+        computed_metrics = self.extract_metrics_from_state(state)
+        if len(computed_metrics) == 0:
             return {}
-        composite_scores = {}
-
+        category_scores = {}
         for category in self.categories:
             missing_metrics = []
-            composite_scores[category['name']] = []
+            category_scores[category['name']] = []
             for benchmark in category['benchmarks']:
                 key = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"
-                if key not in new_metrics:
+                if key not in computed_metrics:
                     log.warning(
                         f'Could not find results for benchmark: {benchmark}.')
                     missing_metrics.append(key)
                 else:
-                    score = new_metrics[key]
+                    score = computed_metrics[key]
 
                     if self.subtract_random_baseline:
                         score -= benchmark['random_baseline']
@@ -145,7 +184,7 @@ def eval_after_all(self, state: State, logger: Logger) -> Dict[str, float]:
                     if self.rescale_accuracy and self.subtract_random_baseline:
                         score /= 1.0 - benchmark['random_baseline']
 
-                    composite_scores[category['name']].append({
+                    category_scores[category['name']].append({
                         'name': benchmark['name'],
                         'score': score,
                         'weighting': benchmark['weighting']
@@ -155,23 +194,22 @@ def eval_after_all(self, state: State, logger: Logger) -> Dict[str, float]:
                 log.warning(
                     f"Removing category `{category['name']}` from scores because benchmarks were missing: {missing_metrics}"
                 )
-                del composite_scores[category['name']]
+                del category_scores[category['name']]
                 continue
 
             total_weight = sum(
-                k['weighting'] for k in composite_scores[category['name']])
-            composite_scores[category['name']] = sum(
+                k['weighting'] for k in category_scores[category['name']])
+            category_scores[category['name']] = sum(
                 k['score'] * (k['weighting'] / total_weight)
-                for k in composite_scores[category['name']])
+                for k in category_scores[category['name']])
 
-        composite_scores = {
+        named_averages = calculate_named_averages(self.averages,
+                                                  category_scores)
+        category_scores.update(named_averages)
+        category_scores = {
             f'icl/metrics/eval_gauntlet/{k}': v
-            for k, v in composite_scores.items()
+            for k, v in category_scores.items()
         }
-
-        composite_scores['icl/metrics/eval_gauntlet/average'] = sum(
-            composite_scores.values()) / len(composite_scores.values()) if len(
-                composite_scores.values()) > 0 else 0
         if logger is not None:
-            logger.log_metrics(composite_scores)
+            logger.log_metrics(category_scores)
 
-        return composite_scores
+        return category_scores
diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
index f07942ba10..02a5d1f862 100644
--- a/scripts/eval/eval.py
+++ b/scripts/eval/eval.py
@@ -145,7 +145,8 @@ def evaluate_model(
 
     if eval_gauntlet_df is None and eval_gauntlet_callback is not None:
         eval_gauntlet_df = pd.DataFrame(
-            columns=['model_name', 'average'] +
+            columns=['model_name'] +
+            [avg for avg in eval_gauntlet_callback.averages] +
             [t.name for t in eval_gauntlet_callback.categories])
 
     load_path = model_cfg.get('load_path', None)
@@ -314,23 +315,17 @@ def main(cfg: DictConfig):
         if eval_gauntlet_df is not None and eval_gauntlet_callback is not None:
             assert composite_scores is not None
             row = {'model_name': model_cfg['model_name']}
-            row.update({
-                t.name:
-                composite_scores.get(f'icl/metrics/eval_gauntlet/{t.name}',
-                                     None)
-                for t in eval_gauntlet_callback.categories
-            })
-            row.update({
-                'average':
-                composite_scores[f'icl/metrics/eval_gauntlet/average']
-            })
+            row.update(
+                {k.split('/')[-1]: v for k, v in composite_scores.items()})
             eval_gauntlet_df = pd.concat(
                 [eval_gauntlet_df, pd.DataFrame([row])], ignore_index=True)
 
     print(f'Printing gauntlet results for all models')
+
     print(
         eval_gauntlet_df.sort_values(
-            'average', ascending=False).to_markdown(index=False))
+            list(eval_gauntlet_callback.averages.keys())[0],
+            ascending=False).to_markdown(index=False))
     print(f'Printing complete results for all models')
     assert models_df is not None
     print(models_df.to_markdown(index=False))
diff --git a/scripts/eval/yamls/eval_gauntlet.yaml b/scripts/eval/yamls/eval_gauntlet.yaml
index 87e01fd44c..1d2fa34139 100644
--- a/scripts/eval/yamls/eval_gauntlet.yaml
+++ b/scripts/eval/yamls/eval_gauntlet.yaml
@@ -2,6 +2,27 @@ eval_gauntlet:
   weighting: EQUAL
   subtract_random_baseline: true
   rescale_accuracy: true
+  averages:
+    core_average:
+    - world_knowledge
+    - commonsense_reasoning
+    - language_understanding
+    - symbolic_problem_solving
+    - reading_comprehension
+    - programming
+    lm_task_average:
+    - world_knowledge_lm_task_subscore
+    - commonsense_reasoning_lm_task_subscore
+    - language_understanding_lm_task_subscore
+    - symbolic_problem_solving_lm_task_subscore
+    - reading_comprehension_lm_task_subscore
+    lite_average:
+    - world_knowledge_lite
+    - commonsense_reasoning_lite
+    - language_understanding_lite
+    - symbolic_problem_solving_lite
+    - reading_comprehension_lite
+    - programming_lite
   categories:
   - name: world_knowledge
     benchmarks:
diff --git a/scripts/eval/yamls/hf_eval.yaml b/scripts/eval/yamls/hf_eval.yaml
index 05169818d9..759af8239a 100644
--- a/scripts/eval/yamls/hf_eval.yaml
+++ b/scripts/eval/yamls/hf_eval.yaml
@@ -43,5 +43,5 @@ device_eval_batch_size: 4
 #   forward_prefetch: True
 #   limit_all_gathers: True
 
-icl_tasks: 'eval/yamls/tasks.yaml'
+icl_tasks: 'eval/yamls/tasks_light.yaml'
 eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml'
diff --git a/tests/test_eval.py b/tests/test_eval.py
index ecd15ab62f..1217487b70 100644
--- a/tests/test_eval.py
+++ b/tests/test_eval.py
@@ -62,7 +62,7 @@ def test_icl_eval(capfd: Any, mock_saved_model_path: Any):
     assert isinstance(test_cfg, om.DictConfig)
     main(test_cfg)
     out, _ = capfd.readouterr()
-    expected_results = '| Category | Benchmark | Subtask | Accuracy | Number few shot | Model |\n|:----------------------------|:---------------|:----------|-----------:|:------------------|:---------|\n| language_understanding_lite | lambada_openai | | 0 | 0-shot | tiny_mpt '
+    expected_results = '| Category | Benchmark | Subtask | Accuracy | Number few shot | Model |\n|:----------------------------|:---------------|:----------|-----------:|:------------------|:---------|\n| language_understanding_lite | lambada_openai | | 0 | 0-shot | tiny_mpt |'
     assert expected_results in out
-    expected_results = '| model_name | average | language_understanding_lite |\n|:-------------|----------:|------------------------------:|\n| tiny_mpt | 0 | 0 |'
+    expected_results = '| model_name | default_average | language_understanding_lite |\n|:-------------|------------------:|------------------------------:|\n| tiny_mpt | 0 | 0 |'
     assert expected_results in out
diff --git a/tests/test_eval_gauntlet.py b/tests/test_eval_gauntlet.py
index 8ccdd75766..3a1e371ab8 100644
--- a/tests/test_eval_gauntlet.py
+++ b/tests/test_eval_gauntlet.py
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 import omegaconf as om
 import pytest
@@ -53,7 +53,10 @@ def log_metrics(self, metrics: Dict[str, float]) -> None:
         self.inmemorylogger.log_metrics(metrics)
 
 
-def test_gauntlet_callback():
+@pytest.mark.parametrize('averages', [{
+    'core_average': ['world_knowledge', 'language_understanding']
+}, None])
+def test_gauntlet_callback(averages: Optional[dict]):
     icl_task_config = om.OmegaConf.create("""
             - label: jeopardy_small
               dataset_uri: eval/local_data/world_knowledge/jeopardy_small.jsonl # ADD YOUR OWN DATASET URI
@@ -87,6 +90,9 @@ def test_gauntlet_callback():
     """)
     assert isinstance(eval_gauntlet_config, om.DictConfig) or isinstance(
         eval_gauntlet_config, str)
+
+    if averages is not None:
+        eval_gauntlet_config.averages = averages
     tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
 
     # test loading functionality
@@ -106,4 +112,9 @@ def test_gauntlet_callback():
         name = f'icl/metrics/eval_gauntlet/{category}'
         assert result[name] == pytest.approx(0.25)
 
-    assert result['icl/metrics/eval_gauntlet/average'] == pytest.approx(0.25)
+    if averages is None:
+        assert result[
+            'icl/metrics/eval_gauntlet/default_average'] == pytest.approx(0.25)
+    else:
+        assert result[
+            'icl/metrics/eval_gauntlet/core_average'] == pytest.approx(0.25)
diff --git a/tests/test_training.py b/tests/test_training.py
index 9d40fc2a78..214909cc28 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -6,7 +6,7 @@
 import shutil
 import sys
 from argparse import Namespace
-from typing import Any
+from typing import Any, Optional
 
 import pytest
 from composer.loggers import InMemoryLogger
@@ -114,7 +114,11 @@ def set_correct_cwd():
         os.chdir('..')
 
 
-def test_train_gauntlet(set_correct_cwd: Any, tmp_path: pathlib.Path):
+@pytest.mark.parametrize('averages', [{
+    'core_average': ['language_understanding_lite']
+}, None])
+def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any,
+                        tmp_path: pathlib.Path):
     """Test training run with a small dataset."""
     dataset_name = create_c4_dataset_xsmall(tmp_path)
     test_cfg = gpt_tiny_cfg(dataset_name, 'cpu')
@@ -155,6 +159,9 @@ def test_train_gauntlet(set_correct_cwd: Any, tmp_path: pathlib.Path):
             ])
         })
 
+    if averages is not None:
+        test_cfg.eval_gauntlet['averages'] = averages
+
     test_cfg.icl_seq_len = 128
     test_cfg.max_duration = '1ba'
     test_cfg.eval_interval = '1ba'
@@ -167,14 +174,20 @@ def test_train_gauntlet(set_correct_cwd: Any, tmp_path: pathlib.Path):
     inmemorylogger = trainer.logger.destinations[
         0]  # pyright: ignore [reportGeneralTypeIssues]
     assert isinstance(inmemorylogger, InMemoryLogger)
-    assert 'icl/metrics/eval_gauntlet/average' in inmemorylogger.data.keys()
-    assert isinstance(inmemorylogger.data['icl/metrics/eval_gauntlet/average'],
-                      list)
-    assert len(inmemorylogger.data['icl/metrics/eval_gauntlet/average'][-1]) > 0
+
+    category_name = 'default_average' if averages is None else 'core_average'
+    assert f'icl/metrics/eval_gauntlet/{category_name}' in inmemorylogger.data.keys(
+    )
     assert isinstance(
-        inmemorylogger.data['icl/metrics/eval_gauntlet/average'][-1], tuple)
+        inmemorylogger.data[f'icl/metrics/eval_gauntlet/{category_name}'], list)
+    assert len(inmemorylogger.data[f'icl/metrics/eval_gauntlet/{category_name}']
+               [-1]) > 0
+    assert isinstance(
+        inmemorylogger.data[f'icl/metrics/eval_gauntlet/{category_name}'][-1],
+        tuple)
 
-    assert inmemorylogger.data['icl/metrics/eval_gauntlet/average'][-1][-1] == 0
+    assert inmemorylogger.data[f'icl/metrics/eval_gauntlet/{category_name}'][
+        -1][-1] == 0
 
 
 def test_train_multi_eval(set_correct_cwd: Any, tmp_path: pathlib.Path):
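
Reviewer note (not part of the diff): a minimal sketch of how the new `averages` option behaves, using the `calculate_named_averages` helper added above. The category names and scores below are made up purely for illustration; each named average is a simple unweighted mean of the listed categories, and categories absent from the score dict are skipped.

```python
# Illustration only; scores and average names are hypothetical.
from llmfoundry.callbacks.eval_gauntlet_callback import calculate_named_averages

category_scores = {
    'world_knowledge': 0.75,
    'language_understanding': 0.25,
    'programming': 0.125,
}
averages = {
    # simple unweighted mean: (0.75 + 0.25 + 0.125) / 3 == 0.375
    'core_average': ['world_knowledge', 'language_understanding', 'programming'],
    # only categories present in category_scores contribute: (0.75 + 0.25) / 2 == 0.5
    'lm_task_average': ['world_knowledge', 'language_understanding', 'reading_comprehension'],
}

print(calculate_named_averages(averages, category_scores))
# {'core_average': 0.375, 'lm_task_average': 0.5}
```

When no `averages` block is supplied, the callback falls back to a single `default_average` over every category, which is why the expected column in `tests/test_eval.py` changes from `average` to `default_average`.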