Change gauntlet avging (#640)
* commit

* commit

* commit

* commit

* commit

* restore mcli

* eval gauntlet cb

* fix error

* address daniels comments

* parametrize

* parametrize

* precommit

* change

* change

---------

Co-authored-by: Daniel King <[email protected]>
bmosaicml and dakinggg authored Oct 27, 2023
1 parent 7009d4d commit 9027f49
Showing 7 changed files with 129 additions and 51 deletions.
88 changes: 63 additions & 25 deletions llmfoundry/callbacks/eval_gauntlet_callback.py
@@ -22,6 +32 @@ class Weighting(Enum):
LOG_SAMPLE_SZ = 3


def calculate_named_averages(average_names: Dict[str, list],
category_scores: Dict[str, float]):
"""Calculates the named averages based off the raw category scores.
For each named average, take a simple average of all the category scores associated with that named average.
Args:
average_names (dict[str, list]): Contains a mapping of named averages to which category scores that average should consist of.
category_scores (dict[str, float]): Contains the raw scores corresponding to each category.
"""
average_scores = {}
for avg_name, category_list in average_names.items():
composite_subset = {
category: score
for category, score in category_scores.items()
if category in category_list
}
if len(composite_subset.values()) > 0:
average_scores[avg_name] = sum(composite_subset.values()) / len(
composite_subset.values())
else:
average_scores[avg_name] = 0

return average_scores
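
To make the behaviour of `calculate_named_averages` concrete, here is a minimal, self-contained sketch that mirrors its logic; the category names and scores below are invented for illustration and do not come from the gauntlet YAML.

```python
# Standalone sketch mirroring calculate_named_averages; names and scores are
# invented for illustration only.
category_scores = {
    'world_knowledge': 0.42,
    'commonsense_reasoning': 0.38,
    'programming': 0.10,
}
average_names = {
    'core_average': ['world_knowledge', 'commonsense_reasoning'],
    'everything': ['world_knowledge', 'commonsense_reasoning', 'programming'],
}

average_scores = {}
for avg_name, category_list in average_names.items():
    subset = [s for cat, s in category_scores.items() if cat in category_list]
    # Simple unweighted mean over the selected categories; 0 if none matched.
    average_scores[avg_name] = sum(subset) / len(subset) if subset else 0

print(average_scores)  # approx {'core_average': 0.4, 'everything': 0.3}
```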


class EvalGauntlet(Callback):
"""The EvalGauntlet aggregates ICL eval results.
@@ -31,7 +57,7 @@ class EvalGauntlet(Callback):
Args:
logger_keys (list): These are the exact keys that the individual benchmark metrics will be
logged under in the logger after eval
tasks (dict): This contains the list of categories, as well as the subtasks within them, the
categories (dict): This contains the list of categories, as well as the subtasks within them, the
random baseline accuracy of each subtask, and the number of fewshot examples
used for the task. See `llmfoundry/scripts/eval/yamls/eval_gauntlet.yaml` to see the structure.
weighting (Weighting): The weighting scheme used to balance different tasks within each category.
@@ -43,6 +69,7 @@ class EvalGauntlet(Callback):
rescale_accuracy (bool): Flag determining whether to rescale the accuracy on each benchmark
by (1-random_baseline_accuracy) before aggregating. Using this ensures that all benchmarks max out at 1.0.
benchmark_sizes (Optional[dict]): Optional data on benchmark sizes, used when not relying on equal weighting.
averages (Optional[dict]): Optional dictionary specifying a mapping from average names to the lists of categories used to produce each named average.
"""

def __init__(self,
@@ -51,7 +78,8 @@ def __init__(self,
weighting: str = 'EQUAL',
subtract_random_baseline: bool = True,
rescale_accuracy: bool = True,
benchmark_sizes: Optional[dict] = None):
benchmark_sizes: Optional[dict] = None,
averages: Optional[dict] = None):
if isinstance(logger_keys, dict):
raise ValueError(
'logger_keys now requires a list type as input, not a dict')
@@ -66,13 +94,12 @@ def __init__(self,
)

self.categories = categories
self.category_names = [conf.get('name') for conf in self.categories]
self.weighting = Weighting[weighting]
self.subtract_random_baseline = subtract_random_baseline
self.rescale_accuracy = rescale_accuracy
self.logger_keys = logger_keys

for category in self.categories:

for benchmark in category['benchmarks']:
bench_name = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"

@@ -95,7 +122,20 @@ def __init__(self,
assert weight is not None
benchmark['weighting'] = weight

def compute_averages(self, state: State) -> Dict[str, float]:
self.averages = {}
if averages is not None:
self.averages = averages
else:
# if no averages spec provided, simply average everything
self.averages['default_average'] = self.category_names

for avg_name in self.averages:
if avg_name in self.category_names:
raise ValueError(
f'Found average name `{avg_name}` used as category name. Average names and category names must be non-overlapping.'
)

def extract_metrics_from_state(self, state: State) -> Dict[str, float]:
results = {}

for key in self.logger_keys:
@@ -121,31 +161,30 @@ def compute_averages(self, state: State) -> Dict[str, float]:
return {k: sum(v) / len(v) for k, v in results.items()}

def eval_after_all(self, state: State, logger: Logger) -> Dict[str, float]:
new_metrics = self.compute_averages(state)
if len(new_metrics) == 0:
computed_metrics = self.extract_metrics_from_state(state)
if len(computed_metrics) == 0:
return {}
composite_scores = {}

category_scores = {}
for category in self.categories:
missing_metrics = []
composite_scores[category['name']] = []
category_scores[category['name']] = []
for benchmark in category['benchmarks']:
key = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"

if key not in new_metrics:
if key not in computed_metrics:
log.warning(
f'Could not find results for benchmark: {benchmark}.')
missing_metrics.append(key)
else:
score = new_metrics[key]
score = computed_metrics[key]

if self.subtract_random_baseline:
score -= benchmark['random_baseline']

if self.rescale_accuracy and self.subtract_random_baseline:
score /= 1.0 - benchmark['random_baseline']

composite_scores[category['name']].append({
category_scores[category['name']].append({
'name': benchmark['name'],
'score': score,
'weighting': benchmark['weighting']
@@ -155,23 +194,22 @@ def eval_after_all(self, state: State, logger: Logger) -> Dict[str, float]:
log.warning(
f"Removing category `{category['name']}` from scores because benchmarks were missing: {missing_metrics}"
)
del composite_scores[category['name']]
del category_scores[category['name']]
continue
total_weight = sum(
k['weighting'] for k in composite_scores[category['name']])
composite_scores[category['name']] = sum(
k['weighting'] for k in category_scores[category['name']])
category_scores[category['name']] = sum(
k['score'] * (k['weighting'] / total_weight)
for k in composite_scores[category['name']])
for k in category_scores[category['name']])

composite_scores = {
named_averages = calculate_named_averages(self.averages,
category_scores)
category_scores.update(named_averages)
category_scores = {
f'icl/metrics/eval_gauntlet/{k}': v
for k, v in composite_scores.items()
for k, v in category_scores.items()
}

composite_scores['icl/metrics/eval_gauntlet/average'] = sum(
composite_scores.values()) / len(composite_scores.values()) if len(
composite_scores.values()) > 0 else 0
if logger is not None:
logger.log_metrics(composite_scores)
logger.log_metrics(category_scores)

return composite_scores
return category_scores
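
For orientation, the `eval_after_all` hunk above first collapses each category's benchmarks into a single weighted score and only then layers the named averages on top. Below is a rough sketch of the per-category step with invented benchmark entries; in the real callback the `weighting` values come from the configured `Weighting` scheme and `benchmark_sizes`.

```python
# Invented benchmark results for a single category; 'weighting' stands in for
# whatever the configured Weighting scheme assigned.
benchmarks = [
    {'name': 'benchmark_a', 'score': 0.30, 'weighting': 1.0},
    {'name': 'benchmark_b', 'score': 0.50, 'weighting': 3.0},
]

total_weight = sum(b['weighting'] for b in benchmarks)
category_score = sum(
    b['score'] * (b['weighting'] / total_weight) for b in benchmarks)

print(category_score)  # 0.30 * 0.25 + 0.50 * 0.75 = 0.45
```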
19 changes: 7 additions & 12 deletions scripts/eval/eval.py
@@ -145,7 +145,8 @@ def evaluate_model(

if eval_gauntlet_df is None and eval_gauntlet_callback is not None:
eval_gauntlet_df = pd.DataFrame(
columns=['model_name', 'average'] +
columns=['model_name'] +
[avg for avg in eval_gauntlet_callback.averages] +
[t.name for t in eval_gauntlet_callback.categories])

load_path = model_cfg.get('load_path', None)
@@ -314,23 +315,17 @@ def main(cfg: DictConfig):
if eval_gauntlet_df is not None and eval_gauntlet_callback is not None:
assert composite_scores is not None
row = {'model_name': model_cfg['model_name']}
row.update({
t.name:
composite_scores.get(f'icl/metrics/eval_gauntlet/{t.name}',
None)
for t in eval_gauntlet_callback.categories
})
row.update({
'average':
composite_scores[f'icl/metrics/eval_gauntlet/average']
})
row.update(
{k.split('/')[-1]: v for k, v in composite_scores.items()})
eval_gauntlet_df = pd.concat(
[eval_gauntlet_df, pd.DataFrame([row])], ignore_index=True)

print(f'Printing gauntlet results for all models')

print(
eval_gauntlet_df.sort_values(
'average', ascending=False).to_markdown(index=False))
list(eval_gauntlet_callback.averages.keys())[0],
ascending=False).to_markdown(index=False))
print(f'Printing complete results for all models')
assert models_df is not None
print(models_df.to_markdown(index=False))
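A side note on the `eval.py` hunk above: the new `row.update` call takes the last path segment of each logged gauntlet key as the DataFrame column name, and the results table is now sorted by the first named average rather than a hard-coded `average` column. A hedged sketch of that mapping with invented scores (assumes `pandas` and `tabulate` are installed):

```python
import pandas as pd

# Invented composite scores, in the shape the callback logs them.
composite_scores = {
    'icl/metrics/eval_gauntlet/core_average': 0.27,
    'icl/metrics/eval_gauntlet/world_knowledge': 0.31,
}

row = {'model_name': 'tiny_mpt'}
# Mirrors the diff: the last path segment becomes the column name.
row.update({k.split('/')[-1]: v for k, v in composite_scores.items()})

df = pd.DataFrame([row])
print(df.sort_values('core_average', ascending=False).to_markdown(index=False))
```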
21 changes: 21 additions & 0 deletions scripts/eval/yamls/eval_gauntlet.yaml
@@ -2,6 +2,27 @@ eval_gauntlet:
weighting: EQUAL
subtract_random_baseline: true
rescale_accuracy: true
averages:
core_average:
- world_knowledge
- commonsense_reasoning
- language_understanding
- symbolic_problem_solving
- reading_comprehension
- programming
lm_task_average:
- world_knowledge_lm_task_subscore
- commonsense_reasoning_lm_task_subscore
- language_understanding_lm_task_subscore
- symbolic_problem_solving_lm_task_subscore
- reading_comprehension_lm_task_subscore
lite_average:
- world_knowledge_lite
- commonsense_reasoning_lite
- language_understanding_lite
- symbolic_problem_solving_lite
- reading_comprehension_lite
- programming_lite
categories:
- name: world_knowledge
benchmarks:
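A small sanity check one might run against an `averages` block like the one above (the category list here is a trimmed, hand-written stand-in for the full `categories` section): average names must not collide with category names, and members that are not declared categories simply contribute nothing to the average.

```python
# Hypothetical check; categories are hard-coded for illustration only.
categories = ['world_knowledge', 'commonsense_reasoning', 'programming']
averages = {
    'core_average': ['world_knowledge', 'commonsense_reasoning', 'programming'],
}

for avg_name, members in averages.items():
    # The callback raises a ValueError on such a collision.
    assert avg_name not in categories, f'{avg_name} collides with a category name'
    unknown = [m for m in members if m not in categories]
    if unknown:
        print(f'warning: {avg_name} references unknown categories: {unknown}')
```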
2 changes: 1 addition & 1 deletion scripts/eval/yamls/hf_eval.yaml
@@ -43,5 +43,5 @@ device_eval_batch_size: 4
# forward_prefetch: True
# limit_all_gathers: True

icl_tasks: 'eval/yamls/tasks.yaml'
icl_tasks: 'eval/yamls/tasks_light.yaml'
eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml'
4 changes: 2 additions & 2 deletions tests/test_eval.py
@@ -62,7 +62,7 @@ def test_icl_eval(capfd: Any, mock_saved_model_path: Any):
assert isinstance(test_cfg, om.DictConfig)
main(test_cfg)
out, _ = capfd.readouterr()
expected_results = '| Category | Benchmark | Subtask | Accuracy | Number few shot | Model |\n|:----------------------------|:---------------|:----------|-----------:|:------------------|:---------|\n| language_understanding_lite | lambada_openai | | 0 | 0-shot | tiny_mpt '
expected_results = '| Category | Benchmark | Subtask | Accuracy | Number few shot | Model |\n|:----------------------------|:---------------|:----------|-----------:|:------------------|:---------|\n| language_understanding_lite | lambada_openai | | 0 | 0-shot | tiny_mpt |'
assert expected_results in out
expected_results = '| model_name | average | language_understanding_lite |\n|:-------------|----------:|------------------------------:|\n| tiny_mpt | 0 | 0 |'
expected_results = '| model_name | default_average | language_understanding_lite |\n|:-------------|------------------:|------------------------------:|\n| tiny_mpt | 0 | 0 |'
assert expected_results in out
17 changes: 14 additions & 3 deletions tests/test_eval_gauntlet.py
@@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

import os
from typing import Dict, List
from typing import Dict, List, Optional

import omegaconf as om
import pytest
@@ -53,7 +53,10 @@ def log_metrics(self, metrics: Dict[str, float]) -> None:
self.inmemorylogger.log_metrics(metrics)


def test_gauntlet_callback():
@pytest.mark.parametrize('averages', [{
'core_average': ['world_knowledge', 'language_understanding']
}, None])
def test_gauntlet_callback(averages: Optional[dict]):
icl_task_config = om.OmegaConf.create("""
- label: jeopardy_small
dataset_uri: eval/local_data/world_knowledge/jeopardy_small.jsonl # ADD YOUR OWN DATASET URI
@@ -87,6 +90,9 @@ def test_gauntlet_callback():
""")
assert isinstance(eval_gauntlet_config, om.DictConfig) or isinstance(
eval_gauntlet_config, str)

if averages is not None:
eval_gauntlet_config.averages = averages
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')

# test loading functionality
@@ -106,4 +112,9 @@ def test_gauntlet_callback():
name = f'icl/metrics/eval_gauntlet/{category}'
assert result[name] == pytest.approx(0.25)

assert result['icl/metrics/eval_gauntlet/average'] == pytest.approx(0.25)
if averages is None:
assert result[
'icl/metrics/eval_gauntlet/default_average'] == pytest.approx(0.25)
else:
assert result[
'icl/metrics/eval_gauntlet/core_average'] == pytest.approx(0.25)
29 changes: 21 additions & 8 deletions tests/test_training.py
@@ -6,7 +6,7 @@
import shutil
import sys
from argparse import Namespace
from typing import Any
from typing import Any, Optional

import pytest
from composer.loggers import InMemoryLogger
@@ -114,7 +114,11 @@ def set_correct_cwd():
os.chdir('..')


def test_train_gauntlet(set_correct_cwd: Any, tmp_path: pathlib.Path):
@pytest.mark.parametrize('averages', [{
'core_average': ['language_understanding_lite']
}, None])
def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any,
tmp_path: pathlib.Path):
"""Test training run with a small dataset."""
dataset_name = create_c4_dataset_xsmall(tmp_path)
test_cfg = gpt_tiny_cfg(dataset_name, 'cpu')
@@ -155,6 +159,9 @@ def test_train_gauntlet(set_correct_cwd: Any, tmp_path: pathlib.Path):
])
})

if averages is not None:
test_cfg.eval_gauntlet['averages'] = averages

test_cfg.icl_seq_len = 128
test_cfg.max_duration = '1ba'
test_cfg.eval_interval = '1ba'
@@ -167,14 +174,20 @@
inmemorylogger = trainer.logger.destinations[
0] # pyright: ignore [reportGeneralTypeIssues]
assert isinstance(inmemorylogger, InMemoryLogger)
assert 'icl/metrics/eval_gauntlet/average' in inmemorylogger.data.keys()
assert isinstance(inmemorylogger.data['icl/metrics/eval_gauntlet/average'],
list)
assert len(inmemorylogger.data['icl/metrics/eval_gauntlet/average'][-1]) > 0

category_name = 'default_average' if averages is None else 'core_average'
assert f'icl/metrics/eval_gauntlet/{category_name}' in inmemorylogger.data.keys(
)
assert isinstance(
inmemorylogger.data['icl/metrics/eval_gauntlet/average'][-1], tuple)
inmemorylogger.data[f'icl/metrics/eval_gauntlet/{category_name}'], list)
assert len(inmemorylogger.data[f'icl/metrics/eval_gauntlet/{category_name}']
[-1]) > 0
assert isinstance(
inmemorylogger.data[f'icl/metrics/eval_gauntlet/{category_name}'][-1],
tuple)

assert inmemorylogger.data['icl/metrics/eval_gauntlet/average'][-1][-1] == 0
assert inmemorylogger.data[f'icl/metrics/eval_gauntlet/{category_name}'][
-1][-1] == 0


def test_train_multi_eval(set_correct_cwd: Any, tmp_path: pathlib.Path):
