diff --git a/evals/evaluation/lm_evaluation_harness/accuracy.py b/evals/evaluation/lm_evaluation_harness/accuracy.py
index bc00f25b..01181e70 100644
--- a/evals/evaluation/lm_evaluation_harness/accuracy.py
+++ b/evals/evaluation/lm_evaluation_harness/accuracy.py
@@ -16,7 +16,7 @@
 import numpy as np
 from lm_eval import utils
-from lm_eval.logging_utils import WandbLogger
+from lm_eval.loggers import WandbLogger
 from lm_eval.tasks import TaskManager
 from lm_eval.utils import make_table, simple_parse_args_string
diff --git a/evals/evaluation/lm_evaluation_harness/lm_eval/evaluator.py b/evals/evaluation/lm_evaluation_harness/lm_eval/evaluator.py
index ff7c9c1a..b8b5445d 100644
--- a/evals/evaluation/lm_evaluation_harness/lm_eval/evaluator.py
+++ b/evals/evaluation/lm_evaluation_harness/lm_eval/evaluator.py
@@ -28,7 +28,7 @@
     print_writeout,
     run_task_tests,
 )
-from lm_eval.logging_utils import add_env_info, get_git_commit_hash
+from lm_eval.loggers.utils import add_env_info, get_git_commit_hash
 from lm_eval.tasks import TaskManager, get_task_dict
 from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string
@@ -472,7 +472,14 @@ def evaluate(
     # aggregate results ; run bootstrap CIs
     for task_output in eval_tasks:
         task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
-    results, samples, configs, versions, num_fewshot = consolidate_results(eval_tasks)
+    (
+        results,
+        samples,
+        configs,
+        versions,
+        num_fewshot,
+        higher_is_better,
+    ) = consolidate_results(eval_tasks)

     ### Calculate group metrics ###
     if bool(results):
@@ -483,6 +490,23 @@ def evaluate(
                 # or `task_name: []`.
                 # we only want to operate on groups here.
                 continue
+
+            # collect all higher_is_better values for metrics
+            # in the group's subtasks.
+            # TODO: clean this up ; unify with the below metric_list loop?
+            _higher_is_better = {}
+            for task in task_list:
+                for m, h in higher_is_better[task].items():
+                    if m not in _higher_is_better.keys():
+                        _higher_is_better[m] = h
+                    if m in _higher_is_better and _higher_is_better[m] is not None and _higher_is_better[m] != h:
+                        eval_logger.warning(
+                            f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
+                        )
+                        _higher_is_better[m] = None
+            higher_is_better[group] = _higher_is_better
+
+            # collect all metric keys used by a subtask in the group.
             metric_list = list(
                 {
                     key
@@ -507,10 +531,8 @@ def evaluate(
                 else:
                     results[group][stderr] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
                     # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
-                    # To use the old (likely incorrect) variance formula,
-                    # comment out the above and uncomment this line:
-                    # results[group][stderr] = \
-                    #     lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
+                    # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
+                    # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)

             results[group]["samples"] = sum(sizes)

@@ -540,6 +562,17 @@ def evaluate(
         "configs": dict(sorted(configs.items())),
         "versions": dict(sorted(versions.items())),
         "n-shot": dict(sorted(num_fewshot.items())),
+        "higher_is_better": dict(sorted(higher_is_better.items())),
+        "n-samples": {
+            task_output.task_name: {
+                "original": len(task_output.task.eval_docs),
+                "effective": min(
+                    limit if limit else len(task_output.task.eval_docs),
+                    len(task_output.task.eval_docs),
+                ),
+            }
+            for task_output in eval_tasks
+        },
     }
     if log_samples:
         results_dict["samples"] = dict(samples)
diff --git a/requirements.txt b/requirements.txt
index 483cf00a..02087cf8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@a1b4a7949a24c8e3ef0d05a01097b2d14ffba56e
 langchain_community
 langchain_huggingface
-lm-eval==0.4.2
+lm-eval==0.4.3
 ragas
diff --git a/tests/requirements.txt b/tests/requirements.txt
index 483cf00a..02087cf8 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -1,5 +1,5 @@
 bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@a1b4a7949a24c8e3ef0d05a01097b2d14ffba56e
 langchain_community
 langchain_huggingface
-lm-eval==0.4.2
+lm-eval==0.4.3
 ragas
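
For reference, a minimal sketch (not part of the patch) of the shape of the two result keys this change adds to the dictionary returned by the evaluator; the task name, metric names, and counts below are hypothetical placeholders.

# Illustrative sketch only: approximate shape of the "higher_is_better" and
# "n-samples" entries added to the evaluator's results dict when moving to
# lm-eval 0.4.3. Task name, metrics, and counts are made-up placeholders.
example_results = {
    "higher_is_better": {
        # per-task map of metric name -> whether larger values are better
        "hellaswag": {"acc": True, "acc_norm": True},
    },
    "n-samples": {
        # per-task original doc count and the effective count after any limit
        "hellaswag": {"original": 10042, "effective": 100},
    },
}
print(example_results["n-samples"]["hellaswag"]["effective"])  # 100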