Add gpqa eval task (#1117)
* lm_eval add gpqa task

* add test code
CL-ModelCloud authored Jan 20, 2025
1 parent cf5f1b1 commit 2b3f5eb
Showing 2 changed files with 20 additions and 6 deletions.
gptqmodel/utils/eval.py: 1 addition, 0 deletions
@@ -25,6 +25,7 @@ class LM_EVAL(Enum):
     MMLU = "mmlu"
     HELLASWAG = "hellaswag"
     GSM8K_COT = "gsm8k_cot"
+    GPQA = "gpqa"
 
 class EVALPLUS(Enum):
     HUMAN = "humaneval"
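With the new enum member in place, GPQA can be requested like any other lm-eval task. Below is a minimal sketch of what a call might look like; the import path, model id, and output path are assumptions for illustration and are not part of this commit. The call shape mirrors the test change that follows.

from gptqmodel import GPTQModel
from gptqmodel.utils.eval import EVAL  # assumed import path, based on the file changed above

# Placeholder model id, for illustration only.
MODEL_ID = "ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit"

# Same call shape as tests/test_eval.py below; the gpu_memory_utilization
# hint is only passed when the vllm backend is used.
results = GPTQModel.eval(
    MODEL_ID,
    framework=EVAL.LM_EVAL,
    tasks=[EVAL.LM_EVAL.GPQA],
    batch=32,
    output_file="gpqa_result.json",
    backend="vllm",
    extra_model_args="gpu_memory_utilization=0.7",
)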
tests/test_eval.py: 19 additions, 6 deletions
@@ -34,20 +34,33 @@ def setUpClass(self):
             (EVAL.LM_EVAL, EVAL.LM_EVAL.ARC_CHALLENGE, 'gptqmodel'),
             (EVAL.EVALPLUS, EVAL.EVALPLUS.HUMAN, 'gptqmodel'),
             (EVAL.LM_EVAL, EVAL.LM_EVAL.ARC_CHALLENGE, 'vllm'),
-            (EVAL.EVALPLUS, EVAL.EVALPLUS.HUMAN, 'vllm')
+            (EVAL.EVALPLUS, EVAL.EVALPLUS.HUMAN, 'vllm'),
+            (EVAL.LM_EVAL, EVAL.LM_EVAL.GPQA, 'vllm'),
         ]
     )
     def test_eval_gptqmodel(self, eval_backend: EVAL, task: Union[EVAL.LM_EVAL, EVAL.EVALPLUS], backend: str):
         with tempfile.TemporaryDirectory() as tmp_dir:
             output_file = f"{tmp_dir}/result.json"
+            extra_model_args = ""
+            if task == EVAL.LM_EVAL.GPQA:
+                extra_model_args = "gpu_memory_utilization=0.7"
 
             results = GPTQModel.eval(self.MODEL_ID, framework=eval_backend, tasks=[task], batch=32,
-                                     output_file=output_file, backend=backend)
+                                     output_file=output_file, backend=backend, extra_model_args=extra_model_args)
 
             if eval_backend == EVAL.LM_EVAL:
-                acc_score = results['results'].get(task.value, {}).get('acc,none')
-                acc_norm_score = results['results'].get(task.value, {}).get('acc_norm,none')
+                if task == EVAL.LM_EVAL.GPQA:
+                    gpqa_main_n_shot = results['results'].get('gpqa_main_n_shot', {}).get('acc,none')
+                    gpqa_main_zeroshot = results['results'].get('gpqa_main_zeroshot', {}).get('acc,none')
+
+                    self.assertGreaterEqual(gpqa_main_n_shot, 0.21, "acc score does not match expected result")
+                    self.assertGreaterEqual(gpqa_main_zeroshot, 0.25, "acc_norm score does not match expected result")
+                else:
+                    acc_score = results['results'].get(task.value, {}).get('acc,none')
+                    acc_norm_score = results['results'].get(task.value, {}).get('acc_norm,none')
 
-                self.assertGreaterEqual(acc_score, 0.28, "acc score does not match expected result")
-                self.assertGreaterEqual(acc_norm_score, 0.32, "acc_norm score does not match expected result")
+                    self.assertGreaterEqual(acc_score, 0.28, "acc score does not match expected result")
+                    self.assertGreaterEqual(acc_norm_score, 0.32, "acc_norm score does not match expected result")
             elif eval_backend == EVAL.EVALPLUS:
                 result = results.get(task.value)
                 base_formatted, plus_formatted, _ = float(result.get("base tests")), float(
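One detail worth noting in the test change: unlike the other lm-eval tasks, the GPQA assertions do not look results up by task.value. The gpqa selection expands into subtasks in lm-eval's results dict, so scores are read per subtask name; gpqa_main_n_shot and gpqa_main_zeroshot are the two checked here. A small sketch, not part of this commit, of how whichever GPQA subtask scores were reported could be collected generically:

# Sketch only: gather accuracy for every GPQA subtask present in the
# lm-eval results, instead of hard-coding the two keys checked above.
def gpqa_scores(results: dict) -> dict:
    return {
        name: metrics.get("acc,none")
        for name, metrics in results.get("results", {}).items()
        if name.startswith("gpqa")
    }

# e.g. gpqa_scores(results) might return
# {"gpqa_main_n_shot": <acc>, "gpqa_main_zeroshot": <acc>}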
