eval.py
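"""Evaluate generated predictions against reference answers.

Reads per-dataset prediction files from Eval/pred/<model>/*.jsonl (one JSON
object per line with "pred", "answers", "all_classes" and optionally
"length"), scores each dataset with the metric registered in dataset2metric,
and writes the aggregated scores to Eval/pred/<model>/result.json.

Usage:
    python eval.py --model Qwen2
"""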
import os
import json
import argparse
import numpy as np
from Eval.metrics import (
    qa_f1_score,
    qa_f1_zh_score,
    rouge_score,
    classification_score,
    rouge_zh_score
)


def parse_args(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default='Qwen2')
    return parser.parse_args(args)
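

# Map each dataset to its scoring metric; scorer() falls back to rouge_zh_score
# ('custom_zh') for dataset names not listed here.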
dataset2metric = {
    'multifieldqa_zh': qa_f1_zh_score,
    'multi_news': rouge_score,
    'trec': classification_score,
    'custom_zh': rouge_zh_score
}


# Compute the score for one dataset
def scorer(dataset, predictions, answers, all_classes):
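    """Average the per-example metric over all predictions.

    `predictions` and `answers` are parallel lists; `all_classes` is passed
    through to metrics that need it (e.g. classification on trec).
    Returns a percentage rounded to two decimals.
    """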
    total_score = 0.
    for (prediction, ground_truths) in zip(predictions, answers):
        score = 0.
        if dataset in ["trec"]:
            prediction = prediction.lstrip('\n').split('\n')[0]  # keep only the first line of the prediction
        if dataset in ['custom_zh', 'custom_en']:
            score = max(score, dataset2metric[dataset](prediction, ground_truths, all_classes=all_classes))
        else:
            score = max(score, dataset2metric.get(dataset, dataset2metric['custom_zh'])(prediction, ground_truths, all_classes=all_classes))
        # for ground_truth in ground_truths:
        #     score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
        total_score += score
    return round(100 * total_score / len(predictions), 2)


if __name__ == '__main__':
    scores = dict()
    args = parse_args()
    path = f"Eval/pred/{args.model}/"
    all_files = os.listdir(path)
    print("Evaluating on:", all_files)
    for file in all_files:
        if not file.endswith(".jsonl") or file == "result.json":
            continue
        predictions, answers, lengths = [], [], []
        dataset = file.split('.')[0]
        with open(f'{path}{file}', 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)  # parse the JSON string into a dict
                predictions.append(data["pred"])
                answers.append(data["answers"])
                all_classes = data["all_classes"]
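                # Note: all_classes is overwritten on every line; this assumes
                # it is identical for all examples in a dataset.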
if "length" in data:
lengths.append(data["length"])
score = scorer(dataset, predictions, answers, all_classes)
scores[dataset] = score
# 保存结果
out_path = f"Eval/pred/{args.model}/result.json"
with open(out_path, "w") as f:
json.dump(scores, f, ensure_ascii=False, indent=4)