forked from davidsbatista/StanfordNER-experiments
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvarious-scripts-folds-eval.py
108 lines (84 loc) · 3.45 KB
/
various-scripts-folds-eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import random
import subprocess
import sys
import os
from sklearn_crfsuite.metrics import flat_classification_report
base = "/usr/bin/java -Xmx10g -cp stanford-ner-2017-06-09/stanford-ner.jar " \
"edu.stanford.nlp.ie.crf.CRFClassifier -prop stanford_ner.prop "
def chunks(l, n):
"""Yield successive n-sized chunks from l."""
for i in range(0, len(l), n):
yield l[i:i + n]
def generate_folds(folds=3):
sentences = []
sentence = []
with open(sys.argv[1]) as f_input:
for line in f_input:
if len(line) == 1 and len(sentence) >= 1:
sentences.append(sentence)
sentence = []
elif len(line) > 1:
parts = line.strip().split('\t')
sentence.append((parts[0], parts[1]))
random.shuffle(sentences, random.random)
print("total", len(sentences))
print("k_folds: ", folds)
per_fold = round(len(sentences) / float(folds))
print("per fold: ", per_fold)
data_folds = list(chunks(sentences, per_fold + 1))
for i in range(len(data_folds)):
fold = data_folds[i]
with open("fold_" + str(i), "wt") as f_out:
for sentence in fold:
for token in sentence:
f_out.write(token[0] + '\t' + token[1] + '\n')
f_out.write('\n')
def run(train_data, test_data, fold):
print("Training on {} and testing on {}".format(train_data, test_data))
stanford_crf = "-trainFile corpus/{} " \
"-testFile corpus/{} " \
"1>tagged_fold_{}.csv " \
"2>results_fold_{}.txt " \
"-serializeTo model_{}.ser.gz".format(train_data, test_data, fold, fold, fold)
full_cmd = base + stanford_crf
print(full_cmd)
p = subprocess.Popen(full_cmd, shell=True)
sts = os.waitpid(p.pid, 0)
def evaluate():
results = ['tagged_fold_0.csv', 'tagged_fold_1.csv', 'tagged_fold_2.csv', 'tagged_fold_3.csv',
'tagged_fold_4.csv']
sentences_original = []
sentences_predicted = []
for i in range(len(results)):
with open(results[i], 'rt') as f_input:
sentence_original = []
sentence_predicted = []
for line in f_input:
line_parts = line.split('\t')
if len(line_parts) == 3:
word, true, predicted = line.split('\t')
sentence_original.append(true)
sentence_predicted.append(predicted.strip())
elif len(line_parts) == 1 and line_parts[0] == '\n':
sentences_original.append(sentence_original)
sentences_predicted.append(sentence_predicted)
sentence_original = []
sentence_predicted = []
labels = ['B-LOC', 'I-LOC',
'B-ORG', 'I-ORG',
'B-PER', 'I-PER',
'B-WRK', 'I-WRK']
results = flat_classification_report(sentences_original, sentences_predicted, digits=3,
labels=labels)
print(results)
def main():
# generate_folds(folds=5)
experiments = [('fold_1_2_3_4', 'fold_0'), ('fold_0_2_3_4', 'fold_1'),
('fold_0_1_3_4', 'fold_2'), ('fold_0_1_2_4', 'fold_3'),
('fold_0_1_2_3', 'fold_4')]
for fold in range(len(experiments)):
data = experiments[fold]
run(data[0], data[1], fold)
# evaluate()
if __name__ == "__main__":
main()