-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_polarities.py
63 lines (58 loc) · 2.31 KB
/
create_polarities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import json
# Collapses a lexicon entry's (subjectivity type, prior polarity) pair into a
# single polarity label. Both neutral variants share the label "neutral".
# There is no entry for the prior polarity "both"; the lexicon loader skips
# those lines before it ever consults this table.
type_prior_polarity_to_polarity = {
    # negative
    ("strongsubj", "negative"): "strongneg",
    ("weaksubj", "negative"): "weakneg",
    # neutral (subjectivity strength is not distinguished)
    ("weaksubj", "neutral"): "neutral",
    ("strongsubj", "neutral"): "neutral",
    # positive
    ("weaksubj", "positive"): "weakpos",
    ("strongsubj", "positive"): "strongpos",
}
# Build the word -> polarity-label lookup from the subjectivity lexicon file.
# Each lexicon line is treated as a whitespace-separated run of key=value
# fields, read by position: field 0 holds the subjectivity type, field 2 the
# word, and field 5 the prior polarity.
# NOTE(review): the field order is taken from the positional parsing this code
# has always used; confirm it against the lexicon's own README.
# If a word occurs on several lines, the last occurrence wins.
word_to_polarity = {}
with open("./resources/subjclueslen1-HLTEMNLP05.tff", 'r') as f:
    for line_number, line in enumerate(f, start=1):
        # Keep only the value part of well-formed key=value tokens. A stray
        # token without "=" would otherwise shift every later position and
        # make the lookups below read the wrong field or fail outright.
        values = [
            token.split("=", 1)[1] for token in line.split() if "=" in token
        ]
        if not values:
            # Blank line (or one with no key=value fields at all).
            continue
        if len(values) < 6:
            raise ValueError(
                "malformed lexicon entry on line %d: %r" % (line_number, line)
            )

        type_ = values[0]
        word = values[2]
        prior_polarity = values[5]

        # ignore "both"
        if prior_polarity == "both":
            continue

        # An unknown (type, polarity) combination raises KeyError, as before.
        polarity = type_prior_polarity_to_polarity[(type_, prior_polarity)]
        word_to_polarity[word] = polarity
# Corpus files to annotate. The list is empty, so the loop below currently
# does nothing; the names used previously are kept here for reference:
# ["acl_dev_eval.json", "acl_dev_tune.json", "acl_mpqa_eval.json", "acl_test.json"]
filenames = []

for filename in filenames:
    print(filename)
    source_path = "./data/stanford_label_sent_boundary_with_label/" + filename
    target_path = "./data/" + filename

    # Read every JSON-lines record up front and attach one polarity label per
    # token; tokens that are not in the lexicon are labelled "<unk>".
    all_data = []
    with open(source_path, "r", encoding="latin1") as af:
        for raw_record in af:
            annot = json.loads(raw_record)
            annot["polarity"] = [
                word_to_polarity.get(token, "<unk>") for token in annot["token"]
            ]
            all_data.append(annot)

    # Only after the whole input has been read, write the annotated records
    # out again, one JSON object per line.
    with open(target_path, "w", encoding="latin1") as wf:
        for record in all_data:
            json.dump(record, wf)
            wf.write('\n')

    # Disabled sanity check, kept as an inert string literal (it is never
    # executed).
    '''
    print("evaluating validity...")
    # testing
    with open("./data/new_annot/polarity/" + filename, "r", encoding="latin1") as af:
        for line in af:
            annot = json.loads(line)
            polarities = annot["polarity"]
            tokens = annot["token"]
            if len(polarities) != len(tokens):
                print("length problem: " + str(annot))
            for i in range(0, len(polarities)):
                if polarities[i] == "<unk>":
                    if tokens[i] in word_to_polarity.keys():
                        print("something not labelled")
                else:
                    if word_to_polarity[tokens[i]] != polarities[i]:
                        print("wrong token")
    '''