-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_doc_features.py
122 lines (107 loc) · 4.89 KB
/
generate_doc_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import json
import random
# Names of the JSON-lines files under INPUT_DIR to process. Left empty, so the
# loop at the bottom is a no-op until files are listed here, e.g.
# ["acl_test.json", "acl_dev_eval.json", "acl_dev_tune.json", "acl_mpqa_eval.json"]
files = []

INPUT_DIR = "./data/"
OUTPUT_DIR = "./data/final/"
FILE_ENCODING = "latin1"

# Document whose per-rank entity lists are echoed to stdout while ranking
# (debugging aid).
DEBUG_DOC_ID = "AFP_ENG_20101230.0350"


def _read_annotations(path):
    """Return the list of JSON objects stored one per line in *path*."""
    with open(path, "r", encoding=FILE_ENCODING) as cf:
        return [json.loads(line) for line in cf]


def _write_annotations(path, annotations):
    """Write *annotations* to *path* as JSON lines (one object per line)."""
    with open(path, "w", encoding=FILE_ENCODING) as wf:
        for annot in annotations:
            json.dump(annot, wf)
            wf.write("\n")


def add_co_occurrence_feature(annotations):
    """Add a ``"co_occurrences"`` string field to every record, in place.

    For each document id (``"docid"``) the *first* record seen is scanned:
    ``"holder_target"`` and ``"token"`` are read at the same positions, a
    ``"."`` token closes a sentence, and a sentence counts as a co-occurrence
    when it contained both a ``"holder"`` and a ``"target"`` tag. Every record
    of the document receives that count, converted to ``str``.

    This pass is not run by the main loop below; call it there instead of (or
    before) ``add_rank_features`` to generate this feature.

    NOTE(review): the count is taken from the first record of a document and
    reused for all of its records, even though later records may carry
    different ``"holder_target"`` tags -- confirm this is intended.

    Returns:
        A ``(doc_to_sentences, doc_to_sentences_co_occur)`` tuple of dicts
        keyed by document id.
    """
    doc_to_sentences = {}
    doc_to_sentences_co_occur = {}
    for annot in annotations:
        docid = annot["docid"]
        if docid not in doc_to_sentences:
            sentences = 0
            sentences_co_occur = 0
            has_holder = False
            has_target = False
            for i, role in enumerate(annot["holder_target"]):
                if role == "holder":
                    has_holder = True
                if role == "target":
                    has_target = True
                if annot["token"][i] == ".":
                    # Sentence boundary: tally it, then reset the flags.
                    sentences += 1
                    if has_holder and has_target:
                        sentences_co_occur += 1
                    has_holder = False
                    has_target = False
            doc_to_sentences[docid] = sentences
            doc_to_sentences_co_occur[docid] = sentences_co_occur
        annot["co_occurrences"] = str(doc_to_sentences_co_occur[docid])
    return doc_to_sentences, doc_to_sentences_co_occur


def add_rank_features(annotations, debug_doc_id=None):
    """Add ``"holder_rank"`` and ``"target_rank"`` to every record, in place.

    Within each document (``"docid"``) an entity's mention count is the length
    of its ``"holder_index"`` / ``"target_index"`` list. Entities are ranked by
    descending mention count, starting at 1; entities with the same count
    share a rank and the next distinct count gets the next rank.

    Args:
        annotations: list of dicts with the keys ``"docid"``, ``"holder"``,
            ``"target"``, ``"holder_index"`` and ``"target_index"``.
        debug_doc_id: if a document has this id, the list of entities at each
            rank is printed, highest rank first.

    Returns:
        ``(max_rank, rank_distr)``: the largest rank assigned in any document,
        and a dict mapping each rank to the number of documents that use it.

    Raises:
        ValueError: if one entity is recorded with two different mention
            counts inside the same document. This is an explicit exception
            rather than ``assert`` so the check survives ``python -O``.
    """
    # docid -> {entity: mention count}
    doc_to_entity_to_num_mentions = {}
    for annot in annotations:
        per_doc = doc_to_entity_to_num_mentions.setdefault(annot["docid"], {})
        for role in ("holder", "target"):
            entity = annot[role]
            num_mentions = len(annot[role + "_index"])
            known = per_doc.setdefault(entity, num_mentions)
            if known != num_mentions:
                raise ValueError(
                    "inconsistent mention count for entity {!r} in document "
                    "{!r}: {} vs {}".format(
                        entity, annot["docid"], known, num_mentions
                    )
                )

    doc_to_entity_to_rank = {}
    rank_distr = {}
    max_rank = 0
    for doc, per_doc in doc_to_entity_to_num_mentions.items():
        # Distinct counts, most-mentioned first; position + 1 is the rank.
        distinct_counts = sorted(set(per_doc.values()), reverse=True)
        rank_of_count = {
            count: rank for rank, count in enumerate(distinct_counts, start=1)
        }
        if doc == debug_doc_id:
            for count in distinct_counts:
                print([e for e, c in per_doc.items() if c == count])
        for rank in rank_of_count.values():
            rank_distr[rank] = rank_distr.get(rank, 0) + 1
        max_rank = max(max_rank, len(distinct_counts))
        doc_to_entity_to_rank[doc] = {
            entity: rank_of_count[count] for entity, count in per_doc.items()
        }

    for annot in annotations:
        entity_to_rank = doc_to_entity_to_rank[annot["docid"]]
        annot["holder_rank"] = entity_to_rank[annot["holder"]]
        annot["target_rank"] = entity_to_rank[annot["target"]]
    return max_rank, rank_distr


for filename in files:
    print(filename)
    all_data = _read_annotations(INPUT_DIR + filename)
    max_rank, rank_distr = add_rank_features(all_data, debug_doc_id=DEBUG_DOC_ID)
    print(max_rank)
    print(rank_distr)
    _write_annotations(OUTPUT_DIR + filename, all_data)