preprocess.py
import os

import numpy as np

from config import Config

args = Config()
args.load_config()
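# Config is the repository's own settings holder; load_config() is assumed to
# populate args.data_folder and args.dataset, which every loader below uses to
# locate the dataset directory.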

def read_entity_from_id():
    """Read entity2id.txt and return a dict mapping entity name to integer id."""
    folder = "{data_folder}/{dataset}/".format(
        data_folder=args.data_folder, dataset=args.dataset)
    entity2id = {}
    with open(folder + "entity2id.txt", 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) > 1:
                entity2id[parts[0]] = int(parts[1])
    return entity2id

def read_relation_from_id():
    """Read relation2id.txt and return a dict mapping relation name to integer id."""
    folder = "{data_folder}/{dataset}/".format(
        data_folder=args.data_folder, dataset=args.dataset)
    relation2id = {}
    with open(folder + "relation2id.txt", 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) > 1:
                relation2id[parts[0]] = int(parts[1])
    return relation2id
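
# Both id files are plain text with one whitespace-separated
# "<name> <integer id>" pair per line. A hypothetical sample (the actual
# names depend on the dataset):
#
#   /m/027rn    0
#   /m/06cx9    1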

def init_embeddings():
    """Load pretrained entity and relation embeddings from text files."""
    folder = "{data_folder}/{dataset}/".format(
        data_folder=args.data_folder, dataset=args.dataset)
    entity_file = folder + 'entity2vec.txt'
    relation_file = folder + 'relation2vec.txt'

    entity_emb, relation_emb = [], []
    with open(entity_file) as f:
        for line in f:
            entity_emb.append([float(val) for val in line.strip().split()])
    with open(relation_file) as f:
        for line in f:
            relation_emb.append([float(val) for val in line.strip().split()])

    return np.array(entity_emb, dtype=np.float32), np.array(relation_emb, dtype=np.float32)
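
# entity2vec.txt / relation2vec.txt are expected to hold one whitespace-
# separated float vector per line, presumably with row k being the embedding
# of id k from the corresponding id file. Hypothetical sample:
#
#   0.0125 -0.3412 0.2201 ...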

def parse_line(line):
    """Split a triple line into (head entity, relation, tail entity)."""
    e1, relation, e2 = line.strip().split()[:3]
    return e1, relation, e2

def load_data(filename, entity2id, relation2id, is_unweigted=False, directed=True):
    """Read a triple file and return (triples, adjacency lists, unique entities).

    The adjacency matrix is returned in COO form: `rows` and `cols` hold the
    entity ids at the two ends of each edge, and `data` holds the edge weight
    (the relation id, or 1 when is_unweigted is True). An edge from tail to
    source is always added so the target entity knows which relation connects
    it to the source; when directed is False, the source-to-tail edge is
    added as well, making the adjacency matrix symmetric.
    """
    with open(filename) as f:
        lines = f.readlines()

    # List of (head id, relation id, tail id) triples.
    triples_data = []
    # COO representation of the sparse adjacency matrix.
    rows, cols, data = [], [], []
    unique_entities = set()
    for line in lines:
        e1, relation, e2 = parse_line(line)
        unique_entities.add(e1)
        unique_entities.add(e2)
        triples_data.append(
            (entity2id[e1], relation2id[relation], entity2id[e2]))

        if not directed:
            # Undirected graph: also connect the source to the tail entity.
            rows.append(entity2id[e1])
            cols.append(entity2id[e2])
            data.append(1 if is_unweigted else relation2id[relation])

        # Connect the tail back to the source entity (always added).
        rows.append(entity2id[e2])
        cols.append(entity2id[e1])
        data.append(1 if is_unweigted else relation2id[relation])

    print("number of unique entities ->", len(unique_entities))
    return triples_data, (rows, cols, data), list(unique_entities)
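
# Illustrative sketch, not called anywhere in this module: the COO lists
# returned by load_data can be assembled into a real sparse matrix. scipy is
# an assumed extra dependency here, not something the rest of the file needs.
def _example_adjacency_to_sparse(adjacency_mat, num_entities):
    from scipy.sparse import coo_matrix
    rows, cols, data = adjacency_mat
    # One entry per stored edge; coo_matrix keeps duplicate (row, col)
    # entries and sums them if the matrix is later converted to CSR.
    return coo_matrix((data, (rows, cols)),
                      shape=(num_entities, num_entities))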

def build_data(path='./data', is_unweigted=False, directed=True):
    """Load train/valid/test triples, adjacency matrices and id mappings.

    The adjacency matrix is only required for the training phase; whether it
    is weighted and directed is controlled by the two flags.
    """
    entity2id = read_entity_from_id()
    relation2id = read_relation_from_id()

    folder = "{path}/{dataset}/".format(path=path, dataset=args.dataset)
    train_triples, train_adjacency_mat, unique_entities_train = load_data(
        os.path.join(folder, 'train.txt'), entity2id, relation2id, is_unweigted, directed)
    validation_triples, valid_adjacency_mat, unique_entities_validation = load_data(
        os.path.join(folder, 'valid.txt'), entity2id, relation2id, is_unweigted, directed)
    test_triples, test_adjacency_mat, unique_entities_test = load_data(
        os.path.join(folder, 'test.txt'), entity2id, relation2id, is_unweigted, directed)

    id2entity = {v: k for k, v in entity2id.items()}
    id2relation = {v: k for k, v in relation2id.items()}

    left_entity, right_entity = {}, {}
    with open(os.path.join(folder, 'train.txt')) as f:
        lines = f.readlines()

    for line in lines:
        e1, relation, e2 = parse_line(line)

        # Count the number of occurrences of each (e1, relation) pair.
        if relation2id[relation] not in left_entity:
            left_entity[relation2id[relation]] = {}
        if entity2id[e1] not in left_entity[relation2id[relation]]:
            left_entity[relation2id[relation]][entity2id[e1]] = 0
        left_entity[relation2id[relation]][entity2id[e1]] += 1

        # Count the number of occurrences of each (relation, e2) pair.
        if relation2id[relation] not in right_entity:
            right_entity[relation2id[relation]] = {}
        if entity2id[e2] not in right_entity[relation2id[relation]]:
            right_entity[relation2id[relation]][entity2id[e2]] = 0
        right_entity[relation2id[relation]][entity2id[e2]] += 1

    # Average number of triples per distinct head entity (tails per head).
    left_entity_avg = {}
    for i in range(len(relation2id)):
        left_entity_avg[i] = sum(
            left_entity[i].values()) * 1.0 / len(left_entity[i])

    # Average number of triples per distinct tail entity (heads per tail).
    right_entity_avg = {}
    for i in range(len(relation2id)):
        right_entity_avg[i] = sum(
            right_entity[i].values()) * 1.0 / len(right_entity[i])

    # Per-relation selector scaled to [0, 1000], used during negative sampling
    # to decide whether to corrupt the head or the tail entity (Bernoulli
    # sampling over the two averages above).
    headTailSelector = {}
    for i in range(len(relation2id)):
        headTailSelector[i] = 1000 * right_entity_avg[i] / \
            (right_entity_avg[i] + left_entity_avg[i])

    return (train_triples, train_adjacency_mat), \
        (validation_triples, valid_adjacency_mat), \
        (test_triples, test_adjacency_mat), \
        entity2id, relation2id, headTailSelector, unique_entities_train
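
# Minimal usage sketch, assuming the train/valid/test files described above
# exist under args.data_folder/args.dataset:
if __name__ == '__main__':
    train_data, validation_data, test_data, entity2id, relation2id, \
        headTailSelector, unique_entities_train = build_data(
            path=args.data_folder, is_unweigted=False, directed=True)
    entity_embeddings, relation_embeddings = init_embeddings()
    print("entities:", len(entity2id), "relations:", len(relation2id))
    print("train triples:", len(train_data[0]))
    print("entity embedding shape:", entity_embeddings.shape)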