# utils.py
import csv
import torch
from random import shuffle
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from sklearn.metrics import classification_report

import config

def load_model(model_path):
    model = KeyedVectors.load_word2vec_format(model_path, binary=True)
    return model

def read_data_from_csv(filename, equalize=False, train=True, num_records=-1):
    data = []
    # only need this stuff for generating class-equalized data for training
    if equalize is True:
        num_change = int(num_records / 2)
        num_same = num_change + config.RNN_SAME_ADDITIONAL_RECORDS
        change_records = 0
        same_records = 0
    with open(filename, 'r', encoding='utf8') as rfile:
        reader = csv.reader(rfile)
        for index, row in enumerate(reader):
            if index == 0:
                continue
            # if equalize = True, data should be equalized for SAME and CHANGE
            if equalize is True:
                current_boundary = row[2].strip()
                if (current_boundary == '[SAME]' and same_records < num_same) or \
                        (current_boundary == '[CHANGE]' and change_records < num_change):
                    data.append({
                        'sent1': row[0].strip(),
                        'sent2': row[1].strip(),
                        'boundary': current_boundary
                    })
                    if current_boundary == '[SAME]':
                        same_records += 1
                    elif current_boundary == '[CHANGE]':
                        change_records += 1
                if same_records >= num_same and change_records >= num_change:
                    break
            # just add data as is. This will happen when equalize=False
            else:
                data.append({
                    'sent1': row[0].strip(),
                    'sent2': row[1].strip(),
                    'boundary': row[2].strip()
                })
                # if generating non-equalized training data, stopping condition should be num_records
                if train is True and equalize is False and index > num_records:
                    break
    # if train is False, we don't care about either equalization or a max number of records
    shuffle(data)
    return data

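# Illustrative input/output (the CSV layout is inferred from the column indices used
# above): the file is expected to have a header row followed by rows of the form
#   sent1,sent2,boundary
# where boundary is either '[SAME]' or '[CHANGE]'.
#   train_data = read_data_from_csv('train.csv', equalize=True, num_records=10000)
#   -> [{'sent1': '...', 'sent2': '...', 'boundary': '[SAME]'}, ...]
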
def print_evaluation_score(y_true, y_pred):
    # accuracy, precision, recall, and F-measure.
    target_names = ['Same', 'Change']
    print(classification_report(y_true, y_pred, target_names=target_names))

def get_word_vector(token, model):
    # this is for the padding vector
    if token == '0':
        return torch.zeros(300)
    try:
        return torch.Tensor(model[token])
    except KeyError:
        try:
            return torch.Tensor(model[token.lower()])
        except KeyError:
            return torch.Tensor(config.UNKNOWN_WORD_VECTOR)

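# Illustrative lookups (actual values depend on the loaded word2vec model and on
# config.UNKNOWN_WORD_VECTOR):
#   get_word_vector('0', model)      -> 300-dim zero tensor (padding token)
#   get_word_vector('Paris', model)  -> vector for 'Paris', falling back to 'paris'
#   get_word_vector('zqxjkv', model) -> config.UNKNOWN_WORD_VECTOR for OOV tokens
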
def generate_sent_vector(sent, model):
    sent_vector = []
    for token in sent:
        word_vector = get_word_vector(token, model)
        sent_vector.append(word_vector)
    return torch.stack(sent_vector)

def generate_batch_vectors(sent_batch, model, max_sent_len=None):
    sent_vectors = []
    sent_batch_tokenized = [word_tokenize(s) for s in sent_batch]
    if max_sent_len is None:
        max_sent_len = len(max(sent_batch_tokenized, key=lambda x: len(x)))
    for sent in sent_batch_tokenized:
        # padding the sentences with 0
        if len(sent) < max_sent_len:
            for i in range(len(sent), max_sent_len):
                sent.append('0')
        sent_vector = generate_sent_vector(sent, model)
        sent_vectors.append(sent_vector)
    return torch.stack(sent_vectors)

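# Illustrative usage (assumes a 300-dimensional embedding model, matching
# get_word_vector above):
#   batch = generate_batch_vectors(['Hello world', 'A short sentence here'], model)
#   batch.shape -> torch.Size([2, 4, 300])  # (batch_size, max_sent_len, embedding_dim)
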
def get_boundary_mapping(boundary_batch):
    mapped_boundaries = [config.BOUNDARY_TO_INT_MAPPING[b]
                         for b in boundary_batch]
    return mapped_boundaries

def combine_sents_bert_style(sent1, sent2):
    sent = "[CLS] " + sent1 + " [SEP] " + sent2 + " [SEP]"
    return sent

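# Example:
#   combine_sents_bert_style('He left.', 'She stayed.')
#   -> '[CLS] He left. [SEP] She stayed. [SEP]'
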
def preprocess_sents_bert_style(sent_data, tokenizer, max_sent_len):
    sents = []
    labels = []
    for elem in sent_data:
        sent1 = elem['sent1']
        sent2 = elem['sent2']
        combined_sent = combine_sents_bert_style(sent1, sent2)
        tokenized_sent = tokenizer.tokenize(combined_sent)
        # clip tokens to max_sent_len
        tokenized_sent = tokenized_sent[:max_sent_len]
        sents.append(tokenized_sent)
        labels.append(config.BOUNDARY_TO_INT_MAPPING[elem['boundary']])
    return sents, labels

def create_segment_masks(preprocessed_train_data, max_sent_len):
    segment_masks = []
    for sent in preprocessed_train_data:
        sent_id = 0
        sent_mask = []
        for token in sent:
            # append sent_id first, then check for the SEP token
            sent_mask.append(sent_id)
            # when the SEP token is found, switch sent_id to 1, since the new sentence starts here
            if token == '[SEP]':
                sent_id = 1
        while len(sent_mask) < max_sent_len:
            sent_mask.append(0)
        segment_masks.append(sent_mask)
    return segment_masks

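# Illustrative example for max_sent_len=8: tokens up to and including the first
# '[SEP]' get segment id 0, the rest get 1, and padding positions are filled with 0.
#   tokens: ['[CLS]', 'he', 'left', '[SEP]', 'she', '[SEP]']
#   mask:   [0, 0, 0, 0, 1, 1, 0, 0]
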
def prepare_data_bert(data, tokenizer, max_sent_len):
    print("\tTokenizing data...")
    preprocessed_data, labels = preprocess_sents_bert_style(
        data, tokenizer, max_sent_len)
    # max_sent_len = max(len(a) for a in preprocessed_data)
    # max sentence length for BERT is 512
    print("\tGetting numeric representations, padding and creating segment and attention masks...")
    # get numeric representations of tokens
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in preprocessed_data]
    # pad sequences
    padded_seqs = pad_sequence([torch.LongTensor(x) for x in input_ids], batch_first=True)
    # create segment masks for separating two sentences
    segment_masks = create_segment_masks(preprocessed_data, padded_seqs.size(1))
    # create attention masks
    attention_masks = []
    for seq in padded_seqs:
        seq_mask = [float(x > 0) for x in seq]
        attention_masks.append(seq_mask)
    print("\tCreating tensors...")
    # make everything a tensor
    tensor_seqs = torch.LongTensor(padded_seqs)
    tensor_labels = torch.LongTensor(labels)
    tensor_attention_masks = torch.LongTensor(attention_masks)
    tensor_segment_masks = torch.LongTensor(segment_masks)
    print("\tCreating dataset...")
    # batching
    batch_size = config.BERT_BATCH_SIZE
    # make an iterator
    tensor_data = TensorDataset(
        tensor_seqs, tensor_segment_masks, tensor_attention_masks, tensor_labels)
    tensor_sampler = RandomSampler(tensor_data)
    tensor_dataloader = DataLoader(
        tensor_data, sampler=tensor_sampler, batch_size=batch_size)
    return tensor_dataloader
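

# Illustrative end-to-end usage (a sketch, not part of the original module).
# Assumptions: the tokenizer comes from the `transformers` library, and the
# file path, model name, and maximum length below are placeholders.
if __name__ == '__main__':
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # read and shuffle the sentence pairs, then wrap them in a DataLoader
    train_data = read_data_from_csv('train.csv', equalize=True, num_records=10000)
    train_dataloader = prepare_data_bert(train_data, tokenizer, max_sent_len=128)
    for seqs, segment_masks, attention_masks, labels in train_dataloader:
        # each batch is ready to be fed to a BERT-based sentence-pair classifier
        print(seqs.shape, labels.shape)
        break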