# word_vec_fn.py
import os
import copy
import json

import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import zero_one_loss
from libact.base.dataset import Dataset
from libact.labelers import IdealLabeler
from libact.query_strategies import RandomSampling
from libact.query_strategies.multilabel import MultilabelWithAuxiliaryLearner
from libact.query_strategies.multilabel import MMC, BinaryMinimization
from libact.models.multilabel import BinaryRelevance
from libact.models import LogisticRegression, SVM
def remove_stop_words(sentence):
""" Removes stopwords from full_sentences.
Parameters
----------
sentence: list
List of tokenized words.
Returns
-------
filtered_sent : list
New sentence with stopwords removed.
"""
stop_words = stopwords.words('english')
stop_words.extend((',', '.')) # Removing punctuation increases accuracy.
filtered_sent = [w for w in sentence if w.lower() not in stop_words]
return filtered_sent
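# Illustrative sketch (commented out): NLTK's English stopword list covers
# words such as 'this', 'is' and 'a', so for a tokenized sentence we would
# expect something along the lines of:
#
# print remove_stop_words(['This', 'is', 'a', 'useful', 'example', '.'])
# # -> ['useful', 'example']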
def load_glove_model(gloveFile, word_count=100000):
"""
Loads a selection of the most common glove vectors into a dictionary.
    :param gloveFile: string
        Path to a text file of glove vectors. Each line contains a word
        followed by its 'n_features'-dimensional vector, where 'n_features'
        is the number of features.
    :param word_count: int, default: 100000
        Number of words to load from the gloveFile.
    :return: dictionary
        {'word': vector}
        word = string of the word we wish to load
        vector = 'n_features'-dimensional numpy array describing the word
"""
path = './data/gloveFile_done_%d.npy' % (word_count)
# Saves time by loading existing file, if available.
if os.path.exists(path):
glove = np.load(path).item()
        print '%d glove vectors loaded from existing file.' % len(glove)
    else:
        glove = {}
        count = 0
        with open(gloveFile, 'r') as f:
            for line in f:
                split_line = line.split()
                word = split_line[0]
                embedding = np.asarray([float(val) for val in split_line[1:]])
                glove[word] = embedding
                count += 1
                if count >= word_count:
                    break
        # Saves the vectors, so we can load them faster next time.
        np.save(path, glove)
        print '%d glove vectors loaded from %s.' % (len(glove), gloveFile)
return glove
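# Illustrative usage sketch (commented out). The glove path below is
# hypothetical; substitute whichever pre-trained glove text file you have.
#
# glove = load_glove_model('./data/glove.6B.100d.txt', word_count=100000)
# vec = glove.get('vacation')      # 'n_features'-dimensional numpy array
# if vec is not None:
#     print vec.shape              # e.g. (100,) for 100-d glove vectors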
def load_data_mal(model, loc='./data/maluuba/data_frames.json'):
"""Loads data directly from JSON. JSON dataset must contain the keys
"labels" and "dialog_list", for example:
"labels": [["inform"], ["suggest"], ["negate", "thankyou"]],
"dialog_list": [
"HI im from Alexandria looking to book a 12 day vacation",
"I only have flights available to Vancouver, Canada at the moment.",
"I will look elsewhere thank you!"]
    :param model: dictionary
        Glove word vectors, {'word': vector}, as returned by load_glove_model.
    :param loc: string
        Path to the JSON file in which the data is saved.
:return: X, numpy array shape (n_samples, n_features)
:return: Y, numpy array shape (n_samples, n_classes)
"""
X_list, Y_list = [], []
with open(loc, 'r') as file:
for line in file:
line = json.loads(line)
dialog_list = line.get('dialog_list', 'empty')
labels = line.get('labels', 'empty')
            # For now, each dialog turn is a separate example (context is ignored).
X_list.extend(dialog_list)
# All labels in Y_list correspond to each dialog turn as embedded
# lists. Example:
# [['label_7'],['label_2'], ['label_4','label_1'], ['label_3'], ..]
Y_list.extend(labels)
# Y is a numpy array, shape (n_samples, n_classes)
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(Y_list)
# Place each example (vector representation of a turn in conversation),
# as a row in a matrix. X has shape (n_examples, n_features)
X = np.asarray(sentences2vec(X_list, model))
return X, Y
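# Illustrative sketch (commented out) of the label binarization step above,
# using the example labels from the docstring:
#
# mlb = MultiLabelBinarizer()
# Y = mlb.fit_transform([["inform"], ["suggest"], ["negate", "thankyou"]])
# # mlb.classes_ -> ['inform', 'negate', 'suggest', 'thankyou']
# # Y            -> [[1 0 0 0]
# #                  [0 0 1 0]
# #                  [0 1 0 1]]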
def load_data_amz(model, loc='./data/amazon/amazon_mixed_dataset.json'):
"""Loads data directly from JSON. Input data should have the form:
...what about stop words???
:param loc: string
Locates the directory in which the data is saved.
:return: X, numpy array shape (n_samples, n_classes)
:return: Y, numpy array shape (n_samples, n_classes)
"""
X_list, Y_list = [], []
with open(loc, 'r') as file:
for line in file:
line = json.loads(line)
review = line.get('text', 'empty')
labels = line.get('labels', 'empty')
            # For now, each review is treated as a single example.
X_list.extend([review])
            # Each review carries a single category label; wrap it so Y_list
            # keeps the same nested-list structure as in load_data_mal, e.g.
            # [['label_7'], ['label_2'], ['label_4'], ...]
Y_list.extend([[labels]])
# Y is a numpy array, shape (n_samples, n_classes)
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(Y_list)
    # Place each example (vector representation of a review) as a row in a
    # matrix. X has shape (n_examples, n_features)
X = np.asarray(sentences2vec(X_list, model))
return X, Y
def sentences2vec(sentences, glove):
""" Uses the glove word vectors to convert whole sentences to single
vectors by averaging the input vectors.
    :param sentences: list
        List of all the dialog turns as a list of strings.
        Example:
        ['First conversation first turn.', 'First conversation second turn.',
         ..., 'Last conversation, last turn']
    :param glove: dictionary
        {'word': vector}, where vector has shape ('n_features',)
:return: list
List of sentences, where each sentence is represented by a
single vector.
"""
    if not isinstance(sentences, list):
        print "sentences2vec requires a list of sentences"
        return
sentence_vectors = []
for sentence in sentences:
sentence = word_tokenize(sentence)
# Optional: remove stop words
# stop_removed = remove_stop_words(sentence)
# If, when we remove stopwords, turns of the conversation have no
# content, we leave the stopwords in
# if len(stop_removed )!=0: sentence = stop_removed
        # Number of tokens in the sentence; the average below divides by this
        # count, so out-of-vocabulary words contribute zero to the mean.
        num_tokens = len(sentence)
        # All vectors have the same dimensionality. Using an example
        # 'word' to set the size of our new sentence vector.
sum_of_words = np.zeros(len(glove[',']))
# We will represent a sentence as an average of the words it contains.
for word in sentence:
sum_of_words += glove.get(word, 0)
        if num_tokens != 0:  # Necessary, to ensure we don't divide by zero.
            sentence_vector = sum_of_words / num_tokens
sentence_vectors.append(sentence_vector)
else:
# For empty reviews we will use a placeholder
# (will change code so this entry is deleted from dataset)
sentence_vectors.append(glove.get(',', 0))
return sentence_vectors
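# Illustrative sketch (commented out) of the averaging above. The toy glove
# dictionary and sentence are made up; word_tokenize needs NLTK's 'punkt'
# data to be installed.
#
# toy_glove = {'hello': np.array([1.0, 2.0]),
#              'world': np.array([3.0, 4.0]),
#              ',': np.array([0.0, 0.0])}  # ',' must exist: it sets the size
# print sentences2vec(['hello world'], toy_glove)
# # -> [array([2., 3.])], i.e. the mean of the two word vectors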
def full_trn_tst(X, Y, train_size, test_size, num_labelled):
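    """Splits X and Y into train/test subsets and wraps them in libact
    Dataset objects, revealing only the first 'num_labelled' training labels
    (the remaining training labels are stored as None, i.e. unlabelled).
    """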
# Preprocessing:
# X = StandardScaler().fit_transform(X)
# if args.pca: X = process_pca(args.pca, X)
# else: pass
# Randomly split the X and y arrays according to test_size.
X_trn, X_tst, Y_trn, Y_tst = train_test_split(X, Y, test_size=test_size,
train_size=train_size)
# We are giving the Dataset 'num_labelled' fully labelled examples
Y_trn_mix = Y_trn[:num_labelled].tolist()+[None]*(len(Y_trn)-num_labelled)
# Dataset is a libact object that stores labelled and unlabelled data
# (unlabelled examples are stored as None)
trn_ds = Dataset(X_trn, Y_trn_mix)
tst_ds = Dataset(X_tst, Y_tst.tolist())
fully_labeled_trn_ds = Dataset(X_trn, Y_trn)
return trn_ds, tst_ds, fully_labeled_trn_ds
def split_train_test(test_size, num_labelled):
"""
Generates synthetic data for multilabel problem.
:param test_size:
:param num_labelled:
:return:
"""
# choose a dataset with unbalanced class instances
# calls random sample generators to build artificial datasets
# data[0] are the examples; data[1] are the labels
data = make_multilabel_classification(
n_samples=300, n_classes=10, allow_unlabeled=False)
# Standardize features by removing the mean and scaling to unit variance
X = StandardScaler().fit_transform(data[0])
Y = data[1]
# Split X & Y into random train and test subsets
X_trn, X_tst, Y_trn, Y_tst = train_test_split(X, Y, test_size=test_size)
# We are giving the Dataset 'num_labelled' fully labelled examples
Y_trn_mix = Y_trn[:num_labelled].tolist() + [None] * (
len(Y_trn) - num_labelled)
# Dataset is a libact object that stores labelled and unlabelled data
# (unlabelled examples are stored as None)
trn_ds = Dataset(X_trn, Y_trn_mix)
tst_ds = Dataset(X_tst, Y_tst.tolist())
fully_labeled_trn_ds = Dataset(X_trn, Y_trn)
return trn_ds, tst_ds, fully_labeled_trn_ds
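# Illustrative sketch (commented out): the partially labelled Dataset keeps
# unlabelled entries as None, which libact's Dataset exposes through
# len_labeled() / len_unlabeled(). The split sizes below are arbitrary.
#
# trn_ds, tst_ds, fully_labeled_trn_ds = split_train_test(test_size=0.33,
#                                                         num_labelled=20)
# print trn_ds.len_labeled(), trn_ds.len_unlabeled()  # e.g. 20 and the rest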
def run(trn_ds, tst_ds, lbr, model, qs, quota, fully_labeled_trn_ds):
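    """Runs 'quota' rounds of active learning with query strategy 'qs',
    querying one label per round, and returns the Hamming loss and the
    subset 0/1 loss on the test set after each query.
    """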
C_out, C_out_01 = [], []
# Used to produce additional metrics not covered in libact
tst_X, tst_y = zip(*tst_ds.data)
for _ in range(quota):
# Query strategy (MMC, BinMin, RandomSampling,
# MultilabelWithAuxiliaryLearner) returns id of example to query
ask_id = qs.make_query()
# Returns the example corresponding to ask_id
X, _ = trn_ds.data[ask_id]
# Simulated oracle returns the label for the example, x
lb = lbr.label(X)
# Update training library with new label
trn_ds.update(ask_id, lb)
# Train the model (usually Binary Relevance) with the additional label
model.train(trn_ds)
        # score() evaluates the model on the test dataset. Here we use the
        # Hamming loss criterion, i.e. the fraction of labels that are
        # incorrectly predicted.
C_out = np.append(C_out, model.score(tst_ds, criterion='hamming'))
C_out_01 = np.append(C_out_01, zero_one_loss(np.asarray(tst_y), model.predict(tst_X)))
return C_out, C_out_01
def experiments(fully_labeled_trn_ds, trn_ds, tst_ds, quota):
"""
Runs the active learning experiment for each of the 6 query
strategies/criteria combinations.
:param fully_labeled_trn_ds: libact dataset
Fully labelled training examples
:param trn_ds: libact dataset
Training examples with missing labels
:param tst_ds: libact dataset
Test dataset
:param quota: int
number of queries
    :return: results: dict
        Key describes the type of experiment, while the value is the
        corresponding result from the loss functions. The value is a list
        holding the Hamming loss and the subset 0/1 loss, in positions 0 and
        1. Each of these is itself a list of the loss after each successive
        number of queries.
"""
# lbr simulates an oracle (method label(x) returns x's label)
lbr = IdealLabeler(fully_labeled_trn_ds)
    # This is the model which we train and use for predictions. It divides the
    # multilabel task into multiple binary tasks, training one independent
    # classifier for each label (unlike one-vs-all / one-vs-rest, which train
    # one classifier per possible value of a single label).
model = BinaryRelevance(LogisticRegression())
trn_ds2 = copy.deepcopy(trn_ds)
trn_ds3 = copy.deepcopy(trn_ds)
trn_ds4 = copy.deepcopy(trn_ds)
trn_ds5 = copy.deepcopy(trn_ds)
trn_ds6 = copy.deepcopy(trn_ds)
results = {}
print "before qs1"
# MMC requires a 'base learner' for its binary relevance
qs = MMC(trn_ds, br_base=LogisticRegression())
E_out_1, E_out_01_1 = run(trn_ds, tst_ds, lbr, model, qs, quota,
fully_labeled_trn_ds)
results['MMC'] = [E_out_1, E_out_01_1]
print "before qs2"
qs2 = RandomSampling(trn_ds2)
E_out_2, E_out_01_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota,
fully_labeled_trn_ds)
results['Random'] = [E_out_2, E_out_01_2]
print "before qs3"
    # MultilabelWithAuxiliaryLearner requires two multilabel models:
    # 1) major_learner:
    #    Gives a binary output on each label and is used for the final
    #    prediction.
    # 2) auxiliary_learner:
    #    Estimates the confidence on each label and gives a real-valued
    #    output (supports the predict_real method).
qs3 = MultilabelWithAuxiliaryLearner(
trn_ds3,
BinaryRelevance(LogisticRegression()),
BinaryRelevance(SVM()),
criterion='hlr')
E_out_3, E_out_01_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota,
fully_labeled_trn_ds)
results['Aux_hlr'] = [E_out_3, E_out_01_3]
print "before qs4"
qs4 = MultilabelWithAuxiliaryLearner(
trn_ds4,
BinaryRelevance(LogisticRegression()),
BinaryRelevance(SVM()),
criterion='shlr')
E_out_4, E_out_01_4 = run(trn_ds4, tst_ds, lbr, model, qs4, quota,
fully_labeled_trn_ds)
results['Aux_shlr'] = [E_out_4, E_out_01_4]
print "before qs5"
qs5 = MultilabelWithAuxiliaryLearner(
trn_ds5,
BinaryRelevance(LogisticRegression()),
BinaryRelevance(SVM()),
criterion='mmr')
E_out_5, E_out_01_5 = run(trn_ds5, tst_ds, lbr, model, qs5, quota,
fully_labeled_trn_ds)
results['Aux_mmr'] = [E_out_5, E_out_01_5]
print "before qs6"
    # BinaryMinimization only needs a ContinuousModel to evaluate uncertainty
qs6 = BinaryMinimization(trn_ds6, LogisticRegression())
E_out_6, E_out_01_6 = run(trn_ds6, tst_ds, lbr, model, qs6, quota,
fully_labeled_trn_ds)
results['BinMin'] = [E_out_6, E_out_01_6]
print "Done qs6"
return results
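###############################################################################
##### illustrative end-to-end usage (sketch, commented out) ###################
#
# The glove path, split sizes and quota below are hypothetical; the actual
# driver script is not part of this file.
#
# glove = load_glove_model('./data/glove.6B.100d.txt', word_count=100000)
# X, Y = load_data_mal(glove)
# trn_ds, tst_ds, fully_labeled_trn_ds = full_trn_tst(
#     X, Y, train_size=0.8, test_size=0.2, num_labelled=50)
# results = experiments(fully_labeled_trn_ds, trn_ds, tst_ds, quota=100)
# for name, (hamming, zero_one) in results.items():
#     print name, hamming[-1], zero_one[-1]
###############################################################################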
###############################################################################
##### function used for creating amazon dataset ###############################
#
# import json
# import gzip
# import os
# import glob
#
# def parse(path):
# g = gzip.open(path, 'r')
# for l in g:
# yield eval(l)
#
# with open('./data/amazon/amazon_mixed_dataset.json', 'w') as outfile:
# path = "./data/amazon/*.gz"
# for fname in glob.glob(path):
# i = 0
# for line in parse(fname):
# newline = {}
# # Should be a better way to do this?
# newline['labels'] = category = \
# os.path.basename(fname).split('.')[0][8:]
# newline['text'] = line['reviewText']
# json.dump(newline, outfile)
# outfile.write('\n')
# i += 1
# if i >= 1000: break
###############################################################################
###############################################################################