# Figure out which word in a list doesn't belong!
# Use word2vec for vectorization.
# Compute the mean of the vectors and find the one which is furthest from it.
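# For example (a hypothetical case, assuming all four words are in the loaded
# vocabulary): given ['red', 'green', 'blue', 'dog'], the three color vectors
# sit close to their mean, so 'dog' should come back as the odd one out.
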
import os

import numpy as np
from sklearn.cluster import MiniBatchKMeans

def load_word2vec(dirpath):
    # new: since GitHub has a 100 MB file-size limit, load the vectors from
    # a bunch of files in a directory and merge them.
    word2vec = {}
    for path in os.listdir(dirpath):
        # load the word2vec features.
        with open(os.path.join(dirpath, path), 'r') as fin:
            if path == 'vectors0.txt':
                next(fin)  # skip the informational header on the first line
            for line in fin:
                items = line.strip().split(' ')
                if len(items) < 10:
                    continue  # skip blank or malformed lines
                word = items[0]
                vect = np.array([float(i) for i in items[1:] if i])
                word2vec[word] = vect
    return word2vec
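
# A hypothetical sample of the vector-file format this parser expects: each
# line is a word followed by its space-separated embedding components (the
# values below are made up, and the real dimension depends on the model):
#   the 0.418 0.24968 -0.41242 0.1217 ...
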
def get_furthest_word(words, word2vect):
    vectlist = []
    for word in words:
        # unknown word? guess it's the one that doesn't belong.
        if word not in word2vect:
            return word
        # normalize so every word contributes equally to the mean.
        vectlist.append(word2vect[word] / np.linalg.norm(word2vect[word]))
    mean = np.array(vectlist).mean(axis=0)
    mean = mean / np.linalg.norm(mean)
    # figure out which vector is furthest from the mean.
    dists = [np.linalg.norm(v - mean) for v in vectlist]
    return words[np.argmax(dists)]
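
# A quick sanity check (hypothetical, assuming these words are all in the
# loaded vocabulary):
#   get_furthest_word(['apple', 'banana', 'cherry', 'car'], word2vec)
# should return 'car', whose normalized vector lies furthest from the mean.
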
def cluster_vects(word2vect):
    # use sklearn MiniBatchKMeans to cluster the vectors.
    clusters = MiniBatchKMeans(n_clusters=25, max_iter=10, batch_size=200,
                               n_init=1, init_size=2000)
    X = np.array(list(word2vect.values()))
    y = list(word2vect.keys())
    print('fitting kmeans, may take some time')
    clusters.fit(X)
    print('done.')
    # now we can get a mapping from word -> label,
    # which will let us figure out which other words are in the same cluster.
    return {word: label for word, label in zip(y, clusters.labels_)}
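
# A sketch of the intended use (the label values are arbitrary cluster ids
# in [0, 25) and will vary from run to run):
#   word_to_label = cluster_vects(word2vec)
#   word_to_label['apple']  # e.g. 7
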
def words_in_cluster(word, word_to_label):
    # sorry, this is O(n) and n is pretty large; it could be O(k) with
    # k = cluster size, but that would cost more memory.
    label = word_to_label[word]
    # get the other words with this label.
    similar_words = [key for key, val in word_to_label.items() if val == label]
    return similar_words
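
# Hypothetical usage, assuming 'apple' is in the vocabulary and word_to_label
# came from cluster_vects above:
#   words_in_cluster('apple', word_to_label)  # e.g. ['pear', 'plum', ...]
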
def main():
    print('loading knowledge from Wikipedia...should take 10-20 seconds')
    word2vec = load_word2vec('vectors')
    print('Type several words separated by spaces. The more words you enter, the better I can guess.')
    while True:
        words = input('->').lower().split(' ')
        print("I think", get_furthest_word(words, word2vec), "doesn't belong in this list!\n")


if __name__ == '__main__':
    main()