-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdemo1.py
140 lines (98 loc) · 3.26 KB
/
demo1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#goal: create word vectors from a game of thrones dataset
#and analyze them to see semantic similarity
from __future__ import absolute_import, print_function
#for word encoding
import codecs
#file-name pattern matching (globbing)
import glob
#concurrency
import multiprocessing
#logging
import logging
#operating-system utilities (paths, files)
import os
#pretty printing
import pprint
#regular exp
import re
#natural language toolkit
import nltk
#word 2 vec
#import gensim.models.word2vec as w2v
#import gensim
from gensim.models import Word2Vec
#from gensim import models
#dimensionality reduction
import sklearn.manifold
#math
import numpy as np
#plotting
import matplotlib.pyplot as plt
#tabular data handling with pandas
import pandas as pd
#visualization
import seaborn as sns
# Configure logging output format and level
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Load the saved model from trained/thrones2vec.w2v
model_path = os.path.join("trained", "thrones2vec.w2v")
thrones2vec = Word2Vec.load(model_path)
#thrones2vec.init_sims();

# Reduce the word vectors to 2D so they can be plotted
all_word_vectors_matrix = thrones2vec.wv.syn0
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)
print("train t-SNE, this could take a minute or two")
# Fitting t-SNE is the slow step of this script
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

# Build one (word, x, y) row per vocabulary entry; the entry's ``index``
# selects that word's row in the 2D matrix
vocabulary = thrones2vec.wv.vocab
rows = []
for word in vocabulary:
    coords = all_word_vectors_matrix_2d[vocabulary[word].index]
    rows.append((word, coords[0], coords[1]))
points = pd.DataFrame(rows, columns=['word', 'x', 'y'])
print(points.head(10))
#sns.set_context('poster')
#points.plot.scatter('x', 'y', s=10, figsize=(20,12))
#def plot_region(x_bounds, y_bounds):
# slice = points[
# (x_bounds[0] <= points.x) &
# (points.x <= x_bounds[1]) &
# (y_bounds[0] <= points.y) &
# (points.y <= y_bounds[1])
# ]
# ax = slice.plot.scatter('x', 'y', s=35, figsize=(10, 8))
# for i, point in slice.iterrows():
# ax.text(point.x + 0.005, point.y + 0.005, point.word, fontsize = 11)
#plot_region(x_bounds = (4.0, 4.2), y_bounds = (-0.5, -0.1))
#plot_region(x_bounds = (0, 1), y_bounds = (4, 4.5))
#explore semantic similarities between book characters
#Word closest to the given word
#print(thrones2vec.most_similar('Stark'))
#print(thrones2vec.most_similar('Aerys'))
#print(thrones2vec.most_similar('direwolf'))
#Linear relationships between word pairs
#def nearest_similarity_cosmul(start1, end1, end2):
# similarities = thrones2vec.wv.most_similar_cosmul(
# positive = [end2, start1],
# negative = [end1]
# )
# start2 = similarities[0][0]
# print('{start1} is related to {end1}, as {start2} is related to {end2}'.format(**locals()))
# return start2
#nearest_similarity_cosmul('Stark', 'Winterfell', 'Riverrun')
#nearest_similarity_cosmul('Jaime', 'sword', 'wine')
#nearest_similarity_cosmul('Arya', 'Nymeria', 'dragons')
def nearest_similarity_cosmul(start1, end1, end2):
    """Complete the analogy "start1 is to end1 as ? is to end2" and print it.

    Queries the module-level ``thrones2vec`` model's keyed vectors with
    ``most_similar_cosmul``, using ``end2`` and ``start1`` as positive terms
    and ``end1`` as the negative term, then takes the top-ranked result.

    Args:
        start1: First word of the known pair.
        end1: Second word of the known pair.
        end2: Second word of the pair to complete.

    Returns:
        The word from the highest-ranked entry of the query result.
    """
    # Query through ``.wv`` like the rest of this file (``wv.syn0``,
    # ``wv.vocab``): calling most_similar_cosmul on the model object itself
    # is deprecated and no longer available in gensim 4.
    similarities = thrones2vec.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1],
    )
    # Top-ranked entry; its first field is the matched word
    start2 = similarities[0][0]
    # Explicit keywords instead of **locals() so the template's inputs are visible
    print(
        "{start1} is related to {end1}, as {start2} is related to {end2}".format(
            start1=start1, end1=end1, start2=start2, end2=end2
        )
    )
    return start2
# Run the analogy query for each demo triple, in order
for analogy_args in (
    ("Stark", "Winterfell", "Riverrun"),
    ("Jaime", "sword", "wine"),
    ("Arya", "Nymeria", "dragons"),
):
    nearest_similarity_cosmul(*analogy_args)