master.py
import itertools
import pandas as pd
import numpy as np
import re
import os
from tqdm import tqdm
# Drawing the embeddings
import matplotlib.pyplot as plt
# Deep learning:
from keras.models import Model
from keras.layers import Input, Dense
from scipy import sparse
# Custom functions
from utility import text_preprocessing, create_unique_word_dict
# Reading the text from the input folder
texts = pd.read_csv('input/sample.csv')
texts = texts['text'].tolist()
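# Note: 'input/sample.csv' is assumed to contain a column named 'text',
# one document per row; that column name is what the lookup above relies on.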
# Defining the window for context
window = 2
# Placeholders for the word pairs and the combined token list
word_lists = []
all_text = []
for text in texts:
    # Cleaning the text
    text = text_preprocessing(text)
    # Appending to the combined token list
    all_text += text
    # Creating word pairs: each word with every context word
    # up to *window* positions away on either side
    for i, word in enumerate(text):
        for w in range(window):
            # Getting the context that is ahead by up to *window* words
            if i + 1 + w < len(text):
                word_lists.append([word, text[i + 1 + w]])
            # Getting the context that is behind by up to *window* words
            if i - w - 1 >= 0:
                word_lists.append([word, text[i - w - 1]])
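# Worked example (hypothetical tokens): with window = 2 and a cleaned text
# ['the', 'cat', 'sat'], the loop above produces the pairs
# ['the', 'cat'], ['the', 'sat'], ['cat', 'sat'], ['cat', 'the'],
# ['sat', 'cat'], ['sat', 'the'] -- each word paired with every neighbour
# up to two positions away.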
unique_word_dict = create_unique_word_dict(all_text)
# Defining the number of features (unique words)
n_words = len(unique_word_dict)
# Getting all the unique words
words = list(unique_word_dict.keys())
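# unique_word_dict (from utility.py) is assumed to map each unique word to a
# stable integer index; those indices pick the one-hot positions below.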
# Creating the X and Y matrices using one-hot encoding
X = []
Y = []
for word_list in tqdm(word_lists):
    # Getting the indices
    main_word_index = unique_word_dict.get(word_list[0])
    context_word_index = unique_word_dict.get(word_list[1])
    # Creating the placeholders
    X_row = np.zeros(n_words)
    Y_row = np.zeros(n_words)
    # One-hot encoding the main word
    X_row[main_word_index] = 1
    # One-hot encoding the context word
    Y_row[context_word_index] = 1
    # Appending to the main matrices
    X.append(X_row)
    Y.append(Y_row)
# Converting the matrices into a sparse format because the vast majority of the data are 0s
X = sparse.csr_matrix(X)
Y = sparse.csr_matrix(Y)
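# At this point X and Y are both (len(word_lists), n_words) one-hot matrices:
# row i of X marks the center word of pair i, row i of Y its context word.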
# Defining the size of the embedding
embed_size = 2
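# The model below is a minimal skip-gram-style setup: a one-hot word vector
# passes through a 2-unit linear bottleneck and a softmax over the vocabulary
# that predicts the context word. After training, the first layer's weight
# matrix (n_words x embed_size) is read off as the embedding lookup table.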
# Defining the neural network
inp = Input(shape=(X.shape[1],))
x = Dense(units=embed_size, activation='linear')(inp)
x = Dense(units=Y.shape[1], activation='softmax')(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='adam')
# Optimizing the network weights
model.fit(
    x=X,
    y=Y,
    batch_size=256,
    epochs=1000
)
# Obtaining the weights from the neural network.
# These are the so-called word embeddings:
# the weight matrix of the first (input-to-bottleneck) layer
weights = model.get_weights()[0]
# Creating a dictionary to store the embeddings in. The key is a unique word and
# the value is the numeric vector
embedding_dict = {}
for word in words:
    embedding_dict[word] = weights[unique_word_dict.get(word)]
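# Optional sanity check: cosine similarity between the first two vocabulary
# words, assuming the vocabulary has at least two entries. Words that appear
# in similar contexts should score closer to 1.
if len(words) >= 2:
    v1, v2 = embedding_dict[words[0]], embedding_dict[words[1]]
    cos_sim = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    print(f'Cosine similarity between "{words[0]}" and "{words[1]}": {cos_sim:.3f}')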
# Plotting the embeddings
plt.figure(figsize=(10, 10))
for word in words:
    coord = embedding_dict.get(word)
    plt.scatter(coord[0], coord[1])
    plt.annotate(word, (coord[0], coord[1]))
plt.show()
# Saving the embedding vectors to a txt file
try:
    os.makedirs(os.path.join(os.getcwd(), 'output'), exist_ok=True)
except Exception as e:
    print(f'Cannot create output folder: {e}')
with open(os.path.join(os.getcwd(), 'output', 'embedding.txt'), 'w') as f:
    for key, value in embedding_dict.items():
        try:
            f.write(f'{key}: {value}\n')
        except Exception as e:
            print(f'Cannot write word {key} to file: {e}')