-
Notifications
You must be signed in to change notification settings - Fork 47
/
Copy pathutility.py
78 lines (63 loc) · 2.09 KB
/
utility.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import re
import numpy as np
def create_unique_word_dict(text: list) -> dict:
    """
    Map every distinct word in `text` to an integer index.

    The distinct words are sorted in ascending order and numbered from 0,
    so the smallest word gets index 0.
    """
    # De-duplicate first, then order, so indices follow the sorted vocabulary
    ordered_vocab = sorted(set(text))
    return {token: position for position, token in enumerate(ordered_vocab)}
def text_preprocessing(
    text: str,
    punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_“~''',
    stop_words=('and', 'a', 'is', 'the', 'in', 'be', 'will'),
) -> list:
    """
    Clean a raw string and split it into lowercase word tokens.

    Processing order: delete punctuation characters, delete every token
    that contains a digit, lowercase the text, split on whitespace and
    finally drop stop words.

    Parameters
    ----------
    text : str
        The raw text to clean.
    punctuations : iterable of single characters
        Characters to delete from `text`; a plain string works. Entries
        that are not exactly one character long are ignored.
    stop_words : container of str
        Words to drop from the result. The comparison happens after
        lowercasing, so entries should be lowercase.

    Returns
    -------
    list
        The remaining lowercase words, in their original order.
    """
    # Deleting all punctuation in a single pass instead of one replace() per character
    removal_table = {ord(char): None for char in punctuations if len(char) == 1}
    text = text.translate(removal_table)

    # Removing words that have numbers in them; a bare run of digits
    # matches this pattern too, so no separate digit pass is needed
    text = re.sub(r'\w*\d\w*', '', text)

    # split() without arguments collapses any whitespace run and never
    # yields empty strings, so no extra cleaning step is required
    words = text.lower().split()

    # Dropping stop words
    return [word for word in words if word not in stop_words]
# Functions to find the most similar word
def euclidean(vec1: np.array, vec2: np.array) -> float:
    """
    A function to calculate the euclidean distance between two vectors

    Both arguments go through np.asarray, so plain sequences such as lists
    or tuples are accepted in addition to numpy arrays. The two inputs must
    have shapes that numpy can subtract element-wise.
    """
    # Coercing first: subtracting two plain lists would raise a TypeError
    difference = np.asarray(vec1) - np.asarray(vec2)
    return np.sqrt(np.sum(difference ** 2))
def find_similar(word: str, embedding_dict: dict, top_n=10) -> list:
    """
    Rank the other words in `embedding_dict` by closeness to `word`.

    Closeness is the euclidean distance between embedding vectors. The
    result is a list of (word, distance) tuples in ascending order of
    distance, cut to the first `top_n` entries. `word` itself is never
    included. If `word` is missing from `embedding_dict`, or its vector is
    empty, the result is an empty list.
    """
    target_vector = embedding_dict.get(word, [])

    neighbours = []
    if len(target_vector) > 0:
        neighbours = [
            (candidate, euclidean(target_vector, candidate_vector))
            for candidate, candidate_vector in embedding_dict.items()
            if candidate != word
        ]

    # Closest first; the sort is stable, so ties keep the dictionary order
    neighbours.sort(key=lambda pair: pair[1])
    return neighbours[0:top_n]