# utils.py — text preprocessing and word-embedding helper functions.
import os
import re
import json
import gensim
import numpy as np
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from torchtext.vocab import Vectors
# Built once at import time so per-call preprocessing stays cheap.
english_stopwords = set(stopwords.words("english"))
# Matches runs of non-word characters. Currently unused: it backs the
# commented-out normalization line inside preprocess().
non_alphanum_regex = re.compile(r"\W+")
def preprocess(doc, method="nltk", dataset=True):
    """Tokenize and clean a document, returning a space-joined token string.

    Filters out stop words, punctuation, and the literal placeholder
    token "null".

    Parameters
    ----------
    doc :
        Raw text string when ``method="nltk"``; an already-parsed spaCy
        ``Doc`` when ``method="spacy"``.
    method : str
        Tokenization backend, ``"nltk"`` or ``"spacy"``.
    dataset : bool
        When True, always return the token string (possibly empty).
        When False, return ``None`` for documents that clean down to
        nothing, so callers can drop them.

    Returns
    -------
    str or None

    Raises
    ------
    ValueError
        If *method* is not ``"nltk"`` or ``"spacy"``.
    """
    if method == "spacy":
        tokens = " ".join(
            [
                token.lower_
                for token in doc
                if token
                and not (token.lower_ == "null" or token.is_stop or token.is_punct)
            ]
        )
    elif method == "nltk":
        # doc = non_alphanum_regex.sub(' ', doc).lower()
        tokens = " ".join(
            [
                token
                for token in word_tokenize(doc.lower())
                if not (
                    token == "null"
                    or token in english_stopwords
                    or token in string.punctuation
                )
            ]
        )
    else:
        # Previously an unknown method fell through and crashed with an
        # UnboundLocalError on `tokens`; fail fast with a clear message.
        raise ValueError("unknown preprocessing method: %r" % method)
    if dataset or tokens != "":
        return tokens
    return None
def parse_content_line(x, attributes=None, label=True):
    """Parse one JSON line into a 1 x N numpy row of attribute values.

    ``None`` attribute values become the empty string. When *label* is
    True, the integer ``"label"`` field is appended as the final column.
    Note numpy promotes the whole row to strings when text attributes
    are present.
    """
    if attributes is None:
        attributes = ["title_left", "title_right"]
    record = json.loads(x)
    row = []
    for attr in attributes:
        value = record[attr]
        row.append("" if value is not None else "")
        row[-1] = value if value is not None else ""
    if label:
        row.append(int(record["label"]))
    return np.array(row)[np.newaxis, :]
def resave_w2v_model(old_path, new_path):
    """Convert a binary word2vec file at *old_path* to text format at *new_path*."""
    vectors = gensim.models.KeyedVectors.load_word2vec_format(old_path, binary=True)
    vectors.save_word2vec_format(fname=new_path)
def resave_fasttext_model(old_path, new_path):
    """Load a Facebook-format fastText binary and resave it in gensim's native format."""
    model = gensim.models.fasttext.load_facebook_model(old_path)
    model.save(fname=new_path)
def load_embedding(TEXT, embedding_path):
    """Load pretrained word vectors into a torchtext field's vocab, in place.

    Parameters
    ----------
    TEXT :
        A torchtext field whose ``vocab`` has already been built.
    embedding_path : str
        Path to a ``.bin`` or ``.txt`` embedding file (extension is
        matched case-insensitively).

    Raises
    ------
    NotImplementedError
        If the file extension is neither ``.bin`` nor ``.txt``.
    """
    _, file_extension = os.path.splitext(embedding_path)
    # Normalize so .BIN / .TXT files are accepted too.
    file_extension = file_extension.lower()
    if file_extension == ".bin":
        # Cache next to the source file rather than in the default
        # .vector_cache directory.
        embedding_name = os.path.basename(embedding_path)
        embedding_dir = os.path.dirname(embedding_path)
        vectors = Vectors(name=embedding_name, cache=embedding_dir)
    elif file_extension == ".txt":
        vectors = Vectors(name=embedding_path)
    else:
        raise NotImplementedError(
            "unsupported embedding file extension: %r" % file_extension
        )
    TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)