-
Notifications
You must be signed in to change notification settings - Fork 0
/
text2vector.py
54 lines (36 loc) · 1.36 KB
/
text2vector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 29 22:47:19 2018
@author: Gonçalo Adolfo, Frederico Costa
"""
import pickle
import re
from nltk.stem import LancasterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
def text2vector(criticas):
    """
    Return the tf-idf representation of the received list of strings.

    Each review is cleaned (HTML ``<br />`` tags removed, every run of
    non-alphabetic characters replaced by a single space, every word
    stemmed with the Lancaster stemmer) and then vectorized against the
    vocabulary stored under the ``'vocab'`` key of the pickled dataset.

    Parameters
    ----------
    criticas : list of str
        The reviews to vectorize. Must support ``len()`` and iteration.

    Returns
    -------
    X
        The tf-idf document-term matrix produced by ``TfidfVectorizer``,
        one row per review.
    """
    print("Recebidas " + str(len(criticas)) + " críticas")

    # Load the vocabulary. The context manager guarantees the file handle
    # is closed even if unpickling fails.
    # NOTE(review): pickle can execute arbitrary code while loading; only
    # ever point this at a trusted dataset file.
    with open("datasets/dataset_max_6539.p", "rb") as dataset_file:
        dataset = pickle.load(dataset_file)
    vocab = dataset['vocab']
    print("Dimensão do vocabulário: ", len(vocab))

    # Clean every review. The pattern is loop-invariant, so compile it once.
    non_alpha = re.compile(r'[^a-zA-Z]+')
    stemmingObj = LancasterStemmer()
    criticas_limpas = []
    for critica in criticas:
        # Drop HTML line breaks.
        critica = critica.replace('<br />', ' ')
        # Replace anything that is not an ASCII letter with a space.
        critica = non_alpha.sub(' ', critica)
        # Stem word by word and rebuild the text.
        texto = ' '.join(stemmingObj.stem(palavra) for palavra in critica.split())
        criticas_limpas.append(texto)

    # Apply tf-idf over the fixed vocabulary.
    # NOTE: per the scikit-learn documentation, min_df is ignored when an
    # explicit vocabulary is supplied; it is kept to leave the call unchanged.
    # NOTE(review): the idf weights are fitted on the reviews passed in, not
    # on the corpus the vocabulary came from - confirm this is intended.
    tf_idf = TfidfVectorizer(min_df=5, token_pattern=r"\b\w\w+\b", vocabulary=vocab)
    X = tf_idf.fit_transform(criticas_limpas)
    print("Shape do array tf-idf: ", X.shape)
    return X