preprocessing.py
import re

from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
def lemmaFunc(tokens):
    """Lemmatize each token with WordNet and return the results as a list."""
    lemtr = WordNetLemmatizer()
    lemmaToks = []
    for val in tokens:
        lemmaToks.append(lemtr.lemmatize(val))
    return lemmaToks
def fn_preprocessingtoken(text):
    """Normalize query text: lowercase, strip digits, tokenize, expand with
    WordNet synonyms, drop stop words and non-alphanumeric tokens, lemmatize."""
    porterstemmer = PorterStemmer()  # stemming is currently disabled (see filter loop)
    text = text.lower()
    text = re.sub(r"\d", "", text)  # drop digits so word/number combinations become plain words
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    # Expand the token list with WordNet synonyms of every token.
    synonyms = []
    for token in word_tokens:
        for syn in wordnet.synsets(token):
            for l in syn.lemmas():
                synonyms.append(l.name())
    word_tokens.extend(synonyms)
    # Keep alphanumeric tokens that are not stop words.
    filtered_text = []
    for w in word_tokens:
        if w not in stop_words and w.isalnum():
            filtered_text.append(w)  # porterstemmer.stem(w) would stem here instead
    return lemmaFunc(filtered_text)
def preprocessvocab(text):
    """Normalize vocabulary text like fn_preprocessingtoken, but keep digits and
    collect unique tokens in a set before lemmatizing."""
    porterstemmer = PorterStemmer()  # stemming is currently disabled (see filter loop)
    text = text.lower()
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    # Expand the token list with WordNet synonyms of every token.
    synonyms = []
    for token in word_tokens:
        for syn in wordnet.synsets(token):
            for l in syn.lemmas():
                synonyms.append(l.name())
    word_tokens.extend(synonyms)
    # Keep unique alphanumeric tokens that are not stop words.
    filtered_text = set()
    for w in word_tokens:
        if w not in stop_words and w.isalnum():
            filtered_text.add(w)  # porterstemmer.stem(w) would stem here instead
    return lemmaFunc(filtered_text)
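

# Example usage (a minimal sketch, not invoked by the module itself): assumes the
# NLTK data packages 'punkt', 'stopwords', 'wordnet', and 'omw-1.4' have already
# been downloaded, e.g. via nltk.download(...). The sample string is illustrative.
if __name__ == "__main__":
    sample = "Searching 3 databases for relevant documents in 2021"
    query_tokens = fn_preprocessingtoken(sample)   # digit-free, synonym-expanded, lemmatized
    vocab_tokens = preprocessvocab(sample)         # unique tokens, digits kept
    print(query_tokens[:10])
    print(sorted(vocab_tokens)[:10])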