-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathpreprocessing.py
83 lines (68 loc) · 2.55 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from bs4 import BeautifulSoup
import spacy
import unidecode
import contractions as cont
from word2number import w2n
# Load the 'en_core_web_md' spaCy pipeline once, at import time.
nlp = spacy.load('en_core_web_md')

# Words listed here are taken off spaCy's stop-word list by clearing their
# is_stop flag in the pipeline's vocabulary.
deselect_stop_words = ['no', 'not']
for kept_word in deselect_stop_words:
    lexeme = nlp.vocab[kept_word]
    lexeme.is_stop = False
def strip_html_tags(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    extracted text is requested with a single space as the separator.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")
def remove_whitespace(text):
    """Collapse every run of whitespace to one space and trim both ends."""
    # split() without arguments already discards leading/trailing whitespace
    # and treats consecutive whitespace characters as a single delimiter.
    words = text.split()
    return " ".join(words)
def remove_accented_chars(text):
    """Replace accented characters in ``text``, e.g. the "é" in "café".

    The conversion is delegated entirely to ``unidecode.unidecode``.
    """
    return unidecode.unidecode(text)
def expand_contractions(text):
    """Expand contracted forms in ``text``, e.g. "don't" becomes "do not".

    Delegates to the ``contractions`` package with slang expansion turned off.
    """
    expanded = cont.fix(text, slang=False)
    return expanded
def preprocess_text(text, accented_chars=True, contractions=True, convert_num=False, extra_whitespace=True,
                    lemmatization=False, lowercase=True, punctuations=False, remove_html=True, remove_num=False,
                    special_chars=True, stop_words=False):
    """Clean ``text`` with a configurable sequence of preprocessing steps.

    String-level steps run first, in this order: HTML stripping, whitespace
    collapsing, accent replacement, contraction expansion, lowercasing. The
    result is then parsed with the module-level spaCy pipeline ``nlp`` and
    each token is either dropped, rewritten, or kept as is.

    Args:
        text: The raw input string.
        accented_chars: Replace accented characters (default True).
        contractions: Expand contractions such as "don't" (default True).
        convert_num: Rewrite tokens tagged NUM as digits via
            ``w2n.word_to_num`` (default False). Takes precedence over
            ``lemmatization`` for those tokens.
        extra_whitespace: Collapse repeated whitespace (default True).
        lemmatization: Replace tokens by their lemma (default False).
        lowercase: Lowercase the text before tokenising (default True).
        punctuations: Drop tokens tagged PUNCT (default False).
        remove_html: Strip HTML markup (default True).
        remove_num: Drop tokens tagged NUM or whose text is numeric
            (default False).
        special_chars: Drop tokens tagged SYM (default True).
        stop_words: Drop stop words, except tokens tagged NUM
            (default False).

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    if remove_html:
        text = strip_html_tags(text)
    if extra_whitespace:
        text = remove_whitespace(text)
    if accented_chars:
        text = remove_accented_chars(text)
    if contractions:
        text = expand_contractions(text)
    if lowercase:
        text = text.lower()

    doc = nlp(text)
    clean_text = []
    for token in doc:
        pos = token.pos_
        is_number = pos == 'NUM'

        # Removal filters: the first one that matches discards the token.
        if stop_words and token.is_stop and not is_number:
            continue
        if punctuations and pos == 'PUNCT':
            continue
        if special_chars and pos == 'SYM':
            continue
        if remove_num and (is_number or token.text.isnumeric()):
            continue

        edit = token.text
        if convert_num and is_number:
            # word_to_num returns a number, but the final join needs strings,
            # so stringify it. It raises ValueError for text it cannot parse;
            # in that case keep the token unchanged instead of aborting.
            try:
                edit = str(w2n.word_to_num(token.text))
            except ValueError:
                edit = token.text
        elif lemmatization and token.lemma_ != "-PRON-":
            # NOTE(review): "-PRON-" looks like spaCy's placeholder lemma for
            # pronouns (v2.x) -- confirm; such tokens keep their surface form.
            edit = token.lemma_

        if edit != "":
            clean_text.append(edit)

    return ' '.join(clean_text)