-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
40 lines (37 loc) · 1.36 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import sys
import nltk
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import argparse
from sklearn.datasets import fetch_20newsgroups
from gensim.parsing import preprocessing
# Shared English Snowball stemmer; lemmatize() calls its .stem() method.
stemmer = SnowballStemmer("english")
# Fetches the 20 Newsgroups data set (shuffled) every time this module is
# imported or run.
# NOTE(review): `train` is not referenced anywhere else in the visible code —
# confirm it is still needed before keeping this import-time fetch.
train = fetch_20newsgroups(shuffle = True)
def lemmatize(text):
    """Reduce a single token to its stemmed verb lemma.

    The token is first lemmatized as a verb (``pos='v'``) with a fresh
    ``WordNetLemmatizer``; the resulting lemma is then stemmed with the
    module-level ``stemmer``.

    Args:
        text: The token to normalize.

    Returns:
        The stemmed lemma of ``text``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)
def preprocess(text):
    """Tokenize ``text`` and return the normalized tokens worth keeping.

    Tokens are produced by gensim's ``simple_preprocess``. A token is kept
    only when it is not in gensim's ``STOPWORDS`` and is longer than three
    characters; every kept token is passed through ``lemmatize``.

    Args:
        text: Raw text to tokenize.

    Returns:
        A list of normalized tokens, in their original order.
    """
    tokens = gensim.utils.simple_preprocess(text)
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize(token)
        for token in tokens
        if token not in stopwords and len(token) > 3
    ]
if __name__ == "__main__":
    # Usage: python main.py <path-to-text-file>
    #
    # Trains a 4-topic LDA model on the words of the given file, then prints
    # the topics assigned to the file as a whole, highest score first.
    if len(sys.argv) < 2:
        # Fail with a usage hint instead of a bare IndexError on sys.argv[1].
        sys.exit("usage: {} <path-to-text-file>".format(sys.argv[0]))
    pth = sys.argv[1]

    with open(pth, "r") as file:
        txt2 = file.read()

    # Each whitespace-separated word is treated as its own tiny document.
    # Words that preprocess to an empty list (stopwords, short tokens) are
    # dropped. The already-computed result is reused rather than calling
    # preprocess() a second time for every kept word.
    finwrds = [rt for rt in map(preprocess, txt2.split()) if rt]
    if not finwrds:
        # With no documents the dictionary has no terms, and gensim refuses
        # to train LDA on an empty vocabulary; exit with a clear message.
        sys.exit("no usable words found in {}".format(pth))

    # Map tokens to integer ids and convert every document to bag-of-words.
    d = gensim.corpora.Dictionary(finwrds)
    bowc = [d.doc2bow(i) for i in finwrds]
    lda_model = gensim.models.LdaMulticore(
        bowc, num_topics=4, id2word=d, passes=12, workers=2
    )

    # Score the whole file as one document against the trained topics.
    bowc2 = d.doc2bow(preprocess(txt2))
    # reverse=True keeps the sort stable, so ties stay in their original order.
    ranked = sorted(lda_model[bowc2], key=lambda tup: tup[1], reverse=True)
    for index, score in ranked:
        print("Score: {} \n Topic: {}".format(score, lda_model.print_topic(index, 5)))