-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathupdateNodeDict.py
68 lines (50 loc) · 1.87 KB
/
updateNodeDict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import sys
import codecs
import pickle
from nltk.stem import PorterStemmer
import json
import string
import re
import nltk
from nltk.corpus import wordnet
import preprocessing
import numpy as np
# Create the three dictionaries populated/persisted by this script.
dict_wiki_desc = {}        # filename -> raw wikipedia description text
dict_stemmed_tokens = {}   # filename -> preprocessed/stemmed token list
dict_symptoms_TFIDF = {}   # reserved for per-symptom TF-IDF data (unused here)

# Load the vocabulary and document-name lists produced earlier in the pipeline.
with open("list_vocab.json") as json_data:
    list_vocab = json.load(json_data)
with open("list_fileNames.json") as json_data:
    list_fileNames = json.load(json_data)

rows, cols = len(list_vocab), len(list_fileNames)

# BUG FIX: the original code transposed a freshly zero-filled placeholder
# matrix *before* loading the real one from TFIDFMatrix.json, so
# transposedTFIDF was always all zeros.  Load first, then transpose.
with open("TFIDFMatrix.json") as json_data:
    TFIDFMatrix = json.load(json_data)
transposedTFIDF = np.transpose(TFIDFMatrix)
def create_dict_wiki_desc():
    """Read every file under wikipedia_data/symptom/ into dict_wiki_desc
    (filename -> raw file contents) and persist the mapping as
    dict_wiki_desc.json.

    Side effects: mutates the module-level dict_wiki_desc and writes
    dict_wiki_desc.json in the current working directory.
    """
    for dirpath, _dirnames, filenames in os.walk("wikipedia_data/symptom/"):
        for filename in filenames:
            print(filename)
            # os.path.join keeps paths correct for files in subdirectories
            # (the original "dir" + filename concatenation did not);
            # `with` guarantees the handle is closed (original leaked it).
            # errors='ignore' tolerates malformed bytes in scraped pages.
            path = os.path.join(dirpath, filename)
            with codecs.open(path, encoding='utf8', errors='ignore') as fileptr:
                dict_wiki_desc[filename] = fileptr.read()
    with open('dict_wiki_desc.json', 'w') as out:
        json.dump(dict_wiki_desc, out)
def create_dict_stemmed_tokens():
    """Tokenize/stem every file under wikipedia_data/symptom/ via
    preprocessing.fn_preprocessingtoken, store the result in
    dict_stemmed_tokens (filename -> token list) and persist the mapping
    as dict_stemmed_tokens.json.

    Side effects: mutates the module-level dict_stemmed_tokens and writes
    dict_stemmed_tokens.json in the current working directory.
    """
    for dirpath, _dirnames, filenames in os.walk("wikipedia_data/symptom/"):
        for filename in filenames:
            print(filename)
            # os.path.join keeps paths correct for files in subdirectories
            # (the original "dir" + filename concatenation did not);
            # `with` guarantees the handle is closed (original leaked it).
            path = os.path.join(dirpath, filename)
            with codecs.open(path, encoding='utf8', errors='ignore') as fileptr:
                content = fileptr.read()
            dict_stemmed_tokens[filename] = preprocessing.fn_preprocessingtoken(content)
    with open('dict_stemmed_tokens.json', 'w') as out:
        json.dump(dict_stemmed_tokens, out)
# Build and persist the stemmed-token dictionary.  Run
# create_dict_wiki_desc() the same way to regenerate the raw-description
# dictionary.  (Dead commented-out duplicate calls removed.)
create_dict_stemmed_tokens()