# TFIDF.py
# Builds a TF-IDF matrix over the documents in wikipedia_data/symptom/.
# Typical workflow: build list_fileNames.json and list_vocab.json first
# (see the commented block and fn_createListVocabulary below), then run
# createTFIDFMatrix().
import codecs
import math
import os
import json
import numpy as np

import preprocessing
_list_vocab = []
list_vocab = []
list_fileNames = []
dict_idf = {}

# Load the vocabulary and file list if they have already been built; on a
# first run these JSON files do not exist yet, so fall back to empty lists.
if os.path.exists("list_vocab.json"):
    with open("list_vocab.json") as json_data:
        list_vocab = json.load(json_data)
if os.path.exists("list_fileNames.json"):
    with open("list_fileNames.json") as json_data:
        list_fileNames = json.load(json_data)

rows, cols = len(list_vocab), len(list_fileNames)
TFMatrix = [[0 for x in range(cols)] for y in range(rows)]
TFIDFMatrix = [[0 for x in range(cols)] for y in range(rows)]
if os.path.exists("TFMatrix.json"):
    with open("TFMatrix.json") as json_data:
        TFMatrix = json.load(json_data)
# One-off snippet originally used to build list_fileNames.json:
# for dirpath, dirnames, filenames in os.walk("wikipedia_data/symptom/"):
#     for filename in filenames:
#         print(filename)
#         list_fileNames.append(filename)
# with open('list_fileNames.json', 'w') as out:
#     json.dump(list_fileNames, out)
def calculateTF(term, list_token):
    """Log-scaled term frequency: 1 + log10(count), or 0 if the term is absent."""
    tf = list_token.count(term)
    if tf > 0:
        return 1 + math.log10(tf)
    return 0
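
# For example, a term that occurs 100 times in a document gets weight
# 1 + log10(100) = 3.0, so repeated occurrences count only sublinearly.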
def fn_createListVocabulary():
    """Tokenize every corpus file and write the deduplicated vocabulary."""
    for dirpath, dirnames, filenames in os.walk("wikipedia_data/symptom/"):
        for filename in filenames:
            print(filename)
            with codecs.open("wikipedia_data/symptom/" + filename,
                             encoding='utf8', errors='ignore') as fileptr:
                content = fileptr.read()
            list_content = preprocessing.fn_preprocessingtoken(content)
            _list_vocab.extend(list_content)
    # Append each token once, preserving first-seen order; the set makes the
    # membership test O(1) instead of rescanning list_vocab for every token.
    seen = set(list_vocab)
    for token in _list_vocab:
        if token not in seen:
            seen.add(token)
            list_vocab.append(token)
    with open('list_vocab.json', 'w') as out:
        json.dump(list_vocab, out)
def createTFIDFMatrix():
    """Fill TFMatrix with log-scaled term frequencies, then weight by IDF."""
    print("in createTFIDFMatrix")
    for j in range(cols):
        print("*********", j)
        with codecs.open("wikipedia_data/symptom/" + list_fileNames[j],
                         encoding='utf8', errors='ignore') as fileptr:
            content = fileptr.read()
        list_tokens = preprocessing.fn_preprocessingtoken(content)
        for i in range(rows):
            TFMatrix[i][j] = calculateTF(list_vocab[i], list_tokens)
    for i in range(rows):
        df = np.count_nonzero(TFMatrix[i])  # documents containing term i
        print(df)
        # Guard against division by zero for terms with df == 0.
        idf = math.log10(len(list_fileNames) / df) if df > 0 else 0
        for j in range(cols):
            TFIDFMatrix[i][j] = TFMatrix[i][j] * idf
    with open('TFMatrix.json', 'w') as out:
        json.dump(TFMatrix, out)
    with open('TFIDFMatrix.json', 'w') as out:
        json.dump(TFIDFMatrix, out)
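
# The weighting above is the standard log-TF x IDF scheme:
#     w[i][j] = (1 + log10(tf[i][j])) * log10(N / df[i])
# where N = len(list_fileNames) and df[i] is the number of documents that
# contain term i (the count of nonzero entries in row i of TFMatrix).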
def createIDF():
    """Write each term's inverse document frequency to dict_idf.json."""
    for i in range(rows):
        df = np.count_nonzero(TFMatrix[i])
        print(df)
        # Inverse document frequency: log10(N / df), with a zero-df guard.
        dict_idf[list_vocab[i]] = math.log10(len(list_fileNames) / df) if df > 0 else 0
    with open('dict_idf.json', 'w') as out:
        json.dump(dict_idf, out)

if __name__ == "__main__":
    # createIDF()
    # fn_createListVocabulary()
    createTFIDFMatrix()
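
# Illustrative sketch, not part of the original pipeline: once TFIDFMatrix is
# built, documents can be ranked against a tokenized query by cosine
# similarity between the query's TF-IDF-style vector and each document
# column. rank_documents is a hypothetical helper added as an example; the
# name and the choice to reuse calculateTF for the query are assumptions.
def rank_documents(query_tokens, top_k=5):
    """Return the top_k (filename, cosine score) pairs for a tokenized query."""
    M = np.array(TFIDFMatrix, dtype=float)  # rows: terms, cols: documents
    q = np.array([calculateTF(term, query_tokens) for term in list_vocab])
    results = []
    for j in range(cols):
        d = M[:, j]
        denom = np.linalg.norm(q) * np.linalg.norm(d)
        score = float(q.dot(d) / denom) if denom > 0 else 0.0
        results.append((list_fileNames[j], score))
    return sorted(results, key=lambda pair: pair[1], reverse=True)[:top_k]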