-
Notifications
You must be signed in to change notification settings - Fork 4
/
main.py
99 lines (83 loc) · 2.99 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# @author Jay Kumar
from Document import Document
from Model import Model
import json
import time
import sys
def printExecutionTime(startTime, str=""):
    """Print the seconds elapsed since ``startTime`` and return the current time.

    Args:
        startTime: Reference timestamp, as produced by ``time.time()``.
        str: Optional label printed in front of the elapsed time. The name
            shadows the ``str`` builtin inside this function; it is kept
            unchanged so existing keyword callers keep working.

    Returns:
        The timestamp taken for this measurement, suitable as the start of
        the next measured interval.
    """
    now = time.time()
    # Measure against the argument rather than a module-level ``start_time``,
    # so the function works for any caller and any reference timestamp.
    print(str + " time elapsed: {:.2f}s".format(now - startTime))
    return now
def outputFileNameFormatter(resultDir, dataset, outputPrefix, ALPHA, BETA, LAMDA, decay):
    """Build the result-file path for one hyper-parameter setting.

    The name has the form
    ``<resultDir>/<dataset><outputPrefix>_ALPHA<a>_BETA<b>[_LAMDA<l>].txt``;
    the ``_LAMDA`` part is included only when ``decay`` is truthy.

    Args:
        resultDir: Directory for the result file. An empty value means
            "no directory component", so the file is relative to the
            current working directory.
        dataset: Dataset name, used as the start of the file name.
        outputPrefix: Extra tag appended right after the dataset name.
        ALPHA: Alpha value embedded in the name via ``str()``.
        BETA: Beta value embedded in the name via ``str()``.
        LAMDA: Lambda value embedded in the name when ``decay`` is truthy.
        decay: Whether the lambda value is part of the file name.

    Returns:
        The output path as a string.
    """
    # Prepending "/" to an empty resultDir would yield an absolute path at the
    # filesystem root, so only add the separator when there is a directory.
    directory = resultDir + "/" if resultDir else ""
    name = directory + dataset + outputPrefix + "_ALPHA" + str(ALPHA) + "_BETA" + str(BETA)
    if decay:
        name += "_LAMDA" + str(LAMDA)
    print("ALPHA " + str(ALPHA) + " - BETA " + str(BETA))
    return name + ".txt"
# ---- Experiment configuration ----
dataDir = "data/"
resultDir = ""
# Active dataset; the alternatives previously listed here were
# "News" and "reuters21578".
dataset = "Tweets"
# Hyper-parameters handed to Model below; alphas[i] is paired with betas[i].
LAMDA = 0.000006
alphas = [0.002]
betas = [0.0004]
# Feature switches forwarded to Model.
decay = True
applyICF = True
applyCWW = True
start_index = 0
# Tag the output file name with one suffix per enabled option, ICF first.
outputPrefix = "".join(
    tag for tag, enabled in (("_ICF", applyICF), ("_CWW", applyCWW)) if enabled
)
start_time = time.time()
print("Dataset: ", dataset, " , Decay:", decay, " , ICF = ", applyICF, " , CWW = ", applyCWW)
# Load every document into memory up front: the dataset file holds one JSON
# object per line.
listOfObjects = []
# JSON text is UTF-8, so do not depend on the platform's default encoding.
with open(dataDir + dataset, encoding="utf-8") as dataFile:
    for line in dataFile:
        if not line.strip():
            # A blank (e.g. trailing) line is not a document; skipping it
            # avoids a JSONDecodeError from json.loads.
            continue
        listOfObjects.append(json.loads(line))
printExecutionTime(start_time)
# Restart the clock so later reports measure processing, not loading.
start_time = time.time()
# Run the model once per hyper-parameter pair. Values are paired by position:
# alphas[i] is only ever combined with betas[i].
for ALPHA, BETA in zip(alphas, betas):
    if ALPHA == 0.0 or BETA == 0.0:
        continue  # pairs with a zero hyper-parameter are not run
    output = outputFileNameFormatter(resultDir, dataset, outputPrefix, ALPHA, BETA, LAMDA, decay)
    model = Model(ALPHA, BETA, LAMDA, applyDecay=decay, applyICF=applyICF, applyCWW=applyCWW)
    for docCount, obj in enumerate(listOfObjects, start=1):
        # The Document is built against the model's word/id structures
        # (original author's note: it splits the text and updates them).
        document = Document(obj, model.word_wid_map, model.wid_word_map,
                            model.wid_docId, model.word_counter)
        if docCount % 1000 == 0:
            # Progress report every 1000 documents; the clock restarts here.
            start_time = printExecutionTime(start_time, "Documents " + str(docCount))
        model.processDocument(document)
    # Write the cluster assignments, one "<docId> <clusterId> " line each:
    # current documents first, then the ones the model recorded as deleted.
    # The context manager closes the file even if a write fails.
    with open(output, "w") as f:
        for d in model.docIdClusId:
            f.write(str(d) + " " + str(model.docIdClusId[d]) + " \n")
        for d in model.deletedDocIdClusId:
            f.write(str(d) + " " + str(model.deletedDocIdClusId[d]) + " \n")
    print(output)
    printExecutionTime(start_time)
# Final report once every pair has been processed.
printExecutionTime(start_time)