SaveTopicModel.py
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 13 11:21:37 2014
Scripts to read the output of the topic model (run using mallet)
into python
@author: dvats
"""
# save the topic model as pickle onjects
import numpy as np
import pandas as pd
import gzip
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import pickle
def GetID(text):
    """
    Get the document id from text of the form
    guide_data/id.txt
    """
    text = text.split()
    ind = text[1]  # second field holds the source file path
    ind = ind.split('.txt')[0]
    ind = int(ind.split('guide_data/')[1])  # get index number
    return ind
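# A quick illustration of GetID on a synthetic line (hypothetical input; real
# lines come from the MALLET output files read below):
#
#     >>> GetID("0 guide_data/123.txt 7")
#     123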
def ReadDocumentTopics(num_docs, num_topics):
    """
    Read ./mallet_output/per_document_output.gz and count the
    number of times each topic appears in each document
    """
    # read file
    f = gzip.open('./mallet_output/per_document_output.gz', 'rb')
    f.readline()  # skip the header line
    alpha = f.readline()  # line of the form "alpha : a_1 a_2 ... a_K"
    alpha = alpha.split(":")[1]
    alpha = np.array([float(a) for a in alpha.split()])  # Dirichlet hyperparameters (parsed but unused)
    f.readline()  # skip the beta line
    doc_tops = np.zeros([num_docs, num_topics])
    for chunk in iter(lambda: f.readline(), ''):
        ind = GetID(chunk)  # id number
        top_num = int(chunk.split()[-1])  # get topic number (last field)
        if ind > num_docs-1:
            break
        doc_tops[ind, top_num] = doc_tops[ind, top_num] + 1.0
    return doc_tops
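# For reference, each remaining line of the gzipped state file is expected to
# look like "doc source pos typeindex type topic" (MALLET's --output-state
# format), e.g. this synthetic line:
#
#     0 guide_data/123.txt 4 1871 beach 17
#
# GetID reads the document id from the source path, and the topic assignment
# is the last field.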
def ReadTopicProps(num_docs, num_topics):
    """
    Find the topic proportions for each document (guidebook)
    """
    f = open("mallet_output/topic_props.txt")
    f.readline()  # skip the header line
    topic_prop = np.zeros((num_docs, num_topics))
    for chunk in iter(lambda: f.readline(), ''):
        ind = GetID(chunk)
        chunk = chunk.split()[2:]  # drop the doc number and source path
        for i in range(num_topics):
            c1 = int(chunk[2*i])      # topic index
            c2 = float(chunk[2*i+1])  # topic proportion
            topic_prop[ind, c1] = c2
    return topic_prop
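# Each data line of topic_props.txt pairs topic indices with proportions, in
# the style of MALLET's --output-doc-topics file, e.g. this synthetic line:
#
#     0 guide_data/123.txt 17 0.41 3 0.22 9 0.08 ...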
def ReadWordCounts(num_topics):
    """
    Read the number of times each word is mapped to each topic
    """
    f = open("mallet_output/word_topic_counts.txt")
    num_words = 10000  # cap on the vocabulary size
    # create a sparse matrix of size num_words x num_topics
    word_count = lil_matrix((num_words, num_topics))
    word_list = []
    for chunk in iter(lambda: f.readline(), ''):
        chunk = chunk.split()
        ind = int(chunk[0])
        if ind == num_words-1:
            break  # stop once the vocabulary cap is reached
        word_list.append(chunk[1])
        chunk = chunk[2:]
        for ck in chunk:
            # ck has the form topic:count
            ck = ck.split(':')
            word_count[ind, int(ck[0])] = int(ck[1])
    return word_count, np.array(word_list)
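# Each line of word_topic_counts.txt holds a word id, the word itself, and
# topic:count pairs (MALLET's --word-topic-counts-file format), e.g. this
# synthetic line:
#
#     42 beach 17:512 3:88 9:5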
def ScoreDocument(X, word_count, total_words_eachtopic, topic_props, i):
    """
    Main algorithm to score documents based on keyword i
    See test_web/app/RankingUsingMallet.py for more details
    """
    # lda score of word i for each document:
    #   \sum_k P(word i | T_k) * P(T_k | doc),
    # where P(word i | T_k) = (# times word i in T_k) / (# words in T_k)
    tmp = np.array(word_count[i, :].todense()).flatten()
    tmp = tmp / total_words_eachtopic  # P(word i | T_k) for each topic k
    tmp = np.dot(topic_props, tmp)     # weight by per-document topic proportions
    xx = np.array(X[:, i].todense()).flatten()  # l1-normalized tf-idf weight of word i
    mix = 0.5  # mixing of counting (tf-idf) and lda model
    tmp = ((1-mix) * tmp + mix * xx)
    return tmp
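# A minimal usage sketch (hedged; 'beach' is a hypothetical query word, and
# the ranking step is an assumption based on RankingUsingMallet.py's purpose):
#
#     idx = np.where(word_list == 'beach')[0]
#     if len(idx) > 0:
#         scores = ScoreDocument(X, word_count, total_words_eachtopic,
#                                topic_props, idx[0])
#         ranking = np.argsort(-scores)  # highest-scoring documents first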
# main code
print "Reading Data"
main_data = pd.read_csv("./data/FilteredTravelData.csv").fillna('')
guide_data = np.array(main_data["all_data"])
title = np.array(main_data["title"])
num_docs = len(main_data)
# read number of topics (one line per topic in topic_keys.txt)
num_topics = sum(1 for line in open('./mallet_output/topic_keys.txt'))
# doc_tops: num_docs x num_topics matrix
# topic_props: num_docs x num_topics matrix
# num_words: num_docs x 1 vector
doc_tops = ReadDocumentTopics(num_docs, num_topics)
num_words = np.sum(doc_tops, axis=1)  # total topic assignments (words) per document
topic_props = ReadTopicProps(num_docs, num_topics)
# word_count: num_words x num_topics sparse matrix
word_count, word_list = ReadWordCounts(num_topics)
total_words_eachtopic = np.array(word_count.sum(axis=0))[0]
vectorizer = TfidfVectorizer(vocabulary=word_list)
X = vectorizer.fit_transform(guide_data)
X = normalize(X, axis=1, norm='l1')  # each row (document) sums to 1
# define a dictionary and then save the data
print "Saving data to pickle object"
DictData = {}
DictData["doc_tops"] = doc_tops
DictData["num_words"] = num_words
DictData["topic_props"] = topic_props
DictData["word_count"] = word_count
DictData["word_list"] = word_list
DictData["total_words_eachtopic"] = total_words_eachtopic
DictData["X"] = X
DictData["title"] = title
filehandler = open('test_web/topic_model_data.obj', 'wb')  # binary mode for pickle
pickle.dump(DictData, filehandler)
filehandler.close()
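# To load the saved model elsewhere (a minimal sketch; 'rb' matches the
# binary mode used for writing above):
#
#     import pickle
#     f = open('test_web/topic_model_data.obj', 'rb')
#     DictData = pickle.load(f)
#     f.close()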