-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf2map.py
184 lines (162 loc) · 3.98 KB
/
pdf2map.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import os
import numpy as np
import random
import logging
from subprocess import call
from gensim.models.doc2vec import LabeledSentence
from gensim import models
from sklearn.decomposition import PCA
# Turn each PDF into a metadata dict. The pipeline builds a list shaped like:
# data = [
#     {
#         'pdf': 'A Semantic Implication for HCI',
#         'doc': 'This paper presents an implication...',
#         'tag': ['author@example.com', 'CHI2016'],
#         'vec': [0.23, 0.54, 0.23 ... (doc2vec vector)],
#         'xyz': [0.33, 0.63, 0.23] (PCA of vec)
#     }, ...]
# Root that is handed to get_pdfs() at module level to collect the PDF paths.
ROOT_DIR = 'RKMTLAB'
# Directory in which pdf2doc() looks for / writes the '<pdf name>.txt' text files.
TEXT_DIR = 'text'
# Path passed directly to Doc2Vec save()/load() for the trained model.
MODEL_DIR = 'doc2vec.model'
def train(data):
    """Train a Doc2Vec model on ``data``, persist it and return the reloaded model.

    Every entry of ``data`` must provide 'doc' (the text, split on single
    spaces into words) and 'pdf' (used as the only tag of the document).
    The model is written to MODEL_DIR and then loaded back from there.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    corpus = []
    for entry in data:
        corpus.append(LabeledSentence(words=entry['doc'].split(' '), tags=[entry['pdf']]))

    model = models.Doc2Vec(corpus)

    # Twenty manual passes: after each one the learning rate is lowered by a
    # fixed step and min_alpha is pinned to the current alpha.
    alpha_step = (0.025 - 0.0001) / 19
    for _ in range(20):
        model.train(corpus, total_examples=model.corpus_count, epochs=model.iter)
        model.alpha -= alpha_step
        model.min_alpha = model.alpha

    model.save(MODEL_DIR)
    return models.Doc2Vec.load(MODEL_DIR)
def all_files(r, d=None):
    """Collect the paths of all ``.pdf`` files under ``r`` recursively.

    Args:
        r: A directory to walk, or a single file path.
        d: Optional list to append the results to. A fresh list is created
            when omitted, so separate calls never share results.

    Returns:
        The list of PDF paths (``d`` itself when it was supplied). Paths are
        built with '/' separators, which is what pdf2doc() splits on.
    """
    if d is None:
        # A default of ``list()`` in the signature would be created once and
        # shared by every call, so results would pile up across calls.
        d = []
    if os.path.isdir(r):
        # Ask the filesystem instead of guessing from a '.' in the name:
        # directories may contain dots and files may lack an extension.
        # Sorting keeps the resulting order stable between runs.
        for f in sorted(os.listdir(r)):
            all_files(r + '/' + f, d)
    elif r.endswith('.pdf'):
        d.append(r)
    return d
def read(path):
    """Return the content of the text file at ``path`` with newlines turned into spaces.

    The file is decoded as UTF-8, which is pdftotext's default output
    encoding, instead of whatever the current locale happens to be.

    Raises:
        FileNotFoundError: If ``path`` does not exist (pdf2doc relies on this).
    """
    with open(path, encoding='utf-8') as f:
        return f.read().replace('\n', ' ')
def pdf2doc(pdf_path):
    """Return the text of the PDF at ``pdf_path``, converting it on first use.

    The text is looked up in ``TEXT_DIR/<pdf file name>.txt``. When that file
    does not exist yet, ``pdftotext`` is run to create it and the result is
    read back.

    Raises:
        FileNotFoundError: If the text file is still missing after the
            conversion attempt, or if the ``pdftotext`` executable is not
            installed.
    """
    # NOTE: only the last path component names the text file, so two PDFs
    # with the same file name in different folders share one text file.
    pdf_name = pdf_path.split('/')[-1]
    text_path = '/'.join([TEXT_DIR, pdf_name + '.txt'])
    try:
        return read(text_path)
    except FileNotFoundError:
        # pdftotext cannot write into a directory that does not exist.
        os.makedirs(TEXT_DIR, exist_ok=True)
        status = call(['pdftotext', pdf_path, text_path])
        if status != 0:
            logging.warning('pdftotext exited with status %s for %s', status, pdf_path)
        return read(text_path)
def get_tags(doc):
    """Return, in order, every space-separated word of ``doc`` that contains '@'."""
    return [token for token in doc.split(' ') if '@' in token]
def append_doc(data):
    """Store the text of each entry's PDF under 'doc' and return ``data``.

    Requires the 'pdf' key on every entry; entries are updated in place.
    """
    for entry in data:
        entry['doc'] = pdf2doc(entry['pdf'])
    return data
def append_tag(data):
    """Store the tags found in each entry's text under 'tag' and return ``data``.

    Requires the 'doc' key on every entry; entries are updated in place.
    """
    for entry in data:
        entry['tag'] = get_tags(entry['doc'])
    return data
def pdf2vec(pdf, model):
    """Look up the document vector stored in ``model.docvecs`` under the tag ``pdf``."""
    doc_vectors = model.docvecs
    return doc_vectors[pdf]
def valid_model(model, data):
    """Tell whether ``model`` knows a document tag for every entry of ``data``.

    Args:
        model: Object whose ``docvecs.doctags`` maps document tags to values.
        data: Entries carrying their tag under the 'pdf' key.

    Returns:
        True when every entry's 'pdf' is among the model's tags (also for
        empty ``data``), False otherwise.
    """
    # Build the tag set once; rebuilding a list per entry and scanning it
    # made the check quadratic in the number of documents.
    known_tags = set(model.docvecs.doctags.keys())
    return all(d['pdf'] in known_tags for d in data)
def get_model(data):
    """Return a Doc2Vec model that covers every entry of ``data``.

    The model saved at MODEL_DIR is reused when it exists and passes
    valid_model(); otherwise a new one is trained with train().
    """
    try:
        model = models.Doc2Vec.load(MODEL_DIR)
    except FileNotFoundError:
        # First run: nothing has been saved yet, so train instead of failing.
        print('Generating new model')
        return train(data)
    if not valid_model(model, data):
        print('Generating new model')
        model = train(data)
    return model
def append_vec(data):
    """Store each entry's document vector under 'vec' and return ``data``.

    Requires the 'doc' and 'pdf' keys; the model comes from get_model().
    """
    model = get_model(data)
    for entry in data:
        entry['vec'] = pdf2vec(entry['pdf'], model)
    return data
def pca(vecs, n=3):
    """Fit a PCA with ``n`` components on ``vecs`` and return the transformed vectors."""
    reducer = PCA(n_components=n)
    projected = reducer.fit_transform(vecs)
    return projected
def append_xyz(data):
    """Store the PCA projection of each entry's 'vec' under 'xyz' and return ``data``.

    Requires the 'vec' key on every entry; entries are updated in place.
    """
    projected = pca([entry['vec'] for entry in data])
    for entry, point in zip(data, projected):
        entry['xyz'] = point
    return data
def append_hsl(data):
    """Store a colour under 'hsl' for each entry and return ``data``.

    Requires the 'tag' key. The colour is derived from the first tag via
    word2hsl(); entries without tags get [0, 0, 0].
    """
    for entry in data:
        tags = entry['tag']
        entry['hsl'] = word2hsl(tags[0]) if len(tags) != 0 else [0, 0, 0]
    return data
def pdfs2drops(data):
    """Run the doc, tag, vec and xyz stages on ``data`` and return it.

    Requires the 'pdf' key on every entry. The 'hsl' stage is not part of
    this helper.
    """
    data = append_doc(data)
    data = append_tag(data)
    data = append_vec(data)
    # The PCA stage is append_xyz(); there is no append_pca() in this module.
    data = append_xyz(data)
    return data
def get_pdfs(dr):
    """Return one ``{'pdf': path}`` entry per path that all_files() reports for ``dr``."""
    return [{'pdf': path} for path in all_files(dr)]
def word2hsl(word):
    """Map ``word`` to an [h, s, l] colour list.

    Hue comes from the first character, saturation from the second one
    (0 when the word has a single character), lightness is always 50.
    ``word`` must not be empty.
    """
    hue_step = 360 / 26
    saturation_step = 100 / 26
    hue = (ord(word[0]) % 26) * hue_step
    saturation = (ord(word[1]) % 26) * saturation_step if len(word) >= 2 else 0
    return [hue, saturation, 50]
def random_choice():
    """Return the by_id() record of a randomly picked index into the module-level ``data``."""
    candidates = range(len(data))
    return by_id(random.choice(candidates))
def by_id(i):
    """Return a flat record for entry ``i`` of the module-level ``data`` (for the server).

    ``i`` wraps around modulo ``len(data)``. The record holds the PDF path as
    'text', the 'xyz' components as 'x'/'y'/'z', the 'hsl' components as
    'h'/'s'/'l', and the number of entries as 'limit'.
    """
    entry = data[i % len(data)]
    return {
        'text': entry['pdf'],
        'x': entry['xyz'][0],
        'y': entry['xyz'][1],
        'z': entry['xyz'][2],
        'h': entry['hsl'][0],
        's': entry['hsl'][1],
        'l': entry['hsl'][2],
        'limit': len(data),
    }
# Build the module-level ``data`` list that by_id() and random_choice() read.
# This runs at import time; each stage needs the keys added by earlier ones.
# Collect one {'pdf': path} entry per PDF found under ROOT_DIR.
data = get_pdfs(ROOT_DIR)
# Add 'doc': the text of each PDF.
data = append_doc(data)
# Add 'tag': the words of 'doc' that contain '@'.
data = append_tag(data)
# Add 'vec': the Doc2Vec document vector (loads or trains the model).
data = append_vec(data)
# Add 'xyz': the 3-component PCA projection of 'vec'.
data = append_xyz(data)
# Add 'hsl': a colour derived from the first tag.
data = append_hsl(data)