forked from vrtx64/fav2pdf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fav2pdf.py
executable file
·224 lines (190 loc) · 8.81 KB
/
fav2pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#!/usr/bin/python2
# -*- coding: utf-8 -*-
import re
import argparse
import os
import sys
from pdf_gen import *
from datetime import date, timedelta, datetime
import requests
import lxml.html
def parseTopic(url):
    """Fetch a mobile-Habrahabr post page and return (topic_html, comments_html)."""
    page = requests.get(url).text
    # Sanity check: raises ValueError unless the mobile-site marker is present.
    page.index(u'<a href="http://m.habrahabr.ru/" accesskey="2">μHabr</a>')
    # The post body sits between the "txt" and "adv" divs.
    body = page[page.find('<div class="txt">'):page.find('<div class="adv">')]
    # Turn the bare author nickname into a link to the author's blog.
    nick = re.findall('<div class="m">\n\t\t\t\n\t\t\t(.*),', body)[0]
    body = re.sub('<div class="m">\n\t\t\t\n\t\t\t(.*),',
                  '<div class="m"><a href="http://%s.habrahabr.ru">%s</a>,' % (nick, nick), body)
    # Normalise markup quirks so the PDF generator copes with them.
    body = re.sub('\s<br/>\s*\S*<br/>', '<br/>', body)
    body = body.replace('align="left"/>', '/>')
    body = body.replace('align="center"', 'align="middle"')
    body = re.sub('/>\s*<img', '/><br/><img', body)
    # Comments sit between the "cmts" and "bm" divs of the original page.
    comments = page[page.find('<div class="cmts">'):page.find('<div class="bm">')]
    return body, comments
def save(dest_dir=u'.', user='none', from_date=u'', to_date=u'',
all_in_one=False, save_comments=True, create_symlinks=True, only_hubs=[]):
# user = 'icoz'
site = "habrahabr.ru/users/%s" % user
# from_date = u'' # 5 августа 2009
# to_date = u'' # 30 ноября 2010
#[u'Android', u'Mobile Development'] только перечисленные блоги. Должны быть юникодные, не забывать 'u' перед строкой. Звездочки писать не надо.
if only_hubs is None:
only_hubs = []
topic_per_page = 10
month = [
u'января', u'февраля', u'марта', u'апреля', u'мая', u'июня', u'июля', u'августа', u'сентября', u'октября',
u'ноября',
u'декабря']
dr = requests.get('http://' + site + '/favorites/').text
DIR_PDF = dest_dir + '/pdf'
DIR_POSTS = DIR_PDF + '/posts'
DIR_HUBS = DIR_PDF + '/hubs'
if not all_in_one:
if not os.path.exists(DIR_PDF):
os.mkdir(DIR_PDF)
if not os.path.exists(DIR_POSTS):
os.mkdir(DIR_POSTS)
if create_symlinks:
if not os.path.exists(DIR_HUBS):
os.mkdir(DIR_HUBS)
try:
doc = lxml.html.document_fromstring(dr)
profilepage = requests.get('http://' + site).text
profiledoc = lxml.html.document_fromstring(profilepage)
if u'read-only' in profilepage:
count = int(doc.xpath('.//td/a/span/span/text()')[0])
else:
count = int(doc.xpath('.//div/a/span/span/text()')[0])
# Read-only accounts have different page layout, so it needs different
# handling
except:
print 'No favorites found. Most likely its a typo in username'
sys.exit(1)
page = count / topic_per_page + 1
data_finder = doc.xpath(
'.//div[@class="posts shortcuts_items"]/div/div[1]/text()')
if (to_date != ''):
td = to_date.split(' ')
to_date_dt = date(int(td[2]), month.index(td[1]) + 1, int(td[0]))
else:
to_date_dt = date.today()
if (from_date != ''):
fd = from_date.split(' ')
from_date_dt = date(int(fd[2]), month.index(fd[1]) + 1, int(fd[0]))
else:
from_date_dt = date(2000, 1, 1)
content = u'<br /><div align="center"><h2>Избранное пользователя <a href="http://%s.habrahabr.ru">%s</a></h2> (%s - %s) <br /><br /><strong>Содержание</strong></div><br />' % (
user, user, from_date_dt.strftime('%d/%m/%y'), to_date_dt.strftime('%d/%m/%y'))
content_body = u''
topic = ''
topic_res = ''
topicCount = 0
in_date = 0
topic_m = []
for p in range(1, page + 1):
print '\nProcessed page %s of %s:' % (p, page)
dr = requests.get('http://%s/favorites/page%s/' % (site, p)).text
# get posts
doc = lxml.html.fromstring(dr)
elems = doc.xpath('.//div[@class="posts shortcuts_items"]/div')
# get hubs from posts
hubs = [x.xpath('.//div[@class="hubs"]/a/text()') for x in elems]
postLinks = doc.xpath('.//h1[@class="title"]/a[1]')
postDates = doc.xpath(
'.//div[@class="posts shortcuts_items"]/div/div[1]/text()')
for dd in postDates:
in_date += 1
parts = dd.strip(' ').split(' ')
if u'вчера' in dd:
d = date.today() - timedelta(1)
elif u'сегодня' in dd:
d = date.today()
else:
if re.search('20[0-9]{2}', dd):
d = date(int(parts[2]), month.index(
parts[1]) + 1, int(parts[0]))
else:
d = date(datetime.now().year, month.index(
parts[1]) + 1, int(parts[0]))
if from_date_dt <= d <= to_date_dt:
topic_m.append(in_date)
print '----------------------'
for index, a in enumerate(postLinks):
topicCount += 1
# here we will get /post/ID/ part of the link
# check for company/link posts
url = a.get('href')
if 'company' in url:
token = 'post/' + url.split('blog/')[1]
elif 'linker' in url:
token = url.split('linker/')[1].replace('go', 'post')
else:
token = a.get('href').split('ru/')[1]
# get post id for saving
id = token.split('post/')[1]
if id[-1] == "/":
id = id[:-1]
m_link = u'http://m.habrahabr.ru/%s' % token
if len(set(only_hubs) & set(hubs[index])) > 0:
hubFlag = True
else:
hubFlag = False
if (topicCount in topic_m) and (hubFlag or only_hubs == []):
try:
print '%d Topic: %s->%s' % (topicCount, ', '.join(hubs[index]), url)
if all_in_one:
content += u'[%s] <a href="#%d">%s</a><br />' % (
', '.join(hubs[index]), topicCount, a.text)
topic_res, cmts_res = parseTopic(m_link)
# format topic
topic = u'<h2><a name="%d">[%s] </a><br><a href="%s">%s</a></h2><br><br>' % (
topicCount, u', '.join(hubs[index]), url, a.text) + topic_res
# ... and comments
if save_comments:
topic += cmts_res
if all_in_one:
content_body += topic
else:
go(topic, DIR_POSTS + '/' + id + '.pdf')
# create symlinks
if create_symlinks:
for hub in hubs[index]:
if not os.path.exists(DIR_HUBS + '/' + hub):
os.mkdir(DIR_HUBS + '/' + hub)
os.symlink(
'../../posts/' + id + '.pdf', DIR_HUBS + '/' + hub + "/" + id + '.pdf')
except:
print ' Topic: %s->%s is locked!' % (', '.join(hubs[index]), a.text)
print '----------------------'
if all_in_one:
go(content + content_body, dest_dir + '/' + user + '.pdf')
def main():
    """CLI entry point: parse arguments, validate the output dir, run save()."""
    parser = argparse.ArgumentParser(
        description=u'Tool for save favorite posts from habrahabr.ru in pdf''s')
    parser.add_argument('user', type=str, help="habrahabr.ru username")
    parser.add_argument('-d', '--output-dir', default='.',
                        type=str, help="Directory for output")
    parser.add_argument('--from-date', type=str, default=u'', help='From date')
    parser.add_argument('--to-date', type=str, default=u'', help='To date')
    parser.add_argument('--all-in-one', action='store_true',
                        help='Save all posts in one PDF-file')
    parser.add_argument('--only-hubs', nargs="*",
                        help='Save only posts from hubs. For multiple: "--only-hubs Hub1 Hub2 --"')
    parser.add_argument('--no-comments', action='store_true',
                        help='Dont save comments from posts')
    parser.add_argument('--no-symlinks', action='store_true',
                        help='Dont create symlinks to posts')
    opts = parser.parse_args()
    if not os.path.exists(opts.output_dir):
        print('Error! Directory "' + opts.output_dir + '" not exists!')
        sys.exit(1)
    # Dates arrive from the shell as Python 2 byte strings in the filesystem
    # encoding; decode them to unicode before handing off.
    fs_enc = sys.getfilesystemencoding()
    save(
        dest_dir=opts.output_dir,
        user=opts.user,
        from_date=opts.from_date.decode(fs_enc),
        to_date=opts.to_date.decode(fs_enc),
        only_hubs=opts.only_hubs,
        all_in_one=opts.all_in_one,
        save_comments=not opts.no_comments,
        create_symlinks=not opts.no_symlinks)


if __name__ == '__main__':
    main()