grabbr.py
import datetime
import urllib
import time
import requests
import xmltodict
from pyquery import PyQuery
from db import Posts, WpPosts, WpTerms, WpTermTaxonomy, WpTermRelationships, wpdb
from sqlalchemy import and_, or_
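
# Workflow: load the urls listed in a sitemap, fetch each page from Google's
# web cache, extract the post fields with PyQuery, keep the intermediate data
# in the Posts collection, and finally insert the result into the WordPress
# tables (WpPosts, WpTerms, WpTermTaxonomy, WpTermRelationships).
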
class WpGrabbr(object):
    """Grab pages listed in a sitemap from Google's web cache, parse them,
    and insert the results into a WordPress database."""
    def __init__(self, sitemapfile='sitemap.xml', timer=61):
        self.sitemapfile = sitemapfile
        self.timer = timer
        self.cacheurl = "http://webcache.googleusercontent.com/search?q=cache:"
        self.loadedurls = 0
        self.parsedurls = 0
        self.insertedurls = 0
        self.failedurls = 0
        self.s = requests.Session()
        self.s.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0'
        self.s.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        self.grabbed_additional_urls = set([])

    def crawl_missing_urls(self, url, grabbed=None):
        """Recursively process internal urls linked from a grabbed page,
        skipping category, author, feed and tag pages and already-seen urls."""
        print "Crawling internal urls from {}".format(url)
        if not grabbed: grabbed = self.check_url(url)
        self.grabbed_additional_urls.add(url)
        if not grabbed: return
        parsd = PyQuery(grabbed['raw_content'])
        all_urls = parsd('a')
        parenturl = url.split('/')[2]
        internal_urls = set(filter(
            lambda a: a and
            parenturl in a and
            'category' not in a and
            'author' not in a and
            'feed' not in a and
            'tag' not in a and
            '#' not in a and
            '&' not in a and
            '.' not in a.split('/')[-1] and
            'http://' + parenturl + '/' != a and
            a not in self.grabbed_additional_urls,
            [a.attrib.get('href') for a in all_urls]))
        grabbedurls = [self.check_url(iurl) for iurl in internal_urls]
        for gurl in grabbedurls:
            if gurl and gurl.get('url'): self.crawl_missing_urls(gurl['url'], gurl)

    def grab_url(self, url):
        """Grab url into database"""
        response = self.s.get(self.cacheurl + url)
        if response.status_code == 200:
            raw_content = response.content
            Posts.insert({'url': url, 'raw_content': raw_content})
            print "Url saved."
            self.loadedurls += 1
            print "Sleeping for {} seconds".format(self.timer)
            time.sleep(self.timer)
        elif response.status_code == 503:
            print "Seems we're banned"
            print "Sleeping for one hour to get forgiven"
            print "You may interrupt and kill me with Ctrl+C"
            time.sleep(61 * 60)
        else:
            print "Couldn't get url"
            print response.url
            self.failedurls += 1
        return Posts.find_one({'url': url})

    def parse_grabbed(self, url, item, datestr=u'2014-07-18T11:20:24+00:00'):
        """
        Parse a grabbed item: extract the title, content, tags, category,
        author and post date.
        THIS IS THE METHOD YOU HAVE TO MODIFY FIRST TO GET THIS THING WORKING
        """
        raw_data = item['raw_content']
        parsd = PyQuery(raw_data)
        content_el = parsd('div.entry-content')
        if not content_el:
            content_el = parsd('.post-content')
        content = content_el.html()
        title = parsd('h1').html()
        tags = []
        for raw_tag in parsd('ul.tag-list>li>a'):
            tag = {'title': raw_tag.text,
                   'slug': urllib.pathname2url(
                       raw_tag.attrib['href'].split('/')[-1].encode('utf8')
                   )}
            tags.append(tag)
        raw_posted_date = parsd('header .entry-meta time.entry-date')
        if raw_posted_date:
            raw_posted_date_text = raw_posted_date[0].attrib['datetime']
        else:
            print "Failed to parse date!"
            # fall back to the sitemap's lastmod, or the hard-coded default if none was passed
            raw_posted_date_text = datestr or u'2014-07-18T11:20:24+00:00'
        print "Setting post date: {}".format(raw_posted_date_text)
        posted_date = datetime.datetime.strptime(raw_posted_date_text[:-6], "%Y-%m-%dT%H:%M:%S")
        raw_category = None
        for potential_category in parsd('a'):
            if potential_category.attrib.get('rel'):
                if 'tag' in potential_category.attrib.get('rel'):
                    raw_category = potential_category
                    break
        if raw_category:
            category = {'title': raw_category.text,
                        'slug': urllib.pathname2url(
                            raw_category.attrib['href'].split('/')[-1].encode('utf8')
                        )}
        else:
            category = None
        author_raw = parsd('header .vcard>a')  # vcard is a class, not a tag
        author = author_raw[0].text if author_raw else None
        Posts.update({'url': url}, {'$set': {
            'slug': url.split('/')[-1],
            'content': content,
            'title': title,
            'tags': tags,
            'posted_date': posted_date,
            'category': category,
            'author': author,
            'parsed': True
        }})
        self.parsedurls += 1
        time.sleep(1)
        return Posts.find_one({'url': url})

    def insert_into_wp(self, item):
        """Insert into wp database"""
        def insert_taxonomy_relation(tag, taxonomy_type):
            """Helper to manage wp's tags and categories"""
            wptag = wpdb.execute(
                WpTerms.select(
                    WpTerms.c.slug == tag['slug'])
            ).fetchone()
            if wptag:
                wptag_id = wptag[0]
                taxonomy = wpdb.execute(WpTermTaxonomy.select(and_(
                    WpTermTaxonomy.c.term_id == wptag_id,
                    WpTermTaxonomy.c.taxonomy == taxonomy_type))).fetchone()
                taxonomy_id = taxonomy[0] if taxonomy else None
            else:
                i = WpTerms.insert({'slug': tag.get('slug'),
                                    'name': tag.get('title', 'None')})
                res = wpdb.execute(i)
                wptag_id = res.inserted_primary_key[0]
                i = WpTermTaxonomy.insert({'term_id': wptag_id,
                                           'taxonomy': taxonomy_type})
                res = wpdb.execute(i)
                taxonomy_id = res.inserted_primary_key[0]
            if wptag_id is not None and taxonomy_id is not None:
                i = WpTermRelationships.insert({'object_id': post_id,
                                                'term_taxonomy_id': taxonomy_id})
                wpdb.execute(i)
        i = WpPosts.insert({
            'post_author': 1,  # TODO
            'post_date': item.get('posted_date'),
            'post_date_gmt': item.get('posted_date'),
            'post_modified': item.get('posted_date'),
            'post_modified_gmt': item.get('posted_date'),
            'post_content': item.get('content'),
            'post_title': item.get('title'),
            'post_name': item.get('slug'),
            'post_status': 'publish',
            'comment_status': 'open',
            'ping_status': 'open'
        })
        res = wpdb.execute(i)
        post_id = res.inserted_primary_key[0]
        for tag in item['tags']:
            insert_taxonomy_relation(tag, 'post_tag')
        if item.get('category'):
            insert_taxonomy_relation(item['category'], 'category')
        Posts.update({'url': item['url']}, {'$set': {
            'inserted': True}})
        self.insertedurls += 1

    def load_sitemap(self, sitemapfile):
        """Parse sitemap"""
        with open(sitemapfile) as f:
            data = xmltodict.parse(f)
        items = data['urlset']['url']
        print "Found {} entries in sitemap".format(len(items))
        return items

    def check_url(self, url, datestr=None):
        """Check a url: if it has not been processed yet, grab it, parse it
        and insert it into wp; otherwise just return the parsed item.
        """
        print "checking {}".format(url)
        item = Posts.find_one({'url': url})
        if not item:
            print "Url was not grabbed. Grabbing..."
            try:
                item = self.grab_url(url)
                if not item:
                    return
            except Exception as e:
                print "Failed grabbing with e:", e
                self.failedurls += 1
                return
        if not item.get('parsed') or not item.get('content'):
            print "Parsing item..."
            try:
                item = self.parse_grabbed(url, item, datestr=datestr)
            except Exception as e:
                print "Failed parsing with e:", e
                self.failedurls += 1
                return
            print "Parsing done."
        wp_post = wpdb.execute(WpPosts.select(
            WpPosts.c.post_name == item['slug'])).fetchone()
        if not wp_post:
            print "Inserting into wp db..."
            try:
                self.insert_into_wp(item)
            except Exception as e:
                print "Failed inserting with e:", e
                self.failedurls += 1
                return
            print "Done inserting."
        else:
            print "Already processed"
        return item

    def parse_from_sitemap(self):
        """Grab and load all urls from sitemap"""
        items = self.load_sitemap(self.sitemapfile)
        for item in items:
            url = item['loc']
            datestr = item['lastmod']
            self.check_url(url, datestr)
        self.print_finished()

    def print_finished(self):
        print "Done"
        print "successfully grabbed {}".format(self.loadedurls)
        print "successfully parsed {}".format(self.parsedurls)
        print "successfully inserted {}".format(self.insertedurls)
        print "failed {} urls".format(self.failedurls)