-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' of github.com:brock7/scripts
Conflicts: gather.py
- Loading branch information
Showing
9 changed files
with
273 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
__all__ = ['webutils', 'google'] | ||
__all__ = ['webutils', 'google', 'crawler'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# -*- encoding: utf-8 -*- | ||
|
||
import searchbase | ||
import re | ||
import urllib, urllib2 | ||
import webutils | ||
from lxml import etree | ||
|
||
class Googto(searchbase.SearchBase):
    """Search adapter for www.googto.com built on ``searchbase.SearchBase``.

    Overrides the three hooks used by the base class: parsing the total
    result count, extracting result links, and building the search URL.
    """

    # Result-count banner ("About N results found" in Chinese). The captured
    # number may contain thousands separators.
    _totalRecordPattern = re.compile(r'找到约 ([0-9,]+) 条结果')

    def _updateTotalRecord(self, html):
        """Parse the total result count out of *html*.

        On a match the count is stored in ``self._totalRecord`` and echoed
        to stdout; when the banner is absent nothing is changed.
        """
        match = self._totalRecordPattern.search(html)
        if match is None:
            return
        # The pattern has exactly one group, so group(1) always exists here.
        self._totalRecord = int(match.group(1).replace(',', ''))
        print('* Total: %d' % self._totalRecord)

    def _pickupLinks(self, html):
        """Return the ``href`` values of all ``<h3><a>`` elements in *html*."""
        tree = etree.HTML(html)
        return tree.xpath(r'//h3/a/@href')

    def _genUrl(self, what, start):
        """Return the search URL for query *what* at result offset *start*.

        *what* is inserted verbatim, so it must already be URL-quoted.
        """
        return 'http://www.googto.com/?q=%s&start=%d' % (what, start)
|
||
if __name__ == '__main__':
    # Manual smoke test: print up to 10 result links for a sample query.
    http_opener = urllib2.build_opener()
    webutils.setupOpener(http_opener)
    engine = Googto(http_opener)

    hits = engine.search('site:letv.com', 10)
    for hit in hits:
        print(hit)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# -*- encoding: utf-8 -*- | ||
|
||
import searchbase | ||
import re | ||
import urllib, urllib2 | ||
import webutils | ||
from lxml import etree | ||
|
||
# Result-count banner ("About N results found" in Chinese); group 1 is the
# count, possibly with thousands separators.
pattern = re.compile(r'<div id="resultStats">找到约 ([0-9,]+) 条结果')
# "Nothing matching your query was found" message (Chinese).
pattern2 = re.compile(r'找不到和您的查询 "<em>.*?</em>" 相符的内容或信息。')


class HxGoogle(searchbase.SearchBase):
    """Search adapter for the g1.hxgoogle.com mirror.

    Overrides the three hooks used by ``searchbase.SearchBase``: parsing the
    total result count, extracting result links, and building the search URL.
    """

    def _updateTotalRecord(self, html):
        """Parse the total result count out of *html*.

        Sets ``self._totalRecord`` to 0 when the page carries the
        "no results" message, to the parsed number when the result-count
        banner is present, and leaves it untouched otherwise.
        """
        if pattern2.search(html) is not None:
            self._totalRecord = 0
            return
        match = pattern.search(html)
        if match is None:
            return
        # The pattern has exactly one group, so group(1) always exists here.
        self._totalRecord = int(match.group(1).replace(',', ''))
        print('Total:  %d' % self._totalRecord)

    def _pickupLinks(self, html):
        """Return the ``href`` values of all ``<h3><a>`` elements in *html*."""
        tree = etree.HTML(html)
        return tree.xpath(r'//h3/a/@href')

    def _genUrl(self, what, start):
        """Return the search URL for query *what* at result offset *start*.

        *what* is inserted verbatim, so it must already be URL-quoted.
        """
        return 'http://g1.hxgoogle.com/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, start)
|
||
|
||
# Module-wide HxGoogle instance, created on the first google() call.
hx = None


def google(opener, what, resultNum=-1, startNum=0):
    """Search for *what* through a shared ``HxGoogle`` instance.

    The instance is created on the first call with that call's *opener*
    and reused afterwards; the *opener* passed to later calls is not used.

    Returns whatever ``HxGoogle.search(what, resultNum, startNum)`` returns.
    """
    global hx
    if hx is None:
        hx = HxGoogle(opener)
    return hx.search(what, resultNum, startNum)
|
||
if __name__ == '__main__':
    # Manual smoke test: print up to 20 result links for a sample query.
    http_opener = urllib2.build_opener()
    webutils.setupOpener(http_opener)

    hits = google(http_opener, 'site:letv.com', 20)
    for hit in hits:
        print(hit)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# -*- encoding: utf-8 -*- | ||
|
||
import searchbase | ||
import re | ||
import urllib, urllib2 | ||
import webutils | ||
from lxml import etree | ||
|
||
# Result-count banner ("About N results found" in Chinese); group 1 is the
# count, possibly with thousands separators.
pattern = re.compile(r'<div id="resultStats">找到约 ([0-9,]+) 条结果')
# "Nothing matching your query was found" message (Chinese).
pattern2 = re.compile(r'找不到和您的查询 "<em>.*?</em>" 相符的内容或信息。')


class HxGoogle(searchbase.SearchBase):
    """Search adapter for the g2.hxgoogle.com mirror.

    Overrides the three hooks used by ``searchbase.SearchBase``: parsing the
    total result count, extracting result links, and building the search URL.
    """

    def _updateTotalRecord(self, html):
        """Parse the total result count out of *html*.

        Sets ``self._totalRecord`` to 0 when the page carries the
        "no results" message, to the parsed number when the result-count
        banner is present, and leaves it untouched otherwise.
        """
        if pattern2.search(html) is not None:
            self._totalRecord = 0
            return
        match = pattern.search(html)
        if match is None:
            return
        # The pattern has exactly one group, so group(1) always exists here.
        self._totalRecord = int(match.group(1).replace(',', ''))
        print('Total:  %d' % self._totalRecord)

    def _pickupLinks(self, html):
        """Return the ``href`` values of all ``<h3><a>`` elements in *html*."""
        tree = etree.HTML(html)
        return tree.xpath(r'//h3/a/@href')

    def _genUrl(self, what, start):
        """Return the search URL for query *what* at result offset *start*.

        *what* is inserted verbatim, so it must already be URL-quoted.
        """
        return 'http://g2.hxgoogle.com/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, start)
|
||
|
||
# Module-wide HxGoogle instance, created on the first google() call.
hx = None


def google(opener, what, resultNum=-1, startNum=0):
    """Search for *what* through a shared ``HxGoogle`` instance.

    The instance is created on the first call with that call's *opener*
    and reused afterwards; the *opener* passed to later calls is not used.

    Returns whatever ``HxGoogle.search(what, resultNum, startNum)`` returns.
    """
    global hx
    if hx is None:
        hx = HxGoogle(opener)
    return hx.search(what, resultNum, startNum)
|
||
if __name__ == '__main__':
    # Manual smoke test: print up to 20 result links for a sample query.
    http_opener = urllib2.build_opener()
    webutils.setupOpener(http_opener)

    hits = google(http_opener, 'site:letv.com', 20)
    for hit in hits:
        print(hit)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
# -*- encoding: utf-8 -*- | ||
|
||
import urllib2 | ||
import sys | ||
import os | ||
import webutils | ||
|
||
class SearchBase:
    """Base class for paged web-search scrapers.

    ``search()`` walks result pages one by one and yields result links.
    Subclasses provide the site-specific parts by overriding
    ``_updateTotalRecord``, ``_pickupLinks``, ``_genUrl`` and, optionally,
    ``_getNumPerPage``.
    """

    # Opener whose open(request, timeout=...) method fetches each page.
    _opener = None
    # Total number of results reported by the site; sys.maxint means
    # "not known yet".
    _totalRecord = sys.maxint

    # Timeout passed to the opener for every page request.
    reqTimeout = 20

    def __init__(self, opener):
        """Remember *opener* for fetching result pages."""
        self._opener = opener

    def _updateTotalRecord(self, html):
        """Hook: parse the total result count from *html*.

        Implementations store the count in ``self._totalRecord``. The base
        implementation does nothing.
        """
        pass

    def _pickupLinks(self, html):
        """Hook: return an iterable of result links found in *html*.

        The base implementation finds nothing and returns an empty list.
        """
        return []

    def _pageHandler(self, url):
        """Fetch *url* and yield every result link on that page.

        A failure while fetching is reported on stdout and ends the
        generator without yielding anything. While the total is still
        unknown, the page is also handed to ``_updateTotalRecord``.
        """
        req = urllib2.Request(url)
        webutils.setupRequest(req)
        # Referer is the page URL minus its last four characters.
        # NOTE(review): the reason for dropping exactly four characters is
        # not visible here - confirm against the target sites.
        req.add_header('Referer', url[:-4])

        try:
            response = self._opener.open(req, timeout=self.reqTimeout)
            try:
                html = response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
        except Exception as e:
            print("Exception: url: %s -  %s" % (url, e))
            # A bare return ends the generator; raising StopIteration inside
            # a generator becomes RuntimeError under PEP 479.
            return

        if self._totalRecord == sys.maxint:
            self._updateTotalRecord(html)

        # "or ()" tolerates hooks that return None instead of a sequence.
        for link in self._pickupLinks(html) or ():
            yield link

    def _getNumPerPage(self):
        """Hook: return the number of results per page (10 by default)."""
        return 10

    def _genUrl(self, what, start):
        """Hook: return the search URL for quoted query *what* at offset *start*.

        The base implementation returns an empty string.
        """
        return ''

    def search(self, what, resultNum=-1, startNum=0):
        """Yield result links for the query *what*.

        what      -- raw query string; it is URL-quoted before use.
        resultNum -- maximum number of links to yield, or -1 for no limit.
        startNum  -- result offset to start from; it is rounded up to a
                     whole number of pages.

        Iteration ends when *resultNum* links were yielded, when the total
        reported by the site is reached, when the page budget derived from
        *resultNum* is used up, or - for unlimited searches - when a page
        yields no links at all (otherwise an exhausted or failing site would
        be polled forever).
        """
        numPerPage = self._getNumPerPage()

        if resultNum == -1:
            pageCount = -1
        else:
            # Ceiling division: pages needed to cover resultNum results.
            pageCount = int((resultNum + numPerPage - 1) // numPerPage)

        startPage = int((startNum + numPerPage - 1) // numPerPage)

        # Forget the total of any previous search.
        self._totalRecord = sys.maxint

        what = urllib2.quote(what)

        pageNum = 1
        resCnt = 0

        while pageCount == -1 or pageNum <= pageCount:
            url = self._genUrl(what, (startPage + pageNum - 1) * numPerPage)

            pageHits = 0
            for result in self._pageHandler(url):
                pageHits += 1
                resCnt += 1
                yield result
                if resultNum != -1 and resCnt >= resultNum:
                    return
                if resCnt >= self._totalRecord:
                    return

            if self._totalRecord == sys.maxint:
                # The page did not reveal a total: fall back to the caller's
                # limit, or to a value that no longer equals the "unknown"
                # sentinel so later pages are not parsed for it again.
                if resultNum == -1:
                    self._totalRecord = sys.maxint - 1
                else:
                    self._totalRecord = resultNum

            if resCnt >= self._totalRecord:
                return
            if pageCount == -1 and pageHits == 0:
                # Unlimited search and the page was empty: nothing more to
                # fetch, and no other condition would ever end the loop.
                return
            pageNum += 1
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters