-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
xiao
committed
Oct 12, 2015
1 parent
01d75d0
commit 207f177
Showing
2 changed files
with
140 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# -*- encoding: utf-8 -*- | ||
|
||
import searchbase | ||
import re | ||
import urllib, urllib2 | ||
import webutils | ||
from lxml import etree | ||
|
||
class Googto(searchbase.SearchBase):
    """Search scraper for www.googto.com built on ``searchbase.SearchBase``.

    Overrides the three engine-specific hooks: parsing the reported hit
    count, extracting result links, and building the page URL.
    """

    # Matches the result-count banner ("about N results found"); group 1 is
    # N, possibly containing thousands separators.
    _totalRecordPattern = re.compile(r'找到约 ([0-9,]+) 条结果')

    def _updateTotalRecord(self, html):
        """Store the hit count found in *html* in ``self._totalRecord``.

        ``self._totalRecord`` is left untouched when the banner is not
        present in the page.
        """
        m = self._totalRecordPattern.search(html)
        if m is None:
            return
        # The pattern has exactly one group, so a match always carries it.
        self._totalRecord = int(m.group(1).replace(',', ''))
        print('* Total: %d' % self._totalRecord)

    def _pickupLinks(self, html):
        """Return the ``href`` of every ``<h3><a>`` element in *html*.

        Returns an empty list when *html* is empty or yields no parse tree.
        """
        if not html:
            return []
        tree = etree.HTML(html)
        if tree is None:
            # Nothing parseable: treat as "no links on this page".
            return []
        return tree.xpath(r'//h3/a/@href')

    def _genUrl(self, what, start):
        """Build the search URL for query *what* at result offset *start*.

        *what* is inserted into the URL as-is, so it must already be
        URL-quoted (``SearchBase.search`` quotes it before calling this).
        """
        return 'http://www.googto.com/?q=%s&start=%d' % (what, start)
|
||
if __name__ == '__main__':
    # Manual smoke run: print the first 10 links for a fixed site: query.
    urlOpener = urllib2.build_opener()
    webutils.setupOpener(urlOpener)
    engine = Googto(urlOpener)

    for link in engine.search('site:letv.com', 10):
        print(link)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
# -*- encoding: utf-8 -*- | ||
|
||
import urllib2 | ||
import sys | ||
import os | ||
import webutils | ||
|
||
class SearchBase:
    """Base class for paged web-search scrapers.

    Subclasses provide the engine-specific parts by overriding ``_genUrl``
    (URL of one result page), ``_pickupLinks`` (links on a page) and,
    optionally, ``_updateTotalRecord`` and ``_getNumPerPage``.  ``search``
    drives the paging and yields the links one at a time.
    """

    # Object used to fetch pages; it is called as open(request, timeout=...).
    _opener = None
    # Hit count reported by the engine; sys.maxint means "not known yet".
    _totalRecord = sys.maxint

    # Passed as ``timeout`` to the opener for every page request.
    reqTimeout = 20

    def __init__(self, opener):
        """Remember *opener*, which is used for all page requests."""
        self._opener = opener

    def _updateTotalRecord(self, html):
        """Hook: parse the total hit count from *html* into ``_totalRecord``.

        The base implementation does nothing.
        """
        pass

    def _pickupLinks(self, html):
        """Hook: return an iterable of result links found in *html*.

        The base implementation finds nothing and returns an empty list, so
        that ``_pageHandler`` can always iterate the result.
        """
        return []

    def _pageHandler(self, url):
        """Fetch the page at *url* and yield each link found on it.

        A failed request is reported on stdout and yields nothing.  While the
        total hit count is still unknown, the page is also handed to
        ``_updateTotalRecord``.
        """
        req = urllib2.Request(url)
        webutils.setupRequest(req)
        # Referer is the page URL with its last four characters dropped.
        # NOTE(review): the reason for cutting exactly four characters is not
        # visible here - confirm against the engines' URL formats.
        req.add_header('Referer', url[:-4])

        try:
            response = self._opener.open(req, timeout=self.reqTimeout)
            try:
                html = response.read()
            finally:
                # Release the connection even if reading fails.
                response.close()
        except Exception as e:
            print("Exception: url: %s -  %s" % (url, e))
            # Plain return ends the generator; raising StopIteration inside
            # a generator is an error under PEP 479.
            return

        if self._totalRecord == sys.maxint:
            self._updateTotalRecord(html)

        for link in self._pickupLinks(html):
            yield link

    def _getNumPerPage(self):
        """Hook: return the number of results per page (default 10)."""
        return 10

    def _genUrl(self, what, start):
        """Hook: return the URL for query *what* at result offset *start*.

        *what* arrives already URL-quoted.  The base implementation returns
        an empty string.
        """
        return ''

    def search(self, what, resultNum=-1, startNum=0):
        """Yield result links for query *what*, fetching pages as needed.

        what      -- raw query string; it is URL-quoted here.
        resultNum -- maximum number of links to yield, or -1 for no limit.
        startNum  -- result offset to start from; rounded up to a whole page.

        Iteration ends when *resultNum* links were produced, when the number
        of links reaches the total reported by the engine, when the page
        budget derived from *resultNum* is used up, or - for an unlimited
        search - when a page produces no links.
        """
        numPerPage = self._getNumPerPage()

        if resultNum == -1:
            pageCount = -1
        else:
            # Ceiling division: pages needed to cover resultNum links.
            pageCount = int((resultNum + numPerPage - 1) // numPerPage)

        startPage = int((startNum + numPerPage - 1) // numPerPage)

        # Forget any total left over from a previous search.
        self._totalRecord = sys.maxint

        what = urllib2.quote(what)

        pageNum = 1
        resCnt = 0

        while pageCount == -1 or pageNum <= pageCount:
            # NOTE(review): pageNum starts at 1, so with startNum == 0 the
            # first request asks for offset numPerPage rather than 0 -
            # confirm this matches what the engines expect.
            url = self._genUrl(what, (startPage + pageNum) * numPerPage)

            cntBeforePage = resCnt
            for result in self._pageHandler(url):
                resCnt += 1
                yield result
                if resultNum != -1 and resCnt >= resultNum:
                    return
                if resCnt >= self._totalRecord:
                    return

            if self._totalRecord == sys.maxint:
                # The page did not reveal a total.  Store a fallback that is
                # different from sys.maxint so _pageHandler does not try to
                # parse the total again on later pages.
                if resultNum == -1:
                    self._totalRecord = sys.maxint - 1
                else:
                    self._totalRecord = resultNum

            if resCnt >= self._totalRecord:
                return
            if pageCount == -1 and resCnt == cntBeforePage:
                # Unlimited search and this page produced nothing: without
                # this stop the loop would request pages forever.
                return
            pageNum += 1
|