diff --git a/utils/googto.py b/utils/googto.py
new file mode 100755
index 0000000..809c660
--- /dev/null
+++ b/utils/googto.py
@@ -0,0 +1,50 @@
+# -*- encoding: utf-8 -*-
+"""Search backend that scrapes result links from googto.com."""
+
+import searchbase
+import re
+import urllib, urllib2
+import webutils
+from lxml import etree
+
+
+class Googto(searchbase.SearchBase):
+    """SearchBase implementation for http://www.googto.com/."""
+
+    # Result-count banner of the page; the captured number may contain
+    # thousands separators.
+    _totalRecordPattern = re.compile(r'找到约 ([0-9,]+) 条结果')
+
+    def _updateTotalRecord(self, html):
+        """Read the total result count from *html* into self._totalRecord.
+
+        self._totalRecord is left untouched when the banner is absent.
+        """
+        m = self._totalRecordPattern.search(html)
+        if m is None:
+            return
+        # The pattern has exactly one group, so a match always carries it.
+        self._totalRecord = int(m.group(1).replace(',', ''))
+        print '* Total:', self._totalRecord
+
+    def _pickupLinks(self, html):
+        """Return the href of every <a> directly under an <h3> in *html*."""
+        tree = etree.HTML(html)
+        if tree is None:
+            # NOTE(review): lxml appears to hand back None for markup with
+            # no element content -- confirm; treat that as "no links".
+            return []
+        return tree.xpath(r'//h3/a/@href')
+
+    def _genUrl(self, what, start):
+        """Build the query URL; *start* fills the `start` parameter."""
+        return 'http://www.googto.com/?q=%s&start=%d' % (what, start)
+
+
+if __name__ == '__main__':
+    opener = urllib2.build_opener()
+    webutils.setupOpener(opener)
+    googto = Googto(opener)
+
+    for url in googto.search('site:letv.com', 10):
+        print url
diff --git a/utils/searchbase.py b/utils/searchbase.py
new file mode 100644
index 0000000..acca718
--- /dev/null
+++ b/utils/searchbase.py
@@ -0,0 +1,130 @@
+# -*- encoding: utf-8 -*-
+"""Base class for scraping a web search engine page by page."""
+
+import urllib2
+import sys
+import os
+import webutils
+
+
+class SearchBase:
+    """Skeleton of a paged search scraper.
+
+    Subclasses override _updateTotalRecord, _pickupLinks and _genUrl (and
+    _getNumPerPage if needed); callers iterate over search().
+    """
+
+    _opener = None
+    # sys.maxint means "total not known yet".
+    _totalRecord = sys.maxint
+
+    # Timeout handed to the opener for every page request.
+    reqTimeout = 20
+
+    def __init__(self, opener):
+        """Keep *opener*; its open() method is used to fetch pages."""
+        self._opener = opener
+
+    # TODO: get total record number from page
+    def _updateTotalRecord(self, html):
+        """Hook: store the total found in *html* in self._totalRecord."""
+        pass
+
+    # TODO: pick up links from page
+    def _pickupLinks(self, html):
+        """Hook: return an iterable of the result links in *html*."""
+        # Return an empty list, not None, so that _pageHandler can iterate
+        # the result even when a subclass does not override this hook.
+        return []
+
+    def _pageHandler(self, url):
+        """Fetch *url* and yield each link _pickupLinks finds in the page.
+
+        A failed request is printed and produces no links.
+        """
+        req = urllib2.Request(url)
+        webutils.setupRequest(req)
+        # NOTE(review): the Referer is the URL minus its last four
+        # characters; the reason is not visible here -- confirm intent.
+        req.add_header('Referer', url[:-4])
+
+        try:
+            response = self._opener.open(req, timeout=self.reqTimeout)
+            try:
+                html = response.read()
+            finally:
+                # Release the connection even when read() fails.
+                response.close()
+        except Exception as e:
+            print "Exception: url: %s - " % url, e
+            # A bare return ends the generator; raising StopIteration by
+            # hand becomes a RuntimeError under PEP 479.
+            return
+
+        if self._totalRecord == sys.maxint:
+            self._updateTotalRecord(html)
+
+        for link in self._pickupLinks(html):
+            yield link
+
+    # TODO: return number of results per page. default is 10
+    def _getNumPerPage(self):
+        """Hook: return how many results one page holds."""
+        return 10
+
+    # TODO: generate a url for searching
+    def _genUrl(self, what, start):
+        """Hook: return the URL for quoted query *what* at *start*."""
+        return ''
+
+    def search(self, what, resultNum=-1, startNum=0):
+        """Yield result links for the query *what*, one page at a time.
+
+        what      -- query string; it is URL-quoted before use.
+        resultNum -- maximum number of links to yield, -1 for no limit.
+        startNum  -- result offset to begin at, rounded up to a full page.
+
+        Iteration ends once resultNum links were yielded, once the total
+        stored by _updateTotalRecord is reached, or when a page yields no
+        links at all.
+        """
+        numPerPage = self._getNumPerPage()
+
+        if resultNum == -1:
+            pageCount = -1
+        else:
+            pageCount = (resultNum + numPerPage - 1) // numPerPage
+
+        startPage = (startNum + numPerPage - 1) // numPerPage
+
+        self._totalRecord = sys.maxint
+
+        what = urllib2.quote(what)
+
+        pageNum = 1
+        resCnt = 0
+
+        while pageCount == -1 or pageNum <= pageCount:
+            # NOTE(review): pageNum starts at 1, so with startNum=0 the first
+            # offset requested is numPerPage rather than 0 -- confirm that
+            # this is what _genUrl implementations expect.
+            url = self._genUrl(what, (startPage + pageNum) * numPerPage)
+
+            pageHits = 0
+            for result in self._pageHandler(url):
+                pageHits += 1
+                resCnt += 1
+                yield result
+                if resultNum != -1 and resCnt >= resultNum:
+                    return
+                # Compare against the attribute: it stays sys.maxint until
+                # a page reveals the real total.
+                if resCnt >= self._totalRecord:
+                    return
+
+            # An empty page means the fetch failed or the results ran out;
+            # stop instead of requesting further pages forever.
+            if pageHits == 0:
+                return
+
+            pageNum += 1