From 6a9715f24af660d619fc37176c5b612050a353c4 Mon Sep 17 00:00:00 2001 From: xiao Date: Mon, 12 Oct 2015 20:23:52 +0800 Subject: [PATCH] Add hxgoogle2.py and hxgoogle3.py --- utils/google.py | 8 ++++++- utils/hxgoogle.py | 4 ++-- utils/hxgoogle2.py | 54 +++++++++++++++++++++++++++++++++++++++++++++ utils/hxgoogle3.py | 54 +++++++++++++++++++++++++++++++++++++++++++++ utils/searchbase.py | 9 ++++---- 5 files changed, 122 insertions(+), 7 deletions(-) create mode 100755 utils/hxgoogle2.py create mode 100755 utils/hxgoogle3.py diff --git a/utils/google.py b/utils/google.py index 4af1b59..5a0b9e0 100644 --- a/utils/google.py +++ b/utils/google.py @@ -4,6 +4,8 @@ import googlesearch import bingsearch import hxgoogle +import hxgoogle2 +import hxgoogle3 #searchEngine = googlesearch.google #searchEngine = aolsearch.google @@ -24,8 +26,12 @@ google = bingsearch.google elif search_engine == 'hxgoogle': google = hxgoogle.google +elif search_engine == 'hxgoogle2': + google = hxgoogle2.google +elif search_engine == 'hxgoogle3': + google = hxgoogle3.google else: - google = hxgoogle.google + google = hxgoogle2.google searchEngine = google diff --git a/utils/hxgoogle.py b/utils/hxgoogle.py index 872f83c..f50927c 100644 --- a/utils/hxgoogle.py +++ b/utils/hxgoogle.py @@ -11,7 +11,7 @@ import locale import webutils -HXGOOGLE_HOME = 'http://www.hxgoogle.com' +HXGOOGLE_HOME = 'http://g.hxgoogle.com' NUM_PER_PAGE = 10 REQ_TIMEOUT = 20 totalRecord = sys.maxint @@ -82,7 +82,7 @@ def _hxSearch(opener, what, resultNum = -1, startNum = 0): if pageCount != -1: if pageNum > pageCount: break - url = HXGOOGLE_HOME + '/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, (startPage + pageNum) * 10) + url = HXGOOGLE_HOME + '/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, (startPage + pageNum - 1) * 10) for result in _hxPageHandler(opener, url): # i += 1 diff --git a/utils/hxgoogle2.py b/utils/hxgoogle2.py new file mode 100755 index 0000000..adb9f3c --- 
/dev/null +++ b/utils/hxgoogle2.py @@ -0,0 +1,54 @@ +# -*- encoding: utf-8 -*- + +import searchbase +import re +import urllib, urllib2 +import webutils +from lxml import etree + +pattern = re.compile(r'
找到约 ([0-9,]+) 条结果') +pattern2 = re.compile(r'找不到和您的查询 ".*?" 相符的内容或信息。') + +class HxGoogle(searchbase.SearchBase): + + def _updateTotalRecord(self, html): + + m = pattern2.search(html) + if m != None: + self._totalRecord = 0 + #print 'not found' + return + m = pattern.search(html) + if m == None: + return + if len(m.groups()) <= 0: + return + self._totalRecord = int(m.group(1).replace(',', '')) + print 'Total: ', self._totalRecord + + def _pickupLinks(self, html): + tree = etree.HTML(html) + # nodes = tree.xpath(r'/html/body/table[2]/tbody/tr[2]/td[2]/ol/div + return tree.xpath(r'//h3/a/@href') + + + def _genUrl(self, what, start): + return 'http://g1.hxgoogle.com/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, start) + + +hx = None + +def google(opener, what, resultNum = -1, startNum = 0): + global hx + if hx == None: + hx = HxGoogle(opener) + return hx.search(what, resultNum, startNum) + +if __name__ == '__main__': + opener = urllib2.build_opener() + webutils.setupOpener(opener) + # goo = HxGoogle(opener) + + for url in google(opener, 'site:letv.com', 20): + print url + diff --git a/utils/hxgoogle3.py b/utils/hxgoogle3.py new file mode 100755 index 0000000..7d0184b --- /dev/null +++ b/utils/hxgoogle3.py @@ -0,0 +1,54 @@ +# -*- encoding: utf-8 -*- + +import searchbase +import re +import urllib, urllib2 +import webutils +from lxml import etree + +pattern = re.compile(r'
找到约 ([0-9,]+) 条结果') +pattern2 = re.compile(r'找不到和您的查询 ".*?" 相符的内容或信息。') + +class HxGoogle(searchbase.SearchBase): + + def _updateTotalRecord(self, html): + + m = pattern2.search(html) + if m != None: + self._totalRecord = 0 + #print 'not found' + return + m = pattern.search(html) + if m == None: + return + if len(m.groups()) <= 0: + return + self._totalRecord = int(m.group(1).replace(',', '')) + print 'Total: ', self._totalRecord + + def _pickupLinks(self, html): + tree = etree.HTML(html) + # nodes = tree.xpath(r'/html/body/table[2]/tbody/tr[2]/td[2]/ol/div + return tree.xpath(r'//h3/a/@href') + + + def _genUrl(self, what, start): + return 'http://g2.hxgoogle.com/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, start) + + +hx = None + +def google(opener, what, resultNum = -1, startNum = 0): + global hx + if hx == None: + hx = HxGoogle(opener) + return hx.search(what, resultNum, startNum) + +if __name__ == '__main__': + opener = urllib2.build_opener() + webutils.setupOpener(opener) + # goo = HxGoogle(opener) + + for url in google(opener, 'site:letv.com', 20): + print url + diff --git a/utils/searchbase.py b/utils/searchbase.py index acca718..dd3f74b 100644 --- a/utils/searchbase.py +++ b/utils/searchbase.py @@ -74,21 +74,22 @@ def search(self, what, resultNum = -1, startNum = 0): if pageNum > pageCount: break - url = self._genUrl(what, (startPage + pageNum) * numPerPage) + url = self._genUrl(what, (startPage + pageNum - 1) * numPerPage) + # print url for result in self._pageHandler(url): resCnt += 1 yield result if resultNum != -1 and resCnt >= resultNum: raise StopIteration() - if resCnt >= totalRecord: + if resCnt >= self._totalRecord: raise StopIteration() if self._totalRecord == sys.maxint: if resultNum == -1: - totalRecord = sys.maxint - 1 + self._totalRecord = sys.maxint - 1 else: - totalRecord = resultNum + self._totalRecord = resultNum if resCnt >= self._totalRecord: raise StopIteration()