From ef7c7f116b95aca7934e4c4dcd3d6bcefeffae88 Mon Sep 17 00:00:00 2001 From: xiao Date: Wed, 1 Jul 2015 11:53:29 +0800 Subject: [PATCH] add hxgoogle --- utils/google.py | 7 +++++-- utils/hxgoogle.py | 42 +++++++++++++++++++++++++++--------------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/utils/google.py b/utils/google.py index 991c1eb..4af1b59 100644 --- a/utils/google.py +++ b/utils/google.py @@ -3,6 +3,7 @@ import aolsearch import googlesearch import bingsearch +import hxgoogle #searchEngine = googlesearch.google #searchEngine = aolsearch.google @@ -11,7 +12,7 @@ if os.environ.has_key('search_engine'): search_engine = os.environ['search_engine'] else: - search_engine = 'gfsoso' + search_engine = 'hxgoogle' if search_engine == 'gfsoso': google = gfsoso.google @@ -21,8 +22,10 @@ google = aolsearch.google elif search_engine == 'bing': google = bingsearch.google +elif search_engine == 'hxgoogle': + google = hxgoogle.google else: - google = gfsoso.google + google = hxgoogle.google searchEngine = google diff --git a/utils/hxgoogle.py b/utils/hxgoogle.py index b71eb4f..872f83c 100644 --- a/utils/hxgoogle.py +++ b/utils/hxgoogle.py @@ -1,3 +1,5 @@ +# -*- encoding: utf-8 -*- + import urllib, urllib2 import cookielib import re @@ -11,11 +13,12 @@ HXGOOGLE_HOME = 'http://www.hxgoogle.com' NUM_PER_PAGE = 10 +REQ_TIMEOUT = 20 totalRecord = sys.maxint reqDelay = 0.0 -pattern = re.compile(r'约有([0-9,]+)项结果') -pattern2 = re.compile(r'抱歉,没有找到与“.*?”相关的网页') +pattern = re.compile(r'
找到约 ([0-9,]+) 条结果') +pattern2 = re.compile(r'找不到和您的查询 ".*?" 相符的内容或信息。') def _updateTotalRecord(html): global totalRecord @@ -28,19 +31,13 @@ def _updateTotalRecord(html): if m == None: return if len(m.groups()) <= 0: - return + return totalRecord = int(m.group(1).replace(',', '')) print 'Total: ', totalRecord - """ - 结果xpath - /html/body/table[2]/tbody/tr[2]/td[2]/ol/div/div[1]/div/h3 - - /html/body/table[2]/tbody/tr[2]/td[2]/ol/div/div[2]/div/h3/a - - /html/body/table[2]/tbody/tr[2]/td[2]/ol/div/div[3]/div/h3/a - """ def _hxPageHandler(opener, url): + + # print 'page handler' req = urllib2.Request(url) webutils.setupRequest(req) req.add_header('Referer', url[:-4]) @@ -48,13 +45,22 @@ def _hxPageHandler(opener, url): try: response = opener.open(req, timeout = REQ_TIMEOUT) html = response.read() - #print html + # print html except Exception, e: print "Exception: url: %s - " % url, e raise StopIteration() if totalRecord == sys.maxint: _updateTotalRecord(html) + tree = etree.HTML(html) + # nodes = tree.xpath(r'/html/body/table[2]/tbody/tr[2]/td[2]/ol/div/div[*]/div/h3/a/@href') + nodes = tree.xpath(r'//h3/a/@href') + + + for node in nodes: + url = node + yield url + def _hxSearch(opener, what, resultNum = -1, startNum = 0): if resultNum == -1: @@ -76,9 +82,7 @@ def _hxSearch(opener, what, resultNum = -1, startNum = 0): if pageCount != -1: if pageNum > pageCount: break - - url = HXGOOGLE_HOME + '/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % - (what, (startPage + pageNum) * 10) + url = HXGOOGLE_HOME + '/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, (startPage + pageNum) * 10) for result in _hxPageHandler(opener, url): # i += 1 @@ -104,3 +108,11 @@ def _hxSearch(opener, what, resultNum = -1, startNum = 0): if reqDelay > 0: time.sleep(reqDelay) +google = _hxSearch + +if __name__ == '__main__': + opener = urllib2.build_opener() + webutils.setupOpener(opener) + for url in google(opener, 'site:letv.com', 10): + print url +