Skip to content

Commit

Permalink
add hxgoogle
Browse files Browse the repository at this point in the history
  • Loading branch information
xiao committed Jul 1, 2015
1 parent a329171 commit ef7c7f1
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 17 deletions.
7 changes: 5 additions & 2 deletions utils/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import aolsearch
import googlesearch
import bingsearch
import hxgoogle

#searchEngine = googlesearch.google
#searchEngine = aolsearch.google
Expand All @@ -11,7 +12,7 @@
if os.environ.has_key('search_engine'):
search_engine = os.environ['search_engine']
else:
search_engine = 'gfsoso'
search_engine = 'hxgoogle'

if search_engine == 'gfsoso':
google = gfsoso.google
Expand All @@ -21,8 +22,10 @@
google = aolsearch.google
elif search_engine == 'bing':
google = bingsearch.google
elif search_engine == 'hxgoogle':
google = hxgoogle.google
else:
google = gfsoso.google
google = hxgoogle.google

searchEngine = google

42 changes: 27 additions & 15 deletions utils/hxgoogle.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# -*- encoding: utf-8 -*-

import urllib, urllib2
import cookielib
import re
Expand All @@ -11,11 +13,12 @@

HXGOOGLE_HOME = 'http://www.hxgoogle.com'
NUM_PER_PAGE = 10
REQ_TIMEOUT = 20
totalRecord = sys.maxint
reqDelay = 0.0

pattern = re.compile(r'<span>约有([0-9,]+)项结果')
pattern2 = re.compile(r'抱歉,没有找到与“.*?”相关的网页')
pattern = re.compile(r'<div id="resultStats">找到约 ([0-9,]+) 条结果')
pattern2 = re.compile(r'找不到和您的查询 "<em>.*?</em>" 相符的内容或信息。')

def _updateTotalRecord(html):
global totalRecord
Expand All @@ -28,33 +31,36 @@ def _updateTotalRecord(html):
if m == None:
return
if len(m.groups()) <= 0:
return
return
totalRecord = int(m.group(1).replace(',', ''))
print 'Total: ', totalRecord

"""
结果xpath
/html/body/table[2]/tbody/tr[2]/td[2]/ol/div/div[1]/div/h3
/html/body/table[2]/tbody/tr[2]/td[2]/ol/div/div[2]/div/h3/a
/html/body/table[2]/tbody/tr[2]/td[2]/ol/div/div[3]/div/h3/a
"""
def _hxPageHandler(opener, url):

# print 'page handler'
req = urllib2.Request(url)
webutils.setupRequest(req)
req.add_header('Referer', url[:-4])

try:
response = opener.open(req, timeout = REQ_TIMEOUT)
html = response.read()
#print html
# print html
except Exception, e:
print "Exception: url: %s - " % url, e
raise StopIteration()
if totalRecord == sys.maxint:
_updateTotalRecord(html)

tree = etree.HTML(html)
# nodes = tree.xpath(r'/html/body/table[2]/tbody/tr[2]/td[2]/ol/div/div[*]/div/h3/a/@href')
nodes = tree.xpath(r'//h3/a/@href')


for node in nodes:
url = node
yield url


def _hxSearch(opener, what, resultNum = -1, startNum = 0):
if resultNum == -1:
Expand All @@ -76,9 +82,7 @@ def _hxSearch(opener, what, resultNum = -1, startNum = 0):
if pageCount != -1:
if pageNum > pageCount:
break

url = HXGOOGLE_HOME + '/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' %
(what, (startPage + pageNum) * 10)
url = HXGOOGLE_HOME + '/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, (startPage + pageNum) * 10)

for result in _hxPageHandler(opener, url):
# i += 1
Expand All @@ -104,3 +108,11 @@ def _hxSearch(opener, what, resultNum = -1, startNum = 0):
if reqDelay > 0:
time.sleep(reqDelay)

google = _hxSearch

if __name__ == '__main__':
opener = urllib2.build_opener()
webutils.setupOpener(opener)
for url in google(opener, 'site:letv.com', 10):
print url

0 comments on commit ef7c7f1

Please sign in to comment.