diff --git a/utils/google.py b/utils/google.py
index 4af1b59..5a0b9e0 100644
--- a/utils/google.py
+++ b/utils/google.py
@@ -4,6 +4,8 @@
import googlesearch
import bingsearch
import hxgoogle
+import hxgoogle2
+import hxgoogle3
#searchEngine = googlesearch.google
#searchEngine = aolsearch.google
@@ -24,8 +26,12 @@
google = bingsearch.google
elif search_engine == 'hxgoogle':
google = hxgoogle.google
+elif search_engine == 'hxgoogle2':
+ google = hxgoogle2.google
+elif search_engine == 'hxgoogle3':
+ google = hxgoogle3.google
else:
- google = hxgoogle.google
+ google = hxgoogle2.google
searchEngine = google
diff --git a/utils/hxgoogle.py b/utils/hxgoogle.py
index 872f83c..f50927c 100644
--- a/utils/hxgoogle.py
+++ b/utils/hxgoogle.py
@@ -11,7 +11,7 @@
import locale
import webutils
-HXGOOGLE_HOME = 'http://www.hxgoogle.com'
+HXGOOGLE_HOME = 'http://g.hxgoogle.com'
NUM_PER_PAGE = 10
REQ_TIMEOUT = 20
totalRecord = sys.maxint
@@ -82,7 +82,7 @@ def _hxSearch(opener, what, resultNum = -1, startNum = 0):
if pageCount != -1:
if pageNum > pageCount:
break
- url = HXGOOGLE_HOME + '/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, (startPage + pageNum) * 10)
+ url = HXGOOGLE_HOME + '/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, (startPage + pageNum - 1) * 10)
for result in _hxPageHandler(opener, url):
# i += 1
diff --git a/utils/hxgoogle2.py b/utils/hxgoogle2.py
new file mode 100755
index 0000000..adb9f3c
--- /dev/null
+++ b/utils/hxgoogle2.py
@@ -0,0 +1,54 @@
+# -*- encoding: utf-8 -*-
+
+import searchbase
+import re
+import urllib, urllib2
+import webutils
+from lxml import etree
+
+pattern = re.compile(r'找到约 ([0-9,]+) 条结果')  # hit counter; NOTE(review): rejoined split literal -- confirm no text was lost
+pattern2 = re.compile(r'找不到和您的查询 ".*?" 相符的内容或信息。')  # "no results" banner; NOTE(review): rejoined split literal -- confirm
+
+class HxGoogle(searchbase.SearchBase):
+    """SearchBase subclass that parses g1.hxgoogle.com result pages."""
+
+    def _updateTotalRecord(self, html):
+        """Refresh self._totalRecord from the hit counter found in *html*."""
+        # The "nothing matches your query" banner means zero hits.
+        if pattern2.search(html) is not None:
+            self._totalRecord = 0
+            return
+        m = pattern.search(html)
+        if m is None:
+            # No hit counter on this page: leave the stored total untouched.
+            return
+        self._totalRecord = int(m.group(1).replace(',', ''))
+        print 'Total: ', self._totalRecord
+
+    def _pickupLinks(self, html):
+        """Return the href of every h3/a link found in *html*."""
+        tree = etree.HTML(html)
+        return tree.xpath(r'//h3/a/@href')
+
+    def _genUrl(self, what, start):
+        """Build the search URL for query *what* at result offset *start*."""
+        # NOTE(review): *what* is inserted unescaped -- confirm callers quote it.
+        return 'http://g1.hxgoogle.com/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, start)
+
+
+hx = None  # shared HxGoogle instance, created on the first google() call
+
+def google(opener, what, resultNum = -1, startNum = 0):
+    """Search *what* via the shared HxGoogle; only the first opener is used."""
+    global hx
+    hx = HxGoogle(opener) if hx is None else hx
+    return hx.search(what, resultNum, startNum)
+
+if __name__ == '__main__':
+    # Manual smoke test: print up to 20 result URLs for a sample query.
+    demo_opener = urllib2.build_opener()
+    webutils.setupOpener(demo_opener)
+    results = google(demo_opener, 'site:letv.com', 20)
+    for link in results:
+        print link
+
diff --git a/utils/hxgoogle3.py b/utils/hxgoogle3.py
new file mode 100755
index 0000000..7d0184b
--- /dev/null
+++ b/utils/hxgoogle3.py
@@ -0,0 +1,54 @@
+# -*- encoding: utf-8 -*-
+
+import searchbase
+import re
+import urllib, urllib2
+import webutils
+from lxml import etree
+
+pattern = re.compile(r'找到约 ([0-9,]+) 条结果')  # hit counter; NOTE(review): rejoined split literal -- confirm no text was lost
+pattern2 = re.compile(r'找不到和您的查询 ".*?" 相符的内容或信息。')
+
+class HxGoogle(searchbase.SearchBase):
+    """SearchBase subclass that parses g2.hxgoogle.com result pages."""
+
+    def _updateTotalRecord(self, html):
+        """Refresh self._totalRecord from the hit counter found in *html*."""
+        # The "nothing matches your query" banner means zero hits.
+        if pattern2.search(html) is not None:
+            self._totalRecord = 0
+            return
+        m = pattern.search(html)
+        if m is None:
+            # No hit counter on this page: leave the stored total untouched.
+            return
+        self._totalRecord = int(m.group(1).replace(',', ''))
+        print 'Total: ', self._totalRecord
+
+    def _pickupLinks(self, html):
+        """Return the href of every h3/a link found in *html*."""
+        tree = etree.HTML(html)
+        return tree.xpath(r'//h3/a/@href')
+
+    def _genUrl(self, what, start):
+        """Build the search URL for query *what* at result offset *start*."""
+        # NOTE(review): *what* is inserted unescaped -- confirm callers quote it.
+        return 'http://g2.hxgoogle.com/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, start)
+
+
+hx = None  # shared HxGoogle instance, created on the first google() call
+
+def google(opener, what, resultNum = -1, startNum = 0):
+    """Search *what* via the shared HxGoogle; only the first opener is used."""
+    global hx
+    hx = HxGoogle(opener) if hx is None else hx
+    return hx.search(what, resultNum, startNum)
+
+if __name__ == '__main__':
+    # Manual smoke test: print up to 20 result URLs for a sample query.
+    demo_opener = urllib2.build_opener()
+    webutils.setupOpener(demo_opener)
+    results = google(demo_opener, 'site:letv.com', 20)
+    for link in results:
+        print link
+
diff --git a/utils/searchbase.py b/utils/searchbase.py
index acca718..dd3f74b 100644
--- a/utils/searchbase.py
+++ b/utils/searchbase.py
@@ -74,21 +74,22 @@ def search(self, what, resultNum = -1, startNum = 0):
if pageNum > pageCount:
break
- url = self._genUrl(what, (startPage + pageNum) * numPerPage)
+ url = self._genUrl(what, (startPage + pageNum - 1) * numPerPage)
+ # print url
for result in self._pageHandler(url):
resCnt += 1
yield result
if resultNum != -1 and resCnt >= resultNum:
raise StopIteration()
- if resCnt >= totalRecord:
+ if resCnt >= self._totalRecord:
raise StopIteration()
if self._totalRecord == sys.maxint:
if resultNum == -1:
- totalRecord = sys.maxint - 1
+ self._totalRecord = sys.maxint - 1
else:
- totalRecord = resultNum
+ self._totalRecord = resultNum
if resCnt >= self._totalRecord:
raise StopIteration()