diff --git a/gather.py b/gather.py
index 52cbeec..efe8ca0 100755
--- a/gather.py
+++ b/gather.py
@@ -89,16 +89,19 @@ def queryRDNS(domain):
for ipaddr in hostInfos[2]:
print '[IP Address: ' + ipaddr + ']'
- # TODO: 加入翻页代码
- try:
- response = urllib2.urlopen('http://dns.aizhan.com/%s/' % (ipaddr))
- text = response.read()
- tree = etree.HTML(text)
- nodes = tree.xpath(r"//td[@class='dns-links']/a/@href")
- for node in nodes:
- print node, getTitle(node)
- except Exception, e:
- print e
+ # 翻页
+ for i in range(5): # 最多5页,需要更多到网页上去看
+ try:
+ response = urllib2.urlopen('http://dns.aizhan.com/%s/%d/' % (ipaddr, i))
+ text = response.read()
+ tree = etree.HTML(text)
+ nodes = tree.xpath(r"//td[@class='dns-links']/a/@href")
+ if len(nodes) == 0:
+ break
+ for node in nodes:
+ print node, getTitle(node)
+ except Exception, e:
+ print e
def toStr(l):
diff --git a/utils/__init__.py b/utils/__init__.py
index a9242a2..7f5167f 100755
--- a/utils/__init__.py
+++ b/utils/__init__.py
@@ -1 +1 @@
-__all__ = ['webutils', 'google']
+__all__ = ['webutils', 'google', 'crawler']
diff --git a/utils/google.py b/utils/google.py
index 4af1b59..5a0b9e0 100644
--- a/utils/google.py
+++ b/utils/google.py
@@ -4,6 +4,8 @@
import googlesearch
import bingsearch
import hxgoogle
+import hxgoogle2
+import hxgoogle3
#searchEngine = googlesearch.google
#searchEngine = aolsearch.google
@@ -24,8 +26,12 @@
google = bingsearch.google
elif search_engine == 'hxgoogle':
google = hxgoogle.google
+elif search_engine == 'hxgoogle2':
+ google = hxgoogle2.google
+elif search_engine == 'hxgoogle3':
+ google = hxgoogle3.google
else:
- google = hxgoogle.google
+ google = hxgoogle2.google
searchEngine = google
diff --git a/utils/googto.py b/utils/googto.py
new file mode 100755
index 0000000..809c660
--- /dev/null
+++ b/utils/googto.py
@@ -0,0 +1,41 @@
+# -*- encoding: utf-8 -*-
+
+import searchbase
+import re
+import urllib, urllib2
+import webutils
+from lxml import etree
+
+class Googto(searchbase.SearchBase):
+
+ _totalRecordPattern = re.compile(r'找到约 ([0-9,]+) 条结果')
+
+ def _updateTotalRecord(self, html):
+ m = self._totalRecordPattern.search(html)
+ if m == None:
+ # print '* Not found 1'
+ return
+ if len(m.groups()) <= 0:
+ # print '* Not found 2'
+ return
+ self._totalRecord = int(m.group(1).replace(',', ''))
+ print '* Total:', self._totalRecord
+
+
+ def _pickupLinks(self, html):
+ tree = etree.HTML(html)
+ # nodes = tree.xpath(r'/html/body/table[2]/tbody/tr[2]/td[2]/ol/div
+ return tree.xpath(r'//h3/a/@href')
+
+
+ def _genUrl(self, what, start):
+ return 'http://www.googto.com/?q=%s&start=%d' % (what, start)
+
+if __name__ == '__main__':
+ opener = urllib2.build_opener()
+ webutils.setupOpener(opener)
+ googto = Googto(opener)
+
+ for url in googto.search('site:letv.com', 10):
+ print url
+
diff --git a/utils/hxgoogle.py b/utils/hxgoogle.py
index 872f83c..f50927c 100644
--- a/utils/hxgoogle.py
+++ b/utils/hxgoogle.py
@@ -11,7 +11,7 @@
import locale
import webutils
-HXGOOGLE_HOME = 'http://www.hxgoogle.com'
+HXGOOGLE_HOME = 'http://g.hxgoogle.com'
NUM_PER_PAGE = 10
REQ_TIMEOUT = 20
totalRecord = sys.maxint
@@ -82,7 +82,7 @@ def _hxSearch(opener, what, resultNum = -1, startNum = 0):
if pageCount != -1:
if pageNum > pageCount:
break
- url = HXGOOGLE_HOME + '/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, (startPage + pageNum) * 10)
+ url = HXGOOGLE_HOME + '/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, (startPage + pageNum - 1) * 10)
for result in _hxPageHandler(opener, url):
# i += 1
diff --git a/utils/hxgoogle2.py b/utils/hxgoogle2.py
new file mode 100755
index 0000000..adb9f3c
--- /dev/null
+++ b/utils/hxgoogle2.py
@@ -0,0 +1,54 @@
+# -*- encoding: utf-8 -*-
+
+import searchbase
+import re
+import urllib, urllib2
+import webutils
+from lxml import etree
+
+pattern = re.compile(r'找到约 ([0-9,]+) 条结果')
+pattern2 = re.compile(r'找不到和您的查询 ".*?" 相符的内容或信息。')
+
+class HxGoogle(searchbase.SearchBase):
+
+ def _updateTotalRecord(self, html):
+
+ m = pattern2.search(html)
+ if m != None:
+ self._totalRecord = 0
+ #print 'not found'
+ return
+ m = pattern.search(html)
+ if m == None:
+ return
+ if len(m.groups()) <= 0:
+ return
+ self._totalRecord = int(m.group(1).replace(',', ''))
+ print 'Total: ', self._totalRecord
+
+ def _pickupLinks(self, html):
+ tree = etree.HTML(html)
+ # nodes = tree.xpath(r'/html/body/table[2]/tbody/tr[2]/td[2]/ol/div
+ return tree.xpath(r'//h3/a/@href')
+
+
+ def _genUrl(self, what, start):
+ return 'http://g1.hxgoogle.com/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, start)
+
+
+hx = None
+
+def google(opener, what, resultNum = -1, startNum = 0):
+ global hx
+ if hx == None:
+ hx = HxGoogle(opener)
+ return hx.search(what, resultNum, startNum)
+
+if __name__ == '__main__':
+ opener = urllib2.build_opener()
+ webutils.setupOpener(opener)
+ # goo = HxGoogle(opener)
+
+ for url in google(opener, 'site:letv.com', 20):
+ print url
+
diff --git a/utils/hxgoogle3.py b/utils/hxgoogle3.py
new file mode 100755
index 0000000..7d0184b
--- /dev/null
+++ b/utils/hxgoogle3.py
@@ -0,0 +1,54 @@
+# -*- encoding: utf-8 -*-
+
+import searchbase
+import re
+import urllib, urllib2
+import webutils
+from lxml import etree
+
+pattern = re.compile(r'找到约 ([0-9,]+) 条结果')
+pattern2 = re.compile(r'找不到和您的查询 ".*?" 相符的内容或信息。')
+
+class HxGoogle(searchbase.SearchBase):
+
+ def _updateTotalRecord(self, html):
+
+ m = pattern2.search(html)
+ if m != None:
+ self._totalRecord = 0
+ #print 'not found'
+ return
+ m = pattern.search(html)
+ if m == None:
+ return
+ if len(m.groups()) <= 0:
+ return
+ self._totalRecord = int(m.group(1).replace(',', ''))
+ print 'Total: ', self._totalRecord
+
+ def _pickupLinks(self, html):
+ tree = etree.HTML(html)
+ # nodes = tree.xpath(r'/html/body/table[2]/tbody/tr[2]/td[2]/ol/div
+ return tree.xpath(r'//h3/a/@href')
+
+
+ def _genUrl(self, what, start):
+ return 'http://g2.hxgoogle.com/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, start)
+
+
+hx = None
+
+def google(opener, what, resultNum = -1, startNum = 0):
+ global hx
+ if hx == None:
+ hx = HxGoogle(opener)
+ return hx.search(what, resultNum, startNum)
+
+if __name__ == '__main__':
+ opener = urllib2.build_opener()
+ webutils.setupOpener(opener)
+ # goo = HxGoogle(opener)
+
+ for url in google(opener, 'site:letv.com', 20):
+ print url
+
diff --git a/utils/searchbase.py b/utils/searchbase.py
new file mode 100644
index 0000000..dd3f74b
--- /dev/null
+++ b/utils/searchbase.py
@@ -0,0 +1,100 @@
+# -*- encoding: utf-8 -*-
+
+import urllib2
+import sys
+import os
+import webutils
+
+class SearchBase:
+
+ _opener = None
+ _totalRecord = sys.maxint
+
+ reqTimeout = 20
+
+ def __init__(self, opener):
+ self._opener = opener
+
+ # TODO: get total record number from page
+ def _updateTotalRecord(self, html):
+ pass
+
+ # TODO: pick up links from page
+ def _pickupLinks(self, html):
+ pass
+
+ def _pageHandler(self, url):
+ # print 'page handler'
+ req = urllib2.Request(url)
+ webutils.setupRequest(req)
+ req.add_header('Referer', url[:-4])
+
+ try:
+ response = self._opener.open(req, timeout = self.reqTimeout)
+ html = response.read()
+ # print html
+ except Exception, e:
+ print "Exception: url: %s - " % url, e
+ raise StopIteration()
+
+ if self._totalRecord == sys.maxint:
+ self._updateTotalRecord(html)
+
+ for url in self._pickupLinks(html):
+ yield url
+
+ # TODO: return number of results per page. default is 10
+ def _getNumPerPage(self):
+ return 10
+
+ # TODO: generate a url for searching
+ def _genUrl(self, what, start):
+ return ''
+
+ def search(self, what, resultNum = -1, startNum = 0):
+
+ numPerPage = self._getNumPerPage();
+
+ if resultNum == -1:
+ pageCount = -1
+ else:
+ pageCount = int((resultNum + numPerPage - 1) / numPerPage)
+
+ startPage = int((startNum + numPerPage - 1) / numPerPage)
+
+ self._totalRecord = sys.maxint
+
+ what = urllib2.quote(what)
+
+ pageNum = 1
+ resCnt = 0
+
+ while True:
+ if pageCount != -1:
+ if pageNum > pageCount:
+ break
+
+ url = self._genUrl(what, (startPage + pageNum - 1) * numPerPage)
+ # print url
+
+ for result in self._pageHandler(url):
+ resCnt += 1
+ yield result
+ if resultNum != -1 and resCnt >= resultNum:
+ raise StopIteration()
+ if resCnt >= self._totalRecord:
+ raise StopIteration()
+
+ if self._totalRecord == sys.maxint:
+ if resultNum == -1:
+ self._totalRecord = sys.maxint - 1
+ else:
+ self._totalRecord = resultNum
+
+ if resCnt >= self._totalRecord:
+ raise StopIteration()
+ #if i < numPerPage: # FIXME: if the result total is 10... :(
+ # raise StopIteration()
+ # break
+ pageNum += 1
+
diff --git a/utils/webutils.py b/utils/webutils.py
index a3163f1..78365c4 100644
--- a/utils/webutils.py
+++ b/utils/webutils.py
@@ -129,7 +129,7 @@ def getPageTitle(opener, url):
return ''
try:
- if url[:7] != 'http://':
+ if url[:7] != 'http://' and url[:8] != 'https://':
url = 'http://' + url
req = urllib2.Request(url)
setupRequest(req)