diff --git a/gather.py b/gather.py index 52cbeec..efe8ca0 100755 --- a/gather.py +++ b/gather.py @@ -89,16 +89,19 @@ def queryRDNS(domain): for ipaddr in hostInfos[2]: print '[IP Address: ' + ipaddr + ']' - # TODO: 加入翻页代码 - try: - response = urllib2.urlopen('http://dns.aizhan.com/%s/' % (ipaddr)) - text = response.read() - tree = etree.HTML(text) - nodes = tree.xpath(r"//td[@class='dns-links']/a/@href") - for node in nodes: - print node, getTitle(node) - except Exception, e: - print e + # 翻页 + for i in range(5): # 最多5页,需要更多到网页上去看 + try: + response = urllib2.urlopen('http://dns.aizhan.com/%s/%d/' % (ipaddr, i)) + text = response.read() + tree = etree.HTML(text) + nodes = tree.xpath(r"//td[@class='dns-links']/a/@href") + if len(nodes) == 0: + break + for node in nodes: + print node, getTitle(node) + except Exception, e: + print e def toStr(l): diff --git a/utils/__init__.py b/utils/__init__.py index a9242a2..7f5167f 100755 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -1 +1 @@ -__all__ = ['webutils', 'google'] +__all__ = ['webutils', 'google', 'crawler'] diff --git a/utils/google.py b/utils/google.py index 4af1b59..5a0b9e0 100644 --- a/utils/google.py +++ b/utils/google.py @@ -4,6 +4,8 @@ import googlesearch import bingsearch import hxgoogle +import hxgoogle2 +import hxgoogle3 #searchEngine = googlesearch.google #searchEngine = aolsearch.google @@ -24,8 +26,12 @@ google = bingsearch.google elif search_engine == 'hxgoogle': google = hxgoogle.google +elif search_engine == 'hxgoogle2': + google = hxgoogle2.google +elif search_engine == 'hxgoogle3': + google = hxgoogle3.google else: - google = hxgoogle.google + google = hxgoogle2.google searchEngine = google diff --git a/utils/googto.py b/utils/googto.py new file mode 100755 index 0000000..809c660 --- /dev/null +++ b/utils/googto.py @@ -0,0 +1,41 @@ +# -*- encoding: utf-8 -*- + +import searchbase +import re +import urllib, urllib2 +import webutils +from lxml import etree + +class Googto(searchbase.SearchBase): + + _totalRecordPattern = re.compile(r'找到约 ([0-9,]+) 条结果') + + def _updateTotalRecord(self, html): + m = self._totalRecordPattern.search(html) + if m == None: + # print '* Not found 1' + return + if len(m.groups()) <= 0: + # print '* Not found 2' + return + self._totalRecord = int(m.group(1).replace(',', '')) + print '* Total:', self._totalRecord + + + def _pickupLinks(self, html): + tree = etree.HTML(html) + # nodes = tree.xpath(r'/html/body/table[2]/tbody/tr[2]/td[2]/ol/div + return tree.xpath(r'//h3/a/@href') + + + def _genUrl(self, what, start): + return 'http://www.googto.com/?q=%s&start=%d' % (what, start) + +if __name__ == '__main__': + opener = urllib2.build_opener() + webutils.setupOpener(opener) + googto = Googto(opener) + + for url in googto.search('site:letv.com', 10): + print url + diff --git a/utils/hxgoogle.py b/utils/hxgoogle.py index 872f83c..f50927c 100644 --- a/utils/hxgoogle.py +++ b/utils/hxgoogle.py @@ -11,7 +11,7 @@ import locale import webutils -HXGOOGLE_HOME = 'http://www.hxgoogle.com' +HXGOOGLE_HOME = 'http://g.hxgoogle.com' NUM_PER_PAGE = 10 REQ_TIMEOUT = 20 totalRecord = sys.maxint @@ -82,7 +82,7 @@ def _hxSearch(opener, what, resultNum = -1, startNum = 0): if pageCount != -1: if pageNum > pageCount: break - url = HXGOOGLE_HOME + '/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, (startPage + pageNum) * 10) + url = HXGOOGLE_HOME + '/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, (startPage + pageNum - 1) * 10) for result in _hxPageHandler(opener, url): # i += 1 diff --git a/utils/hxgoogle2.py b/utils/hxgoogle2.py new file mode 100755 index 0000000..adb9f3c --- /dev/null +++ b/utils/hxgoogle2.py @@ -0,0 +1,54 @@ +# -*- encoding: utf-8 -*- + +import searchbase +import re +import urllib, urllib2 +import webutils +from lxml import etree + +pattern = re.compile(r'
找到约 ([0-9,]+) 条结果') +pattern2 = re.compile(r'找不到和您的查询 ".*?" 相符的内容或信息。') + +class HxGoogle(searchbase.SearchBase): + + def _updateTotalRecord(self, html): + + m = pattern2.search(html) + if m != None: + self._totalRecord = 0 + #print 'not found' + return + m = pattern.search(html) + if m == None: + return + if len(m.groups()) <= 0: + return + self._totalRecord = int(m.group(1).replace(',', '')) + print 'Total: ', self._totalRecord + + def _pickupLinks(self, html): + tree = etree.HTML(html) + # nodes = tree.xpath(r'/html/body/table[2]/tbody/tr[2]/td[2]/ol/div + return tree.xpath(r'//h3/a/@href') + + + def _genUrl(self, what, start): + return 'http://g1.hxgoogle.com/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, start) + + +hx = None + +def google(opener, what, resultNum = -1, startNum = 0): + global hx + if hx == None: + hx = HxGoogle(opener) + return hx.search(what, resultNum, startNum) + +if __name__ == '__main__': + opener = urllib2.build_opener() + webutils.setupOpener(opener) + # goo = HxGoogle(opener) + + for url in google(opener, 'site:letv.com', 20): + print url + diff --git a/utils/hxgoogle3.py b/utils/hxgoogle3.py new file mode 100755 index 0000000..7d0184b --- /dev/null +++ b/utils/hxgoogle3.py @@ -0,0 +1,54 @@ +# -*- encoding: utf-8 -*- + +import searchbase +import re +import urllib, urllib2 +import webutils +from lxml import etree + +pattern = re.compile(r'
找到约 ([0-9,]+) 条结果') +pattern2 = re.compile(r'找不到和您的查询 ".*?" 相符的内容或信息。') + +class HxGoogle(searchbase.SearchBase): + + def _updateTotalRecord(self, html): + + m = pattern2.search(html) + if m != None: + self._totalRecord = 0 + #print 'not found' + return + m = pattern.search(html) + if m == None: + return + if len(m.groups()) <= 0: + return + self._totalRecord = int(m.group(1).replace(',', '')) + print 'Total: ', self._totalRecord + + def _pickupLinks(self, html): + tree = etree.HTML(html) + # nodes = tree.xpath(r'/html/body/table[2]/tbody/tr[2]/td[2]/ol/div + return tree.xpath(r'//h3/a/@href') + + + def _genUrl(self, what, start): + return 'http://g2.hxgoogle.com/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, start) + + +hx = None + +def google(opener, what, resultNum = -1, startNum = 0): + global hx + if hx == None: + hx = HxGoogle(opener) + return hx.search(what, resultNum, startNum) + +if __name__ == '__main__': + opener = urllib2.build_opener() + webutils.setupOpener(opener) + # goo = HxGoogle(opener) + + for url in google(opener, 'site:letv.com', 20): + print url + diff --git a/utils/searchbase.py b/utils/searchbase.py new file mode 100644 index 0000000..dd3f74b --- /dev/null +++ b/utils/searchbase.py @@ -0,0 +1,100 @@ +# -*- encoding: utf-8 -*- + +import urllib2 +import sys +import os +import webutils + +class SearchBase: + + _opener = None + _totalRecord = sys.maxint + + reqTimeout = 20 + + def __init__(self, opener): + self._opener = opener + + # TODO: get total record number from page + def _updateTotalRecord(self, html): + pass + + # TODO: pick up links from page + def _pickupLinks(self, html): + pass + + def _pageHandler(self, url): + # print 'page handler' + req = urllib2.Request(url) + webutils.setupRequest(req) + req.add_header('Referer', url[:-4]) + + try: + response = self._opener.open(req, timeout = self.reqTimeout) + html = response.read() + # print html + except Exception, e: + print "Exception: url: %s - " % url, e + raise StopIteration() + + if self._totalRecord == sys.maxint: + self._updateTotalRecord(html) + + for url in self._pickupLinks(html): + yield url + + # TODO: return number of results per page. default is 10 + def _getNumPerPage(self): + return 10 + + # TODO: generate a url for searching + def _genUrl(self, what, start): + return '' + + def search(self, what, resultNum = -1, startNum = 0): + + numPerPage = self._getNumPerPage(); + + if resultNum == -1: + pageCount = -1 + else: + pageCount = int((resultNum + numPerPage - 1) / numPerPage) + + startPage = int((startNum + numPerPage - 1) / numPerPage) + + self._totalRecord = sys.maxint + + what = urllib2.quote(what) + + pageNum = 1 + resCnt = 0 + + while True: + if pageCount != -1: + if pageNum > pageCount: + break + + url = self._genUrl(what, (startPage + pageNum - 1) * numPerPage) + # print url + + for result in self._pageHandler(url): + resCnt += 1 + yield result + if resultNum != -1 and resCnt >= resultNum: + raise StopIteration() + if resCnt >= self._totalRecord: + raise StopIteration() + + if self._totalRecord == sys.maxint: + if resultNum == -1: + self._totalRecord = sys.maxint - 1 + else: + self._totalRecord = resultNum + + if resCnt >= self._totalRecord: + raise StopIteration() + #if i < numPerPage: # FIXME: if the result total is 10... :( + # raise StopIteration() + # break + pageNum += 1 + diff --git a/utils/webutils.py b/utils/webutils.py index a3163f1..78365c4 100644 --- a/utils/webutils.py +++ b/utils/webutils.py @@ -129,7 +129,7 @@ def getPageTitle(opener, url): return '' try: - if url[:7] != 'http://': + if url[:7] != 'http://' and url[:8] != 'https://': url = 'http://' + url req = urllib2.Request(url) setupRequest(req)