Merge branch 'master' of github.com:brock7/scripts
Conflicts:
	gather.py
Brock committed Jan 16, 2016
2 parents 30ba665 + 6a9715f commit d3b930c
Showing 9 changed files with 273 additions and 15 deletions.
23 changes: 13 additions & 10 deletions gather.py
@@ -89,16 +89,19 @@ def queryRDNS(domain):
    for ipaddr in hostInfos[2]:

        print '[IP Address: ' + ipaddr + ']'
-        # TODO: add pagination code
-        try:
-            response = urllib2.urlopen('http://dns.aizhan.com/%s/' % (ipaddr))
-            text = response.read()
-            tree = etree.HTML(text)
-            nodes = tree.xpath(r"//td[@class='dns-links']/a/@href")
-            for node in nodes:
-                print node, getTitle(node)
-        except Exception, e:
-            print e
+        # Pagination
+        for i in range(5): # at most 5 pages; check the site itself if more are needed
+            try:
+                response = urllib2.urlopen('http://dns.aizhan.com/%s/%d/' % (ipaddr, i))
+                text = response.read()
+                tree = etree.HTML(text)
+                nodes = tree.xpath(r"//td[@class='dns-links']/a/@href")
+                if len(nodes) == 0:
+                    break
+                for node in nodes:
+                    print node, getTitle(node)
+            except Exception, e:
+                print e


def toStr(l):
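The gather.py hunk replaces the single reverse-IP request with a small pagination loop: it fetches up to five result pages from dns.aizhan.com and stops as soon as a page yields no dns-links anchors. A minimal standalone sketch of the same pattern (Python 2, matching the repo); the fetch_links helper and the function name are illustrative, not part of the commit:

# Sketch of the pagination pattern added to gather.py.
# fetch_links is a hypothetical helper standing in for the urllib2 + lxml
# calls in the diff; the URL layout and the five-page cap follow the diff.
import urllib2
from lxml import etree

def fetch_links(url, timeout=20):
    html = urllib2.urlopen(url, timeout=timeout).read()
    tree = etree.HTML(html)
    return tree.xpath(r"//td[@class='dns-links']/a/@href")

def reverse_ip_domains(ipaddr, max_pages=5):
    domains = []
    for page in range(max_pages):      # at most max_pages result pages
        try:
            links = fetch_links('http://dns.aizhan.com/%s/%d/' % (ipaddr, page))
        except Exception, e:
            print e
            continue                   # the diff also just reports and moves on
        if not links:                  # an empty page means no more results
            break
        domains.extend(links)
    return domains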
2 changes: 1 addition & 1 deletion utils/__init__.py
@@ -1 +1 @@
-__all__ = ['webutils', 'google']
+__all__ = ['webutils', 'google', 'crawler']
8 changes: 7 additions & 1 deletion utils/google.py
@@ -4,6 +4,8 @@
import googlesearch
import bingsearch
import hxgoogle
+import hxgoogle2
+import hxgoogle3

#searchEngine = googlesearch.google
#searchEngine = aolsearch.google
@@ -24,8 +26,12 @@
    google = bingsearch.google
elif search_engine == 'hxgoogle':
    google = hxgoogle.google
+elif search_engine == 'hxgoogle2':
+    google = hxgoogle2.google
+elif search_engine == 'hxgoogle3':
+    google = hxgoogle3.google
else:
-    google = hxgoogle.google
+    google = hxgoogle2.google

searchEngine = google

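utils/google.py wires the two new back ends into the existing search_engine switch and makes hxgoogle2 the default fallback. Every back end exposes the same google(opener, what, resultNum, startNum) callable, so callers keep going through this module unchanged. A hedged usage sketch, assuming the script runs from the repository root and that webutils.setupOpener is configured as in the back ends' own __main__ blocks; the query and result count are arbitrary examples:

import urllib2

from utils import google as search   # utils/google.py picks the back end
from utils import webutils

opener = urllib2.build_opener()
webutils.setupOpener(opener)

# Iterate over result URLs from whichever engine was selected.
for url in search.google(opener, 'site:letv.com', 20):
    print url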
41 changes: 41 additions & 0 deletions utils/googto.py
@@ -0,0 +1,41 @@
# -*- encoding: utf-8 -*-

import searchbase
import re
import urllib, urllib2
import webutils
from lxml import etree

class Googto(searchbase.SearchBase):

    _totalRecordPattern = re.compile(r'找到约 ([0-9,]+) 条结果')

    def _updateTotalRecord(self, html):
        m = self._totalRecordPattern.search(html)
        if m == None:
            # print '* Not found 1'
            return
        if len(m.groups()) <= 0:
            # print '* Not found 2'
            return
        self._totalRecord = int(m.group(1).replace(',', ''))
        print '* Total:', self._totalRecord


    def _pickupLinks(self, html):
        tree = etree.HTML(html)
        # nodes = tree.xpath(r'/html/body/table[2]/tbody/tr[2]/td[2]/ol/div
        return tree.xpath(r'//h3/a/@href')


    def _genUrl(self, what, start):
        return 'http://www.googto.com/?q=%s&start=%d' % (what, start)

if __name__ == '__main__':
    opener = urllib2.build_opener()
    webutils.setupOpener(opener)
    googto = Googto(opener)

    for url in googto.search('site:letv.com', 10):
        print url

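utils/googto.py is the first of the new SearchBase back ends: it only supplies _genUrl, _pickupLinks and _updateTotalRecord, and the shared paging logic (added below in utils/searchbase.py) does the rest. Its total-record regex parses the Chinese '找到约 N 条结果' ("found about N results") banner. A quick standalone check of that pattern; the sample fragment is invented for illustration:

# -*- coding: utf-8 -*-
# Standalone check of Googto's total-record pattern (Python 2).
# Only the regular expression comes from the diff; the sample text is made up.
import re

pattern = re.compile(r'找到约 ([0-9,]+) 条结果')   # "found about N results"
sample = '本次搜索找到约 1,230 条结果 (0.21 秒)'
m = pattern.search(sample)
if m is not None:
    print int(m.group(1).replace(',', ''))         # -> 1230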
4 changes: 2 additions & 2 deletions utils/hxgoogle.py
@@ -11,7 +11,7 @@
import locale
import webutils

-HXGOOGLE_HOME = 'http://www.hxgoogle.com'
+HXGOOGLE_HOME = 'http://g.hxgoogle.com'
NUM_PER_PAGE = 10
REQ_TIMEOUT = 20
totalRecord = sys.maxint
@@ -82,7 +82,7 @@ def _hxSearch(opener, what, resultNum = -1, startNum = 0):
        if pageCount != -1:
            if pageNum > pageCount:
                break
-        url = HXGOOGLE_HOME + '/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, (startPage + pageNum) * 10)
+        url = HXGOOGLE_HOME + '/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, (startPage + pageNum - 1) * 10)

        for result in _hxPageHandler(opener, url):
            # i += 1
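The second hxgoogle.py hunk fixes an off-by-one in the start offset: pageNum starts at 1, so the old expression began every search at result 10 and skipped the first page. A quick check of the arithmetic, with variable names taken from the diff and the surrounding loop simplified away:

# Start offsets for the first three result pages, 10 results per page,
# searching from the beginning (startPage = 0).
startPage = 0
for pageNum in range(1, 4):
    old_start = (startPage + pageNum) * 10        # before: 10, 20, 30
    new_start = (startPage + pageNum - 1) * 10    # after:   0, 10, 20
    print pageNum, old_start, new_start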
54 changes: 54 additions & 0 deletions utils/hxgoogle2.py
@@ -0,0 +1,54 @@
# -*- encoding: utf-8 -*-

import searchbase
import re
import urllib, urllib2
import webutils
from lxml import etree

pattern = re.compile(r'<div id="resultStats">找到约 ([0-9,]+) 条结果')
pattern2 = re.compile(r'找不到和您的查询 "<em>.*?</em>" 相符的内容或信息。')

class HxGoogle(searchbase.SearchBase):

    def _updateTotalRecord(self, html):

        m = pattern2.search(html)
        if m != None:
            self._totalRecord = 0
            #print 'not found'
            return
        m = pattern.search(html)
        if m == None:
            return
        if len(m.groups()) <= 0:
            return
        self._totalRecord = int(m.group(1).replace(',', ''))
        print 'Total: ', self._totalRecord

    def _pickupLinks(self, html):
        tree = etree.HTML(html)
        # nodes = tree.xpath(r'/html/body/table[2]/tbody/tr[2]/td[2]/ol/div
        return tree.xpath(r'//h3/a/@href')


    def _genUrl(self, what, start):
        return 'http://g1.hxgoogle.com/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, start)


hx = None

def google(opener, what, resultNum = -1, startNum = 0):
    global hx
    if hx == None:
        hx = HxGoogle(opener)
    return hx.search(what, resultNum, startNum)

if __name__ == '__main__':
    opener = urllib2.build_opener()
    webutils.setupOpener(opener)
    # goo = HxGoogle(opener)

    for url in google(opener, 'site:letv.com', 20):
        print url

54 changes: 54 additions & 0 deletions utils/hxgoogle3.py
@@ -0,0 +1,54 @@
# -*- encoding: utf-8 -*-

import searchbase
import re
import urllib, urllib2
import webutils
from lxml import etree

pattern = re.compile(r'<div id="resultStats">找到约 ([0-9,]+) 条结果')
pattern2 = re.compile(r'找不到和您的查询 "<em>.*?</em>" 相符的内容或信息。')

class HxGoogle(searchbase.SearchBase):

    def _updateTotalRecord(self, html):

        m = pattern2.search(html)
        if m != None:
            self._totalRecord = 0
            #print 'not found'
            return
        m = pattern.search(html)
        if m == None:
            return
        if len(m.groups()) <= 0:
            return
        self._totalRecord = int(m.group(1).replace(',', ''))
        print 'Total: ', self._totalRecord

    def _pickupLinks(self, html):
        tree = etree.HTML(html)
        # nodes = tree.xpath(r'/html/body/table[2]/tbody/tr[2]/td[2]/ol/div
        return tree.xpath(r'//h3/a/@href')


    def _genUrl(self, what, start):
        return 'http://g2.hxgoogle.com/search.jsp?q=%s&newwindow=1&safe=off&noj=1&hl=zh-CN&start=%d&sa=N' % (what, start)


hx = None

def google(opener, what, resultNum = -1, startNum = 0):
    global hx
    if hx == None:
        hx = HxGoogle(opener)
    return hx.search(what, resultNum, startNum)

if __name__ == '__main__':
    opener = urllib2.build_opener()
    webutils.setupOpener(opener)
    # goo = HxGoogle(opener)

    for url in google(opener, 'site:letv.com', 20):
        print url

100 changes: 100 additions & 0 deletions utils/searchbase.py
@@ -0,0 +1,100 @@
# -*- encoding: utf-8 -*-

import urllib2
import sys
import os
import webutils

class SearchBase:

    _opener = None
    _totalRecord = sys.maxint

    reqTimeout = 20

    def __init__(self, opener):
        self._opener = opener

    # TODO: get total record number from page
    def _updateTotalRecord(self, html):
        pass

    # TODO: pick up links from page
    def _pickupLinks(self, html):
        pass

    def _pageHandler(self, url):
        # print 'page handler'
        req = urllib2.Request(url)
        webutils.setupRequest(req)
        req.add_header('Referer', url[:-4])

        try:
            response = self._opener.open(req, timeout = self.reqTimeout)
            html = response.read()
            # print html
        except Exception, e:
            print "Exception: url: %s - " % url, e
            raise StopIteration()

        if self._totalRecord == sys.maxint:
            self._updateTotalRecord(html)

        for url in self._pickupLinks(html):
            yield url

    # TODO: return number of results per page. default is 10
    def _getNumPerPage(self):
        return 10

    # TODO: generate a url for searching
    def _genUrl(self, what, start):
        return ''

    def search(self, what, resultNum = -1, startNum = 0):

        numPerPage = self._getNumPerPage();

        if resultNum == -1:
            pageCount = -1
        else:
            pageCount = int((resultNum + numPerPage - 1) / numPerPage)

        startPage = int((startNum + numPerPage - 1) / numPerPage)

        self._totalRecord = sys.maxint

        what = urllib2.quote(what)

        pageNum = 1
        resCnt = 0

        while True:
            if pageCount != -1:
                if pageNum > pageCount:
                    break

            url = self._genUrl(what, (startPage + pageNum - 1) * numPerPage)
            # print url

            for result in self._pageHandler(url):
                resCnt += 1
                yield result
                if resultNum != -1 and resCnt >= resultNum:
                    raise StopIteration()
                if resCnt >= self._totalRecord:
                    raise StopIteration()

            if self._totalRecord == sys.maxint:
                if resultNum == -1:
                    self._totalRecord = sys.maxint - 1
                else:
                    self._totalRecord = resultNum

            if resCnt >= self._totalRecord:
                raise StopIteration()
            #if i < numPerPage: # FIXME: if the result total is 10... :(
            #    raise StopIteration()
            #    break
            pageNum += 1

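utils/searchbase.py is the template-method base class the new back ends share: SearchBase.search() quotes the query, walks the result pages and yields URLs, while subclasses fill in _genUrl, _pickupLinks and, optionally, _updateTotalRecord. A minimal sketch of such a subclass; the host, query parameters and XPath are placeholders, not an engine from the commit:

# Minimal SearchBase subclass sketch (Python 2, matching the repo).
# search.example.com, its query parameters and the XPath are invented;
# only the hook names come from searchbase.py.
import urllib2
import searchbase
import webutils
from lxml import etree

class ExampleSearch(searchbase.SearchBase):

    def _updateTotalRecord(self, html):
        # Optionally parse the "about N results" banner and set
        # self._totalRecord so search() can stop early.
        pass

    def _pickupLinks(self, html):
        tree = etree.HTML(html)
        return tree.xpath(r'//h3/a/@href')    # placeholder XPath

    def _genUrl(self, what, start):
        # 'what' arrives already URL-quoted by SearchBase.search()
        return 'http://search.example.com/?q=%s&start=%d' % (what, start)

if __name__ == '__main__':
    opener = urllib2.build_opener()
    webutils.setupOpener(opener)
    for url in ExampleSearch(opener).search('site:letv.com', 10):
        print url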
2 changes: 1 addition & 1 deletion utils/webutils.py
@@ -129,7 +129,7 @@ def getPageTitle(opener, url):
        return ''

    try:
-        if url[:7] != 'http://':
+        if url[:7] != 'http://' and url[:8] != 'https://':
            url = 'http://' + url
        req = urllib2.Request(url)
        setupRequest(req)
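The webutils.py change stops getPageTitle from mangling https URLs: previously any URL that did not start with 'http://' (including 'https://...') had 'http://' prepended. A before/after check of just that condition; the two helpers below only reproduce the old and new tests and are not part of webutils.py:

def normalize_old(url):
    if url[:7] != 'http://':
        url = 'http://' + url
    return url

def normalize_new(url):
    if url[:7] != 'http://' and url[:8] != 'https://':
        url = 'http://' + url
    return url

print normalize_old('https://letv.com')   # -> 'http://https://letv.com' (broken)
print normalize_new('https://letv.com')   # -> 'https://letv.com'
print normalize_new('letv.com')           # -> 'http://letv.com'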
