-
Notifications
You must be signed in to change notification settings - Fork 5.2k
/
getFreeProxy.py
146 lines (128 loc) · 4.55 KB
/
getFreeProxy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# -*- coding: utf-8 -*-
#!/usr/bin/env python
"""
-------------------------------------------------
File Name: GetFreeProxy.py
Description : 抓取免费代理
Author : JHao
date: 2016/11/25
-------------------------------------------------
Change Activity:
2016/11/25:
-------------------------------------------------
"""
import re

import requests

try:
    # Py3: reload lives in importlib. It is imported here only so the
    # py2-era reload(sys) call below does not read as an undefined name.
    from importlib import reload
except ImportError:
    # Py2: reload is a builtin; re-exec sys to switch the default
    # encoding to utf-8.
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')

from Util.utilFunction import robustCrawl, getHtmlTree
from Util.WebRequest import WebRequest

# for debug: silence urllib3 InsecureRequestWarning on unverified HTTPS
requests.packages.urllib3.disable_warnings()
class GetFreeProxy(object):
    """
    Collection of static crawlers, one per free-proxy site, each yielding
    proxies as "ip:port" strings.
    """

    def __init__(self):
        pass

    @staticmethod
    @robustCrawl  # decorator prints the error if an exception happens
    def freeProxyFirst(page=10):
        """
        Crawl data5u (http://www.data5u.com/).
        :param page: page count (currently not used by the crawl)
        :return: generator of "ip:port" strings
        """
        page_urls = [
            'http://www.data5u.com/',
            'http://www.data5u.com/free/',
            'http://www.data5u.com/free/gngn/index.shtml',
            'http://www.data5u.com/free/gnpt/index.shtml',
        ]
        for page_url in page_urls:
            tree = getHtmlTree(page_url)
            for node in tree.xpath('//ul[@class="l2"]'):
                cells = node.xpath('.//li/text()')
                yield ':'.join(cells[0:2])

    @staticmethod
    @robustCrawl
    def freeProxySecond(proxy_number=100):
        """
        Crawl 66ip (http://www.66ip.cn/).
        :param proxy_number: how many proxies to request from the API
        :return: generator of "ip:port" strings
        """
        api_url = ("http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export="
                   "&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=").format(
            proxy_number)
        # .text is the decoded string; .content would be the raw bytes
        page = WebRequest().get(api_url).text
        proxy_pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}')
        for addr in proxy_pattern.findall(page):
            yield addr

    @staticmethod
    @robustCrawl
    def freeProxyThird(days=1):
        """
        Crawl ip181 (http://www.ip181.com/).
        :param days: unused; kept for interface compatibility
        :return: generator of "ip:port" strings
        """
        tree = getHtmlTree('http://www.ip181.com/')
        rows = tree.xpath('//tr')
        for row in rows[1:]:  # skip the table header row
            yield ':'.join(row.xpath('./td/text()')[0:2])

    @staticmethod
    @robustCrawl
    def freeProxyFourth():
        """
        Crawl xicidaili (http://api.xicidaili.com/free2016.txt).
        :return: generator of "ip:port" strings
        """
        page_urls = [
            'http://www.xicidaili.com/nn',  # high anonymity
            'http://www.xicidaili.com/nt',  # transparent
        ]
        for page_url in page_urls:
            tree = getHtmlTree(page_url)
            for row in tree.xpath('.//table[@id="ip_list"]//tr'):
                yield ':'.join(row.xpath('./td/text()')[0:2])

    @staticmethod
    @robustCrawl
    def freeProxyFifth():
        """
        Crawl goubanjia (http://www.goubanjia.com/free/gngn/index.shtml).
        :return: generator of "ip:port" strings
        """
        url_template = "http://www.goubanjia.com/free/gngn/index{page}.shtml"
        # The site plants decoy digits/dots inside hidden elements; filter
        # out anything styled display:none and the dedicated port node.
        ip_xpath = """.//*[not(contains(@style, 'display: none'))
                            and not(contains(@style, 'display:none'))
                            and not(contains(@class, 'port'))
                            ]/text()
                     """
        for page_no in range(1, 10):
            tree = getHtmlTree(url_template.format(page=page_no))
            for cell in tree.xpath('//td[@class="ip"]'):
                # The bare ":" sits directly under the td while the ip digits
                # live in nested div/span/p nodes, so collect the ip first and
                # fetch the port from its own span afterwards.
                ip = ''.join(cell.xpath(ip_xpath))
                port = cell.xpath(".//span[contains(@class, 'port')]/text()")[0]
                yield '{}:{}'.format(ip, port)
if __name__ == '__main__':
    # Manual smoke test: run one crawler and dump everything it yields.
    getter = GetFreeProxy()
    # for p in getter.freeProxyFirst(): print(p)
    # for p in getter.freeProxySecond(): print(p)
    # for p in getter.freeProxyThird(): print(p)
    # for p in getter.freeProxyFourth(): print(p)
    for p in getter.freeProxyFifth():
        print(p)