gfw_spider.py
"""
Proxy spider for the websites blocked by gfw.
"""
import re
import json

from config.settings import SPIDER_GFW_TASK
from ..items import ProxyUrlItem
from .common_spider import CommonSpider


class GFWSpider(CommonSpider):
    name = 'gfw'
    proxy_mode = 2
    task_queue = SPIDER_GFW_TASK

    def parse(self, response):
        """Dispatch to a site-specific parser based on the response URL."""
        url = response.url
        if self.exists(url, 'cn-proxy'):
            items = self.parse_common(response, pre_extract='//tbody/tr', infos_pos=0)
        elif self.exists(url, 'proxylistplus'):
            protocols = None
            # the SSL lists on proxylistplus are treated as https-only
            if self.exists(url, 'SSL'):
                protocols = ['https']
            items = self.parse_common(response, pre_extract='//tr[contains(@class, "cells")]',
                                      infos_end=-1, protocols=protocols)
        elif self.exists(url, 'gatherproxy'):
            items = self.parse_gather_proxy(response)
        elif self.exists(url, 'xroxy'):
            items = self.parse_xroxy(response)
        else:
            items = self.parse_common(response)

        for item in items:
            yield item

    def parse_gather_proxy(self, response):
        """Extract proxies from the gp.insertPrx(...) JSON calls embedded in scripts."""
        items = list()
        infos = response.css('script::text').re(r'gp.insertPrx\((.*)\)')
        for info in infos:
            info = info.lower()
            detail = json.loads(info)
            ip = detail.get('proxy_ip')
            port = detail.get('proxy_port')
            # a single record may advertise several protocols
            protocols = self.procotol_extractor(info)
            for protocol in protocols:
                items.append(ProxyUrlItem(url=self.construct_proxy_url(protocol, ip, port)))
        return items

    def parse_xroxy(self, response):
        """Extract proxies from xroxy table rows (css classes row0/row1)."""
        items = list()
        ip_extract_pattern = '">(.*)\\n'
        infos = response.xpath('//tr').css('.row1') + response.xpath('//tr').css('.row0')
        for info in infos:
            m = re.search(ip_extract_pattern, info.css('a')[1].extract())
            if m:
                ip = m.group(1)
                port = info.css('a::text')[2].extract()
                protocol = info.css('a::text')[3].extract().lower()
                if protocol in ['socks4', 'socks5']:
                    items.append(ProxyUrlItem(url=self.construct_proxy_url(protocol, ip, port)))
                elif protocol == 'transparent':
                    # transparent proxies leak the client IP, so skip them
                    continue
                else:
                    items.append(ProxyUrlItem(url=self.construct_proxy_url('http', ip, port)))
                # the fifth anchor's text flags SSL support; add an https entry as well
                is_ssl = info.css('a::text')[4].extract().lower() == 'true'
                if is_ssl:
                    items.append(ProxyUrlItem(url=self.construct_proxy_url('https', ip, port)))
        return items
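

# --- Hypothetical standalone run (not part of the upstream module) ---
# haipproxy normally launches its spiders from its own crawler entrypoint, and
# this spider assumes the project settings (and the task queue named by
# SPIDER_GFW_TASK) are available. The guarded block below is only a debugging
# sketch: run it as a module (e.g. `python -m <package>.gfw_spider`) so the
# relative imports above resolve.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    # CrawlerProcess starts a Twisted reactor, schedules the spider and
    # blocks until crawling finishes.
    process = CrawlerProcess(get_project_settings())
    process.crawl(GFWSpider)
    process.start()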