ajax_gfw_spider.py
"""
Ajax gfw proxy ip crawler with scrapy-splash
"""
from config.settings import SPIDER_AJAX_GFW_TASK
from ..redis_spiders import RedisAjaxSpider
from ..items import ProxyUrlItem
from .base import BaseSpider


class AjaxGFWSpider(BaseSpider, RedisAjaxSpider):
    name = 'ajax_gfw'
    proxy_mode = 2
    task_queue = SPIDER_AJAX_GFW_TASK

    def parse(self, response):
        url = response.url
        # dispatch to a site-specific parser based on which source the URL belongs to
        if self.exists(url, 'proxy-list'):
            items = self.parse_common(response, pre_extract_method='css', pre_extract='.table ul',
                                      detail_rule='li::text', split_detail=True)
        elif self.exists(url, 'cnproxy'):
            items = self.parse_cnproxy(response)
        elif self.exists(url, 'free-proxy'):
            items = self.parse_free_proxy(response)
        elif self.exists(url, 'proxylist'):
            items = self.parse_proxylist(response)
        else:
            items = self.parse_common(response)
        for item in items:
            yield item
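
    # construct_proxy_url() and procotol_extractor() come from BaseSpider; the
    # former is assumed to format '{protocol}://{ip}:{port}', so each yielded
    # item carries a url such as 'http://1.2.3.4:8080' (illustrative values).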

    def parse_cnproxy(self, response):
        items = list()
        infos = response.xpath('//tr')[2:]
        for info in infos:
            info_str = info.extract()
            proxy_detail = info.css('td::text').extract()
            ip = proxy_detail[0].strip()
            # the rendered port cell carries a leading separator character, so drop it
            port = proxy_detail[1][1:].strip()
            cur_protocols = self.procotol_extractor(info_str)
            for protocol in cur_protocols:
                items.append(ProxyUrlItem(url=self.construct_proxy_url(protocol, ip, port)))
        return items
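
    # For reference, a cnproxy row after Splash rendering is assumed to look
    # roughly like this (illustrative, not captured from the live site):
    #   <tr><td>1.2.3.4</td><td>:8080</td><td>HTTP</td>...</tr>
    # which the loop above reduces to ip='1.2.3.4', port='8080'.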

    def parse_free_proxy(self, response):
        items = list()
        infos = response.xpath('//table[@id="proxy_list"]').css('tr')[1:]
        for info in infos:
            info_str = info.extract()
            ip = info.css('abbr::text').extract_first()
            port = info.css('.fport::text').extract_first()
            # skip rows whose ip/port cells were not rendered
            if not ip or not port:
                continue
            cur_protocols = self.procotol_extractor(info_str)
            for protocol in cur_protocols:
                items.append(ProxyUrlItem(url=self.construct_proxy_url(protocol, ip, port)))
        return items

    def parse_proxylist(self, response):
        items = list()
        infos = response.xpath('//tr')[2:]
        for info in infos:
            info_str = info.extract()
            # skip transparent proxies ('透明' means 'transparent' on the Chinese-language pages)
            if '透明' in info_str or 'transparent' in info_str.lower():
                continue
            ip = info.css('td::text')[1].extract()
            port = info.css('td a::text')[0].extract()
            if not ip or not port:
                continue
            cur_protocols = self.procotol_extractor(info_str)
            for protocol in cur_protocols:
                items.append(ProxyUrlItem(url=self.construct_proxy_url(protocol, ip, port)))
        return items
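

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). RedisAjaxSpider
# is assumed to pop plain URL strings from the Redis list named by
# SPIDER_AJAX_GFW_TASK; the block below seeds that queue by hand with redis-py,
# assuming a local Redis on the default port. Start the spider afterwards with
# `scrapy crawl ajax_gfw`.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import redis

    client = redis.StrictRedis(host='localhost', port=6379)
    # hypothetical seed URL for one of the sources handled in parse()
    client.lpush(SPIDER_AJAX_GFW_TASK, 'http://www.cnproxy.com/proxy1.html')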