# crawler.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from multiprocessing import Pool, Manager, freeze_support
import time
from url_filterer import UrlFilterer
import random
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
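
# Note: UrlFilterer comes from a local helper module (url_filterer.py) that is
# not shown here. From the way it is used below, it is constructed with
# allowed_domains, allowed_schemes, allowed_filetypes and restricted_urls, and
# exposes filter_url(start_url, url) -> bool to decide whether a URL should be
# crawled. This description is inferred from usage in this file, not from the
# module's own documentation.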


class Crawler:
    def __init__(self, start_url, proxies=None, max_workers=4):
        self.start_url = start_url
        self.start_domain = urlparse(start_url).netloc
        # UrlFilterer instance
        self.url_filterer = UrlFilterer(
            allowed_domains=self.start_domain,
            allowed_schemes={"http", "https"},
            allowed_filetypes={".html", ".htm", ".php", ".asp", ".aspx", ".jsp", ".cgi", ""},
            restricted_urls=[
                "web.archive.org", "plugins", ":8080", "moodle", "kalendarz",
                "password", "mobile", "query", "sort=", "calendar", "css", "view",
                "ajax", "Zaloguj", "reddit.", "search?", "source=", "rozmiar=",
                "ssid=", "f_ov", "Facebook=", "cookies", "add", "cart", "comment",
                "reply", "en_US", "/login", "/logowanie", "producer_", "register",
                "orderby", "tumblr.", "redirect", "linkedin.", "facebook.",
                "instagram.", "youtube.", "twitter.", "whatsapp.", "pinterest.",
                "login.", "google.", "wykop.", "/en/", "kalendarz-", "filtr,", "kalendarium/",
                "month,", "export,", "wydarzenia/dzien/", "dzien/"
            ]
        )
        self.headers = {
            'User-Agent': 'Speakleash-v0.1',
            "Accept-Encoding": "gzip, deflate",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Connection": "keep-alive"
        }
        self.proxies = proxies or ['']
        self.max_workers = max_workers

    def start(self):
        visited_urls = Manager().list()
        with Pool(initializer=Crawler.initialize_worker, processes=self.max_workers) as pool:
            backlog = Manager().list([self.start_url])
            while backlog:
                args_list = [(url, visited_urls, self.start_url, self.url_filterer, self.proxies) for url in backlog]
                results = pool.imap(Crawler.crawl, args_list)
                for url, response, new_urls in results:
                    if response is not None and url is not None:
                        yield url, response
                    backlog.extend(new_urls)
                    try:
                        backlog.remove(url)
                    except ValueError:
                        print(f"ERROR removing {url}")

    @staticmethod
    def initialize_worker():
        # Each worker process builds its own requests.Session (kept as a
        # process-level global) with retry-enabled HTTP adapters.
        global session
        session = requests.Session()
        headers = {
            'User-Agent': 'Speakleash-v0.1',
            "Accept-Encoding": "gzip, deflate",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Connection": "keep-alive"
        }
        session.headers.update(headers)
        retries = Retry(total=2, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
        adapter = HTTPAdapter(max_retries=retries)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

    @staticmethod
    def get_random_proxy(proxies):
        # Pick a random proxy for this request; an empty string means "no proxy".
        proxy = random.choice(proxies)
        return {"http": proxy, "https": proxy} if proxy else {}

    @staticmethod
    def crawl(args):
        url, visited_urls, start_url, url_filterer, proxies = args
        url = Crawler.clean_url(url)
        allowed = url_filterer.filter_url(start_url, url)
        if url in visited_urls or not allowed:
            if not allowed:
                visited_urls.append(url)
            return url, None, []  # Return None for the response when the URL is skipped
        visited_urls.append(url)
        try:
            session.proxies = Crawler.get_random_proxy(proxies)
            with session.get(url, timeout=10) as response:
                if response.status_code == 429 and 'Retry-After' in response.headers:
                    retry_after = int(response.headers['Retry-After'])
                    print(f"Retry-After of {retry_after}s encountered. WAITING...")
                    time.sleep(retry_after)
                    response = session.get(url, timeout=10)
                soup = BeautifulSoup(response.text, 'lxml')
                found_urls = []
                for link in soup.find_all('a', href=True):
                    href = link.get('href')
                    absolute_url = Crawler.clean_url(urljoin(url, href, allow_fragments=False))
                    if url_filterer.filter_url(start_url, absolute_url) and absolute_url not in visited_urls:
                        found_urls.append(absolute_url)
                return url, response, found_urls
        except Exception as e:
            print(f"Exception while crawling occurred: {e} url: {url}")
            return url, None, []

    @staticmethod
    def clean_url(url):
        # Normalize a URL: trim whitespace and drop a single trailing slash so
        # "example.com/page" and "example.com/page/" count as the same entry.
        url = url.strip()
        if url.endswith('/'):
            url = url[:-1]
        return url
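

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). It assumes
# url_filterer.py is importable from the same directory and uses a placeholder
# start URL; replace "https://example.com" with a real target. freeze_support()
# (imported above) matters when freezing the script on Windows, since
# Crawler.start() spawns a multiprocessing Pool.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    freeze_support()
    crawler = Crawler("https://example.com", proxies=None, max_workers=4)
    for page_url, page_response in crawler.start():
        # Each yielded pair is the crawled URL and its requests.Response.
        print(page_url, page_response.status_code)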