-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathenaire_crawler.py
executable file
·102 lines (73 loc) · 2.37 KB
/
enaire_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
# Download airport charts for Spain
import os
import subprocess
import time
import urllib.error
import urllib.request

import bs4
# Scheme and host of the ENAIRE website; the relative paths scraped from its
# pages are appended to this prefix to build absolute URLs.
BASE_URL = "http://www.enaire.es"
def expand(url):
    """Fetch a page of the site and list the entries of its last opened item.

    Args:
        url: Path relative to ``BASE_URL``.

    Returns:
        Every ``<li>`` element nested inside the last ``li.listOpened``
        element of the fetched page.

    Raises:
        urllib.error.URLError: If the request fails twice in a row.
        IndexError: If the page contains no ``li.listOpened`` element.
    """
    full_url = BASE_URL + url
    try:
        response = urllib.request.urlopen(full_url)
    except urllib.error.URLError:
        # A single blind retry; a second failure propagates to the caller.
        response = urllib.request.urlopen(full_url)

    # Read the body inside a context manager so the connection is always
    # released, even if reading fails part-way through.
    with response:
        markup = response.read()

    # NOTE(review): no parser is named, so bs4 uses whichever one it finds
    # installed -- consider pinning one once the expected parser is confirmed.
    soup = bs4.BeautifulSoup(markup)
    opened_items = soup.select("li.listOpened")
    return opened_items[-1].findAll("li")
def get_url(item):
    """Return the ``href`` attribute of the first ``<a>`` found in *item*."""
    return item.find("a").get("href")
def get_title(item):
    """Return the text of the first ``<a>`` in *item*, stripped of whitespace."""
    anchor = item.find("a")
    label = anchor.text
    return label.strip()
def get_ext(url):
    """Return the first known file extension that occurs anywhere in *url*.

    The candidates are tried in the order ``pdf``, ``png``, ``jpg``, ``jpeg``
    using a plain substring test, so the match does not have to be at the end
    of the URL. ``None`` is returned when none of them occurs.
    """
    candidates = ("pdf", "png", "jpg", "jpeg")
    return next((name for name in candidates if name in url), None)
def download(url, dest_dir, dest_file):
    """Download *url* with curl into ``dest_dir/dest_file`` unless it exists.

    Args:
        url: Absolute URL handed to curl.
        dest_dir: Directory that receives the file; created when missing,
            including any missing parent directories.
        dest_file: Name of the file inside *dest_dir*.

    Raises:
        subprocess.CalledProcessError: If curl exits with a non-zero status.
        OSError: If curl cannot be started or the directory cannot be created.
    """
    # makedirs(exist_ok=True) copes with nested paths and avoids the
    # check-then-create race of a separate existence test.
    os.makedirs(dest_dir, exist_ok=True)
    dest = os.path.join(dest_dir, dest_file)

    if os.path.exists(dest):
        print("[INFO] Ignoring", dest)
        return

    print("[DL]", dest)
    try:
        subprocess.check_output(["curl", "--retry", "3",
                                 "--silent", "--output", dest, url])
    except (OSError, subprocess.CalledProcessError):
        # Remove a partially written file; otherwise the existence check
        # above would treat it as already downloaded on the next run.
        if os.path.exists(dest):
            os.remove(dest)
        raise
def handle_list(contents, apt):
    """Walk a list of page entries, descending into sub-lists and saving files.

    For each entry, one of three things happens:

    * an entry carrying the ``listClosed`` class is expanded and processed
      recursively;
    * an entry whose link contains ``blob`` and a known extension is
      downloaded into the directory named *apt*, using the entry title (with
      ``/`` replaced by ``_``) as the file name;
    * anything else is reported with an ``[ERROR]`` line and skipped.

    Args:
        contents: Iterable of entries as returned by ``expand``.
        apt: Name of the destination directory for downloaded files.
    """
    for item in contents:
        url = get_url(item)
        title = get_title(item)
        # A link without href, or one with no recognisable extension, used to
        # raise TypeError and abort the whole crawl; such entries now fall
        # through to the error branch instead.
        ext = get_ext(url) if url else None

        if url and "class" in item.attrs and "listClosed" in item.attrs["class"]:
            handle_list(expand(url), apt)
        elif url and "blob" in url and ext:
            dest_name = title.replace('/', "_") + '.' + ext
            download(BASE_URL + url, apt, dest_name)
        else:
            print("[ERROR] Can't handle", item)

        # NOTE(review): assumed to run once per entry to throttle requests --
        # confirm against the intended nesting.
        time.sleep(1)
def is_ignored(airport):
    """Tell whether *airport* matches an entry of the ignore list.

    Args:
        airport: Object exposing a ``text`` attribute.

    Returns:
        ``True`` when any ignore-list entry is a substring of
        ``airport.text``, ``False`` otherwise. The list is currently empty,
        so nothing is ignored.
    """
    ignored_list = []
    # any() yields a real bool; the previous loop fell off the end and
    # returned None when nothing matched.
    return any(item in airport.text for item in ignored_list)
def main():
    """Crawl the airport index page and fetch the documents of each airport.

    Every airport title is expected to look like ``<name> - <code>``; the
    destination directory is named ``<code> (<Name>)`` with ``/`` replaced by
    ``-``. Entries whose title lacks the separator are reported and skipped.
    """
    print("[INFO] Start up")

    apt_list_url = (
        "/csee/Satellite/navegacion-aerea/es"
        "/Page/1078418725163/?other=1083158950596"
    )
    airports = expand(apt_list_url)
    print("[INFO] Found", len(airports), "airports.")

    for airport in airports:
        if is_ignored(airport):
            continue

        # Prepare airport info
        apt_full = get_title(airport).split("- ")
        if len(apt_full) < 2:
            # Without the separator there is no code part; indexing it used
            # to raise IndexError and stop the whole crawl.
            print("[ERROR] Can't handle", airport)
            continue
        apt_name = apt_full[0].strip().title()
        apt_code = apt_full[1].strip()
        apt = "{} ({})".format(apt_code, apt_name).replace("/", "-")

        handle_list(expand(get_url(airport)), apt)
# Run the crawler only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()