wunderground_parser.py (forked from fivethirtyeight/data)

from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from urllib.request import urlopen


def parse_station(station):
    '''
    Parse the web pages downloaded from wunderground.com into a flat CSV
    file for the station you provide.

    Make sure to run the wunderground scraper first so you have the web
    pages downloaded.
    '''
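
    # Expected layout, based on the paths used below:
    #   input:  {station}/{year}-{month}-{day}.html  (one saved wunderground page per
    #           day, written by the scraper or by the re-download step in the loop)
    #   output: {station}.csv  (one row per day, columns as in the header below)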
    # Parse between July 1, 2014 and July 1, 2015
    # You can change the dates here if you prefer to parse a different range
    current_date = datetime(year=2014, month=7, day=1)
    end_date = datetime(year=2015, month=7, day=1)

    with open('{}.csv'.format(station), 'w') as out_file:
        out_file.write('date,actual_mean_temp,actual_min_temp,actual_max_temp,'
                       'average_min_temp,average_max_temp,'
                       'record_min_temp,record_max_temp,'
                       'record_min_temp_year,record_max_temp_year,'
                       'actual_precipitation,average_precipitation,'
                       'record_precipitation\n')

        while current_date != end_date:
            try_again = False
            with open('{}/{}-{}-{}.html'.format(station,
                                                current_date.year,
                                                current_date.month,
                                                current_date.day)) as in_file:
                soup = BeautifulSoup(in_file.read(), 'html.parser')

                weather_data = soup.find(id='historyTable').find_all('span', class_='wx-value')
                weather_data_units = soup.find(id='historyTable').find_all('td')
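
                # The indexing below assumes the 'historyTable' summary of the saved
                # page lists its wx-value spans in this order:
                #   0: actual mean temp    2: actual max temp    3: average max temp
                #   4: record max temp     5: actual min temp    6: average min temp
                #   7: record min temp     9: actual precip     10: average precip
                #  11: record precip
                # and that the record years appear in parentheses in <td> cells 9 and 13.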
                try:
                    actual_mean_temp = weather_data[0].text
                    actual_max_temp = weather_data[2].text
                    average_max_temp = weather_data[3].text
                    record_max_temp = weather_data[4].text
                    actual_min_temp = weather_data[5].text
                    average_min_temp = weather_data[6].text
                    record_min_temp = weather_data[7].text
                    record_max_temp_year = weather_data_units[9].text.split('(')[-1].strip(')')
                    record_min_temp_year = weather_data_units[13].text.split('(')[-1].strip(')')

                    actual_precipitation = weather_data[9].text
                    # 'T' means a trace of precipitation; record it as 0.0
                    if actual_precipitation == 'T':
                        actual_precipitation = '0.0'
                    average_precipitation = weather_data[10].text
                    record_precipitation = weather_data[11].text

                    # Verify that the parsed data is valid
                    if (record_max_temp_year == '-1' or record_min_temp_year == '-1' or
                            int(record_max_temp) < max(int(actual_max_temp), int(average_max_temp)) or
                            int(record_min_temp) > min(int(actual_min_temp), int(average_min_temp)) or
                            float(actual_precipitation) > float(record_precipitation) or
                            float(average_precipitation) > float(record_precipitation)):
                        raise Exception

                    out_file.write('{}-{}-{},'.format(current_date.year,
                                                      current_date.month,
                                                      current_date.day))
                    out_file.write(','.join([actual_mean_temp, actual_min_temp, actual_max_temp,
                                             average_min_temp, average_max_temp,
                                             record_min_temp, record_max_temp,
                                             record_min_temp_year, record_max_temp_year,
                                             actual_precipitation, average_precipitation,
                                             record_precipitation]))
                    out_file.write('\n')
                    current_date += timedelta(days=1)
                except:
                    # If the web page is formatted improperly, signal that the page may
                    # need to be downloaded again.
                    try_again = True

            # If the web page needs to be downloaded again, re-download it from
            # wunderground.com.
            # If the parser gets stuck on a certain date, you may need to investigate
            # the page to find out what is going on. Sometimes data is missing, in
            # which case the parser will get stuck. You can manually put in the data
            # yourself in that case, or just tell the parser to skip this day.
            if try_again:
                print('Error with date {}'.format(current_date))

                lookup_URL = 'http://www.wunderground.com/history/airport/{}/{}/{}/{}/DailyHistory.html'
                formatted_lookup_URL = lookup_URL.format(station,
                                                         current_date.year,
                                                         current_date.month,
                                                         current_date.day)
                html = urlopen(formatted_lookup_URL).read().decode('utf-8')

                out_file_name = '{}/{}-{}-{}.html'.format(station,
                                                          current_date.year,
                                                          current_date.month,
                                                          current_date.day)

                # Write the page through its own handle so the CSV handle (out_file)
                # is not shadowed and left pointing at a closed file.
                with open(out_file_name, 'w') as page_file:
                    page_file.write(html)


# Parse the stations used in this article
for station in ['KCLT', 'KCQT', 'KHOU', 'KIND', 'KJAX',
                'KMDW', 'KNYC', 'KPHL', 'KPHX', 'KSEA']:
    parse_station(station)
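

# Optional sanity check (not part of the original script): a minimal sketch of
# reading one generated CSV back with the standard csv module and confirming that
# each day's mean temperature falls between its min and max. 'KCLT.csv' is just an
# example file name; uncomment after parsing to run it.
#
# import csv
# with open('KCLT.csv') as check_file:
#     for row in csv.DictReader(check_file):
#         assert (int(row['actual_min_temp']) <= int(row['actual_mean_temp'])
#                 <= int(row['actual_max_temp'])), row['date']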