-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathclean_data.py
45 lines (34 loc) · 1.17 KB
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import os
import csv
import tempfile
# Default CSV to clean: resolved relative to this script's own directory
# rather than the current working directory.
DATA_PATH = os.path.join(
    os.path.dirname(__file__),
    '../data/birmingham_schools.csv'
)
def clean_data(path):
    """Rewrite the CSV file at *path* in place with cleaned headers and URLs.

    The header row is lowercased and leading underscores are stripped from
    every column name; each data row is passed through ``fix_urls``. The
    cleaned output is written to a temporary file which then replaces the
    original file.

    Args:
        path: Path of the CSV file to clean. The file is overwritten.

    Raises:
        OSError: If the file cannot be read or replaced.
        KeyError: Propagated from ``fix_urls`` when a row has no
            ``web_site`` column.
        ValueError: Propagated from ``csv.DictWriter`` when a row has more
            fields than the header.
    """
    # Keep the temporary file next to the target so that os.replace never
    # has to cross a filesystem boundary (which would fail).
    target_dir = os.path.dirname(os.path.abspath(path))
    tmp_name = None
    try:
        # newline='' hands line-ending handling to the csv module, as its
        # documentation requires for both reading and writing.
        with open(path, newline='') as inp, \
                tempfile.NamedTemporaryFile(
                    'w', newline='', dir=target_dir, delete=False
                ) as output:
            tmp_name = output.name

            # Lowercase all headers. Parse the line with the csv module so
            # quoted header names (possibly containing commas) survive.
            header_line = inp.readline().strip().lower()
            headers = [
                # Rename _geom to geom, as the Data Store fails
                # on loading headers starting with underscore
                header.lstrip('_')
                for header in next(csv.reader([header_line]), [])
            ]

            reader = csv.DictReader(inp, fieldnames=headers)
            writer = csv.DictWriter(output, fieldnames=headers)
            writer.writeheader()
            for row in reader:
                writer.writerow(fix_urls(row))

        # Both files are closed (and flushed) here, so the swap is safe.
        os.replace(tmp_name, path)
    except BaseException:
        # Do not leave a stray temporary file behind on failure.
        if tmp_name is not None and os.path.exists(tmp_name):
            os.remove(tmp_name)
        raise
def fix_urls(row):
    """Replace a known-bad ``web_site`` value in *row* with its correction.

    The row is modified in place and also returned. Values that are not
    listed in the corrections table are left untouched.

    Args:
        row: Mapping for one CSV record; must contain a ``web_site`` key.

    Returns:
        The same *row* object, with ``web_site`` corrected if applicable.

    Raises:
        KeyError: If *row* has no ``web_site`` key.
    """
    # Known broken URL -> corrected URL.
    corrections = {
        'www.kingsland.bham.sch.uk/': 'http://www.kingsland.bham.sch.uk/',
        'http://www.watrmill.bham.sch.uk/': 'http://www.watermill.bham.sch.uk/',
    }

    corrected = corrections.get(row['web_site'])
    if corrected is None:
        return row

    row['web_site'] = corrected
    return row
# When executed as a script, clean the default dataset in place.
if __name__ == '__main__':
    clean_data(DATA_PATH)