#!/usr/bin/env python
"""
This script retrieves the city ID list from the OWM web 2.5 API bulk
endpoint and divides it into smaller chunks: each chunk is ordered by
city ID and written to a separate file.
Source files are under: http://bulk.openweathermap.org/sample/
"""
import requests, sys, os, codecs, json, gzip, bz2, collections, csv, sqlite3, re

city_list_url = 'http://bulk.openweathermap.org/sample/city.list.json.gz'
city_list_gz = "city.list.json.gz"

def download_the_files():
    print('Downloading file ' + city_list_url + ' ...')
    with open(city_list_gz, 'wb') as h:
        response = requests.get(city_list_url, stream=True)
        response.raise_for_status()
        for block in response.iter_content(1024):
            h.write(block)
    print(' ... done')
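
# Streaming in 1024-byte blocks via iter_content avoids holding the whole
# archive in memory while it downloads.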

def read_all_cities_into_dict():
    print('Reading city data from files ...')
    all_cities = {}
    # All cities
    with gzip.open(city_list_gz, "rt", encoding="utf-8") as i:
        cities = json.loads(i.read())
        for city_dict in cities:
            # e.g. {"id":707860,"name":"Hurzuf","state": "","country":"UA","coord":{"lon":34.283333,"lat":44.549999}}
            if city_dict['id'] in all_cities:
                print('Warning: city ID %d was already processed! Data chunk is: %s' % (city_dict['id'], city_dict))
                continue
            else:
                country = city_dict['country']
                if country == 'US':  # if it's a US city, then take the "state" field as country
                    if city_dict['state']:
                        country = city_dict['state']
                    print(city_dict, country)  # debug trace for US remapping
                all_cities[city_dict['id']] = dict(name=city_dict['name'],
                                                   country=country,
                                                   lon=city_dict['coord']['lon'],
                                                   lat=city_dict['coord']['lat'])
    print('... done')
    return all_cities
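
# Note the US remapping above: a US city ends up keyed by its two-letter state
# code (e.g. 'CA' instead of 'US') whenever the "state" field is non-empty.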

URL_REGEX = re.compile(r"""(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])""")
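# For example, URL_REGEX.sub('', 'Atlantis http://example.com/spam') leaves
# 'Atlantis ' ('Atlantis' is a made-up entry; only the URL itself is removed,
# so the surrounding whitespace survives).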

def read_all_cities_into_lists():
    print('Reading city data from files ...')
    all_cities = []
    with gzip.open(city_list_gz, "rt", encoding="utf-8") as i:
        cities = json.loads(i.read())
        for city_dict in cities:
            # check for URLs in city details (see https://github.com/csparpa/pyowm/pull/389)
            for item in ('name', 'country', 'state'):
                item_no_url = URL_REGEX.sub('', city_dict[item])
                if city_dict[item] != item_no_url:
                    # item contains a URL, so prompt the user for a correction
                    print(
                        f'URL detected in entry [ID {city_dict["id"]!r}]'
                        f'\n\tKey: {item!r}'
                        f'\n\tValue: {city_dict[item]!r}'
                        f'\n\tSuggested correction: {item_no_url!r}'
                    )
                    prompt = input("Use suggested correction? (Yes, No, Edit) ").lower()
                    if prompt.startswith('y'):
                        city_dict[item] = item_no_url
                    elif prompt.startswith('e'):
                        city_dict[item] = input('Enter a correction: ')
            if city_dict['state'] != '':
                state = city_dict['state']
            else:
                state = None
            t = [city_dict['id'], city_dict['name'], city_dict['country'], state,
                 city_dict['coord']['lat'], city_dict['coord']['lon']]
            all_cities.append(t)
    print('... done')
    return all_cities

def order_dict_by_city_id(all_cities):
    print('Ordering city dict by city ID ...')
    all_cities_ordered = collections.OrderedDict(sorted(all_cities.items()))
    print('... done')
    return all_cities_ordered
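
# On Python 3.7+ a plain dict would also keep this sorted insertion order;
# OrderedDict is retained here for explicitness and older-interpreter support.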

def city_to_string(city_id, city_dict):
    return ','.join([city_dict['name'], str(city_id), str(city_dict['lat']), str(city_dict['lon']),
                     city_dict['country']])
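
# Caveat: the plain ','.join above does no CSV quoting, so a city name that
# itself contains a comma would yield a malformed row; csv.writer (as used in
# bz2_csv_compress below) would be the safer choice if that ever occurs.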

def split_keyset(cities_dict):
    print('Splitting keyset of %d city names into 4 subsets based on the initial letter:' % (len(cities_dict),))
    print('-> from "a" = ASCII 97 to "f" = ASCII 102')
    print('-> from "g" = ASCII 103 to "l" = ASCII 108')
    print('-> from "m" = ASCII 109 to "r" = ASCII 114')
    print('-> from "s" = ASCII 115 to "z" = ASCII 122')
    ss = [list(), list(), list(), list()]
    for city_id in cities_dict:
        name = cities_dict[city_id]['name'].lower()
        if not name:
            continue
        c = ord(name[0])
        if c < 97:  # not a lowercase letter
            pass
        elif c in range(97, 103):  # from a to f
            ss[0].append(city_to_string(city_id, cities_dict[city_id]))
        elif c in range(103, 109):  # from g to l
            ss[1].append(city_to_string(city_id, cities_dict[city_id]))
        elif c in range(109, 115):  # from m to r
            ss[2].append(city_to_string(city_id, cities_dict[city_id]))
        elif c in range(115, 123):  # from s to z
            ss[3].append(city_to_string(city_id, cities_dict[city_id]))
    print('... done')
    return ss
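
# Bucketing example: ord('a') == 97, so 'Athens' lands in ss[0], while
# 'Zagreb' (ord('z') == 122) lands in ss[3]; names that do not start with a
# lowercase ASCII letter after .lower() are silently skipped.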

def write_subsets_to_files(ssets, outdir):
    print('Writing subsets to files ...')
    filenames = ('097-102.txt', '103-108.txt', '109-114.txt', '115-122.txt')
    for sset, filename in zip(ssets, filenames):
        with codecs.open("%s%s%s" % (outdir, os.sep, filename),
                         "w", "utf-8") as f:
            for city_string in sorted(sset):
                f.write(city_string + "\n")
    print('... done')
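
# Because city_to_string puts the name first, sorted() above orders every
# chunk file alphabetically by city name rather than by city ID.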

def bz2_csv_compress(plaintext_csv, target_bz2):
    print('Compressing Bz2: %s -> %s ...' % (plaintext_csv, target_bz2))
    with open(plaintext_csv, 'r', newline='') as source:
        source_rows = csv.reader(source)
        with bz2.open(target_bz2, "wt", newline='') as file:
            writer = csv.writer(file)
            for row in source_rows:
                writer.writerow(row)
    print('... done')

def bz2_all(outdir):
    for filename in ('097-102.txt', '103-108.txt', '109-114.txt', '115-122.txt'):
        plaintext_csv = '%s%s%s' % (outdir, os.sep, filename)
        bz2_csv_compress(plaintext_csv, plaintext_csv + '.bz2')

def generate_city_id_gz_files(target_path='.'):
    target_folder = os.path.abspath(target_path)
    print('Will save output files to folder: %s' % (target_folder,))
    print('Job started')
    download_the_files()
    cities = read_all_cities_into_dict()
    ordered_cities = order_dict_by_city_id(cities)
    ssets = split_keyset(ordered_cities)
    write_subsets_to_files(ssets, target_folder)
    bz2_all(target_folder)
    print('Job finished')
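
# Usage sketch (the output folder is just an example): calling
# generate_city_id_gz_files('/tmp/owm') leaves 097-102.txt through 115-122.txt
# plus their .bz2 counterparts in /tmp/owm.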

# SQLite
def create_db_sqlite(db_path):
    # truncate any pre-existing database file
    with open(db_path, 'w') as _:
        pass
    sql_schema_statement = '''
    CREATE TABLE IF NOT EXISTS city (
        id integer NOT NULL PRIMARY KEY,
        city_id integer NOT NULL,
        name text NOT NULL,
        country text NOT NULL,
        state text,
        lat real NOT NULL,
        lon real NOT NULL
    );'''
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    cur.execute(sql_schema_statement)
    conn.commit()
    conn.close()
    print('Created SQLite empty database')
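
# The surrogate `id` column is an INTEGER PRIMARY KEY, so SQLite fills it in
# automatically (it aliases the rowid); populate_db_sqlite below therefore
# binds only the six city fields.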

def populate_db_sqlite(db_path, cities_list):
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    cur.executemany('INSERT INTO city (city_id, name, country, state, lat, lon) VALUES (?, ?, ?, ?, ?, ?)', cities_list)
    conn.commit()
    conn.close()
    print('Populated SQLite database')
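
# Each bound row is a 6-item list produced by read_all_cities_into_lists,
# e.g. [707860, 'Hurzuf', 'UA', None, 44.549999, 34.283333] for the sample
# entry (an empty "state" is mapped to None, hence the nullable column).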

def generate_sqlite_db(target_path='.'):
    DB_NAME = 'cities.db'
    target_folder = os.path.abspath(target_path)
    db_path = target_folder + os.path.sep + DB_NAME
    print('Will save output SQLite DB to folder: %s' % (target_folder,))
    print('Job started')
    download_the_files()
    cities = read_all_cities_into_lists()
    create_db_sqlite(db_path)
    populate_db_sqlite(db_path, cities)
    print('Job finished')
    print("******** DON'T FORGET TO MANUALLY BZ2 COMPRESS THE DB !!! ******** ")

if __name__ == '__main__':
    if len(sys.argv) == 2:
        target_path = sys.argv[1]
    else:
        target_path = '.'
    #generate_city_id_gz_files(target_path)
    generate_sqlite_db(target_path)