create-prod-dataset.py
# Read the V-Dem-CY-Core-v14 dataset, extract the relevant data, and save it as JSON suitable for Nivo
import pandas as pd
import json
import numpy as np
print("\nGenerating democracy data json for production...\n")
df = pd.read_csv("V-Dem-CY-Core-v14.csv")
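# Keep the country identifiers, the year, and the five high-level V-Dem democracy indices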
df = df[
    [
        "country_name",
        "country_text_id",
        "year",
        "v2x_polyarchy",
        "v2x_libdem",
        "v2x_partipdem",
        "v2x_delibdem",
        "v2x_egaldem",
    ]
]
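# Keep only countries whose most recent observation is 2023, and drop rows before 1900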
df = df[
    (df.groupby("country_name")["year"].transform(max) == 2023) & (df["year"] >= 1900)
]
# Load the GeoJSON data to compare country names
with open("ne_110m_admin_0_countries.json", "r") as file:
    geojson_data = json.load(file)
geojson_countries = {
    feature["properties"]["NAME"] for feature in geojson_data["features"]
}
dataset_countries = set(df["country_name"].unique())
unmatched_countries = dataset_countries - geojson_countries
print("Countries in dataset not found in GeoJSON:", unmatched_countries, "\n")
# Find country names that are present in the GeoJSON but not in the dataset
missing_countries = geojson_countries - dataset_countries
print("Countries in GeoJSON not found in dataset:", missing_countries, "\n")
# Rename specific countries in the dataset to match the GeoJSON naming conventions
# TODO What to do with Palestine?
rename_map = {
    "Bosnia and Herzegovina": "Bosnia and Herz.",
    "Burma/Myanmar": "Myanmar",
    "Democratic Republic of the Congo": "Dem. Rep. Congo",
    "Republic of the Congo": "Congo",
    "Central African Republic": "Central African Rep.",
    "Dominican Republic": "Dominican Rep.",
    "Equatorial Guinea": "Eq. Guinea",
    "Eswatini": "Swaziland",
    "The Gambia": "Gambia",
    "Ivory Coast": "Côte d'Ivoire",
    "North Macedonia": "Macedonia",
    "South Sudan": "S. Sudan",
    "Solomon Islands": "Solomon Is.",
    "Türkiye": "Turkey",
}
df["country_name"] = df["country_name"].replace(rename_map)
# Re-check for unmatched countries after renaming
dataset_countries_updated = set(df["country_name"].unique())
unmatched_countries_updated = dataset_countries_updated - geojson_countries
print(
    "Countries in dataset not found in GeoJSON after renaming:",
    unmatched_countries_updated,
    "\n",
)
# Re-check which GeoJSON countries are still missing from the dataset after renaming
geojson_countries_updated = geojson_countries - dataset_countries_updated
print(
    "Countries in GeoJSON not found in dataset after renaming:",
    geojson_countries_updated,
    "\n",
)
# Rename specific text ids in the dataset to match the GeoJSON ADM0_A3 codes
rename_text_id_map = {"SML": "SOL", "SSD": "SDS", "XKX": "KOS"}
df["country_text_id"] = df["country_text_id"].replace(rename_text_id_map)
indices = [
    "v2x_polyarchy",
    "v2x_libdem",
    "v2x_partipdem",
    "v2x_delibdem",
    "v2x_egaldem",
]
index_labels = {
    "v2x_polyarchy": "Electoral",
    "v2x_libdem": "Liberal",
    "v2x_partipdem": "Participatory",
    "v2x_delibdem": "Deliberative",
    "v2x_egaldem": "Egalitarian",
}
new_rows = []
grouped = df.groupby("country_name")
# Fill gaps in the data: add placeholder rows (-1) for missing years; these become null in the JSON
for country, group in grouped:
    all_years = range(group["year"].min(), 2023)
    existing_years = set(group["year"])
    missing_years = [year for year in all_years if year not in existing_years]
    for missing_year in missing_years:
        row_data = {
            "country_name": country,
            "country_text_id": group["country_text_id"].iloc[0],
            "year": missing_year,
        }
        for idx in indices:
            row_data[idx] = -1
        new_rows.append(row_data)
# Append new rows to the original DataFrame and sort
new_df = pd.concat([df, pd.DataFrame(new_rows)])
new_df = new_df.sort_values(by="year")
df = new_df
result = []
grouped = df.groupby("country_name")
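# Build one Nivo line series per country and per index; -1 placeholder years become null data points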
for country, group in grouped:
    for index in indices:
        series_data = {
            "id": f"{country}_{index}",
            "ISO": group["country_text_id"].iloc[0],
            "label": f"{country} - {index_labels[index]}",
            "data": [
                {"x": row["year"], "y": None if row[index] == -1 else row[index]}
                for _, row in group.iterrows()
                if not np.isnan(row[index])
            ],
        }
        result.append(series_data)
# Calculate world averages for each index and each year, rounded to 3 decimal places
world_averages = df.groupby("year")[indices].mean().reset_index()
for index in indices:
    series_data = {
        "id": f"World average_{index}",
        "ISO": "WORLD",
        "label": f"World average - {index_labels[index]}",
        "data": [
            {"x": row["year"], "y": round(row[index], 3)}
            for _, row in world_averages.iterrows()
            if not np.isnan(row[index])
        ],
    }
    result.append(series_data)
# Load the GeoJSON data to check ISO codes
with open("app/src/geojson_features.json", "r") as file:
    geojson_data = json.load(file)
# Extract the ADM0_A3 codes and country names from the GeoJSON
geojson_iso_to_country = {
    feature["properties"]["ADM0_A3"]: feature["properties"]["NAME"]
    for feature in geojson_data["features"]
}
# Check for mismatches between dataset ISO codes and GeoJSON ADM0_A3 codes
iso_mismatches = []
for series in result:
    iso_code = series["ISO"]
    country_name = series["label"].split(" - ")[0]
    if iso_code not in geojson_iso_to_country:
        iso_mismatches.append((iso_code, country_name, "ISO code not found in GeoJSON"))
    elif geojson_iso_to_country[iso_code] != country_name:
        iso_mismatches.append(
            (
                iso_code,
                country_name,
                f"Mismatch: GeoJSON country is {geojson_iso_to_country[iso_code]}",
            )
        )
# Print mismatches
iso_mismatches = list(set(iso_mismatches))
if iso_mismatches:
    print("There are mismatches in ISO codes:")
    for mismatch in iso_mismatches:
        print(mismatch)
else:
    print("All ISO codes correctly map to the country names in the GeoJSON.")
with open("app/src/prod-dataset.json", "w") as f:
    json.dump(result, f)
# Create another dataset with country averages over all five indexes
result = []
grouped = df.groupby("country_name")
for country, group in grouped:
    series_data = {
        "id": f"{country}_overall",
        "ISO": group["country_text_id"].iloc[0],
        "label": f"{country} - Overall",
        "data": [],
    }
    for year, year_group in group.groupby("year"):
        average_index = year_group[indices].mean(axis=1).mean()
        if not np.isnan(average_index):
            series_data["data"].append({"x": year, "y": round(average_index, 3)})
    result.append(series_data)
with open("app/src/prod-dataset-country-averages.json", "w") as f:
    json.dump(result, f)
print("\nGeneration finished.\n")