PEMS_data_preprocess_script.py
import pandas as pd
import geopandas as gpd
import numpy as np
import pickle
import torch
import sys
sys.path.append("../")
from GraphTrafficLib.utils.visual_utils import PEMS_folium_plot
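# PEMS_folium_plot is only imported for optional map visualisation of the sensors;
# it is not called anywhere in this preprocessing script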
raw_data_folder = "../datafolder/rawdata/pems"
proc_data_folder = "../datafolder/procdata/pems_data"
# The pre-split train/val/test data is stored in numpy zip (.npz) files
pems_train = np.load(f"{raw_data_folder}/train.npz")
pems_val = np.load(f"{raw_data_folder}/val.npz")
pems_test = np.load(f"{raw_data_folder}/test.npz")
x_train = pems_train["x"]
y_train = pems_train["y"]
x_val = pems_val["x"]
y_val = pems_val["y"]
x_test = pems_test["x"]
y_test = pems_test["y"]
# Concatenate the inputs (x) and targets (y) back along the time axis to match our data format
train_data = np.concatenate([x_train, y_train], axis=1)
val_data = np.concatenate([x_val, y_val], axis=1)
test_data = np.concatenate([x_test, y_test], axis=1)
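# Note: we assume the standard DCRNN-style splits, where x and y each have shape
# (num_samples, 12, num_sensors, num_features), so each concatenated window covers
# 24 consecutive 5-minute time steps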
np.save(f"{proc_data_folder}/train_data.npy", train_data)
np.save(f"{proc_data_folder}/val_data.npy", val_data)
np.save(f"{proc_data_folder}/test_data.npy", test_data)
# The pairwise distances between sensors are provided in a separate csv
distances_df = pd.read_csv(
    f"{raw_data_folder}/distances_bay_2017.csv", header=None, names=["from", "to", "dist"]
)
# Remove self loops
distances_df = distances_df[distances_df.dist != 0]
# Load their sensor-id-to-index lookup and their precomputed adjacency matrix
with open(f"{raw_data_folder}/adj_mx_bay.pkl", "rb") as f:
    sensor_ids, sensor_id_to_ind, adj_mx = pickle.load(f, encoding="latin1")
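# Subtract the identity to remove self loops (PEMS-BAY has 325 sensors, hence np.eye(325))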
adj_mx_no_loop = adj_mx - np.eye(325)
# We also build the reverse lookup (index to sensor id)
sensor_ind_to_id = {v: k for k, v in sensor_id_to_ind.items()}
# The locations of the sensors can be found in this csv
location_df = pd.read_csv(
    f"{raw_data_folder}/graph_sensor_locations_bay.csv",
    header=None,
    names=["id", "lat", "lon"],
)
location_df = location_df.set_index("id")
gdf = gpd.GeoDataFrame(
    location_df, geometry=gpd.points_from_xy(location_df.lon, location_df.lat), crs="EPSG:4326"
)
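# The GeoDataFrame is only used for visual inspection of the sensor locations
# (e.g. with the PEMS_folium_plot helper imported above) and is not saved by this script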
# Based on the distances we build a sparse spatial adjacency matrix: each sender is connected
# to its closest receiver, skipping pairs that are already connected in the opposite direction
spatial_adj_matrix = np.zeros_like(adj_mx_no_loop)
problem_list = []
for sender in distances_df["from"].unique():
    sender_ind = sensor_id_to_ind[str(sender)]
    receivers = distances_df.loc[(distances_df["from"] == sender)].sort_values("dist")["to"]
    # Try the closest receiver first
    for receiver in receivers:
        receiver_ind = sensor_id_to_ind[str(receiver)]
        if spatial_adj_matrix[receiver_ind, sender_ind] != 1:
            spatial_adj_matrix[sender_ind, receiver_ind] = 1
            break
        else:  # the reverse edge already exists, so fall back to the next closest receiver
            problem_list.append(sender)
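# Symmetrise the matrix so every edge is undirected, then clip any doubled entries back to 1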
spatial_adj_matrix += spatial_adj_matrix.T
spatial_adj_matrix[spatial_adj_matrix != 0] = 1
spatial_adj_tensor = torch.Tensor(spatial_adj_matrix)
# Create empty adj
empty_adj = np.zeros_like(spatial_adj_matrix)
# Create full adj
full_adj = np.ones_like(spatial_adj_matrix) - np.eye(len(spatial_adj_matrix))
# We save the adj matrices
np.save(f"{proc_data_folder}/approx_local_adj.npy", adj_mx_no_loop)
np.save(f"{proc_data_folder}/sparse_local_adj.npy", spatial_adj_matrix)
np.save(f"{proc_data_folder}/pems_full_adj.npy", full_adj)
np.save(f"{proc_data_folder}/pems_empty_adj.npy", empty_adj)