step_01_05_data_generation.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: matthew
"""
import numpy as np
import numpy.ma as ma
import pickle
import sys
import glob
import os
from pathlib import Path
import shutil
import pdb
#%% MEG debug import
sys.path.append("/home/matthew/university_work/crucial_1000gb/python_stuff/python_scripts")
from small_plot_functions import matrix_show, quick_linegraph
import matplotlib.pyplot as plt
#%% 0: Things to set
dependency_paths = {#'syinterferopy' : '/home/matthew/university_work/crucial_1000gb/15_my_software_releases/SyInterferoPy-3.0.1/', # Available from Github: https://github.com/matthew-gaddes/SyInterferoPy
'syinterferopy' : '/home/matthew/university_work/crucial_1000gb/01_blind_signal_separation_python/SyInterferoPy',
'srtm_dem_tools' : '/home/matthew/university_work/crucial_1000gb/15_my_software_releases/SRTM-DEM-tools-2.1.1/', # Available from Github: https://github.com/matthew-gaddes/SRTM-DEM-tools
'volcnet' : '/home/matthew/university_work/crucial_1000gb/13_volcnet', # Available from Github: https://github.com/matthew-gaddes/VolcNet
'deep_learning_tools' : '/home/matthew/university_work/crucial_1000gb/20_deep_learning_tools'} # Available from Github: https://github.com/matthew-gaddes/Deep-Learning-Tools
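# Optional sanity check (not part of the original workflow): the sys.path.append / import calls
# further down assume these directories exist on this machine, so warn early if a path is wrong
# rather than failing later with a confusing ModuleNotFoundError.
for dependency_name, dependency_path in dependency_paths.items():
    if not Path(dependency_path).exists():
        print(f"Warning: the path set for '{dependency_name}' ({dependency_path}) does not exist on this machine. ")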
project_outdir = Path('./')
#step 01 (creating DEMs):
SRTM_dem_settings = {'SRTM1_or3' : 'SRTM3', # 1 arc second (SRTM1) is not yet supported.
'water_mask_resolution' : 'f', # 'c', 'i', 'h' or 'f' (lowest to highest resolution, highest can be very slow)
'SRTM3_tiles_folder' : './SRTM3/', # folder for DEM tiles.
'download' : True, # If tile is not available locally, try to download it
'void_fill' : True, # some tiles contain voids which can be filled (slow)
'side_length' : 40e3} # the side length in metres of the DEM. To allow for different crops of this, it should be somewhat bigger than 224 (the number of pixels) x 90 (pixel size) ~ 20e3
n_volcanoes_used = 512 # there are 512 volcanoes in the Smithsonian file, but to speed up making the DEMs not all of these need to be used.
#step 02 (making 20000 synthetic interferograms):
ifg_settings = {'n_per_file' : 500} # number of ifgs per data file.
synthetic_ifgs_n_files = 40 # number of files of synthetic data
synthetic_ifgs_settings = {'defo_sources' : ['dyke', 'sill', 'no_def'], # deformation patterns that will be included in the dataset.
'n_ifgs' : ifg_settings['n_per_file'], # the number of synthetic interferograms to generate PER FILE
'n_pix' : 224, # number of 3 arc second pixels (~90m) in x and y direction
'outputs' : ['u'], # channel outputs. u = unwrapped.
'intermediate_figure' : False, # if True, a figure showing the steps taken during creation of each ifg is displayed.
'cov_coh_scale' : 5000, # The length scale of the incoherent areas, in metres. A smaller value creates smaller patches, and a larger one creates larger patches.
'coh_threshold' : 0.7, # if 1, there are no areas of incoherence, if 0 all of ifg is incoherent.
'min_deformation' : 0.05, # deformation pattern must have a signal of at least this many metres.
'max_deformation' : 0.25, # deformation pattern must have a signal no bigger than this many metres.
'snr_threshold' : 2.0, # signal to noise ratio (deformation vs turbulent and topo APS) to ensure that deformation is visible. A lower value creates more subtle deformation signals.
'turb_aps_mean' : 0.02, # turbulent APS will have, on average, a maximum strength of this value in metres (e.g. 0.02 = 2 cm)
'turb_aps_length' : 5000} # turbulent APS will be correlated on this length scale, in metres.
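# Note: with these settings the total number of synthetic data is ifg_settings['n_per_file'] (500)
# x synthetic_ifgs_n_files (40) = 20000, matching the figure given in the step 02 comment above.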
#step 03 (create real interferograms from VolcNet data):
volcnet_def_min = 0.05 # in m, real ifg must have a signal bigger than this to be classed as deformation.
file_batch_size = 27 # number of files to open and merge at one time. Depends on amount of RAM available.
n_files_test = 2 # to be used for testing
#step 04: Augment real data
augmentation_factor = 2 # factor to augment by. E.g. if set to 10 and there are 30 data, there will be 300 augmented data.
# step 05 (merge synthetic and real, and rescale to desired range)
cnn_settings = {'input_ranges' : [{'min':-1, 'max':1}, # used by VGG16, ResNet, Inception
# {'min':0, 'max':1}, #
{'min':0, 'max':255}]} # used by EfficientNet
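# For reference, a minimal sketch of the kind of linear rescaling these input ranges imply.
# The real work is done later by merge_and_rescale_data / rescale_data from vudlnet21, whose
# internals may differ; this helper is illustrative only and is not called anywhere below.
def rescale_to_range_sketch(X, out_min, out_max):
    """Linearly map the values of array X onto the interval [out_min, out_max]."""
    X_01 = (X - X.min()) / (X.max() - X.min())          # first map to [0, 1]
    return (X_01 * (out_max - out_min)) + out_min       # then stretch and shift to the target range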
print("\n\nSetting a random seed\n\n")
np.random.seed(0) # 0 used in the example
#%% Import dependencies (paths set above)
from vudlnet21.file_handling import open_smithsonian_csv_file, merge_and_rescale_data, shuffle_data_pkls
from vudlnet21.plotting import plot_data_class_loc_caller, open_datafile_and_plot
from vudlnet21.augmentation import augment_data
sys.path.append(dependency_paths['srtm_dem_tools'])
from dem_tools_lib import SRTM_dem_make_batch # From SRTM dem tools
sys.path.append(dependency_paths['syinterferopy'])
import syinterferopy
from syinterferopy.random_generation import create_random_synthetic_ifgs # From SyInterferoPy
sys.path.append(dependency_paths['deep_learning_tools'])
import deep_learning_tools
#from deep_learning_tools.file_handling import
sys.path.append(dependency_paths['volcnet'])
import volcnet
#from volcnet.plotting import volcnet_ts_visualiser
from volcnet.plotting import plot_volcnet_files_labels
from volcnet.aux import ll_2_pixel
from volcnet.labelling import label_volcnet_files
from volcnet.creating_ifgs import create_volcnet_ifgs
# #%% 1: Create or load DEMs for the volcanoes to be used for synthetic data.
# print("\nStep 01: Creating or loadings DEMs")
# volcanoes = open_smithsonian_csv_file(project_outdir / 'step_01_dem_data/smithsonian_name_lat_lon.csv', side_length=SRTM_dem_settings['side_length']) # convert the csv file into a list with a dict for each volcano
# print(f"{len(volcanoes)} volcanoes have been loaded from the .csv file from the Smithsonian, " # update terminal
# f"and the first {n_volcanoes_used} are being used. ")
# volcanoes = volcanoes[:n_volcanoes_used] # crop number of volcanoes as required.
# try:
# print('Trying to open a .pkl of the DEMs... ', end = '')
# with open(project_outdir / 'step_01_dem_data/volcano_dems.pkl', 'rb') as f:
# volcano_dems = pickle.load(f)
# f.close()
# print('Done. ')
# except:
# print('Failed. Generating them from scratch, which can be slow. ')
# del SRTM_dem_settings['side_length'] # this key is no longer needed, so delete.
# volcano_dems = SRTM_dem_make_batch(volcanoes, **SRTM_dem_settings) # make the DEMS
# with open(project_outdir / 'step_01_dem_data/volcano_dems.pkl', 'wb') as f:
# pickle.dump(volcano_dems, f)
# f.close()
# print('Saved the dems as a .pkl for future use. ')
# #%% possibly use to edit synthetic ifgs.
# # outdir = Path("/home/matthew/university_work/crucial_1000gb/02_neural_network_python/03_VUDLNet_21_github_repo/02_full_size_example/step_02_synthetic_data_2")
# # data_files = glob.glob(str(Path(project_outdir / "step_02_synthetic_data/*.pkl"))) #
# # for data_file in data_files:
# # with open(data_file, 'rb') as f: # open the file
# # X = pickle.load(f) # and extract data (X) and labels (Y)
# # Y_class = pickle.load(f)
# # Y_loc = pickle.load(f)
# # with open(outdir / data_file.split('/')[-1], 'wb') as f: # open the file
# # pickle.dump(X[:,:,:,np.newaxis], f)
# # pickle.dump(Y_class, f)
# # pickle.dump(Y_loc, f)
# # sys.exit()
# #%% 2: Create or load the synthetic interferograms.
# print("\nStep 02: Creating or loading synthetic interferograms")
# n_synth_data = ifg_settings['n_per_file'] * synthetic_ifgs_n_files
# print('Determining if files containing the synthetic deformation patterns exist... ', end = '')
# data_files = glob.glob(str(Path(project_outdir / "step_02_synthetic_data/*.pkl"))) #
# if len(data_files) == synthetic_ifgs_n_files:
# print(f"The correct number of files were found ({synthetic_ifgs_n_files}) so no new ones will be generated. "
# f"However, this doesn't guarantee that the files were made using the settings in synthetic_ifgs_settings. Check synth_data_settings.txt to be sure. ")
# else:
# print(f"{len(data_files)} files were found, but {synthetic_ifgs_n_files} were requested. "
# f"The directory containing these (./step_02_synthetic_data/)will be deleted, and the correct number of files generated. ")
# #pdb.set_trace()
# answer = input("Do you wish to continue ('y' or 'n')?")
# if answer == 'n':
# sys.exit()
# elif answer == 'y':
# try:
# shutil.rmtree(str(project_outdir / "step_02_synthetic_data"))
# except:
# pass
# os.mkdir(project_outdir / "step_02_synthetic_data")
# for file_n in range(synthetic_ifgs_n_files):
# print(f"Generating file {file_n} of {synthetic_ifgs_n_files} files. ")
# X_all, Y_class_categorical, Y_loc, Y_source_kwargs = create_random_synthetic_ifgs(volcano_dems, **synthetic_ifgs_settings)
# Y_class = np.zeros((Y_class_categorical.shape[0], len(synthetic_ifgs_settings['defo_sources'])), dtype='float32') # initialise array to store one-hot encodings.
# for data_n, Y_class_one in enumerate(Y_class_categorical): # loop through categorical labels
# Y_class[data_n, int(Y_class_one[0])] = 1 # and convert to one hot
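# # Note: the loop above is just a one-hot encoding; assuming Y_class_categorical is an
# # (n_data, 1) array of integer class labels, an equivalent numpy-only version would be:
# # Y_class = np.eye(len(synthetic_ifgs_settings['defo_sources']), dtype='float32')[Y_class_categorical[:, 0].astype(int)]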
# with open(project_outdir / 'step_02_synthetic_data' / f'data_file_{file_n:05d}.pkl', 'wb') as f:
# pickle.dump(X_all[synthetic_ifgs_settings['outputs'][0]], f) # usual output style is many channel formats in a dict, but we are only interested in the one we generated.
# pickle.dump(Y_class, f)
# pickle.dump(Y_loc, f)
# f.close()
# del X_all, Y_class, Y_loc
# with open(project_outdir / "step_02_synthetic_data" / "synth_data_settings.txt", 'w') as f: # output the settings as a text file so that we know how data were generated in the future.
# print(f"Number of data per file : {ifg_settings['n_per_file']}" ,file = f)
# print(f"Number of files: {synthetic_ifgs_n_files}" ,file = f)
# for key in synthetic_ifgs_settings:
# print(f"{key} : {synthetic_ifgs_settings[key]}", file = f)
# else:
# print(f"Answer ({answer}) was not understood as either 'y' or 'n' so exiting to err on the side of caution")
# sys.exit()
# open_datafile_and_plot(project_outdir / "step_02_synthetic_data" / "data_file_00000.pkl", n_data = 15, window_title ='01 Sample of synthetic data') # open and plot the data in 1 file
# #%% Step 03: Create real data (interferograms) from the VolcNet data
# print(f"\nStep 03: Creating labels for all possible VolcNet data. ")
# volcnet_files = sorted(glob.glob(dependency_paths['volcnet'] + '/*.pkl')) # get the paths to the VolcNet files.
# # from operator import itemgetter
# # print(f"only using some of the VolcNet files! ")
# # volcnet_files = itemgetter(1,2,13)(volcnet_files)
# labels_dyke, labels_sill, labels_atmo = label_volcnet_files(volcnet_files, def_min = volcnet_def_min) # label all the VolcNet data - slow!
# plot_volcnet_files_labels(volcnet_files, labels_dyke, labels_sill, labels_atmo) # figure with two subplots showing how many of each label etc.
# n_min_real = np.min([labels_dyke.shape[0], labels_sill.shape[0], labels_atmo.shape[0]]) # get the minimum number of data in a class
# np.random.shuffle(labels_dyke) # shuffle along first axis
# np.random.shuffle(labels_sill)
# np.random.shuffle(labels_atmo)
# labels_dyke_crop = labels_dyke[:n_min_real, ] # crop number of data so all the same size (the minimum)
# labels_sill_crop = labels_sill[:n_min_real, ]
# labels_atmo_crop = labels_atmo[:n_min_real, ]
# plot_volcnet_files_labels(volcnet_files, labels_dyke_crop, labels_sill_crop, labels_atmo_crop) # same plot as before but for subset of data.
# labels_all = np.vstack((labels_dyke_crop, labels_sill_crop, labels_atmo_crop)) # combine to one array for the data that will be used (same number of data in each class)
# args_sorted = np.argsort(labels_all[:,0]) # get the args to sort the first column (which is the volcnet file number)
# labels_all = labels_all[args_sorted,] # sort so that data are in order by VolcNet file.
# del labels_dyke, labels_sill, labels_atmo, n_min_real, labels_dyke_crop, labels_sill_crop, labels_atmo_crop, args_sorted
# create_volcnet_ifgs(labels_all, volcnet_files, project_outdir / "step_03_labelled_volcnet_data" , n_data_per_file = ifg_settings['n_per_file'], # create the desired ifgs.
# ny = synthetic_ifgs_settings['n_pix'], nx = synthetic_ifgs_settings['n_pix'], volcnet_def_min = volcnet_def_min)
# #open_datafile_and_plot(project_outdir / "step_03_labelled_volcnet_data" / "data_file_unshuffled_00015.pkl", n_data = 15, window_title = '03 Sample of augmented real data')
# unshuffled_files = sorted(glob.glob(str(project_outdir / "step_03_labelled_volcnet_data" / '*.pkl'))) # get the paths to the unshuffled files of labelled data.
# shuffle_data_pkls(unshuffled_files, file_batch_size, outdir = project_outdir / "step_03_labelled_volcnet_data" ) # 2nd arg is number of files to be opened at once. Big number needs lots of RAM
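# # Roughly, shuffle_data_pkls (from vudlnet21.file_handling) opens file_batch_size of the unshuffled
# # .pkl files at a time, concatenates X / Y_class / Y_loc along the first axis, applies one random
# # permutation to all three so data and labels stay paired (e.g. order = np.random.permutation(X.shape[0]);
# # X, Y_class, Y_loc = X[order], Y_class[order], Y_loc[order]), and writes the result back out in files
# # of n_per_file data; this is only a sketch of the idea and the exact internals may differ.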
# open_datafile_and_plot(project_outdir / "step_03_labelled_volcnet_data" / "data_file_shuffled_00000.pkl",
# n_data = 45, window_title = '03 Sample of VolcNet data')
# # Separate some files for testing.
# shuffled_files = sorted(glob.glob(str(project_outdir / "step_03_labelled_volcnet_data" / '*.pkl'))) # get the paths to the shuffled files of labelled data.
# test_files = shuffled_files[-n_files_test:]
# for test_file in test_files:
# parts = list(Path(test_file).parts)
# outfile = project_outdir / "step_03_labelled_volcnet_data_testing" / parts[-1]
# shutil.move(test_file, outfile)
# open_datafile_and_plot(project_outdir / "step_03_labelled_volcnet_data_testing" / "data_file_shuffled_00026.pkl", n_data = 180, window_title = '03 Sample of VolcNet testing data')
# #%% Step 04: Augment the real (VolcNet) data.
# labelled_volcnet_files = sorted(glob.glob(str(project_outdir / "step_03_labelled_volcnet_data" / f"*.pkl"))) # get the paths to each file of real data
# file_n = 0
# for labelled_volcnet_file in labelled_volcnet_files:
# print(f' Opening and augmenting {labelled_volcnet_file} ... ', end = '')
# with open(labelled_volcnet_file, 'rb') as f: # open the real data file
# X = pickle.load(f)
# Y_class = pickle.load(f)
# Y_loc = pickle.load(f)
# X_aug, Y_class_aug, Y_loc_aug = augment_data(X, Y_class, Y_loc, n_data = X.shape[0] * augmentation_factor, rotate = False, translate=False)
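# # Note: with n_data = X.shape[0] * augmentation_factor, X_aug (and the two label arrays) hold
# # augmentation_factor times as many data as the input file; the loop below slices them into
# # augmentation_factor output files of ifg_settings['n_per_file'] data each, which assumes each
# # input file from step 03 also holds exactly n_per_file data (as set when they were created).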
# for i in range(augmentation_factor):
# with open(project_outdir / "step_04_augmented_labelled_volcnet_data" / f"data_file_shuffled_{file_n:05d}.pkl", 'wb') as f: # save the output as a pickle
# pickle.dump(X_aug[i * ifg_settings['n_per_file']: (i+1) * ifg_settings['n_per_file'], ], f)
# pickle.dump(Y_class_aug[i * ifg_settings['n_per_file']: (i+1) * ifg_settings['n_per_file'], ], f)
# pickle.dump(Y_loc_aug[i * ifg_settings['n_per_file']: (i+1) * ifg_settings['n_per_file'], ], f)
# file_n += 1
# print("done. ")
# open_datafile_and_plot(project_outdir / "step_04_augmented_labelled_volcnet_data" / "data_file_shuffled_00000.pkl", n_data = 45, window_title = '04 Sample of augmented VolcNet data')
#%% 5: Merge real and synthetic data, and rescale to desired range
print("\nStep 04: Mergring the real and synthetic interferograms and rescaling to CNNs input range.")
synthetic_data_files = sorted(glob.glob(str(project_outdir / "step_02_synthetic_data" / f"*.pkl"))) # get the paths to each file of synthetic data
real_data_files = sorted(glob.glob(str(project_outdir / "step_04_augmented_labelled_volcnet_data" / f"*.pkl"))) # get the paths to each file of real data
if len(synthetic_data_files) > len(real_data_files):                            # determine which list of files is longer
    synthetic_data_files = synthetic_data_files[:len(real_data_files)]          # and crop it so both lists are the same length
else:
    real_data_files = real_data_files[:len(synthetic_data_files)]
for input_range in cnn_settings['input_ranges']:
print(f"Saving the data with the range rescaled to {input_range}")
outdir = project_outdir / f"step_05_merged_rescaled_data_range_{input_range['min']}_{input_range['max']}"
merge_and_rescale_data(synthetic_data_files, real_data_files, outdir,
input_range, triplicate_channel = True) # merge the real and synthetic data, and rescale it into the correct range for use with the CNN
open_datafile_and_plot(outdir / "data_file_00000.npz", n_data = 15, window_title = f"05 Sample of merged and rescaled data (range {input_range['min']} {input_range['max']})")
#%% 5a: Rescale only the synthetic data
from vudlnet21.file_handling import rescale_data
synthetic_data_files = sorted(glob.glob(str(project_outdir / "step_02_synthetic_data" / f"*.pkl"))) # get the paths to each file of synthetic data
for input_range in cnn_settings['input_ranges']:
outdir = project_outdir / f"step_05a_merged_rescaled_data_synth_only_range_{input_range['min']}_{input_range['max']}"
rescale_data(synthetic_data_files, outdir, input_range, triplicate_channel = True)
open_datafile_and_plot(outdir / "data_file_00000.npz", n_data = 15, window_title = f"05 Sample of merged and rescaled syntehtic data (range {input_range['min']} {input_range['max']})")