# Learn2.py
# George Miloshevich 2021
# Perform training
# Import libraries
import os
import sys
sys.path.insert(1, '../ERA')
from ERA_Fields import *  # general routines
from TF_Fields import *   # tensorflow routines
import time
import shutil
import gc
import psutil
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline
from operator import mul
from functools import reduce
def custom_CNN(model_input_dim):  # this CNN follows https://www.tensorflow.org/tutorials/images/cnn
    model = models.Sequential()
    model.add(layers.Conv2D(32, (3, 3), input_shape=model_input_dim))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(SpatialDropout2D(0.2))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3)))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(SpatialDropout2D(0.2))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3)))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(SpatialDropout2D(0.2))
    model.add(layers.Flatten())
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(2))  # two output logits, no softmax (see from_logits=True below)
    return model
def probability_model(inputs, input_model):  # append a softmax to the output of the trained network
    x = input_model(inputs)
    outputs = layers.Softmax()(x)
    return keras.Model(inputs, outputs)
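
# Illustrative (commented) usage sketch of probability_model, assuming `inputs` is the
# list of keras Input layers a trained `model` was built on and `X_test` is prepared data:
#   prob_model = probability_model(inputs, model)
#   probs = prob_model.predict(X_test)  # shape (n_samples, 2), rows sum to 1
# Column 1 of probs is then the predicted probability of the rare (extreme) class.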
def CNN_layers(input_1):  # this CNN is inspired by https://www.tensorflow.org/tutorials/images/cnn
    x = layers.Conv2D(32, (3, 3))(input_1)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = SpatialDropout2D(0.2)(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(64, (3, 3))(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = SpatialDropout2D(0.2)(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(64, (3, 3))(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = SpatialDropout2D(0.2)(x)
    return layers.Flatten()(x)
def bottom_layers(input_1):
    x = layers.Dense(64, activation='relu')(input_1)
    x = layers.Dropout(0.2)(x)
    return layers.Dense(2)(x)
from tensorflow.keras.regularizers import l2
import math

def create_regularized_model(factor, rate, inputshape=(1,)):  # note: `rate` is currently unused
    model = tf.keras.models.Sequential([
        #tf.keras.layers.Flatten(input_shape=(8, 8)),  # if the model has a tensor input
        tf.keras.layers.Input(shape=inputshape),  # if the model has a flat input
        tf.keras.layers.Dense(2, kernel_regularizer=l2(factor))
    ])
    return model
def PrepareData(creation=[]):  # if creation is not specified, a new folder is created automatically; otherwise it should correspond to the folder we are running the file from
    sampling = ''  # '3hrs' # choose between daily sampling ('') and 3-hour sampling; the corresponding NetCDF files are kept in different places
    percent = 5  # 1
    timesperday = 8  # number of 3-hour periods per day, in case that sampling is chosen
    if sampling == '3hrs':
        T = 14*timesperday
    else:
        T = 14
    tau = 0  # -5 # lag
    usepipelines = False  # if True, Dataset.from_tensor_slices will be used. This is a more advanced method but it takes more RAM and there is a possibility of memory leaks when repeating training for cross-validation
    fullmetrics = True  # if True, MCC and the confusion matrix are evaluated during training. This makes training slower!
    Model = 'Plasim'
    area = 'France'
    lon_start = 0
    lon_end = 128
    lat_start = 0  # latitudes start from 90 degrees at the North Pole
    lat_end = 22
    #myscratch='/scratch/gmiloshe/PLASIM/'  # where we access .py files and save output
    mylocal = '/local/gmiloshe/PLASIM/'  # where we keep large datasets that need to be loaded
    myscratch = TryLocalSource(mylocal)  # check whether the data is missing locally and can be found in some other source
    #myscratch=mylocal
    new_mixing = False  # if True, the undersampling will also follow suit
    num_years = 8000  # number of years of the simulation used for the analysis
    # If an integer >= 1 is chosen we simply undersample by this rate.
    # If a float between 0 and 1 is chosen we select each state with the probability given by this float.
    undersampling_factor = 10  # 1 #15 #10 #5 #1 #0.25 #0.15 #0.25
    oversampling_factor = 1  # oversampling_factor = 1 means that oversampling will not be performed
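    # Worked example of the two undersampling regimes (illustrative comment): with
    # percent = 5 the natural minority/majority ratio is 5/95 = 1/19, and (for
    # oversampling_factor = 1) the pipeline in the main block uses
    # undersampling_strategy = undersampling_factor/19. Hence undersampling_factor = 10
    # thins the majority class until minority/majority = 10/19, i.e. extremes make up
    # (10/19)/(1+10/19) ~ 34% of the training set. A float factor < 1 is instead applied
    # as a class weight inside model.fit (see the class_weight branch below).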
    thefield = 't2m'  # important: this is the field that is used to determine the extrema (important for undersampling) and therefore the label space
    BATCH_SIZE = 1024  # chosen for training so that the chance of encountering positive batches is non-negligible
    NUM_EPOCHS = 100  # 1000 #20 #200 #50 # number of epochs the training involves
    saveweightseveryblaepoch = 1  # if 0 the model will not save weights during training; otherwise this is the number of epochs it waits between saves
    if saveweightseveryblaepoch > 0:
        ckpt = 'ckpt'
    else:
        ckpt = ''
    #checkpoint_name = myscratch+'training/stack_CNN_equalmixed_'+ckpt+'_'+thefield+'France_'+'_with_mrsoArea_'+sampling+'_u'+str(undersampling_factor)+'o'+str(oversampling_factor)+'_LONG'+str(num_years)+'yrs_'+'_per_'+str(percent)+'_tau_'+str(tau)
    #checkpoint_name = myscratch+'training/stack_CNNdeepwide2maxpool_equalmix_'+ckpt+'_'+thefield+'France_'+'_with_t2m_zg500_mrsoFrance_'+sampling+'_22by128_u'+str(undersampling_factor)+'o'+str(oversampling_factor)+'_LONG'+str(num_years)+'yrs_'+'_per_'+str(percent)+'_tau_'+str(tau)
    #checkpoint_name = myscratch+'training/stack_CNN_equalmix_'+ckpt+'_'+thefield+'France_'+'_with_zg500_t2mmrsoFrance_'+sampling+'_22by128_u'+str(undersampling_factor)+'o'+str(oversampling_factor)+'_LONG'+str(num_years)+'yrs_'+'_per_'+str(percent)+'_finetune_tau_'+str(tau)
    #checkpoint_name = myscratch+'training/test'
    checkpoint_name_root = myscratch+'training/stack_CNN_equalmixed_'+ckpt+'_'+thefield+'T'+str(T)+'France_'+'_with_zg500_t2mmrsoFrance_'+sampling+'lr5e-5_u'+str(undersampling_factor)+'o'+str(oversampling_factor)+'_'+str(num_years)+'yrs_'+'_per_'+str(percent)+'_finetune_tau_'
    checkpoint_name = checkpoint_name_root+str(tau)
    checkpoint_name_previous = checkpoint_name_root+str(tau+1)
    print("creation = ", creation)
    if creation == []:  # if we are not running from the same directory
        if os.path.exists(checkpoint_name):  # the directory already exists
            print('folder '+checkpoint_name+' exists. Should I overwrite?')
            if input(" write Y to overwrite, else the execution will stop: ") != "Y":
                sys.exit("User has aborted the program")
        else:
            print('folder '+checkpoint_name+' created')
            os.mkdir(checkpoint_name)
        sys.stdout = Logger(checkpoint_name+'/logger.log')  # keep a copy of the print outputs there
        shutil.copy(__file__, checkpoint_name)  # copy this file to the directory of the training
        dest = shutil.copy(__file__, checkpoint_name+'/Funs.py')
        shutil.copy(myscratch+'../ERA/ERA_Fields.py', checkpoint_name)
        shutil.copy(myscratch+'../ERA/TF_Fields.py', checkpoint_name)
        shutil.copy(myscratch+'History.py', checkpoint_name)
        shutil.copy(myscratch+'Recalc_History.py', checkpoint_name)
        shutil.copy(myscratch+'Recalc_Tau_Metrics.py', checkpoint_name)
        shutil.copy(myscratch+'Metrics.py', checkpoint_name)
    #lat_from = [4,4]    # 18x42
    #lat_to   = [22,22]
    #lon_from = [101,0]
    #lon_to   = [128,15]
    lat_from = [0, 0]  # 22x128
    lat_to   = [22, 22]
    lon_from = [64, 0]
    lon_to   = [128, 64]
    print([percent, T, Model, area, undersampling_factor, lat_from, lat_to, lon_from, lon_to, thefield])
    if sampling == '3hrs':
        Months1 = [0, 0, 0, 0, 0, 0, timesperday*30, timesperday*30, timesperday*30, timesperday*30, timesperday*30, 0, 0, 0]
    else:  # if sampling == 'daily'
        Months1 = [0, 0, 0, 0, 0, 0, 30, 30, 30, 30, 30, 0, 0, 0]
    Tot_Mon1 = list(itertools.accumulate(Months1))
    time_start = Tot_Mon1[6]
    time_end = Tot_Mon1[9]  # +(Tot_Mon1[10]-Tot_Mon1[9])//2 # uncomment to use the full summer (including the portion extending into September due to the T-day window)
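    # Worked example of the month bookkeeping above (daily sampling, illustrative comment):
    #   Months1  = [0,0,0,0,0,0,30,30,30,30,30,0,0,0]
    #   Tot_Mon1 = [0,0,0,0,0,0,30,60,90,120,150,150,150,150]   (cumulative day counts)
    # so time_start = 30 and time_end = 120: the analysis window covers three of the five
    # 30-day months kept in the record, and the commented '+(Tot_Mon1[10]-Tot_Mon1[9])//2'
    # would extend the window into the following month to accommodate the T-day running
    # average.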
    if sampling == '3hrs':
        prefix = ''
        file_prefix = '../Climate/'
    else:
        prefix = 'ANO_LONG_'
        file_prefix = ''
    t2m = Plasim_Field('tas', prefix+'tas', 'Temperature', Model, lat_start, lat_end, lon_start, lon_end, 'single', sampling)  # if we want to use surface temperature
    zg500 = Plasim_Field('zg', prefix+'zg500', '500 mbar Geopotential', Model, lat_start, lat_end, lon_start, lon_end, 'single', sampling)
    #zg300 = Plasim_Field('zg',prefix+'zg300','300 mbar Geopotential', Model, lat_start, lat_end, lon_start, lon_end,'single',sampling)
    mrso = Plasim_Field('mrso', prefix+'mrso', 'soil moisture', Model, lat_start, lat_end, lon_start, lon_end, 'single', sampling)
    #ua300 = Plasim_Field('ua',prefix+'ua300','eastward wind', Model, lat_start, lat_end, lon_start, lon_end,'single',sampling)
    #va300 = Plasim_Field('va',prefix+'va300','northward wind', Model, lat_start, lat_end, lon_start, lon_end,'single',sampling)
    #hfls = Plasim_Field('hfls',prefix+'hfls','surface latent heat flux', Model, lat_start, lat_end, lon_start, lon_end,'single',sampling)
    #hfss = Plasim_Field('hfss',prefix+'hfss','surface sensible heat flux', Model, lat_start, lat_end, lon_start, lon_end,'single',sampling)
    t2m.years = 8000
    zg500.years = 8000
    mrso.years = 8000
    #ts.load_field(mylocal+file_prefix+'Data_Plasim/')  # load the data
    t2m.load_field(mylocal+file_prefix+'Data_Plasim_LONG/')  # load the data
    zg500.load_field(mylocal+file_prefix+'Data_Plasim_LONG/')
    #zg300.load_field(mylocal+file_prefix+'Data_Plasim/')
    mrso.load_field(mylocal+file_prefix+'Data_Plasim_LONG/')
    #ua300.load_field(mylocal+file_prefix+'Data_Plasim/')
    #va300.load_field(mylocal+file_prefix+'Data_Plasim/')
    #hfls.load_field(mylocal+file_prefix+'Data_Plasim/')
    #hfss.load_field(mylocal+file_prefix+'Data_Plasim/')
    LON = t2m.LON
    LAT = t2m.LAT
    print(t2m.var.dtype, zg500.var.dtype, mrso.var.dtype)
    mask, cell_area, lsm = ExtractAreaWithMask(mylocal, Model, area)  # extract the land-sea mask and multiply it by cell area
    print(mask)
    #ts.abs_area_int, ts.ano_area_int = ts.Set_area_integral(area,mask)
    t2m.abs_area_int, t2m.ano_area_int = t2m.Set_area_integral(area, mask, 'PostprocLONG')
    zg500.abs_area_int, zg500.ano_area_int = zg500.Set_area_integral(area, mask, 'PostprocLONG')
    #zg300.abs_area_int, zg300.ano_area_int = zg300.Set_area_integral(area,mask)
    mrso.abs_area_int, mrso.ano_area_int = mrso.Set_area_integral(area, mask, 'PostprocLONG')
    #ua300.abs_area_int, ua300.ano_area_int = ua300.Set_area_integral(area,mask)
    #va300.abs_area_int, va300.ano_area_int = va300.Set_area_integral(area,mask)
    #hfls.abs_area_int, hfls.ano_area_int = hfls.Set_area_integral(area,mask)
    #hfss.abs_area_int, hfss.ano_area_int = hfss.Set_area_integral(area,mask)
    # === Below we filter out just the area of France for mrso ===
    filter_mask = np.zeros((t2m.var.shape[2], t2m.var.shape[3]))  # a mask which sets all values to zero
    filter_lat_from = [13, 13]  # defining the domain of 1's
    filter_lat_to   = [17, 17]
    filter_lon_from = [-1, 0]
    filter_lon_to   = [128, 3]
    for myiter in range(len(filter_lat_from)):  # setting values to 1 in the desired domain
        filter_mask[filter_lat_from[myiter]:filter_lat_to[myiter], filter_lon_from[myiter]:filter_lon_to[myiter]] = 1
    mrso.var = mrso.var*filter_mask  # apply the filter, zeroing all values outside the domain
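    # Note on the slice choices above (illustrative comment): the 1-region wraps around
    # the longitude seam of the grid. With numpy slicing,
    #   filter_mask[13:17, -1:128] sets column 127 (the last longitude) and
    #   filter_mask[13:17,  0:3]   sets columns 0-2,
    # so rows 13-16 with longitudes {127, 0, 1, 2} form the France box on this 22x128 grid.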
    if creation == []:  # if we are not running from the same directory
        filename_mixing = t2m.PreMixing(new_mixing, 'PostprocLONG', num_years)  # perform mixing (mix batches and years but not days of the same year!) NEW MIXING MEANS ALSO NEW UNDERSAMPLING!
        shutil.copy(filename_mixing, checkpoint_name)  # copy the permutation file that was used to mix
        zg500.PreMixing(False, 'PostprocLONG', num_years)  # IT IS IMPORTANT THAT ALL SUBSEQUENT FIELDS BE MIXED (SHUFFLED) THE SAME WAY, otherwise there is no synchronization!
        #zg300.PreMixing(False,num_years)
        mrso.PreMixing(False, 'PostprocLONG', num_years)
    else:
        filename_mixing = t2m.PreMixing(new_mixing, creation, num_years)  # load from the folder that we are calling this file from
        zg500.PreMixing(False, creation, num_years)  # all subsequent fields must be mixed (shuffled) the same way
        #zg300.PreMixing(False,creation,num_years)
        mrso.PreMixing(False, creation, num_years)
    print("t2m.var.shape = ", t2m.var.shape)
    print("time_end = ", time_end, " ,time_start = ", time_start, " ,T = ", T)
    A, A_reshape, threshold, list_extremes, convseq = t2m.ComputeTimeAverage(time_start, time_end, T, tau, percent)
    print("threshold = ", threshold)
    if creation == []:  # if we are not running from the same directory
        filename_mixing = t2m.EqualMixing(A, threshold, new_mixing, 'PostprocLONG', num_years)
        shutil.copy(filename_mixing, checkpoint_name)  # copy the permutation file that was used to mix
        zg500.EqualMixing(A, threshold, False, 'PostprocLONG', num_years)  # all subsequent fields must be mixed (shuffled) the same way
        #zg300.EqualMixing(A, threshold, False, containing_folder='Postproc',num_years)
        mrso.EqualMixing(A, threshold, False, 'PostprocLONG', num_years)
    else:
        filename_mixing = t2m.EqualMixing(A, threshold, new_mixing, creation, num_years)
        zg500.EqualMixing(A, threshold, False, creation, num_years)
        #zg300.EqualMixing(A, threshold, False,creation,num_years)
        mrso.EqualMixing(A, threshold, False, creation, num_years)
    # Now we have to recompute the extremes:
    A, A_reshape, threshold, list_extremes, convseq = t2m.ComputeTimeAverage(time_start, time_end, T, tau, percent)
    # ===== Applying the filter to the temperature field: =====
    t2m.var = t2m.var*filter_mask  # apply the filter, zeroing all values outside the domain
    print("threshold = ", threshold)
    print(A.dtype)
    # Below we reshape into a (time, flattened array) series
    t2m.abs_area_int_reshape = t2m.ReshapeInto1Dseries(area, mask, Tot_Mon1[6], Tot_Mon1[9], T, tau)
    mrso.abs_area_int_reshape = mrso.ReshapeInto1Dseries(area, mask, Tot_Mon1[6], Tot_Mon1[9], T, tau)
    print("mrso.abs_area_int_reshape.shape = ", mrso.abs_area_int_reshape.shape)
    print("t2m.var.shape = ", t2m.var.shape)
    #X = mrso.abs_area_int_reshape[:,np.newaxis]
    # 1D representation: extract the chosen portions of the globe
    #Xs = [t2m.ReshapeInto2Dseries(time_start, time_end,lat_from1,lat_to1,lon_from1,lon_to1,T,tau) for lat_from1,lat_to1, lon_from1, lon_to1 in zip(lat_from,lat_to,lon_from,lon_to)]
    # If only 2m temperature is used, the following command is to be used (1D representation):
    #X = np.concatenate(Xs, axis=1)
    # If we want to include soil moisture integrated over the area, the following command is to be used (1D representation):
    #X = np.c_[mrso.abs_area_int_reshape[:,np.newaxis],np.concatenate(Xs, axis=1)]
    # Below we reshape into a (time, 2D space) array:
    #Xs = [t2m.ReshapeInto2Dseries(time_start, time_end,lat_from1,lat_to1,lon_from1,lon_to1,T,tau,dim=2) for lat_from1,lat_to1, lon_from1, lon_to1 in zip(lat_from,lat_to,lon_from,lon_to)]
    #X = np.concatenate(Xs, axis=2)
    # ============ Use this if only one field needs to be used: ============
    ##Xs = [zg500.ReshapeInto2Dseries(time_start, time_end,lat_from1,lat_to1,lon_from1,lon_to1,T,tau,dim=2) for lat_from1,lat_to1, lon_from1, lon_to1 in zip(lat_from,lat_to,lon_from,lon_to)]
    ##Xs = [mrso.ReshapeInto2Dseries(time_start, time_end,lat_from1,lat_to1,lon_from1,lon_to1,T,tau,dim=2) for lat_from1,lat_to1, lon_from1, lon_to1 in zip(lat_from,lat_to,lon_from,lon_to)]
    ##Xs = [t2m.ReshapeInto2Dseries(time_start, time_end,lat_from1,lat_to1,lon_from1,lon_to1,T,tau,dim=2) for lat_from1,lat_to1, lon_from1, lon_to1 in zip(lat_from,lat_to,lon_from,lon_to)]
    ##X = np.concatenate(Xs, axis=2)
    ##X = X[:,:,:,np.newaxis]
    # ============ Use this if many fields need to be used: ============
    Xs = [t2m.ReshapeInto2Dseries(time_start, time_end, lat_from1, lat_to1, lon_from1, lon_to1, T, tau, dim=2) for lat_from1, lat_to1, lon_from1, lon_to1 in zip(lat_from, lat_to, lon_from, lon_to)]  # extract the chosen portions of the globe
    X = np.concatenate(Xs, axis=2)
    ## Without coarse graining:
    Xs = [zg500.ReshapeInto2Dseries(time_start, time_end, lat_from1, lat_to1, lon_from1, lon_to1, T, tau, dim=2) for lat_from1, lat_to1, lon_from1, lon_to1 in zip(lat_from, lat_to, lon_from, lon_to)]
    X = np.concatenate([X[:,:,:,np.newaxis], np.concatenate(Xs, axis=2)[:,:,:,np.newaxis]], axis=3)
    Xs = [mrso.ReshapeInto2Dseries(time_start, time_end, lat_from1, lat_to1, lon_from1, lon_to1, T, tau, dim=2) for lat_from1, lat_to1, lon_from1, lon_to1 in zip(lat_from, lat_to, lon_from, lon_to)]
    X = np.concatenate([X, np.concatenate(Xs, axis=2)[:,:,:,np.newaxis]], axis=3)
    ## Use below for fused networks:
    #X = [X, mrso.abs_area_int_reshape[:,np.newaxis]]#, t2m.abs_area_int_reshape[:,np.newaxis]]
    ##Xs = [hfls.ReshapeInto2Dseries(time_start, time_end,lat_from1,lat_to1,lon_from1,lon_to1,T,tau,dim=2) for lat_from1,lat_to1, lon_from1, lon_to1 in zip(lat_from,lat_to,lon_from,lon_to)]
    ##X = np.concatenate([X, np.concatenate(Xs, axis=2)[:,:,:,np.newaxis]], axis=3)
    ##Xs = [hfss.ReshapeInto2Dseries(time_start, time_end,lat_from1,lat_to1,lon_from1,lon_to1,T,tau,dim=2) for lat_from1,lat_to1, lon_from1, lon_to1 in zip(lat_from,lat_to,lon_from,lon_to)]
    ##X = np.concatenate([X, np.concatenate(Xs, axis=2)[:,:,:,np.newaxis]], axis=3)
    ##Xs = [zg300.ReshapeInto2Dseries(time_start, time_end,lat_from1,lat_to1,lon_from1,lon_to1,T,tau,dim=2) for lat_from1,lat_to1, lon_from1, lon_to1 in zip(lat_from,lat_to,lon_from,lon_to)]
    ##X = np.concatenate([X, np.concatenate(Xs, axis=2)[:,:,:,np.newaxis]], axis=3)
    ##Xs = [ua300.ReshapeInto2Dseries(time_start, time_end,lat_from1,lat_to1,lon_from1,lon_to1,T,tau,dim=2) for lat_from1,lat_to1, lon_from1, lon_to1 in zip(lat_from,lat_to,lon_from,lon_to)]
    ##X = np.concatenate([X, np.concatenate(Xs, axis=2)[:,:,:,np.newaxis]], axis=3)
    ##Xs = [va300.ReshapeInto2Dseries(time_start, time_end,lat_from1,lat_to1,lon_from1,lon_to1,T,tau,dim=2) for lat_from1,lat_to1, lon_from1, lon_to1 in zip(lat_from,lat_to,lon_from,lon_to)]
    ##X = np.concatenate([X, np.concatenate(Xs, axis=2)[:,:,:,np.newaxis]], axis=3)
    # ============ Use previous times: ============
    ##Xs = [t2m.ReshapeInto2Dseries(time_start, time_end,lat_from1,lat_to1,lon_from1,lon_to1,T,tau-4,dim=2) for lat_from1,lat_to1, lon_from1, lon_to1 in zip(lat_from,lat_to,lon_from,lon_to)]
    ##X = np.concatenate([X, np.concatenate(Xs, axis=2)[:,:,:,np.newaxis]], axis=3)
    ##Xs = [zg500.ReshapeInto2Dseries(time_start, time_end,lat_from1,lat_to1,lon_from1,lon_to1,T,tau-4,dim=2) for lat_from1,lat_to1, lon_from1, lon_to1 in zip(lat_from,lat_to,lon_from,lon_to)]
    ##X = np.concatenate([X, np.concatenate(Xs, axis=2)[:,:,:,np.newaxis]], axis=3)
    ##Xs = [mrso.ReshapeInto2Dseries(time_start, time_end,lat_from1,lat_to1,lon_from1,lon_to1,T,tau-4,dim=2) for lat_from1,lat_to1, lon_from1, lon_to1 in zip(lat_from,lat_to,lon_from,lon_to)]
    ##X = np.concatenate([X, np.concatenate(Xs, axis=2)[:,:,:,np.newaxis]], axis=3)
    # ============ Coarse graining: ============
    #X1 = t2m.DownScale(time_start,time_end,lat_start,lat_end,lon_start,lon_end,T,tau, (18,42))
    #X2 = zg500.DownScale(time_start,time_end,lat_start,lat_end,lon_start,lon_end,T,tau, (18,42))
    #X3 = mrso.DownScale(time_start,time_end,lat_start,lat_end,lon_start,lon_end,T,tau, (18,42))
    #X = np.concatenate([X1[:,:,:,np.newaxis], X2[:,:,:,np.newaxis], X3[:,:,:,np.newaxis]], axis=3)
    # If we want to use FFT space (2D representation):
    ##X = t2m.ComputeFFT(time_start, time_end,T,tau,'real')
    ##X = t2m.ComputeFFThalf(time_start, time_end,T,tau,'real')
    ##X = zg500.ComputeFFThalf(time_start, time_end,T,tau,'real')
    ##X = zg500.ComputeFFT(time_start, time_end,T,tau,'real')
    #X = zg500.ComputeFFTnoPad(time_start, time_end,T,tau,'real',(18,42))
    del t2m.var
    gc.collect()  # the garbage collector removes leftover references to the object
    usepipelines = A_reshape, threshold, checkpoint_name_previous, tau  # reuse this variable to pass extra outputs to the main block
    undersampling_factor = [undersampling_factor, oversampling_factor]
    return X, list_extremes, thefield, sampling, percent, usepipelines, undersampling_factor, new_mixing, saveweightseveryblaepoch, NUM_EPOCHS, BATCH_SIZE, checkpoint_name, fullmetrics
if __name__ == '__main__':
    print("====== running Learn2.py ======")
    print("tf.__version__ = ", tf.__version__)
    if int(tf.__version__[0]) < 2:
        print("tf.test.is_gpu_available() = ", tf.test.is_gpu_available())
    else:
        print("tf.config.list_physical_devices('GPU') = ", tf.config.list_physical_devices('GPU'))
    start = time.time()
    X, list_extremes, thefield, sampling, percent, usepipelines, undersampling_factor, new_mixing, saveweightseveryblaepoch, NUM_EPOCHS, BATCH_SIZE, checkpoint_name, fullmetrics = PrepareData()
    oversampling_factor = undersampling_factor[1]  # unpack the two factors that PrepareData bundled together
    undersampling_factor = undersampling_factor[0]
    print("full dimension of the data is X[0].shape = ", X[0].shape)  # if X is a list this reports the shape of its first element
    end = time.time()
    print('RAM memory used:', psutil.virtual_memory()[3])  # percent usage of virtual memory (3rd field)
    print("Reading time = ", end - start)
    start = time.time()
    mylabels = np.array(list_extremes)
    checkpoint_name_previous = usepipelines[2]
    tau = usepipelines[3]
    if tau < 0:  # otherwise tau = 0 is assumed to be the first training we perform
        # Analyse the previous tau to determine the optimal checkpoint
        history = np.load(checkpoint_name_previous+'/batch_'+str(0)+'_history.npy', allow_pickle=True).item()
        if 'val_CustomLoss' in history.keys():
            print("'val_CustomLoss' in history.keys()")
            historyCustom = []
            for i in range(10):  # pre-compute the optimal score
                historyCustom.append(np.load(checkpoint_name_previous+'/batch_'+str(i)+'_history.npy', allow_pickle=True).item()['val_CustomLoss'])
            historyCustom = np.mean(np.array(historyCustom), 0)  # average the validation loss over the 10 folds
            opt_checkpoint = np.argmin(historyCustom)  # we will use the optimal checkpoint in this case!
        else:
            print("'val_CustomLoss' not in history.keys()")
            sys.exit("Aborting the program!")
    my_MCC = np.zeros(10,)
    my_entropy = np.zeros(10,)
    my_skill = np.zeros(10,)
    my_BS = np.zeros(10,)
    my_WBS = np.zeros(10,)
    my_freq = np.zeros(10,)
    my_memory = []
    for i in range(10):
        print("===============================")
        print("cross validation i = ", str(i))
        if isinstance(X, list):
            print("X is a list")
            test_indices, train_indices, train_true_labels_indices, train_false_labels_indices, filename_permutation = TrainTestSplitIndices(i, X[0], mylabels, 1, sampling, new_mixing, thefield, percent)  # undersampling_rate=1 suppresses the manual undersampling
        else:
            print("X is not a list")
            test_indices, train_indices, train_true_labels_indices, train_false_labels_indices, filename_permutation = TrainTestSplitIndices(i, X, mylabels, 1, sampling, new_mixing, thefield, percent)  # undersampling_rate=1 suppresses the manual undersampling
        print("# events in the train sample after TrainTestSplitIndices: ", len(train_indices))
        print("original proportion of positive events in the train sample: ", np.sum(mylabels[train_indices])/len(train_indices))
        oversampling_strategy = oversampling_factor/(100/percent-1)
        if oversampling_factor > 1:
            print("oversampling_strategy = ", oversampling_strategy)
            over = RandomOverSampler(random_state=42, sampling_strategy=oversampling_strategy)  # first oversample the minority class to this fraction of the majority class
            #over = SMOTEENN(random_state=42, sampling_strategy=oversampling_strategy)
        undersampling_strategy = undersampling_factor*oversampling_strategy
        print("undersampling_strategy = ", undersampling_strategy)
        under = RandomUnderSampler(random_state=42, sampling_strategy=undersampling_strategy)
        if oversampling_factor > 1:
            steps = [('o', over), ('u', under)]
        else:
            steps = [('u', under)]
        pipeline = Pipeline(steps=steps)  # the pipeline applies each transformation to the dataset in turn, here oversampling followed by undersampling
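        # Minimal self-contained sketch of such a resampling pipeline (commented out;
        # toy arrays, not the real data):
        #   import numpy as np
        #   from imblearn.over_sampling import RandomOverSampler
        #   from imblearn.under_sampling import RandomUnderSampler
        #   from imblearn.pipeline import Pipeline
        #   Xtoy = np.arange(200).reshape(100, 2)
        #   ytoy = np.r_[np.ones(5), np.zeros(95)]
        #   pipe = Pipeline([('o', RandomOverSampler(sampling_strategy=0.2)),
        #                    ('u', RandomUnderSampler(sampling_strategy=0.5))])
        #   Xr, yr = pipe.fit_resample(Xtoy, ytoy)  # minority boosted to 20% of majority,
        #                                           # then majority cut to twice the minority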
        # To make use of the built-in pipelines we need to flatten X into the required 2-D shape
        if isinstance(X, list):  # a list means we are dealing with the fused (combined) approach (at least partially)
            XTrain_indicesShape = [Xloop[train_indices].shape for Xloop in X]  # each element of X has a specific shape
            XTrain_indicesLength = [reduce(mul, XTrain_indicesShapeIter[1:]) for XTrain_indicesShapeIter in XTrain_indicesShape]  # each shape is converted to its flattened feature length
            XTrain_indicesLengthAccumulate = list(itertools.accumulate(XTrain_indicesLength))
            XTrain_indicesLengthAccumulate.insert(0, 0)
            print("Original dimension of the train set is X[train_indices].shape = ", XTrain_indicesShape)
            print("Original dimension of the train set is XTrain_indicesLength = ", XTrain_indicesLength)
            print("Original dimension of the train set is XTrain_indicesLengthAccumulate = ", XTrain_indicesLengthAccumulate)
            X_train, Y_train = pipeline.fit_resample(np.concatenate([Xloop[train_indices].reshape(Train_indicesShapeloop[0], XTrain_indicesLengthloop) for Xloop, Train_indicesShapeloop, XTrain_indicesLengthloop in zip(X, XTrain_indicesShape, XTrain_indicesLength)], axis=1), mylabels[train_indices])
            print("post-pipeline X_train.shape = ", X_train.shape)
            # Next step is to unpack X_train into its intended dimensions
            for myiter in range(len(XTrain_indicesShape)):  # the number of samples has changed
                temp = list(XTrain_indicesShape[myiter])
                temp[0] = X_train.shape[0]
                XTrain_indicesShape[myiter] = tuple(temp)
            print("XTrain_indicesShape = ", XTrain_indicesShape)
            X_train = [X_train[:, XTrain_indicesLengthAccumulate1:XTrain_indicesLengthAccumulate2].reshape(XTrain_indicesShapeloop) for XTrain_indicesLengthAccumulate1, XTrain_indicesLengthAccumulate2, XTrain_indicesShapeloop in zip(XTrain_indicesLengthAccumulate[:-1], XTrain_indicesLengthAccumulate[1:], XTrain_indicesShape)]
        else:  # if X is not a list we expect it to be an array, as used by the fully stacked approach
            XTrain_indicesShape = X[train_indices].shape
            print("Original dimension of the train set is X[train_indices].shape = ", XTrain_indicesShape)
            #X_train, Y_train = pipeline.fit_resample(X[train_indices].reshape(XTrain_indicesShape[0],XTrain_indicesShape[1]*XTrain_indicesShape[2]*XTrain_indicesShape[3]), mylabels[train_indices])
            #X_train = X_train.reshape(X_train.shape[0],XTrain_indicesShape[1],XTrain_indicesShape[2],XTrain_indicesShape[3])
            XTrain_indicesShape = list(XTrain_indicesShape)
            print("XTrain_indicesShape = ", XTrain_indicesShape)
            X_train, Y_train = pipeline.fit_resample(X[train_indices].reshape(tuple([XTrain_indicesShape[0], np.prod(XTrain_indicesShape[1:])])), mylabels[train_indices])
            XTrain_indicesShape[0] = X_train.shape[0]  # the number of samples has changed after resampling
            print("XTrain_indicesShape = ", XTrain_indicesShape)
            X_train = X_train.reshape(tuple(XTrain_indicesShape))
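        # The flatten -> fit_resample -> reshape round trip above is needed because
        # imblearn only accepts 2-D (n_samples, n_features) input. Commented sketch:
        #   shape = X[train_indices].shape                     # e.g. (N, 22, 128, 3)
        #   flat = X[train_indices].reshape(shape[0], -1)      # (N, 22*128*3)
        #   flat_r, y_r = pipeline.fit_resample(flat, mylabels[train_indices])
        #   X_train = flat_r.reshape((flat_r.shape[0],) + shape[1:])  # back to maps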
        Y_test = mylabels[test_indices]
        neg = train_false_labels_indices.shape[0]
        pos = train_true_labels_indices.shape[0]
        if isinstance(X, list):  # if X is a list we need to normalize each element separately
            for Xdim in range(len(X)):
                print("dimension of the train set is X[", Xdim, "][train_indices].shape = ", X[Xdim][train_indices].shape)
            print("# of the true labels = ", np.sum(mylabels[train_indices]))
            print("effective sampling rate for rare events is ", np.sum(mylabels[train_indices])/X[0][train_indices].shape[0])
            X_mean = [np.mean(Xloop[train_indices], 0) for Xloop in X]
            for X_meanloop in X_mean:
                print("X_meanloop.shape = ", X_meanloop.shape)
            X_std = [np.std(Xloop[train_indices], 0) for Xloop in X]
            for X_stdloop in X_std:
                print("X_stdloop.shape = ", X_stdloop.shape)
                X_stdloop[X_stdloop==0] = 1  # if there is no variance we shouldn't divide by zero
            X_test = []  # normalization is applied here
            for iteration in range(len(X_std)):
                X_test.append((X[iteration][test_indices]-X_mean[iteration])/X_std[iteration])
                X_train[iteration] = (X_train[iteration]-X_mean[iteration])/X_std[iteration]
            print("====Dimensions of the data before entering the neural net===")
            print("unpacked normalized X_train.shape = ", [X_trainloop.shape for X_trainloop in X_train])
            print("X_test.shape = ", [X_testloop.shape for X_testloop in X_test])
        else:  # if X is an array the normalization is much more straightforward
            print("====Dimensions of the data before entering the neural net===")
            print("dimension of the train set is X_train.shape = ", X_train.shape)
            X_mean = np.mean(X_train, 0)
            X_std = np.std(X_train, 0)
            X_std[X_std==0] = 1  # if there is no variance we shouldn't divide by zero
            X_test = (X[test_indices]-X_mean)/X_std
            Y_test = mylabels[test_indices]
            X_train = (X_train-X_mean)/X_std
        print("Y_train.shape = ", Y_train.shape)
        print("Y_test.shape = ", Y_test.shape)
        print("Train set: # of true labels = ", np.sum(Y_train), " ,# of false labels = ", Y_train.shape[0] - np.sum(Y_train))
        print("Train set: effective sampling rate for rare events is ", np.sum(Y_train)/Y_train.shape[0])
        np.save(checkpoint_name+'/batch_'+str(i)+'_X_mean', X_mean)
        np.save(checkpoint_name+'/batch_'+str(i)+'_X_std', X_std)
        if tau < 0:  # fine-tune from the model trained at the previous tau
            print("opt_checkpoint: ", opt_checkpoint, " ,loading model: ", checkpoint_name_previous)
            model = tf.keras.models.load_model(checkpoint_name_previous+'/batch_'+str(i), compile=False)
            nb_zeros_c = 4-len(str(opt_checkpoint))  # zero-pad the checkpoint number to four digits, e.g. 7 -> 'cp-0007.ckpt'
            cp_checkpoint_name = '/cp-'+nb_zeros_c*'0'+str(opt_checkpoint)+'.ckpt'
            print("loading weights from ", checkpoint_name_previous+'/batch_'+str(i)+cp_checkpoint_name)
            model.load_weights(checkpoint_name_previous+'/batch_'+str(i)+cp_checkpoint_name)
        else:
            if isinstance(X, list):
                inputs = []
                for myindex in range(len(X)):  # prepare one Input layer per element of X
                    model_input_dim = X[myindex].shape[1:]
                    inputs.append(layers.Input(shape=model_input_dim))
                print("inputs = ", inputs)
                x1 = CNN_layers(inputs[0])
                x2 = layers.Dense(8, activation='relu')(inputs[1])
                x = layers.Concatenate()([x1, x2])
                outputs = bottom_layers(x)
                model = keras.Model(inputs, outputs)
            else:
                model_input_dim = X.shape[1:]  # (X.shape[1], X.shape[2])
                model = custom_CNN(model_input_dim)
            """
            model_input_dim = X.shape[1:]
            model = create_regularized_model(1e-9, 1)
            print("model_input_dim = ", model_input_dim)
            """
        tf_sampling = tf.cast([0.5*np.log(undersampling_factor), -0.5*np.log(undersampling_factor)], tf.float32)
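        # Interpretation of tf_sampling (an assumption about CustomLoss in TF_Fields,
        # whose implementation is not shown here): if this vector is added to the two
        # logits, the class-1 minus class-0 log-odds are shifted by
        #   -log(undersampling_factor),
        # which would undo the apparent inflation of the rare class introduced by
        # undersampling the majority class by that factor.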
        model.summary()
        if fullmetrics:
            #METRICS=['accuracy',MCCMetric(2),ConfusionMatrixMetric(2)]
            METRICS = ['accuracy', MCCMetric(2), ConfusionMatrixMetric(2), CustomLoss(tf_sampling)]  # the last metrics make the code run longer but give precise discrete prediction benchmarks
        else:
            METRICS = ['loss']
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
            #optimizer=tf.keras.optimizers.Adam(),
            # The last layer of the model outputs logits (no sigmoid/softmax), so the
            # cross-entropy loss must be told via from_logits=True. Sparse categorical
            # cross-entropy also saves memory and computation: it takes a single integer
            # per class rather than a whole one-hot vector, even though the network has
            # one output unit per class.
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=METRICS
        )
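        # Commented illustration of the sparse, from_logits convention used above: the
        # labels are plain integers while the network emits one logit per class, e.g.
        #   y_true = [0, 1]                       # integer class ids, no one-hot needed
        #   y_logits = [[2.0, -1.0], [0.3, 0.8]]  # raw outputs of the final Dense(2) layer
        #   tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)(y_true, y_logits)
        # from_logits=True makes the loss apply the softmax internally, matching a model
        # whose last layer has no activation.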
        # Create a callback that saves the model's weights every saveweightseveryblaepoch epochs
        checkpoint_path = checkpoint_name+'/batch_'+str(i)+"/cp-{epoch:04d}.ckpt"
        if saveweightseveryblaepoch > 0:
            print("cp_callback save option on")
            cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                             save_weights_only=True,
                                                             verbose=1)
        else:
            cp_callback = None
        model.save_weights(checkpoint_path.format(epoch=0))
        if undersampling_factor < 1:  # without pipelines, the probabilistic undersampling must be handled inside model.fit via class weights
            class_weight = {0: undersampling_factor, 1: 1-undersampling_factor}
        else:
            class_weight = None
        my_history = model.fit(X_train, Y_train, batch_size=BATCH_SIZE, validation_data=(X_test, Y_test), shuffle=True, callbacks=[cp_callback], epochs=NUM_EPOCHS, verbose=2, class_weight=class_weight)
        model.save(checkpoint_name+'/batch_'+str(i))
        np.save(checkpoint_name+'/batch_'+str(i)+'_history.npy', my_history.history)
        if isinstance(X, list):
            my_probability_model = probability_model(inputs, model)
        else:
            my_probability_model = tf.keras.Sequential([  # append a softmax so that the output is a probability
                model,
                tf.keras.layers.Softmax()
            ])
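        # Commented usage sketch (illustrative): before X_test is deleted below, fold
        # probabilities could be obtained via
        #   Y_pred_prob = my_probability_model.predict(X_test)  # shape (n_test, 2)
        # with column 1 giving the predicted probability of the extreme class.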
print("======================================")
my_memory.append(psutil.virtual_memory())
print('RAM memory:', my_memory[i][3])
del X_test, Y_test
tf.keras.backend.clear_session()
gc.collect() # Garbage collector which removes some extra references to the object
# Getting % usage of virtual_memory ( 3rd field)
print(f" TOTAL MCC = {np.mean(my_MCC):.3f} +- {np.std(my_MCC):.3f} , entropy = {np.mean(my_entropy):.3f} +- {np.std(my_entropy):.3f} , skill = {np.mean(my_skill):.3f} +- {np.std(my_skill):.3f}, Brier = {np.mean(my_BS):.3f} +- {np.std(my_BS):.3f} , Weighted Brier = {np.mean(my_WBS):.3f} +- {np.std(my_WBS):.3f} , frequency = {np.mean(my_freq):.3f} +- {np.std(my_freq):.3f}")
np.save(checkpoint_name+'/RAM_stats.npy', my_memory)
end = time.time()
print("files saved in ", checkpoint_name)
print("Learning time = ",end - start)