Merge pull request #14 from spaceml-org/virtual_eve
[Ready] VirtualEVE finetuning model & ablation study capability
Showing 80 changed files with 15,445 additions and 168 deletions.
@@ -1,6 +1,7 @@
wandb
output
outputs
*.tar

# aux directories
.vscode
@@ -0,0 +1,148 @@
# finetune_32.2M_mae_autocalibration.yaml

# general
log_level: 'DEBUG'
experiment:
  name: "ablation-autocalibration" # generate a random name in wandb when set to null
  project: "sdofm"
  task: "ablation" # options: train, evaluate (not implemented)
  model: "autocalibration"
  resuming: false
  checkpoint: null # this is the wandb run_id of the checkpoint to load
  backbone:
    checkpoint: null # "sdofm/runs/771lx6o3:best"
    model: null
  seed: 0
  disable_cuda: false
  wandb:
    enable: true
    entity: "fdlx"
    group: "sdofm-phase1"
    job_type: "ablation"
    tags: []
    notes: ""
    output_directory: "wandb_output"
    log_model: "all" # can be True (final checkpoint), False (no checkpointing), or "all" (for all epochs)
  gcp_storage: # this will checkpoint all epochs and upload them to a GCP bucket; W&B will store references (TODO: perhaps explain this better)
    enabled: true
    bucket: "sdofm-checkpoints"
    fold: null
  evaluate: false # skip training and only evaluate (requires checkpoint to be set)
  device: null # this is set automatically using the disable_cuda flag and torch.cuda.is_available()
  precision: '32' # (32, 64) for cuda, ('32-true', '16-true', 'bf16-true') for tpu
  log_n_batches: 1000 # log every n training batches
  save_results: true # save full results to file and wandb
  accelerator: "auto" # options are "auto", "gpu", "tpu", "ipu", or "cpu"
  profiler: null # options are 'XLAProfiler' (TPU), 'PyTorchProfiler'; warning: PyTorchProfiler only works on cpu/gpu according to docs
  distributed:
    enabled: true # set to true to use more than one device
    world_size: "auto" # the "auto" option recognizes the machine you are on and selects the appropriate number of accelerators
    strategy: "auto"

# dataset configuration
data:
  min_date: '2011-10-01 00:00:00.00' # minimum is '2010-09-09 00:00:11.08'
  max_date: '2011-12-31 23:59:59.99' # maximum is '2023-05-26 06:36:08.072'
  month_splits: # non-selected months will form the training set
    # train: [1,2,3,4,5,6,7,8,9,10]
    val: [11]
    test: [12]
    holdout: []
  num_workers: 8 # set appropriately for your machine
  prefetch_factor: 3 # TODO: not implemented, 2 is default
  num_frames: 1 # WARNING: this is only read for FINETUNING; model num_frames overrides it in the BACKBONE
  drop_frame_dim: True # requires num_frames=1, for backwards compatibility
  sdoml:
    base_directory: "/mnt/sdoml"
    sub_directory:
      hmi: "HMI.zarr"
      aia: "AIA.zarr"
      eve: "EVE_legacy.zarr"
      cache: "cache"
    components: null # null to select all magnetic components ["Bx", "By", "Bz"]
    wavelengths: null # null to select all wavelength channels ["131A","1600A","1700A","171A","193A","211A","304A","335A","94A"]
    ions: null # null to select all ion channels ["C III", "Fe IX", "Fe VIII", "Fe X", "Fe XI", "Fe XII", "Fe XIII", "Fe XIV", "Fe XIX", "Fe XV", "Fe XVI", "Fe XVIII", "Fe XVI_2", "Fe XX", "Fe XX_2", "Fe XX_3", "H I", "H I_2", "H I_3", "He I", "He II", "He II_2", "He I_2", "Mg IX", "Mg X", "Mg X_2", "Ne VII", "Ne VIII", "O II", "O III", "O III_2", "O II_2", "O IV", "O IV_2", "O V", "O VI", "S XIV", "Si XII", "Si XII_2"]
    frequency: '12min' # smallest is 12min
    mask_with_hmi_threshold: null # None/null for no mask, float for threshold

# model configurations
model:
  # PRETRAINERS
  mae:
    img_size: 512
    patch_size: 16
    num_frames: 5
    tubelet_size: 1
    in_chans: 9
    embed_dim: 128
    depth: 24
    num_heads: 16
    decoder_embed_dim: 512
    decoder_depth: 8
    decoder_num_heads: 16
    mlp_ratio: 4.0
    norm_layer: 'LayerNorm'
    norm_pix_loss: False
  samae:
    # uses all parameters as in mae plus these
    masking_type: "random" # 'random' or 'solar_aware'
    active_region_mu_degs: 15.73
    active_region_std_degs: 6.14
    active_region_scale: 1.0
    active_region_abs_lon_max_degs: 60
    active_region_abs_lat_max_degs: 60
  nvae:
    use_se: true
    res_dist: true
    num_x_bits: 8
    num_latent_scales: 3 # 5
    num_groups_per_scale: 1 # 16
    num_latent_per_group: 1 # 10
    ada_groups: true
    min_groups_per_scale: 1
    num_channels_enc: 30
    num_channels_dec: 30
    num_preprocess_blocks: 2 # 1
    num_preprocess_cells: 2
    num_cell_per_cond_enc: 2
    num_postprocess_blocks: 2 # 1
    num_postprocess_cells: 2
    num_cell_per_cond_dec: 2
    num_mixture_dec: 1
    num_nf: 2
    kl_anneal_portion: 0.3
    kl_const_portion: 0.0001
    kl_const_coeff: 0.0001
    # learning_rate: 1e-2
    # weight_decay: 3e-4
    weight_decay_norm_anneal: true
    weight_decay_norm_init: 1.
    weight_decay_norm: 1e-2

  # FINE-TUNERS
  autocalibration:
    num_neck_filters: 32
    output_dim: 9 # num channels? Not sure why this is implemented for autocalibration, should be a scalar
    loss: "mse" # options: "mse", "heteroscedastic"
    freeze_encoder: true

# ML optimization arguments:
opt:
  loss: "mse" # options: "mae", "mse", "mape"
  scheduler: "constant" # other options: "cosine", "plateau", "exp"
  scheduler_warmup: 0
  batch_size: 2
  learning_rate: 0.0001
  weight_decay: 3e-4 # 0.0
  optimiser: "adam"
  epochs: 100
  patience: 2

# hydra configuration
hydra:
  mode: RUN
  # run:
  #   dir: ${data.output_directory}/${now:%Y-%m-%d-%H-%M-%S}
  # sweep:
  #   dir: ${hydra.run.dir}
  #   subdir: ${hydra.job.num}
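For orientation, a config of this shape is consumed through Hydra (see the hydra block above). The snippet below is a minimal, hypothetical sketch of an entry point reading it; the script, config_path and the exact field handling are assumptions for illustration, not code from this commit.

# Hypothetical sketch only: the real SDO-FM entry point, config_path and
# field handling may differ from what is shown here.
import hydra
from omegaconf import DictConfig, OmegaConf


@hydra.main(version_base=None, config_path=".", config_name="finetune_32.2M_mae_autocalibration")
def main(cfg: DictConfig) -> None:
    # print the fully merged config, including any command-line overrides
    print(OmegaConf.to_yaml(cfg))
    # a few sanity checks against fields defined in the YAML above
    assert cfg.experiment.task == "ablation"
    assert cfg.model.autocalibration.loss in ("mse", "heteroscedastic")
    if cfg.data.drop_frame_dim:
        # the config comments note drop_frame_dim requires num_frames == 1
        assert cfg.data.num_frames == 1


if __name__ == "__main__":
    main()

As is standard with Hydra, individual fields could then be overridden from the command line with dotted keys, e.g. opt.batch_size=4.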
@@ -0,0 +1,156 @@
# finetune_32.2M_mae_virtualeve.yaml

# general
log_level: 'DEBUG'
experiment:
  name: "virtualeve-ablation" # generate a random name in wandb when set to null
  project: "sdofm"
  task: "ablation" # pretrain/finetune/ablation
  model: "virtualeve"
  resuming: false
  checkpoint: null # this is the wandb run_id of the checkpoint to load
  backbone:
    checkpoint: null
    model: null
  seed: 0
  disable_cuda: false
  wandb:
    enable: true
    entity: "fdlx"
    group: "sdofm-phase1"
    job_type: "ablation"
    tags: []
    notes: ""
    output_directory: "wandb_output"
    log_model: "all" # can be True (final checkpoint), False (no checkpointing), or "all" (for all epochs)
  gcp_storage: # this will checkpoint all epochs and upload them to a GCP bucket; W&B will store references (TODO: perhaps explain this better)
    enabled: true
    bucket: "sdofm-checkpoints"
    fold: null # [Not implemented]
  evaluate: false # skip training and only evaluate (requires checkpoint to be set)
  device: null # this is set automatically using the disable_cuda flag and torch.cuda.is_available()
  precision: 'bf16-true' # (32, 64) for cuda, ('32-true', '16-true', 'bf16-true') for tpu
  log_n_batches: 1000 # [Not implemented] log every n training batches
  save_results: true # [Not implemented] save full results to file and wandb
  accelerator: "gpu" # options are "auto", "gpu", "tpu", "ipu", or "cpu"
  profiler: null # options are 'XLAProfiler' (TPU), 'PyTorchProfiler'; warning: PyTorchProfiler only works on cpu/gpu according to docs
  distributed:
    enabled: true # set to true to use more than one device
    world_size: "auto" # the "auto" option recognizes the machine you are on and selects the appropriate number of accelerators
    strategy: "auto"

# dataset configuration
data:
  # min_date: '2010-09-09 00:00:11.08' # minimum is '2010-09-09 00:00:11.08'
  # max_date: '2014-05-25 00:00:00.00' # maximum is '2023-05-26 06:36:08.072'
  min_date: '2011-10-01 00:00:00.00' # minimum is '2010-09-09 00:00:11.08'
  max_date: '2011-12-31 23:59:59.99' # maximum is '2023-05-26 06:36:08.072'
  month_splits: # non-selected months will form the training set
    # train: [1,2,3,4,5,6,7,8,9,10]
    val: [11]
    test: [12]
    holdout: []
  num_workers: 32 # set appropriately for your machine
  prefetch_factor: 3 # [Not implemented] 2 is default
  num_frames: 1 # WARNING: this is only read for FINETUNING; model num_frames overrides it in the BACKBONE
  drop_frame_dim: True # requires num_frames=1, for backwards compatibility
  sdoml:
    base_directory: "/mnt/sdoml"
    sub_directory:
      hmi: "HMI.zarr"
      aia: "AIA.zarr"
      eve: "EVE_legacy.zarr"
      cache: "cache"
    components: null # null to select all magnetic components ["Bx", "By", "Bz"]
    wavelengths: null # null to select all wavelength channels ["131A","1600A","1700A","171A","193A","211A","304A","335A","94A"]
    ions: null # null to select all ion channels ["C III", "Fe IX", "Fe VIII", "Fe X", "Fe XI", "Fe XII", "Fe XIII", "Fe XIV", "Fe XIX", "Fe XV", "Fe XVI", "Fe XVIII", "Fe XVI_2", "Fe XX", "Fe XX_2", "Fe XX_3", "H I", "H I_2", "H I_3", "He I", "He II", "He II_2", "He I_2", "Mg IX", "Mg X", "Mg X_2", "Ne VII", "Ne VIII", "O II", "O III", "O III_2", "O II_2", "O IV", "O IV_2", "O V", "O VI", "S XIV", "Si XII", "Si XII_2"]
    frequency: '12min' # smallest is 12min
    mask_with_hmi_threshold: null # None/null for no mask, float for threshold

# model configurations
model:
  # PRETRAINERS
  mae:
    img_size: 512
    patch_size: 16
    num_frames: 5
    tubelet_size: 1
    in_chans: 9
    embed_dim: 128
    depth: 24
    num_heads: 16
    decoder_embed_dim: 512
    decoder_depth: 8
    decoder_num_heads: 16
    mlp_ratio: 4.0
    norm_layer: 'LayerNorm'
    norm_pix_loss: False
  samae:
    # uses all parameters as in mae plus these
    masking_type: "random" # 'random' or 'solar_aware'
    active_region_mu_degs: 15.73
    active_region_std_degs: 6.14
    active_region_scale: 1.0
    active_region_abs_lon_max_degs: 60
    active_region_abs_lat_max_degs: 60
  nvae:
    use_se: true
    res_dist: true
    num_x_bits: 8
    num_latent_scales: 3 # 5
    num_groups_per_scale: 1 # 16
    num_latent_per_group: 1 # 10
    ada_groups: true
    min_groups_per_scale: 1
    num_channels_enc: 30
    num_channels_dec: 30
    num_preprocess_blocks: 2 # 1
    num_preprocess_cells: 2
    num_cell_per_cond_enc: 2
    num_postprocess_blocks: 2 # 1
    num_postprocess_cells: 2
    num_cell_per_cond_dec: 2
    num_mixture_dec: 1
    num_nf: 2
    kl_anneal_portion: 0.3
    kl_const_portion: 0.0001
    kl_const_coeff: 0.0001
    # learning_rate: 1e-2
    # weight_decay: 3e-4
    weight_decay_norm_anneal: true
    weight_decay_norm_init: 1.
    weight_decay_norm: 1e-2

  # FINE-TUNERS
  autocalibration:
    num_neck_filters: 32
    output_dim: 1 # not sure why this is implemented for autocalibration, should be a scalar
    loss: "mse" # options: "mse", "heteroscedastic"
    freeze_encoder: true
  virtualeve:
    cnn_model: "efficientnet_b3"
    lr_linear: 0.01
    lr_cnn: 0.0001
    cnn_dp: 0.75
    epochs_linear: 20 # total will still be opt.epochs; the CNN trains for opt.epochs - (this value)

# ML optimization arguments:
opt:
  loss: "mse" # options: "mae", "mse", "mape"
  scheduler: "constant" # other options: "cosine", "plateau", "exp"
  scheduler_warmup: 0
  batch_size: 16
  learning_rate: 0.0001
  weight_decay: 3e-4 # 0.0
  optimiser: "adam"
  epochs: 50
  patience: 2

# hydra configuration
hydra:
  mode: RUN
  # run:
  #   dir: ${data.output_directory}/${now:%Y-%m-%d-%H-%M-%S}
  # sweep:
  #   dir: ${hydra.run.dir}
  #   subdir: ${hydra.job.num}
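To make the epochs_linear comment above concrete: with opt.epochs = 50 and model.virtualeve.epochs_linear = 20, the linear head is optimised for the first 20 epochs and the CNN for the remaining 30. The sketch below only illustrates that arithmetic; the function and variable names are hypothetical, not taken from the repository.

# Illustrative sketch of the schedule implied by model.virtualeve.epochs_linear;
# names here are hypothetical and not from the SDO-FM codebase.
def virtualeve_stage(epoch: int, epochs_linear: int = 20, total_epochs: int = 50) -> str:
    """Return which part of the VirtualEVE head is trained at a given (0-indexed) epoch."""
    if epoch >= total_epochs:
        raise ValueError("epoch outside the configured training run")
    return "linear" if epoch < epochs_linear else "cnn"

stages = [virtualeve_stage(e) for e in range(50)]
assert stages.count("linear") == 20      # first epochs_linear epochs
assert stages.count("cnn") == 50 - 20    # remaining opt.epochs - epochs_linear epochs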