Merge pull request #14 from spaceml-org/virtual_eve
[Ready] VirtualEVE finetuning model & ablation study capability
dead-water authored Jul 19, 2024
2 parents 64266ff + 42e46a8 commit d5bd9f0
Showing 80 changed files with 15,445 additions and 168 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
wandb
output
outputs
*.tar

# aux directories
.vscode
6 changes: 5 additions & 1 deletion README.md
@@ -19,10 +19,13 @@ SDO-FM is envisioned as a ‘multi-modal’ foundation model, integrating instru
├── notebooks # visualisation/testing ipynb
├── scripts # entrypoint and highest level executors
├── sdofm # python package
│ ├── ablation # models without backbone integration
│ ├── benchmarks # metrics for comparison
│ ├── datasets # dataloaders/modules
│ ├── finetuning # modules for finetuning
│ ├── models # model components
└── └── pretraining # modules for pretraining
│ ├── pretraining # modules for pretraining
└── └── visualisation # various graphing utilities
```

## Datasets
@@ -43,6 +46,7 @@ SDO-FM is envisioned as a ‘multi-modal’ foundation model, integrating instru
| Name | Paper |
|--- |--- |
| Multichannel autocalibration for the Atmospheric Imaging Assembly using machine learning | Dos Santos, Luiz FG, et al. "Multichannel autocalibration for the Atmospheric Imaging Assembly using machine learning." Astronomy & Astrophysics 648 (2021): A53 ([link](https://www.aanda.org/articles/aa/full_html/2021/04/aa40051-20/aa40051-20.html)) |
| Virtual EVE: a Deep Learning Model for Solar Irradiance Prediction | Indaco, Manuel, et al. "Virtual EVE: a Deep Learning Model for Solar Irradiance Prediction." (2023) ([link](https://ml4physicalsciences.github.io/2023/files/NeurIPS_ML4PS_2023_236.pdf)) |

## Setup
### Installation
148 changes: 148 additions & 0 deletions experiments/ablation_autocalibration.yaml
@@ -0,0 +1,148 @@
# ablation_autocalibration.yaml

# general
log_level: 'DEBUG'
experiment:
name: "ablation-autocalibration" # generate random name in wandb when set to null
project: "sdofm"
task: "ablation" # options: train, evaluate (not implemented)
model: "autocalibration"
resuming: false
checkpoint: null # this is the wandb run_id of the checkpoint to load
backbone:
checkpoint: null #"sdofm/runs/771lx6o3:best"
model: null
seed: 0
disable_cuda: false
wandb:
enable: true
entity: "fdlx"
group: "sdofm-phase1"
job_type: "ablation"
tags: []
notes: ""
output_directory: "wandb_output"
log_model: "all" # can be True (final checkpoint), False (no checkpointing), or "all" (for all epoches)
gcp_storage: # this will checkpoint all epoches and upload them to a GCP bucket, W&B will store references (TODO: perhaps explain this better)
enabled: true
bucket: "sdofm-checkpoints"
fold: null
evaluate: false # skip training and only evaluate (requires checkpoint to be set)
device: null # this is set automatically using the disable_cuda flag and torch.cuda.is_available()
precision: '32' # (32, 64) for cuda, ('32-true', '16-true', 'bf16-true') for tpu
log_n_batches: 1000 # log every n training batches
save_results: true # save full results to file and wandb
accelerator: "auto" # options are "auto", "gpu", "tpu", "ipu", or "cpu"
profiler: null # options are 'XLAProfiler' (TPU) and 'PyTorchProfiler'; warning: PyTorchProfiler only works on CPU/GPU according to the docs
distributed:
enabled: true # set to true to use more than one device
world_size: "auto" # The "auto" option recognizes the machine you are on, and selects the appropriate number of accelerators.
strategy: "auto"

# dataset configuration
data:
min_date: '2011-10-01 00:00:00.00' # minimum is '2010-09-09 00:00:11.08'
max_date: '2011-12-31 23:59:59.99' # maximum is '2023-05-26 06:36:08.072'
month_splits: # non-selected months form the training set
# train: [1,2,3,4,5,6,7,8,9,10]
val: [11]
test: [12]
holdout: []
num_workers: 8 # set appropriately for your machine
prefetch_factor: 3 # TODO: not implemented, 2 is default
num_frames: 1 # WARNING: this is only read for FINETUNING; the model's num_frames overrides it in the BACKBONE
drop_frame_dim: True # requires num_frames=1; kept for backwards compatibility
sdoml:
base_directory: "/mnt/sdoml"
sub_directory:
hmi: "HMI.zarr"
aia: "AIA.zarr"
eve: "EVE_legacy.zarr"
cache: "cache"
components: null # null to select all magnetic components ["Bx", "By", "Bz"]
wavelengths: null # null to select all wavelength channels ["131A","1600A","1700A","171A","193A","211A","304A","335A","94A"]
ions: null # null to select all ion channels ["C III", "Fe IX", "Fe VIII", "Fe X", "Fe XI", "Fe XII", "Fe XIII", "Fe XIV", "Fe XIX", "Fe XV", "Fe XVI", "Fe XVIII", "Fe XVI_2", "Fe XX", "Fe XX_2", "Fe XX_3", "H I", "H I_2", "H I_3", "He I", "He II", "He II_2", "He I_2", "Mg IX", "Mg X", "Mg X_2", "Ne VII", "Ne VIII", "O II", "O III", "O III_2", "O II_2", "O IV", "O IV_2", "O V", "O VI", "S XIV", "Si XII", "Si XII_2"]
frequency: '12min' # smallest is 12min
mask_with_hmi_threshold: null # None/null for no mask, float for threshold

# model configurations
model:
# PRETRAINERS
mae:
img_size: 512
patch_size: 16
num_frames: 5
tubelet_size: 1
in_chans: 9
embed_dim: 128
depth: 24
num_heads: 16
decoder_embed_dim: 512
decoder_depth: 8
decoder_num_heads: 16
mlp_ratio: 4.0
norm_layer: 'LayerNorm'
norm_pix_loss: False
samae:
# uses all parameters as in mae plus these
masking_type: "random" # 'random' or 'solar_aware'
active_region_mu_degs: 15.73
active_region_std_degs: 6.14
active_region_scale: 1.0
active_region_abs_lon_max_degs: 60
active_region_abs_lat_max_degs: 60
nvae:
use_se: true
res_dist: true
num_x_bits: 8
num_latent_scales: 3 # 5
num_groups_per_scale: 1 # 16
num_latent_per_group: 1 # 10
ada_groups: true
min_groups_per_scale: 1
num_channels_enc: 30
num_channels_dec: 30
num_preprocess_blocks: 2 # 1
num_preprocess_cells: 2
num_cell_per_cond_enc: 2
num_postprocess_blocks: 2 # 1
num_postprocess_cells: 2
num_cell_per_cond_dec: 2
num_mixture_dec: 1
num_nf: 2
kl_anneal_portion: 0.3
kl_const_portion: 0.0001
kl_const_coeff: 0.0001
# learning_rate: 1e-2
# weight_decay: 3e-4
weight_decay_norm_anneal: true
weight_decay_norm_init: 1.
weight_decay_norm: 1e-2

# FINE-TUNERS
autocalibration:
num_neck_filters: 32
output_dim: 9 # number of channels? Not sure why this is implemented this way for autocalibration; it should be a scalar
loss: "mse" # options: "mse", "heteroscedastic"
freeze_encoder: true

# ML optimization arguments:
opt:
loss: "mse" # options: "mae", "mse", "mape"
scheduler: "constant" #other options: "cosine", "plateau", "exp"
scheduler_warmup: 0
batch_size: 2
learning_rate: 0.0001
weight_decay: 3e-4 # 0.0
optimiser: "adam"
epochs: 100
patience: 2

# hydra configuration
hydra:
mode: RUN
# run:
# dir: ${data.output_directory}/${now:%Y-%m-%d-%H-%M-%S}
# sweep:
# dir: ${hydra.run.dir}
# subdir: ${hydra.job.num}
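
As a quick sanity check, a config such as `experiments/ablation_autocalibration.yaml` can be loaded and overridden programmatically before launching a run. The sketch below is illustrative only: it assumes OmegaConf (which Hydra uses internally) and the key layout shown above (`experiment`, `data`, `model`, `opt`); it is not the project's entrypoint, which lives under `scripts/`.

```python
# Minimal sketch (assumption: the key layout matches the config file above).
from omegaconf import OmegaConf

# Load the ablation config added in this commit.
cfg = OmegaConf.load("experiments/ablation_autocalibration.yaml")

# Dot-list overrides mirror Hydra's key=value command-line syntax.
overrides = OmegaConf.from_dotlist([
    "opt.epochs=10",        # shorten training for a smoke test
    "data.num_workers=4",   # match the local machine
])
cfg = OmegaConf.merge(cfg, overrides)

print(cfg.experiment.task)             # "ablation"
print(cfg.model.autocalibration.loss)  # "mse"
```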
156 changes: 156 additions & 0 deletions experiments/ablation_virtualeve.yaml
@@ -0,0 +1,156 @@
# ablation_virtualeve.yaml

# general
log_level: 'DEBUG'
experiment:
name: "virtualeve-ablation" # generate random name in wandb when set to null
project: "sdofm"
task: "ablation" # pretrain/finetune/ablation
model: "virtualeve"
resuming: false
checkpoint: null # this is the wandb run_id of the checkpoint to load
backbone:
checkpoint: null
model: null
seed: 0
disable_cuda: false
wandb:
enable: true
entity: "fdlx"
group: "sdofm-phase1"
job_type: "ablation"
tags: []
notes: ""
output_directory: "wandb_output"
log_model: "all" # can be True (final checkpoint), False (no checkpointing), or "all" (for all epoches)
gcp_storage: # this will checkpoint all epoches and upload them to a GCP bucket, W&B will store references (TODO: perhaps explain this better)
enabled: true
bucket: "sdofm-checkpoints"
fold: null # [Not implemented]
evaluate: false # skip training and only evaluate (requires checkpoint to be set)
device: null # this is set automatically using the disable_cuda flag and torch.cuda.is_available()
precision: 'bf16-true' # (32, 64) for cuda, ('32-true', '16-true', 'bf16-true') for tpu
log_n_batches: 1000 # [Not implemented] log every n training batches
save_results: true # [Not implemented] save full results to file and wandb
accelerator: "gpu" # options are "auto", "gpu", "tpu", "ipu", or "cpu"
profiler: null # options are 'XLAProfiler' (TPU) and 'PyTorchProfiler'; warning: PyTorchProfiler only works on CPU/GPU according to the docs
distributed:
enabled: true # set to true to use more than one device
world_size: "auto" # The "auto" option recognizes the machine you are on, and selects the appropriate number of accelerators.
strategy: "auto"

# dataset configuration
data:
# min_date: '2010-09-09 00:00:11.08' # minimum is '2010-09-09 00:00:11.08'
# max_date: '2014-05-25 00:00:00.00' # maximum is '2023-05-26 06:36:08.072'
min_date: '2011-10-01 00:00:00.00' # minimum is '2010-09-09 00:00:11.08'
max_date: '2011-12-31 23:59:59.99' # maximum is '2023-05-26 06:36:08.072'
month_splits: # non-selected months form the training set
# train: [1,2,3,4,5,6,7,8,9,10]
val: [11]
test: [12]
holdout: []
num_workers: 32 # set appropriately for your machine
prefetch_factor: 3 # [Not implemented] 2 is default
num_frames: 1 # WARNING: this is only read for FINETUNING; the model's num_frames overrides it in the BACKBONE
drop_frame_dim: True # requires num_frames=1; kept for backwards compatibility
sdoml:
base_directory: "/mnt/sdoml"
sub_directory:
hmi: "HMI.zarr"
aia: "AIA.zarr"
eve: "EVE_legacy.zarr"
cache: "cache"
components: null # null to select all magnetic components ["Bx", "By", "Bz"]
wavelengths: null # null to select all wavelength channels ["131A","1600A","1700A","171A","193A","211A","304A","335A","94A"]
ions: null # null to select all ion channels ["C III", "Fe IX", "Fe VIII", "Fe X", "Fe XI", "Fe XII", "Fe XIII", "Fe XIV", "Fe XIX", "Fe XV", "Fe XVI", "Fe XVIII", "Fe XVI_2", "Fe XX", "Fe XX_2", "Fe XX_3", "H I", "H I_2", "H I_3", "He I", "He II", "He II_2", "He I_2", "Mg IX", "Mg X", "Mg X_2", "Ne VII", "Ne VIII", "O II", "O III", "O III_2", "O II_2", "O IV", "O IV_2", "O V", "O VI", "S XIV", "Si XII", "Si XII_2"]
frequency: '12min' # smallest is 12min
mask_with_hmi_threshold: null # None/null for no mask, float for threshold

# model configurations
model:
# PRETRAINERS
mae:
img_size: 512
patch_size: 16
num_frames: 5
tubelet_size: 1
in_chans: 9
embed_dim: 128
depth: 24
num_heads: 16
decoder_embed_dim: 512
decoder_depth: 8
decoder_num_heads: 16
mlp_ratio: 4.0
norm_layer: 'LayerNorm'
norm_pix_loss: False
samae:
# uses all parameters as in mae plus these
masking_type: "random" # 'random' or 'solar_aware'
active_region_mu_degs: 15.73
active_region_std_degs: 6.14
active_region_scale: 1.0
active_region_abs_lon_max_degs: 60
active_region_abs_lat_max_degs: 60
nvae:
use_se: true
res_dist: true
num_x_bits: 8
num_latent_scales: 3 # 5
num_groups_per_scale: 1 # 16
num_latent_per_group: 1 # 10
ada_groups: true
min_groups_per_scale: 1
num_channels_enc: 30
num_channels_dec: 30
num_preprocess_blocks: 2 # 1
num_preprocess_cells: 2
num_cell_per_cond_enc: 2
num_postprocess_blocks: 2 # 1
num_postprocess_cells: 2
num_cell_per_cond_dec: 2
num_mixture_dec: 1
num_nf: 2
kl_anneal_portion: 0.3
kl_const_portion: 0.0001
kl_const_coeff: 0.0001
# learning_rate: 1e-2
# weight_decay: 3e-4
weight_decay_norm_anneal: true
weight_decay_norm_init: 1.
weight_decay_norm: 1e-2

# FINE-TUNERS
autocalibration:
num_neck_filters: 32
output_dim: 1 # not sure why this is implemented this way for autocalibration; it should be a scalar
loss: "mse" # options: "mse", "heteroscedastic"
freeze_encoder: true
virtualeve:
cnn_model: "efficientnet_b3"
lr_linear: 0.01
lr_cnn: 0.0001
cnn_dp: 0.75
epochs_linear: 20 # total is still opt.epochs; the CNN trains for opt.epochs - epochs_linear

# ML optimization arguments:
opt:
loss: "mse" # options: "mae", "mse", "mape"
scheduler: "constant" #other options: "cosine", "plateau", "exp"
scheduler_warmup: 0
batch_size: 16
learning_rate: 0.0001
weight_decay: 3e-4 # 0.0
optimiser: "adam"
epochs: 50
patience: 2

# hydra configuration
hydra:
mode: RUN
# run:
# dir: ${data.output_directory}/${now:%Y-%m-%d-%H-%M-%S}
# sweep:
# dir: ${hydra.run.dir}
# subdir: ${hydra.job.num}
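
The `epochs_linear` comment above implies a two-stage schedule for the Virtual EVE ablation: the linear head trains first, and the CNN trains for whatever remains of the `opt.epochs` budget. A small arithmetic sketch with values copied from this config (the loop is a stand-in for illustration, not the project's trainer):

```python
# Two-stage split implied by model.virtualeve.epochs_linear (illustrative only).
total_epochs = 50      # opt.epochs
epochs_linear = 20     # model.virtualeve.epochs_linear
epochs_cnn = total_epochs - epochs_linear  # 30 epochs remain for the CNN stage

stages = [
    ("linear head", 0.01, epochs_linear),          # lr_linear
    ("efficientnet_b3 CNN", 0.0001, epochs_cnn),   # lr_cnn, cnn_model
]
for name, lr, n_epochs in stages:
    print(f"{name}: {n_epochs} epochs at lr={lr}")
```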