Merge pull request #14 from spaceml-org/virtual_eve
[Ready] VirtualEVE finetuning model & ablation study capability
Showing 80 changed files with 15,445 additions and 168 deletions.
@@ -1,6 +1,7 @@
wandb
output
outputs
*.tar

# aux directories
.vscode
@@ -0,0 +1,148 @@
# finetune_32.2M_mae_autocalibration.yaml

# general
log_level: 'DEBUG'
experiment:
  name: "ablation-autocalibration" # generate a random name in wandb when set to null
  project: "sdofm"
  task: "ablation" # options: train, evaluate (not implemented)
  model: "autocalibration"
  resuming: false
  checkpoint: null # this is the wandb run_id of the checkpoint to load
  backbone:
    checkpoint: null # "sdofm/runs/771lx6o3:best"
    model: null
  seed: 0
  disable_cuda: false
  wandb:
    enable: true
    entity: "fdlx"
    group: "sdofm-phase1"
    job_type: "ablation"
    tags: []
    notes: ""
    output_directory: "wandb_output"
    log_model: "all" # can be True (final checkpoint), False (no checkpointing), or "all" (for all epochs)
  gcp_storage: # this will checkpoint all epochs and upload them to a GCP bucket; W&B will store references (TODO: perhaps explain this better)
    enabled: true
    bucket: "sdofm-checkpoints"
    fold: null
  evaluate: false # skip training and only evaluate (requires checkpoint to be set)
  device: null # this is set automatically using the disable_cuda flag and torch.cuda.is_available()
  precision: '32' # (32, 64) for cuda, ('32-true', '16-true', 'bf16-true') for tpu
  log_n_batches: 1000 # log every n training batches
  save_results: true # save full results to file and wandb
  accelerator: "auto" # options are "auto", "gpu", "tpu", "ipu", or "cpu"
  profiler: null # options are 'XLAProfiler' (TPU), 'PyTorchProfiler'; warning: PyTorchProfiler only works on cpu/gpu according to docs
  distributed:
    enabled: true # set to true to use more than one device
    world_size: "auto" # the "auto" option recognizes the machine you are on and selects the appropriate number of accelerators
    strategy: "auto"

# dataset configuration
data:
  min_date: '2011-10-01 00:00:00.00' # minimum is '2010-09-09 00:00:11.08'
  max_date: '2011-12-31 23:59:59.99' # maximum is '2023-05-26 06:36:08.072'
  month_splits: # non-selected months will form the training set
    # train: [1,2,3,4,5,6,7,8,9,10]
    val: [11]
    test: [12]
    holdout: []
  num_workers: 8 # set appropriately for your machine
  prefetch_factor: 3 # TODO: not implemented, 2 is default
  num_frames: 1 # WARNING: this is only read for FINETUNING; model num_frames overrides it in the BACKBONE
  drop_frame_dim: True # requires num_frames=1, for backwards compatibility
  sdoml:
    base_directory: "/mnt/sdoml"
    sub_directory:
      hmi: "HMI.zarr"
      aia: "AIA.zarr"
      eve: "EVE_legacy.zarr"
      cache: "cache"
    components: null # null to select all magnetic components ["Bx", "By", "Bz"]
    wavelengths: null # null to select all wavelength channels ["131A","1600A","1700A","171A","193A","211A","304A","335A","94A"]
    ions: null # null to select all ion channels ["C III", "Fe IX", "Fe VIII", "Fe X", "Fe XI", "Fe XII", "Fe XIII", "Fe XIV", "Fe XIX", "Fe XV", "Fe XVI", "Fe XVIII", "Fe XVI_2", "Fe XX", "Fe XX_2", "Fe XX_3", "H I", "H I_2", "H I_3", "He I", "He II", "He II_2", "He I_2", "Mg IX", "Mg X", "Mg X_2", "Ne VII", "Ne VIII", "O II", "O III", "O III_2", "O II_2", "O IV", "O IV_2", "O V", "O VI", "S XIV", "Si XII", "Si XII_2"]
    frequency: '12min' # smallest is 12min
    mask_with_hmi_threshold: null # None/null for no mask, float for threshold

# model configurations
model:
  # PRETRAINERS
  mae:
    img_size: 512
    patch_size: 16
    num_frames: 5
    tubelet_size: 1
    in_chans: 9
    embed_dim: 128
    depth: 24
    num_heads: 16
    decoder_embed_dim: 512
    decoder_depth: 8
    decoder_num_heads: 16
    mlp_ratio: 4.0
    norm_layer: 'LayerNorm'
    norm_pix_loss: False
  samae:
    # uses all parameters as in mae plus these
    masking_type: "random" # 'random' or 'solar_aware'
    active_region_mu_degs: 15.73
    active_region_std_degs: 6.14
    active_region_scale: 1.0
    active_region_abs_lon_max_degs: 60
    active_region_abs_lat_max_degs: 60
  nvae:
    use_se: true
    res_dist: true
    num_x_bits: 8
    num_latent_scales: 3 # 5
    num_groups_per_scale: 1 # 16
    num_latent_per_group: 1 # 10
    ada_groups: true
    min_groups_per_scale: 1
    num_channels_enc: 30
    num_channels_dec: 30
    num_preprocess_blocks: 2 # 1
    num_preprocess_cells: 2
    num_cell_per_cond_enc: 2
    num_postprocess_blocks: 2 # 1
    num_postprocess_cells: 2
    num_cell_per_cond_dec: 2
    num_mixture_dec: 1
    num_nf: 2
    kl_anneal_portion: 0.3
    kl_const_portion: 0.0001
    kl_const_coeff: 0.0001
    # learning_rate: 1e-2
    # weight_decay: 3e-4
    weight_decay_norm_anneal: true
    weight_decay_norm_init: 1.
    weight_decay_norm: 1e-2

  # FINE-TUNERS
  autocalibration:
    num_neck_filters: 32
    output_dim: 9 # num channels? Not sure why this is implemented for autocalibration, should be a scalar
    loss: "mse" # options: "mse", "heteroscedastic"
    freeze_encoder: true

# ML optimization arguments:
opt:
  loss: "mse" # options: "mae", "mse", "mape"
  scheduler: "constant" # other options: "cosine", "plateau", "exp"
  scheduler_warmup: 0
  batch_size: 2
  learning_rate: 0.0001
  weight_decay: 3e-4 # 0.0
  optimiser: "adam"
  epochs: 100
  patience: 2

# hydra configuration
hydra:
  mode: RUN
  # run:
  #   dir: ${data.output_directory}/${now:%Y-%m-%d-%H-%M-%S}
  # sweep:
  #   dir: ${hydra.run.dir}
  #   subdir: ${hydra.job.num}
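For orientation, a config of this shape is consumed through Hydra (see the hydra block above). The snippet below is a minimal, hypothetical sketch of an entry point reading it; the script, config_path and the exact field handling are assumptions for illustration, not code from this commit.

# Hypothetical sketch only: the real SDO-FM entry point, config_path and
# field handling may differ from what is shown here.
import hydra
from omegaconf import DictConfig, OmegaConf


@hydra.main(version_base=None, config_path=".", config_name="finetune_32.2M_mae_autocalibration")
def main(cfg: DictConfig) -> None:
    # print the fully merged config, including any command-line overrides
    print(OmegaConf.to_yaml(cfg))
    # a few sanity checks against fields defined in the YAML above
    assert cfg.experiment.task == "ablation"
    assert cfg.model.autocalibration.loss in ("mse", "heteroscedastic")
    if cfg.data.drop_frame_dim:
        # the config comments note drop_frame_dim requires num_frames == 1
        assert cfg.data.num_frames == 1


if __name__ == "__main__":
    main()

As is standard with Hydra, individual fields could then be overridden from the command line with dotted keys, e.g. opt.batch_size=4.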
@@ -0,0 +1,156 @@
# finetune_32.2M_mae_virtualeve.yaml

# general
log_level: 'DEBUG'
experiment:
  name: "virtualeve-ablation" # generate a random name in wandb when set to null
  project: "sdofm"
  task: "ablation" # pretrain/finetune/ablation
  model: "virtualeve"
  resuming: false
  checkpoint: null # this is the wandb run_id of the checkpoint to load
  backbone:
    checkpoint: null
    model: null
  seed: 0
  disable_cuda: false
  wandb:
    enable: true
    entity: "fdlx"
    group: "sdofm-phase1"
    job_type: "ablation"
    tags: []
    notes: ""
    output_directory: "wandb_output"
    log_model: "all" # can be True (final checkpoint), False (no checkpointing), or "all" (for all epochs)
  gcp_storage: # this will checkpoint all epochs and upload them to a GCP bucket; W&B will store references (TODO: perhaps explain this better)
    enabled: true
    bucket: "sdofm-checkpoints"
    fold: null # [Not implemented]
  evaluate: false # skip training and only evaluate (requires checkpoint to be set)
  device: null # this is set automatically using the disable_cuda flag and torch.cuda.is_available()
  precision: 'bf16-true' # (32, 64) for cuda, ('32-true', '16-true', 'bf16-true') for tpu
  log_n_batches: 1000 # [Not implemented] log every n training batches
  save_results: true # [Not implemented] save full results to file and wandb
  accelerator: "gpu" # options are "auto", "gpu", "tpu", "ipu", or "cpu"
  profiler: null # options are 'XLAProfiler' (TPU), 'PyTorchProfiler'; warning: PyTorchProfiler only works on cpu/gpu according to docs
  distributed:
    enabled: true # set to true to use more than one device
    world_size: "auto" # the "auto" option recognizes the machine you are on and selects the appropriate number of accelerators
    strategy: "auto"

# dataset configuration
data:
  # min_date: '2010-09-09 00:00:11.08' # minimum is '2010-09-09 00:00:11.08'
  # max_date: '2014-05-25 00:00:00.00' # maximum is '2023-05-26 06:36:08.072'
  min_date: '2011-10-01 00:00:00.00' # minimum is '2010-09-09 00:00:11.08'
  max_date: '2011-12-31 23:59:59.99' # maximum is '2023-05-26 06:36:08.072'
  month_splits: # non-selected months will form the training set
    # train: [1,2,3,4,5,6,7,8,9,10]
    val: [11]
    test: [12]
    holdout: []
  num_workers: 32 # set appropriately for your machine
  prefetch_factor: 3 # [Not implemented] 2 is default
  num_frames: 1 # WARNING: this is only read for FINETUNING; model num_frames overrides it in the BACKBONE
  drop_frame_dim: True # requires num_frames=1, for backwards compatibility
  sdoml:
    base_directory: "/mnt/sdoml"
    sub_directory:
      hmi: "HMI.zarr"
      aia: "AIA.zarr"
      eve: "EVE_legacy.zarr"
      cache: "cache"
    components: null # null to select all magnetic components ["Bx", "By", "Bz"]
    wavelengths: null # null to select all wavelength channels ["131A","1600A","1700A","171A","193A","211A","304A","335A","94A"]
    ions: null # null to select all ion channels ["C III", "Fe IX", "Fe VIII", "Fe X", "Fe XI", "Fe XII", "Fe XIII", "Fe XIV", "Fe XIX", "Fe XV", "Fe XVI", "Fe XVIII", "Fe XVI_2", "Fe XX", "Fe XX_2", "Fe XX_3", "H I", "H I_2", "H I_3", "He I", "He II", "He II_2", "He I_2", "Mg IX", "Mg X", "Mg X_2", "Ne VII", "Ne VIII", "O II", "O III", "O III_2", "O II_2", "O IV", "O IV_2", "O V", "O VI", "S XIV", "Si XII", "Si XII_2"]
    frequency: '12min' # smallest is 12min
    mask_with_hmi_threshold: null # None/null for no mask, float for threshold

# model configurations
model:
  # PRETRAINERS
  mae:
    img_size: 512
    patch_size: 16
    num_frames: 5
    tubelet_size: 1
    in_chans: 9
    embed_dim: 128
    depth: 24
    num_heads: 16
    decoder_embed_dim: 512
    decoder_depth: 8
    decoder_num_heads: 16
    mlp_ratio: 4.0
    norm_layer: 'LayerNorm'
    norm_pix_loss: False
  samae:
    # uses all parameters as in mae plus these
    masking_type: "random" # 'random' or 'solar_aware'
    active_region_mu_degs: 15.73
    active_region_std_degs: 6.14
    active_region_scale: 1.0
    active_region_abs_lon_max_degs: 60
    active_region_abs_lat_max_degs: 60
  nvae:
    use_se: true
    res_dist: true
    num_x_bits: 8
    num_latent_scales: 3 # 5
    num_groups_per_scale: 1 # 16
    num_latent_per_group: 1 # 10
    ada_groups: true
    min_groups_per_scale: 1
    num_channels_enc: 30
    num_channels_dec: 30
    num_preprocess_blocks: 2 # 1
    num_preprocess_cells: 2
    num_cell_per_cond_enc: 2
    num_postprocess_blocks: 2 # 1
    num_postprocess_cells: 2
    num_cell_per_cond_dec: 2
    num_mixture_dec: 1
    num_nf: 2
    kl_anneal_portion: 0.3
    kl_const_portion: 0.0001
    kl_const_coeff: 0.0001
    # learning_rate: 1e-2
    # weight_decay: 3e-4
    weight_decay_norm_anneal: true
    weight_decay_norm_init: 1.
    weight_decay_norm: 1e-2

  # FINE-TUNERS
  autocalibration:
    num_neck_filters: 32
    output_dim: 1 # not sure why this is implemented for autocalibration, should be a scalar
    loss: "mse" # options: "mse", "heteroscedastic"
    freeze_encoder: true
  virtualeve:
    cnn_model: "efficientnet_b3"
    lr_linear: 0.01
    lr_cnn: 0.0001
    cnn_dp: 0.75
    epochs_linear: 20 # total will still be opt.epochs; the CNN trains for opt.epochs - (this value)

# ML optimization arguments:
opt:
  loss: "mse" # options: "mae", "mse", "mape"
  scheduler: "constant" # other options: "cosine", "plateau", "exp"
  scheduler_warmup: 0
  batch_size: 16
  learning_rate: 0.0001
  weight_decay: 3e-4 # 0.0
  optimiser: "adam"
  epochs: 50
  patience: 2

# hydra configuration
hydra:
  mode: RUN
  # run:
  #   dir: ${data.output_directory}/${now:%Y-%m-%d-%H-%M-%S}
  # sweep:
  #   dir: ${hydra.run.dir}
  #   subdir: ${hydra.job.num}
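To make the epochs_linear comment above concrete: with opt.epochs = 50 and model.virtualeve.epochs_linear = 20, the linear head is optimised for the first 20 epochs and the CNN for the remaining 30. The sketch below only illustrates that arithmetic; the function and variable names are hypothetical, not taken from the repository.

# Illustrative sketch of the schedule implied by model.virtualeve.epochs_linear;
# names here are hypothetical and not from the SDO-FM codebase.
def virtualeve_stage(epoch: int, epochs_linear: int = 20, total_epochs: int = 50) -> str:
    """Return which part of the VirtualEVE head is trained at a given (0-indexed) epoch."""
    if epoch >= total_epochs:
        raise ValueError("epoch outside the configured training run")
    return "linear" if epoch < epochs_linear else "cnn"

stages = [virtualeve_stage(e) for e in range(50)]
assert stages.count("linear") == 20      # first epochs_linear epochs
assert stages.count("cnn") == 50 - 20    # remaining opt.epochs - epochs_linear epochs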