Add r and alpha for attn_lora
zqiao11 committed Dec 10, 2024
1 parent 07824a8 commit 1bc73f2
Showing 26 changed files with 179 additions and 1,468 deletions.
5 changes: 1 addition & 4 deletions cli/conf/lsf-setup/multi_scale/finetune/default.yaml
@@ -33,13 +33,10 @@ trainer:
       mode: min
       save_top_k: 1 # Qz: Sometimes the 1st validation gets anomalous results. Discard that ckpt, and use the 2nd one.
       every_n_epochs: 1
-    - _target_: lightning.pytorch.callbacks.ModelCheckpoint
-      dirpath: ${hydra:runtime.output_dir}/checkpoints
-      save_weights_only: true
     - _target_: lightning.pytorch.callbacks.EarlyStopping # uni2ts.callbacks.earlystop.WarmupEarlyStopping
       monitor: val/PackedNLLLoss
       min_delta: 0.0
-      patience: 3 # Set to a small value as now each epoch has many batches.
+      patience: 3
       mode: min
       strict: false
       verbose: true
@@ -41,6 +41,8 @@ prediction_length: null
 finetune_pattern: full
 num_new_scales: 3
 ds_factor: 2
+r: 16
+alpha: 16
 
 use_lora: True
 lora_kwargs:
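
The two new keys are the standard LoRA hyperparameters: r is the adapter rank and alpha the scaling numerator, with the low-rank update scaled by alpha / r. As a minimal sketch of how these values typically enter an attention LoRA layer, assuming the usual formulation (the LoRALinear wrapper below is illustrative, not this repo's actual lora_kwargs consumer):

import math

import torch
import torch.nn as nn


class LoRALinear(nn.Module):
    """Frozen base linear layer plus a trainable rank-r update, scaled by alpha / r."""

    def __init__(self, base: nn.Linear, r: int = 16, alpha: int = 16):
        super().__init__()
        self.base = base
        self.base.weight.requires_grad_(False)  # the pretrained weight stays frozen
        self.lora_A = nn.Parameter(torch.empty(r, base.in_features))
        self.lora_B = nn.Parameter(torch.zeros(base.out_features, r))  # zero init: adapter starts as a no-op
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        self.scaling = alpha / r

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # y = base(x) + (alpha / r) * x A^T B^T
        return self.base(x) + self.scaling * (x @ self.lora_A.T @ self.lora_B.T)

With r = alpha = 16 as configured here, the scaling factor is 1, so the adapter output is added to the frozen projection unscaled.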
@@ -41,6 +41,8 @@ prediction_length: null
 finetune_pattern: full
 num_new_scales: 3
 ds_factor: 2
+r: 16
+alpha: 16
 
 use_lora: False
 lora_kwargs:
@@ -41,6 +41,8 @@ prediction_length: null
 finetune_pattern: full
 num_new_scales: 3
 ds_factor: 2
+r: 16
+alpha: 16
 
 use_lora: False
 lora_kwargs:
@@ -69,13 +69,10 @@ trainer:
       mode: min
       save_top_k: 1 # Qz: Sometimes the 1st validation gets anomalous results. Discard that ckpt, and use the 2nd one.
       every_n_epochs: 1
-    - _target_: lightning.pytorch.callbacks.ModelCheckpoint
-      dirpath: ${hydra:runtime.output_dir}/checkpoints
-      save_weights_only: true
     - _target_: lightning.pytorch.callbacks.EarlyStopping # uni2ts.callbacks.earlystop.WarmupEarlyStopping
       monitor: val/PackedNLLLoss
       min_delta: 0.0
-      patience: 3 # Set to a small value as now each epoch has many batches.
+      patience: 3
       mode: min
       strict: false
       verbose: true
@@ -41,6 +41,8 @@ prediction_length: null
 finetune_pattern: full
 num_new_scales: 3
 ds_factor: 2
+r: 16
+alpha: 16
 
 use_lora: True
 lora_kwargs:
@@ -41,6 +41,8 @@ prediction_length: null
 finetune_pattern: full
 num_new_scales: 3
 ds_factor: 2
+r: 16
+alpha: 16
 
 use_lora: False
 lora_kwargs:
@@ -41,6 +41,8 @@ prediction_length: null
 finetune_pattern: full
 num_new_scales: 3
 ds_factor: 2
+r: 16
+alpha: 16
 
 use_lora: False
 lora_kwargs:
23 changes: 20 additions & 3 deletions cli/train_two_stage.py
@@ -27,6 +27,21 @@
 from uni2ts.common import hydra_util  # noqa: hydra resolvers
 from uni2ts.data.loader import DataLoader
 
+import os
+import glob
+
+
+def get_best_checkpoint_path(checkpoint_dir: str):
+    # List all .ckpt files in the checkpoint directory.
+    ckpt_files = glob.glob(os.path.join(checkpoint_dir, "*.ckpt"))
+
+    if len(ckpt_files) == 1:
+        return ckpt_files[0]  # Return the path of the only .ckpt file.
+    elif len(ckpt_files) == 0:
+        raise FileNotFoundError(f"No .ckpt file found in {checkpoint_dir}")
+    else:
+        raise ValueError(f"Multiple .ckpt files found in {checkpoint_dir}. Expected only one.")
+
 
 class DataModule(L.LightningDataModule):
     def __init__(
@@ -139,13 +154,12 @@ def main(cfg: DictConfig):
 
     # ToDo: write the config for training_warmup
     trainer_warmup: L.Trainer = instantiate(cfg.trainer_warmup)
-
+    trainer_warmup.callbacks[-1].CHECKPOINT_EQUALS_CHAR = "_"
 
     trainer: L.Trainer = instantiate(cfg.trainer)
 
     # A '=' in the ckpt name prevents it from being loaded directly with hydra. Change it to '_'.
     trainer.callbacks[-1].CHECKPOINT_EQUALS_CHAR = "_"
-    trainer.callbacks[-2].CHECKPOINT_EQUALS_CHAR = "_"
 
     train_dataset: Dataset = instantiate(cfg.data).load_dataset(
         model.train_transform_map
@@ -190,7 +204,10 @@ def main(cfg: DictConfig):
         ckpt_path=cfg.ckpt_path,
     )
 
-    print("Finished warmup stage. Now finetuning the whole model...")
+    # Load the saved ckpt of the best model from stage 1.
+    print("Finished warmup stage. Now loading the saved model and finetuning the whole model...")
+    checkpoint = torch.load(get_best_checkpoint_path(trainer_warmup.callbacks[-1].dirpath))
+    model.load_state_dict(checkpoint["state_dict"])
     model.current_stage = 2
 
     trainer.fit(
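
Taken together, the changes in this file implement a load-best-then-continue handoff: stage 1 trains with the warmup trainer, the best warmup checkpoint is read back, and stage 2 finetunes from those weights. A condensed sketch of that flow under the same assumptions the script makes (the warmup trainer's last callback is its ModelCheckpoint; run_two_stage and datamodule are illustrative names, not this repo's API):

import glob
import os

import lightning as L
import torch


def best_checkpoint_path(checkpoint_dir: str) -> str:
    # Same contract as the get_best_checkpoint_path helper added above: exactly one ckpt expected.
    ckpt_files = glob.glob(os.path.join(checkpoint_dir, "*.ckpt"))
    if len(ckpt_files) != 1:
        raise RuntimeError(f"Expected exactly one .ckpt in {checkpoint_dir}, found {len(ckpt_files)}")
    return ckpt_files[0]


def run_two_stage(model: L.LightningModule, trainer_warmup: L.Trainer, trainer: L.Trainer, datamodule) -> None:
    # Stage 1: warmup training; ModelCheckpoint tracks the best validation score.
    trainer_warmup.fit(model, datamodule=datamodule)

    # Reload the best stage-1 weights, as train_two_stage.py now does after warmup.
    ckpt = torch.load(best_checkpoint_path(trainer_warmup.callbacks[-1].dirpath))
    model.load_state_dict(ckpt["state_dict"])

    # Stage 2: switch the module's stage flag and finetune the whole model.
    model.current_stage = 2
    trainer.fit(model, datamodule=datamodule)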
10 changes: 5 additions & 5 deletions project/lsf-setup/multi_scale/eval/small/ettm1.sh
@@ -1,18 +1,18 @@
 #!/bin/bash
 
 export HYDRA_FULL_ERROR=1
-export CUDA_VISIBLE_DEVICES=1
+export CUDA_VISIBLE_DEVICES=2
 
 mode=S
 cp=conf/lsf-setup/multi_scale/eval
 exp_name=lsf
 cl=4000
 model=moirai_lightning_ckpt
 
-cpp1='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/1tid_2inproj_all_scale_lora_freezeqkv/full/ettm1/S/cl4000_pl96/checkpoints/epoch_5-step_2502.ckpt'
-cpp2='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/1tid_2inproj_all_scale_lora_freezeqkv/full/ettm1/S/cl4000_pl192/checkpoints/epoch_1-step_832.ckpt'
-cpp3='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/1tid_2inproj_all_scale_lora_freezeqkv/full/ettm1/S/cl4000_pl336/checkpoints/epoch_0-step_414.ckpt'
-cpp4='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/1tid_2inproj_all_scale_lora_freezeqkv/full/ettm1/S/cl4000_pl720/checkpoints/epoch_0-step_408.ckpt'
+cpp1='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/direct_1full_2head/full/ettm1/S/cl4000_pl96/checkpoints_warmup/epoch_1-step_834-v2.ckpt'
+cpp2='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/direct_1full_2head/full/ettm1/S/cl4000_pl192/checkpoints_warmup/epoch_3-step_1664.ckpt'
+cpp3='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/direct_1full_2head/full/ettm1/S/cl4000_pl336/checkpoints_warmup/epoch_2-step_1242.ckpt'
+cpp4='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/direct_1full_2head/full/ettm1/S/cl4000_pl720/checkpoints_warmup/epoch_2-step_1224.ckpt'
 
 index=1
 for pl in 96 192 336 720; do
8 changes: 4 additions & 4 deletions project/lsf-setup/multi_scale/eval/small/ettm2.sh
@@ -9,10 +9,10 @@ exp_name=lsf
 cl=3000
 model=moirai_lightning_ckpt
 
-cpp1='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/1tid_2inproj_all_scale_lora_freezeqkv/full/ettm2/S/cl3000_pl96/checkpoints/epoch_16-step_7327.ckpt'
-cpp2='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/1tid_2inproj_all_scale_lora_freezeqkv/full/ettm2/S/cl3000_pl192/checkpoints/epoch_3-step_1716.ckpt'
-cpp3='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/1tid_2inproj_all_scale_lora_freezeqkv/full/ettm2/S/cl3000_pl336/checkpoints/epoch_1-step_854.ckpt'
-cpp4='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/1tid_2inproj_all_scale_lora_freezeqkv/full/ettm2/S/cl3000_pl720/checkpoints/epoch_0-step_422.ckpt'
+cpp1='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/direct_1full_2head/full/ettm2/S/cl3000_pl96/checkpoints_warmup/epoch_1-step_862-v1.ckpt'
+cpp2='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/direct_1full_2head/full/ettm2/S/cl3000_pl192/checkpoints_warmup/epoch_0-step_429-v1.ckpt'
+cpp3='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/direct_1full_2head/full/ettm2/S/cl3000_pl336/checkpoints_warmup/epoch_0-step_427.ckpt'
+cpp4='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/direct_1full_2head/full/ettm2/S/cl3000_pl720/checkpoints_warmup/epoch_26-step_11394.ckpt'
 
 index=1
 for pl in 96 192 336 720; do
8 changes: 4 additions & 4 deletions project/lsf-setup/multi_scale/eval/small/weather.sh
@@ -9,10 +9,10 @@ exp_name=lsf
 cl=2000
 model=moirai_lightning_ckpt
 
-cpp1='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/1tid_2inproj_all_scale_lora_freezeqkv/full/weather/S/cl2000_pl96/checkpoints/epoch_14-step_21420.ckpt'
-cpp2='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/1tid_2inproj_all_scale_lora_freezeqkv/full/weather/S/cl2000_pl192/checkpoints/epoch_9-step_14240.ckpt'
-cpp3='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/1tid_2inproj_all_scale_lora_freezeqkv/full/weather/S/cl2000_pl336/checkpoints/epoch_5-step_8508.ckpt'
-cpp4='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/1tid_2inproj_all_scale_lora_freezeqkv/full/weather/S/cl2000_pl720/checkpoints/epoch_1-step_2804.ckpt'
+cpp1='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/direct_1full_2head/full/weather/S/cl2000_pl96/checkpoints_warmup/epoch_2-step_4284.ckpt'
+cpp2='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/direct_1full_2head/full/weather/S/cl2000_pl192/checkpoints_warmup/epoch_1-step_2848.ckpt'
+cpp3='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/direct_1full_2head/full/weather/S/cl2000_pl336/checkpoints_warmup/epoch_1-step_2836.ckpt'
+cpp4='./outputs/lsf-setup/multi_scale/finetune_two_stage/moirai_1.0_R_small/direct_1full_2head/full/weather/S/cl2000_pl720/checkpoints_warmup/epoch_1-step_2804.ckpt'
 
 index=1
 for pl in 96 192 336 720; do
@@ -1,10 +1,10 @@
 #!/bin/bash
 
-export HYDRA_FULL_ERROR=1; export CUDA_VISIBLE_DEVICES=1;
+export HYDRA_FULL_ERROR=1; export CUDA_VISIBLE_DEVICES=3;
 
 model=moirai_1.0_R_small
 cp=conf/lsf-setup/multi_scale/finetune_two_stage
-exp_name=1tid_2inproj_all_scale_lora_freezeqkv
+exp_name=direct_1full_2head
 data=ettm1
 cl=4000
 ps=128

@@ -32,4 +32,6 @@ for pl in 96 192 336 720; do
     val_data.context_length=$cl \
     val_data.prediction_length=$pl \
     val_data.mode=${mode}
+    # trainer_warmup.callbacks."1".monitor=val/PackedMSELoss \
+    # trainer_warmup.callbacks."2".monitor=val/PackedMSELoss
 done
@@ -4,7 +4,7 @@ export HYDRA_FULL_ERROR=1; export CUDA_VISIBLE_DEVICES=3;
 
 model=moirai_1.0_R_small
 cp=conf/lsf-setup/multi_scale/finetune_two_stage
-exp_name=1tid_2inproj_all_scale_lora_freezeqkv
+exp_name=direct_1full_2head
 data=ettm2
 cl=3000
 ps=64

@@ -32,4 +32,6 @@ for pl in 96 192 336 720; do
     val_data.context_length=$cl \
     val_data.prediction_length=$pl \
     val_data.mode=${mode}
+    # trainer_warmup.callbacks."1".monitor=val/PackedMSELoss \
+    # trainer_warmup.callbacks."2".monitor=val/PackedMSELoss
 done
@@ -4,7 +4,7 @@ export HYDRA_FULL_ERROR=1; export CUDA_VISIBLE_DEVICES=0;
 
 model=moirai_1.0_R_small
 cp=conf/lsf-setup/multi_scale/finetune_two_stage
-exp_name=1tid_2inproj_all_scale_lora_freezeqkv
+exp_name=direct_1full_2head
 data=weather
 cl=2000
 ps=128

@@ -33,7 +33,7 @@ for pl in 96 192 336 720; do
     val_data.prediction_length=$pl \
     val_data.mode=${mode} \
     trainer.callbacks."1".monitor=val/PackedMSELoss \
-    trainer.callbacks."3".monitor=val/PackedMSELoss \
+    trainer.callbacks."2".monitor=val/PackedMSELoss \
     trainer_warmup.callbacks."1".monitor=val/PackedMSELoss \
     trainer_warmup.callbacks."2".monitor=val/PackedMSELoss
 done
4 changes: 2 additions & 2 deletions src/uni2ts/model/lsf_moirai/finetune.py
@@ -414,7 +414,7 @@ def configure_optimizers(self) -> dict:
 
         # validate that we considered every parameter
         param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
-        self.trainable_params = param_dict
+        self.updated_params = param_dict
 
         inter_params = decay & no_decay
         union_params = decay | no_decay

@@ -736,7 +736,7 @@ def state_dict(self, *args, destination=None, prefix="", keep_vars=False):
         filtered_state = {
             name: tensor
             for name, tensor in state.items()
-            if name in self.trainable_params
+            if name in self.updated_params
         }
         return filtered_state
4 changes: 2 additions & 2 deletions src/uni2ts/model/lsf_moirai_point/finetune.py
@@ -343,7 +343,7 @@ def configure_optimizers(self) -> dict:
 
         # validate that we considered every parameter
         param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
-        self.trainable_params = param_dict
+        self.updated_params = param_dict
 
         inter_params = decay & no_decay
         union_params = decay | no_decay

@@ -665,6 +665,6 @@ def state_dict(self, *args, destination=None, prefix="", keep_vars=False):
         filtered_state = {
             name: tensor
             for name, tensor in state.items()
-            if name in self.trainable_params
+            if name in self.updated_params
         }
         return filtered_state
4 changes: 2 additions & 2 deletions src/uni2ts/model/moirai/finetune.py
@@ -372,7 +372,7 @@ def configure_optimizers(self) -> dict:
 
         # validate that we considered every parameter
         param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
-        self.trainable_params = param_dict
+        self.updated_params = param_dict
 
         inter_params = decay & no_decay
         union_params = decay | no_decay

@@ -702,7 +702,7 @@ def state_dict(self, *args, destination=None, prefix="", keep_vars=False):
         filtered_state = {
             name: tensor
             for name, tensor in state.items()
-            if name in self.trainable_params
+            if name in self.updated_params
         }
         return filtered_state
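
The rename from trainable_params to updated_params touches a pattern worth spelling out: these finetune modules record, at optimizer setup, exactly which parameters still require gradients, then override state_dict so checkpoints contain only those tensors, which keeps finetuning checkpoints small. A self-contained sketch of that pattern on a plain nn.Module (PartiallyFrozenModel is illustrative, not one of the Moirai classes):

import torch.nn as nn


class PartiallyFrozenModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Linear(32, 32)  # stands in for the frozen pretrained body
        self.head = nn.Linear(32, 1)       # stands in for the finetuned head
        for p in self.backbone.parameters():
            p.requires_grad_(False)
        # Record which parameters are actually being updated (mirrors configure_optimizers).
        self.updated_params = {
            pn: p for pn, p in self.named_parameters() if p.requires_grad
        }

    def state_dict(self, *args, destination=None, prefix="", keep_vars=False):
        # Save only the updated parameters; frozen weights are recoverable from the pretrained ckpt.
        state = super().state_dict(*args, destination=destination, prefix=prefix, keep_vars=keep_vars)
        return {name: t for name, t in state.items() if name in self.updated_params}


model = PartiallyFrozenModel()
print(sorted(model.state_dict()))  # ['head.bias', 'head.weight']

Note that a checkpoint filtered this way must be loaded with strict=False when the receiving model still holds the frozen parameters.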