diff --git a/cli/conf/lsf-setup/lsf/finetune/data/electricity.yaml b/cli/conf/lsf-setup/lsf/finetune/data/electricity.yaml
index 70ca032..73e7350 100644
--- a/cli/conf/lsf-setup/lsf/finetune/data/electricity.yaml
+++ b/cli/conf/lsf-setup/lsf/finetune/data/electricity.yaml
@@ -4,3 +4,4 @@ train_length: 18412
 prediction_length: ???
 context_length: ???
 patch_size: ???
+mode: ???
diff --git a/cli/conf/lsf-setup/lsf/finetune/data/etth1.yaml b/cli/conf/lsf-setup/lsf/finetune/data/etth1.yaml
index f7235c4..bd54733 100644
--- a/cli/conf/lsf-setup/lsf/finetune/data/etth1.yaml
+++ b/cli/conf/lsf-setup/lsf/finetune/data/etth1.yaml
@@ -3,4 +3,5 @@ dataset: ETTh1
 train_length: 8640
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/lsf/finetune/data/etth2.yaml b/cli/conf/lsf-setup/lsf/finetune/data/etth2.yaml
index 1dd47f7..6e3eede 100644
--- a/cli/conf/lsf-setup/lsf/finetune/data/etth2.yaml
+++ b/cli/conf/lsf-setup/lsf/finetune/data/etth2.yaml
@@ -3,4 +3,5 @@ dataset: ETTh2
 train_length: 8640
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/lsf/finetune/data/ettm1.yaml b/cli/conf/lsf-setup/lsf/finetune/data/ettm1.yaml
index dbde79e..2f84768 100644
--- a/cli/conf/lsf-setup/lsf/finetune/data/ettm1.yaml
+++ b/cli/conf/lsf-setup/lsf/finetune/data/ettm1.yaml
@@ -3,4 +3,5 @@ dataset: ETTm1
 train_length: 34560
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/lsf/finetune/data/ettm2.yaml b/cli/conf/lsf-setup/lsf/finetune/data/ettm2.yaml
index 5c402f1..1d8e32b 100644
--- a/cli/conf/lsf-setup/lsf/finetune/data/ettm2.yaml
+++ b/cli/conf/lsf-setup/lsf/finetune/data/ettm2.yaml
@@ -4,3 +4,4 @@ train_length: 34560
 prediction_length: ???
 context_length: ???
 patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/lsf/finetune/data/weather.yaml b/cli/conf/lsf-setup/lsf/finetune/data/weather.yaml
index a6fa5fd..86b5bcc 100644
--- a/cli/conf/lsf-setup/lsf/finetune/data/weather.yaml
+++ b/cli/conf/lsf-setup/lsf/finetune/data/weather.yaml
@@ -4,3 +4,4 @@ train_length: 36887
 prediction_length: ???
 context_length: ???
 patch_size: ???
+mode: ???
diff --git a/cli/conf/lsf-setup/lsf/finetune/default.yaml b/cli/conf/lsf-setup/lsf/finetune/default.yaml
index 67ac76a..1ea168e 100644
--- a/cli/conf/lsf-setup/lsf/finetune/default.yaml
+++ b/cli/conf/lsf-setup/lsf/finetune/default.yaml
@@ -1,6 +1,6 @@
 hydra:
   run:
-    dir: outputs/lsf-setup/lsf/finetune/${hydra:runtime.choices.model}/${exp_name}/${model.finetune_pattern}/${hydra:runtime.choices.data}/${run_name}
+    dir: outputs/lsf-setup/lsf/finetune/${hydra:runtime.choices.model}/${exp_name}/${model.finetune_pattern}/${hydra:runtime.choices.data}/${data.mode}/${run_name}
 defaults:
   - model: ???
   - data: ???
diff --git a/cli/conf/lsf-setup/lsf/finetune/val_data/electricity.yaml b/cli/conf/lsf-setup/lsf/finetune/val_data/electricity.yaml
index 74981e0..a20c574 100644
--- a/cli/conf/lsf-setup/lsf/finetune/val_data/electricity.yaml
+++ b/cli/conf/lsf-setup/lsf/finetune/val_data/electricity.yaml
@@ -4,4 +4,5 @@ offset: 18412  # Same as _lsf_dataset.py
 eval_length: 2630  # Same as _lsf_dataset.py, test_length=5260
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/lsf/finetune/val_data/etth1.yaml b/cli/conf/lsf-setup/lsf/finetune/val_data/etth1.yaml
index a409cde..2a379ab 100644
--- a/cli/conf/lsf-setup/lsf/finetune/val_data/etth1.yaml
+++ b/cli/conf/lsf-setup/lsf/finetune/val_data/etth1.yaml
@@ -4,4 +4,5 @@ offset: 8640
 eval_length: 2880
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/lsf/finetune/val_data/etth2.yaml b/cli/conf/lsf-setup/lsf/finetune/val_data/etth2.yaml
index 31ca968..90e8296 100644
--- a/cli/conf/lsf-setup/lsf/finetune/val_data/etth2.yaml
+++ b/cli/conf/lsf-setup/lsf/finetune/val_data/etth2.yaml
@@ -4,4 +4,5 @@ offset: 8640  # Same as _lsf_dataset.py
 eval_length: 2880  # Same as _lsf_dataset.py
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/lsf/finetune/val_data/ettm1.yaml b/cli/conf/lsf-setup/lsf/finetune/val_data/ettm1.yaml
index 3f0244c..3cdf94b 100644
--- a/cli/conf/lsf-setup/lsf/finetune/val_data/ettm1.yaml
+++ b/cli/conf/lsf-setup/lsf/finetune/val_data/ettm1.yaml
@@ -4,4 +4,5 @@ offset: 34560  # Same as _lsf_dataset.py
 eval_length: 11520  # Same as _lsf_dataset.py
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/lsf/finetune/val_data/ettm2.yaml b/cli/conf/lsf-setup/lsf/finetune/val_data/ettm2.yaml
index 0939493..74ae64c 100644
--- a/cli/conf/lsf-setup/lsf/finetune/val_data/ettm2.yaml
+++ b/cli/conf/lsf-setup/lsf/finetune/val_data/ettm2.yaml
@@ -4,4 +4,5 @@ offset: 34560  # Same as _lsf_dataset.py
 eval_length: 11520  # Same as _lsf_dataset.py
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/lsf/finetune/val_data/weather.yaml b/cli/conf/lsf-setup/lsf/finetune/val_data/weather.yaml
index c2a23de..1d4e331 100644
--- a/cli/conf/lsf-setup/lsf/finetune/val_data/weather.yaml
+++ b/cli/conf/lsf-setup/lsf/finetune/val_data/weather.yaml
@@ -4,4 +4,5 @@ offset: 36887  # Same as _lsf_dataset.py
 eval_length: 5269  # Same as _lsf_dataset.py; test_length=10539
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/multi_scale/eval/model/moirai_1.0_R_small.yaml b/cli/conf/lsf-setup/multi_scale/eval/model/moirai_1.0_R_small.yaml
index 338656c..b3bb984 100644
--- a/cli/conf/lsf-setup/multi_scale/eval/model/moirai_1.0_R_small.yaml
+++ b/cli/conf/lsf-setup/multi_scale/eval/model/moirai_1.0_R_small.yaml
@@ -5,4 +5,4 @@ module:
 num_samples: 100
 patch_size: ???
 context_length: ???
-num_new_scales: 2
\ No newline at end of file
+num_new_scales: 3
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/multi_scale/eval/model/moirai_1.1_R_small.yaml b/cli/conf/lsf-setup/multi_scale/eval/model/moirai_1.1_R_small.yaml
index 845a22b..df35ec7 100644
--- a/cli/conf/lsf-setup/multi_scale/eval/model/moirai_1.1_R_small.yaml
+++ b/cli/conf/lsf-setup/multi_scale/eval/model/moirai_1.1_R_small.yaml
@@ -5,4 +5,4 @@ module:
 num_samples: 100
 patch_size: ???
 context_length: ???
-num_new_scales: 2
\ No newline at end of file
+num_new_scales: 3
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/multi_scale/finetune/data/electricity.yaml b/cli/conf/lsf-setup/multi_scale/finetune/data/electricity.yaml
index 70ca032..73e7350 100644
--- a/cli/conf/lsf-setup/multi_scale/finetune/data/electricity.yaml
+++ b/cli/conf/lsf-setup/multi_scale/finetune/data/electricity.yaml
@@ -4,3 +4,4 @@ train_length: 18412
 prediction_length: ???
 context_length: ???
 patch_size: ???
+mode: ???
diff --git a/cli/conf/lsf-setup/multi_scale/finetune/data/etth1.yaml b/cli/conf/lsf-setup/multi_scale/finetune/data/etth1.yaml
index f7235c4..bd54733 100644
--- a/cli/conf/lsf-setup/multi_scale/finetune/data/etth1.yaml
+++ b/cli/conf/lsf-setup/multi_scale/finetune/data/etth1.yaml
@@ -3,4 +3,5 @@ dataset: ETTh1
 train_length: 8640
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/multi_scale/finetune/data/etth2.yaml b/cli/conf/lsf-setup/multi_scale/finetune/data/etth2.yaml
index 1dd47f7..6e3eede 100644
--- a/cli/conf/lsf-setup/multi_scale/finetune/data/etth2.yaml
+++ b/cli/conf/lsf-setup/multi_scale/finetune/data/etth2.yaml
@@ -3,4 +3,5 @@ dataset: ETTh2
 train_length: 8640
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/multi_scale/finetune/data/ettm1.yaml b/cli/conf/lsf-setup/multi_scale/finetune/data/ettm1.yaml
index dbde79e..2f84768 100644
--- a/cli/conf/lsf-setup/multi_scale/finetune/data/ettm1.yaml
+++ b/cli/conf/lsf-setup/multi_scale/finetune/data/ettm1.yaml
@@ -3,4 +3,5 @@ dataset: ETTm1
 train_length: 34560
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/multi_scale/finetune/data/ettm2.yaml b/cli/conf/lsf-setup/multi_scale/finetune/data/ettm2.yaml
index 5c402f1..dee2561 100644
--- a/cli/conf/lsf-setup/multi_scale/finetune/data/ettm2.yaml
+++ b/cli/conf/lsf-setup/multi_scale/finetune/data/ettm2.yaml
@@ -4,3 +4,4 @@ train_length: 34560
 prediction_length: ???
 context_length: ???
 patch_size: ???
+mode: ???
diff --git a/cli/conf/lsf-setup/multi_scale/finetune/data/weather.yaml b/cli/conf/lsf-setup/multi_scale/finetune/data/weather.yaml
index a6fa5fd..86b5bcc 100644
--- a/cli/conf/lsf-setup/multi_scale/finetune/data/weather.yaml
+++ b/cli/conf/lsf-setup/multi_scale/finetune/data/weather.yaml
@@ -4,3 +4,4 @@ train_length: 36887
 prediction_length: ???
 context_length: ???
 patch_size: ???
+mode: ???
diff --git a/cli/conf/lsf-setup/multi_scale/finetune/default.yaml b/cli/conf/lsf-setup/multi_scale/finetune/default.yaml
index 7239871..d3135e2 100644
--- a/cli/conf/lsf-setup/multi_scale/finetune/default.yaml
+++ b/cli/conf/lsf-setup/multi_scale/finetune/default.yaml
@@ -1,6 +1,6 @@
 hydra:
   run:
-    dir: outputs/lsf-setup/multi_scale/finetune/${hydra:runtime.choices.model}/${exp_name}/${model.finetune_pattern}/${hydra:runtime.choices.data}/${run_name}
+    dir: outputs/lsf-setup/multi_scale/finetune/${hydra:runtime.choices.model}/${exp_name}/${model.finetune_pattern}/${hydra:runtime.choices.data}/${data.mode}/${run_name}
 defaults:
   - model: ???
   - data: ???
@@ -40,7 +40,11 @@ trainer:
         mode: min
         strict: false
         verbose: true
-#      warmup_steps: 1
+      # warmup_steps: 1
+    - _target_: lightning.pytorch.callbacks.ModelCheckpoint
+      dirpath: ${hydra:runtime.output_dir}/checkpoints
+      save_last: true
+      save_weights_only: true
   max_epochs: 1000
   enable_progress_bar: true
   accumulate_grad_batches: 1
diff --git a/cli/conf/lsf-setup/multi_scale/finetune/val_data/electricity.yaml b/cli/conf/lsf-setup/multi_scale/finetune/val_data/electricity.yaml
index 74981e0..a20c574 100644
--- a/cli/conf/lsf-setup/multi_scale/finetune/val_data/electricity.yaml
+++ b/cli/conf/lsf-setup/multi_scale/finetune/val_data/electricity.yaml
@@ -4,4 +4,5 @@ offset: 18412  # Same as _lsf_dataset.py
 eval_length: 2630  # Same as _lsf_dataset.py, test_length=5260
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/multi_scale/finetune/val_data/etth1.yaml b/cli/conf/lsf-setup/multi_scale/finetune/val_data/etth1.yaml
index a409cde..2a379ab 100644
--- a/cli/conf/lsf-setup/multi_scale/finetune/val_data/etth1.yaml
+++ b/cli/conf/lsf-setup/multi_scale/finetune/val_data/etth1.yaml
@@ -4,4 +4,5 @@ offset: 8640
 eval_length: 2880
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/multi_scale/finetune/val_data/etth2.yaml b/cli/conf/lsf-setup/multi_scale/finetune/val_data/etth2.yaml
index 31ca968..90e8296 100644
--- a/cli/conf/lsf-setup/multi_scale/finetune/val_data/etth2.yaml
+++ b/cli/conf/lsf-setup/multi_scale/finetune/val_data/etth2.yaml
@@ -4,4 +4,5 @@ offset: 8640  # Same as _lsf_dataset.py
 eval_length: 2880  # Same as _lsf_dataset.py
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/multi_scale/finetune/val_data/ettm1.yaml b/cli/conf/lsf-setup/multi_scale/finetune/val_data/ettm1.yaml
index 3f0244c..3cdf94b 100644
--- a/cli/conf/lsf-setup/multi_scale/finetune/val_data/ettm1.yaml
+++ b/cli/conf/lsf-setup/multi_scale/finetune/val_data/ettm1.yaml
@@ -4,4 +4,5 @@ offset: 34560  # Same as _lsf_dataset.py
 eval_length: 11520  # Same as _lsf_dataset.py
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/multi_scale/finetune/val_data/ettm2.yaml b/cli/conf/lsf-setup/multi_scale/finetune/val_data/ettm2.yaml
index 0939493..74ae64c 100644
--- a/cli/conf/lsf-setup/multi_scale/finetune/val_data/ettm2.yaml
+++ b/cli/conf/lsf-setup/multi_scale/finetune/val_data/ettm2.yaml
@@ -4,4 +4,5 @@ offset: 34560  # Same as _lsf_dataset.py
 eval_length: 11520  # Same as _lsf_dataset.py
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
diff --git a/cli/conf/lsf-setup/multi_scale/finetune/val_data/weather.yaml b/cli/conf/lsf-setup/multi_scale/finetune/val_data/weather.yaml
index c2a23de..1d4e331 100644
--- a/cli/conf/lsf-setup/multi_scale/finetune/val_data/weather.yaml
+++ b/cli/conf/lsf-setup/multi_scale/finetune/val_data/weather.yaml
@@ -4,4 +4,5 @@ offset: 36887  # Same as _lsf_dataset.py
 eval_length: 5269  # Same as _lsf_dataset.py; test_length=10539
 prediction_length: ???
 context_length: ???
-patch_size: ???
\ No newline at end of file
+patch_size: ???
+mode: ???
\ No newline at end of file
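Note that every one of these data and val_data configs leaves the new mode key as ???, OmegaConf's mandatory-value marker, so the launch scripts below all have to pass data.mode and val_data.mode explicitly. A minimal sketch of that behavior follows; it is illustrative only (the dict values are hypothetical) and is not part of the patch.

from omegaconf import OmegaConf

# Sketch (not part of the patch): '???' marks a mandatory value in OmegaConf,
# the config library underlying Hydra. It stays missing until overridden.
cfg = OmegaConf.create({"dataset": "ETTh1", "mode": "???"})
print(OmegaConf.is_missing(cfg, "mode"))  # True; reading cfg.mode now would raise
cfg.mode = "M"                            # what a `data.mode=M` CLI override supplies
print(cfg.mode)                           # M
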
diff --git a/project/lsf-setup/build_lsf_ft_datasets.sh b/project/lsf-setup/build_lsf_ft_datasets.sh
index f35b537..d88b9c2 100644
--- a/project/lsf-setup/build_lsf_ft_datasets.sh
+++ b/project/lsf-setup/build_lsf_ft_datasets.sh
@@ -3,7 +3,7 @@ set -a
 source .env
 set +a
 
-ds_type="wide"  # "wide_multivariate"
+ds_type="wide_multivariate"  # or "wide"
 path_prefix=$LSF_PATH
 
 for data in ETTh1 ETTh2; do
diff --git a/project/lsf-setup/lsf/finetune/small/electricity.sh b/project/lsf-setup/lsf/finetune/small/electricity.sh
index 9ac2970..d57c1b6 100644
--- a/project/lsf-setup/lsf/finetune/small/electricity.sh
+++ b/project/lsf-setup/lsf/finetune/small/electricity.sh
@@ -8,6 +8,7 @@ exp_name=lsf
 data=electricity
 cl=5000
 ps=64
+mode=S
 
 ft_pattern=full
 
@@ -25,9 +26,11 @@ for pl in 96 192 336 720; do
     data.patch_size=${ps} \
     data.context_length=$cl \
     data.prediction_length=$pl \
+    data.mode=${mode} \
     val_data=${data} \
     val_data.patch_size=${ps} \
     val_data.context_length=$cl \
     val_data.prediction_length=$pl \
+    val_data.mode=${mode} \
     model.lr=5e-6
 done
\ No newline at end of file
diff --git a/project/lsf-setup/lsf/finetune/small/etth1.sh b/project/lsf-setup/lsf/finetune/small/etth1.sh
index 7798c69..626c843 100644
--- a/project/lsf-setup/lsf/finetune/small/etth1.sh
+++ b/project/lsf-setup/lsf/finetune/small/etth1.sh
@@ -8,6 +8,7 @@ exp_name=lsf
 data=etth1
 cl=5000
 ps=64
+mode=M
 
 ft_pattern=full
 
@@ -25,8 +26,10 @@ for pl in 96 192 336 720; do
     data.patch_size=${ps} \
     data.context_length=$cl \
     data.prediction_length=$pl \
+    data.mode=${mode} \
     val_data=${data} \
     val_data.patch_size=${ps} \
     val_data.context_length=$cl \
-    val_data.prediction_length=$pl
+    val_data.prediction_length=$pl \
+    val_data.mode=${mode}
 done
\ No newline at end of file
diff --git a/project/lsf-setup/lsf/finetune/small/etth2.sh b/project/lsf-setup/lsf/finetune/small/etth2.sh
index 0927993..780ba83 100644
--- a/project/lsf-setup/lsf/finetune/small/etth2.sh
+++ b/project/lsf-setup/lsf/finetune/small/etth2.sh
@@ -8,6 +8,7 @@ exp_name=lsf
 data=etth2
 cl=3000
 ps=64
+mode=M
 
 ft_pattern=full
 
@@ -25,8 +26,10 @@ for pl in 96 192 336 720; do
     data.patch_size=${ps} \
     data.context_length=$cl \
     data.prediction_length=$pl \
+    data.mode=${mode} \
     val_data=${data} \
     val_data.patch_size=${ps} \
     val_data.context_length=$cl \
-    val_data.prediction_length=$pl
+    val_data.prediction_length=$pl \
+    val_data.mode=${mode}
 done
\ No newline at end of file
diff --git a/project/lsf-setup/lsf/finetune/small/ettm1.sh b/project/lsf-setup/lsf/finetune/small/ettm1.sh
index 515daa7..061a5e1 100644
--- a/project/lsf-setup/lsf/finetune/small/ettm1.sh
+++ b/project/lsf-setup/lsf/finetune/small/ettm1.sh
@@ -8,6 +8,7 @@ exp_name=lsf
 data=ettm1
 cl=4000
 ps=128
+mode=S
 
 ft_pattern=full
 
@@ -25,8 +26,10 @@ for pl in 96 192 336 720; do
     data.patch_size=${ps} \
     data.context_length=$cl \
     data.prediction_length=$pl \
+    data.mode=${mode} \
     val_data=${data} \
     val_data.patch_size=${ps} \
     val_data.context_length=$cl \
-    val_data.prediction_length=$pl
+    val_data.prediction_length=$pl \
+    val_data.mode=${mode}
 done
\ No newline at end of file
diff --git a/project/lsf-setup/lsf/finetune/small/ettm2.sh b/project/lsf-setup/lsf/finetune/small/ettm2.sh
index 2a9cce4..2dc2415 100644
--- a/project/lsf-setup/lsf/finetune/small/ettm2.sh
+++ b/project/lsf-setup/lsf/finetune/small/ettm2.sh
@@ -8,6 +8,7 @@ exp_name=lsf
 data=ettm2
 cl=3000
 ps=64
+mode=S
 
 ft_pattern=full
 
@@ -25,8 +26,10 @@ for pl in 96 192 336 720; do
     data.patch_size=${ps} \
     data.context_length=$cl \
     data.prediction_length=$pl \
+    data.mode=${mode} \
     val_data=${data} \
     val_data.patch_size=${ps} \
     val_data.context_length=$cl \
-    val_data.prediction_length=$pl
+    val_data.prediction_length=$pl \
+    val_data.mode=${mode}
 done
\ No newline at end of file
diff --git a/project/lsf-setup/lsf/finetune/small/weather.sh b/project/lsf-setup/lsf/finetune/small/weather.sh
index 98939b0..9db8ddc 100644
--- a/project/lsf-setup/lsf/finetune/small/weather.sh
+++ b/project/lsf-setup/lsf/finetune/small/weather.sh
@@ -8,6 +8,7 @@ exp_name=lsf
 data=weather
 cl=2000
 ps=128
+mode=M
 
 ft_pattern=full
 
@@ -25,8 +26,10 @@ for pl in 96 192 336 720; do
     data.patch_size=${ps} \
     data.context_length=$cl \
     data.prediction_length=$pl \
+    data.mode=${mode} \
     val_data=${data} \
     val_data.patch_size=${ps} \
     val_data.context_length=$cl \
-    val_data.prediction_length=$pl
+    val_data.prediction_length=$pl \
+    val_data.mode=${mode}
 done
\ No newline at end of file
diff --git a/project/lsf-setup/multi_scale/finetune/small/electricity.sh b/project/lsf-setup/multi_scale/finetune/small/electricity.sh
index 2b1af2b..ac8d4fc 100644
--- a/project/lsf-setup/multi_scale/finetune/small/electricity.sh
+++ b/project/lsf-setup/multi_scale/finetune/small/electricity.sh
@@ -8,6 +8,7 @@ exp_name=lsf
 data=electricity
 cl=5000
 ps=64
+mode=S
 
 ft_pattern=full
 
@@ -25,9 +26,11 @@ for pl in 96 192 336 720; do
     data.patch_size=${ps} \
     data.context_length=$cl \
     data.prediction_length=$pl \
+    data.mode=${mode} \
     val_data=${data} \
     val_data.patch_size=${ps} \
     val_data.context_length=$cl \
     val_data.prediction_length=$pl \
+    val_data.mode=${mode} \
     model.lr=5e-6
 done
\ No newline at end of file
diff --git a/project/lsf-setup/multi_scale/finetune/small/etth1.sh b/project/lsf-setup/multi_scale/finetune/small/etth1.sh
index 3e724fa..cedd949 100644
--- a/project/lsf-setup/multi_scale/finetune/small/etth1.sh
+++ b/project/lsf-setup/multi_scale/finetune/small/etth1.sh
@@ -8,6 +8,7 @@ exp_name=lsf
 data=etth1
 cl=5000
 ps=64
+mode=M
 
 ft_pattern=full
 
@@ -25,8 +26,10 @@ for pl in 96 192 336 720; do
     data.patch_size=${ps} \
     data.context_length=$cl \
     data.prediction_length=$pl \
+    data.mode=${mode} \
     val_data=${data} \
     val_data.patch_size=${ps} \
     val_data.context_length=$cl \
-    val_data.prediction_length=$pl
+    val_data.prediction_length=$pl \
+    val_data.mode=${mode}
 done
\ No newline at end of file
diff --git a/project/lsf-setup/multi_scale/finetune/small/etth2.sh b/project/lsf-setup/multi_scale/finetune/small/etth2.sh
index 0bfad04..c2baada 100644
--- a/project/lsf-setup/multi_scale/finetune/small/etth2.sh
+++ b/project/lsf-setup/multi_scale/finetune/small/etth2.sh
@@ -8,6 +8,7 @@ exp_name=lsf
 data=etth2
 cl=3000
 ps=64
+mode=M
 
 ft_pattern=full
 
@@ -25,8 +26,10 @@ for pl in 96 192 336 720; do
     data.patch_size=${ps} \
     data.context_length=$cl \
     data.prediction_length=$pl \
+    data.mode=${mode} \
     val_data=${data} \
     val_data.patch_size=${ps} \
     val_data.context_length=$cl \
-    val_data.prediction_length=$pl
+    val_data.prediction_length=$pl \
+    val_data.mode=${mode}
 done
\ No newline at end of file
diff --git a/project/lsf-setup/multi_scale/finetune/small/ettm1.sh b/project/lsf-setup/multi_scale/finetune/small/ettm1.sh
index 99a191a..30204fc 100644
--- a/project/lsf-setup/multi_scale/finetune/small/ettm1.sh
+++ b/project/lsf-setup/multi_scale/finetune/small/ettm1.sh
@@ -8,6 +8,7 @@ exp_name=lsf
 data=ettm1
 cl=4000
 ps=128
+mode=S
 
 ft_pattern=full
 
@@ -25,8 +26,10 @@ for pl in 96 192 336 720; do
     data.patch_size=${ps} \
     data.context_length=$cl \
     data.prediction_length=$pl \
+    data.mode=${mode} \
     val_data=${data} \
     val_data.patch_size=${ps} \
     val_data.context_length=$cl \
-    val_data.prediction_length=$pl
+    val_data.prediction_length=$pl \
+    val_data.mode=${mode}
 done
\ No newline at end of file
diff --git a/project/lsf-setup/multi_scale/finetune/small/ettm2.sh b/project/lsf-setup/multi_scale/finetune/small/ettm2.sh
index 57b2713..66ff9cb 100644
--- a/project/lsf-setup/multi_scale/finetune/small/ettm2.sh
+++ b/project/lsf-setup/multi_scale/finetune/small/ettm2.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-export HYDRA_FULL_ERROR=1; export CUDA_VISIBLE_DEVICES=0;
+export HYDRA_FULL_ERROR=1; export CUDA_VISIBLE_DEVICES=3;
 
 model=moirai_1.0_R_small
 cp=conf/lsf-setup/multi_scale/finetune
@@ -8,6 +8,7 @@ exp_name=lsf
 data=ettm2
 cl=3000
 ps=64
+mode=S
 
 ft_pattern=full
 
@@ -25,8 +26,10 @@ for pl in 96 192 336 720; do
     data.patch_size=${ps} \
     data.context_length=$cl \
     data.prediction_length=$pl \
+    data.mode=${mode} \
     val_data=${data} \
     val_data.patch_size=${ps} \
     val_data.context_length=$cl \
-    val_data.prediction_length=$pl
+    val_data.prediction_length=$pl \
+    val_data.mode=${mode}
 done
\ No newline at end of file
diff --git a/project/lsf-setup/multi_scale/finetune/small/weather.sh b/project/lsf-setup/multi_scale/finetune/small/weather.sh
index fc8ae55..95e2cc7 100644
--- a/project/lsf-setup/multi_scale/finetune/small/weather.sh
+++ b/project/lsf-setup/multi_scale/finetune/small/weather.sh
@@ -8,6 +8,7 @@ exp_name=lsf
 data=weather
 cl=2000
 ps=128
+mode=M
 
 ft_pattern=full
 
@@ -25,8 +26,10 @@ for pl in 96 192 336 720; do
     data.patch_size=${ps} \
     data.context_length=$cl \
     data.prediction_length=$pl \
+    data.mode=${mode} \
     val_data=${data} \
     val_data.patch_size=${ps} \
     val_data.context_length=$cl \
-    val_data.prediction_length=$pl
+    val_data.prediction_length=$pl \
+    val_data.mode=${mode}
 done
\ No newline at end of file
diff --git a/src/uni2ts/data/builder/simple.py b/src/uni2ts/data/builder/simple.py
index 832f880..5a89a32 100644
--- a/src/uni2ts/data/builder/simple.py
+++ b/src/uni2ts/data/builder/simple.py
@@ -274,6 +274,7 @@ class SimpleFinetuneDatasetBuilder(DatasetBuilder):
     prediction_length: Optional[int]
     context_length: Optional[int]
     patch_size: Optional[int]
+    mode: Optional[str] = 'S'
     storage_path: Path = env.CUSTOM_DATA_PATH
     mean = None
     std = None
@@ -331,16 +332,22 @@ def build_dataset(
             example_gen_func, features=features
         )
         hf_dataset.info.dataset_name = self.dataset
-        hf_dataset.save_to_disk(self.storage_path / 'lsf' / self.dataset)
+        hf_dataset.save_to_disk(self.storage_path / 'lsf' / f'{dataset_type}' / self.dataset)
 
     def load_dataset(
         self, transform_map: dict[str, Callable[..., Transformation]]
     ) -> Dataset:
+
+        if self.mode == 'S':
+            dataset_type = 'wide'
+        elif self.mode == 'M':
+            dataset_type = 'wide_multivariate'
+
         return FinetuneDataset(
             self.windows,
             HuggingFaceDatasetIndexer(
                 datasets.load_from_disk(
-                    str(self.storage_path / 'lsf' / self.dataset),
+                    str(self.storage_path / 'lsf' / f'{dataset_type}' / self.dataset),
                 )
             ),
             transform=transform_map[self.dataset](
@@ -367,6 +374,7 @@ class SimpleEvalDatasetBuilder(DatasetBuilder):
     prediction_length: Optional[int]
     context_length: Optional[int]
     patch_size: Optional[int]
+    mode: Optional[str] = 'S'
     storage_path: Path = env.CUSTOM_DATA_PATH
 
     def __post_init__(self):
@@ -402,16 +410,21 @@ def build_dataset(
             example_gen_func, features=features
        )
         hf_dataset.info.dataset_name = self.dataset
-        hf_dataset.save_to_disk(self.storage_path / 'lsf' / self.dataset)
+        hf_dataset.save_to_disk(self.storage_path / 'lsf' / f'{dataset_type}' / self.dataset)
 
     def load_dataset(
         self, transform_map: dict[str, Callable[..., Transformation]]
     ) -> Dataset:
+        if self.mode == 'S':
+            dataset_type = 'wide'
+        elif self.mode == 'M':
+            dataset_type = 'wide_multivariate'
+
         return EvalDataset(
             self.windows,
             HuggingFaceDatasetIndexer(
                 datasets.load_from_disk(
-                    str(self.storage_path / 'lsf' / self.dataset),
+                    str(self.storage_path / 'lsf' / f'{dataset_type}' / self.dataset),
                 )
             ),
             transform=transform_map[self.dataset](
@@ -430,6 +443,7 @@ def generate_finetune_builder(
     prediction_length: int,
     context_length: int,
     patch_size: int,
+    mode: str,
     storage_path: Path = env.CUSTOM_DATA_PATH,
 ) -> SimpleFinetuneDatasetBuilder:
     """
@@ -442,6 +456,7 @@
         prediction_length=prediction_length,
         context_length=context_length,
         patch_size=patch_size,
+        mode=mode,
         storage_path=storage_path,
     )
@@ -453,6 +468,7 @@ def generate_eval_builder(
     prediction_length: int,
     context_length: int,
     patch_size: int,
+    mode: str,
     storage_path: Path = env.CUSTOM_DATA_PATH,
 ) -> SimpleEvalDatasetBuilder:
     """
@@ -505,6 +521,7 @@
         prediction_length=prediction_length,
         context_length=context_length,
         patch_size=patch_size,
+        mode=mode,
         storage_path=storage_path,
     )
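The simple.py changes key the on-disk layout off the new mode field: 'S' reads the 'wide' build of a dataset and 'M' the 'wide_multivariate' build produced by build_lsf_ft_datasets.sh. Below is a minimal sketch of that mapping factored into one helper; the helper name dataset_type_for and the ValueError guard are additions for illustration (the patch's bare if/elif would leave dataset_type unset for any other mode, and these hunks do not show where build_dataset gets its dataset_type from). It is not part of the patch.

from pathlib import Path

# Illustrative sketch only -- mirrors the if/elif added to load_dataset().
def dataset_type_for(mode: str) -> str:
    if mode == 'S':
        return 'wide'
    elif mode == 'M':
        return 'wide_multivariate'
    raise ValueError(f"unsupported mode: {mode!r}")  # guard added here, not in the patch

storage_path = Path('custom_data')  # stands in for env.CUSTOM_DATA_PATH
print(storage_path / 'lsf' / dataset_type_for('M') / 'ETTh1')
# custom_data/lsf/wide_multivariate/ETTh1
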
diff --git a/src/uni2ts/model/multi_scale_moirai/finetune.py b/src/uni2ts/model/multi_scale_moirai/finetune.py
index 72662d3..24b23eb 100644
--- a/src/uni2ts/model/multi_scale_moirai/finetune.py
+++ b/src/uni2ts/model/multi_scale_moirai/finetune.py
@@ -132,15 +132,15 @@ def __init__(
         self.token_idx_per_scale = self._get_token_idx_per_scale()
 
     def post_init(self):
-        # for layer in self.module.encoder.layers:
-        #     # Check if the layer has an attribute named `self_attn` and if it is an instance of GroupedQueryAttention
-        #     if hasattr(layer, 'self_attn') and isinstance(layer.self_attn, GroupedQueryAttention):
-        #         # Call post_init() method of the GroupedQueryAttention object
-        #         layer.self_attn.init_multi_scale_modules(self.context_length, self.patch_size, self.num_new_scales, self.ds_factor)
+        for layer in self.module.encoder.layers:
+            # Check if the layer has an attribute named `self_attn` and if it is an instance of GroupedQueryAttention
+            if hasattr(layer, 'self_attn') and isinstance(layer.self_attn, GroupedQueryAttention):
+                # Call post_init() method of the GroupedQueryAttention object
+                layer.self_attn.init_multi_scale_modules(self.context_length, self.patch_size, self.num_new_scales, self.ds_factor)
 
-        for module in self.module.encoder.modules():
-            if isinstance(module, MultiScaleRotaryProjection):
-                module.post_init(self.token_idx_per_scale)
+        # for module in self.module.encoder.modules():
+        #     if isinstance(module, MultiScaleRotaryProjection):
+        #         module.post_init(self.token_idx_per_scale)
 
         # ToDo: Call post_init() method to replace BinaryAttentionBias with CrossVariateAttentionBias
         # What is the from_pretrained pipeline? Init first, then load, with parameters that fail to load silently ignored? If so, there is no need to add post_init.
@@ -324,6 +324,12 @@ def configure_optimizers(self) -> dict:
             if "film" in pn:
                 p.requires_grad = True
 
+            if "adapt_weight" in pn:
+                p.requires_grad = True
+
+            if "adapt_bias" in pn:
+                p.requires_grad = True
+
             if "var_attn_bias.emb" in pn:
                 p.requires_grad = True
@@ -417,6 +423,8 @@ def configure_optimizers(self) -> dict:
                 decay.add(fpn)
             elif pn.endswith("weight") and isinstance(m, blacklist_params):
                 no_decay.add(fpn)
+            elif "adapt_weight" in pn or "adapt_bias" in pn:
+                decay.add(fpn)
 
         # validate that we considered every parameter
         param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
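post_init above calls init_multi_scale_modules(context_length, patch_size, num_new_scales, ds_factor) on every GroupedQueryAttention layer; the attention.py diff below then allocates, for each new scale, elementwise weight and bias tensors of shape (scale_len, dim), with scale_len shrinking by ds_factor per scale. A quick check of those lengths, as a sketch rather than part of the patch: the config values assume the etth1.sh settings (cl=5000, ps=64) and the eval default num_new_scales: 3, while ds_factor=2 is an assumption, since its value is not visible in this diff.

import math

# Sketch (not part of the patch): per-scale parameter lengths.
context_length, patch_size = 5000, 64   # from etth1.sh (cl, ps)
num_new_scales, ds_factor = 3, 2        # ds_factor=2 is assumed

base_len = math.ceil(context_length / patch_size)  # 79 context patches at the base scale
scale_len = math.ceil(base_len / ds_factor)
for scale in range(num_new_scales):
    # each of q/k/v gets a (scale_len, dim) weight of ones and a bias of zeros
    print(f"new scale {scale + 1}: scale_len = {scale_len}")  # 40, 20, 10
    scale_len = math.ceil(scale_len / ds_factor)
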
diff --git a/src/uni2ts/module/multi_scale/attention.py b/src/uni2ts/module/multi_scale/attention.py
index 7a4cd33..abb4009 100644
--- a/src/uni2ts/module/multi_scale/attention.py
+++ b/src/uni2ts/module/multi_scale/attention.py
@@ -100,6 +100,41 @@ def __init__(
         self.dim = dim
 
         self.num_new_scales = None
+
+    def init_multi_scale_modules(self, context_length, patch_size, num_new_scales, ds_factor):
+        self.num_new_scales = num_new_scales
+
+        base_len = math.ceil(context_length / patch_size)  # num context patches in base scale
+        scale_len = math.ceil(base_len / ds_factor)
+
+        # Initialize parameter lists
+        self.query_adapt_weight = nn.ParameterList()
+        self.key_adapt_weight = nn.ParameterList()
+        self.value_adapt_weight = nn.ParameterList()
+        self.query_adapt_bias = nn.ParameterList()
+        self.key_adapt_bias = nn.ParameterList()
+        self.value_adapt_bias = nn.ParameterList()
+
+        for _ in range(num_new_scales):
+            # Append the new parameters for the current scale
+            self.query_adapt_weight.append(
+                nn.Parameter(torch.ones((scale_len, self.dim), dtype=torch.float), requires_grad=True))
+            self.key_adapt_weight.append(
+                nn.Parameter(torch.ones((scale_len, self.dim), dtype=torch.float), requires_grad=True))
+            self.value_adapt_weight.append(
+                nn.Parameter(torch.ones((scale_len, self.dim), dtype=torch.float), requires_grad=True))
+
+            self.query_adapt_bias.append(
+                nn.Parameter(torch.zeros((scale_len, self.dim), dtype=torch.float), requires_grad=True))
+            self.key_adapt_bias.append(
+                nn.Parameter(torch.zeros((scale_len, self.dim), dtype=torch.float), requires_grad=True))
+            self.value_adapt_bias.append(
+                nn.Parameter(torch.zeros((scale_len, self.dim), dtype=torch.float), requires_grad=True))
+
+            # Update scale length for the next iteration
+            scale_len = math.ceil(scale_len / ds_factor)
+
 
 # def init_multi_scale_modules(self, context_length, patch_size, num_new_scales, ds_factor):
 #
 #     self.num_new_scales = num_new_scales
@@ -115,28 +150,24 @@ def __init__(
 #         nn.Linear(in_features=nh, out_features=self.dim) for _ in range(num_new_scales)
 #     ])
 
-    def init_multi_scale_modules(self, context_length, patch_size, num_new_scales, ds_factor):
-
-        self.num_new_scales = num_new_scales
-
-        base_len = math.ceil(context_length / patch_size)  # num context patches in base scale
-
-        scale_len = math.ceil(base_len / ds_factor)
-        self.query_film_generator = nn.ModuleList([
-            nn.Linear(in_features=self.dim, out_features=2 * scale_len)
-        ])
-        self.key_film_generator = nn.ModuleList([
-            nn.Linear(in_features=self.dim, out_features=2 * scale_len)
-        ])
-
-        for _ in range(1, num_new_scales):
-            scale_len = math.ceil(scale_len / ds_factor)
-            self.query_film_generator.append(
-                nn.Linear(in_features=self.dim, out_features=2 * scale_len)
-            )
-            self.key_film_generator.append(
-                nn.Linear(in_features=self.dim, out_features=2 * scale_len)
-            )
+    # def init_multi_scale_modules(self, context_length, patch_size, num_new_scales, ds_factor):
+    #
+    #     self.num_new_scales = num_new_scales
+    #
+    #     base_len = math.ceil(context_length / patch_size)  # num context patches in base scale
+    #     scale_len = math.ceil(base_len / ds_factor)
+    #
+    #     self.query_film_generator = nn.ModuleList()
+    #     self.key_film_generator = nn.ModuleList()
+    #
+    #     for _ in range(num_new_scales):
+    #         self.query_film_generator.append(
+    #             nn.Linear(in_features=self.dim, out_features=2 * scale_len)
+    #         )
+    #         self.key_film_generator.append(
+    #             nn.Linear(in_features=self.dim, out_features=2 * scale_len)
+    #         )
+    #         scale_len = math.ceil(scale_len / ds_factor)
 
     def _get_var_id(
         self,
@@ -301,28 +332,56 @@ def forward(
         query_time_id: Optional[Int[torch.Tensor, "*batch q_len"]] = None,
         kv_time_id: Optional[Int[torch.Tensor, "*batch kv_len"]] = None,
     ) -> Float[torch.Tensor, "*batch q_len dim"]:
-        query = self.q_proj(query)
-        key = self.k_proj(key)
-        value = self.v_proj(value)
+        # query = self.q_proj(query)
+        # key = self.k_proj(key)
+        # value = self.v_proj(value)
+
+        init_query = self.q_proj(query)
+        init_key = self.k_proj(key)
+        init_value = self.v_proj(value)
+
+        query = init_query.clone()
+        key = init_key.clone()
+        value = init_value.clone()
+
+        # ToDo: Plan B: Directly apply different Film on query / key to different scales. w/o revising RoPE
+        if self.num_new_scales is not None:
+            index_by_variate = self.get_token_index_by_variate(query_var_id)
+
+            for scale in range(self.num_new_scales):
+                assert torch.equal(query_var_id, kv_var_id), "query_var_id is different from kv_var_id"
+                index = index_by_variate[scale + 1]
+                query_scale = init_query[..., index, :]  # (bs, num_patch_new_scale, dim)
+                query[..., index, :] = self.query_adapt_weight[scale] * query_scale + self.query_adapt_bias[scale]
+
+                key_scale = init_key[..., index, :]  # (bs, num_patch_new_scale, dim)
+                key[..., index, :] = self.key_adapt_weight[scale] * key_scale + self.key_adapt_bias[scale]
+
+                value_scale = init_value[..., index, :]  # (bs, num_patch_new_scale, dim)
+                value[..., index, :] = self.value_adapt_weight[scale] * value_scale + self.value_adapt_bias[scale]
 
-        # # ToDo: Plan B: Directly apply different Film on query / key to different scales. w/o revising RoPE
+
+
+        # # Apply a different transformation for each dimension. All tokens share the same transformation.
         # if self.num_new_scales is not None:
         #     index_by_variate = self.get_token_index_by_variate(query_var_id)
-        #
+
        #     for scale in range(self.num_new_scales):
         #         assert torch.equal(query_var_id, kv_var_id), "query_var_id is different from kv_var_id"
         #         index = index_by_variate[scale + 1]
-        #
+
         #         query_scale = query[..., index, :]  # (bs, num_patch_new_scale, dim)
         #         query_scale_reprs = self.film_controller(torch.mean(query_scale, dim=1))
-        #         query_weight = self.query_film_generator[scale](query_scale_reprs)
-        #         query[..., index, :] = query_weight.unsqueeze(-2) * query_scale
-        #
+        #         query_adapt_weight = self.query_film_generator[scale](query_scale_reprs)  # (bs, dim)
+        #         query[..., index, :] = query_adapt_weight.unsqueeze(-2) * query_scale
+
         #         key_scale = key[..., index, :]
         #         key_scale_reprs = self.film_controller(torch.mean(key_scale, dim=1))
-        #         key_weight = self.key_film_generator[scale](key_scale_reprs)
-        #         key[..., index, :] = key_weight.unsqueeze(-2) * key_scale
+        #         key_adapt_weight = self.key_film_generator[scale](key_scale_reprs)
+        #         key[..., index, :] = key_adapt_weight.unsqueeze(-2) * key_scale
+
+        # # Apply a different transformation for each token. All dimensions of a token share the same transformation.
        # if self.num_new_scales is not None:
        #     index_by_variate = self.get_token_index_by_variate(query_var_id)
        #
@@ -331,15 +390,17 @@ def forward(
        #         index = index_by_variate[scale+1]
        #         query_scale = query[..., index, :]  # (bs, num_patch_new_scale, dim)
        #         query_film_out = self.query_film_generator[scale](torch.mean(query_scale, dim=1))  # ToDo: try flatten instead?
-        #         query_weight, query_bias = query_film_out[:, :int(query_film_out.size(-1) / 2)], query_film_out[:, int(query_film_out.size(-1) / 2):]
-        #         query[..., index, :] = query_weight.unsqueeze(-1) * query_scale + query_bias.unsqueeze(-1)
+        #         query_adapt_weight, query_adapt_bias = query_film_out[:, :int(query_film_out.size(-1) / 2)], query_film_out[:, int(query_film_out.size(-1) / 2):]
+        #         query[..., index, :] = query_adapt_weight.unsqueeze(-1) * query_scale + query_adapt_bias.unsqueeze(-1)
        #
        #         key_scale = key[..., index, :]
        #         key_film_out = self.key_film_generator[scale](torch.mean(key_scale, dim=1))
-        #         key_weight, key_bias = key_film_out[:, :int(key_film_out.size(-1) / 2)], key_film_out[:,
-        #                                int(key_film_out.size(
-        #                                    -1) / 2):]
-        #         key[..., index, :] = key_weight.unsqueeze(-1) * key_scale + key_bias.unsqueeze(-1)
+        #         key_adapt_weight, key_adapt_bias = key_film_out[:, :int(key_film_out.size(-1) / 2)], key_film_out[:,
+        #                                            int(key_film_out.size(
+        #                                                -1) / 2):]
+        #         key[..., index, :] = key_adapt_weight.unsqueeze(-1) * key_scale + key_adapt_bias.unsqueeze(-1)
+
+
        query = self.q_norm(
            rearrange(