From c957ea3831924652495fb79ccfb1a46b655e7fcf Mon Sep 17 00:00:00 2001
From: Qiyuan Gong <qiyuan.gong@intel.com>
Date: Tue, 14 May 2024 13:43:59 +0800
Subject: [PATCH] Add axolotl main support and axolotl Llama-3-8B QLoRA example
  (#10984)

* Support axolotl main (796a085).
* Add axolotl Llama-3-8B QLoRA example.
* Change `sequence_len` to 256 for alpaca, and revert `lora_r` value.
* Add example to quick_start.
---
 .../doc/LLM/Quickstart/axolotl_quickstart.md  | 96 ++++++++++++++++++-
 .../GPU/LLM-Finetuning/axolotl/README.md      | 67 ++++++++++++-
 .../LLM-Finetuning/axolotl/llama3-qlora.yml   | 72 ++++++++++++++
 .../GPU/LLM-Finetuning/axolotl/lora.yml       |  4 +-
 .../GPU/LLM-Finetuning/axolotl/qlora.yml      |  4 +-
 5 files changed, 233 insertions(+), 10 deletions(-)
 create mode 100644 python/llm/example/GPU/LLM-Finetuning/axolotl/llama3-qlora.yml

diff --git a/docs/readthedocs/source/doc/LLM/Quickstart/axolotl_quickstart.md b/docs/readthedocs/source/doc/LLM/Quickstart/axolotl_quickstart.md
index afbdc7c9234..8c3a28e18dd 100644
--- a/docs/readthedocs/source/doc/LLM/Quickstart/axolotl_quickstart.md
+++ b/docs/readthedocs/source/doc/LLM/Quickstart/axolotl_quickstart.md
@@ -134,7 +134,7 @@ Modify LoRA parameters, such as `lora_r` and `lora_alpha`, etc.
 adapter: lora
 lora_model_dir:
 
-lora_r: 16
+lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
@@ -178,7 +178,7 @@ Modify QLoRA parameters, such as `lora_r` and `lora_alpha`, etc.
 adapter: qlora
 lora_model_dir:
 
-lora_r: 16
+lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules:
@@ -198,6 +198,98 @@ In Axolotl v0.4.0, you can use `train.py` instead of `-m axolotl.cli.train` or `
 accelerate launch train.py qlora.yml
 ```
 
+### 3. Finetune Llama-3-8B (Experimental)
+
+Warning: this section will install axolotl main ([796a085](https://github.com/OpenAccess-AI-Collective/axolotl/tree/796a085b2f688f4a5efe249d95f53ff6833bf009)) for new features, e.g., Llama-3-8B.
+
+#### 3.1 Install Axolotl main in conda
+
+Axolotl main has lots of new dependencies. Please set up a new conda env for this version.
+
+```cmd
+conda create -n llm python=3.11
+conda activate llm
+# install axolotl main
+git clone https://github.com/OpenAccess-AI-Collective/axolotl
+cd axolotl && git checkout 796a085
+pip install -e .
+# the command below installs intel_extension_for_pytorch==2.1.10+xpu by default
+pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+# install transformers etc
+pip install accelerate==0.23.0
+# to avoid https://github.com/OpenAccess-AI-Collective/axolotl/issues/1544
+pip install datasets==2.15.0
+pip install transformers==4.37.0
+```
+
+Configure accelerate and oneAPI according to [Set Environment Variables](#22-set-environment-variables).
+
+#### 3.2 Alpaca QLoRA
+
+Based on [axolotl Llama-3 QLoRA example](https://github.com/OpenAccess-AI-Collective/axolotl/blob/main/examples/llama-3/qlora.yml).
+
+Prepare `llama3-qlora.yml` for QLoRA finetuning. You can download a template from GitHub.
+
+```cmd
+wget https://raw.githubusercontent.com/intel-analytics/ipex-llm/main/python/llm/example/GPU/LLM-Finetuning/axolotl/llama3-qlora.yml
+```
+
+**If you are using an offline model and dataset in a local environment**, please modify the model path and dataset path in `llama3-qlora.yml`. Otherwise, keep them unchanged.
+
+```yaml
+# Please change to local path if model is offline, e.g., /path/to/model/Meta-Llama-3-8B
+base_model: meta-llama/Meta-Llama-3-8B
+datasets:
+  # Please change to local path if dataset is offline, e.g., /path/to/dataset/alpaca_subset_1
+  - path: aaditya/alpaca_subset_1
+    type: alpaca
+```
+
+Modify QLoRA parameters, such as `lora_r` and `lora_alpha`, etc.
+
+```yaml
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 256
+sample_packing: true
+pad_to_sequence_len: true
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+```
+
+```cmd
+accelerate launch finetune.py llama3-qlora.yml
+```
+
+You can also use `train.py` instead of `-m axolotl.cli.train` or `finetune.py`.
+
+```cmd
+accelerate launch train.py llama3-qlora.yml
+```
+
+Expected output
+
+```cmd
+{'loss': 0.237, 'learning_rate': 1.2254711850265387e-06, 'epoch': 3.77}
+{'loss': 0.6068, 'learning_rate': 1.1692453482951115e-06, 'epoch': 3.77}
+{'loss': 0.2926, 'learning_rate': 1.1143322458989303e-06, 'epoch': 3.78}
+{'loss': 0.2475, 'learning_rate': 1.0607326072295087e-06, 'epoch': 3.78}
+{'loss': 0.1531, 'learning_rate': 1.008447144232094e-06, 'epoch': 3.79}
+{'loss': 0.1799, 'learning_rate': 9.57476551396197e-07, 'epoch': 3.79}
+{'loss': 0.2724, 'learning_rate': 9.078215057463868e-07, 'epoch': 3.79}
+{'loss': 0.2534, 'learning_rate': 8.594826668332445e-07, 'epoch': 3.8}
+{'loss': 0.3388, 'learning_rate': 8.124606767246579e-07, 'epoch': 3.8}
+{'loss': 0.3867, 'learning_rate': 7.667561599972505e-07, 'epoch': 3.81}
+{'loss': 0.2108, 'learning_rate': 7.223697237281668e-07, 'epoch': 3.81}
+{'loss': 0.0792, 'learning_rate': 6.793019574868775e-07, 'epoch': 3.82}
+```
+
 ## Troubleshooting
 
 #### TypeError: PosixPath
diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/README.md b/python/llm/example/GPU/LLM-Finetuning/axolotl/README.md
index 773202e6974..7a019e7f01b 100644
--- a/python/llm/example/GPU/LLM-Finetuning/axolotl/README.md
+++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/README.md
@@ -69,13 +69,13 @@ This example shows how to run [Alpaca LoRA training](https://github.com/tloen/al
 
 Based on [axolotl Llama-2 LoRA example](https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/examples/llama-2/lora.yml).
 
-```
+```bash
 accelerate launch finetune.py lora.yml
 ```
 
 In v0.4.0, you can also use `train.py` instead of `-m axolotl.cli.train` or `finetune.py`.
 
-```
+```bash
 accelerate launch train.py lora.yml
 ```
 
@@ -85,13 +85,13 @@ Based on [axolotl Llama-2 QLoRA example](https://github.com/OpenAccess-AI-Collec
 
 Modify parameters in `qlora.yml` based on your requirements. Then, launch finetuning with the following command.
 
-```
+```bash
 accelerate launch finetune.py qlora.yml
 ```
 
 In v0.4.0, you can also use `train.py` instead of `-m axolotl.cli.train` or `finetune.py`.
 
-```
+```bash
 accelerate launch train.py qlora.yml
 ```
 
@@ -113,3 +113,62 @@ Output in console
 {'loss': 0.9651, 'learning_rate': 0.00019189578116202307, 'epoch': 0.54}
 {'loss': 0.9067, 'learning_rate': 0.00019107766703887764, 'epoch': 0.56}
 ```
+
+### 4. Finetune Llama-3-8B (Experimental)
+
+Warning: this section will install axolotl main ([796a085](https://github.com/OpenAccess-AI-Collective/axolotl/tree/796a085b2f688f4a5efe249d95f53ff6833bf009)) for new features, e.g., Llama-3-8B.
+
+#### 4.1 Install Axolotl main in conda
+
+Axolotl main has lots of new dependencies. Please set up a new conda env for this version.
+
+```bash
+conda create -n llm python=3.11
+conda activate llm
+# install axolotl main
+git clone https://github.com/OpenAccess-AI-Collective/axolotl
+cd axolotl && git checkout 796a085
+pip install -e .
+# the command below installs intel_extension_for_pytorch==2.1.10+xpu by default
+pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+# install transformers etc
+pip install accelerate==0.23.0
+# to avoid https://github.com/OpenAccess-AI-Collective/axolotl/issues/1544
+pip install datasets==2.15.0
+pip install transformers==4.37.0
+```
+
+Configure accelerate and oneAPI according to [Configures OneAPI environment variables and accelerate](#2-configures-oneapi-environment-variables-and-accelerate).
+
+#### 4.2 Alpaca QLoRA
+
+Based on [axolotl Llama-3 QLoRA example](https://github.com/OpenAccess-AI-Collective/axolotl/blob/main/examples/llama-3/qlora.yml).
+
+Modify parameters in `llama3-qlora.yml` based on your requirements. Then, launch finetuning with the following command.
+
+```bash
+accelerate launch finetune.py llama3-qlora.yml
+```
+
+You can also use `train.py` instead of `-m axolotl.cli.train` or `finetune.py`.
+
+```bash
+accelerate launch train.py llama3-qlora.yml
+```
+
+Expected output
+
+```bash
+{'loss': 0.237, 'learning_rate': 1.2254711850265387e-06, 'epoch': 3.77}
+{'loss': 0.6068, 'learning_rate': 1.1692453482951115e-06, 'epoch': 3.77}
+{'loss': 0.2926, 'learning_rate': 1.1143322458989303e-06, 'epoch': 3.78}
+{'loss': 0.2475, 'learning_rate': 1.0607326072295087e-06, 'epoch': 3.78}
+{'loss': 0.1531, 'learning_rate': 1.008447144232094e-06, 'epoch': 3.79}
+{'loss': 0.1799, 'learning_rate': 9.57476551396197e-07, 'epoch': 3.79}
+{'loss': 0.2724, 'learning_rate': 9.078215057463868e-07, 'epoch': 3.79}
+{'loss': 0.2534, 'learning_rate': 8.594826668332445e-07, 'epoch': 3.8}
+{'loss': 0.3388, 'learning_rate': 8.124606767246579e-07, 'epoch': 3.8}
+{'loss': 0.3867, 'learning_rate': 7.667561599972505e-07, 'epoch': 3.81}
+{'loss': 0.2108, 'learning_rate': 7.223697237281668e-07, 'epoch': 3.81}
+{'loss': 0.0792, 'learning_rate': 6.793019574868775e-07, 'epoch': 3.82}
+```
diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/llama3-qlora.yml b/python/llm/example/GPU/LLM-Finetuning/axolotl/llama3-qlora.yml
new file mode 100644
index 00000000000..401f4c10445
--- /dev/null
+++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/llama3-qlora.yml
@@ -0,0 +1,72 @@
+# This file is adapted from https://github.com/OpenAccess-AI-Collective/axolotl/blob/main/examples/llama-3/qlora.yml
+base_model: meta-llama/Meta-Llama-3-8B
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+  - path: aaditya/alpaca_subset_1
+    type: alpaca
+dataset_prepared_path:
+val_set_size: 0
+output_dir: ./qlora-out
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 256
+sample_packing: true
+pad_to_sequence_len: true
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 4
+# paged_adamw_32bit is not supported
+# due to bitsandbytes issue https://github.com/TimDettmers/bitsandbytes/issues/1180
+# optimizer: paged_adamw_32bit
+optimizer: adamw_torch
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+# flash_attention is not supported
+flash_attention: false
+
+warmup_steps: 10
+evals_per_epoch: 4
+eval_table_size:
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+  pad_token: "<|end_of_text|>"
diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/lora.yml b/python/llm/example/GPU/LLM-Finetuning/axolotl/lora.yml
index 20cd8f73568..b77612c7476 100644
--- a/python/llm/example/GPU/LLM-Finetuning/axolotl/lora.yml
+++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/lora.yml
@@ -14,13 +14,13 @@ dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./lora-out
 
-sequence_len: 4096
+sequence_len: 256
 sample_packing: true
 pad_to_sequence_len: true
 
 adapter: lora
 lora_model_dir:
-lora_r: 16
+lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml b/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml
index b76eeae2d5d..b18efd4ed28 100644
--- a/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml
+++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml
@@ -18,11 +18,11 @@ output_dir: ./qlora-out
 adapter: qlora
 lora_model_dir:
 
-sequence_len: 4096
+sequence_len: 256
 sample_packing: true
 pad_to_sequence_len: true
 
-lora_r: 16
+lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules: