From 00af8b0d9b43eaa3f8c812e3e8bd53ef38cbb49d Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Fri, 27 Sep 2024 08:57:19 -0500 Subject: [PATCH] Llama 3.2 weights (#1750) --- README.md | 13 +- config_hub/finetune/README.md | 12 +- config_hub/finetune/llama-3.2-1B/full.yaml | 112 ++++++++++++++++ config_hub/finetune/llama-3.2-1B/lora.yaml | 136 +++++++++++++++++++ config_hub/finetune/llama-3.2-1B/qlora.yaml | 138 ++++++++++++++++++++ config_hub/finetune/llama-3.2-3B/full.yaml | 112 ++++++++++++++++ config_hub/finetune/llama-3.2-3B/lora.yaml | 136 +++++++++++++++++++ config_hub/finetune/llama-3.2-3B/qlora.yaml | 138 ++++++++++++++++++++ litgpt/config.py | 38 ++++++ tests/test_model.py | 2 + tutorials/download_model_weights.md | 7 +- 11 files changed, 835 insertions(+), 9 deletions(-) create mode 100644 config_hub/finetune/llama-3.2-1B/full.yaml create mode 100644 config_hub/finetune/llama-3.2-1B/lora.yaml create mode 100644 config_hub/finetune/llama-3.2-1B/qlora.yaml create mode 100644 config_hub/finetune/llama-3.2-3B/full.yaml create mode 100644 config_hub/finetune/llama-3.2-3B/lora.yaml create mode 100644 config_hub/finetune/llama-3.2-3B/qlora.yaml diff --git a/README.md b/README.md index 5623f43c8c..f2e8616164 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ Every model is written from scratch to maximize performance and remove layers of | Model | Model size | Author | Reference | |----|----|----|----| -| Llama 3 & 3.1 | 8B, 70B, 405B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) | +| Llama 3, 3.1, 3.2 | 1B, 3B, 8B, 70B, 405B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) | | Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 
2023](https://arxiv.org/abs/2308.12950) | | Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) | | Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) | @@ -125,6 +125,7 @@ Every model is written from scratch to maximize performance and remove layers of | Gemma 2 | 9B, 27B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf) | | Llama 2 | 7B, 13B, 70B | Meta AI | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) | | Llama 3.1 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) | +| Llama 3.2 | 1B, 3B | Meta AI | [Meta AI 2024](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/) | | LongChat | 7B, 13B | LMSYS | [LongChat Team 2023](https://lmsys.org/blog/2023-06-29-longchat/) | | Mathstral | 7B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mathstral/) | | MicroLlama | 300M | Ken Wang | [MicroLlama repo](https://github.com/keeeeenw/MicroLlama) | @@ -173,11 +174,11 @@ After installing LitGPT, select the model and workflow to run (finetune, pretrai ```bash # ligpt [action] [model] -litgpt serve meta-llama/Meta-Llama-3.1-8B-Instruct -litgpt finetune meta-llama/Meta-Llama-3.1-8B-Instruct -litgpt pretrain meta-llama/Meta-Llama-3.1-8B-Instruct -litgpt chat meta-llama/Meta-Llama-3.1-8B-Instruct -litgpt evaluate meta-llama/Meta-Llama-3.1-8B-Instruct +litgpt serve meta-llama/Llama-3.2-3B-Instruct +litgpt finetune meta-llama/Llama-3.2-3B-Instruct +litgpt pretrain meta-llama/Llama-3.2-3B-Instruct +litgpt chat meta-llama/Llama-3.2-3B-Instruct +litgpt evaluate meta-llama/Llama-3.2-3B-Instruct ```   diff --git a/config_hub/finetune/README.md b/config_hub/finetune/README.md index 38f87c0197..e12fa79031 100644 --- a/config_hub/finetune/README.md +++ b/config_hub/finetune/README.md @@ -23,7 +23,7 @@ All experiments were conducted using bfloat-16 precision on the 
Alpaca2k dataset | gemma-7b/qlora.yaml | gemma-7b | 2 | 512 | 1 | 1xA10G | 43.58 min | $1.3 | 17.18 GB | 0.973 | 2.646 | 62.45% | | | | | | | | | | | | | | | gemma2-2b/lora.yaml | gemma-2b | 2 | 512 | 2 | 1xA10G | 11.96 min | $0.4 | 14.31 GB | 0.951 | 2.589 | 23.84% | -| gemma2b/qlora.yaml | gemma-2b | 2 | 512 | 2 | 1xA10G | 16.06 min | $0.5 | 13.52 GB | 0.983 | 2.673 | 24.12% | +| gemma2b/qlora.yaml | gemma-2b | 2 | 512 | 2 | 1xA10G | 16.06 min | $0.5 | 13.52 GB | 0.983 | 2.673 | 24.12% | | | | | | | | | | | | | | | gemma2-9b/lora.yaml | gemma-2-9b | 2 | 512 | 1 | 1xA10G | OOM | OOM | OOM | OOM | OOM | | | gemma2-9b/lora.yaml | gemma-2-9b | 2 | 512 | 1 | 4xA10G | OOM | OOM | OOM | OOM | OOM | | @@ -38,11 +38,19 @@ All experiments were conducted using bfloat-16 precision on the Alpaca2k dataset | llama-3-8b/lora.yaml | llama-3-8b | 2 | 512 | 1 | 1xA10G | 14.79 min | $0.4 | 19.73 GB | 0.888 | 2.431 | 62.4% | | llama-3-8b/lora.yaml | llama-3-8b | 2 | 512 | 1 | 4xA10G | 14.88 min | $1.2 | 19.73 GB | 0.889 | 2.432 | 62.5% | | llama-3-8b/qlora.yaml | llama-3-8b | 2 | 512 | 2 | 1xA10G | 22.24 min | $0.7 | 17.41 GB | 0.939 | 2.558 | 62.2% | -| | | | | | | | | | | | | +| | | | | | | | | | | | | | llama-3.1-8b/full.yaml | llama-3.1-8b | 1 | 512 | 4 | 1xA10G | OOM | OOM | OOM | OOM | OOM | OOM | | llama-3.1-8b/lora.yaml | llama-3.1-8b | 2 | 512 | 1 | 1xA10G | 13.36 min | $1.1 | 19.73 GB | 0.878 | 2.406 | xx.xx | | llama-3.1-8b/qlora.yaml | llama-3.1-8b | 2 | 512 | 2 | 1xA10G | 21.81 min | $0.7 | 17.41 GB | 0.928 | 2.529 | xx.xx | | | | | | | | | | | | | | +| llama-3.2-1b/full.yaml | llama-3.2-1b | 1 | 512 | 4 | 1xA10G | 2.01 min | $0.1 | 8.70 GB | 1.442 | 4.229 | 38.21% | +| llama-3.2-1b/lora.yaml | llama-3.2-1b | 2 | 512 | 1 | 1xA10G | 4.17 min | $0.4 | 4.49 GB | 1.114 | 3.046 | 36.87% | +| llama-3.2-1b/qlora.yaml | llama-3.2-1b | 2 | 512 | 2 | 1xA10G | 6.20 min | $0.6 | 5.53 GB | 1.201 | 3.322 | 36.49% | +| | | | | | | | | | | | | +| llama-3.2-3b/full.yaml | llama-3.2-3b | 
1 | 512 | 4 | 1xA10G | 4.71 min | $0.4 | 16.51 GB | 1.255 | 3.509 | 54.69% | +| llama-3.2-3b/lora.yaml | llama-3.2-3b | 2 | 512 | 1 | 1xA10G | 8.31 min | $0.8 | 9.67 GB | 0.973 | 2.647 | 54.77% | +| llama-3.2-3b/qlora.yaml | llama-3.2-3b | 2 | 512 | 2 | 1xA10G | 14.89 min | $1.4 | 10.30 GB | 1.031 | 2.804 | 55.08% | +| | | | | | | | | | | | | | mistral-7b-v0.2/lora.yaml | mistral-7b-v0.2 | 4 | 512 | 2 | 1xA10G | 31.00 min | $0.9 | 20.66 GB | 0.801 | 2.228 | 55.7% | | mistral-7b-v0.2/lora.yaml | mistral-7b-v0.2 | 4 | 512 | 2 | 4xA10G | 31.00 min | $2.5 | 20.66 GB | 0.802 | 2.229 | 55.5% | | mistral-7b-v0.2/qlora.yaml | mistral-7b-v0.2 | 4 | 512 | 2 | 1xA10G | 44.75 min | $1.3 | 14.29 GB | 0.813 | 2.255 | 56.5% | diff --git a/config_hub/finetune/llama-3.2-1B/full.yaml b/config_hub/finetune/llama-3.2-1B/full.yaml new file mode 100644 index 0000000000..4db804e26f --- /dev/null +++ b/config_hub/finetune/llama-3.2-1B/full.yaml @@ -0,0 +1,112 @@ + +# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/meta-llama/Llama-3.2-1B + +# Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) +out_dir: out/finetune/full-llama-3.2-1B + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# How many devices/GPUs to use (type: Union[int, str], default: 1) +devices: 1 + +# How many nodes to use. (type: int, default: 1) +num_nodes: 1 + +# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume +# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing +# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. +# (type: Union[bool, Literal["auto"], Path], default: False) +# resume: false + +# Data-related arguments. 
If not provided, the default is ``litgpt.data.Alpaca``. +data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + +# Training-related arguments. See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 200 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) + global_batch_size: 64 + + # Number of samples per data-parallel rank (type: int, default: 1) + micro_batch_size: 4 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 25 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 1 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run. (type: Optional[int], default: null) + max_steps: + + # Limits the length of samples. Off by default (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) + tie_embeddings: + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 600) + interval: 25 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + + # Whether to evaluate on the validation set at the end of the training + final_validation: true + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. (type: int, default: 1337) +seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.1 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-3.2-1B/lora.yaml b/config_hub/finetune/llama-3.2-1B/lora.yaml new file mode 100644 index 0000000000..4d7f04af5b --- /dev/null +++ b/config_hub/finetune/llama-3.2-1B/lora.yaml @@ -0,0 +1,136 @@ + +# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/meta-llama/Llama-3.2-1B + +# Directory in which to save checkpoints and logs. (type: , default: out/lora) +out_dir: out/finetune/lora-llama-3.2-1B + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) +quantize: + +# How many devices/GPUs to use. 
(type: Union[int, str], default: 1) +devices: 1 + +# How many nodes to use. (type: int, default: 1) +num_nodes: 1 + +# The LoRA rank. (type: int, default: 8) +lora_r: 32 + +# The LoRA alpha. (type: int, default: 16) +lora_alpha: 16 + +# The LoRA dropout value. (type: float, default: 0.05) +lora_dropout: 0.05 + +# Whether to apply LoRA to the query weights in attention. (type: bool, default: True) +lora_query: true + +# Whether to apply LoRA to the key weights in attention. (type: bool, default: False) +lora_key: false + +# Whether to apply LoRA to the value weights in attention. (type: bool, default: True) +lora_value: true + +# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) +lora_projection: false + +# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) +lora_mlp: false + +# Whether to apply LoRA to output head in GPT. (type: bool, default: False) +lora_head: false + +# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. +data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + +# Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 200 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) + global_batch_size: 8 + + # Number of samples per data-parallel rank (type: int, default: 4) + micro_batch_size: 1 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 10 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 2 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run. (type: Optional[int], default: null) + max_steps: + + # Limits the length of samples. Off by default (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) + tie_embeddings: + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 100) + interval: 100 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + + # Whether to evaluate on the validation set at the end the training + final_validation: true + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. 
(type: int, default: 1337) +seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-3.2-1B/qlora.yaml b/config_hub/finetune/llama-3.2-1B/qlora.yaml new file mode 100644 index 0000000000..ebaac201f3 --- /dev/null +++ b/config_hub/finetune/llama-3.2-1B/qlora.yaml @@ -0,0 +1,138 @@ + +# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/meta-llama/Llama-3.2-1B + +# Directory in which to save checkpoints and logs. (type: , default: out/lora) +out_dir: out/finetune/qlora-llama3.2-1b + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) +quantize: bnb.nf4 + +# How many devices/GPUs to use. (type: Union[int, str], default: 1) +devices: 1 + +# How many nodes to use. (type: int, default: 1) +num_nodes: 1 + +# The LoRA rank. (type: int, default: 8) +lora_r: 32 + +# The LoRA alpha. (type: int, default: 16) +lora_alpha: 16 + +# The LoRA dropout value. (type: float, default: 0.05) +lora_dropout: 0.05 + +# Whether to apply LoRA to the query weights in attention. (type: bool, default: True) +lora_query: true + +# Whether to apply LoRA to the key weights in attention. (type: bool, default: False) +lora_key: false + +# Whether to apply LoRA to the value weights in attention. (type: bool, default: True) +lora_value: true + +# Whether to apply LoRA to the output projection in the attention block. 
(type: bool, default: False) +lora_projection: false + +# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) +lora_mlp: false + +# Whether to apply LoRA to output head in GPT. (type: bool, default: False) +lora_head: false + +# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. +data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + val_split_fraction: 0.05 + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + download_dir: data/alpaca2k + +# Training-related arguments. See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 200 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) + global_batch_size: 8 + + # Number of samples per data-parallel rank (type: int, default: 4) + micro_batch_size: 2 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 10 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 2 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run (type: Optional[int], default: null) + max_steps: + + # Limits the length of samples (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null) + tie_embeddings: + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 100) + interval: 100 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + + # Whether to evaluate on the validation set at the end of the training + final_validation: true + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. (type: int, default: 1337) +seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-3.2-3B/full.yaml b/config_hub/finetune/llama-3.2-3B/full.yaml new file mode 100644 index 0000000000..6bbf99a836 --- /dev/null +++ b/config_hub/finetune/llama-3.2-3B/full.yaml @@ -0,0 +1,112 @@ + +# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/meta-llama/Llama-3.2-3B + +# Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) +out_dir: out/finetune/full-llama-3.2-3B + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# How many devices/GPUs to use (type: Union[int, str], default: 1) +devices: 1 + +# How many nodes to use. 
(type: int, default: 1) +num_nodes: 1 + +# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume +# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing +# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. +# (type: Union[bool, Literal["auto"], Path], default: False) +# resume: false + +# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. +data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + +# Training-related arguments. See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 200 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) + global_batch_size: 64 + + # Number of samples per data-parallel rank (type: int, default: 1) + micro_batch_size: 4 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 25 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 1 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run. (type: Optional[int], default: null) + max_steps: + + # Limits the length of samples. Off by default (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) + tie_embeddings: + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 600) + interval: 25 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + + # Whether to evaluate on the validation set at the end of the training + final_validation: true + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. (type: int, default: 1337) +seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.1 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-3.2-3B/lora.yaml b/config_hub/finetune/llama-3.2-3B/lora.yaml new file mode 100644 index 0000000000..328149cb4e --- /dev/null +++ b/config_hub/finetune/llama-3.2-3B/lora.yaml @@ -0,0 +1,136 @@ + +# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/meta-llama/Llama-3.2-3B + +# Directory in which to save checkpoints and logs. (type: , default: out/lora) +out_dir: out/finetune/lora-llama-3.2-3B + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) +quantize: + +# How many devices/GPUs to use. 
(type: Union[int, str], default: 1) +devices: 1 + +# How many nodes to use. (type: int, default: 1) +num_nodes: 1 + +# The LoRA rank. (type: int, default: 8) +lora_r: 32 + +# The LoRA alpha. (type: int, default: 16) +lora_alpha: 16 + +# The LoRA dropout value. (type: float, default: 0.05) +lora_dropout: 0.05 + +# Whether to apply LoRA to the query weights in attention. (type: bool, default: True) +lora_query: true + +# Whether to apply LoRA to the key weights in attention. (type: bool, default: False) +lora_key: false + +# Whether to apply LoRA to the value weights in attention. (type: bool, default: True) +lora_value: true + +# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) +lora_projection: false + +# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) +lora_mlp: false + +# Whether to apply LoRA to output head in GPT. (type: bool, default: False) +lora_head: false + +# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. +data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + +# Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 200 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) + global_batch_size: 8 + + # Number of samples per data-parallel rank (type: int, default: 4) + micro_batch_size: 1 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 10 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 2 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run. (type: Optional[int], default: null) + max_steps: + + # Limits the length of samples. Off by default (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) + tie_embeddings: + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 100) + interval: 100 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + + # Whether to evaluate on the validation set at the end the training + final_validation: true + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. 
(type: int, default: 1337) +seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-3.2-3B/qlora.yaml b/config_hub/finetune/llama-3.2-3B/qlora.yaml new file mode 100644 index 0000000000..8ad674a58e --- /dev/null +++ b/config_hub/finetune/llama-3.2-3B/qlora.yaml @@ -0,0 +1,138 @@ + +# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/meta-llama/Llama-3.2-3B + +# Directory in which to save checkpoints and logs. (type: , default: out/lora) +out_dir: out/finetune/qlora-llama3.2-3b + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) +quantize: bnb.nf4 + +# How many devices/GPUs to use. (type: Union[int, str], default: 1) +devices: 1 + +# How many nodes to use. (type: int, default: 1) +num_nodes: 1 + +# The LoRA rank. (type: int, default: 8) +lora_r: 32 + +# The LoRA alpha. (type: int, default: 16) +lora_alpha: 16 + +# The LoRA dropout value. (type: float, default: 0.05) +lora_dropout: 0.05 + +# Whether to apply LoRA to the query weights in attention. (type: bool, default: True) +lora_query: true + +# Whether to apply LoRA to the key weights in attention. (type: bool, default: False) +lora_key: false + +# Whether to apply LoRA to the value weights in attention. (type: bool, default: True) +lora_value: true + +# Whether to apply LoRA to the output projection in the attention block. 
(type: bool, default: False) +lora_projection: false + +# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) +lora_mlp: false + +# Whether to apply LoRA to output head in GPT. (type: bool, default: False) +lora_head: false + +# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. +data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + val_split_fraction: 0.05 + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + download_dir: data/alpaca2k + +# Training-related arguments. See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 200 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) + global_batch_size: 8 + + # Number of samples per data-parallel rank (type: int, default: 4) + micro_batch_size: 2 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 10 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 2 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run (type: Optional[int], default: null) + max_steps: + + # Limits the length of samples (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null) + tie_embeddings: + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 100) + interval: 100 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + + # Whether to evaluate on the validation set at the end of the training + final_validation: true + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. (type: int, default: 1337) +seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/litgpt/config.py b/litgpt/config.py index 5052ab117b..e047252d74 100644 --- a/litgpt/config.py +++ b/litgpt/config.py @@ -951,6 +951,44 @@ def norm_class(self) -> Type: intermediate_size=53248, rope_base=500000, ), + # https://huggingface.co/meta-llama/Llama-3.2-1B/blob/main/config.json + dict( + name="Llama-3.2-1B{}", + hf_config=dict(org="meta-llama", name="Llama-3.2-1B{}"), + block_size=8192, + vocab_size=128000, + padded_vocab_size=128256, + n_layer=16, + n_embd=2048, + n_head=32, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=8192, + rope_base=500000, + ), + # https://huggingface.co/meta-llama/Llama-3.2-3B/blob/main/config.json + dict( + name="Llama-3.2-3B{}", + hf_config=dict(org="meta-llama", name="Llama-3.2-3B{}"), + block_size=8192, + vocab_size=128000, + padded_vocab_size=128256, + n_layer=28, + n_embd=3072, + n_head=24, + n_query_groups=8, + 
rotary_percentage=1.0, + parallel_residual=False, + bias=False, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=8192, + rope_base=500000, + ), ] for c in llama_3: for kind in ("", "-Instruct"): diff --git a/tests/test_model.py b/tests/test_model.py index 245f08dcf8..c62d9c5cdd 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -218,6 +218,8 @@ def test_against_original_open_llama_3b(device, dtype): {"name": "Llama-3.1-405B", "n_query_groups": 4}, {"name": "Llama-3.1-8B"}, {"name": "Llama-3.1-8B-Instruct"}, + {"name": "Llama-3.2-1B"}, + {"name": "Llama-3.2-3B"}, ], ) @pytest.mark.parametrize( diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md index 074aa808be..174ebf2700 100644 --- a/tutorials/download_model_weights.md +++ b/tutorials/download_model_weights.md @@ -18,7 +18,8 @@ LitGPT supports a variety of LLM architectures with publicly available weights. | Gemma 2 | 2B, 9B, 27B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf) | | Llama 2 | 7B, 13B, 70B | Meta AI | [Touvron et al. 
2023](https://arxiv.org/abs/2307.09288) | | Llama 3 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) | -| Llama 3.1 | 8B, 70B, 405B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) | +| Llama 3.1 | 8B, 70B, 405B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) | +| Llama 3.2 | 1B, 3B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/MODEL_CARD.md) | | LongChat | 7B, 13B | LMSYS | [LongChat Team 2023](https://lmsys.org/blog/2023-06-29-longchat/) | | Mathstral | 7B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mathstral/) | | MicroLlama | 300M | Ken Wang | [MicroLlama repo](https://github.com/keeeeenw/MicroLlama) @@ -120,6 +121,10 @@ meta-llama/Llama-2-70b-chat-hf meta-llama/Llama-2-70b-hf meta-llama/Llama-2-7b-chat-hf meta-llama/Llama-2-7b-hf +meta-llama/Llama-3.2-1B +meta-llama/Llama-3.2-1B-Instruct +meta-llama/Llama-3.2-3B +meta-llama/Llama-3.2-3B-Instruct meta-llama/Meta-Llama-3-70B meta-llama/Meta-Llama-3-70B-Instruct meta-llama/Meta-Llama-3-8B