From 641af95f87dd507f4f0432452764bb04a43a1029 Mon Sep 17 00:00:00 2001
From: rasbt
Date: Thu, 26 Sep 2024 19:13:37 +0000
Subject: [PATCH 1/4] Llama 3.2 weights

---
 README.md                                   |  13 +-
 config_hub/finetune/README.md               |  12 +-
 config_hub/finetune/llama-3.2-1B/full.yaml  | 112 ++++++++++++++++
 config_hub/finetune/llama-3.2-1B/lora.yaml  | 136 +++++++++++++++++++
 config_hub/finetune/llama-3.2-1B/qlora.yaml | 138 ++++++++++++++++++++
 config_hub/finetune/llama-3.2-3B/full.yaml  | 112 ++++++++++++++++
 config_hub/finetune/llama-3.2-3B/lora.yaml  | 136 +++++++++++++++++++
 config_hub/finetune/llama-3.2-3B/qlora.yaml | 138 ++++++++++++++++++++
 litgpt/config.py                            |  40 ++++++
 9 files changed, 829 insertions(+), 8 deletions(-)
 create mode 100644 config_hub/finetune/llama-3.2-1B/full.yaml
 create mode 100644 config_hub/finetune/llama-3.2-1B/lora.yaml
 create mode 100644 config_hub/finetune/llama-3.2-1B/qlora.yaml
 create mode 100644 config_hub/finetune/llama-3.2-3B/full.yaml
 create mode 100644 config_hub/finetune/llama-3.2-3B/lora.yaml
 create mode 100644 config_hub/finetune/llama-3.2-3B/qlora.yaml

diff --git a/README.md b/README.md
index 5623f43c8c..f2e8616164 100644
--- a/README.md
+++ b/README.md
@@ -96,7 +96,7 @@ Every model is written from scratch to maximize performance and remove layers of
 | Model | Model size | Author | Reference |
 |----|----|----|----|
-| Llama 3 & 3.1 | 8B, 70B, 405B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
+| Llama 3, 3.1, 3.2 | 1B, 3B, 8B, 70B, 405B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
 | Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) |
 | Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
 | Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
@@ -125,6 +125,7 @@ Every model is written from scratch to maximize performance and remove layers of
 | Gemma 2 | 9B, 27B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf) |
 | Llama 2 | 7B, 13B, 70B | Meta AI | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) |
 | Llama 3.1 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
+| Llama 3.2 | 1B, 3B | Meta AI | [Meta AI 2024](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/) |
 | LongChat | 7B, 13B | LMSYS | [LongChat Team 2023](https://lmsys.org/blog/2023-06-29-longchat/) |
 | Mathstral | 7B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mathstral/) |
 | MicroLlama | 300M | Ken Wang | [MicroLlama repo](https://github.com/keeeeenw/MicroLlama) |
@@ -173,11 +174,11 @@ After installing LitGPT, select the model and workflow to run (finetune, pretrai
 ```bash
 # ligpt [action] [model]
-litgpt serve meta-llama/Meta-Llama-3.1-8B-Instruct
-litgpt finetune meta-llama/Meta-Llama-3.1-8B-Instruct
-litgpt pretrain meta-llama/Meta-Llama-3.1-8B-Instruct
-litgpt chat meta-llama/Meta-Llama-3.1-8B-Instruct
-litgpt evaluate meta-llama/Meta-Llama-3.1-8B-Instruct
+litgpt serve meta-llama/Llama-3.2-3B-Instruct
+litgpt finetune meta-llama/Llama-3.2-3B-Instruct
+litgpt pretrain meta-llama/Llama-3.2-3B-Instruct
+litgpt chat meta-llama/Llama-3.2-3B-Instruct
+litgpt evaluate meta-llama/Llama-3.2-3B-Instruct
 ```

diff --git a/config_hub/finetune/README.md b/config_hub/finetune/README.md
index 38f87c0197..b5fbd5660e 100644
--- a/config_hub/finetune/README.md
+++ b/config_hub/finetune/README.md
@@ -23,7 +23,7 @@ All experiments were conducted using bfloat-16 precision on the Alpaca2k dataset
 | gemma-7b/qlora.yaml | gemma-7b | 2 | 512 | 1 | 1xA10G | 43.58 min | $1.3 | 17.18 GB | 0.973 | 2.646 | 62.45% |
 | | | | | | | | | | | | |
 | gemma2-2b/lora.yaml | gemma-2b | 2 | 512 | 2 | 1xA10G | 11.96 min | $0.4 | 14.31 GB | 0.951 | 2.589 | 23.84% |
-| gemma2b/qlora.yaml | gemma-2b | 2 | 512 | 2 | 1xA10G | 16.06 min | $0.5 | 13.52 GB | 0.983 | 2.673 | 24.12% |
+| gemma2b/qlora.yaml | gemma-2b | 2 | 512 | 2 | 1xA10G | 16.06 min | $0.5 | 13.52 GB | 0.983 | 2.673 | 24.12% |
 | | | | | | | | | | | | |
 | gemma2-9b/lora.yaml | gemma-2-9b | 2 | 512 | 1 | 1xA10G | OOM | OOM | OOM | OOM | OOM | |
 | gemma2-9b/lora.yaml | gemma-2-9b | 2 | 512 | 1 | 4xA10G | OOM | OOM | OOM | OOM | OOM | |
@@ -38,11 +38,19 @@ All experiments were conducted using bfloat-16 precision on the Alpaca2k dataset
 | llama-3-8b/lora.yaml | llama-3-8b | 2 | 512 | 1 | 1xA10G | 14.79 min | $0.4 | 19.73 GB | 0.888 | 2.431 | 62.4% |
 | llama-3-8b/lora.yaml | llama-3-8b | 2 | 512 | 1 | 4xA10G | 14.88 min | $1.2 | 19.73 GB | 0.889 | 2.432 | 62.5% |
 | llama-3-8b/qlora.yaml | llama-3-8b | 2 | 512 | 2 | 1xA10G | 22.24 min | $0.7 | 17.41 GB | 0.939 | 2.558 | 62.2% |
-| | | | | | | | | | | | |
+| | | | | | | | | | | | |
 | llama-3.1-8b/full.yaml | llama-3.1-8b | 1 | 512 | 4 | 1xA10G | OOM | OOM | OOM | OOM | OOM | OOM |
 | llama-3.1-8b/lora.yaml | llama-3.1-8b | 2 | 512 | 1 | 1xA10G | 13.36 min | $1.1 | 19.73 GB | 0.878 | 2.406 | xx.xx |
 | llama-3.1-8b/qlora.yaml | llama-3.1-8b | 2 | 512 | 2 | 1xA10G | 21.81 min | $0.7 | 17.41 GB | 0.928 | 2.529 | xx.xx |
 | | | | | | | | | | | | |
+| llama-3.2-1b/full.yaml | llama-3.2-1b | 1 | 512 | 4 | 1xA10G | 2.01 min | $0.1 | 8.70 GB | 1.442 | 4.229 | 38.21% |
+| llama-3.2-1b/lora.yaml | llama-3.2-1b | 2 | 512 | 1 | 1xA10G | xx.xx min | $0.1 | xx.xx GB | x.xxx | x.xxx | xx.xx |
+| llama-3.2-1b/qlora.yaml | llama-3.2-1b | 2 | 512 | 2 | 1xA10G | xx.xx min | $0.1 | xx.xx GB | x.xxx | x.xxx | x.xxx |
+| | | | | | | | | | | | |
+| llama-3.2-3b/full.yaml | llama-3.2-3b | 1 | 512 | 4 | 1xA10G | xx.xx min | $0.2 | xx.xx GB | x.xxx | x.xxx | x.xxx |
+|
llama-3.2-3b/lora.yaml | llama-3.2-3b | 2 | 512 | 1 | 1xA10G | xx.xx min | $0.2 | xx.xx GB | x.xxx | x.xxx | x.xxx | +| llama-3.2-3b/qlora.yaml | llama-3.2-3b | 2 | 512 | 2 | 1xA10G | xx.xx min | $0.2 | xx.xx GB | x.xxx | x.xxx | x.xxx | +| | | | | | | | | | | | | | mistral-7b-v0.2/lora.yaml | mistral-7b-v0.2 | 4 | 512 | 2 | 1xA10G | 31.00 min | $0.9 | 20.66 GB | 0.801 | 2.228 | 55.7% | | mistral-7b-v0.2/lora.yaml | mistral-7b-v0.2 | 4 | 512 | 2 | 4xA10G | 31.00 min | $2.5 | 20.66 GB | 0.802 | 2.229 | 55.5% | | mistral-7b-v0.2/qlora.yaml | mistral-7b-v0.2 | 4 | 512 | 2 | 1xA10G | 44.75 min | $1.3 | 14.29 GB | 0.813 | 2.255 | 56.5% | diff --git a/config_hub/finetune/llama-3.2-1B/full.yaml b/config_hub/finetune/llama-3.2-1B/full.yaml new file mode 100644 index 0000000000..4db804e26f --- /dev/null +++ b/config_hub/finetune/llama-3.2-1B/full.yaml @@ -0,0 +1,112 @@ + +# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/meta-llama/Llama-3.2-1B + +# Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) +out_dir: out/finetune/full-llama-3.2-1B + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# How many devices/GPUs to use (type: Union[int, str], default: 1) +devices: 1 + +# How many nodes to use. (type: int, default: 1) +num_nodes: 1 + +# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume +# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing +# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. +# (type: Union[bool, Literal["auto"], Path], default: False) +# resume: false + +# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. +data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + +# Training-related arguments. See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 200 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) + global_batch_size: 64 + + # Number of samples per data-parallel rank (type: int, default: 1) + micro_batch_size: 4 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 25 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 1 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run. (type: Optional[int], default: null) + max_steps: + + # Limits the length of samples. Off by default (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) + tie_embeddings: + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 600) + interval: 25 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + + # Whether to evaluate on the validation set at the end the training + final_validation: true + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. (type: int, default: 1337) +seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.1 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-3.2-1B/lora.yaml b/config_hub/finetune/llama-3.2-1B/lora.yaml new file mode 100644 index 0000000000..4d7f04af5b --- /dev/null +++ b/config_hub/finetune/llama-3.2-1B/lora.yaml @@ -0,0 +1,136 @@ + +# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/meta-llama/Llama-3.2-1B + +# Directory in which to save checkpoints and logs. (type: , default: out/lora) +out_dir: out/finetune/lora-llama-3.2-1B + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) +quantize: + +# How many devices/GPUs to use. (type: Union[int, str], default: 1) +devices: 1 + +# How many nodes to use. (type: int, default: 1) +num_nodes: 1 + +# The LoRA rank. (type: int, default: 8) +lora_r: 32 + +# The LoRA alpha. (type: int, default: 16) +lora_alpha: 16 + +# The LoRA dropout value. (type: float, default: 0.05) +lora_dropout: 0.05 + +# Whether to apply LoRA to the query weights in attention. (type: bool, default: True) +lora_query: true + +# Whether to apply LoRA to the key weights in attention. (type: bool, default: False) +lora_key: false + +# Whether to apply LoRA to the value weights in attention. (type: bool, default: True) +lora_value: true + +# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) +lora_projection: false + +# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) +lora_mlp: false + +# Whether to apply LoRA to output head in GPT. (type: bool, default: False) +lora_head: false + +# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. +data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + +# Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 200 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) + global_batch_size: 8 + + # Number of samples per data-parallel rank (type: int, default: 4) + micro_batch_size: 1 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 10 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 2 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run. (type: Optional[int], default: null) + max_steps: + + # Limits the length of samples. Off by default (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) + tie_embeddings: + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 100) + interval: 100 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + + # Whether to evaluate on the validation set at the end the training + final_validation: true + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. (type: int, default: 1337) +seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-3.2-1B/qlora.yaml b/config_hub/finetune/llama-3.2-1B/qlora.yaml new file mode 100644 index 0000000000..ebaac201f3 --- /dev/null +++ b/config_hub/finetune/llama-3.2-1B/qlora.yaml @@ -0,0 +1,138 @@ + +# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/meta-llama/Llama-3.2-1B + +# Directory in which to save checkpoints and logs. (type: , default: out/lora) +out_dir: out/finetune/qlora-llama3.2-1b + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) +quantize: bnb.nf4 + +# How many devices/GPUs to use. (type: Union[int, str], default: 1) +devices: 1 + +# How many nodes to use. (type: int, default: 1) +num_nodes: 1 + +# The LoRA rank. (type: int, default: 8) +lora_r: 32 + +# The LoRA alpha. (type: int, default: 16) +lora_alpha: 16 + +# The LoRA dropout value. 
(type: float, default: 0.05) +lora_dropout: 0.05 + +# Whether to apply LoRA to the query weights in attention. (type: bool, default: True) +lora_query: true + +# Whether to apply LoRA to the key weights in attention. (type: bool, default: False) +lora_key: false + +# Whether to apply LoRA to the value weights in attention. (type: bool, default: True) +lora_value: true + +# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) +lora_projection: false + +# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) +lora_mlp: false + +# Whether to apply LoRA to output head in GPT. (type: bool, default: False) +lora_head: false + +# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. +data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + val_split_fraction: 0.05 + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + download_dir: data/alpaca2k + +# Training-related arguments. See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 200 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) + global_batch_size: 8 + + # Number of samples per data-parallel rank (type: int, default: 4) + micro_batch_size: 2 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 10 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 2 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run (type: Optional[int], default: null) + max_steps: + + # Limits the length of samples (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null) + tie_embeddings: + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 100) + interval: 100 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + + # Whether to evaluate on the validation set at the end the training + final_validation: true + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. 
(type: int, default: 1337) +seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-3.2-3B/full.yaml b/config_hub/finetune/llama-3.2-3B/full.yaml new file mode 100644 index 0000000000..6bbf99a836 --- /dev/null +++ b/config_hub/finetune/llama-3.2-3B/full.yaml @@ -0,0 +1,112 @@ + +# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/meta-llama/Llama-3.2-3B + +# Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) +out_dir: out/finetune/full-llama-3.2-3B + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# How many devices/GPUs to use (type: Union[int, str], default: 1) +devices: 1 + +# How many nodes to use. (type: int, default: 1) +num_nodes: 1 + +# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume +# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing +# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. +# (type: Union[bool, Literal["auto"], Path], default: False) +# resume: false + +# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. +data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + +# Training-related arguments. See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 200 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) + global_batch_size: 64 + + # Number of samples per data-parallel rank (type: int, default: 1) + micro_batch_size: 4 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 25 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 1 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run. (type: Optional[int], default: null) + max_steps: + + # Limits the length of samples. Off by default (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) + tie_embeddings: + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 600) + interval: 25 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + + # Whether to evaluate on the validation set at the end the training + final_validation: true + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. (type: int, default: 1337) +seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.1 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-3.2-3B/lora.yaml b/config_hub/finetune/llama-3.2-3B/lora.yaml new file mode 100644 index 0000000000..328149cb4e --- /dev/null +++ b/config_hub/finetune/llama-3.2-3B/lora.yaml @@ -0,0 +1,136 @@ + +# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/meta-llama/Llama-3.2-3B + +# Directory in which to save checkpoints and logs. (type: , default: out/lora) +out_dir: out/finetune/lora-llama-3.2-3B + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) +quantize: + +# How many devices/GPUs to use. (type: Union[int, str], default: 1) +devices: 1 + +# How many nodes to use. (type: int, default: 1) +num_nodes: 1 + +# The LoRA rank. (type: int, default: 8) +lora_r: 32 + +# The LoRA alpha. (type: int, default: 16) +lora_alpha: 16 + +# The LoRA dropout value. (type: float, default: 0.05) +lora_dropout: 0.05 + +# Whether to apply LoRA to the query weights in attention. (type: bool, default: True) +lora_query: true + +# Whether to apply LoRA to the key weights in attention. (type: bool, default: False) +lora_key: false + +# Whether to apply LoRA to the value weights in attention. (type: bool, default: True) +lora_value: true + +# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) +lora_projection: false + +# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) +lora_mlp: false + +# Whether to apply LoRA to output head in GPT. (type: bool, default: False) +lora_head: false + +# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. +data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + +# Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 200 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) + global_batch_size: 8 + + # Number of samples per data-parallel rank (type: int, default: 4) + micro_batch_size: 1 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 10 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 2 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run. (type: Optional[int], default: null) + max_steps: + + # Limits the length of samples. Off by default (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) + tie_embeddings: + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 100) + interval: 100 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + + # Whether to evaluate on the validation set at the end the training + final_validation: true + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. (type: int, default: 1337) +seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-3.2-3B/qlora.yaml b/config_hub/finetune/llama-3.2-3B/qlora.yaml new file mode 100644 index 0000000000..8ad674a58e --- /dev/null +++ b/config_hub/finetune/llama-3.2-3B/qlora.yaml @@ -0,0 +1,138 @@ + +# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/meta-llama/Llama-3.2-3B + +# Directory in which to save checkpoints and logs. (type: , default: out/lora) +out_dir: out/finetune/qlora-llama3.2-3b + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) +quantize: bnb.nf4 + +# How many devices/GPUs to use. (type: Union[int, str], default: 1) +devices: 1 + +# How many nodes to use. (type: int, default: 1) +num_nodes: 1 + +# The LoRA rank. (type: int, default: 8) +lora_r: 32 + +# The LoRA alpha. (type: int, default: 16) +lora_alpha: 16 + +# The LoRA dropout value. 
(type: float, default: 0.05) +lora_dropout: 0.05 + +# Whether to apply LoRA to the query weights in attention. (type: bool, default: True) +lora_query: true + +# Whether to apply LoRA to the key weights in attention. (type: bool, default: False) +lora_key: false + +# Whether to apply LoRA to the value weights in attention. (type: bool, default: True) +lora_value: true + +# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) +lora_projection: false + +# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) +lora_mlp: false + +# Whether to apply LoRA to output head in GPT. (type: bool, default: False) +lora_head: false + +# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. +data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + val_split_fraction: 0.05 + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + download_dir: data/alpaca2k + +# Training-related arguments. See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 200 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) + global_batch_size: 8 + + # Number of samples per data-parallel rank (type: int, default: 4) + micro_batch_size: 2 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 10 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 2 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run (type: Optional[int], default: null) + max_steps: + + # Limits the length of samples (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null) + tie_embeddings: + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 100) + interval: 100 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + + # Whether to evaluate on the validation set at the end the training + final_validation: true + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. 
(type: int, default: 1337)
+seed: 1337
+
+# Optimizer-related arguments
+optimizer:
+
+  class_path: torch.optim.AdamW
+
+  init_args:
+
+    # (type: float, default: 0.001)
+    lr: 0.0002
+
+    # (type: float, default: 0.01)
+    weight_decay: 0.0
+
+    # (type: tuple, default: (0.9,0.999))
+    betas:
+      - 0.9
+      - 0.95

diff --git a/litgpt/config.py b/litgpt/config.py
index 5052ab117b..78d76fc9e0 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -951,6 +951,46 @@ def norm_class(self) -> Type:
         intermediate_size=53248,
         rope_base=500000,
     ),
+    # https://huggingface.co/meta-llama/Llama-3.2-1B/blob/main/config.json
+    dict(
+        name="Llama-3.2-1B{}",
+        hf_config=dict(org="meta-llama", name="Llama-3.2-1B{}"),
+        block_size=8192,
+        vocab_size=128000,
+        padded_vocab_size=128256,
+        n_layer=16,
+        n_embd=2048,
+        n_head=32,
+        head_size=64,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=8192,
+        rope_base=500000,
+    ),
+    # https://huggingface.co/meta-llama/Llama-3.2-3B/blob/main/config.json
+    dict(
+        name="Llama-3.2-3B{}",
+        hf_config=dict(org="meta-llama", name="Llama-3.2-3B{}"),
+        block_size=8192,
+        vocab_size=128000,
+        padded_vocab_size=128256,
+        n_layer=28,
+        n_embd=3072,
+        n_head=24,
+        head_size=128,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=8192,
+        rope_base=500000,
+    ),
 ]
 for c in llama_3:
     for kind in ("", "-Instruct"):

From 32b5f95f6e5a9ba2876dba03e2f241b5b21ab35a Mon Sep 17 00:00:00 2001
From: rasbt
Date: Thu, 26 Sep 2024 22:19:45 +0000
Subject: [PATCH 2/4] update

---
 config_hub/finetune/README.md       | 10 +++++-----
 tests/test_model.py                 |  4 ++++
 tutorials/download_model_weights.md |  7 ++++++-
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/config_hub/finetune/README.md b/config_hub/finetune/README.md
index b5fbd5660e..e12fa79031 100644
--- a/config_hub/finetune/README.md
+++ b/config_hub/finetune/README.md
@@ -44,12 +44,12 @@ All experiments were conducted using bfloat-16 precision on the Alpaca2k dataset
 | llama-3.1-8b/qlora.yaml | llama-3.1-8b | 2 | 512 | 2 | 1xA10G | 21.81 min | $0.7 | 17.41 GB | 0.928 | 2.529 | xx.xx |
 | | | | | | | | | | | | |
 | llama-3.2-1b/full.yaml | llama-3.2-1b | 1 | 512 | 4 | 1xA10G | 2.01 min | $0.1 | 8.70 GB | 1.442 | 4.229 | 38.21% |
-| llama-3.2-1b/lora.yaml | llama-3.2-1b | 2 | 512 | 1 | 1xA10G | xx.xx min | $0.1 | xx.xx GB | x.xxx | x.xxx | xx.xx |
-| llama-3.2-1b/qlora.yaml | llama-3.2-1b | 2 | 512 | 2 | 1xA10G | xx.xx min | $0.1 | xx.xx GB | x.xxx | x.xxx | x.xxx |
+| llama-3.2-1b/lora.yaml | llama-3.2-1b | 2 | 512 | 1 | 1xA10G | 4.17 min | $0.4 | 4.49 GB | 1.114 | 3.046 | 36.87% |
+| llama-3.2-1b/qlora.yaml | llama-3.2-1b | 2 | 512 | 2 | 1xA10G | 6.20 min | $0.6 | 5.53 GB | 1.201 | 3.322 | 36.49% |
 | | | | | | | | | | | | |
-| llama-3.2-3b/full.yaml | llama-3.2-3b | 1 | 512 | 4 | 1xA10G | xx.xx min | $0.2 | xx.xx GB | x.xxx | x.xxx | x.xxx |
-| llama-3.2-3b/lora.yaml | llama-3.2-3b | 2 | 512 | 1 | 1xA10G | xx.xx min | $0.2 | xx.xx GB | x.xxx | x.xxx | x.xxx |
-| llama-3.2-3b/qlora.yaml | llama-3.2-3b | 2 | 512 | 2 | 1xA10G | xx.xx min | $0.2 | xx.xx GB | x.xxx | x.xxx | x.xxx |
+| llama-3.2-3b/full.yaml | llama-3.2-3b | 1 | 512 | 4 | 1xA10G | 4.71 min | $0.4 | 16.51 GB | 1.255 | 3.509 | 54.69% |
+| llama-3.2-3b/lora.yaml | llama-3.2-3b | 2 | 512 | 1 | 1xA10G | 8.31 min | $0.8 | 9.67 GB | 0.973 | 2.647 | 54.77% |
+| llama-3.2-3b/qlora.yaml | llama-3.2-3b | 2 | 512 | 2 | 1xA10G | 14.89 min | $1.4 | 10.30 GB | 1.031 | 2.804 | 55.08% |
 | | | | | | | | | | | | |
 | mistral-7b-v0.2/lora.yaml | mistral-7b-v0.2 | 4 | 512 | 2 | 1xA10G | 31.00 min | $0.9 | 20.66 GB | 0.801 | 2.228 | 55.7% |
 | mistral-7b-v0.2/lora.yaml | mistral-7b-v0.2 | 4 | 512 | 2 | 4xA10G | 31.00 min | $2.5 | 20.66 GB | 0.802 | 2.229 | 55.5% |
 | mistral-7b-v0.2/qlora.yaml | mistral-7b-v0.2 | 4 | 512 | 2 | 1xA10G | 44.75 min | $1.3 | 14.29 GB | 0.813 | 2.255 | 56.5% |

diff --git a/tests/test_model.py b/tests/test_model.py
index 245f08dcf8..834c91137d 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -218,6 +218,10 @@ def test_against_original_open_llama_3b(device, dtype):
         {"name": "Llama-3.1-405B", "n_query_groups": 4},
         {"name": "Llama-3.1-8B"},
         {"name": "Llama-3.1-8B-Instruct"},
+        {"name": "Llama-3.2-1B"},
+        {"name": "Llama-3.2-1B-Instruct"},
+        {"name": "Llama-3.2-3B"},
+        {"name": "Llama-3.2-3B-Instruct"},
     ],
 )
 @pytest.mark.parametrize(

diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md
index 074aa808be..174ebf2700 100644
--- a/tutorials/download_model_weights.md
+++ b/tutorials/download_model_weights.md
@@ -18,7 +18,8 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | Gemma 2 | 2B, 9B, 27B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf) |
 | Llama 2 | 7B, 13B, 70B | Meta AI | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) |
 | Llama 3 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
-| Llama 3.1 | 8B, 70B, 405B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
+| Llama 3.1 | 8B, 70B, 405B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
+| Llama 3.2 | 1B, 3B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/MODEL_CARD.md) |
 | LongChat | 7B, 13B | LMSYS | [LongChat Team 2023](https://lmsys.org/blog/2023-06-29-longchat/) |
 | Mathstral | 7B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mathstral/) |
 | MicroLlama | 300M | Ken Wang | [MicroLlama repo](https://github.com/keeeeenw/MicroLlama)
@@ -120,6 +121,10 @@ meta-llama/Llama-2-70b-chat-hf
 meta-llama/Llama-2-70b-hf
 meta-llama/Llama-2-7b-chat-hf
 meta-llama/Llama-2-7b-hf
+meta-llama/Llama-3.2-1B
+meta-llama/Llama-3.2-1B-Instruct
+meta-llama/Llama-3.2-3B
+meta-llama/Llama-3.2-3B-Instruct
 meta-llama/Meta-Llama-3-70B
 meta-llama/Meta-Llama-3-70B-Instruct
 meta-llama/Meta-Llama-3-8B

From 21fdf0894e1ed6675524bf9a2b7055277392b4ff Mon Sep 17 00:00:00 2001
From: rasbt
Date: Fri, 27 Sep 2024 13:39:47 +0000
Subject: [PATCH 3/4] fix tests

---
 litgpt/config.py    | 4 ++--
 tests/test_model.py | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/litgpt/config.py b/litgpt/config.py
index 78d76fc9e0..658eff105f 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -961,7 +961,7 @@ def norm_class(self) -> Type:
         n_layer=16,
         n_embd=2048,
         n_head=32,
-        head_size=64,
+        # head_size=64,
         n_query_groups=8,
         rotary_percentage=1.0,
         parallel_residual=False,
@@ -981,7 +981,7 @@ def norm_class(self) -> Type:
         n_layer=28,
         n_embd=3072,
         n_head=24,
-        head_size=128,
+        # head_size=128,
         n_query_groups=8,
         rotary_percentage=1.0,
         parallel_residual=False,
diff --git a/tests/test_model.py b/tests/test_model.py
index 834c91137d..c62d9c5cdd 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -219,9 +219,7 @@ def test_against_original_open_llama_3b(device, dtype):
         {"name": "Llama-3.1-8B"},
         {"name": "Llama-3.1-8B-Instruct"},
         {"name": "Llama-3.2-1B"},
-        {"name": "Llama-3.2-1B-Instruct"},
         {"name": "Llama-3.2-3B"},
-        {"name": "Llama-3.2-3B-Instruct"},
     ],
 )
 @pytest.mark.parametrize(

From b1a76f1a9a32c293b8fc857db8c331bfd39b04cb Mon Sep 17 00:00:00 2001
From: rasbt
Date: Fri, 27 Sep 2024 13:42:51 +0000
Subject: [PATCH 4/4] clean up config

---
 litgpt/config.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/litgpt/config.py b/litgpt/config.py
index 658eff105f..e047252d74 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -961,7 +961,6 @@ def norm_class(self) -> Type:
         n_layer=16,
         n_embd=2048,
         n_head=32,
-        # head_size=64,
         n_query_groups=8,
         rotary_percentage=1.0,
         parallel_residual=False,
@@ -981,7 +980,6 @@ def norm_class(self) -> Type:
         n_layer=28,
         n_embd=3072,
         n_head=24,
-        # head_size=128,
         n_query_groups=8,
         rotary_percentage=1.0,
         parallel_residual=False,