From 300db7efb40415ac0c94f1e17898defeed8666c7 Mon Sep 17 00:00:00 2001
From: XU Ke
Date: Sat, 20 Apr 2024 22:19:07 +0800
Subject: [PATCH] Fix a typo of s^2 attn.

---
 docs/training_params.md  | 10 +++++-----
 supervised_finetuning.py | 12 ++++++------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/docs/training_params.md b/docs/training_params.md
index 118102d..88c42fb 100644
--- a/docs/training_params.md
+++ b/docs/training_params.md
@@ -3,7 +3,7 @@
 
 - 第一阶段:PT(Continue PreTraining)增量预训练 `run_pt.sh`
 - 第二阶段:SFT(Supervised Fine-tuning)有监督微调 `run_sft.sh`
-- 第三阶段 
+- 第三阶段
 - RLHF(Reinforcement Learning from Human Feedback)分为两步:
   - RM(Reward Model)奖励模型建模 `run_rm.sh`
   - RL(Reinforcement Learning)基于人类反馈的强化学习 `run_ppo.sh`
@@ -23,7 +23,7 @@
 9. PT和SFT支持qlora训练,如果使用的是 RTX4090、A100 或 H100 GPU,支持nf4,使用`--qlora True --load_in_4bit True`参数启用qlora训练,开启qlora训练,会减少显存占用,训练加速,同时建议设置`--torch_dtype bfloat16 --optim paged_adamw_32bit`保证训练精度
 10. 扩词表后的增量预训练,PT阶段加上`--modules_to_save embed_tokens,lm_head`参数,后续SFT等阶段不用加
 11. 新增了RoPE插值来扩展GPT模型的上下文长度,通过[位置插值方法](https://arxiv.org/abs/2306.15595),在增量数据上进行训练,使模型获得长文本处理能力,使用 `--rope_scaling linear` 参数训练模型,使用`--rope_scaling dynamic` 参数预测模型
-12. 针对LLaMA模型支持了[FlashAttention-2](https://github.com/Dao-AILab/flash-attention),如果您使用的是 RTX4090、A100 或 H100 GPU,SFT中请使用 `--flash_attn` 参数以启用 FlashAttention-2
+12. 针对LLaMA模型支持了[FlashAttention-2](https://github.com/Dao-AILab/flash-attention),如果您使用的是 RTX3090、RTX4090、A100 或 H100 GPU,SFT中请使用 `--flash_attn` 参数以启用 FlashAttention-2
 13. 新增了[LongLoRA](https://github.com/dvlab-research/LongLoRA) 提出的 **$S^2$-Attn**,使模型获得长文本处理能力,SFT中使用 `--shift_attn` 参数以启用该功能
 14. 支持了[NEFTune](https://github.com/neelsjain/NEFTune)给embedding加噪SFT训练方法,[NEFTune paper](https://arxiv.org/abs/2310.05914), SFT中使用 `--neft_alpha` 参数启用 NEFTune,例如 `--neft_alpha 5`
 15. 支持微调Mixtral混合专家MoE模型 **[Mixtral 8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)**,SFT中如果用lora微调模型,可以开启4bit量化和QLoRA`--load_in_4bit True --qlora True`以节省显存,建议设置`--target_modules q_proj,k_proj,v_proj,o_proj`,这样可以避免对MoE专家网络的MLP层量化,因为它们很稀疏且量化后会导致性能效果下降。
@@ -33,8 +33,8 @@
 
 默认使用LoRA训练,每个stage的LoRA模型权重都需要合并到base model中,使用以下命令合并,下一个stage的`model_name_or_path`指定为合并后的模型文件夹。
 
-LoRA layers were using at all stages to reduce memory requirements. 
-At each stage the peft adapter layers were merged with the base model, using: 
+LoRA layers were using at all stages to reduce memory requirements.
+At each stage the peft adapter layers were merged with the base model, using:
 ```shell
 python merge_peft_adapter.py \
     --base_model base_model_dir \
@@ -98,7 +98,7 @@
 node_rank=$1
 echo ${node_rank}
 master_addr="10.111.112.223"
-torchrun --nproc_per_node 8 --nnodes 2 --master_addr ${master_addr} --master_port 14545 --node_rank ${node_rank} run_supervised_finetuning.py ... 
+torchrun --nproc_per_node 8 --nnodes 2 --master_addr ${master_addr} --master_port 14545 --node_rank ${node_rank} run_supervised_finetuning.py ...
 ```
 
 
diff --git a/supervised_finetuning.py b/supervised_finetuning.py
index 3d98cf2..efbe7ba 100644
--- a/supervised_finetuning.py
+++ b/supervised_finetuning.py
@@ -148,7 +148,7 @@ class ModelArguments:
     )
     shift_attn: Optional[bool] = field(
         default=False,
-        metadata={"help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA."}
+        metadata={"help": "Enable shifted sparse attention (S^2-Attn) proposed by LongLoRA."}
     )
     neft_alpha: Optional[float] = field(
         default=0,
@@ -1251,16 +1251,16 @@ def filter_empty_labels(example):
                 logger.warning("FlashAttention-2 is not installed.")
         elif model_args.shift_attn and getattr(config, "model_type", None) == "llama":
             logger.warning("Using `--flash_attn` for faster training in large context length, enable if your GPU"
-                           " is RTX4090, A100 or H100.")
+                           " is RTX3090, RTX4090, A100 or H100.")
 
-        # Set shift short attention (S^2-Attn)
+        # Set shifted sparse attention (S^2-Attn)
         if model_args.shift_attn:
             if getattr(config, "model_type", None) == "llama":
                 setattr(config, "group_size_ratio", 0.25)
                 apply_llama_patch()
-                logger.info("Using shift short attention with group_size_ratio=1/4.")
+                logger.info("Using shifted sparse attention with group_size_ratio=1/4.")
             else:
-                logger.warning("Current model does not support shift short attention.")
+                logger.warning("Current model does not support shifted sparse attention.")
 
         load_in_4bit = model_args.load_in_4bit
         load_in_8bit = model_args.load_in_8bit
@@ -1388,7 +1388,7 @@ def fp32_forward_post_hook(module: torch.nn.Module, args: Tuple[torch.Tensor], o
         tokenizer=tokenizer,
         model=model,
         label_pad_token_id=IGNORE_INDEX,
-        pad_to_multiple_of=4 if tokenizer.padding_side == "right" else None,  # for shift short attention
+        pad_to_multiple_of=4 if tokenizer.padding_side == "right" else None,  # for shifted sparse attention
     )
     # Initialize our Trainer
     trainer = SavePeftModelTrainer(
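
For context on the renamed feature: shifted sparse attention (S^2-Attn) from LongLoRA computes attention inside fixed-size token groups and rolls half of the attention heads by half a group, so information still flows across group boundaries; `group_size_ratio=0.25` in the patched code means each group covers a quarter of the sequence. The snippet below is only a minimal sketch of that shift-and-group step under those assumptions; the function name `shift_and_group` and the standalone example are invented here for illustration and are not code from this repository or from LongLoRA.

```python
# Illustrative sketch (not this repository's implementation) of the shift-and-group
# step behind LongLoRA's shifted sparse attention (S^2-Attn).
import torch


def shift_and_group(x: torch.Tensor, group_size_ratio: float = 0.25) -> torch.Tensor:
    """(batch, heads, seq_len, head_dim) -> (batch * num_groups, heads, group_size, head_dim)."""
    bsz, num_heads, seq_len, head_dim = x.shape
    group_size = int(seq_len * group_size_ratio)
    assert group_size > 0 and seq_len % group_size == 0, "seq_len must split evenly into groups"
    num_groups = seq_len // group_size

    x = x.clone()
    # Roll the second half of the heads by half a group along the sequence axis, so their
    # groups straddle the boundaries of the other heads' groups and information can cross.
    x[:, num_heads // 2:] = x[:, num_heads // 2:].roll(-group_size // 2, dims=2)

    # Fold each group into the batch dimension; ordinary (flash) attention applied to the
    # result is then restricted to tokens within the same group.
    x = x.reshape(bsz, num_heads, num_groups, group_size, head_dim)
    return x.permute(0, 2, 1, 3, 4).reshape(bsz * num_groups, num_heads, group_size, head_dim)


if __name__ == "__main__":
    q = torch.randn(2, 8, 4096, 64)   # (batch, heads, seq_len, head_dim)
    print(shift_and_group(q).shape)   # torch.Size([8, 8, 1024, 64]): 4 groups of 1024 tokens
```

This is presumably also why the data collator pads to a multiple of 4 (`# for shifted sparse attention`): with `group_size_ratio=0.25`, a sequence length divisible by 4 splits evenly into groups.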