From 369158caf22b96affcc32b8e1c9d4d466f0bf03a Mon Sep 17 00:00:00 2001
From: ZhuangXialie <62231346+ZhuangXialie@users.noreply.github.com>
Date: Wed, 24 Apr 2024 22:31:07 +0800
Subject: [PATCH 1/2] add max_length and max_prompt_length

If these arguments are not exposed, only the default values are used and
they cannot be changed from the command line.

---
 orpo_training.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/orpo_training.py b/orpo_training.py
index 4a7055c..49dcbd3 100644
--- a/orpo_training.py
+++ b/orpo_training.py
@@ -45,6 +45,11 @@ class ScriptArguments:
         The name of the Casual LM model we wish to fine with DPO
     """
     # Model arguments
+
+    max_length: Optional[int] = field(default=512,
+                                      metadata={"help": "Maximum total input sequence length after tokenization."})
+    max_prompt_length: Optional[int] = field(default=128, metadata={"help": "Maximum length of prompt sequences."})
+
     model_type: str = field(
         default=None,
         metadata={"help": "Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())}
@@ -415,6 +420,8 @@ def main():
         model.config.use_cache = True
 
     training_args = ORPOConfig(
+        max_length=args.max_length,
+        max_prompt_length=args.max_prompt_length,
         per_device_train_batch_size=args.per_device_train_batch_size,
         per_device_eval_batch_size=args.per_device_eval_batch_size,
         max_steps=args.max_steps,

From 9aa010f429c5cd42a863749a946f6bbb03ba0514 Mon Sep 17 00:00:00 2001
From: ZhuangXialie <62231346+ZhuangXialie@users.noreply.github.com>
Date: Sun, 28 Apr 2024 11:00:52 +0800
Subject: [PATCH 2/2] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index be9e47e..7caf9ff 100644
--- a/README.md
+++ b/README.md
@@ -249,6 +249,7 @@ CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node 2 inference_multigpu_demo.py
 - 80万条中文ChatGPT多轮对话数据集:[BelleGroup/multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
 - 116万条中文ChatGPT多轮对话数据集:[fnlp/moss-002-sft-data](https://huggingface.co/datasets/fnlp/moss-002-sft-data)
 - 3.8万条中文ShareGPT多轮对话数据集:[FreedomIntelligence/ShareGPT-CN](https://huggingface.co/datasets/FreedomIntelligence/ShareGPT-CN)
+- 中文微调数据集汇总:[zhuangxialie/Llama3-Chinese-Dataset](https://modelscope.cn/datasets/zhuangxialie/Llama3-Chinese-Dataset/dataPeview) [本项目支持格式]
 
 #### Reward Model datasets
 - 原版的oasst1数据集:[OpenAssistant/oasst1](https://huggingface.co/datasets/OpenAssistant/oasst1)
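
Illustration (not part of the patches above): the sketch below shows how the newly
exposed max_length and max_prompt_length options are expected to flow from the
command line into the ORPO trainer config. It assumes the script parses its
ScriptArguments dataclass with transformers' HfArgumentParser; the output_dir
value and the __main__ scaffolding are placeholders, not code from the repository.

# Minimal sketch (not the repository's actual orpo_training.py): it assumes
# ScriptArguments is parsed with transformers' HfArgumentParser and that
# trl's ORPOConfig is built as in the second hunk of PATCH 1/2.
from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser
from trl import ORPOConfig


@dataclass
class ScriptArguments:
    # The two fields added by PATCH 1/2; declaring them here is what makes the
    # lengths overridable from the command line instead of being fixed at the
    # ORPOConfig defaults.
    max_length: Optional[int] = field(
        default=512,
        metadata={"help": "Maximum total input sequence length after tokenization."},
    )
    max_prompt_length: Optional[int] = field(
        default=128,
        metadata={"help": "Maximum length of prompt sequences."},
    )


if __name__ == "__main__":
    # Example invocation: python orpo_training.py --max_length 1024 --max_prompt_length 256
    (args,) = HfArgumentParser(ScriptArguments).parse_args_into_dataclasses()
    training_args = ORPOConfig(
        output_dir="outputs-orpo",  # placeholder path, not taken from the repo
        max_length=args.max_length,  # forwarded exactly as the patch does
        max_prompt_length=args.max_prompt_length,
    )
    print(training_args.max_length, training_args.max_prompt_length)

Running the sketch without flags keeps the defaults (512 and 128), matching the
behavior described in the commit message of PATCH 1/2.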