From 23f12353e32630d513386d1c9d4269edbac02656 Mon Sep 17 00:00:00 2001
From: xigui wang
Date: Thu, 25 Jan 2024 18:17:29 -0800
Subject: [PATCH 1/3] Add yarn scaling parameters

---
 neural_speed/models/model_utils/model_files.h | 6 ++++++
 neural_speed/models/model_utils/model_types.h | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/neural_speed/models/model_utils/model_files.h b/neural_speed/models/model_utils/model_files.h
index 0f99fe898..c17204a94 100644
--- a/neural_speed/models/model_utils/model_files.h
+++ b/neural_speed/models/model_utils/model_files.h
@@ -1099,6 +1099,12 @@ struct model_file_loader {
     file.read_raw(&hparams.rms_norm_eps, sizeof(float));
     file.read_raw(&hparams.freq_base, sizeof(float));
     file.read_raw(&hparams.freq_scale, sizeof(float));
+
+    file.read_raw(&hparams.rope_scaling_factor, sizeof(float));
+    hparams.original_max_position_embeddings = file.read_u32();
+    hparams.use_yarn = bool(file.read_u32());
+    // file.read_raw(&hparams.rope_scaling_factor, sizeof(float));
+
   }

   void read_ne_vocab() {
diff --git a/neural_speed/models/model_utils/model_types.h b/neural_speed/models/model_utils/model_types.h
index e19f0d999..d438dac33 100644
--- a/neural_speed/models/model_utils/model_types.h
+++ b/neural_speed/models/model_utils/model_types.h
@@ -139,6 +139,9 @@ struct model_hparams {
   // ChatGLM-1
   int32_t inner_hidden_size = 0;

+  float rope_scaling_factor = 0.0f;
+  int32_t original_max_position_embeddings = 0;
+  int32_t use_yarn = 0;

   bool operator!=(const model_hparams& other) const {
     return static_cast<bool>(memcmp(this, &other, sizeof(model_hparams)));
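[Editor's note] Patch 1 appends three fields to the serialized hparams block, read immediately after freq_scale: a float rope_scaling_factor, a u32 original_max_position_embeddings, and a u32 use_yarn flag. For reference, a minimal Python sketch of the same read sequence (read_yarn_hparams is a hypothetical helper, not part of the patch; the explicit little-endian "<" layout is an assumption matching the native byte order the convert scripts produce on x86):

    import struct

    def read_yarn_hparams(f):
        # The three fields patch 1 appends after freq_scale, in read order.
        (rope_scaling_factor,) = struct.unpack("<f", f.read(4))               # float
        (original_max_position_embeddings,) = struct.unpack("<I", f.read(4))  # u32
        use_yarn = bool(struct.unpack("<I", f.read(4))[0])                    # u32 flag
        return rope_scaling_factor, original_max_position_embeddings, use_yarn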
From a6b1c1e29795d7bcfd81be49cbab5d85fe98dd51 Mon Sep 17 00:00:00 2001
From: xigui wang
Date: Thu, 25 Jan 2024 18:17:29 -0800
Subject: [PATCH 2/3] Add yarn scaling parameters in convert scripts

---
 neural_speed/convert/convert_baichuan.py      | 4 ++++
 neural_speed/convert/convert_bloom.py         | 4 ++++
 neural_speed/convert/convert_chatglm.py       | 4 ++++
 neural_speed/convert/convert_falcon.py        | 4 ++++
 neural_speed/convert/convert_gptj.py          | 4 ++++
 neural_speed/convert/convert_gptneox.py       | 4 ++++
 neural_speed/convert/convert_llama.py         | 4 ++++
 neural_speed/convert/convert_mistral.py       | 4 ++++
 neural_speed/convert/convert_mpt.py           | 5 +++++
 neural_speed/convert/convert_opt.py           | 4 ++++
 neural_speed/convert/convert_qwen.py          | 4 ++++
 neural_speed/convert/convert_starcoder.py     | 4 ++++
 neural_speed/models/model_utils/model_files.h | 7 ++++---
 13 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/neural_speed/convert/convert_baichuan.py b/neural_speed/convert/convert_baichuan.py
index 44fc1a82a..d7fdca3af 100644
--- a/neural_speed/convert/convert_baichuan.py
+++ b/neural_speed/convert/convert_baichuan.py
@@ -160,6 +160,10 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor

+    fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
+    fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
+    fout.write(struct.pack("i", 0))  # use_yarn: 1 if params["rope_scaling"]["type"] == "yarn" else 0
+
     fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
     fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
     fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
diff --git a/neural_speed/convert/convert_bloom.py b/neural_speed/convert/convert_bloom.py
index 7e2a3f805..bb7e8dd43 100644
--- a/neural_speed/convert/convert_bloom.py
+++ b/neural_speed/convert/convert_bloom.py
@@ -103,6 +103,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor

+    fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
+    fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
+    fout.write(struct.pack("i", 0))  # use_yarn: 1 if params["rope_scaling"]["type"] == "yarn" else 0
+
     fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
     fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
     fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
diff --git a/neural_speed/convert/convert_chatglm.py b/neural_speed/convert/convert_chatglm.py
index 5c89b6b43..2e988139d 100644
--- a/neural_speed/convert/convert_chatglm.py
+++ b/neural_speed/convert/convert_chatglm.py
@@ -363,6 +363,10 @@ def chatglm2_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor

+    fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
+    fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
+    fout.write(struct.pack("i", 0))  # use_yarn: 1 if params["rope_scaling"]["type"] == "yarn" else 0
+
     fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
     fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
     fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
diff --git a/neural_speed/convert/convert_falcon.py b/neural_speed/convert/convert_falcon.py
index c4a92222b..2155acc92 100644
--- a/neural_speed/convert/convert_falcon.py
+++ b/neural_speed/convert/convert_falcon.py
@@ -110,6 +110,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms norm eps
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
+
+    fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
+    fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
+    fout.write(struct.pack("i", 0))  # use_yarn: 1 if params["rope_scaling"]["type"] == "yarn" else 0

     fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
     fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
diff --git a/neural_speed/convert/convert_gptj.py b/neural_speed/convert/convert_gptj.py
index 111941ee5..b0b1b66b3 100644
--- a/neural_speed/convert/convert_gptj.py
+++ b/neural_speed/convert/convert_gptj.py
@@ -102,6 +102,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms norm eps
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
+
+    fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
+    fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
+    fout.write(struct.pack("i", 0))  # use_yarn: 1 if params["rope_scaling"]["type"] == "yarn" else 0

     fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
     fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
diff --git a/neural_speed/convert/convert_gptneox.py b/neural_speed/convert/convert_gptneox.py
index 8c50c006b..a24d3201d 100644
--- a/neural_speed/convert/convert_gptneox.py
+++ b/neural_speed/convert/convert_gptneox.py
@@ -116,6 +116,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms norm eps
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
+
+    fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
+    fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
+    fout.write(struct.pack("i", 0))  # use_yarn: 1 if params["rope_scaling"]["type"] == "yarn" else 0

     fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
     fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py
index 101dd1980..b527c9547 100644
--- a/neural_speed/convert/convert_llama.py
+++ b/neural_speed/convert/convert_llama.py
@@ -1093,6 +1093,10 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
         self.fout.write(struct.pack("f", params.rope_theta))
         self.fout.write(struct.pack("f", params.rope_scale))

+        self.fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
+        self.fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
+        self.fout.write(struct.pack("i", 0))  # use_yarn: 1 if params["rope_scaling"]["type"] == "yarn" else 0
+
         # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
         # but bos_token_id = 1 in llama.cpp
         self.fout.write(struct.pack("i", params.bos_token_id))
diff --git a/neural_speed/convert/convert_mistral.py b/neural_speed/convert/convert_mistral.py
index 3c76e3d04..778c7dd32 100644
--- a/neural_speed/convert/convert_mistral.py
+++ b/neural_speed/convert/convert_mistral.py
@@ -1065,6 +1065,10 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
         self.fout.write(struct.pack("f", params.rope_theta))
         self.fout.write(struct.pack("f", params.rope_scale))

+        self.fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
+        self.fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
+        self.fout.write(struct.pack("i", 0))  # use_yarn: 1 if params["rope_scaling"]["type"] == "yarn" else 0
+
         self.fout.write(
             struct.pack("i", 1)
         )
diff --git a/neural_speed/convert/convert_mpt.py b/neural_speed/convert/convert_mpt.py
index 3a917cc40..fcad21bda 100644
--- a/neural_speed/convert/convert_mpt.py
+++ b/neural_speed/convert/convert_mpt.py
@@ -98,6 +98,11 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms norm eps
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
+
+    fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
+    fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
+    fout.write(struct.pack("i", 0))  # use_yarn: 1 if params["rope_scaling"]["type"] == "yarn" else 0
+
     fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
     fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
     fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
diff --git a/neural_speed/convert/convert_opt.py b/neural_speed/convert/convert_opt.py
index ab26bc538..cc1416250 100644
--- a/neural_speed/convert/convert_opt.py
+++ b/neural_speed/convert/convert_opt.py
@@ -109,6 +109,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms norm eps
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
+
+    fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
+    fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
+    fout.write(struct.pack("i", 0))  # use_yarn: 1 if params["rope_scaling"]["type"] == "yarn" else 0

     fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
     fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
diff --git a/neural_speed/convert/convert_qwen.py b/neural_speed/convert/convert_qwen.py
index 6c089dc45..900d8cfe8 100644
--- a/neural_speed/convert/convert_qwen.py
+++ b/neural_speed/convert/convert_qwen.py
@@ -116,6 +116,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor

+    fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
+    fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
+    fout.write(struct.pack("i", 0))  # use_yarn: 1 if params["rope_scaling"]["type"] == "yarn" else 0
+
     fout.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>']))
     fout.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>']))
     fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
diff --git a/neural_speed/convert/convert_starcoder.py b/neural_speed/convert/convert_starcoder.py
index 759de85c4..11dec1f70 100644
--- a/neural_speed/convert/convert_starcoder.py
+++ b/neural_speed/convert/convert_starcoder.py
@@ -114,6 +114,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor

+    fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
+    fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
+    fout.write(struct.pack("i", 0))  # use_yarn: 1 if params["rope_scaling"]["type"] == "yarn" else 0
+
     fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
     fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
     fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
diff --git a/neural_speed/models/model_utils/model_files.h b/neural_speed/models/model_utils/model_files.h
index c17204a94..22abfc097 100644
--- a/neural_speed/models/model_utils/model_files.h
+++ b/neural_speed/models/model_utils/model_files.h
@@ -1102,9 +1102,7 @@ struct model_file_loader {

     file.read_raw(&hparams.rope_scaling_factor, sizeof(float));
     hparams.original_max_position_embeddings = file.read_u32();
-    hparams.use_yarn = bool(file.read_u32());
-    // file.read_raw(&hparams.rope_scaling_factor, sizeof(float));
-
+    hparams.use_yarn = file.read_u32();
   }

   void read_ne_vocab() {
@@ -1225,6 +1223,9 @@ struct model_file_saver {
     file.write_raw(&hparams.rms_norm_eps, sizeof(float));
     file.write_raw(&hparams.freq_base, sizeof(float));
     file.write_raw(&hparams.freq_scale, sizeof(float));
+    file.write_raw(&hparams.rope_scaling_factor, sizeof(float));
+    file.write_u32(hparams.original_max_position_embeddings);
+    file.write_u32(hparams.use_yarn);
   }
   void write_vocab() {
     if (any_file_loader->file_version == MODEL_FILE_VERSION_NE) {
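[Editor's note] Patch 2 writes placeholder zeros for all three fields in every convert script; the comments point at the config.json "rope_scaling" block the values would come from. A hypothetical sketch of populating them from a model's hparams dict instead of hardcoding zeros (write_rope_scaling is illustrative, not part of the patch; the key names follow the Hugging Face rope_scaling convention the comments reference):

    import struct

    def write_rope_scaling(fout, hparams):
        # Falls back to the same zeros the patch hardcodes when rope_scaling is absent.
        rope_scaling = hparams.get("rope_scaling") or {}
        factor = float(rope_scaling.get("factor", 0.0))
        orig_max_pos = int(rope_scaling.get("original_max_position_embeddings", 0))
        use_yarn = 1 if rope_scaling.get("type") == "yarn" else 0
        fout.write(struct.pack("f", factor))       # rope_scaling.factor
        fout.write(struct.pack("i", orig_max_pos))
        fout.write(struct.pack("i", use_yarn))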
From e3b0c7631649d68f1c7d591d784f66def69b0587 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 20 Feb 2024 06:14:35 +0000
Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 neural_speed/convert/convert_falcon.py  | 2 +-
 neural_speed/convert/convert_gptj.py    | 2 +-
 neural_speed/convert/convert_gptneox.py | 2 +-
 neural_speed/convert/convert_opt.py     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/neural_speed/convert/convert_falcon.py b/neural_speed/convert/convert_falcon.py
index 2155acc92..d1de3ce59 100644
--- a/neural_speed/convert/convert_falcon.py
+++ b/neural_speed/convert/convert_falcon.py
@@ -110,7 +110,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms norm eps
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
-    
+
     fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
     fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
     fout.write(struct.pack("i", 0))  # use_yarn: 1 if params["rope_scaling"]["type"] == "yarn" else 0
diff --git a/neural_speed/convert/convert_gptj.py b/neural_speed/convert/convert_gptj.py
index b0b1b66b3..5b2b72aa3 100644
--- a/neural_speed/convert/convert_gptj.py
+++ b/neural_speed/convert/convert_gptj.py
@@ -102,7 +102,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms norm eps
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
-    
+
     fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
     fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
     fout.write(struct.pack("i", 0))  # use_yarn: 1 if params["rope_scaling"]["type"] == "yarn" else 0
diff --git a/neural_speed/convert/convert_gptneox.py b/neural_speed/convert/convert_gptneox.py
index a24d3201d..6fc57d6c2 100644
--- a/neural_speed/convert/convert_gptneox.py
+++ b/neural_speed/convert/convert_gptneox.py
@@ -116,7 +116,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms norm eps
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
-    
+
     fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
     fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
     fout.write(struct.pack("i", 0))  # use_yarn: 1 if params["rope_scaling"]["type"] == "yarn" else 0
diff --git a/neural_speed/convert/convert_opt.py b/neural_speed/convert/convert_opt.py
index cc1416250..0068311e9 100644
--- a/neural_speed/convert/convert_opt.py
+++ b/neural_speed/convert/convert_opt.py
@@ -109,7 +109,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms norm eps
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
-    
+
     fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
     fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
     fout.write(struct.pack("i", 0))  # use_yarn: 1 if params["rope_scaling"]["type"] == "yarn" else 0