This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Yarn feature #97

Merged 3 commits on Feb 22, 2024
Changes from all commits
4 changes: 4 additions & 0 deletions neural_speed/convert/convert_baichuan.py
@@ -160,6 +160,10 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0))

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
4 changes: 4 additions & 0 deletions neural_speed/convert/convert_bloom.py
@@ -103,6 +103,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0))

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
4 changes: 4 additions & 0 deletions neural_speed/convert/convert_chatglm.py
@@ -363,6 +363,10 @@ def chatglm2_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0))

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
4 changes: 4 additions & 0 deletions neural_speed/convert/convert_falcon.py
@@ -111,6 +111,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0))

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
4 changes: 4 additions & 0 deletions neural_speed/convert/convert_gptj.py
@@ -103,6 +103,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0))

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
4 changes: 4 additions & 0 deletions neural_speed/convert/convert_gptneox.py
@@ -117,6 +117,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0))

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
4 changes: 4 additions & 0 deletions neural_speed/convert/convert_llama.py
@@ -1093,6 +1093,10 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
self.fout.write(struct.pack("f", params.rope_theta))
self.fout.write(struct.pack("f", params.rope_scale))

self.fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
self.fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
self.fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0))

# TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
# but bos_token_id = 1 in llama.cpp
self.fout.write(struct.pack("i", params.bos_token_id))
4 changes: 4 additions & 0 deletions neural_speed/convert/convert_mistral.py
@@ -1065,6 +1065,10 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
self.fout.write(struct.pack("f", params.rope_theta))
self.fout.write(struct.pack("f", params.rope_scale))

self.fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
self.fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
self.fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0))

self.fout.write(
struct.pack("i", 1)
)
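The Params-based writers in convert_llama.py and convert_mistral.py also hard-code the zero placeholders, even though YaRN-scaled checkpoints are commonly LLaMA- or Mistral-family models. A hypothetical sanity check (warn_if_yarn_dropped, not part of this change) could flag when config.json requests YaRN scaling but zeros are written:

import warnings

def warn_if_yarn_dropped(config: dict) -> None:
    # Hypothetical check (not in the merged change): warn when config.json declares
    # YaRN rope scaling but the header fields are written as zero placeholders.
    rope_scaling = config.get("rope_scaling") or {}
    if rope_scaling.get("type") == "yarn":
        warnings.warn(
            "config.json requests YaRN rope scaling (factor=%s) but the converted "
            "file stores zeros; long-context behaviour may differ." % rope_scaling.get("factor")
        )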
5 changes: 5 additions & 0 deletions neural_speed/convert/convert_mpt.py
@@ -98,6 +98,11 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0))

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
4 changes: 4 additions & 0 deletions neural_speed/convert/convert_opt.py
@@ -110,6 +110,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0))

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
4 changes: 4 additions & 0 deletions neural_speed/convert/convert_qwen.py
@@ -116,6 +116,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0))

fout.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>']))
fout.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>']))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
4 changes: 4 additions & 0 deletions neural_speed/convert/convert_starcoder.py
@@ -114,6 +114,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0))

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
7 changes: 7 additions & 0 deletions neural_speed/models/model_utils/model_files.h
@@ -1099,6 +1099,10 @@ struct model_file_loader {
file.read_raw(&hparams.rms_norm_eps, sizeof(float));
file.read_raw(&hparams.freq_base, sizeof(float));
file.read_raw(&hparams.freq_scale, sizeof(float));

file.read_raw(&hparams.rope_scaling_factor, sizeof(float));
hparams.original_max_position_embeddings = file.read_u32();
hparams.use_yarn = file.read_u32();
}

void read_ne_vocab() {
@@ -1219,6 +1223,9 @@ struct model_file_saver {
file.write_raw(&hparams.rms_norm_eps, sizeof(float));
file.write_raw(&hparams.freq_base, sizeof(float));
file.write_raw(&hparams.freq_scale, sizeof(float));
file.write_raw(&hparams.rope_scaling_factor, sizeof(float));
file.write_u32(hparams.original_max_position_embeddings);
file.write_u32(hparams.use_yarn);
}
void write_vocab() {
if (any_file_loader->file_version == MODEL_FILE_VERSION_NE) {
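The loader and saver above must stay byte-for-byte compatible with the Python converters: one float32 followed by two int32 values, twelve bytes in total, in that order. A small round-trip sketch of that layout (illustrative values only, not from any real model):

import struct

# Writer side, as in the Python converters: float32 + int32 + int32 (native order).
factor, orig_max_pos, use_yarn = 4.0, 8192, 1
blob = struct.pack("f", factor) + struct.pack("i", orig_max_pos) + struct.pack("i", use_yarn)
assert len(blob) == 12  # 4 bytes per field on common platforms

# Reader side, mirroring model_file_loader: read_raw(float) then two read_u32() calls.
read_factor, read_orig, read_yarn = struct.unpack("fII", blob)
print(read_factor, read_orig, read_yarn)  # 4.0 8192 1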
3 changes: 3 additions & 0 deletions neural_speed/models/model_utils/model_types.h
@@ -139,6 +139,9 @@ struct model_hparams {

// ChatGLM-1
int32_t inner_hidden_size = 0;
float rope_scaling_factor = 0.0f;
int32_t original_max_position_embeddings = 0;
int32_t use_yarn = 0;

bool operator!=(const model_hparams& other) const {
return static_cast<bool>(memcmp(this, &other, sizeof(model_hparams)));