This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Enable Mixtral8x7B (#138)
intellinjun authored Mar 4, 2024
1 parent 96fadd9 commit 9bcb612
Showing 38 changed files with 2,534 additions and 66 deletions.
docs/supported_models.md: 13 changes (7 additions, 6 deletions)
@@ -171,14 +171,15 @@ Neural Speed supports the following models:
 <td>4.33.1</td>
 </tr>
 <tr>
-<td><a href="https://huggingface.co/mistralai/Mistral-7B-v0.1" target="_blank" rel="noopener noreferrer">Mistral-7B</a></td>
-<td>✅</td>
-<td>✅</td>
-<td>✅</td>
-<td>✅</td>
+<td><a href="https://huggingface.co/mistralai/Mistral-7B-v0.1" target="_blank" rel="noopener noreferrer">Mistral-7B</a>,
+<a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1" target="_blank" rel="noopener noreferrer">Mixtral-8x7B</a></td>
+<td>✅</td>
+<td> </td>
 <td> </td>
 <td>✅</td>
-<td>4.34.0 or newer</td>
+<td> </td>
+<td> </td>
+<td>4.36.0 or newer</td>
 </tr>
 <tr>
 <td><a href="https://huggingface.co/Qwen/Qwen-7B-Chat" target="_blank" rel="noopener noreferrer">Qwen-7B</a>,
neural_speed/__init__.py: 2 changes (2 additions, 0 deletions)
@@ -72,6 +72,8 @@ def __import_package(self, model_type):
             import neural_speed.phi_cpp as cpp_model
         elif model_type == "whisper":
             import neural_speed.whisper_cpp as cpp_model
+        elif model_type == "mixtral":
+            import neural_speed.mixtral_cpp as cpp_model
         else:
             raise TypeError("Unsupported model type {}!".format(model_type))
         self.module = cpp_model
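
The dispatch above is a plain `elif` chain keyed on `model_type`; enabling a new family only means adding a branch that imports the matching pybind11 extension. A minimal sketch of the same lookup (illustrative only, not the module's actual code; the `importlib` form and the abbreviated `supported` set are assumptions):

```python
import importlib

def import_cpp_module(model_type: str):
    # Each supported family resolves to a compiled pybind11 extension
    # named neural_speed.<family>_cpp; "mixtral" is now one of them.
    supported = {"phi", "whisper", "mixtral"}  # abbreviated for the sketch
    if model_type not in supported:
        raise TypeError("Unsupported model type {}!".format(model_type))
    return importlib.import_module("neural_speed.{}_cpp".format(model_type))
```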
neural_speed/application/CMakeLists.txt: 3 changes (3 additions, 0 deletions)
@@ -67,6 +67,7 @@ compile_quant(quant_chatglm quant_model.cpp chatglm chatglm)
 compile_quant(quant_chatglm2 quant_model.cpp chatglm2 chatglm2)
 compile_quant(quant_baichuan quant_model.cpp baichuan baichuan)
 compile_quant(quant_mistral quant_model.cpp mistral llama)
+compile_quant(quant_mixtral quant_model.cpp mixtral llama)
 compile_quant(quant_qwen quant_model.cpp qwen qwen)
 compile_quant(quant_phi quant_model.cpp phi phi)
 compile_quant(quant_whisper quant_whisper.cpp whisper whisper)
@@ -93,6 +94,7 @@ set(mymap_mistral 14)
 set(mymap_qwen 15)
 set(mymap_phi 16)
 set(mymap_whisper 17)
+set(mymap_mixtral 18)



@@ -129,6 +131,7 @@ compile_run(run_baichuan main_run.cpp main_pybind.cpp baichuan baichuan)
 compile_run(run_mistral main_run.cpp main_pybind.cpp mistral llama)
 compile_run(run_qwen main_run.cpp main_pybind.cpp qwen qwen)
 compile_run(run_phi main_run.cpp main_pybind.cpp phi phi)
+compile_run(run_mixtral main_run.cpp main_pybind.cpp mixtral llama)

 # speech recognition
 compile_run(run_whisper audio_run.cpp whisper_pybind.cpp whisper whisper)
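
The fourth argument of `compile_quant` and `compile_run` names the C++ model implementation the binary links against, so `mixtral` reuses the existing `llama` graph just as `mistral` does, and `set(mymap_mixtral 18)` assigns the `MODEL_NAME_ID` that `main_pybind.cpp` switches on below. A hand-written mirror of that registry (illustrative only; `MODEL_REGISTRY` is not part of the codebase):

```python
# (MODEL_NAME_ID, underlying C++ model family), per the CMake above.
MODEL_REGISTRY = {
    "mistral": (14, "llama"),
    "qwen":    (15, "qwen"),
    "phi":     (16, "phi"),
    "whisper": (17, "whisper"),
    "mixtral": (18, "llama"),  # Mixtral rides on the llama implementation
}
```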
neural_speed/application/main_pybind.cpp: 4 changes (4 additions, 0 deletions)
@@ -898,6 +898,10 @@ PYBIND11_MODULE(phi_cpp, m)

 PYBIND11_MODULE(whisper_cpp, m)

+#elif MODEL_NAME_ID == 18
+
+PYBIND11_MODULE(mixtral_cpp, m)
+
 #endif
 {
   m.doc() = "cpp model python binding";
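
`main_pybind.cpp` is compiled once per model with `MODEL_NAME_ID` set from the matching `mymap_*` value, and the preprocessor chain selects which module name `PYBIND11_MODULE` emits; ID 18 now produces `mixtral_cpp`. A quick smoke test (assumes the project has already been built and installed):

```python
# The new extension should import under its own name after a build.
import neural_speed.mixtral_cpp as mixtral_cpp

print(mixtral_cpp.__doc__)  # "cpp model python binding"
```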
neural_speed/convert/convert_baichuan.py: 2 changes (2 additions, 0 deletions)
@@ -156,6 +156,8 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", hparams["intermediate_size"]))
+    fout.write(struct.pack("i", 0)) # n_experts
+    fout.write(struct.pack("i", 0)) # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
     fout.write(struct.pack("f", 10000.0)) # freq_base
     fout.write(struct.pack("f", 1.0)) # rope_factor
neural_speed/convert/convert_bloom.py: 2 changes (2 additions, 0 deletions)
@@ -99,6 +99,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0)) # n_experts
+    fout.write(struct.pack("i", 0)) # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
     fout.write(struct.pack("f", 10000.0)) # freq_base
     fout.write(struct.pack("f", 1.0)) # rope_factor
neural_speed/convert/convert_chatglm.py: 2 changes (2 additions, 0 deletions)
@@ -393,6 +393,8 @@ def chatglm2_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("i", hparams["multi_query_group_num"]))
     fout.write(struct.pack("i", hparams["ffn_hidden_size"]))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0)) # n_experts
+    fout.write(struct.pack("i", 0)) # n_expert_used
     fout.write(struct.pack("f", hparams.get("layernorm_epsilon", 1e-6))) # rms norm eps
     fout.write(struct.pack("f", 10000.0)) # freq_base
     fout.write(struct.pack("f", 1.0)) # rope_factor
neural_speed/convert/convert_dolly.py: 5 changes (5 additions, 0 deletions)
@@ -113,9 +113,14 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0)) # n_experts
+    fout.write(struct.pack("i", 0)) # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
     fout.write(struct.pack("f", 10000.0)) # freq_base
     fout.write(struct.pack("f", 1.0)) # rope_factor
+    fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
+    fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
+    fout.write(struct.pack("i", 0)) # 1 if params["rope_scaling"]["type"] == "yarn" else 0; not enabled

     fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
     fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
neural_speed/convert/convert_falcon.py: 2 changes (2 additions, 0 deletions)
@@ -107,6 +107,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0)) # n_experts
+    fout.write(struct.pack("i", 0)) # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
     fout.write(struct.pack("f", 10000.0)) # freq_base
     fout.write(struct.pack("f", 1.0)) # rope_factor
neural_speed/convert/convert_gptj.py: 2 changes (2 additions, 0 deletions)
@@ -99,6 +99,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0)) # n_experts
+    fout.write(struct.pack("i", 0)) # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
     fout.write(struct.pack("f", 10000.0)) # freq_base
     fout.write(struct.pack("f", 1.0)) # rope_factor
neural_speed/convert/convert_gptneox.py: 2 changes (2 additions, 0 deletions)
@@ -113,6 +113,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0)) # n_experts
+    fout.write(struct.pack("i", 0)) # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
     fout.write(struct.pack("f", 10000.0)) # freq_base
     fout.write(struct.pack("f", 1.0)) # rope_factor
neural_speed/convert/convert_llama.py: 2 changes (2 additions, 0 deletions)
@@ -1089,6 +1089,8 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
         self.fout.write(struct.pack("i", params.ffn_hidden_size))
         self.fout.write(struct.pack("i", 0))

+        self.fout.write(struct.pack("i", 0)) # n_experts
+        self.fout.write(struct.pack("i", 0)) # n_expert_used
         self.fout.write(struct.pack("f", params.rms_norm_eps))
         self.fout.write(struct.pack("f", params.rope_theta))
         self.fout.write(struct.pack("f", params.rope_scale))
neural_speed/convert/convert_mistral.py: 3 changes (3 additions, 0 deletions)
@@ -1062,6 +1062,9 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
         self.fout.write(struct.pack("i", 0))
         self.fout.write(struct.pack("i", params.ffn_hidden_size))
         self.fout.write(struct.pack("i", 0))
+
+        self.fout.write(struct.pack("i", 0)) # n_experts
+        self.fout.write(struct.pack("i", 0)) # n_expert_used
         self.fout.write(struct.pack("f", params.rms_norm_eps))
         self.fout.write(struct.pack("f", params.rope_theta))
         self.fout.write(struct.pack("f", params.rope_scale))
(Diff view truncated: 13 of the 38 changed files are shown above.)
