This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Enable Mixtral8x7B (#138)
intellinjun authored Mar 4, 2024
1 parent 96fadd9 commit 9bcb612
Showing 38 changed files with 2,534 additions and 66 deletions.
docs/supported_models.md: 13 changes (7 additions, 6 deletions)
@@ -171,14 +171,15 @@ Neural Speed supports the following models:
 <td>4.33.1</td>
 </tr>
 <tr>
-<td><a href="https://huggingface.co/mistralai/Mistral-7B-v0.1" target="_blank" rel="noopener noreferrer">Mistral-7B</a></td>
-<td>✅</td>
-<td>✅</td>
-<td>✅</td>
-<td>✅</td>
+<td><a href="https://huggingface.co/mistralai/Mistral-7B-v0.1" target="_blank" rel="noopener noreferrer">Mistral-7B</a>,
+<a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1" target="_blank" rel="noopener noreferrer">Mixtral-8x7B</a></td>
+<td>✅</td>
+<td> </td>
 <td> </td>
 <td>✅</td>
-<td>4.34.0 or newer</td>
+<td> </td>
+<td> </td>
+<td>4.36.0 or newer</td>
 </tr>
 <tr>
 <td><a href="https://huggingface.co/Qwen/Qwen-7B-Chat" target="_blank" rel="noopener noreferrer">Qwen-7B</a>,
neural_speed/__init__.py: 2 changes (2 additions, 0 deletions)
@@ -72,6 +72,8 @@ def __import_package(self, model_type):
             import neural_speed.phi_cpp as cpp_model
         elif model_type == "whisper":
             import neural_speed.whisper_cpp as cpp_model
+        elif model_type == "mixtral":
+            import neural_speed.mixtral_cpp as cpp_model
         else:
             raise TypeError("Unsupported model type {}!".format(model_type))
         self.module = cpp_model
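
The dispatch above is a plain `elif` chain keyed on `model_type`; enabling a new family only means adding a branch that imports the matching pybind11 extension. A minimal sketch of the same lookup (illustrative only, not the module's actual code; the `importlib` form and the abbreviated `supported` set are assumptions):

```python
import importlib

def import_cpp_module(model_type: str):
    # Each supported family resolves to a compiled pybind11 extension
    # named neural_speed.<family>_cpp; "mixtral" is now one of them.
    supported = {"phi", "whisper", "mixtral"}  # abbreviated for the sketch
    if model_type not in supported:
        raise TypeError("Unsupported model type {}!".format(model_type))
    return importlib.import_module("neural_speed.{}_cpp".format(model_type))
```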
neural_speed/application/CMakeLists.txt: 3 changes (3 additions, 0 deletions)
@@ -67,6 +67,7 @@ compile_quant(quant_chatglm quant_model.cpp chatglm chatglm)
 compile_quant(quant_chatglm2 quant_model.cpp chatglm2 chatglm2)
 compile_quant(quant_baichuan quant_model.cpp baichuan baichuan)
 compile_quant(quant_mistral quant_model.cpp mistral llama)
+compile_quant(quant_mixtral quant_model.cpp mixtral llama)
 compile_quant(quant_qwen quant_model.cpp qwen qwen)
 compile_quant(quant_phi quant_model.cpp phi phi)
 compile_quant(quant_whisper quant_whisper.cpp whisper whisper)
@@ -93,6 +94,7 @@ set(mymap_mistral 14)
 set(mymap_qwen 15)
 set(mymap_phi 16)
 set(mymap_whisper 17)
+set(mymap_mixtral 18)



@@ -129,6 +131,7 @@ compile_run(run_baichuan main_run.cpp main_pybind.cpp baichuan baichuan)
 compile_run(run_mistral main_run.cpp main_pybind.cpp mistral llama)
 compile_run(run_qwen main_run.cpp main_pybind.cpp qwen qwen)
 compile_run(run_phi main_run.cpp main_pybind.cpp phi phi)
+compile_run(run_mixtral main_run.cpp main_pybind.cpp mixtral llama)

 # speech recognition
 compile_run(run_whisper audio_run.cpp whisper_pybind.cpp whisper whisper)
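
The fourth argument of `compile_quant` and `compile_run` names the C++ model implementation the binary links against, so `mixtral` reuses the existing `llama` graph just as `mistral` does, and `set(mymap_mixtral 18)` assigns the `MODEL_NAME_ID` that `main_pybind.cpp` switches on below. A hand-written mirror of that registry (illustrative only; `MODEL_REGISTRY` is not part of the codebase):

```python
# (MODEL_NAME_ID, underlying C++ model family), per the CMake above.
MODEL_REGISTRY = {
    "mistral": (14, "llama"),
    "qwen":    (15, "qwen"),
    "phi":     (16, "phi"),
    "whisper": (17, "whisper"),
    "mixtral": (18, "llama"),  # Mixtral rides on the llama implementation
}
```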
neural_speed/application/main_pybind.cpp: 4 changes (4 additions, 0 deletions)
@@ -898,6 +898,10 @@ PYBIND11_MODULE(phi_cpp, m)

 PYBIND11_MODULE(whisper_cpp, m)

+#elif MODEL_NAME_ID == 18
+
+PYBIND11_MODULE(mixtral_cpp, m)
+
 #endif
 {
   m.doc() = "cpp model python binding";
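
`main_pybind.cpp` is compiled once per model with `MODEL_NAME_ID` set from the matching `mymap_*` value, and the preprocessor chain selects which module name `PYBIND11_MODULE` emits; ID 18 now produces `mixtral_cpp`. A quick smoke test (assumes the project has already been built and installed):

```python
# The new extension should import under its own name after a build.
import neural_speed.mixtral_cpp as mixtral_cpp

print(mixtral_cpp.__doc__)  # "cpp model python binding"
```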
neural_speed/convert/convert_baichuan.py: 2 changes (2 additions, 0 deletions)
@@ -156,6 +156,8 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", hparams["intermediate_size"]))
+    fout.write(struct.pack("i", 0)) # n_experts
+    fout.write(struct.pack("i", 0)) # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
     fout.write(struct.pack("f", 10000.0)) # freq_base
     fout.write(struct.pack("f", 1.0)) # rope_factor
neural_speed/convert/convert_bloom.py: 2 changes (2 additions, 0 deletions)
@@ -99,6 +99,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0)) # n_experts
+    fout.write(struct.pack("i", 0)) # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
     fout.write(struct.pack("f", 10000.0)) # freq_base
     fout.write(struct.pack("f", 1.0)) # rope_factor
neural_speed/convert/convert_chatglm.py: 2 changes (2 additions, 0 deletions)
@@ -393,6 +393,8 @@ def chatglm2_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("i", hparams["multi_query_group_num"]))
     fout.write(struct.pack("i", hparams["ffn_hidden_size"]))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0)) # n_experts
+    fout.write(struct.pack("i", 0)) # n_expert_used
     fout.write(struct.pack("f", hparams.get("layernorm_epsilon", 1e-6))) # rms norm eps
     fout.write(struct.pack("f", 10000.0)) # freq_base
     fout.write(struct.pack("f", 1.0)) # rope_factor
neural_speed/convert/convert_dolly.py: 5 changes (5 additions, 0 deletions)
@@ -113,9 +113,14 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0)) # n_experts
+    fout.write(struct.pack("i", 0)) # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
     fout.write(struct.pack("f", 10000.0)) # freq_base
     fout.write(struct.pack("f", 1.0)) # rope_factor
+    fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
+    fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
+    fout.write(struct.pack("i", 0)) # 1 if params["rope_scaling"]["type"] == "yarn" else 0; not enabled

     fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
     fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
neural_speed/convert/convert_falcon.py: 2 changes (2 additions, 0 deletions)
@@ -107,6 +107,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0)) # n_experts
+    fout.write(struct.pack("i", 0)) # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
     fout.write(struct.pack("f", 10000.0)) # freq_base
     fout.write(struct.pack("f", 1.0)) # rope_factor
neural_speed/convert/convert_gptj.py: 2 changes (2 additions, 0 deletions)
@@ -99,6 +99,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0)) # n_experts
+    fout.write(struct.pack("i", 0)) # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
     fout.write(struct.pack("f", 10000.0)) # freq_base
     fout.write(struct.pack("f", 1.0)) # rope_factor
neural_speed/convert/convert_gptneox.py: 2 changes (2 additions, 0 deletions)
@@ -113,6 +113,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0)) # n_experts
+    fout.write(struct.pack("i", 0)) # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
     fout.write(struct.pack("f", 10000.0)) # freq_base
     fout.write(struct.pack("f", 1.0)) # rope_factor
neural_speed/convert/convert_llama.py: 2 changes (2 additions, 0 deletions)
@@ -1089,6 +1089,8 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
         self.fout.write(struct.pack("i", params.ffn_hidden_size))
         self.fout.write(struct.pack("i", 0))

+        self.fout.write(struct.pack("i", 0)) # n_experts
+        self.fout.write(struct.pack("i", 0)) # n_expert_used
         self.fout.write(struct.pack("f", params.rms_norm_eps))
         self.fout.write(struct.pack("f", params.rope_theta))
         self.fout.write(struct.pack("f", params.rope_scale))
neural_speed/convert/convert_mistral.py: 3 changes (3 additions, 0 deletions)
@@ -1062,6 +1062,9 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
         self.fout.write(struct.pack("i", 0))
         self.fout.write(struct.pack("i", params.ffn_hidden_size))
         self.fout.write(struct.pack("i", 0))
+
+        self.fout.write(struct.pack("i", 0)) # n_experts
+        self.fout.write(struct.pack("i", 0)) # n_expert_used
         self.fout.write(struct.pack("f", params.rms_norm_eps))
         self.fout.write(struct.pack("f", params.rope_theta))
         self.fout.write(struct.pack("f", params.rope_scale))
(Diff view truncated: 13 of the 38 changed files are shown above.)
