Enable Mixtral8x7B #138

Merged (38 commits) on Mar 4, 2024

Commits
8905a97  fix top-k and argsort error (intellinjun, Feb 7, 2024)
cbe21a9  enable mistral8X7b f32 gguf (intellinjun, Feb 21, 2024)
692e2d0  enable moe jblas (intellinjun, Feb 22, 2024)
f285b1f  add write and read n_experts parameter (intellinjun, Feb 23, 2024)
299a996  Merge branch 'main' into mixtral (intellinjun, Feb 23, 2024)
ce00092  Update __init__.py (intellinjun, Feb 23, 2024)
a698dac  enable q40 (intellinjun, Feb 23, 2024)
b2fd598  Merge branch 'mixtral' of https://github.com/intel/neural-speed into … (intellinjun, Feb 23, 2024)
ffdd11a  fix format error (intellinjun, Feb 23, 2024)
2abcb14  enable mixtral8x7b from hf to bin (intellinjun, Feb 23, 2024)
9f56f45  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Feb 23, 2024)
4a01f65  fix llama load error (intellinjun, Feb 23, 2024)
38a368b  Merge branch 'mixtral' of https://github.com/intel/neural-speed into … (intellinjun, Feb 23, 2024)
28bc0db  fix format error (intellinjun, Feb 26, 2024)
932ee25  Update llama.cpp (intellinjun, Feb 26, 2024)
41606f6  fix convert and format error (intellinjun, Feb 27, 2024)
5454e1f  Merge branch 'mixtral' of https://github.com/intel/neural-speed into … (intellinjun, Feb 27, 2024)
00c4ff6  Merge branch 'main' into mixtral (intellinjun, Feb 27, 2024)
3857b5b  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Feb 27, 2024)
e4260dd  Update convert_quantized_llama.py (intellinjun, Feb 27, 2024)
a88ac05  add extension test for mixtral (intellinjun, Feb 27, 2024)
13999fc  Merge branch 'mixtral' of https://github.com/intel/neural-speed into … (intellinjun, Feb 27, 2024)
1ea128e  update argsort (intellinjun, Feb 28, 2024)
e27880c  Update argsort.cpp (intellinjun, Feb 28, 2024)
e6f2fbf  Update ne_layers.c (intellinjun, Feb 29, 2024)
cad4114  fix format error (intellinjun, Feb 29, 2024)
f8cd5a2  Merge branch 'mixtral' of https://github.com/intel/neural-speed into … (intellinjun, Feb 29, 2024)
32a951d  fix format error (intellinjun, Feb 29, 2024)
db3b00d  Update ne_layers.c (intellinjun, Feb 29, 2024)
7f9fcf8  add mul_id_ffn_fusion (intellinjun, Feb 29, 2024)
24156e0  Merge branch 'mixtral' of https://github.com/intel/neural-speed into … (intellinjun, Feb 29, 2024)
7ef7a2e  fix compile error (intellinjun, Feb 29, 2024)
9e76221  fix compile error (intellinjun, Feb 29, 2024)
9d6c93c  amend function name (intellinjun, Feb 29, 2024)
5e474d0  Update convert_mixtral.py (intellinjun, Feb 29, 2024)
c32c991  Update CMakeLists.txt (intellinjun, Feb 29, 2024)
affe69c  fix mixtral_q40 multi thread error (intellinjun, Mar 1, 2024)
590d65b  add ffn silu support assert (intellinjun, Mar 1, 2024)
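Several commits above (fix top-k and argsort error, update argsort, add mul_id_ffn_fusion) revolve around Mixtral's mixture-of-experts routing: for each token the router scores all 8 experts and only the top 2 are evaluated. The following is a minimal NumPy sketch of that selection step, for illustration only; `route_tokens` and its argument names are assumptions, not identifiers from this PR's kernels:

```python
import numpy as np

def route_tokens(router_logits: np.ndarray, n_expert_used: int = 2):
    """Select the top-k experts per token and normalize their gate weights.

    router_logits: (n_tokens, n_experts) raw scores from the gating layer.
    Returns (expert_ids, weights), each of shape (n_tokens, n_expert_used).
    """
    # Descending argsort, then keep the first n_expert_used columns:
    # the top-k/argsort step the commits above fix and optimize.
    expert_ids = np.argsort(-router_logits, axis=-1)[:, :n_expert_used]
    top_logits = np.take_along_axis(router_logits, expert_ids, axis=-1)
    # Softmax over only the selected experts, as Mixtral's router does.
    w = np.exp(top_logits - top_logits.max(axis=-1, keepdims=True))
    return expert_ids, w / w.sum(axis=-1, keepdims=True)

# Example: route 3 tokens across Mixtral's 8 experts, 2 experts per token.
ids, weights = route_tokens(np.random.randn(3, 8))
```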
13 changes: 7 additions & 6 deletions docs/supported_models.md
@@ -171,14 +171,15 @@ Neural Speed supports the following models:
         <td>4.33.1</td>
     </tr>
     <tr>
-        <td><a href="https://huggingface.co/mistralai/Mistral-7B-v0.1" target="_blank" rel="noopener noreferrer">Mistral-7B</a></td>
-        <td>✅</td>
-        <td>✅</td>
-        <td>✅</td>
-        <td>✅</td>
+        <td><a href="https://huggingface.co/mistralai/Mistral-7B-v0.1" target="_blank" rel="noopener noreferrer">Mistral-7B</a>,
+        <a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1" target="_blank" rel="noopener noreferrer">Mixtral-8x7B</a></td>
         <td>✅</td>
+        <td> </td>
+        <td> </td>
         <td>✅</td>
-        <td>4.34.0 or newer</td>
+        <td> </td>
+        <td> </td>
+        <td>4.36.0 or newer</td>
     </tr>
     <tr>
         <td><a href="https://huggingface.co/Qwen/Qwen-7B-Chat" target="_blank" rel="noopener noreferrer">Qwen-7B</a>,
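With this table entry, Mixtral-8x7B is expected to load through the same Python entry point as the other listed models, given transformers 4.36.0 or newer. A hedged usage sketch following the project's README-style API; exact argument names may differ from the released package:

```python
from transformers import AutoTokenizer
from neural_speed import Model

model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer("Once upon a time", return_tensors="pt").input_ids

model = Model()
# weight_dtype/compute_dtype follow the README examples; treat as assumptions.
model.init(model_name, weight_dtype="int4", compute_dtype="int8")
outputs = model.generate(inputs, max_new_tokens=32)
```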
2 changes: 2 additions & 0 deletions neural_speed/__init__.py
@@ -72,6 +72,8 @@ def __import_package(self, model_type):
            import neural_speed.phi_cpp as cpp_model
        elif model_type == "whisper":
            import neural_speed.whisper_cpp as cpp_model
+       elif model_type == "mixtral":
+           import neural_speed.mixtral_cpp as cpp_model
        else:
            raise TypeError("Unsupported model type {}!".format(model_type))
        self.module = cpp_model
2 changes: 2 additions & 0 deletions neural_speed/application/CMakeLists.txt
@@ -93,6 +93,7 @@ set(mymap_mistral 14)
 set(mymap_qwen 15)
 set(mymap_phi 16)
 set(mymap_whisper 17)
+set(mymap_mixtral 18)



@@ -129,6 +130,7 @@ compile_run(run_baichuan main_run.cpp main_pybind.cpp baichuan baichuan)
 compile_run(run_mistral main_run.cpp main_pybind.cpp mistral llama)
 compile_run(run_qwen main_run.cpp main_pybind.cpp qwen qwen)
 compile_run(run_phi main_run.cpp main_pybind.cpp phi phi)
+compile_run(run_mixtral main_run.cpp main_pybind.cpp mixtral llama)

 # speech recognition
 compile_run(run_whisper audio_run.cpp whisper_pybind.cpp whisper whisper)
4 changes: 4 additions & 0 deletions neural_speed/application/main_pybind.cpp
@@ -898,6 +898,10 @@ PYBIND11_MODULE(phi_cpp, m)

 PYBIND11_MODULE(whisper_cpp, m)

+#elif MODEL_NAME_ID == 18
+
+PYBIND11_MODULE(mixtral_cpp, m)
+
 #endif
 {
   m.doc() = "cpp model python binding";
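The mymap_mixtral 18 value from CMakeLists.txt is passed to the compiler as MODEL_NAME_ID, which selects this PYBIND11_MODULE branch, so each compiled binary exports a distinctly named Python module. Note also that run_mixtral links the llama model implementation (the last compile_run argument above), reusing its attention path while adding the MoE FFN. A small sanity-check sketch, assuming the package has already been built so that mixtral_cpp exists:

```python
# Only importable after a build; the module name comes from the
# PYBIND11_MODULE(mixtral_cpp, m) branch selected by MODEL_NAME_ID == 18.
import neural_speed.mixtral_cpp as cpp_model

print(cpp_model.__doc__)  # "cpp model python binding"
```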
2 changes: 2 additions & 0 deletions neural_speed/convert/convert_baichuan.py
@@ -156,6 +156,8 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", hparams["intermediate_size"]))
+    fout.write(struct.pack("i", 0))  # n_experts
+    fout.write(struct.pack("i", 0))  # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms norm eps
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
2 changes: 2 additions & 0 deletions neural_speed/convert/convert_bloom.py
@@ -99,6 +99,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0))  # n_experts
+    fout.write(struct.pack("i", 0))  # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms norm eps
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
2 changes: 2 additions & 0 deletions neural_speed/convert/convert_chatglm.py
@@ -359,6 +359,8 @@ def chatglm2_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("i", hparams["multi_query_group_num"]))
     fout.write(struct.pack("i", hparams["ffn_hidden_size"]))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0))  # n_experts
+    fout.write(struct.pack("i", 0))  # n_expert_used
     fout.write(struct.pack("f", hparams.get("layernorm_epsilon", 1e-6)))  # rms norm eps
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
5 changes: 5 additions & 0 deletions neural_speed/convert/convert_dolly.py
@@ -113,9 +113,14 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0))  # n_experts
+    fout.write(struct.pack("i", 0))  # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms norm eps
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
+    fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
+    fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
+    fout.write(struct.pack("i", 0))  # 1 if params["rope_scaling"]["type"] == "yarn" else 0

     fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
     fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
2 changes: 2 additions & 0 deletions neural_speed/convert/convert_falcon.py
@@ -107,6 +107,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0))  # n_experts
+    fout.write(struct.pack("i", 0))  # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms norm eps
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
2 changes: 2 additions & 0 deletions neural_speed/convert/convert_gptj.py
@@ -99,6 +99,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0))  # n_experts
+    fout.write(struct.pack("i", 0))  # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms norm eps
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
2 changes: 2 additions & 0 deletions neural_speed/convert/convert_gptneox.py
@@ -113,6 +113,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0))  # n_experts
+    fout.write(struct.pack("i", 0))  # n_expert_used
     fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms norm eps
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
2 changes: 2 additions & 0 deletions neural_speed/convert/convert_llama.py
@@ -1089,6 +1089,8 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
         self.fout.write(struct.pack("i", params.ffn_hidden_size))
         self.fout.write(struct.pack("i", 0))

+        self.fout.write(struct.pack("i", 0))  # n_experts
+        self.fout.write(struct.pack("i", 0))  # n_expert_used
         self.fout.write(struct.pack("f", params.rms_norm_eps))
         self.fout.write(struct.pack("f", params.rope_theta))
         self.fout.write(struct.pack("f", params.rope_scale))
3 changes: 3 additions & 0 deletions neural_speed/convert/convert_mistral.py
@@ -1062,6 +1062,9 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
         self.fout.write(struct.pack("i", 0))
         self.fout.write(struct.pack("i", params.ffn_hidden_size))
         self.fout.write(struct.pack("i", 0))
+
+        self.fout.write(struct.pack("i", 0))  # n_experts
+        self.fout.write(struct.pack("i", 0))  # n_expert_used
         self.fout.write(struct.pack("f", params.rms_norm_eps))
         self.fout.write(struct.pack("f", params.rope_theta))
         self.fout.write(struct.pack("f", params.rope_scale))
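Every converter now writes the two expert fields at the same position in the binary header: zeros for dense models, and presumably 8 and 2 for Mixtral-8x7B via the new convert_mixtral.py. A minimal sketch of reading the fields back in the order the diffs above write them; read_expert_fields is a hypothetical helper, and it assumes the file object is already positioned just before n_experts (the layout earlier in the header varies per model family):

```python
import struct

def read_expert_fields(f):
    """Read header fields in the order the converters write them.

    Hypothetical reader: assumes f is positioned at n_experts.
    """
    n_experts, n_expert_used = struct.unpack("ii", f.read(8))
    rms_norm_eps, freq_base, rope_factor = struct.unpack("fff", f.read(12))
    return {
        "n_experts": n_experts,          # 8 for Mixtral-8x7B, 0 for dense models
        "n_expert_used": n_expert_used,  # 2 for Mixtral-8x7B, 0 for dense models
        "rms_norm_eps": rms_norm_eps,
        "freq_base": freq_base,
        "rope_factor": rope_factor,
    }
```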