fix qwen load error (#164)
intellinjun authored Mar 13, 2024
1 parent 8e17b67 commit 2309fbb
Showing 13 changed files with 29 additions and 22 deletions.
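
In short: every converter script now passes trust_remote_code=True when it loads a Hugging Face model or tokenizer (Qwen ships custom modeling/tokenization code, which is presumably what caused the Qwen load error this change fixes), the Qwen converters write max_position_embeddings and a model_type-dependent FFN size into the file header, and the matching halving workaround is dropped from qwen_utils.cpp. Below is a minimal sketch of the loading pattern, with a hypothetical model path and the converters' ftype convention (1 = fp16, 0 = fp32):

```python
# Sketch only: dir_model is an illustrative id; the converters take the
# model directory as a command-line argument instead.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

dir_model = "Qwen/Qwen-7B"  # hypothetical model id / local path
ftype = 1                   # converter convention: 1 -> fp16, 0 -> fp32

# trust_remote_code=True lets transformers import the custom modeling and
# tokenization code shipped inside the checkpoint repo (required for Qwen).
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    dir_model,
    torch_dtype=torch.float16 if ftype == 1 else torch.float32,
    trust_remote_code=True,
)
model.eval()
```
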
3 changes: 2 additions & 1 deletion neural_speed/convert/convert-hf-to-gguf.py
@@ -64,6 +64,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None
             fname_tokenizer,
             cache_dir=fname_tokenizer,
             local_files_only=True,
+            trust_remote_code=True
         )

         # Initialize lists and dictionaries for added tokens
@@ -402,7 +403,7 @@ def _set_vocab_gpt2(self):
         toktypes: list[int] = []

         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model)
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
         vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_bloom.py
@@ -77,7 +77,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
                                                  torch_dtype=torch.float16 if ftype == 1 else torch.float32,
                                                  low_cpu_mem_usage=True,
                                                  trust_remote_code=True)
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
     hparams = config.to_dict()
     print("Loading model: ", dir_model)

5 changes: 3 additions & 2 deletions neural_speed/convert/convert_dolly.py
@@ -79,8 +79,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
         from modelscope import AutoModelForCausalLM, AutoTokenizer
     else:
         from transformers import AutoModelForCausalLM, AutoTokenizer
-    model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+    model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32,
+                                                 trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
     print("Loading model: ", dir_model)
     model.eval()
     for p in model.parameters():
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_gptj.py
@@ -74,7 +74,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     else:
         from transformers import AutoModelForCausalLM, AutoTokenizer
     print("Loading model: ", dir_model)
-    model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
+    model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)
     tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
     hparams = model.config.to_dict()
     list_vars = model.state_dict()
5 changes: 3 additions & 2 deletions neural_speed/convert/convert_gptneox.py
@@ -81,8 +81,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
     else:
         from transformers import AutoModelForCausalLM, AutoTokenizer
     print("Loading model: ", dir_model)
-    model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+    model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32,
+                                                 trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
     model.eval()
     for p in model.parameters():
         p.requires_grad = False
5 changes: 3 additions & 2 deletions neural_speed/convert/convert_opt.py
@@ -79,8 +79,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
     else:
         from transformers import AutoModelForCausalLM, AutoTokenizer
     print("Loading model: ", dir_model)
-    model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+    model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32,
+                                                 trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
     model.eval()
     hparams = model.config.to_dict()

2 changes: 1 addition & 1 deletion neural_speed/convert/convert_phi.py
@@ -86,7 +86,7 @@ def write_vocab_gguf(dir_model, hparams, gguf_writer):
     toktypes: list[int] = []

     from transformers import AutoTokenizer  # type: ignore[attr-defined]
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
     vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
     assert max(tokenizer.vocab.values()) < vocab_size
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_quantized_bloom.py
@@ -121,7 +121,7 @@ def bytes_to_unicode():

 model_name = "/mnt/disk1/data2/zhenweil/models/bloom/bloom-7b1"
 prompt = "Once upon a time, a little girl"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
 inputs = tokenizer(prompt, return_tensors="pt").input_ids
 streamer = TextStreamer(tokenizer)
7 changes: 5 additions & 2 deletions neural_speed/convert/convert_quantized_qwen.py
@@ -159,15 +159,18 @@ def main(args_in: Optional[List[str]] = None) -> None:
             "i", hparams["kv_channels"] if "kv_channels" in hparams else int(hparams["hidden_size"] /
                                                                              hparams["num_attention_heads"])))
     f.write(struct.pack("i", ftype))
-    f.write(struct.pack("i", hparams["seq_length"] if "seq_length" in hparams else hparams["max_position_embeddings"]))
+    f.write(struct.pack("i", hparams["max_position_embeddings"]))
     f.write(struct.pack("f", 0.0))
     f.write(struct.pack("f", 0.0))
     f.write(struct.pack("i", 0))
     f.write(struct.pack("i", 0))  # word_embed_proj_dim (for opt)
     f.write(struct.pack("i", 0))  # do_layer_norm_before (for opt)

     f.write(struct.pack("i", 0))
-    f.write(struct.pack("i", hparams["intermediate_size"]))
+    if hparams['model_type']=='qwen2':
+        fout.write(struct.pack("i", hparams["intermediate_size"]))
+    else:
+        fout.write(struct.pack("i", int(hparams["intermediate_size"]/2)))
     f.write(struct.pack("i", 0))
     f.write(struct.pack("i", 0))  # n_experts
     f.write(struct.pack("i", 0))  # n_expert_used
11 changes: 7 additions & 4 deletions neural_speed/convert/convert_qwen.py
@@ -81,8 +81,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     else:
         from transformers import AutoModelForCausalLM, AutoTokenizer
     print("Loading model: ", dir_model)
-    model = AutoModelForCausalLM.from_pretrained(dir_model)
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+    model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
     model.eval()
     for p in model.parameters():
         p.requires_grad = False
@@ -111,15 +111,18 @@ def main(args_in: Optional[List[str]] = None) -> None:
                                                                              hparams["num_attention_heads"])))
     fout.write(struct.pack("i", ftype))
     fout.write(
-        struct.pack("i", hparams["seq_length"] if "seq_length" in hparams else hparams["max_position_embeddings"]))
+        struct.pack("i", hparams["max_position_embeddings"]))
     fout.write(struct.pack("f", 0.0))
     fout.write(struct.pack("f", 0.0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))  # word_embed_proj_dim (for opt)
     fout.write(struct.pack("i", 0))  # do_layer_norm_before (for opt)

     fout.write(struct.pack("i", 0))
-    fout.write(struct.pack("i", hparams["intermediate_size"]))
+    if hparams['model_type']=='qwen2':
+        fout.write(struct.pack("i", hparams["intermediate_size"]))
+    else:
+        fout.write(struct.pack("i", int(hparams["intermediate_size"]/2)))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))  # n_experts
     fout.write(struct.pack("i", 0))  # n_expert_used
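
The header change above is the substance of the Qwen fix. Instead of always writing hparams["intermediate_size"] and letting the C++ loader halve it for 8K-context Qwen models (the workaround removed from qwen_utils.cpp below), the converter now writes the value the runtime appears to expect: halved for the original Qwen (model_type != "qwen2"), unchanged for Qwen2. A standalone sketch of that field write, assuming hparams is the model config as a dict (write_ffn_hidden_size is a hypothetical helper, not a function from the repo):

```python
import struct
from typing import BinaryIO

def write_ffn_hidden_size(fout: BinaryIO, hparams: dict) -> None:
    # Qwen2 configs already hold the FFN hidden size the runtime uses;
    # original Qwen configs appear to hold twice that value, so it is
    # halved here rather than special-cased in the C++ loader.
    if hparams["model_type"] == "qwen2":
        fout.write(struct.pack("i", hparams["intermediate_size"]))
    else:
        fout.write(struct.pack("i", int(hparams["intermediate_size"] / 2)))

# Illustrative values only, not taken from a real config:
# with open("ne-qwen-f16.bin", "ab") as fout:
#     write_ffn_hidden_size(fout, {"model_type": "qwen", "intermediate_size": 22016})
```
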
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_starcoder.py
@@ -82,7 +82,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
                                                  low_cpu_mem_usage=True,
                                                  trust_remote_code=True)
     print("Model loaded: ", dir_model)
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
     hparams = config.to_dict()

     list_vars = model.state_dict()
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_whisper.py
@@ -113,7 +113,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         from modelscope import WhisperForConditionalGeneration
     else:
         from transformers import WhisperForConditionalGeneration
-    model = WhisperForConditionalGeneration.from_pretrained(dir_model)
+    model = WhisperForConditionalGeneration.from_pretrained(dir_model, trust_remote_code=True)

     #code.interact(local=locals())
     path = os.getcwd()
3 changes: 0 additions & 3 deletions neural_speed/models/qwen/qwen_utils.cpp
@@ -53,9 +53,6 @@ void QWEN::init(const char* path_model, model_context* ctx, int n_gpu_layer_, bo
   model_file_version file_version = ml->file_loaders.at(0)->file_version;
   auto& hparams = model.hparams;
   n_ff = hparams.ffn_hidden_size;
-  if (hparams.max_seq_len == 8192) {
-    n_ff = n_ff / 2;
-  }
   fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
   fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
   fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
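
This removal pairs with the converter change above: once convert_qwen.py and convert_quantized_qwen.py write an already-adjusted FFN size into the file header, the loader can take hparams.ffn_hidden_size as-is instead of halving it for models whose max_seq_len happens to be 8192.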
