diff --git a/neural_speed/convert/convert-hf-to-gguf.py b/neural_speed/convert/convert-hf-to-gguf.py index f3efb6485..b583ee9c4 100755 --- a/neural_speed/convert/convert-hf-to-gguf.py +++ b/neural_speed/convert/convert-hf-to-gguf.py @@ -64,6 +64,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None fname_tokenizer, cache_dir=fname_tokenizer, local_files_only=True, + trust_remote_code=True ) # Initialize lists and dictionaries for added tokens @@ -402,7 +403,7 @@ def _set_vocab_gpt2(self): toktypes: list[int] = [] from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size diff --git a/neural_speed/convert/convert_bloom.py b/neural_speed/convert/convert_bloom.py index b26fee236..94fecf678 100644 --- a/neural_speed/convert/convert_bloom.py +++ b/neural_speed/convert/convert_bloom.py @@ -77,7 +77,7 @@ def main(args_in: Optional[List[str]] = None) -> None: torch_dtype=torch.float16 if ftype == 1 else torch.float32, low_cpu_mem_usage=True, trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(dir_model) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) hparams = config.to_dict() print("Loading model: ", dir_model) diff --git a/neural_speed/convert/convert_dolly.py b/neural_speed/convert/convert_dolly.py index 28d477297..1360735a8 100644 --- a/neural_speed/convert/convert_dolly.py +++ b/neural_speed/convert/convert_dolly.py @@ -79,8 +79,9 @@ def main(args_in: Optional[List[str]] = None) -> None: from modelscope import AutoModelForCausalLM, AutoTokenizer else: from transformers import AutoModelForCausalLM, AutoTokenizer - model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32) - tokenizer = 
AutoTokenizer.from_pretrained(dir_model) + model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32, + trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) print("Loading model: ", dir_model) model.eval() for p in model.parameters(): diff --git a/neural_speed/convert/convert_gptj.py b/neural_speed/convert/convert_gptj.py index 0670cb3de..e5b588698 100644 --- a/neural_speed/convert/convert_gptj.py +++ b/neural_speed/convert/convert_gptj.py @@ -74,7 +74,7 @@ def main(args_in: Optional[List[str]] = None) -> None: else: from transformers import AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) - model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True) + model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) hparams = model.config.to_dict() list_vars = model.state_dict() diff --git a/neural_speed/convert/convert_gptneox.py b/neural_speed/convert/convert_gptneox.py index da3937451..82a190cab 100644 --- a/neural_speed/convert/convert_gptneox.py +++ b/neural_speed/convert/convert_gptneox.py @@ -81,8 +81,9 @@ def main(args_in: Optional[List[str]] = None) -> None: else: from transformers import AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) - model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32) - tokenizer = AutoTokenizer.from_pretrained(dir_model) + model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32, + trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) model.eval() for p in model.parameters(): p.requires_grad = False diff --git a/neural_speed/convert/convert_opt.py 
b/neural_speed/convert/convert_opt.py index 10fb9e4c7..ee6dcad53 100644 --- a/neural_speed/convert/convert_opt.py +++ b/neural_speed/convert/convert_opt.py @@ -79,8 +79,9 @@ def main(args_in: Optional[List[str]] = None) -> None: else: from transformers import AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) - model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32) - tokenizer = AutoTokenizer.from_pretrained(dir_model) + model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32, + trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) model.eval() hparams = model.config.to_dict() diff --git a/neural_speed/convert/convert_phi.py b/neural_speed/convert/convert_phi.py index a81069afc..d7b685cd9 100644 --- a/neural_speed/convert/convert_phi.py +++ b/neural_speed/convert/convert_phi.py @@ -86,7 +86,7 @@ def write_vocab_gguf(dir_model, hparams, gguf_writer): toktypes: list[int] = [] from transformers import AutoTokenizer # type: ignore[attr-defined] - tokenizer = AutoTokenizer.from_pretrained(dir_model) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size diff --git a/neural_speed/convert/convert_quantized_bloom.py b/neural_speed/convert/convert_quantized_bloom.py index 21ce87ab6..a323019e8 100644 --- a/neural_speed/convert/convert_quantized_bloom.py +++ b/neural_speed/convert/convert_quantized_bloom.py @@ -121,7 +121,7 @@ def bytes_to_unicode(): model_name = "/mnt/disk1/data2/zhenweil/models/bloom/bloom-7b1" prompt = "Once upon a time, a little girl" -tokenizer = AutoTokenizer.from_pretrained(model_name) +tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) 
inputs = tokenizer(prompt, return_tensors="pt").input_ids streamer = TextStreamer(tokenizer) diff --git a/neural_speed/convert/convert_quantized_qwen.py b/neural_speed/convert/convert_quantized_qwen.py index beab20ca1..b57238862 100644 --- a/neural_speed/convert/convert_quantized_qwen.py +++ b/neural_speed/convert/convert_quantized_qwen.py @@ -159,7 +159,7 @@ def main(args_in: Optional[List[str]] = None) -> None: "i", hparams["kv_channels"] if "kv_channels" in hparams else int(hparams["hidden_size"] / hparams["num_attention_heads"]))) f.write(struct.pack("i", ftype)) - f.write(struct.pack("i", hparams["seq_length"] if "seq_length" in hparams else hparams["max_position_embeddings"])) + f.write(struct.pack("i", hparams["max_position_embeddings"])) f.write(struct.pack("f", 0.0)) f.write(struct.pack("f", 0.0)) f.write(struct.pack("i", 0)) @@ -167,7 +167,10 @@ def main(args_in: Optional[List[str]] = None) -> None: f.write(struct.pack("i", 0)) # do_layer_norm_before (for opt) f.write(struct.pack("i", 0)) - f.write(struct.pack("i", hparams["intermediate_size"])) + if hparams['model_type']=='qwen2': + f.write(struct.pack("i", hparams["intermediate_size"])) + else: + f.write(struct.pack("i", int(hparams["intermediate_size"]/2))) f.write(struct.pack("i", 0)) f.write(struct.pack("i", 0)) # n_experts f.write(struct.pack("i", 0)) # n_expert_used diff --git a/neural_speed/convert/convert_qwen.py b/neural_speed/convert/convert_qwen.py index 5f694d5ce..603d7c2b9 100644 --- a/neural_speed/convert/convert_qwen.py +++ b/neural_speed/convert/convert_qwen.py @@ -81,8 +81,8 @@ def main(args_in: Optional[List[str]] = None) -> None: else: from transformers import AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) - model = AutoModelForCausalLM.from_pretrained(dir_model) - tokenizer = AutoTokenizer.from_pretrained(dir_model) + model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(dir_model, 
trust_remote_code=True) model.eval() for p in model.parameters(): p.requires_grad = False @@ -111,7 +111,7 @@ def main(args_in: Optional[List[str]] = None) -> None: hparams["num_attention_heads"]))) fout.write(struct.pack("i", ftype)) fout.write( - struct.pack("i", hparams["seq_length"] if "seq_length" in hparams else hparams["max_position_embeddings"])) + struct.pack("i", hparams["max_position_embeddings"])) fout.write(struct.pack("f", 0.0)) fout.write(struct.pack("f", 0.0)) fout.write(struct.pack("i", 0)) @@ -119,7 +119,10 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) # do_layer_norm_before (for opt) fout.write(struct.pack("i", 0)) - fout.write(struct.pack("i", hparams["intermediate_size"])) + if hparams['model_type']=='qwen2': + fout.write(struct.pack("i", hparams["intermediate_size"])) + else: + fout.write(struct.pack("i", int(hparams["intermediate_size"]/2))) fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) # n_experts fout.write(struct.pack("i", 0)) # n_expert_used diff --git a/neural_speed/convert/convert_starcoder.py b/neural_speed/convert/convert_starcoder.py index 105f58e4f..b53fe3f38 100644 --- a/neural_speed/convert/convert_starcoder.py +++ b/neural_speed/convert/convert_starcoder.py @@ -82,7 +82,7 @@ def main(args_in: Optional[List[str]] = None) -> None: low_cpu_mem_usage=True, trust_remote_code=True) print("Model loaded: ", dir_model) - tokenizer = AutoTokenizer.from_pretrained(dir_model) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) hparams = config.to_dict() list_vars = model.state_dict() diff --git a/neural_speed/convert/convert_whisper.py b/neural_speed/convert/convert_whisper.py index b9718d9e7..3351caa8b 100644 --- a/neural_speed/convert/convert_whisper.py +++ b/neural_speed/convert/convert_whisper.py @@ -113,7 +113,7 @@ def main(args_in: Optional[List[str]] = None) -> None: from modelscope import WhisperForConditionalGeneration else: from transformers 
import WhisperForConditionalGeneration - model = WhisperForConditionalGeneration.from_pretrained(dir_model) + model = WhisperForConditionalGeneration.from_pretrained(dir_model, trust_remote_code=True) #code.interact(local=locals()) path = os.getcwd() diff --git a/neural_speed/models/qwen/qwen_utils.cpp b/neural_speed/models/qwen/qwen_utils.cpp index 32e222d76..4dd25cec4 100644 --- a/neural_speed/models/qwen/qwen_utils.cpp +++ b/neural_speed/models/qwen/qwen_utils.cpp @@ -53,9 +53,6 @@ void QWEN::init(const char* path_model, model_context* ctx, int n_gpu_layer_, bo model_file_version file_version = ml->file_loaders.at(0)->file_version; auto& hparams = model.hparams; n_ff = hparams.ffn_hidden_size; - if (hparams.max_seq_len == 8192) { - n_ff = n_ff / 2; - } fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);