diff --git a/README.md b/README.md
index 57121ab41..4a4102186 100644
--- a/README.md
+++ b/README.md
@@ -60,7 +60,22 @@ streamer = TextStreamer(tokenizer)
 model = AutoModelForCausalLM.from_pretrained(model_name, model_file = model_file)
 outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
 ```
+PyTorch format ModelScope model
+```python
+import sys
+from modelscope import AutoTokenizer
+from transformers import TextStreamer
+from neural_speed import Model
+model_name = "qwen/Qwen1.5-7B-Chat"  # ModelScope model_id or local model path
+prompt = "Once upon a time, there existed a little girl,"
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+inputs = tokenizer(prompt, return_tensors="pt").input_ids
+streamer = TextStreamer(tokenizer)
+model = Model()
+model.init(model_name, weight_dtype="int4", compute_dtype="int8", model_hub="modelscope")
+outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
+```
 Please refer [this link](./docs/supported_models.md) to check supported models.
 
 If you want to use [Transformer-based API](https://github.com/intel/intel-extension-for-transformers/blob/main/docs/weightonlyquant.md#llm-runtime-example-code) in [ITREX(Intel extension for transformers)](https://github.com/intel/intel-extension-for-transformers). Please refer to [ITREX Installation Page](https://github.com/intel/intel-extension-for-transformers/blob/main/docs/installation.md).
diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py
index d0ab95c21..12229812e 100644
--- a/neural_speed/__init__.py
+++ b/neural_speed/__init__.py
@@ -18,7 +18,6 @@
 import torch
 
 from neural_speed.convert import convert_model
-from transformers import AutoConfig, AutoTokenizer
 
 model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder"}
 max_request_num_default = 8
@@ -87,8 +86,13 @@ def get_model_type(model_config):
     def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_autoround=False,
              weight_dtype="int4", alg="sym", group_size=32,
-             scale_dtype="fp32", compute_dtype="int8", use_ggml=False):
-        self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+             scale_dtype="fp32", compute_dtype="int8", use_ggml=False, model_hub="huggingface"):
+        if model_hub == "modelscope":
+            from modelscope import AutoConfig
+            self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+        else:
+            from transformers import AutoConfig
+            self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
         model_type = Model.get_model_type(self.config)
         self.model_type = model_type
         self.__import_package(model_type)
@@ -129,7 +133,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_au
             return
 
         if not os.path.exists(fp32_bin):
-            convert_model(model_name, fp32_bin, "f32")
+            convert_model(model_name, fp32_bin, "f32", model_hub=model_hub)
             assert os.path.exists(fp32_bin), "Fail to convert pytorch model"
 
         if not use_quant:
diff --git a/neural_speed/convert/__init__.py b/neural_speed/convert/__init__.py
index 4e2a6796d..18ce11490 100644
--- a/neural_speed/convert/__init__.py
+++ b/neural_speed/convert/__init__.py
@@ -16,14 +16,18 @@
 #  limitations under the License.
 
from pathlib import Path -from transformers import AutoConfig import subprocess model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder", "whisper": "whisper", "qwen2": "qwen"} -def convert_model(model, outfile, outtype="f32", whisper_repo_path=None, use_quantized_model=False): - config = AutoConfig.from_pretrained(model, trust_remote_code=True) +def convert_model(model, outfile, outtype="f32", model_hub="huggingface", use_quantized_model=False): + if model_hub == "modelscope": + from modelscope import AutoConfig + config = AutoConfig.from_pretrained(model, trust_remote_code=True) + else: + from transformers import AutoConfig + config = AutoConfig.from_pretrained(model, trust_remote_code=True) model_type = model_maps.get(config.model_type, config.model_type) if use_quantized_model: @@ -34,6 +38,7 @@ def convert_model(model, outfile, outtype="f32", whisper_repo_path=None, use_qua cmd.extend(["python", path]) cmd.extend(["--outfile", outfile]) cmd.extend(["--outtype", outtype]) + cmd.extend(["--model_hub", model_hub]) cmd.extend([model]) print("cmd:", cmd) diff --git a/neural_speed/convert/convert_baichuan.py b/neural_speed/convert/convert_baichuan.py index 9303df2c1..3fddfe29f 100644 --- a/neural_speed/convert/convert_baichuan.py +++ b/neural_speed/convert/convert_baichuan.py @@ -19,7 +19,6 @@ import argparse from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) -from transformers import AutoModel, AutoConfig, AutoModelForCausalLM, AutoTokenizer from sentencepiece import SentencePieceProcessor # type: ignore @@ -231,6 +230,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -243,10 +244,14 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - + if args.model_hub == "modelscope": + from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + print("Loading model: ", dir_model) + model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True) config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True) hparams = config.to_dict() diff --git a/neural_speed/convert/convert_bloom.py b/neural_speed/convert/convert_bloom.py index 7bd263d52..b26fee236 100644 --- a/neural_speed/convert/convert_bloom.py +++ b/neural_speed/convert/convert_bloom.py @@ -24,7 +24,6 @@ import argparse from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) -from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py @@ -54,6 +53,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = 
argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -66,16 +67,20 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - - tokenizer = AutoTokenizer.from_pretrained(dir_model) + if args.model_hub == "modelscope": + from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) - hparams = config.to_dict() - print("Loading model: ", dir_model) model = AutoModelForCausalLM.from_pretrained(dir_model, config=config, torch_dtype=torch.float16 if ftype == 1 else torch.float32, low_cpu_mem_usage=True, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(dir_model) + hparams = config.to_dict() + print("Loading model: ", dir_model) + print("Model loaded: ", dir_model) fout = open(fname_out, "wb") diff --git a/neural_speed/convert/convert_chatglm.py b/neural_speed/convert/convert_chatglm.py index 124d19ec3..47db7f569 100644 --- a/neural_speed/convert/convert_chatglm.py +++ b/neural_speed/convert/convert_chatglm.py @@ -19,7 +19,6 @@ import argparse from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig from sentencepiece import SentencePieceProcessor # type: ignore import gguf @@ -612,6 +611,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") parser.add_argument("--format", type=str, @@ -629,10 +630,13 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - + if args.model_hub == "modelscope": + from modelscope import AutoConfig, AutoModel, AutoTokenizer + else: + from transformers import AutoConfig, AutoModel, AutoTokenizer + model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True) config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True) hparams = config.to_dict() diff --git a/neural_speed/convert/convert_dolly.py b/neural_speed/convert/convert_dolly.py index 0d90c29fb..28d477297 100644 --- a/neural_speed/convert/convert_dolly.py +++ b/neural_speed/convert/convert_dolly.py @@ -32,7 +32,6 @@ import argparse from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, 
Sequence, Tuple, TypeVar, Union) -from transformers import AutoModelForCausalLM, AutoTokenizer # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py @@ -62,6 +61,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -74,10 +75,13 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - + if args.model_hub == "modelscope": + from modelscope import AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModelForCausalLM, AutoTokenizer + model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32) tokenizer = AutoTokenizer.from_pretrained(dir_model) print("Loading model: ", dir_model) - model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32) model.eval() for p in model.parameters(): p.requires_grad = False diff --git a/neural_speed/convert/convert_falcon.py b/neural_speed/convert/convert_falcon.py index 3c88850f5..d0ac575a2 100644 --- a/neural_speed/convert/convert_falcon.py +++ b/neural_speed/convert/convert_falcon.py @@ -24,7 +24,6 @@ import argparse from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) -from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py @@ -54,6 +53,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -66,21 +67,24 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) - with open(os.path.join(dir_model, "config.json"), "r", encoding="utf-8") as f: - hparams = json.load(f) - if hparams["architectures"][0] != "FalconForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - sys.exit(1) + if args.model_hub == "modelscope": + from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) + config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(dir_model, config=config, torch_dtype=torch.float16 if ftype == 1 else torch.float32, 
low_cpu_mem_usage=True, trust_remote_code=True) print("Model loaded: ", dir_model) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + with open(os.path.join(dir_model, "config.json"), "r", encoding="utf-8") as f: + hparams = json.load(f) + if hparams["architectures"][0] != "FalconForCausalLM": + print("Model architecture not supported: " + hparams["architectures"][0]) + sys.exit(1) n_head_kv = hparams.get("num_kv_heads", 1) n_head = hparams["num_attention_heads"] diff --git a/neural_speed/convert/convert_gptj.py b/neural_speed/convert/convert_gptj.py index d41b0cf5f..0670cb3de 100644 --- a/neural_speed/convert/convert_gptj.py +++ b/neural_speed/convert/convert_gptj.py @@ -29,7 +29,6 @@ import argparse from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) -from transformers import AutoModelForCausalLM, AutoTokenizer # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py @@ -59,6 +58,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -68,10 +69,13 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - + if args.model_hub == "modelscope": + from modelscope import AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) hparams = model.config.to_dict() list_vars = model.state_dict() fout = open(fname_out, "wb") diff --git a/neural_speed/convert/convert_gptneox.py b/neural_speed/convert/convert_gptneox.py index 0dade0563..da3937451 100644 --- a/neural_speed/convert/convert_gptneox.py +++ b/neural_speed/convert/convert_gptneox.py @@ -62,6 +62,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -74,10 +76,13 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - - tokenizer = AutoTokenizer.from_pretrained(dir_model) + if args.model_hub == "modelscope": + from modelscope import AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if 
ftype == 1 else torch.float32) + tokenizer = AutoTokenizer.from_pretrained(dir_model) model.eval() for p in model.parameters(): p.requires_grad = False diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py index 5470035db..37af73ab4 100644 --- a/neural_speed/convert/convert_llama.py +++ b/neural_speed/convert/convert_llama.py @@ -35,7 +35,6 @@ Union) import numpy as np from sentencepiece import SentencePieceProcessor # type: ignore -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig import gguf if TYPE_CHECKING: @@ -1423,6 +1422,8 @@ def main(args_in: Optional[List[str]] = None) -> None: type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") @@ -1432,7 +1433,6 @@ def main(args_in: Optional[List[str]] = None) -> None: choices=["NE", "GGUF"], help="convert to the GGUF or NE format") args = parser.parse_args(args_in) - vocab: Vocab if args.dump_single: model_plus = lazy_load_file(args.model) @@ -1449,8 +1449,13 @@ def main(args_in: Optional[List[str]] = None) -> None: model_plus = load_some_model(args.model) else: print("Loadding the model from HF.") - model = AutoModel.from_pretrained(args.model, low_cpu_mem_usage=True, trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + if args.model_hub == "modelscope": + from modelscope import AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModelForCausalLM, AutoTokenizer + model = AutoModelForCausalLM.from_pretrained(str(args.model), low_cpu_mem_usage=True, + trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(str(args.model), trust_remote_code=True) cache_path = Path(tokenizer.vocab_file).parent args.model = cache_path diff --git a/neural_speed/convert/convert_mistral.py b/neural_speed/convert/convert_mistral.py index a440f529d..1889a9860 100644 --- a/neural_speed/convert/convert_mistral.py +++ b/neural_speed/convert/convert_mistral.py @@ -36,7 +36,6 @@ import numpy as np from sentencepiece import SentencePieceProcessor # type: ignore -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig if TYPE_CHECKING: from typing_extensions import TypeAlias @@ -1298,11 +1297,12 @@ def main(args_in: Optional[List[str]] = None) -> None: type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") args = parser.parse_args(args_in) - vocab: Vocab if args.dump_single: model_plus = lazy_load_file(args.model) @@ -1318,8 +1318,12 @@ def main(args_in: Optional[List[str]] = None) -> None: print("Loadding the model from the local path.") else: print("Loadding the model from HF.") - model = AutoModel.from_pretrained(args.model, low_cpu_mem_usage=True, trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + if 
args.model_hub == "modelscope": + from modelscope import AutoConfig, AutoModel, AutoTokenizer + else: + from transformers import AutoConfig, AutoModel, AutoTokenizer + model = AutoModel.from_pretrained(str(args.model), low_cpu_mem_usage=True, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(str(args.model), trust_remote_code=True) cache_path = Path(tokenizer.vocab_file).parent args.model = cache_path diff --git a/neural_speed/convert/convert_mixtral.py b/neural_speed/convert/convert_mixtral.py index 1c8eded16..4166d94be 100644 --- a/neural_speed/convert/convert_mixtral.py +++ b/neural_speed/convert/convert_mixtral.py @@ -36,7 +36,6 @@ import numpy as np from sentencepiece import SentencePieceProcessor # type: ignore -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig if TYPE_CHECKING: from typing_extensions import TypeAlias @@ -1300,6 +1299,8 @@ def main(args_in: Optional[List[str]] = None) -> None: type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") @@ -1320,8 +1321,12 @@ def main(args_in: Optional[List[str]] = None) -> None: print("Loadding the model from the local path.") else: print("Loadding the model from HF.") - model = AutoModel.from_pretrained(args.model, low_cpu_mem_usage=True, trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + if args.model_hub == "modelscope": + from modelscope import AutoConfig, AutoModel, AutoTokenizer + else: + from transformers import AutoConfig, AutoModel, AutoTokenizer + model = AutoModel.from_pretrained(str(args.model), low_cpu_mem_usage=True, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(str(args.model), trust_remote_code=True) cache_path = Path(tokenizer.vocab_file).parent args.model = cache_path diff --git a/neural_speed/convert/convert_mpt.py b/neural_speed/convert/convert_mpt.py index cd56af41d..929b41818 100644 --- a/neural_speed/convert/convert_mpt.py +++ b/neural_speed/convert/convert_mpt.py @@ -51,6 +51,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -62,9 +64,12 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + if args.model_hub == "modelscope": + from modelscope import AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModelForCausalLM, AutoTokenizer model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) hparams = 
model.config.to_dict() list_vars = model.state_dict() diff --git a/neural_speed/convert/convert_opt.py b/neural_speed/convert/convert_opt.py index 07b7a632a..10fb9e4c7 100644 --- a/neural_speed/convert/convert_opt.py +++ b/neural_speed/convert/convert_opt.py @@ -60,6 +60,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -72,10 +74,13 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - - tokenizer = AutoTokenizer.from_pretrained(dir_model) + if args.model_hub == "modelscope": + from modelscope import AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32) + tokenizer = AutoTokenizer.from_pretrained(dir_model) model.eval() hparams = model.config.to_dict() diff --git a/neural_speed/convert/convert_phi.py b/neural_speed/convert/convert_phi.py index 6e02b0b55..a81069afc 100644 --- a/neural_speed/convert/convert_phi.py +++ b/neural_speed/convert/convert_phi.py @@ -267,6 +267,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default="huggingface", + help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") parser.add_argument("--format", type=str, @@ -284,10 +286,13 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + if args.model_hub == "modelscope": + from modelscope import AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) hparams = model.config.to_dict() if args.format == "GGUF": phi_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams) diff --git a/neural_speed/convert/convert_quantized_qwen.py b/neural_speed/convert/convert_quantized_qwen.py index fc0b87ed2..d5444b25d 100644 --- a/neural_speed/convert/convert_quantized_qwen.py +++ b/neural_speed/convert/convert_quantized_qwen.py @@ -177,12 +177,12 @@ def main(args_in: Optional[List[str]] = None) -> None: f.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings f.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) - f.write( - struct.pack( - "i", hparams["bos_token_id"] if "bos_token_id" in hparams else 
tokenizer.special_tokens['<|endoftext|>'])) - f.write( - struct.pack( - "i", hparams["eos_token_id"] if "eos_token_id" in hparams else tokenizer.special_tokens['<|endoftext|>'])) + if hparams['model_type']=='qwen2': + f.write(struct.pack("i", hparams["bos_token_id"])) + f.write(struct.pack("i", hparams["eos_token_id"])) + else: + f.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>'])) + f.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>'])) f.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) f.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) diff --git a/neural_speed/convert/convert_qwen.py b/neural_speed/convert/convert_qwen.py index bcdacbdc2..5f694d5ce 100644 --- a/neural_speed/convert/convert_qwen.py +++ b/neural_speed/convert/convert_qwen.py @@ -32,7 +32,7 @@ import argparse from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) -from transformers import AutoModelForCausalLM, AutoTokenizer + # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py @@ -62,6 +62,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -74,10 +76,13 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + if args.model_hub == "modelscope": + from modelscope import AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) - model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(dir_model) + tokenizer = AutoTokenizer.from_pretrained(dir_model) model.eval() for p in model.parameters(): p.requires_grad = False @@ -125,12 +130,12 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) - fout.write( - struct.pack( - "i", hparams["bos_token_id"] if "bos_token_id" in hparams else tokenizer.special_tokens['<|endoftext|>'])) - fout.write( - struct.pack( - "i", hparams["eos_token_id"] if "eos_token_id" in hparams else tokenizer.special_tokens['<|endoftext|>'])) + if hparams['model_type']=='qwen2': + fout.write(struct.pack("i", hparams["bos_token_id"])) + fout.write(struct.pack("i", hparams["eos_token_id"])) + else: + fout.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>'])) + fout.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>'])) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) diff --git 
a/neural_speed/convert/convert_starcoder.py b/neural_speed/convert/convert_starcoder.py index f176ef8d9..105f58e4f 100644 --- a/neural_speed/convert/convert_starcoder.py +++ b/neural_speed/convert/convert_starcoder.py @@ -56,6 +56,8 @@ def main(args_in: Optional[List[str]] = None) -> None: default="fp32", help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -68,17 +70,20 @@ def main(args_in: Optional[List[str]] = None) -> None: use_f16 = False if args.outtype == "f16": use_f16 = True - + if args.model_hub == "modelscope": + from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) - tokenizer = AutoTokenizer.from_pretrained(dir_model) config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) - hparams = config.to_dict() model = AutoModelForCausalLM.from_pretrained(dir_model, config=config, torch_dtype=torch.float16 \ if use_f16 else torch.float32, low_cpu_mem_usage=True, trust_remote_code=True) print("Model loaded: ", dir_model) + tokenizer = AutoTokenizer.from_pretrained(dir_model) + hparams = config.to_dict() list_vars = model.state_dict() diff --git a/neural_speed/convert/convert_whisper.py b/neural_speed/convert/convert_whisper.py index 157f34175..b9718d9e7 100644 --- a/neural_speed/convert/convert_whisper.py +++ b/neural_speed/convert/convert_whisper.py @@ -42,7 +42,6 @@ from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) -from transformers import WhisperForConditionalGeneration conv_map = { 'self_attn.k_proj': 'attn.key', @@ -98,6 +97,8 @@ def main(args_in: Optional[List[str]] = None) -> None: default="fp32", help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) dir_model = args.model @@ -108,7 +109,10 @@ def main(args_in: Optional[List[str]] = None) -> None: encoder = json.load((dir_model / "vocab.json").open("r", encoding="utf8")) encoder_added = json.load((dir_model / "added_tokens.json").open("r", encoding="utf8")) hparams = json.load((dir_model / "config.json").open("r", encoding="utf8")) - + if args.model_hub == "modelscope": + from modelscope import WhisperForConditionalGeneration + else: + from transformers import WhisperForConditionalGeneration model = WhisperForConditionalGeneration.from_pretrained(dir_model) #code.interact(local=locals()) diff --git a/scripts/python_api_example_for_modelscope.py b/scripts/python_api_example_for_modelscope.py new file mode 100644 index 000000000..7dd311afe --- /dev/null +++ b/scripts/python_api_example_for_modelscope.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+from modelscope import AutoTokenizer
+from transformers import TextStreamer
+from neural_speed import Model
+
+if len(sys.argv) != 2:
+    print("Usage: python python_api_example_for_modelscope.py model_path")
+model_name = sys.argv[1]
+
+prompt = "Once upon a time, a little girl"
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+inputs = tokenizer(prompt, return_tensors="pt").input_ids
+streamer = TextStreamer(tokenizer)
+
+model = Model()
+# If you want to run GPTQ or AWQ models, just set use_gptq = True or use_awq = True.
+model.init(model_name, weight_dtype="int4", compute_dtype="int8", model_hub="modelscope")
+outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True)
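
With this patch applied, the ModelScope path can be exercised either through the high-level `Model.init(..., model_hub="modelscope")` API shown in the README hunk above, or by calling the converter directly. Below is a minimal sketch of the direct converter call, assuming the patched `neural_speed` package is installed; the output filename `ne_qwen_f32.bin` is only illustrative, and the model id mirrors the README example.

```python
from neural_speed.convert import convert_model

# Resolves the model config from ModelScope (rather than Hugging Face) and dispatches to the
# matching convert_*.py script, which downloads the weights; --model_hub modelscope is passed through.
convert_model("qwen/Qwen1.5-7B-Chat", "ne_qwen_f32.bin", outtype="f32", model_hub="modelscope")
```

The end-to-end example added under `scripts/` can be run the same way as the existing Hugging Face one, e.g. `python scripts/python_api_example_for_modelscope.py qwen/Qwen1.5-7B-Chat`.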