This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

enable load model from modelscope (#154)
intellinjun authored Mar 8, 2024
1 parent 37d01f3 commit ad3d19e
Showing 21 changed files with 205 additions and 73 deletions.
15 changes: 15 additions & 0 deletions README.md
@@ -60,7 +60,22 @@ streamer = TextStreamer(tokenizer)
model = AutoModelForCausalLM.from_pretrained(model_name, model_file = model_file)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
PyTorch format ModelScope model
```python
import sys
from modelscope import AutoTokenizer
from transformers import TextStreamer
from neural_speed import Model

model_name = "qwen/Qwen1.5-7B-Chat" # modelscope model_id or local model
prompt = "Once upon a time, there existed a little girl,"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)
model = Model()
model.init(model_name, weight_dtype="int4", compute_dtype="int8", model_hub="modelscope")
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
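The ModelScope example assumes the `modelscope` package is already installed (for example via `pip install modelscope`); a minimal, optional sanity check:
```python
# Optional check that the ModelScope SDK is available; raises PackageNotFoundError otherwise.
from importlib.metadata import version

print("modelscope", version("modelscope"))
```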
Please refer [this link](./docs/supported_models.md) to check supported models.

If you want to use the [Transformer-based API](https://github.com/intel/intel-extension-for-transformers/blob/main/docs/weightonlyquant.md#llm-runtime-example-code) in [ITREX (Intel Extension for Transformers)](https://github.com/intel/intel-extension-for-transformers), please refer to the [ITREX Installation Page](https://github.com/intel/intel-extension-for-transformers/blob/main/docs/installation.md).
12 changes: 8 additions & 4 deletions neural_speed/__init__.py
@@ -18,7 +18,6 @@

import torch
from neural_speed.convert import convert_model
from transformers import AutoConfig, AutoTokenizer

model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder"}
max_request_num_default = 8
@@ -87,8 +86,13 @@ def get_model_type(model_config):

def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_autoround=False,
weight_dtype="int4", alg="sym", group_size=32,
scale_dtype="fp32", compute_dtype="int8", use_ggml=False):
self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
scale_dtype="fp32", compute_dtype="int8", use_ggml=False, model_hub="huggingface"):
if model_hub == "modelscope":
from modelscope import AutoConfig
self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
else:
from transformers import AutoConfig
self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
model_type = Model.get_model_type(self.config)
self.model_type = model_type
self.__import_package(model_type)
@@ -129,7 +133,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_au
return

if not os.path.exists(fp32_bin):
convert_model(model_name, fp32_bin, "f32")
convert_model(model_name, fp32_bin, "f32", model_hub = model_hub)
assert os.path.exists(fp32_bin), "Fail to convert pytorch model"

if not use_quant:
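The hub-aware `init()` above defers the `AutoConfig` import until the hub is known, and the same two-branch import recurs in each converter below. A minimal standalone sketch of the pattern, with an illustrative helper name that is not part of the commit:
```python
# Sketch of the hub-dispatch pattern introduced by this commit (helper name is illustrative).
def load_auto_config(model_name: str, model_hub: str = "huggingface"):
    if model_hub == "modelscope":
        from modelscope import AutoConfig  # assumes the modelscope package is installed
    else:
        from transformers import AutoConfig
    return AutoConfig.from_pretrained(model_name, trust_remote_code=True)

# Mirrors what Model.init() does before conversion:
config = load_auto_config("qwen/Qwen1.5-7B-Chat", model_hub="modelscope")
print(config.model_type)
```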
11 changes: 8 additions & 3 deletions neural_speed/convert/__init__.py
@@ -16,14 +16,18 @@
# limitations under the License.

from pathlib import Path
from transformers import AutoConfig
import subprocess

model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder", "whisper": "whisper", "qwen2": "qwen"}


def convert_model(model, outfile, outtype="f32", whisper_repo_path=None, use_quantized_model=False):
config = AutoConfig.from_pretrained(model, trust_remote_code=True)
def convert_model(model, outfile, outtype="f32", model_hub="huggingface", use_quantized_model=False):
if model_hub == "modelscope":
from modelscope import AutoConfig
config = AutoConfig.from_pretrained(model, trust_remote_code=True)
else:
from transformers import AutoConfig
config = AutoConfig.from_pretrained(model, trust_remote_code=True)
model_type = model_maps.get(config.model_type, config.model_type)

if use_quantized_model:
@@ -34,6 +38,7 @@ def convert_model(model, outfile, outtype="f32", whisper_repo_path=None, use_qua
cmd.extend(["python", path])
cmd.extend(["--outfile", outfile])
cmd.extend(["--outtype", outtype])
cmd.extend(["--model_hub", model_hub])
cmd.extend([model])

print("cmd:", cmd)
11 changes: 8 additions & 3 deletions neural_speed/convert/convert_baichuan.py
@@ -19,7 +19,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
from transformers import AutoModel, AutoConfig, AutoModelForCausalLM, AutoTokenizer
from sentencepiece import SentencePieceProcessor # type: ignore


@@ -231,6 +230,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
default="huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -243,10 +244,14 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

if args.model_hub == "modelscope":
from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
print("Loading model: ", dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True)

hparams = config.to_dict()
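Because `main()` already accepts an argument list, the converter can be driven programmatically as well as from the command line. A hedged example with the new flag; the ModelScope model id and output path are illustrative:
```python
# Hedged example: run the Baichuan converter against a ModelScope-hosted model.
from neural_speed.convert.convert_baichuan import main

main(["--model_hub", "modelscope",
      "--outtype", "f32",
      "--outfile", "ne_baichuan_f32.bin",
      "baichuan-inc/Baichuan2-7B-Chat"])  # model id is an assumption
```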

15 changes: 10 additions & 5 deletions neural_speed/convert/convert_bloom.py
@@ -24,7 +24,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig


# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -54,6 +53,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
default="huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -66,16 +67,20 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

tokenizer = AutoTokenizer.from_pretrained(dir_model)
if args.model_hub == "modelscope":
from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
hparams = config.to_dict()
print("Loading model: ", dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model,
config=config,
torch_dtype=torch.float16 if ftype == 1 else torch.float32,
low_cpu_mem_usage=True,
trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(dir_model)
hparams = config.to_dict()
print("Loading model: ", dir_model)

print("Model loaded: ", dir_model)

fout = open(fname_out, "wb")
10 changes: 7 additions & 3 deletions neural_speed/convert/convert_chatglm.py
@@ -19,7 +19,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig
from sentencepiece import SentencePieceProcessor # type: ignore
import gguf

@@ -612,6 +611,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
default="huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
parser.add_argument("--format",
type=str,
@@ -629,10 +630,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

if args.model_hub == "modelscope":
from modelscope import AutoConfig, AutoModel, AutoTokenizer
else:
from transformers import AutoConfig, AutoModel, AutoTokenizer
model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)

hparams = config.to_dict()

10 changes: 7 additions & 3 deletions neural_speed/convert/convert_dolly.py
@@ -32,7 +32,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
from transformers import AutoModelForCausalLM, AutoTokenizer


# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -62,6 +61,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
default="huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -74,10 +75,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

if args.model_hub == "modelscope":
from modelscope import AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)
tokenizer = AutoTokenizer.from_pretrained(dir_model)
print("Loading model: ", dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)
model.eval()
for p in model.parameters():
p.requires_grad = False
22 changes: 13 additions & 9 deletions neural_speed/convert/convert_falcon.py
@@ -24,7 +24,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig


# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -54,6 +53,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
default="huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -66,21 +67,24 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
with open(os.path.join(dir_model, "config.json"), "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "FalconForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
if args.model_hub == "modelscope":
from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
print("Loading model: ", dir_model)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(dir_model,
config=config,
torch_dtype=torch.float16 if ftype == 1 else torch.float32,
low_cpu_mem_usage=True,
trust_remote_code=True)
print("Model loaded: ", dir_model)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
with open(os.path.join(dir_model, "config.json"), "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "FalconForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)

n_head_kv = hparams.get("num_kv_heads", 1)
n_head = hparams["num_attention_heads"]
10 changes: 7 additions & 3 deletions neural_speed/convert/convert_gptj.py
@@ -29,7 +29,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
from transformers import AutoModelForCausalLM, AutoTokenizer


# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -59,6 +58,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
default="huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -68,10 +69,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

if args.model_hub == "modelscope":
from modelscope import AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoModelForCausalLM, AutoTokenizer
print("Loading model: ", dir_model)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
hparams = model.config.to_dict()
list_vars = model.state_dict()
fout = open(fname_out, "wb")
9 changes: 7 additions & 2 deletions neural_speed/convert/convert_gptneox.py
@@ -62,6 +62,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
default="huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -74,10 +76,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

tokenizer = AutoTokenizer.from_pretrained(dir_model)
if args.model_hub == "modelscope":
from modelscope import AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoModelForCausalLM, AutoTokenizer
print("Loading model: ", dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)
tokenizer = AutoTokenizer.from_pretrained(dir_model)
model.eval()
for p in model.parameters():
p.requires_grad = False
13 changes: 9 additions & 4 deletions neural_speed/convert/convert_llama.py
@@ -35,7 +35,6 @@
Union)
import numpy as np
from sentencepiece import SentencePieceProcessor # type: ignore
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig
import gguf

if TYPE_CHECKING:
@@ -1423,6 +1422,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
type=Path,
help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
default="huggingface", help="hub to load model")
parser.add_argument("model",
type=Path,
help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
@@ -1432,7 +1433,6 @@ def main(args_in: Optional[List[str]] = None) -> None:
choices=["NE", "GGUF"],
help="convert to the GGUF or NE format")
args = parser.parse_args(args_in)

vocab: Vocab
if args.dump_single:
model_plus = lazy_load_file(args.model)
@@ -1449,8 +1449,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
model_plus = load_some_model(args.model)
else:
print("Loadding the model from HF.")
model = AutoModel.from_pretrained(args.model, low_cpu_mem_usage=True, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
if args.model_hub == "modelscope":
from modelscope import AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(str(args.model), low_cpu_mem_usage=True,
trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(str(args.model), trust_remote_code=True)
cache_path = Path(tokenizer.vocab_file).parent
args.model = cache_path
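The switch to `AutoModelForCausalLM` with `str(args.model)` lets a bare hub id stand in for a local directory, after which the downloaded snapshot's location is recovered from the tokenizer. A sketch of that recovery step, assuming a SentencePiece-based Llama tokenizer that exposes `vocab_file` (the model id is a placeholder):
```python
# Sketch of the cache-path recovery; the model id is a placeholder and vocab_file is assumed to exist.
from pathlib import Path
from transformers import AutoTokenizer  # or `from modelscope import AutoTokenizer` with --model_hub modelscope

tokenizer = AutoTokenizer.from_pretrained("my-org/my-llama-model", trust_remote_code=True)
cache_path = Path(tokenizer.vocab_file).parent  # directory containing the downloaded model files
print(cache_path)
```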

