This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

[LLM Runtime] Control printing information using NEURAL_SPEED_VERBOSE #1054

Closed · wants to merge 11 commits
Changes from 3 commits
10 changes: 10 additions & 0 deletions intel_extension_for_transformers/llm/runtime/graph/README.md
@@ -277,6 +277,16 @@ outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, ctx_size

https://github.com/intel/intel-extension-for-transformers/assets/109187816/1698dcda-c9ec-4f44-b159-f4e9d67ab15b

Method descriptions of the model (a usage sketch follows the table):
| Method | Description |
| -------------- | ----------------------------------------------------------------------- |
| `init` | Initialize the C++ model from a model name or local path |
| `init_from_bin` | Initialize the C++ model from a quantized bin file |
| `generate` | Transformers-like generate function; for arguments, refer to the `argument description of generate function` |
| `__call__` | Forward function |
| `quant_model` | Quantize a model from an fp32 bin file; for arguments, refer to the following `WeightOnlyQuantConfig` |
| `print_time` | Print the time of each evaluation |

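A minimal usage sketch of these methods (illustrative only: the model name, prompt, and quantization keyword arguments are placeholders, and the `Model` wrapper is assumed to be importable from this runtime package as shown):

```python
from transformers import AutoTokenizer
from intel_extension_for_transformers.llm.runtime.graph import Model

model_name = "Intel/neural-chat-7b-v1-1"  # placeholder: any supported model name or local path
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer("Once upon a time, there existed a little girl,", return_tensors="pt").input_ids

model = Model()
model.init(model_name, use_quant=True, weight_dtype="int4", compute_dtype="int8")  # convert + quantize
outputs = model.generate(inputs, max_new_tokens=30)  # transformers-like generation
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
model.print_time()  # print the time of each evaluation
```
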
Argument description of WeightOnlyQuantConfig ([supported MatMul combinations](#supported-matrix-multiplication-data-types-combinations)):
| Argument | Type | Description |
| -------------- | ---------- | ----------------------------------------------------------------------- |
17 changes: 13 additions & 4 deletions intel_extension_for_transformers/llm/runtime/graph/__init__.py
@@ -76,6 +76,7 @@ def get_model_type(model_config):
return model_type

def init(self, model_name, use_quant=True, use_gptq=False, **quant_kwargs):
"""initialize cpp model using model name"""
self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
self.model_type = Model.get_model_type(self.config)
@@ -127,6 +128,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, **quant_kwargs):
os.remove(fp32_bin)

def init_from_bin(self, model_type, model_path, **generate_kwargs):
"""initialize cpp model from bin file"""
self.__import_package(model_type)
self.model = self.module.Model()
if "threads" not in generate_kwargs:
@@ -138,11 +140,13 @@ def init_from_bin(self, model_type, model_path, **generate_kwargs):
self.model.init_model(model_path, **generate_kwargs)

def quant_model(self, model_type, model_path, out_path, **quant_kwargs):
"""quantize model from fp32 bin"""
self.__import_package(model_type)
self.module.Model.quant_model(model_path=model_path, out_path=out_path, **quant_kwargs)

def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=False, stopping_criteria=None,
**generate_kwargs):
"""transformer-like generate"""
max_new_tokens = generate_kwargs.get("max_new_tokens", -1)
self.batch_size = input_ids.shape[0]
if self.model is None:
@@ -190,7 +194,7 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=False,
elif (max_new_tokens != -1 and out_count >= max_new_tokens):
break
else:
all_done = [(r[-1] in [self.eos_token_id(), self.pad_token_id()]) for r in ret]
all_done = [(r[-1] in [self.__eos_token_id(), self.__pad_token_id()]) for r in ret]
if False not in all_done:
break
if streamer:
@@ -199,15 +203,15 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=False,
self.generate_round += 1
return ret

def is_token_end(self):
def __is_token_end(self):
return self.model.is_token_end()

def eos_token_id(self):
def __eos_token_id(self):
if self.model_type == 'qwen':
return self.tokenizer.special_tokens['<|endoftext|>']
return self.tokenizer.eos_token_id

def pad_token_id(self):
def __pad_token_id(self):
if self.tokenizer.pad_token_id == None:
if self.batch_size == 1:
return None
@@ -217,10 +221,15 @@ def pad_token_id(self):
return self.tokenizer.pad_token_id

def __call__(self, input_ids, reinit=False, **kwargs):
"""forward function"""
if self.model is None:
self.init_from_bin(self.model_type, self.bin_file, **kwargs)
self.generate_round = 0
elif reinit:
self.model.reinit()
self.generate_round = 0
return self.model.evaluate(input_ids.tolist())

def print_time(self):
"""print time of each evaluation"""
self.model.print_time()
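For reference while reviewing, a rough sketch of the lower-level path these methods expose: quantize an existing fp32 bin with `quant_model`, load it with `init_from_bin`, run forward passes through `__call__`, then dump timings with the new `print_time`. The bin file names and quantization keyword arguments below are illustrative, not taken from this PR.

```python
import torch
from intel_extension_for_transformers.llm.runtime.graph import Model

runtime = Model()

# Quantize an existing fp32 bin (file names and options are placeholders).
runtime.quant_model("llama", model_path="ne_llama_f32.bin", out_path="ne_llama_q_int4.bin",
                    weight_dtype="int4", compute_dtype="int8")

# Load the quantized bin, then run plain forward passes via __call__.
runtime.init_from_bin("llama", "ne_llama_q_int4.bin")
input_ids = torch.tensor([[1, 2, 3, 4]])  # toy token ids
logits = runtime(input_ids)               # forward pass
logits = runtime(input_ids, reinit=True)  # reset runtime state before another pass
runtime.print_time()                      # per-evaluation timing added in this PR
```
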
@@ -91,6 +91,10 @@ class Model {
generate_count = 0;
}

void print_time() {
model_print_timings(ctx);
}

static size_t np_jblas_qpack(py::array_t<int8_t> src_w, py::array_t<float> src_scales, py::array_t<int8_t> src_zeros,
py::array_t<int32_t> g_idx, py::array_t<int8_t> dst, const std::string& weight_dtype,
const std::string& alg, int group_size, const std::string& scale_dtype,
@@ -688,5 +692,6 @@ PYBIND11_MODULE(qwen_cpp, m)
.def_static("np_jblas_quantize", &Model::np_jblas_quantize, "Quantize tensor to jblas format", py::arg("src_w"),
py::arg("dst"), py::arg("weight_dtype") = "int4", py::arg("alg") = "sym", py::arg("group_size") = 32,
py::arg("scale_dtype") = "fp32", py::arg("compute_dtype") = "int8", py::arg("threads") = 8)
.def("print_time", &Model::print_time)
.def("reinit", &Model::reinit);
}
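A quick way to exercise the new binding without the Python wrapper, assuming the extension builds as the `qwen_cpp` module named above and that `init_model`'s remaining options fall back to their pybind defaults (both assumptions; the method names are taken from this PR's diffs):

```python
import qwen_cpp

cpp_model = qwen_cpp.Model()
cpp_model.init_model("ne_qwen_q_int4.bin", threads=8)  # hypothetical quantized bin path
logits = cpp_model.evaluate([[1, 2, 3]])                # single forward pass over toy token ids
cpp_model.print_time()                                  # prints timings via model_print_timings(ctx)
cpp_model.reinit()                                      # reset state between runs
```
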
@@ -18,7 +18,7 @@
from transformers import AutoTokenizer, TextStreamer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig

model_name = "Intel/neural-chat-7b-v1-1" # or local path to model
model_name = "/mnt/disk1/data2/zhenweil/models/llama/Llama-2-7b-chat-hf" # or local path to model
# int4 weight_only quantization
woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
# fp4 weight_only quantization
@@ -37,9 +37,10 @@

# top_k_top_p sample or greedy_search
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=30)
model.print_time()
# beam search
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)
outputs = model.generate(inputs, num_beams=4, max_new_tokens=128, min_new_tokens=30, early_stopping=True)
ans = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(ans)
# model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)
# outputs = model.generate(inputs, num_beams=4, max_new_tokens=128, min_new_tokens=30, early_stopping=True)
# ans = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
# print(ans)