This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

[LLM Runtime] Control printing information using NEURAL_SPEED_VERBOSE #1054

Closed · wants to merge 11 commits
Changes from 3 commits
10 changes: 10 additions & 0 deletions intel_extension_for_transformers/llm/runtime/graph/README.md
@@ -277,6 +277,16 @@ outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, ctx_size

https://github.com/intel/intel-extension-for-transformers/assets/109187816/1698dcda-c9ec-4f44-b159-f4e9d67ab15b

Method descriptions of the model (a usage sketch follows the table):
| Method | Description |
| -------------- | ----------------------------------------------------------------------- |
| `init` | Initialize the C++ model from a model name or local path |
| `init_from_bin` | Initialize the C++ model from a quantized bin file |
| `generate` | Transformers-like generate function; for arguments, refer to the `argument description of generate function` |
| `__call__` | Forward function |
| `quant_model` | Quantize a model from an fp32 bin file; for arguments, refer to the following `WeightOnlyQuantConfig` |
| `print_time` | Print the time of each evaluation |

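A minimal usage sketch of these methods (illustrative only: the model name, prompt, and quantization keyword arguments are placeholders, and the `Model` wrapper is assumed to be importable from this runtime package as shown):

```python
from transformers import AutoTokenizer
from intel_extension_for_transformers.llm.runtime.graph import Model

model_name = "Intel/neural-chat-7b-v1-1"  # placeholder: any supported model name or local path
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer("Once upon a time, there existed a little girl,", return_tensors="pt").input_ids

model = Model()
model.init(model_name, use_quant=True, weight_dtype="int4", compute_dtype="int8")  # convert + quantize
outputs = model.generate(inputs, max_new_tokens=30)  # transformers-like generation
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
model.print_time()  # print the time of each evaluation
```
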
Argument description of WeightOnlyQuantConfig ([supported MatMul combinations](#supported-matrix-multiplication-data-types-combinations)):
| Argument | Type | Description |
| -------------- | ---------- | ----------------------------------------------------------------------- |
17 changes: 13 additions & 4 deletions intel_extension_for_transformers/llm/runtime/graph/__init__.py
@@ -76,6 +76,7 @@ def get_model_type(model_config):
return model_type

def init(self, model_name, use_quant=True, use_gptq=False, **quant_kwargs):
"""initialize cpp model using model name"""
self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
self.model_type = Model.get_model_type(self.config)
@@ -127,6 +128,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, **quant_kwargs):
os.remove(fp32_bin)

def init_from_bin(self, model_type, model_path, **generate_kwargs):
"""initialize cpp model from bin file"""
self.__import_package(model_type)
self.model = self.module.Model()
if "threads" not in generate_kwargs:
@@ -138,11 +140,13 @@ def init_from_bin(self, model_type, model_path, **generate_kwargs):
self.model.init_model(model_path, **generate_kwargs)

def quant_model(self, model_type, model_path, out_path, **quant_kwargs):
"""quantize model from fp32 bin"""
self.__import_package(model_type)
self.module.Model.quant_model(model_path=model_path, out_path=out_path, **quant_kwargs)

def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=False, stopping_criteria=None,
**generate_kwargs):
"""transformer-like generate"""
max_new_tokens = generate_kwargs.get("max_new_tokens", -1)
self.batch_size = input_ids.shape[0]
if self.model is None:
@@ -190,7 +194,7 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=False,
elif (max_new_tokens != -1 and out_count >= max_new_tokens):
break
else:
all_done = [(r[-1] in [self.eos_token_id(), self.pad_token_id()]) for r in ret]
all_done = [(r[-1] in [self.__eos_token_id(), self.__pad_token_id()]) for r in ret]
if False not in all_done:
break
if streamer:
@@ -199,15 +203,15 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=False,
self.generate_round += 1
return ret

def is_token_end(self):
def __is_token_end(self):
return self.model.is_token_end()

def eos_token_id(self):
def __eos_token_id(self):
if self.model_type == 'qwen':
return self.tokenizer.special_tokens['<|endoftext|>']
return self.tokenizer.eos_token_id

def pad_token_id(self):
def __pad_token_id(self):
if self.tokenizer.pad_token_id == None:
if self.batch_size == 1:
return None
@@ -217,10 +221,15 @@ def pad_token_id(self):
return self.tokenizer.pad_token_id

def __call__(self, input_ids, reinit=False, **kwargs):
"""forward function"""
if self.model is None:
self.init_from_bin(self.model_type, self.bin_file, **kwargs)
self.generate_round = 0
elif reinit:
self.model.reinit()
self.generate_round = 0
return self.model.evaluate(input_ids.tolist())

def print_time(self):
"""print time of each evaluation"""
self.model.print_time()
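For reference while reviewing, a rough sketch of the lower-level path these methods expose: quantize an existing fp32 bin with `quant_model`, load it with `init_from_bin`, run forward passes through `__call__`, then dump timings with the new `print_time`. The bin file names and quantization keyword arguments below are illustrative, not taken from this PR.

```python
import torch
from intel_extension_for_transformers.llm.runtime.graph import Model

runtime = Model()

# Quantize an existing fp32 bin (file names and options are placeholders).
runtime.quant_model("llama", model_path="ne_llama_f32.bin", out_path="ne_llama_q_int4.bin",
                    weight_dtype="int4", compute_dtype="int8")

# Load the quantized bin, then run plain forward passes via __call__.
runtime.init_from_bin("llama", "ne_llama_q_int4.bin")
input_ids = torch.tensor([[1, 2, 3, 4]])  # toy token ids
logits = runtime(input_ids)               # forward pass
logits = runtime(input_ids, reinit=True)  # reset runtime state before another pass
runtime.print_time()                      # per-evaluation timing added in this PR
```
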
@@ -91,6 +91,10 @@ class Model {
generate_count = 0;
}

void print_time() {
model_print_timings(ctx);
}

static size_t np_jblas_qpack(py::array_t<int8_t> src_w, py::array_t<float> src_scales, py::array_t<int8_t> src_zeros,
py::array_t<int32_t> g_idx, py::array_t<int8_t> dst, const std::string& weight_dtype,
const std::string& alg, int group_size, const std::string& scale_dtype,
@@ -688,5 +692,6 @@ PYBIND11_MODULE(qwen_cpp, m)
.def_static("np_jblas_quantize", &Model::np_jblas_quantize, "Quantize tensor to jblas format", py::arg("src_w"),
py::arg("dst"), py::arg("weight_dtype") = "int4", py::arg("alg") = "sym", py::arg("group_size") = 32,
py::arg("scale_dtype") = "fp32", py::arg("compute_dtype") = "int8", py::arg("threads") = 8)
.def("print_time", &Model::print_time)
.def("reinit", &Model::reinit);
}
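A quick way to exercise the new binding without the Python wrapper, assuming the extension builds as the `qwen_cpp` module named above and that `init_model`'s remaining options fall back to their pybind defaults (both assumptions; the method names are taken from this PR's diffs):

```python
import qwen_cpp

cpp_model = qwen_cpp.Model()
cpp_model.init_model("ne_qwen_q_int4.bin", threads=8)  # hypothetical quantized bin path
logits = cpp_model.evaluate([[1, 2, 3]])                # single forward pass over toy token ids
cpp_model.print_time()                                  # prints timings via model_print_timings(ctx)
cpp_model.reinit()                                      # reset state between runs
```
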
@@ -18,7 +18,7 @@
from transformers import AutoTokenizer, TextStreamer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig

model_name = "Intel/neural-chat-7b-v1-1" # or local path to model
model_name = "/mnt/disk1/data2/zhenweil/models/llama/Llama-2-7b-chat-hf" # or local path to model
# int4 weight_only quantization
woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
# fp4 weight_only quantization
@@ -37,9 +37,10 @@

# top_k_top_p sample or greedy_search
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=30)
model.print_time()
# beam search
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)
outputs = model.generate(inputs, num_beams=4, max_new_tokens=128, min_new_tokens=30, early_stopping=True)
ans = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(ans)
# model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)
# outputs = model.generate(inputs, num_beams=4, max_new_tokens=128, min_new_tokens=30, early_stopping=True)
# ans = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
# print(ans)