From 79c3537b163d12daf296940f413fc5b3694a0828 Mon Sep 17 00:00:00 2001
From: intellinjun <105184542+intellinjun@users.noreply.github.com>
Date: Fri, 29 Mar 2024 15:31:41 +0800
Subject: [PATCH] update README (#165)
---
README.md | 13 +++++--------
docs/advanced_usage.md | 4 ++--
docs/fused_attention.md | 4 ++--
docs/gguf.md | 2 +-
4 files changed, 10 insertions(+), 13 deletions(-)
diff --git a/README.md b/README.md
index 5671943ee..34b6580bd 100644
--- a/README.md
+++ b/README.md
@@ -79,21 +79,18 @@ streamer = TextStreamer(tokenizer)
model = AutoModelForCausalLM.from_pretrained(model_name, model_file = model_file)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
-
### PyTorch Model from Modelscope
```python
-import sys
-from modelscope import AutoTokenizer
from transformers import TextStreamer
-from neural_speed import Model
-
-model_name = "qwen/Qwen1.5-7B-Chat" # modelscope model_id or local model
+from modelscope import AutoTokenizer
+from intel_extension_for_transformers.transformers import AutoModelForCausalLM
+model_name = "qwen/Qwen-7B" # Modelscope model_id or local model
prompt = "Once upon a time, there existed a little girl,"
+
+model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True, model_hub="modelscope")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)
-model = Model()
-model.init(model_name, weight_dtype="int4", compute_dtype="int8", model_hub="modelscope")
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
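As a hedged follow-up to the snippet above: decoding the generated ids back into plain text. This assumes `outputs[0]` is a sequence of token ids that `tokenizer.decode()` accepts, which is not guaranteed for every backend.
```python
# Hedged sketch: turn the ids returned by model.generate() above into text.
# Assumes outputs[0] is a list/tensor of token ids accepted by tokenizer.decode().
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(text)
```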
diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md
index c82c6b194..278901b42 100644
--- a/docs/advanced_usage.md
+++ b/docs/advanced_usage.md
@@ -22,7 +22,7 @@ Argument description of run.py ([supported MatMul combinations](#supported-matri
| --repeat_penalty | Penalize repeat sequence of tokens: Float (default: 1.1, 1.0 = disabled) |
| --color | Colorise output to distinguish prompt and user input from generations |
| --keep | Number of tokens to keep from the initial prompt: Int (default: 0, -1 = all) |
-| --shift-roped-k | Use [ring-buffer](./docs/infinite_inference.md#shift-rope-k-and-ring-buffer) and thus do not re-computing after reaching ctx_size (default: False) |
+| --shift-roped-k | Use [ring-buffer](./infinite_inference.md#shift-rope-k-and-ring-buffer) and thus avoid re-computation after reaching ctx_size (default: False) |
| --token | Access token ID for models that require it (e.g: LLaMa2, etc..) |
@@ -119,6 +119,6 @@ Argument description of inference.py:
| --repeat_penalty | Penalize repeat sequence of tokens: Float (default: 1.1, 1.0 = disabled) |
| --color | Colorise output to distinguish prompt and user input from generations |
| --keep | Number of tokens to keep from the initial prompt: Int (default: 0, -1 = all) |
-| --shift-roped-k | Use [ring-buffer](./docs/infinite_inference.md#shift-rope-k-and-ring-buffer) and thus do not re-computing after reaching ctx_size (default: False) |
+| --shift-roped-k | Use [ring-buffer](./infinite_inference.md#shift-rope-k-and-ring-buffer) and thus avoid re-computation after reaching ctx_size (default: False) |
| --glm_tokenizer | The path of the chatglm tokenizer: String (default: THUDM/chatglm-6b) |
| --memory-f32 <br> --memory-f16 <br> --memory-auto | Data type of KV memory (default: auto);<br>if set to auto, the runtime tries the bestla flash-attn managed format (currently requires GCC11+ & AMX) and falls back to fp16 if that fails |
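For illustration, a hedged sketch of driving run.py with a few of the flags documented above. Only flag names that appear in the table are used; the script path, the model path, and its positional placement are assumptions, not taken from this doc.
```python
# Hedged sketch: invoke run.py with flags from the table above.
# "scripts/run.py", the model path, and its positional placement are assumptions.
import subprocess

subprocess.run(
    [
        "python", "scripts/run.py", "/path/to/model",  # placeholder model path
        "--keep", "4",        # keep the first 4 prompt tokens (Int, -1 = all)
        "--shift-roped-k",    # ring-buffer KV so nothing is recomputed after ctx_size
        "--color",            # colorize prompt vs. generated text
    ],
    check=True,
)
```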
diff --git a/docs/fused_attention.md b/docs/fused_attention.md
index 353d05a45..e836b3318 100644
--- a/docs/fused_attention.md
+++ b/docs/fused_attention.md
@@ -3,10 +3,10 @@ Fused Attention
Attention (including MHA, GQA, MQA) is one of the key parts of transformers and is also performance critical in many scenarios. To implement various optimizations, a fused attention layer and corresponding utilities for the customized KV-cache it uses are introduced. As an example, fused attention can reduce the cost of MHA from 8521.276 ms (17.044 ms) to 248.586 ms (7.944 ms) for a 1975-token llama-7b first-token inference[[1]](#1).
-Note that this doc assumes you have the basic knowledge of this cpp graph implementation including `ne_tensor::ne`, `ne_tensor::nb` etc. Model builder can enable fused attention with operators `ne_flash_attn*` defined in [`core/ne_layers.h`](core/ne_layers.h) while the implementation of fused attentions is mostly located in [`core/layers/mha_dense.cpp`](core/layers/mha_dense.cpp).
+Note that this doc assumes basic knowledge of this C++ graph implementation, including `ne_tensor::ne`, `ne_tensor::nb`, etc. Model builders can enable fused attention with the `ne_flash_attn*` operators defined in [`core/ne_layers.h`](../neural_speed/core/ne_layers.h), while the implementation of fused attention is mostly located in [`core/layers/mha_dense.cpp`](../neural_speed/core/layers/mha_dense.cpp).
## KV-cache initialization
-The memory for kv-cache is allocated in `/models/model_utils/model_utils.cpp`. As the fused attention implementation requires certain instruction extensions and potentially some other limitations, fused attention is enabled only if `bestla_reordered_attn_fp32_support()` aggress, denoting with `memory_type = NE_TYPE_BTLA`. Next, `get_batch_kv_elements_from_gpt_params()` will give the sizes (in terms of bytes if fused attention enabled, or in terms elements if fused attention disabled) of k-cache and v-cache respectively for each batch and each layer. The KV-cache is finally prepared with these 2 sizes by creating `ne_new_tensor` inside `model.kv_self.k/.v` or `model.layer[il].k_cache/.v_cache`.
+The memory for the KV-cache is allocated in `neural_speed/models/model_utils/model_utils.cpp`. As the fused attention implementation requires certain instruction extensions and potentially has some other limitations, fused attention is enabled only if `bestla_reordered_attn_fp32_support()` agrees, denoted by `memory_type = NE_TYPE_BTLA`. Next, `get_batch_kv_elements_from_gpt_params()` gives the sizes (in bytes if fused attention is enabled, or in elements if it is disabled) of the k-cache and v-cache respectively for each batch and each layer. The KV-cache is finally prepared with these two sizes by creating `ne_new_tensor` inside `model.kv_self.k/.v` or `model.layer[il].k_cache/.v_cache`.
## KV-cache Append
KV-cache is appended every time a new pair of K and V is generated by evaluating the inner product for QKV (ne_mul_qkv). This operation appends an additional K/V-tensor along the sequence-length dimension (i.e. resulting in `n_past = n_past + N`, where `n_past` is the number of previously cached tokens and `N` is the length of the current tokens).
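As a mental model of the append step described above, a toy Python sketch (an illustration only, not the C++ implementation in `mha_dense.cpp`) of growing the cache along the sequence dimension and updating `n_past`:
```python
# Toy sketch of the KV-cache append bookkeeping described above; shapes and the
# single-head layout are simplifications, not the real cache format.
import numpy as np

def append_kv(k_cache, v_cache, k_new, v_new, n_past):
    """k_cache/v_cache: [n_ctx, head_dim]; k_new/v_new: [N, head_dim]."""
    n = k_new.shape[0]                    # N: length of the current tokens
    k_cache[n_past:n_past + n] = k_new    # append along the sequence dimension
    v_cache[n_past:n_past + n] = v_new
    return n_past + n                     # n_past = n_past + N

k_cache = np.zeros((32, 128), dtype=np.float16)   # n_ctx = 32, head_dim = 128
v_cache = np.zeros_like(k_cache)
n_past = append_kv(k_cache, v_cache,
                   np.ones((4, 128), np.float16), np.ones((4, 128), np.float16),
                   n_past=0)                      # n_past is now 4
```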
diff --git a/docs/gguf.md b/docs/gguf.md
index 67bb8bdc1..068bbf0d1 100644
--- a/docs/gguf.md
+++ b/docs/gguf.md
@@ -5,7 +5,7 @@ Neural Speed also supports GGUF models generated by [llama.cpp](https://github.c
Validated models: [llama2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), [falcon-7b](https://huggingface.co/tiiuae/falcon-7b), [falcon-40b](https://huggingface.co/tiiuae/falcon-40b), [mpt-7b](https://huggingface.co/mosaicml/mpt-7b), [mpt-40b](https://huggingface.co/mosaicml/mpt-40b) and [bloom-7b1](https://huggingface.co/bigscience/bloomz-7b1).
-Please check more validated GGUF models from HuggingFace in [list](./docs/supported_models.md).
+For more validated GGUF models from Hugging Face, see this [list](./supported_models.md).
## Examples
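A minimal, hedged sketch of running a GGUF file with the same Transformers-like API shown in the README context above; the repo id, GGUF file name, and tokenizer id are placeholders rather than verified artifacts.
```python
# Hedged sketch mirroring the README's GGUF usage; repo id, file name, and
# tokenizer id are placeholders, not verified artifacts.
from transformers import AutoTokenizer, TextStreamer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM

model_name = "TheBloke/Llama-2-7B-Chat-GGUF"   # placeholder GGUF repo id
model_file = "llama-2-7b-chat.Q4_0.gguf"       # placeholder GGUF file name
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
inputs = tokenizer("Once upon a time, there existed a little girl,", return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)
model = AutoModelForCausalLM.from_pretrained(model_name, model_file=model_file)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```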