[Version] v1.5.0. (#311)
Duyi-Wang authored Apr 12, 2024
1 parent 136bfd7 commit e91e75d
Showing 3 changed files with 114 additions and 12 deletions.
25 changes: 25 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,30 @@
# CHANGELOG

# [Version v1.5.0](https://github.com/intel/xFasterTransformer/releases/tag/v1.5.0)
v1.5.0 - Gemma series models supported.

## Functionality
- Support Gemma series models, including Gemma and CodeGemma, as well as the DeepSeek model.
- The Llama converter supports converting quantized Hugging Face models into xFT-format INT8/INT4 model files via the parameter `from_quantized_model='gptq'` (see the sketch after this list).
- Support loading INT4 weights directly from local files.
- Optimize memory usage during QWen model conversion, particularly for QWen 72B.
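
A minimal sketch (not part of this diff) of the converter usage described above. The `LlamaConvert` class comes from the `xfastertransformer` Python package; the paths are placeholders and the exact handling of `from_quantized_model` should be verified against the current tools documentation:

```bash
# Hedged example: paths are placeholders and the keyword follows the
# changelog wording above; check the converter signature before relying on it.
python -c 'import xfastertransformer as xft; xft.LlamaConvert().convert("/path/to/Llama-2-7B-GPTQ", "/path/to/xft-llama-2-7b", from_quantized_model="gptq")'
```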

## Dependency
- Bump `transformers` to `4.38.1` to support Gemma models.
- Add `protobuf` to support new behavior in the `tokenizer` (see the install sketch below).
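
A minimal sketch of installing the dependencies named above; the `transformers` pin matches this changelog, while `protobuf` is left unpinned because no version is stated here:

```bash
# Install the bumped transformers release and protobuf for the new
# tokenizer behavior described in this changelog.
pip install "transformers==4.38.1" protobuf
```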

## Performance
- Update xDNN to release `v1.4.5`.
- Add GPU kernel library gpuDNN v0.1 to support the Intel Arc GPU series.
- Optimize RoPE performance by reducing repeated sin and cos embedding table data.
- Accelerate KVCache copy by increasing parallelism in self attention.
- Accelerate the addreduce operation for long sequences by transposing the KVCache and tuning communication.

## Bug fixes
- Fix an incorrect computation that should have been performed in floating point but was done in integer arithmetic.
- Fix disordered timeline output.
- Fix a runtime issue in Qwen when seq_length is greater than 32768.

# [Version v1.4.0](https://github.com/intel/xFasterTransformer/releases/tag/v1.4.0)
v1.4.0 - Full BF16 support in Llama for better performance and serving framework support.

2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
1.4.0
1.5.0
99 changes: 88 additions & 11 deletions ci/test_case
@@ -100,87 +100,164 @@ bash run_benchmark.sh -m llama-2-7b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m llama-2-7b -d int8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m llama-2-7b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m llama-2-7b -d int4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m llama-2-7b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m llama-2-7b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m llama-2-7b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m llama-2-7b -d int8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m llama-2-7b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m llama-2-7b -d int4 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m llama-2-7b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 2
# llama-2-7b with long prompt:
bash run_benchmark.sh -m llama-2-7b -d fp16 -i 1 -w 0 -in 2016 -out 32 -s 1
bash run_benchmark.sh -m llama-2-7b -d bf16 -i 1 -w 0 -in 2016 -out 32 -s 1
bash run_benchmark.sh -m llama-2-7b -d int8 -i 1 -w 0 -in 2016 -out 32 -s 1
bash run_benchmark.sh -m llama-2-7b -d w8a8 -i 1 -w 0 -in 2016 -out 32 -s 1
bash run_benchmark.sh -m llama-2-7b -d int4 -i 1 -w 0 -in 2016 -out 32 -s 1
bash run_benchmark.sh -m llama-2-7b -d nf4 -i 1 -w 0 -in 2016 -out 32 -s 1
bash run_benchmark.sh -m llama-2-7b -d fp16 -i 1 -w 0 -in 2016 -out 32 -s 2
bash run_benchmark.sh -m llama-2-7b -d bf16 -i 1 -w 0 -in 2016 -out 32 -s 2
bash run_benchmark.sh -m llama-2-7b -d int8 -i 1 -w 0 -in 2016 -out 32 -s 2
bash run_benchmark.sh -m llama-2-7b -d w8a8 -i 1 -w 0 -in 2016 -out 32 -s 2
bash run_benchmark.sh -m llama-2-7b -d int4 -i 1 -w 0 -in 2016 -out 32 -s 2
bash run_benchmark.sh -m llama-2-7b -d nf4 -i 1 -w 0 -in 2016 -out 32 -s 2
# llama-2-13b with short prompt & full data type:
bash run_benchmark.sh -m llama-2-13b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m llama-2-13b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m llama-2-13b -d int8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m llama-2-13b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m llama-2-13b -d int4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m llama-2-13b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m llama-2-13b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m llama-2-13b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m llama-2-13b -d int8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m llama-2-13b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m llama-2-13b -d int4 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m llama-2-13b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 2
# chatglm3-6b with short prompt & full data type:
bash run_benchmark.sh -m chatglm3-6b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m chatglm3-6b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m chatglm3-6b -d int8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m chatglm3-6b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m chatglm3-6b -d int4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m chatglm3-6b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m chatglm3-6b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m chatglm3-6b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m chatglm3-6b -d int8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m chatglm3-6b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m chatglm3-6b -d int4 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m chatglm3-6b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 2
# baichuan2-7b with short prompt & full data type:
bash run_benchmark.sh -m baichuan2-7b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-7b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-7b -d int8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-7b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-7b -d int4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-7b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-7b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m baichuan2-7b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m baichuan2-7b -d int8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m baichuan2-7b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m baichuan2-7b -d int4 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m baichuan2-7b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 2
# baichuan2-13b with short prompt & full data type:
bash run_benchmark.sh -m baichuan2-13b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-13b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-13b -d int8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-13b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-13b -d int4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-13b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-13b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m baichuan2-13b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m baichuan2-13b -d int8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m baichuan2-13b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m baichuan2-13b -d int4 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m baichuan2-13b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 2
# qwen-1_8b with short prompt & full data type:
bash run_benchmark.sh -m qwen-1_8b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-1_8b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-1_8b -d int8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-1_8b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-1_8b -d int4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-1_8b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-1_8b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m qwen-1_8b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m qwen-1_8b -d int8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m qwen-1_8b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m qwen-1_8b -d int4 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m qwen-1_8b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 2
# qwen-7b with short prompt & full data type:
bash run_benchmark.sh -m qwen-7b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-7b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-7b -d int8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-7b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-7b -d int4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-7b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-7b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m qwen-7b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m qwen-7b -d int8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m qwen-7b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m qwen-7b -d int4 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m qwen-7b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 2
# qwen-14b with short prompt & full data type:
bash run_benchmark.sh -m qwen-14b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-14b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-14b -d int8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-14b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-14b -d int4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-14b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-14b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m qwen-14b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m qwen-14b -d int8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m qwen-14b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m qwen-14b -d int4 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m qwen-14b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 2
# gemma-2b with short prompt & full data type:
bash run_benchmark.sh -m gemma-2b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m gemma-2b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m gemma-2b -d int8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m gemma-2b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m gemma-2b -d int4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m gemma-2b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m gemma-2b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m gemma-2b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m gemma-2b -d int8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m gemma-2b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m gemma-2b -d int4 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m gemma-2b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 2
# gemma-7b with short prompt & full data type:
bash run_benchmark.sh -m gemma-7b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m gemma-7b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m gemma-7b -d int8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m gemma-7b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m gemma-7b -d int4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m gemma-7b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m gemma-7b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m gemma-7b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m gemma-7b -d int8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m gemma-7b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m gemma-7b -d int4 -i 1 -w 0 -in 32 -out 32 -s 2
bash run_benchmark.sh -m gemma-7b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 2
# Add new test case here:
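# A hypothetical sketch (not part of this commit): the cases above repeat the
# same invocation across data types and sockets, so a new model's full sweep
# could be expressed as a loop, assuming run_benchmark.sh keeps these flags.
MODEL=gemma-7b   # placeholder; substitute the new model name
for DTYPE in fp16 bf16 int8 w8a8 int4 nf4; do
  for SOCKETS in 1 2; do
    bash run_benchmark.sh -m "$MODEL" -d "$DTYPE" -i 1 -w 0 -in 32 -out 32 -s "$SOCKETS"
  done
done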
