[Fix] Fix baichuan2-13 without rope. (#289)
marvin-Yu authored Apr 2, 2024
1 parent 2e9743f commit 79f34ec
Showing 6 changed files with 46 additions and 12 deletions.
33 changes: 23 additions & 10 deletions benchmark/benchmark.py
@@ -15,6 +15,10 @@
import os
from typing import Tuple, List

import sys

sys.stdout = open(sys.stdout.fileno(), mode="w", buffering=1)

# Ignore Tensor-RT warning from huggingface
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

@@ -147,8 +151,6 @@ def build_inputs_chatglm(tokenizer, query: List[str], padding, history: List[Tup

print("[INFO] xfastertransformer is installed, using pip installed package.")
except Exception as e:
import sys

sys.path.append("../src")
import xfastertransformer

@@ -237,15 +239,26 @@ def build_inputs_chatglm(tokenizer, query: List[str], padding, history: List[Tup
print("=" * 120, "\n" * 3)

if args.csv != "":
    from datetime import datetime

    arg_dict = dict(
        filter(
            lambda item: str(item[0]).find("path") == -1 and str(item[0]).find("csv") == -1, vars(args).items()
        )
    )

    rst = {
        "infer_avg_latency (ms)": np.mean(total_times),
        "1st_avg_latency (ms)": np.mean(first_token_times),
        "2nd_max_latency (ms)": np.max(next_token_times),
        "2nd_min_latency (ms)": np.min(next_token_times),
        "2nd_P90_latency (ms)": np.percentile(next_token_times, 90),
        "2nd_avg_latency (ms)": np.mean(next_token_times),
        "throughput_wo_1st (tokens/s)": 1000 / np.percentile(next_token_times, 90) * args.batch_size,
        **vars(args),
        "test_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "infer_avg(ms)": round(np.mean(total_times), 2),
        "1st_avg(ms)": round(np.mean(first_token_times), 2),
        "2nd_max(ms)": round(np.max(next_token_times), 2),
        "2nd_min(ms)": round(np.min(next_token_times), 2),
        "2nd_P90(ms)": round(np.percentile(next_token_times, 90), 2),
        "2nd_avg(ms)": round(np.mean(next_token_times), 2),
        "throughput_wo_1st (tokens/s)": round(1000 / np.percentile(next_token_times, 90) * args.batch_size, 2),
        **arg_dict,
        "Fake_model": True if os.environ.get("XFT_FAKE_MODEL", "-1") == "1" else False,
        "Response": response,
    }
    # print(rst)
    check_and_update_csv(args.csv, rst)
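Note on the derived metric in the new CSV row: throughput_wo_1st excludes the first token and is computed from the P90 of the per-token decode latencies, i.e. throughput_wo_1st (tokens/s) = args.batch_size * 1000 / P90(next_token_times), with latencies in milliseconds. As an illustration with made-up numbers (not from this commit): a batch size of 4 and a P90 next-token latency of 25 ms give 1000 / 25 * 4 = 160 tokens/s.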
14 changes: 14 additions & 0 deletions ci/test_case
@@ -32,6 +32,7 @@ _test_case=$(
# | llama-2-7b | √ | √ | √ | √ | √ | √ | 32 | 32 |
# | llama-2-7b | √ | √ | × | × | × | × | 2016 | 32 |
# | chatglm3-6b | √ | √ | × | × | × | × | 32 | 32 |
# | baichuan2-7b | √ | √ | × | × | × | × | 32 | 32 |
# | baichuan2-13b | √ | √ | × | × | × | × | 32 | 32 |
# | qwen-7b | √ | √ | × | × | × | × | 32 | 32 |
@@ -51,6 +52,10 @@ bash run_benchmark.sh -m llama-2-7b -d bf16 -i 1 -w 0 -in 2016 -out 32 -s 1
bash run_benchmark.sh -m chatglm3-6b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m chatglm3-6b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
# baichuan2-7b with short prompt & full data type:
bash run_benchmark.sh -m baichuan2-7b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-7b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
# baichuan2-13b with short prompt & full data type:
bash run_benchmark.sh -m baichuan2-13b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-13b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
@@ -75,6 +80,7 @@ rls_test_case=$(
# | chatglm3-6b | √ | √ | √ | √ | √ | √ | 32 | 32 |
# | baichuan2-7b | √ | √ | √ | √ | √ | √ | 32 | 32 |
# | baichuan2-13b | √ | √ | √ | √ | √ | √ | 32 | 32 |
# | qwen-1_8b | √ | √ | √ | √ | √ | √ | 32 | 32 |
# | qwen-7b | √ | √ | √ | √ | √ | √ | 32 | 32 |
# | qwen-14b | √ | √ | √ | √ | √ | √ | 32 | 32 |
@@ -126,6 +132,14 @@ bash run_benchmark.sh -m baichuan2-13b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-13b -d int4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-13b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 1
# qwen-1_8b with short prompt & full data type:
bash run_benchmark.sh -m qwen-1_8b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-1_8b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-1_8b -d int8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-1_8b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-1_8b -d int4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-1_8b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 1
# qwen-7b with short prompt & full data type:
bash run_benchmark.sh -m qwen-7b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-7b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
3 changes: 3 additions & 0 deletions ci_build
@@ -40,6 +40,8 @@ if [ ! -d $svr_log_dir ]; then
    mkdir -p $svr_log_dir
fi

git log --pretty=full $commit_id -n 3 &> $commit_dir/commit_info.txt

interrupt_handler() {
    exit 1
}
@@ -70,6 +72,7 @@ svr_info() {

# Define functions for build, UT, and model
build() {
    pip install -r requirements.txt
    Info "Running build function with arguments: $@"
    rm -rf build && mkdir build && cd build && cmake -DXFT_BUILD_TESTS=1 -DPython_EXECUTABLE=$(which python) .. && make -j
}
2 changes: 1 addition & 1 deletion examples/model_config/baichuan2-13b/config.ini
@@ -3,7 +3,7 @@ model_name = /data/models/Baichuan2-13B-Chat/
head_num = 40
size_per_head = 128
inter_size = 13696
max_pos_seq_len = 4096
max_pos_seq_len = 0
model_max_length = 4096
num_layer = 40
rms_norm_eps = 1e-6
2 changes: 1 addition & 1 deletion src/common/transformer_ctx.h
@@ -246,7 +246,7 @@ struct DecoderContext {

    template <typename T>
    T *getBuffer(const std::string &name, size_t size, size_t alignment = 64) {
        return (T *)SimpleMemPool::instance().getBuffer(name, size * sizeof(T), alignment);
        return (T *)SimpleMemPool::instance().getBuffer(name, sizeof(T) * size, alignment);
    }

    void dump() {
4 changes: 4 additions & 0 deletions src/utils/simple_mem_pool.h
@@ -47,6 +47,10 @@ class SimpleMemPool {

    // Allocate or reallocate memory buffer based on name and size
    void *getBuffer(const std::string &name, size_t size, size_t alignment = 64) {
        if (size == 0) {
            // std::cout << "[Warning] Try to allocate 0 bytes for buffer:" << name << std::endl;
            return nullptr;
        }
        auto it = memoryMap.find(name);

        if (it != memoryMap.end()) {
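How the pieces of this fix tie together: Baichuan2-13B uses ALiBi rather than rotary position embeddings (the "without rope" in the commit title), so the updated config.ini sets max_pos_seq_len = 0 and, as I read the change, the rotary-table allocation then ends up asking the memory pool for zero bytes; the new guard above turns that request into a nullptr instead of a real allocation. The sketch below is a minimal, self-contained reconstruction of that interaction, not the actual xFasterTransformer code: DemoMemPool, its reuse policy, and the main() driver are illustrative assumptions.

#include <cstdlib>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>

// Simplified stand-in for SimpleMemPool: maps a buffer name to (pointer, byte size).
class DemoMemPool {
public:
    static DemoMemPool &instance() {
        static DemoMemPool pool;
        return pool;
    }

    // Allocate or reuse a named buffer; mirrors the new zero-size guard.
    void *getBuffer(const std::string &name, size_t size, size_t alignment = 64) {
        if (size == 0) {
            // A model without RoPE may legitimately ask for 0 bytes: skip the allocator entirely.
            return nullptr;
        }
        auto it = memoryMap.find(name);
        if (it != memoryMap.end() && it->second.second >= size) {
            return it->second.first;  // existing buffer is large enough, reuse it
        }
        if (it != memoryMap.end()) { std::free(it->second.first); }
        size_t padded = ((size + alignment - 1) / alignment) * alignment;  // aligned_alloc needs a multiple of alignment
        void *ptr = std::aligned_alloc(alignment, padded);
        memoryMap[name] = {ptr, size};
        return ptr;
    }

private:
    std::unordered_map<std::string, std::pair<void *, size_t>> memoryMap;
};

// Typed wrapper, analogous to DecoderContext::getBuffer<T> in transformer_ctx.h.
template <typename T>
T *getBuffer(const std::string &name, size_t size) {
    return (T *)DemoMemPool::instance().getBuffer(name, sizeof(T) * size);
}

int main() {
    size_t maxPosSeqLen = 0;  // as in the updated baichuan2-13b config.ini
    // Zero elements -> zero bytes -> nullptr, instead of a bogus allocation.
    float *rotEmb = getBuffer<float>("rot_emb", maxPosSeqLen * 128);
    std::cout << (rotEmb == nullptr ? "no rotary buffer allocated" : "allocated") << std::endl;
    return 0;
}

Built with a C++17 compiler (for std::aligned_alloc), this prints "no rotary buffer allocated", which is the behavior the guard gives the zero-length position table.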
