[Fix] Fix baichuan2-13 without rope. (#289)
marvin-Yu authored Apr 2, 2024
1 parent 2e9743f commit 79f34ec
Showing 6 changed files with 46 additions and 12 deletions.
33 changes: 23 additions & 10 deletions benchmark/benchmark.py
@@ -15,6 +15,10 @@
import os
from typing import Tuple, List

import sys

sys.stdout = open(sys.stdout.fileno(), mode="w", buffering=1)

# Ignore Tensor-RT warning from huggingface
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

@@ -147,8 +151,6 @@ def build_inputs_chatglm(tokenizer, query: List[str], padding, history: List[Tup

print("[INFO] xfastertransformer is installed, using pip installed package.")
except Exception as e:
import sys

sys.path.append("../src")
import xfastertransformer

@@ -237,15 +239,26 @@ def build_inputs_chatglm(tokenizer, query: List[str], padding, history: List[Tup
print("=" * 120, "\n" * 3)

if args.csv != "":
    from datetime import datetime

    arg_dict = dict(
        filter(
            lambda item: str(item[0]).find("path") == -1 and str(item[0]).find("csv") == -1, vars(args).items()
        )
    )

    rst = {
        "infer_avg_latency (ms)": np.mean(total_times),
        "1st_avg_latency (ms)": np.mean(first_token_times),
        "2nd_max_latency (ms)": np.max(next_token_times),
        "2nd_min_latency (ms)": np.min(next_token_times),
        "2nd_P90_latency (ms)": np.percentile(next_token_times, 90),
        "2nd_avg_latency (ms)": np.mean(next_token_times),
        "throughput_wo_1st (tokens/s)": 1000 / np.percentile(next_token_times, 90) * args.batch_size,
        **vars(args),
        "test_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "infer_avg(ms)": round(np.mean(total_times), 2),
        "1st_avg(ms)": round(np.mean(first_token_times), 2),
        "2nd_max(ms)": round(np.max(next_token_times), 2),
        "2nd_min(ms)": round(np.min(next_token_times), 2),
        "2nd_P90(ms)": round(np.percentile(next_token_times, 90), 2),
        "2nd_avg(ms)": round(np.mean(next_token_times), 2),
        "throughput_wo_1st (tokens/s)": round(1000 / np.percentile(next_token_times, 90) * args.batch_size, 2),
        **arg_dict,
        "Fake_model": True if os.environ.get("XFT_FAKE_MODEL", "-1") == "1" else False,
        "Response": response,
    }
    # print(rst)
    check_and_update_csv(args.csv, rst)
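Note on the derived metric in the new CSV row: throughput_wo_1st excludes the first token and is computed from the P90 of the per-token decode latencies, i.e. throughput_wo_1st (tokens/s) = args.batch_size * 1000 / P90(next_token_times), with latencies in milliseconds. As an illustration with made-up numbers (not from this commit): a batch size of 4 and a P90 next-token latency of 25 ms give 1000 / 25 * 4 = 160 tokens/s.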
14 changes: 14 additions & 0 deletions ci/test_case
@@ -32,6 +32,7 @@ _test_case=$(
# | llama-2-7b | √ | √ | √ | √ | √ | √ | 32 | 32 |
# | llama-2-7b | √ | √ | × | × | × | × | 2016 | 32 |
# | chatglm3-6b | √ | √ | × | × | × | × | 32 | 32 |
# | baichuan2-7b | √ | √ | × | × | × | × | 32 | 32 |
# | baichuan2-13b | √ | √ | × | × | × | × | 32 | 32 |
# | qwen-7b | √ | √ | × | × | × | × | 32 | 32 |
@@ -51,6 +52,10 @@ bash run_benchmark.sh -m llama-2-7b -d bf16 -i 1 -w 0 -in 2016 -out 32 -s 1
bash run_benchmark.sh -m chatglm3-6b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m chatglm3-6b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
# baichuan2-7b with short prompt & full data type:
bash run_benchmark.sh -m baichuan2-7b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-7b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
# baichuan2-13b with short prompt & full data type:
bash run_benchmark.sh -m baichuan2-13b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-13b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
@@ -75,6 +80,7 @@ rls_test_case=$(
# | chatglm3-6b | √ | √ | √ | √ | √ | √ | 32 | 32 |
# | baichuan2-7b | √ | √ | √ | √ | √ | √ | 32 | 32 |
# | baichuan2-13b | √ | √ | √ | √ | √ | √ | 32 | 32 |
# | qwen-1_8b | √ | √ | √ | √ | √ | √ | 32 | 32 |
# | qwen-7b | √ | √ | √ | √ | √ | √ | 32 | 32 |
# | qwen-14b | √ | √ | √ | √ | √ | √ | 32 | 32 |
@@ -126,6 +132,14 @@ bash run_benchmark.sh -m baichuan2-13b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-13b -d int4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m baichuan2-13b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 1
# qwen-1_8b with short prompt & full data type:
bash run_benchmark.sh -m qwen-1_8b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-1_8b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-1_8b -d int8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-1_8b -d w8a8 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-1_8b -d int4 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-1_8b -d nf4 -i 1 -w 0 -in 32 -out 32 -s 1
# qwen-7b with short prompt & full data type:
bash run_benchmark.sh -m qwen-7b -d fp16 -i 1 -w 0 -in 32 -out 32 -s 1
bash run_benchmark.sh -m qwen-7b -d bf16 -i 1 -w 0 -in 32 -out 32 -s 1
3 changes: 3 additions & 0 deletions ci_build
@@ -40,6 +40,8 @@ if [ ! -d $svr_log_dir ]; then
    mkdir -p $svr_log_dir
fi

git log --pretty=full $commit_id -n 3 &> $commit_dir/commit_info.txt

interrupt_handler() {
    exit 1
}
@@ -70,6 +72,7 @@ svr_info() {

# Define functions for build, UT, and model
build() {
    pip install -r requirements.txt
    Info "Running build function with arguments: $@"
    rm -rf build && mkdir build && cd build && cmake -DXFT_BUILD_TESTS=1 -DPython_EXECUTABLE=$(which python) .. && make -j
}
2 changes: 1 addition & 1 deletion examples/model_config/baichuan2-13b/config.ini
@@ -3,7 +3,7 @@ model_name = /data/models/Baichuan2-13B-Chat/
head_num = 40
size_per_head = 128
inter_size = 13696
max_pos_seq_len = 4096
max_pos_seq_len = 0
model_max_length = 4096
num_layer = 40
rms_norm_eps = 1e-6
2 changes: 1 addition & 1 deletion src/common/transformer_ctx.h
@@ -246,7 +246,7 @@ struct DecoderContext {

    template <typename T>
    T *getBuffer(const std::string &name, size_t size, size_t alignment = 64) {
        return (T *)SimpleMemPool::instance().getBuffer(name, size * sizeof(T), alignment);
        return (T *)SimpleMemPool::instance().getBuffer(name, sizeof(T) * size, alignment);
    }

    void dump() {
4 changes: 4 additions & 0 deletions src/utils/simple_mem_pool.h
@@ -47,6 +47,10 @@ class SimpleMemPool {

    // Allocate or reallocate memory buffer based on name and size
    void *getBuffer(const std::string &name, size_t size, size_t alignment = 64) {
        if (size == 0) {
            // std::cout << "[Warning] Try to allocate 0 bytes for buffer:" << name << std::endl;
            return nullptr;
        }
        auto it = memoryMap.find(name);

        if (it != memoryMap.end()) {
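How the pieces of this fix tie together: Baichuan2-13B uses ALiBi rather than rotary position embeddings (the "without rope" in the commit title), so the updated config.ini sets max_pos_seq_len = 0 and, as I read the change, the rotary-table allocation then ends up asking the memory pool for zero bytes; the new guard above turns that request into a nullptr instead of a real allocation. The sketch below is a minimal, self-contained reconstruction of that interaction, not the actual xFasterTransformer code: DemoMemPool, its reuse policy, and the main() driver are illustrative assumptions.

#include <cstdlib>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>

// Simplified stand-in for SimpleMemPool: maps a buffer name to (pointer, byte size).
class DemoMemPool {
public:
    static DemoMemPool &instance() {
        static DemoMemPool pool;
        return pool;
    }

    // Allocate or reuse a named buffer; mirrors the new zero-size guard.
    void *getBuffer(const std::string &name, size_t size, size_t alignment = 64) {
        if (size == 0) {
            // A model without RoPE may legitimately ask for 0 bytes: skip the allocator entirely.
            return nullptr;
        }
        auto it = memoryMap.find(name);
        if (it != memoryMap.end() && it->second.second >= size) {
            return it->second.first;  // existing buffer is large enough, reuse it
        }
        if (it != memoryMap.end()) { std::free(it->second.first); }
        size_t padded = ((size + alignment - 1) / alignment) * alignment;  // aligned_alloc needs a multiple of alignment
        void *ptr = std::aligned_alloc(alignment, padded);
        memoryMap[name] = {ptr, size};
        return ptr;
    }

private:
    std::unordered_map<std::string, std::pair<void *, size_t>> memoryMap;
};

// Typed wrapper, analogous to DecoderContext::getBuffer<T> in transformer_ctx.h.
template <typename T>
T *getBuffer(const std::string &name, size_t size) {
    return (T *)DemoMemPool::instance().getBuffer(name, sizeof(T) * size);
}

int main() {
    size_t maxPosSeqLen = 0;  // as in the updated baichuan2-13b config.ini
    // Zero elements -> zero bytes -> nullptr, instead of a bogus allocation.
    float *rotEmb = getBuffer<float>("rot_emb", maxPosSeqLen * 128);
    std::cout << (rotEmb == nullptr ? "no rotary buffer allocated" : "allocated") << std::endl;
    return 0;
}

Built with a C++17 compiler (for std::aligned_alloc), this prints "no rotary buffer allocated", which is the behavior the guard gives the zero-length position table.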
