# benchmark_attention_impl.py (forked from vllm-project/vllm)
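"""Benchmark attention backends for an encode-only model.

For every (dtype, attention backend) combination, the script builds an
encode-only LLMEngine, feeds it a fixed set of prompts at several
max_num_seqs settings, and reports throughput, per-step delay, and step
count. Each configuration runs in its own subprocess.

Usage: python benchmark_attention_impl.py
"""
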
import os
import random
import time


def benchmark_vllm(args):
    random.seed(args.seed)
    os.environ["VLLM_ATTENTION_BACKEND"] = args.attention_impl
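
    # vllm is imported only after VLLM_ATTENTION_BACKEND has been set, so the
    # backend chosen for this run is already in place when the engine modules
    # load.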
    import gc

    import torch
    from vllm.wde.encode_only.arg_utils import (  # noqa: E501
        EncodeOnlyEngineArgs as EngineArgs)
    from vllm.wde.entrypoints.llm import LLMEngine

    prompt = "if" * args.input_len
    requests = [prompt for _ in range(args.num_prompts)]

    engine_args = EngineArgs(model=args.model,
                             tokenizer=args.tokenizer,
                             seed=args.seed,
                             trust_remote_code=args.trust_remote_code,
                             dtype=args.dtype,
                             max_model_len=args.max_model_len,
                             device=args.device,
                             max_num_seqs=32,
                             scheduling=args.scheduling)

    engine = LLMEngine.from_engine_args(engine_args)

    for batchsize in args.batchsize:
        engine.engine_config.scheduler_config.set_args(max_num_seqs=batchsize)
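
        # Enqueue every prompt up front, then step the engine until all
        # requests have finished, timing the whole pass.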
        start = time.perf_counter()
        for request_id, prompt in enumerate(requests):
            engine.add_request(str(request_id), prompt)

        n_step = 0
        while engine.has_unfinished_requests():
            engine.step()
            n_step += 1
        end = time.perf_counter()

        elapsed_time = end - start
        delay = elapsed_time / n_step

        print(f"Batchsize {batchsize}, Throughput: "
              f"{len(requests) / elapsed_time:.4f} requests/s, "
              f"Delay {delay * 1000:0.2f} ms, n_step {n_step}")
        engine.executor.shutdown_execute_loop()
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == '__main__':
    from easydict import EasyDict as edict

    args = edict()
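
    # Benchmark configuration. The script has no command-line parsing; edit
    # these values directly.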
    args.input_len = 256
    args.num_prompts = 10000

    args.model = 'BAAI/bge-m3'

    args.trust_remote_code = False
    args.tokenizer = args.model
    args.seed = 0
    args.max_model_len = None
    args.device = "cuda"
    args.batchsize = [1, 2, 4, 8, 16, 32, 64]
    args.scheduling = "double_buffer"

    from concurrent.futures import ProcessPoolExecutor
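
    # Run each configuration in a fresh single-worker subprocess so that the
    # VLLM_ATTENTION_BACKEND setting and any allocated GPU state from one run
    # do not carry over into the next.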
    def run_vllm(args):
        with ProcessPoolExecutor(1) as executor:
            f = executor.submit(benchmark_vllm, args)
            f.result()
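
    # Attention backends to sweep for each dtype. FLASH_ATTN and FLASHINFER
    # are omitted from the float32 list, presumably because those kernels
    # target half precision (fp16/bf16) only.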
    AttentionImpls_fp32 = ["TORCH_SDPA", "XFORMERS", "TORCH_NAIVE"]
    AttentionImpls_fp16 = [
        "FLASH_ATTN", "TORCH_SDPA", "XFORMERS", "FLASHINFER", "TORCH_NAIVE"
    ]
    AttentionImpls_bf16 = [
        "FLASH_ATTN", "TORCH_SDPA", "XFORMERS", "FLASHINFER", "TORCH_NAIVE"
    ]

    AttentionImpls = {
        "float": AttentionImpls_fp32,
        "half": AttentionImpls_fp16,
        "bfloat16": AttentionImpls_bf16,
    }
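
    # Sweep every (dtype, attention backend) pair.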
    for dtype, attention_impls in AttentionImpls.items():
        print("dtype:", dtype)
        for attention_impl in attention_impls:
            print("attention_impl:", attention_impl)
            args.attention_impl = attention_impl
            args.dtype = dtype
            run_vllm(args)