# benchmark_xlm-roberta.py (forked from vllm-project/vllm)
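# Benchmark xlm-roberta encode-only inference: a Hugging Face transformers
# baseline vs. the wde LLMEngine provided by this fork, swept over batch
# sizes and scheduling strategies ("sync", "async", "double_buffer").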
import random
import time
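
# Hugging Face baseline: run the prompts through AutoModelForMaskedLM in
# fixed-size batches under torch.no_grad(), timing throughput and per-batch
# latency for each batch size in args.batchsize.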
def benchmark_hf(args):
    random.seed(args.seed)

    import torch
    from transformers import AutoModelForMaskedLM, AutoTokenizer

    from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]

    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
    model = AutoModelForMaskedLM.from_pretrained(args.model,
                                                 torch_dtype=torch_dtype).to(
                                                     args.device)

    prompt = "if" * args.input_len
    requests = [prompt for _ in range(args.num_prompts)]

    with torch.no_grad():
        for batchsize in args.batchsize:
            start = time.perf_counter()
            n_step = 0
            for i in range(0, len(requests), batchsize):
                batch = requests[i:i + batchsize]
                encoded_input = tokenizer(batch,
                                          return_tensors='pt').to(args.device)
                model(**encoded_input)
                n_step += 1
            end = time.perf_counter()

            elapsed_time = end - start
            delay = elapsed_time / n_step

            print(f"Batchsize {batchsize}, Throughput: "
                  f"{len(requests) / elapsed_time:.4f} requests/s, "
                  f"Delay {delay * 1000:0.2f} ms, n_step {n_step}")
def benchmark_vllm(args):
    random.seed(args.seed)

    import gc

    import torch

    from vllm.wde.encode_only.arg_utils import (  # noqa: E501
        EncodeOnlyEngineArgs as EngineArgs)
    from vllm.wde.entrypoints.llm import LLMEngine

    prompt = "if" * args.input_len
    requests = [prompt for _ in range(args.num_prompts)]

    engine_args = EngineArgs(
        model=args.model,
        tokenizer=args.tokenizer,
        quantization=args.quantization,
        seed=args.seed,
        trust_remote_code=args.trust_remote_code,
        dtype=args.dtype,
        max_model_len=args.max_model_len,
        quantization_param_path=args.quantization_param_path,
        device=args.device,
        max_num_seqs=32,
        scheduling=args.scheduling)

    engine = LLMEngine.from_engine_args(engine_args)

    for batchsize in args.batchsize:
        engine.engine_config.scheduler_config.set_args(max_num_seqs=batchsize)

        start = time.perf_counter()
        for request_id, prompt in enumerate(requests):
            engine.add_request(str(request_id), prompt)

        n_step = 0
        while engine.has_unfinished_requests():
            engine.step()
            n_step += 1
        end = time.perf_counter()

        elapsed_time = end - start
        delay = elapsed_time / n_step

        print(f"Batchsize {batchsize}, Throughput: "
              f"{len(requests) / elapsed_time:.4f} requests/s, "
              f"Delay {delay * 1000:0.2f} ms, n_step {n_step}")

    engine.executor.shutdown_execute_loop()

    gc.collect()
    torch.cuda.empty_cache()
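
# Benchmark configuration: prompts of "if" repeated 256 times, 10k requests,
# fp16 on CUDA, batch sizes 1-64. Switch the commented-out args.model line
# to benchmark xlm-roberta-large instead of the base model.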
if __name__ == '__main__':
    from easydict import EasyDict as edict

    args = edict()

    args.input_len = 256
    args.num_prompts = 10000

    args.model = 'FacebookAI/xlm-roberta-base'
    # args.model = 'FacebookAI/xlm-roberta-large'

    args.trust_remote_code = False
    args.tokenizer = args.model
    args.seed = 0
    args.quantization = None
    args.quantization_param_path = None
    args.max_model_len = None

    args.dtype = "half"
    args.device = "cuda"
    args.batchsize = [1, 2, 4, 8, 16, 32, 64]
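
    # Run each benchmark in a one-worker subprocess so GPU memory and engine
    # state are fully released before the next configuration runs.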
    from concurrent.futures import ProcessPoolExecutor

    def run_hf(args):
        with ProcessPoolExecutor(1) as executor:
            f = executor.submit(benchmark_hf, args)
            f.result()

    run_hf(args)

    def run_vllm(args):
        with ProcessPoolExecutor(1) as executor:
            f = executor.submit(benchmark_vllm, args)
            f.result()
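
    # Benchmark the vLLM engine once per scheduling strategy.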
    for scheduling in ["sync", "async", "double_buffer"]:
        print(scheduling)
        args.scheduling = scheduling
        run_vllm(args)