Fix llm serving benchmark issue (#147)
* Fix llm serving benchmark issue

Signed-off-by: lvliang-intel <[email protected]>
lvliang-intel authored Sep 30, 2024
1 parent a4be366 commit d6bafbd
Showing 5 changed files with 61 additions and 78 deletions.
35 changes: 25 additions & 10 deletions evals/benchmark/stresscli/locust/aistress.py
@@ -1,14 +1,14 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

+import json
 import logging
 import os
 import sys
 import threading
 import time

 import gevent
 import numpy
 import sseclient
 from locust import HttpUser, between, events, task
 from locust.runners import STATE_CLEANUP, STATE_STOPPED, STATE_STOPPING, MasterRunner, WorkerRunner
@@ -87,6 +87,7 @@ def bench_main(self):
         url = bench_package.getUrl()
         streaming_bench_target = [
             "llmfixed",
+            "llmservefixed",
             "llmbench",
             "chatqnafixed",
             "chatqnabench",
@@ -128,18 +129,32 @@ def bench_main(self):
                 }
             else:
                 first_token_ts = None
-                client = sseclient.SSEClient(resp)
                 complete_response = ""
-                for event in client.events():
-                    if event.data == "[DONE]":
-                        break
-                    else:
+                if self.environment.parsed_options.bench_target == "llmservefixed":
+                    client = sseclient.SSEClient(resp)
+                    for event in client.events():
                         if first_token_ts is None:
                             first_token_ts = time.perf_counter()
-                        chunk = event.data.strip()
-                        if chunk.startswith("b'") and chunk.endswith("'"):
-                            chunk = chunk[2:-1]
-                        complete_response += chunk
+                        try:
+                            data = json.loads(event.data)
+                            if "choices" in data and len(data["choices"]) > 0:
+                                delta = data["choices"][0].get("delta", {})
+                                content = delta.get("content", "")
+                                complete_response += content
+                        except json.JSONDecodeError:
+                            continue
+                else:
+                    client = sseclient.SSEClient(resp)
+                    for event in client.events():
+                        if event.data == "[DONE]":
+                            break
+                        else:
+                            if first_token_ts is None:
+                                first_token_ts = time.perf_counter()
+                            chunk = event.data.strip()
+                            if chunk.startswith("b'") and chunk.endswith("'"):
+                                chunk = chunk[2:-1]
+                            complete_response += chunk
                 end_ts = time.perf_counter()
                 respData = {
                     "response_string": complete_response,
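Note: the new `llmservefixed` branch above parses OpenAI-style chat-completions streaming chunks instead of accumulating raw SSE text. Below is a minimal standalone sketch of that accumulation logic; the sample payloads are made up for illustration and are not output from a real server.

import json

# Hypothetical SSE payloads in the OpenAI chat-completions streaming format.
sample_events = [
    '{"choices": [{"delta": {"content": "Deep"}}]}',
    '{"choices": [{"delta": {"content": " learning"}}]}',
    "[DONE]",  # non-JSON sentinel: json.loads raises, and the loop skips it
]

complete_response = ""
for payload in sample_events:
    try:
        data = json.loads(payload)
        if "choices" in data and len(data["choices"]) > 0:
            delta = data["choices"][0].get("delta", {})
            complete_response += delta.get("content", "")
    except json.JSONDecodeError:
        continue  # mirrors the patched aistress.py: ignore unparseable chunks

print(complete_response)  # -> Deep learning

The `JSONDecodeError` guard is what lets the `[DONE]` sentinel and any keep-alive lines pass through without breaking the benchmark loop.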
4 changes: 2 additions & 2 deletions evals/benchmark/stresscli/locust/llmfixed.py
@@ -21,8 +21,8 @@ def getReqData():
     }


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, respData):
+    return token.respStatics(environment, reqData, respData)


 def staticsOutput(environment, reqlist):
31 changes: 31 additions & 0 deletions evals/benchmark/stresscli/locust/llmservefixed.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+
+import numpy
+import tokenresponse as token
+
+console_logger = logging.getLogger("locust.stats_logger")
+
+
+def getUrl():
+    return "/v1/chat/completions"
+
+
+def getReqData():
+    return {
+        "messages": [{"role": "user", "content": "What is Deep Learning?"}],
+        "model": "tgi",
+        "max_tokens": 128,
+        "n": 1,
+        "stream": True,
+    }
+
+
+def respStatics(environment, reqData, respData):
+    return token.respStatics(environment, reqData, respData)
+
+
+def staticsOutput(environment, reqlist):
+    token.staticsOutput(environment, reqlist)
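As a quick smoke test outside Locust, the payload returned by `getReqData()` can be posted directly to a serving endpoint. A hedged sketch using `requests`; the host and port are assumptions about your deployment, not part of this commit.

import requests

# Assumed endpoint address; substitute your own LLM serving deployment.
url = "http://localhost:8000/v1/chat/completions"

# Same request body as llmservefixed.getReqData().
payload = {
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "model": "tgi",
    "max_tokens": 128,
    "n": 1,
    "stream": True,
}

with requests.post(url, json=payload, stream=True, timeout=60) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        # SSE data lines look like: data: {"choices": [...]}
        if line and line.startswith("data:"):
            print(line[len("data:"):].strip())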
66 changes: 0 additions & 66 deletions evals/benchmark/stresscli/locust/tgifixed.py

This file was deleted.

3 changes: 3 additions & 0 deletions evals/benchmark/stresscli/locust/tokenresponse.py
@@ -19,6 +19,9 @@ def respStatics(environment, req, resp):
         num_token_input_prompt = len(tokenizer.encode(req["messages"]))
     elif environment.parsed_options.bench_target in ["llmfixed"]:
         num_token_input_prompt = len(tokenizer.encode(req["query"]))
+    elif environment.parsed_options.bench_target == "llmservefixed":
+        content = " ".join([msg["content"] for msg in req["messages"]])
+        num_token_input_prompt = len(tokenizer.encode(content))
     else:
         num_token_input_prompt = -1

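For illustration, the new `llmservefixed` case counts prompt tokens over the concatenated message contents. The sketch below reproduces that computation with a Hugging Face tokenizer; `gpt2` is an arbitrary stand-in, since tokenresponse.py configures its own tokenizer.

from transformers import AutoTokenizer

# Arbitrary tokenizer for illustration; the benchmark loads its own.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

req = {"messages": [{"role": "user", "content": "What is Deep Learning?"}]}

# Same logic as the new elif branch in respStatics().
content = " ".join([msg["content"] for msg in req["messages"]])
num_token_input_prompt = len(tokenizer.encode(content))
print(num_token_input_prompt)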
