Fix llm serving benchmark issue (#147)
* Fix llm serving benchmark issue

Signed-off-by: lvliang-intel <[email protected]>
lvliang-intel authored Sep 30, 2024
1 parent a4be366 commit d6bafbd
Showing 5 changed files with 61 additions and 78 deletions.
35 changes: 25 additions & 10 deletions evals/benchmark/stresscli/locust/aistress.py
@@ -1,14 +1,14 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

+import json
 import logging
 import os
 import sys
 import threading
 import time

 import gevent
 import numpy
 import sseclient
 from locust import HttpUser, between, events, task
 from locust.runners import STATE_CLEANUP, STATE_STOPPED, STATE_STOPPING, MasterRunner, WorkerRunner
@@ -87,6 +87,7 @@ def bench_main(self):
         url = bench_package.getUrl()
         streaming_bench_target = [
             "llmfixed",
+            "llmservefixed",
             "llmbench",
             "chatqnafixed",
             "chatqnabench",
@@ -128,18 +129,32 @@ def bench_main(self):
                 }
             else:
                 first_token_ts = None
-                client = sseclient.SSEClient(resp)
                 complete_response = ""
-                for event in client.events():
-                    if event.data == "[DONE]":
-                        break
-                    else:
+                if self.environment.parsed_options.bench_target == "llmservefixed":
+                    client = sseclient.SSEClient(resp)
+                    for event in client.events():
                         if first_token_ts is None:
                             first_token_ts = time.perf_counter()
-                        chunk = event.data.strip()
-                        if chunk.startswith("b'") and chunk.endswith("'"):
-                            chunk = chunk[2:-1]
-                        complete_response += chunk
+                        try:
+                            data = json.loads(event.data)
+                            if "choices" in data and len(data["choices"]) > 0:
+                                delta = data["choices"][0].get("delta", {})
+                                content = delta.get("content", "")
+                                complete_response += content
+                        except json.JSONDecodeError:
+                            continue
+                else:
+                    client = sseclient.SSEClient(resp)
+                    for event in client.events():
+                        if event.data == "[DONE]":
+                            break
+                        else:
+                            if first_token_ts is None:
+                                first_token_ts = time.perf_counter()
+                            chunk = event.data.strip()
+                            if chunk.startswith("b'") and chunk.endswith("'"):
+                                chunk = chunk[2:-1]
+                            complete_response += chunk
                 end_ts = time.perf_counter()
                 respData = {
                     "response_string": complete_response,
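Note: the new `llmservefixed` branch above parses OpenAI-style chat-completions streaming chunks instead of accumulating raw SSE text. Below is a minimal standalone sketch of that accumulation logic; the sample payloads are made up for illustration and are not output from a real server.

import json

# Hypothetical SSE payloads in the OpenAI chat-completions streaming format.
sample_events = [
    '{"choices": [{"delta": {"content": "Deep"}}]}',
    '{"choices": [{"delta": {"content": " learning"}}]}',
    "[DONE]",  # non-JSON sentinel: json.loads raises, and the loop skips it
]

complete_response = ""
for payload in sample_events:
    try:
        data = json.loads(payload)
        if "choices" in data and len(data["choices"]) > 0:
            delta = data["choices"][0].get("delta", {})
            complete_response += delta.get("content", "")
    except json.JSONDecodeError:
        continue  # mirrors the patched aistress.py: ignore unparseable chunks

print(complete_response)  # -> Deep learning

The `JSONDecodeError` guard is what lets the `[DONE]` sentinel and any keep-alive lines pass through without breaking the benchmark loop.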
4 changes: 2 additions & 2 deletions evals/benchmark/stresscli/locust/llmfixed.py
@@ -21,8 +21,8 @@ def getReqData():
     }


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, respData):
+    return token.respStatics(environment, reqData, respData)


 def staticsOutput(environment, reqlist):
31 changes: 31 additions & 0 deletions evals/benchmark/stresscli/locust/llmservefixed.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+
+import numpy
+import tokenresponse as token
+
+console_logger = logging.getLogger("locust.stats_logger")
+
+
+def getUrl():
+    return "/v1/chat/completions"
+
+
+def getReqData():
+    return {
+        "messages": [{"role": "user", "content": "What is Deep Learning?"}],
+        "model": "tgi",
+        "max_tokens": 128,
+        "n": 1,
+        "stream": True,
+    }
+
+
+def respStatics(environment, reqData, respData):
+    return token.respStatics(environment, reqData, respData)
+
+
+def staticsOutput(environment, reqlist):
+    token.staticsOutput(environment, reqlist)
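As a quick smoke test outside Locust, the payload returned by `getReqData()` can be posted directly to a serving endpoint. A hedged sketch using `requests`; the host and port are assumptions about your deployment, not part of this commit.

import requests

# Assumed endpoint address; substitute your own LLM serving deployment.
url = "http://localhost:8000/v1/chat/completions"

# Same request body as llmservefixed.getReqData().
payload = {
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "model": "tgi",
    "max_tokens": 128,
    "n": 1,
    "stream": True,
}

with requests.post(url, json=payload, stream=True, timeout=60) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        # SSE data lines look like: data: {"choices": [...]}
        if line and line.startswith("data:"):
            print(line[len("data:"):].strip())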
66 changes: 0 additions & 66 deletions evals/benchmark/stresscli/locust/tgifixed.py

This file was deleted.

3 changes: 3 additions & 0 deletions evals/benchmark/stresscli/locust/tokenresponse.py
@@ -19,6 +19,9 @@ def respStatics(environment, req, resp):
         num_token_input_prompt = len(tokenizer.encode(req["messages"]))
     elif environment.parsed_options.bench_target in ["llmfixed"]:
         num_token_input_prompt = len(tokenizer.encode(req["query"]))
+    elif environment.parsed_options.bench_target == "llmservefixed":
+        content = " ".join([msg["content"] for msg in req["messages"]])
+        num_token_input_prompt = len(tokenizer.encode(content))
     else:
         num_token_input_prompt = -1

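For illustration, the new `llmservefixed` case counts prompt tokens over the concatenated message contents. The sketch below reproduces that computation with a Hugging Face tokenizer; `gpt2` is an arbitrary stand-in, since tokenresponse.py configures its own tokenizer.

from transformers import AutoTokenizer

# Arbitrary tokenizer for illustration; the benchmark loads its own.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

req = {"messages": [{"role": "user", "content": "What is Deep Learning?"}]}

# Same logic as the new elif branch in respStatics().
content = " ".join([msg["content"] for msg in req["messages"]])
num_token_input_prompt = len(tokenizer.encode(content))
print(num_token_input_prompt)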
