Support e2e and first token P90 statistics (#77)
* Support e2e and first token P90 statistics

Signed-off-by: lvliang-intel <[email protected]>

* fix high-concurrency test issue

Signed-off-by: lvliang-intel <[email protected]>
lvliang-intel authored Aug 26, 2024
1 parent d754a84 commit b07cd12
Showing 5 changed files with 33 additions and 12 deletions.
10 changes: 9 additions & 1 deletion evals/benchmark/README.md
@@ -34,7 +34,15 @@ pip install -r ../../requirements.txt

1 Define the test cases and configurations in the benchmark.yaml file.

2 Run the benchmark script:
2 Temporarily increase the file descriptor limit before running the test:

```bash
ulimit -n 100000
```

This command raises the maximum number of file descriptors (open files, network connections, and so on) that a single process can use. Many systems default to a conservative limit such as 1024, which is not enough for high-concurrency load testing; raising it prevents failures caused by running out of file descriptors. A Python equivalent using the standard `resource` module is sketched below, after this file's diff.

3 Run the benchmark script:

```bash
python benchmark.py
```
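For environments where running `ulimit` directly is inconvenient, here is a hedged Python equivalent of the file-descriptor step above. It uses only the standard-library `resource` module; the target of 100000 mirrors the README, and whether the soft limit can actually reach it depends on the hard limit configured on the host.

```python
# Sketch: raise the soft file-descriptor limit for the current process,
# roughly equivalent to `ulimit -n 100000` in the step above.
import resource

soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print(f"current limits: soft={soft}, hard={hard}")

target = 100000
# The soft limit cannot exceed the hard limit unless running with privileges.
new_soft = target if hard == resource.RLIM_INFINITY else min(target, hard)
resource.setrlimit(resource.RLIMIT_NOFILE, (new_soft, hard))
print(f"soft limit raised to {new_soft}")
```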
10 changes: 6 additions & 4 deletions evals/benchmark/stresscli/commands/config.ini
@@ -8,11 +8,13 @@ RPS = RPS:\s+([\d.]+)
Input_Tokens_per_Second = Input Tokens per Second:\s+([\d.]+)
Output_Tokens_per_Second = Output Tokens per Second:\s+([\d.]+)
End_to_End_latency_P50 = End to End latency\(ms\),\s+P50:\s+([\d.]+)
End_to_End_latency_P99 = End to End latency\(ms\),\s+P50:[\s\d.,]+P99:\s+([\d.]+)
End_to_End_latency_Avg = End to End latency\(ms\),\s+P50:[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)
End_to_End_latency_P90 = End to End latency\(ms\),\s+P50:[\s\d.,]+P90:\s+([\d.]+)
End_to_End_latency_P99 = End to End latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+([\d.]+)
End_to_End_latency_Avg = End to End latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)
First_token_latency_P50 = First token latency\(ms\),\s+P50:\s+([\d.]+)
First_token_latency_P99 = First token latency\(ms\),\s+P50:[\s\d.,]+P99:\s+([\d.]+)
First_token_latency_Avg = First token latency\(ms\),\s+P50:[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)
First_token_latency_P90 = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+([\d.]+)
First_token_latency_P99 = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+([\d.]+)
First_token_latency_Avg = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)
Average_Next_token_latency = Average Next token latency\(ms\):\s+([\d.]+)
Average_token_latency = Average token latency\(ms\)\s+:\s+([\d.]+)
locust_num_requests = \"num_requests\":\s+(\d+)
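The new P90 patterns above are anchored on the preceding P50 (and P90) fields so each percentile is captured unambiguously from a single log line. A small sketch of how these patterns are expected to match; the sample line and values below are made up, but follow the message format emitted by tokenresponse.py:

```python
# Sketch: apply the updated config.ini patterns to a hypothetical log line.
import re

sample = "End to End latency(ms), P50: 812.45, P90: 1203.88, P99: 1890.12, Avg: 934.56"

patterns = {
    "End_to_End_latency_P50": r"End to End latency\(ms\),\s+P50:\s+([\d.]+)",
    "End_to_End_latency_P90": r"End to End latency\(ms\),\s+P50:[\s\d.,]+P90:\s+([\d.]+)",
    "End_to_End_latency_P99": r"End to End latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+([\d.]+)",
    "End_to_End_latency_Avg": r"End to End latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)",
}

for name, pattern in patterns.items():
    match = re.search(pattern, sample)
    print(f"{name} = {match.group(1) if match else 'no match'}")
```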
7 changes: 5 additions & 2 deletions evals/benchmark/stresscli/commands/load_test.py
@@ -115,6 +115,9 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in
os.makedirs(end_output_folder, exist_ok=True)
metrics_output = os.path.join(output_folder, f"{index}_metrics.json")

spawn_rate = 100 if runspec["users"] > 100 else runspec["users"]
processes = 10 if runspec["max_requests"] > 2000 else 5 if runspec["max_requests"] > 1000 else 2

cmd = [
"locust",
"--locustfile",
@@ -126,11 +129,11 @@
"--users",
str(runspec["users"]),
"--spawn-rate",
str(runspec["users"]),
str(spawn_rate),
"--max-request",
str(runspec["max_requests"]),
"--processes",
str(runspec["processes"]),
str(processes),
"--bench-target",
str(runspec["bench-target"]),
"--llm-model",
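The two derived settings added at the top of this hunk cap the Locust spawn rate at 100 users per second and scale the worker process count with the requested number of requests, instead of passing the raw values through. A standalone sketch of those thresholds; the sample inputs are illustrative, not taken from the commit:

```python
# Sketch: the capping/scaling logic added in run_locust_test, shown standalone.
def derive_locust_settings(users: int, max_requests: int) -> tuple[int, int]:
    spawn_rate = 100 if users > 100 else users
    processes = 10 if max_requests > 2000 else 5 if max_requests > 1000 else 2
    return spawn_rate, processes


for users, max_requests in [(64, 640), (512, 1500), (1024, 4096)]:
    spawn_rate, processes = derive_locust_settings(users, max_requests)
    print(f"users={users}, max_requests={max_requests} -> "
          f"spawn_rate={spawn_rate}, processes={processes}")
```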
2 changes: 1 addition & 1 deletion evals/benchmark/stresscli/locust/aistress.py
@@ -27,7 +27,7 @@ def _(parser):
help="Stop the benchmark If exceed this request",
)
parser.add_argument(
"--http-timeout", type=int, env_var="HTTP_TIMEOUT", default=3000, help="Http timeout before receive response"
"--http-timeout", type=int, env_var="HTTP_TIMEOUT", default=120000, help="Http timeout before receive response"
)
parser.add_argument(
"--bench-target",
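The only functional change here is the `--http-timeout` default, raised from 3000 to 120000 (the magnitude suggests milliseconds, roughly 3 s to 120 s) so slow responses under high concurrency are not cut off. For context, a minimal, hypothetical locustfile sketch (not the actual aistress.py) of how such a custom option is declared and read back at runtime:

```python
# Sketch: declare a custom Locust CLI option and consume it from a task.
from locust import HttpUser, events, task


@events.init_command_line_parser.add_listener
def _(parser):
    parser.add_argument(
        "--http-timeout",
        type=int,
        env_var="HTTP_TIMEOUT",
        default=120000,
        help="HTTP timeout before receiving the response",
    )


class DemoUser(HttpUser):
    @task
    def query(self):
        # Custom options are exposed via environment.parsed_options.
        timeout = self.environment.parsed_options.http_timeout
        self.client.get("/", timeout=timeout / 1000)  # assuming the value is in ms
```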
16 changes: 12 additions & 4 deletions evals/benchmark/stresscli/locust/tokenresponse.py
@@ -62,8 +62,8 @@ def staticsOutput(environment, reqlist):
"Succeed Response: {} (Total {}, {:.1%} Success), Duration: {:.2f}s, Input Tokens: {},"
" Output Tokens: {}, RPS: {:.2f}, Input Tokens per Second: {:.2f}, Output Tokens per Second: {:.2f}"
)
e2e_msg = "End to End latency(ms), P50: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
first_msg = "First token latency(ms), P50: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
e2e_msg = "End to End latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
first_msg = "First token latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
next_msg = "Average Next token latency(ms): {:.2f}"
average_msg = "Average token latency(ms) : {:.2f}"
console_logger.warning("\n=================Total statistics=====================")
@@ -92,12 +92,20 @@
)
)
console_logger.warning(
e2e_msg.format(numpy.percentile(e2e_lat, 50), numpy.percentile(e2e_lat, 99), numpy.average(e2e_lat))
e2e_msg.format(
numpy.percentile(e2e_lat, 50),
numpy.percentile(e2e_lat, 90),
numpy.percentile(e2e_lat, 99),
numpy.average(e2e_lat),
)
)
if tokens_output != 0:
console_logger.warning(
first_msg.format(
numpy.percentile(first_token, 50), numpy.percentile(first_token, 99), numpy.average(first_token)
numpy.percentile(first_token, 50),
numpy.percentile(first_token, 90),
numpy.percentile(first_token, 99),
numpy.average(first_token),
)
)
console_logger.warning(next_msg.format(numpy.average(next_token)))
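With the P90 column added, the statistics lines are produced exactly as in `staticsOutput`: numpy percentiles over the collected latency lists, formatted with the updated message templates. A self-contained sketch (the latency samples below are made up):

```python
# Sketch: compute and print P50/P90/P99/Avg the same way staticsOutput does.
import numpy

e2e_lat = [812.4, 903.1, 1104.7, 1287.9, 954.2, 1423.6, 1011.3, 876.5]

e2e_msg = "End to End latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
print(
    e2e_msg.format(
        numpy.percentile(e2e_lat, 50),
        numpy.percentile(e2e_lat, 90),
        numpy.percentile(e2e_lat, 99),
        numpy.average(e2e_lat),
    )
)
```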
