From 1b4f73b14d4ad110254461eec7a327b1063289ce Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Wed, 21 Aug 2024 21:01:28 +0000 Subject: [PATCH 1/6] Add more percentiles and latencies Drop a line lint --- benchmarks/benchmark_serving.py | 67 +++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index fe687da492901..0b35c1cdfa17f 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -61,15 +61,31 @@ class BenchmarkMetrics: mean_ttft_ms: float median_ttft_ms: float std_ttft_ms: float + p25_ttft_ms: float + p75_ttft_ms: float + p95_ttft_ms: float p99_ttft_ms: float mean_tpot_ms: float median_tpot_ms: float std_tpot_ms: float + p25_tpot_ms: float + p75_tpot_ms: float + p95_tpot_ms: float p99_tpot_ms: float mean_itl_ms: float median_itl_ms: float std_itl_ms: float + p25_itl_ms: float + p75_itl_ms: float + p95_itl_ms: float p99_itl_ms: float + mean_latency_ms: float + median_latency_ms: float + std_latency_ms: float + p25_latency_ms: float + p75_latency_ms: float + p95_latency_ms: float + p99_latency_ms: float def sample_sharegpt_requests( @@ -242,6 +258,7 @@ def calculate_metrics( itls: List[float] = [] tpots: List[float] = [] ttfts: List[float] = [] + latencies: List[float] = [] for i in range(len(outputs)): if outputs[i].success: # We use the tokenizer to count the number of output tokens for all @@ -258,6 +275,7 @@ def calculate_metrics( (outputs[i].latency - outputs[i].ttft) / (output_len - 1)) itls += outputs[i].itl ttfts.append(outputs[i].ttft) + latencies.append(outputs[i].latency) completed += 1 else: actual_output_lens.append(0) @@ -276,17 +294,33 @@ def calculate_metrics( output_throughput=sum(actual_output_lens) / dur_s, mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend - median_ttft_ms=np.median(ttfts or 0) * 1000, std_ttft_ms=np.std(ttfts or 0) * 1000, + p25_ttft_ms=np.percentile(ttfts or 0, 25) * 1000, + median_ttft_ms=np.median(ttfts or 0) * 1000, + p75_ttft_ms=np.percentile(ttfts or 0, 75) * 1000, + p95_ttft_ms=np.percentile(ttfts or 0, 95) * 1000, p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000, mean_tpot_ms=np.mean(tpots or 0) * 1000, - median_tpot_ms=np.median(tpots or 0) * 1000, std_tpot_ms=np.std(tpots or 0) * 1000, + p25_tpot_ms=np.percentile(tpots or 0, 25) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + p75_tpot_ms=np.percentile(tpots or 0, 75) * 1000, + p95_tpot_ms=np.percentile(tpots or 0, 95) * 1000, p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000, mean_itl_ms=np.mean(itls or 0) * 1000, - median_itl_ms=np.median(itls or 0) * 1000, std_itl_ms=np.std(itls or 0) * 1000, + p25_itl_ms=np.percentile(itls or 0, 25) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + p75_itl_ms=np.percentile(itls or 0, 75) * 1000, + p95_itl_ms=np.percentile(itls or 0, 95) * 1000, p99_itl_ms=np.percentile(itls or 0, 99) * 1000, + mean_latency_ms=np.median(latencies or 0) * 1000, + std_latency_ms=np.std(latencies or 0) * 1000, + p25_latency_ms=np.percentile(latencies or 0, 25) * 1000, + median_latency_ms=np.mean(latencies or 0) * 1000, + p75_latency_ms=np.percentile(latencies or 0, 75) * 1000, + p95_latency_ms=np.percentile(latencies or 0, 95) * 1000, + p99_latency_ms=np.percentile(latencies or 0, 99) * 1000, ) return metrics, actual_output_lens @@ -409,20 +443,37 @@ async def benchmark( metrics.output_throughput)) print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-')) print("{:<40} 
{:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms)) - print("{:<40} {:<10.2f}".format("Median TTFT (ms):", - metrics.median_ttft_ms)) + print("{:<40} {:<10.2f}".format("P25 TTFT (ms):", metrics.p25_ttft_ms)) + print("{:<40} {:<10.2f}".format("P75 TTFT (ms):", metrics.p75_ttft_ms)) + print("{:<40} {:<10.2f}".format("p50 TTFT (ms):", metrics.median_ttft_ms)) + print("{:<40} {:<10.2f}".format("P95 TTFT (ms):", metrics.p95_ttft_ms)) print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms)) print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)', n=50, c='-')) print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms)) - print("{:<40} {:<10.2f}".format("Median TPOT (ms):", - metrics.median_tpot_ms)) + print("{:<40} {:<10.2f}".format("P25 TPOT (ms):", metrics.p25_tpot_ms)) + print("{:<40} {:<10.2f}".format("P50 TPOT (ms):", metrics.median_tpot_ms)) + print("{:<40} {:<10.2f}".format("P75 TPOT (ms):", metrics.p75_tpot_ms)) + print("{:<40} {:<10.2f}".format("P95 TPOT (ms):", metrics.p95_tpot_ms)) print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms)) + print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-')) print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms)) - print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms)) + print("{:<40} {:<10.2f}".format("P25 ITL (ms):", metrics.p25_itl_ms)) + print("{:<40} {:<10.2f}".format("P50 ITL (ms):", metrics.median_itl_ms)) + print("{:<40} {:<10.2f}".format("P75 ITL (ms):", metrics.p75_itl_ms)) + print("{:<40} {:<10.2f}".format("P95 ITL (ms):", metrics.p95_itl_ms)) print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms)) + + print("{s:{c}^{n}}".format(s='End-to-end Latency', n=50, c='-')) + print("{:<40} {:<10.2f}".format("Mean EEL (ms):", metrics.mean_latency_ms)) + print("{:<40} {:<10.2f}".format("P25 EEL (ms):", metrics.p25_latency_ms)) + print("{:<40} {:<10.2f}".format("P50 EEL (ms):", + metrics.median_latency_ms)) + print("{:<40} {:<10.2f}".format("P75 EEL (ms):", metrics.p75_latency_ms)) + print("{:<40} {:<10.2f}".format("P95 EEL (ms):", metrics.p95_latency_ms)) + print("{:<40} {:<10.2f}".format("P99 EEL (ms):", metrics.p99_latency_ms)) print("=" * 50) result = { From e876a0d9548a2b9057216748a12c6f8d0f2495b8 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Sat, 24 Aug 2024 00:57:05 +0000 Subject: [PATCH 2/6] Flexiable way to select some metrics --- benchmarks/benchmark_serving.py | 213 +++++++++++++++++--------------- 1 file changed, 113 insertions(+), 100 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 0b35c1cdfa17f..992c1980eaeaa 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -61,31 +61,22 @@ class BenchmarkMetrics: mean_ttft_ms: float median_ttft_ms: float std_ttft_ms: float - p25_ttft_ms: float - p75_ttft_ms: float - p95_ttft_ms: float - p99_ttft_ms: float + percentiles_ttft_ms: List[Tuple[float,float]] mean_tpot_ms: float median_tpot_ms: float std_tpot_ms: float - p25_tpot_ms: float - p75_tpot_ms: float - p95_tpot_ms: float - p99_tpot_ms: float + percentiles_tpot_ms: List[Tuple[float,float]] mean_itl_ms: float median_itl_ms: float std_itl_ms: float - p25_itl_ms: float - p75_itl_ms: float - p95_itl_ms: float - p99_itl_ms: float - mean_latency_ms: float - median_latency_ms: float - std_latency_ms: float - p25_latency_ms: float - p75_latency_ms: float - p95_latency_ms: float - p99_latency_ms: float + percentiles_itl_ms: 
List[Tuple[float,float]] + # ETEL stands for end-to-end latency per request. + # It is the time taken on the client side from sending + # a request to receiving a complete response. + mean_etel_ms: float + median_etel_ms: float + std_etel_ms: float + percentiles_etel_ms: List[Tuple[float, float]] def sample_sharegpt_requests( @@ -251,6 +242,8 @@ def calculate_metrics( outputs: List[RequestFuncOutput], dur_s: float, tokenizer: PreTrainedTokenizerBase, + selected_percentile_metrics: List[str], + selected_percentiles: List[float], ) -> Tuple[BenchmarkMetrics, List[int]]: actual_output_lens: List[int] = [] total_input = 0 @@ -258,7 +251,7 @@ def calculate_metrics( itls: List[float] = [] tpots: List[float] = [] ttfts: List[float] = [] - latencies: List[float] = [] + etels: List[float] = [] for i in range(len(outputs)): if outputs[i].success: # We use the tokenizer to count the number of output tokens for all @@ -275,7 +268,7 @@ def calculate_metrics( (outputs[i].latency - outputs[i].ttft) / (output_len - 1)) itls += outputs[i].itl ttfts.append(outputs[i].ttft) - latencies.append(outputs[i].latency) + etels.append(outputs[i].latency) completed += 1 else: actual_output_lens.append(0) @@ -295,32 +288,20 @@ def calculate_metrics( mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend std_ttft_ms=np.std(ttfts or 0) * 1000, - p25_ttft_ms=np.percentile(ttfts or 0, 25) * 1000, median_ttft_ms=np.median(ttfts or 0) * 1000, - p75_ttft_ms=np.percentile(ttfts or 0, 75) * 1000, - p95_ttft_ms=np.percentile(ttfts or 0, 95) * 1000, - p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000, + percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles], mean_tpot_ms=np.mean(tpots or 0) * 1000, std_tpot_ms=np.std(tpots or 0) * 1000, - p25_tpot_ms=np.percentile(tpots or 0, 25) * 1000, median_tpot_ms=np.median(tpots or 0) * 1000, - p75_tpot_ms=np.percentile(tpots or 0, 75) * 1000, - p95_tpot_ms=np.percentile(tpots or 0, 95) * 1000, - p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000, + percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles], mean_itl_ms=np.mean(itls or 0) * 1000, std_itl_ms=np.std(itls or 0) * 1000, - p25_itl_ms=np.percentile(itls or 0, 25) * 1000, median_itl_ms=np.median(itls or 0) * 1000, - p75_itl_ms=np.percentile(itls or 0, 75) * 1000, - p95_itl_ms=np.percentile(itls or 0, 95) * 1000, - p99_itl_ms=np.percentile(itls or 0, 99) * 1000, - mean_latency_ms=np.median(latencies or 0) * 1000, - std_latency_ms=np.std(latencies or 0) * 1000, - p25_latency_ms=np.percentile(latencies or 0, 25) * 1000, - median_latency_ms=np.mean(latencies or 0) * 1000, - p75_latency_ms=np.percentile(latencies or 0, 75) * 1000, - p95_latency_ms=np.percentile(latencies or 0, 95) * 1000, - p99_latency_ms=np.percentile(latencies or 0, 99) * 1000, + percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles], + mean_etel_ms=np.median(etels or 0) * 1000, + std_etel_ms=np.std(etels or 0) * 1000, + median_etel_ms=np.mean(etels or 0) * 1000, + percentiles_etel_ms=[(p, np.percentile(etels or 0, p) * 1000) for p in selected_percentiles], ) return metrics, actual_output_lens @@ -338,6 +319,8 @@ async def benchmark( request_rate: float, disable_tqdm: bool, profile: bool, + selected_percentile_metrics: List[str], + selected_percentiles: List[str], ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -426,56 +409,10 @@ async def benchmark( outputs=outputs, 
dur_s=benchmark_duration, tokenizer=tokenizer, + selected_percentile_metrics=selected_percentile_metrics, + selected_percentiles=selected_percentiles, ) - print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) - print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", - benchmark_duration)) - print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - print("{:<40} {:<10}".format("Total generated tokens:", - metrics.total_output)) - print("{:<40} {:<10.2f}".format("Request throughput (req/s):", - metrics.request_throughput)) - print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):", - metrics.input_throughput)) - print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", - metrics.output_throughput)) - print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-')) - print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms)) - print("{:<40} {:<10.2f}".format("P25 TTFT (ms):", metrics.p25_ttft_ms)) - print("{:<40} {:<10.2f}".format("P75 TTFT (ms):", metrics.p75_ttft_ms)) - print("{:<40} {:<10.2f}".format("p50 TTFT (ms):", metrics.median_ttft_ms)) - print("{:<40} {:<10.2f}".format("P95 TTFT (ms):", metrics.p95_ttft_ms)) - print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms)) - print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)', - n=50, - c='-')) - print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms)) - print("{:<40} {:<10.2f}".format("P25 TPOT (ms):", metrics.p25_tpot_ms)) - print("{:<40} {:<10.2f}".format("P50 TPOT (ms):", metrics.median_tpot_ms)) - print("{:<40} {:<10.2f}".format("P75 TPOT (ms):", metrics.p75_tpot_ms)) - print("{:<40} {:<10.2f}".format("P95 TPOT (ms):", metrics.p95_tpot_ms)) - print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms)) - - print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-')) - print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms)) - print("{:<40} {:<10.2f}".format("P25 ITL (ms):", metrics.p25_itl_ms)) - print("{:<40} {:<10.2f}".format("P50 ITL (ms):", metrics.median_itl_ms)) - print("{:<40} {:<10.2f}".format("P75 ITL (ms):", metrics.p75_itl_ms)) - print("{:<40} {:<10.2f}".format("P95 ITL (ms):", metrics.p95_itl_ms)) - print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms)) - - print("{s:{c}^{n}}".format(s='End-to-end Latency', n=50, c='-')) - print("{:<40} {:<10.2f}".format("Mean EEL (ms):", metrics.mean_latency_ms)) - print("{:<40} {:<10.2f}".format("P25 EEL (ms):", metrics.p25_latency_ms)) - print("{:<40} {:<10.2f}".format("P50 EEL (ms):", - metrics.median_latency_ms)) - print("{:<40} {:<10.2f}".format("P75 EEL (ms):", metrics.p75_latency_ms)) - print("{:<40} {:<10.2f}".format("P95 EEL (ms):", metrics.p95_latency_ms)) - print("{:<40} {:<10.2f}".format("P99 EEL (ms):", metrics.p99_latency_ms)) - print("=" * 50) - result = { "duration": benchmark_duration, "completed": metrics.completed, @@ -484,18 +421,6 @@ async def benchmark( "request_throughput": metrics.request_throughput, "input_throughput": metrics.input_throughput, "output_throughput": metrics.output_throughput, - "mean_ttft_ms": metrics.mean_ttft_ms, - "median_ttft_ms": metrics.median_ttft_ms, - "std_ttft_ms": metrics.std_ttft_ms, - "p99_ttft_ms": metrics.p99_ttft_ms, - "mean_tpot_ms": metrics.mean_tpot_ms, - "median_tpot_ms": metrics.median_tpot_ms, - "std_tpot_ms": metrics.std_tpot_ms, - "p99_tpot_ms": metrics.p99_tpot_ms, - "mean_itl_ms": 
metrics.mean_itl_ms, - "median_itl_ms": metrics.median_itl_ms, - "std_itl_ms": metrics.std_itl_ms, - "p99_itl_ms": metrics.p99_itl_ms, "input_lens": [output.prompt_len for output in outputs], "output_lens": actual_output_lens, "ttfts": [output.ttft for output in outputs], @@ -503,6 +428,74 @@ async def benchmark( "generated_texts": [output.generated_text for output in outputs], "errors": [output.error for output in outputs], } + + print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", + benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + print("{:<40} {:<10}".format("Total generated tokens:", + metrics.total_output)) + print("{:<40} {:<10.2f}".format("Request throughput (req/s):", + metrics.request_throughput)) + print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):", + metrics.input_throughput)) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", + metrics.output_throughput)) + + if "ttft" in selected_percentile_metrics: + print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-')) + print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms)) + print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms)) + result["mean_ttft_ms"] = metrics.mean_ttft_ms + result["median_ttft_ms"] = metrics.median_ttft_ms + result["std_ttft_ms"] = metrics.std_ttft_ms + for p, value in metrics.percentiles_ttft_ms: + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} TTFT (ms):", value)) + result[f"p{p_word}_ttft_ms"] = value + + if "tpot" in selected_percentile_metrics: + print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 
1st token)', + n=50, + c='-')) + print("{:<40} {:<10.2f}".format("Median TPOT (ms):", metrics.median_tpot_ms)) + print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms)) + result["mean_tpot_ms"] = metrics.mean_tpot_ms + result["median_tpot_m"] = metrics.median_tpot_ms + result["std_tpot_ms"] = metrics.std_tpot_ms + for p, value in metrics.percentiles_tpot_ms: + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} TPOT (ms):", value)) + result[f"p{p_word}_tpot_ms"] = value + + if "itl" in selected_percentile_metrics: + print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-')) + print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms)) + print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms)) + result["mean_itl_ms"] = metrics.mean_itl_ms, + result["median_itl_ms"] = metrics.median_itl_ms, + result["std_itl_ms"] = metrics.std_itl_ms, + for p, value in metrics.percentiles_itl_ms: + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} ITL (ms):", value)) + result[f"p{p_word}_itl_ms"] = value + + if "etel" in selected_percentile_metrics: + print("{s:{c}^{n}}".format(s='End-to-end Latency', n=50, c='-')) + print("{:<40} {:<10.2f}".format("Mean ETEL (ms):", metrics.mean_etel_ms)) + print("{:<40} {:<10.2f}".format("Median ETEL (ms):", + metrics.median_etel_ms)) + result["mean_etel_ms"] = metrics.mean_etel_ms, + result["median_etel_ms"] = metrics.median_etel_ms, + result["std_etel_ms"] = metrics.std_etel_ms, + for p, value in metrics.percentiles_etel_ms: + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} ETEL (ms):", value)) + result[f"p{p_word}_etel_ms"] = value + + print("=" * 50) + return result @@ -601,6 +594,8 @@ def main(args: argparse.Namespace): request_rate=args.request_rate, disable_tqdm=args.disable_tqdm, profile=args.profile, + selected_percentile_metrics=args.percentile_metrics.split(","), + selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], )) # Save config and results to json @@ -816,6 +811,24 @@ def main(args: argparse.Namespace): "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" " format.", ) + parser.add_argument( + "--percentile-metrics", + type=str, + default="ttft,tpot,itl", + help="Comma-seperated list of selected metrics to report percentils. " + "This argument specifies the metrics to report percentiles. " + "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"etel\". " + "Default value is \"ttft,tpot,itl\"." + ) + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-seperated list of percentiles for selected metrics. " + "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " + "Default value is \"99\". 
" + "Use \"--percentile-metrics\" to select metrics.", + ) args = parser.parse_args() main(args) From 6899f0cb33d026433efb16bfc651abd72207bdb7 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Sat, 24 Aug 2024 01:06:28 +0000 Subject: [PATCH 3/6] format --- benchmarks/benchmark_serving.py | 47 ++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 992c1980eaeaa..b97f1dee6f527 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -61,15 +61,15 @@ class BenchmarkMetrics: mean_ttft_ms: float median_ttft_ms: float std_ttft_ms: float - percentiles_ttft_ms: List[Tuple[float,float]] + percentiles_ttft_ms: List[Tuple[float, float]] mean_tpot_ms: float median_tpot_ms: float std_tpot_ms: float - percentiles_tpot_ms: List[Tuple[float,float]] + percentiles_tpot_ms: List[Tuple[float, float]] mean_itl_ms: float median_itl_ms: float std_itl_ms: float - percentiles_itl_ms: List[Tuple[float,float]] + percentiles_itl_ms: List[Tuple[float, float]] # ETEL stands for end-to-end latency per request. # It is the time taken on the client side from sending # a request to receiving a complete response. @@ -289,19 +289,23 @@ def calculate_metrics( 1000, # ttfts is empty if streaming is not supported by backend std_ttft_ms=np.std(ttfts or 0) * 1000, median_ttft_ms=np.median(ttfts or 0) * 1000, - percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles], + percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) + for p in selected_percentiles], mean_tpot_ms=np.mean(tpots or 0) * 1000, std_tpot_ms=np.std(tpots or 0) * 1000, median_tpot_ms=np.median(tpots or 0) * 1000, - percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles], + percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) + for p in selected_percentiles], mean_itl_ms=np.mean(itls or 0) * 1000, std_itl_ms=np.std(itls or 0) * 1000, median_itl_ms=np.median(itls or 0) * 1000, - percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles], + percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) + for p in selected_percentiles], mean_etel_ms=np.median(etels or 0) * 1000, std_etel_ms=np.std(etels or 0) * 1000, median_etel_ms=np.mean(etels or 0) * 1000, - percentiles_etel_ms=[(p, np.percentile(etels or 0, p) * 1000) for p in selected_percentiles], + percentiles_etel_ms=[(p, np.percentile(etels or 0, p) * 1000) + for p in selected_percentiles], ) return metrics, actual_output_lens @@ -445,8 +449,10 @@ async def benchmark( if "ttft" in selected_percentile_metrics: print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-')) - print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms)) - print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms)) + print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", + metrics.mean_ttft_ms)) + print("{:<40} {:<10.2f}".format("Median TTFT (ms):", + metrics.median_ttft_ms)) result["mean_ttft_ms"] = metrics.mean_ttft_ms result["median_ttft_ms"] = metrics.median_ttft_ms result["std_ttft_ms"] = metrics.std_ttft_ms @@ -457,10 +463,12 @@ async def benchmark( if "tpot" in selected_percentile_metrics: print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 
1st token)', - n=50, - c='-')) - print("{:<40} {:<10.2f}".format("Median TPOT (ms):", metrics.median_tpot_ms)) - print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms)) + n=50, + c='-')) + print("{:<40} {:<10.2f}".format("Median TPOT (ms):", + metrics.median_tpot_ms)) + print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", + metrics.mean_tpot_ms)) result["mean_tpot_ms"] = metrics.mean_tpot_ms result["median_tpot_m"] = metrics.median_tpot_ms result["std_tpot_ms"] = metrics.std_tpot_ms @@ -472,7 +480,8 @@ async def benchmark( if "itl" in selected_percentile_metrics: print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-')) print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms)) - print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms)) + print("{:<40} {:<10.2f}".format("Median ITL (ms):", + metrics.median_itl_ms)) result["mean_itl_ms"] = metrics.mean_itl_ms, result["median_itl_ms"] = metrics.median_itl_ms, result["std_itl_ms"] = metrics.std_itl_ms, @@ -483,7 +492,8 @@ async def benchmark( if "etel" in selected_percentile_metrics: print("{s:{c}^{n}}".format(s='End-to-end Latency', n=50, c='-')) - print("{:<40} {:<10.2f}".format("Mean ETEL (ms):", metrics.mean_etel_ms)) + print("{:<40} {:<10.2f}".format("Mean ETEL (ms):", + metrics.mean_etel_ms)) print("{:<40} {:<10.2f}".format("Median ETEL (ms):", metrics.median_etel_ms)) result["mean_etel_ms"] = metrics.mean_etel_ms, @@ -595,7 +605,9 @@ def main(args: argparse.Namespace): disable_tqdm=args.disable_tqdm, profile=args.profile, selected_percentile_metrics=args.percentile_metrics.split(","), - selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], + selected_percentiles=[ + float(p) for p in args.metric_percentiles.split(",") + ], )) # Save config and results to json @@ -818,8 +830,7 @@ def main(args: argparse.Namespace): help="Comma-seperated list of selected metrics to report percentils. " "This argument specifies the metrics to report percentiles. " "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"etel\". " - "Default value is \"ttft,tpot,itl\"." 
- ) + "Default value is \"ttft,tpot,itl\".") parser.add_argument( "--metric-percentiles", type=str, From e3ca8f6fbcbdd2f79cd3ffc9dc457fc3daa7c9f8 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Sat, 24 Aug 2024 01:33:48 +0000 Subject: [PATCH 4/6] put repeated lines to function --- benchmarks/benchmark_serving.py | 116 +++++++++++++------------------- 1 file changed, 47 insertions(+), 69 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index b97f1dee6f527..433aeb3cf2d41 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -417,6 +417,20 @@ async def benchmark( selected_percentiles=selected_percentiles, ) + print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", + benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + print("{:<40} {:<10}".format("Total generated tokens:", + metrics.total_output)) + print("{:<40} {:<10.2f}".format("Request throughput (req/s):", + metrics.request_throughput)) + print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):", + metrics.input_throughput)) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", + metrics.output_throughput)) + result = { "duration": benchmark_duration, "completed": metrics.completed, @@ -433,76 +447,40 @@ async def benchmark( "errors": [output.error for output in outputs], } - print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) - print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", - benchmark_duration)) - print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - print("{:<40} {:<10}".format("Total generated tokens:", - metrics.total_output)) - print("{:<40} {:<10.2f}".format("Request throughput (req/s):", - metrics.request_throughput)) - print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):", - metrics.input_throughput)) - print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", - metrics.output_throughput)) - - if "ttft" in selected_percentile_metrics: - print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-')) - print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", - metrics.mean_ttft_ms)) - print("{:<40} {:<10.2f}".format("Median TTFT (ms):", - metrics.median_ttft_ms)) - result["mean_ttft_ms"] = metrics.mean_ttft_ms - result["median_ttft_ms"] = metrics.median_ttft_ms - result["std_ttft_ms"] = metrics.std_ttft_ms - for p, value in metrics.percentiles_ttft_ms: - p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} TTFT (ms):", value)) - result[f"p{p_word}_ttft_ms"] = value - - if "tpot" in selected_percentile_metrics: - print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 
1st token)', - n=50, - c='-')) - print("{:<40} {:<10.2f}".format("Median TPOT (ms):", - metrics.median_tpot_ms)) - print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", - metrics.mean_tpot_ms)) - result["mean_tpot_ms"] = metrics.mean_tpot_ms - result["median_tpot_m"] = metrics.median_tpot_ms - result["std_tpot_ms"] = metrics.std_tpot_ms - for p, value in metrics.percentiles_tpot_ms: - p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} TPOT (ms):", value)) - result[f"p{p_word}_tpot_ms"] = value - - if "itl" in selected_percentile_metrics: - print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-')) - print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms)) - print("{:<40} {:<10.2f}".format("Median ITL (ms):", - metrics.median_itl_ms)) - result["mean_itl_ms"] = metrics.mean_itl_ms, - result["median_itl_ms"] = metrics.median_itl_ms, - result["std_itl_ms"] = metrics.std_itl_ms, - for p, value in metrics.percentiles_itl_ms: - p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} ITL (ms):", value)) - result[f"p{p_word}_itl_ms"] = value - - if "etel" in selected_percentile_metrics: - print("{s:{c}^{n}}".format(s='End-to-end Latency', n=50, c='-')) - print("{:<40} {:<10.2f}".format("Mean ETEL (ms):", - metrics.mean_etel_ms)) - print("{:<40} {:<10.2f}".format("Median ETEL (ms):", - metrics.median_etel_ms)) - result["mean_etel_ms"] = metrics.mean_etel_ms, - result["median_etel_ms"] = metrics.median_etel_ms, - result["std_etel_ms"] = metrics.std_etel_ms, - for p, value in metrics.percentiles_etel_ms: + def process_one_metric( + # E.g., "ttft" + metric_attribute_name: str, + # E.g., "TTFT" + metric_name: str, + # E.g., "Time to First Token" + metric_header: str, + ): + if metric_attribute_name not in selected_percentile_metrics: + return + print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) + print("{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"))) + print("{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"))) + result[f"mean_{metric_attribute_name}_ms"] = getattr( + metrics, f"mean_{metric_attribute_name}_ms") + result[f"median_{metric_attribute_name}_ms"] = getattr( + metrics, f"median_{metric_attribute_name}_ms") + result[f"std_{metric_attribute_name}_ms"] = getattr( + metrics, f"std_{metric_attribute_name}_ms") + for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} ETEL (ms):", value)) - result[f"p{p_word}_etel_ms"] = value + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", + value)) + result[f"p{p_word}_{metric_attribute_name}_ms"] = value + + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric("tpot", "TPOT", + "Time per Output Token (excl. 
1st token)")
+    process_one_metric("itl", "ITL", "Inter-token Latency")
+    process_one_metric("etel", "ETEL", "End-to-end Latency")
 
     print("=" * 50)
 

From 2e48973cbbb807a37c7d8b986149d464bd1e175d Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Sat, 24 Aug 2024 03:11:10 +0000
Subject: [PATCH 5/6] format

---
 benchmarks/benchmark_serving.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 433aeb3cf2d41..93ced812fcda8 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -455,6 +455,8 @@ def process_one_metric(
         # E.g., "Time to First Token"
         metric_header: str,
     ):
+        # This function prints and adds statistics of the specified
+        # metric.
         if metric_attribute_name not in selected_percentile_metrics:
             return
         print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
@@ -470,7 +472,8 @@ def process_one_metric(
             metrics, f"median_{metric_attribute_name}_ms")
         result[f"std_{metric_attribute_name}_ms"] = getattr(
             metrics, f"std_{metric_attribute_name}_ms")
-        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
+        for p, value in getattr(metrics,
+                                f"percentiles_{metric_attribute_name}_ms"):
             p_word = str(int(p)) if int(p) == p else str(p)
             print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
                                             value))

From e494450e17167fd86fb8fbb84f2e0e2c18bcea68 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Sat, 24 Aug 2024 04:22:17 +0000
Subject: [PATCH 6/6] etel -> e2el

---
 benchmarks/benchmark_serving.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 93ced812fcda8..e38ceaa222956 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -70,13 +70,13 @@ class BenchmarkMetrics:
     median_itl_ms: float
     std_itl_ms: float
     percentiles_itl_ms: List[Tuple[float, float]]
-    # ETEL stands for end-to-end latency per request.
+    # E2EL stands for end-to-end latency per request.
     # It is the time taken on the client side from sending
     # a request to receiving a complete response.
-    mean_etel_ms: float
-    median_etel_ms: float
-    std_etel_ms: float
-    percentiles_etel_ms: List[Tuple[float, float]]
+    mean_e2el_ms: float
+    median_e2el_ms: float
+    std_e2el_ms: float
+    percentiles_e2el_ms: List[Tuple[float, float]]
 
 
 def sample_sharegpt_requests(
@@ -251,7 +251,7 @@ def calculate_metrics(
     itls: List[float] = []
     tpots: List[float] = []
     ttfts: List[float] = []
-    etels: List[float] = []
+    e2els: List[float] = []
    for i in range(len(outputs)):
        if outputs[i].success:
            # We use the tokenizer to count the number of output tokens for all
@@ -268,7 +268,7 @@ def calculate_metrics(
                              (outputs[i].latency - outputs[i].ttft) /
                              (output_len - 1))
            itls += outputs[i].itl
            ttfts.append(outputs[i].ttft)
-            etels.append(outputs[i].latency)
+            e2els.append(outputs[i].latency)
            completed += 1
        else:
            actual_output_lens.append(0)
@@ -301,10 +301,10 @@ def calculate_metrics(
        median_itl_ms=np.median(itls or 0) * 1000,
        percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
                            for p in selected_percentiles],
-        mean_etel_ms=np.median(etels or 0) * 1000,
-        std_etel_ms=np.std(etels or 0) * 1000,
-        median_etel_ms=np.mean(etels or 0) * 1000,
-        percentiles_etel_ms=[(p, np.percentile(etels or 0, p) * 1000)
+        mean_e2el_ms=np.mean(e2els or 0) * 1000,
+        std_e2el_ms=np.std(e2els or 0) * 1000,
+        median_e2el_ms=np.median(e2els or 0) * 1000,
+        percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
                             for p in selected_percentiles],
    )
 
@@ -483,7 +483,7 @@ def process_one_metric(
    process_one_metric("tpot", "TPOT",
                       "Time per Output Token (excl. 1st token)")
    process_one_metric("itl", "ITL", "Inter-token Latency")
-    process_one_metric("etel", "ETEL", "End-to-end Latency")
+    process_one_metric("e2el", "E2EL", "End-to-end Latency")
 
    print("=" * 50)
 
@@ -810,7 +810,7 @@ def main(args: argparse.Namespace):
        default="ttft,tpot,itl",
        help="Comma-seperated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
-        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"etel\". "
+        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
        "Default value is \"ttft,tpot,itl\".")
    parser.add_argument(
        "--metric-percentiles",