From 1b4f73b14d4ad110254461eec7a327b1063289ce Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Wed, 21 Aug 2024 21:01:28 +0000 Subject: [PATCH 1/6] Add more percentiles and latencies Drop a line lint --- benchmarks/benchmark_serving.py | 67 +++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index fe687da492901..0b35c1cdfa17f 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -61,15 +61,31 @@ class BenchmarkMetrics: mean_ttft_ms: float median_ttft_ms: float std_ttft_ms: float + p25_ttft_ms: float + p75_ttft_ms: float + p95_ttft_ms: float p99_ttft_ms: float mean_tpot_ms: float median_tpot_ms: float std_tpot_ms: float + p25_tpot_ms: float + p75_tpot_ms: float + p95_tpot_ms: float p99_tpot_ms: float mean_itl_ms: float median_itl_ms: float std_itl_ms: float + p25_itl_ms: float + p75_itl_ms: float + p95_itl_ms: float p99_itl_ms: float + mean_latency_ms: float + median_latency_ms: float + std_latency_ms: float + p25_latency_ms: float + p75_latency_ms: float + p95_latency_ms: float + p99_latency_ms: float def sample_sharegpt_requests( @@ -242,6 +258,7 @@ def calculate_metrics( itls: List[float] = [] tpots: List[float] = [] ttfts: List[float] = [] + latencies: List[float] = [] for i in range(len(outputs)): if outputs[i].success: # We use the tokenizer to count the number of output tokens for all @@ -258,6 +275,7 @@ def calculate_metrics( (outputs[i].latency - outputs[i].ttft) / (output_len - 1)) itls += outputs[i].itl ttfts.append(outputs[i].ttft) + latencies.append(outputs[i].latency) completed += 1 else: actual_output_lens.append(0) @@ -276,17 +294,33 @@ def calculate_metrics( output_throughput=sum(actual_output_lens) / dur_s, mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend - median_ttft_ms=np.median(ttfts or 0) * 1000, std_ttft_ms=np.std(ttfts or 0) * 1000, + p25_ttft_ms=np.percentile(ttfts or 0, 25) * 1000, + median_ttft_ms=np.median(ttfts or 0) * 1000, + p75_ttft_ms=np.percentile(ttfts or 0, 75) * 1000, + p95_ttft_ms=np.percentile(ttfts or 0, 95) * 1000, p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000, mean_tpot_ms=np.mean(tpots or 0) * 1000, - median_tpot_ms=np.median(tpots or 0) * 1000, std_tpot_ms=np.std(tpots or 0) * 1000, + p25_tpot_ms=np.percentile(tpots or 0, 25) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + p75_tpot_ms=np.percentile(tpots or 0, 75) * 1000, + p95_tpot_ms=np.percentile(tpots or 0, 95) * 1000, p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000, mean_itl_ms=np.mean(itls or 0) * 1000, - median_itl_ms=np.median(itls or 0) * 1000, std_itl_ms=np.std(itls or 0) * 1000, + p25_itl_ms=np.percentile(itls or 0, 25) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + p75_itl_ms=np.percentile(itls or 0, 75) * 1000, + p95_itl_ms=np.percentile(itls or 0, 95) * 1000, p99_itl_ms=np.percentile(itls or 0, 99) * 1000, + mean_latency_ms=np.median(latencies or 0) * 1000, + std_latency_ms=np.std(latencies or 0) * 1000, + p25_latency_ms=np.percentile(latencies or 0, 25) * 1000, + median_latency_ms=np.mean(latencies or 0) * 1000, + p75_latency_ms=np.percentile(latencies or 0, 75) * 1000, + p95_latency_ms=np.percentile(latencies or 0, 95) * 1000, + p99_latency_ms=np.percentile(latencies or 0, 99) * 1000, ) return metrics, actual_output_lens @@ -409,20 +443,37 @@ async def benchmark( metrics.output_throughput)) print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-')) print("{:<40} 
{:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms)) - print("{:<40} {:<10.2f}".format("Median TTFT (ms):", - metrics.median_ttft_ms)) + print("{:<40} {:<10.2f}".format("P25 TTFT (ms):", metrics.p25_ttft_ms)) + print("{:<40} {:<10.2f}".format("P75 TTFT (ms):", metrics.p75_ttft_ms)) + print("{:<40} {:<10.2f}".format("p50 TTFT (ms):", metrics.median_ttft_ms)) + print("{:<40} {:<10.2f}".format("P95 TTFT (ms):", metrics.p95_ttft_ms)) print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms)) print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)', n=50, c='-')) print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms)) - print("{:<40} {:<10.2f}".format("Median TPOT (ms):", - metrics.median_tpot_ms)) + print("{:<40} {:<10.2f}".format("P25 TPOT (ms):", metrics.p25_tpot_ms)) + print("{:<40} {:<10.2f}".format("P50 TPOT (ms):", metrics.median_tpot_ms)) + print("{:<40} {:<10.2f}".format("P75 TPOT (ms):", metrics.p75_tpot_ms)) + print("{:<40} {:<10.2f}".format("P95 TPOT (ms):", metrics.p95_tpot_ms)) print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms)) + print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-')) print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms)) - print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms)) + print("{:<40} {:<10.2f}".format("P25 ITL (ms):", metrics.p25_itl_ms)) + print("{:<40} {:<10.2f}".format("P50 ITL (ms):", metrics.median_itl_ms)) + print("{:<40} {:<10.2f}".format("P75 ITL (ms):", metrics.p75_itl_ms)) + print("{:<40} {:<10.2f}".format("P95 ITL (ms):", metrics.p95_itl_ms)) print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms)) + + print("{s:{c}^{n}}".format(s='End-to-end Latency', n=50, c='-')) + print("{:<40} {:<10.2f}".format("Mean EEL (ms):", metrics.mean_latency_ms)) + print("{:<40} {:<10.2f}".format("P25 EEL (ms):", metrics.p25_latency_ms)) + print("{:<40} {:<10.2f}".format("P50 EEL (ms):", + metrics.median_latency_ms)) + print("{:<40} {:<10.2f}".format("P75 EEL (ms):", metrics.p75_latency_ms)) + print("{:<40} {:<10.2f}".format("P95 EEL (ms):", metrics.p95_latency_ms)) + print("{:<40} {:<10.2f}".format("P99 EEL (ms):", metrics.p99_latency_ms)) print("=" * 50) result = { From e876a0d9548a2b9057216748a12c6f8d0f2495b8 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Sat, 24 Aug 2024 00:57:05 +0000 Subject: [PATCH 2/6] Flexiable way to select some metrics --- benchmarks/benchmark_serving.py | 213 +++++++++++++++++--------------- 1 file changed, 113 insertions(+), 100 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 0b35c1cdfa17f..992c1980eaeaa 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -61,31 +61,22 @@ class BenchmarkMetrics: mean_ttft_ms: float median_ttft_ms: float std_ttft_ms: float - p25_ttft_ms: float - p75_ttft_ms: float - p95_ttft_ms: float - p99_ttft_ms: float + percentiles_ttft_ms: List[Tuple[float,float]] mean_tpot_ms: float median_tpot_ms: float std_tpot_ms: float - p25_tpot_ms: float - p75_tpot_ms: float - p95_tpot_ms: float - p99_tpot_ms: float + percentiles_tpot_ms: List[Tuple[float,float]] mean_itl_ms: float median_itl_ms: float std_itl_ms: float - p25_itl_ms: float - p75_itl_ms: float - p95_itl_ms: float - p99_itl_ms: float - mean_latency_ms: float - median_latency_ms: float - std_latency_ms: float - p25_latency_ms: float - p75_latency_ms: float - p95_latency_ms: float - p99_latency_ms: float + percentiles_itl_ms: 
List[Tuple[float,float]] + # ETEL stands for end-to-end latency per request. + # It is the time taken on the client side from sending + # a request to receiving a complete response. + mean_etel_ms: float + median_etel_ms: float + std_etel_ms: float + percentiles_etel_ms: List[Tuple[float, float]] def sample_sharegpt_requests( @@ -251,6 +242,8 @@ def calculate_metrics( outputs: List[RequestFuncOutput], dur_s: float, tokenizer: PreTrainedTokenizerBase, + selected_percentile_metrics: List[str], + selected_percentiles: List[float], ) -> Tuple[BenchmarkMetrics, List[int]]: actual_output_lens: List[int] = [] total_input = 0 @@ -258,7 +251,7 @@ def calculate_metrics( itls: List[float] = [] tpots: List[float] = [] ttfts: List[float] = [] - latencies: List[float] = [] + etels: List[float] = [] for i in range(len(outputs)): if outputs[i].success: # We use the tokenizer to count the number of output tokens for all @@ -275,7 +268,7 @@ def calculate_metrics( (outputs[i].latency - outputs[i].ttft) / (output_len - 1)) itls += outputs[i].itl ttfts.append(outputs[i].ttft) - latencies.append(outputs[i].latency) + etels.append(outputs[i].latency) completed += 1 else: actual_output_lens.append(0) @@ -295,32 +288,20 @@ def calculate_metrics( mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend std_ttft_ms=np.std(ttfts or 0) * 1000, - p25_ttft_ms=np.percentile(ttfts or 0, 25) * 1000, median_ttft_ms=np.median(ttfts or 0) * 1000, - p75_ttft_ms=np.percentile(ttfts or 0, 75) * 1000, - p95_ttft_ms=np.percentile(ttfts or 0, 95) * 1000, - p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000, + percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles], mean_tpot_ms=np.mean(tpots or 0) * 1000, std_tpot_ms=np.std(tpots or 0) * 1000, - p25_tpot_ms=np.percentile(tpots or 0, 25) * 1000, median_tpot_ms=np.median(tpots or 0) * 1000, - p75_tpot_ms=np.percentile(tpots or 0, 75) * 1000, - p95_tpot_ms=np.percentile(tpots or 0, 95) * 1000, - p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000, + percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles], mean_itl_ms=np.mean(itls or 0) * 1000, std_itl_ms=np.std(itls or 0) * 1000, - p25_itl_ms=np.percentile(itls or 0, 25) * 1000, median_itl_ms=np.median(itls or 0) * 1000, - p75_itl_ms=np.percentile(itls or 0, 75) * 1000, - p95_itl_ms=np.percentile(itls or 0, 95) * 1000, - p99_itl_ms=np.percentile(itls or 0, 99) * 1000, - mean_latency_ms=np.median(latencies or 0) * 1000, - std_latency_ms=np.std(latencies or 0) * 1000, - p25_latency_ms=np.percentile(latencies or 0, 25) * 1000, - median_latency_ms=np.mean(latencies or 0) * 1000, - p75_latency_ms=np.percentile(latencies or 0, 75) * 1000, - p95_latency_ms=np.percentile(latencies or 0, 95) * 1000, - p99_latency_ms=np.percentile(latencies or 0, 99) * 1000, + percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles], + mean_etel_ms=np.median(etels or 0) * 1000, + std_etel_ms=np.std(etels or 0) * 1000, + median_etel_ms=np.mean(etels or 0) * 1000, + percentiles_etel_ms=[(p, np.percentile(etels or 0, p) * 1000) for p in selected_percentiles], ) return metrics, actual_output_lens @@ -338,6 +319,8 @@ async def benchmark( request_rate: float, disable_tqdm: bool, profile: bool, + selected_percentile_metrics: List[str], + selected_percentiles: List[str], ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -426,56 +409,10 @@ async def benchmark( outputs=outputs, 
dur_s=benchmark_duration, tokenizer=tokenizer, + selected_percentile_metrics=selected_percentile_metrics, + selected_percentiles=selected_percentiles, ) - print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) - print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", - benchmark_duration)) - print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - print("{:<40} {:<10}".format("Total generated tokens:", - metrics.total_output)) - print("{:<40} {:<10.2f}".format("Request throughput (req/s):", - metrics.request_throughput)) - print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):", - metrics.input_throughput)) - print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", - metrics.output_throughput)) - print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-')) - print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms)) - print("{:<40} {:<10.2f}".format("P25 TTFT (ms):", metrics.p25_ttft_ms)) - print("{:<40} {:<10.2f}".format("P75 TTFT (ms):", metrics.p75_ttft_ms)) - print("{:<40} {:<10.2f}".format("p50 TTFT (ms):", metrics.median_ttft_ms)) - print("{:<40} {:<10.2f}".format("P95 TTFT (ms):", metrics.p95_ttft_ms)) - print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms)) - print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)', - n=50, - c='-')) - print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms)) - print("{:<40} {:<10.2f}".format("P25 TPOT (ms):", metrics.p25_tpot_ms)) - print("{:<40} {:<10.2f}".format("P50 TPOT (ms):", metrics.median_tpot_ms)) - print("{:<40} {:<10.2f}".format("P75 TPOT (ms):", metrics.p75_tpot_ms)) - print("{:<40} {:<10.2f}".format("P95 TPOT (ms):", metrics.p95_tpot_ms)) - print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms)) - - print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-')) - print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms)) - print("{:<40} {:<10.2f}".format("P25 ITL (ms):", metrics.p25_itl_ms)) - print("{:<40} {:<10.2f}".format("P50 ITL (ms):", metrics.median_itl_ms)) - print("{:<40} {:<10.2f}".format("P75 ITL (ms):", metrics.p75_itl_ms)) - print("{:<40} {:<10.2f}".format("P95 ITL (ms):", metrics.p95_itl_ms)) - print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms)) - - print("{s:{c}^{n}}".format(s='End-to-end Latency', n=50, c='-')) - print("{:<40} {:<10.2f}".format("Mean EEL (ms):", metrics.mean_latency_ms)) - print("{:<40} {:<10.2f}".format("P25 EEL (ms):", metrics.p25_latency_ms)) - print("{:<40} {:<10.2f}".format("P50 EEL (ms):", - metrics.median_latency_ms)) - print("{:<40} {:<10.2f}".format("P75 EEL (ms):", metrics.p75_latency_ms)) - print("{:<40} {:<10.2f}".format("P95 EEL (ms):", metrics.p95_latency_ms)) - print("{:<40} {:<10.2f}".format("P99 EEL (ms):", metrics.p99_latency_ms)) - print("=" * 50) - result = { "duration": benchmark_duration, "completed": metrics.completed, @@ -484,18 +421,6 @@ async def benchmark( "request_throughput": metrics.request_throughput, "input_throughput": metrics.input_throughput, "output_throughput": metrics.output_throughput, - "mean_ttft_ms": metrics.mean_ttft_ms, - "median_ttft_ms": metrics.median_ttft_ms, - "std_ttft_ms": metrics.std_ttft_ms, - "p99_ttft_ms": metrics.p99_ttft_ms, - "mean_tpot_ms": metrics.mean_tpot_ms, - "median_tpot_ms": metrics.median_tpot_ms, - "std_tpot_ms": metrics.std_tpot_ms, - "p99_tpot_ms": metrics.p99_tpot_ms, - "mean_itl_ms": 
metrics.mean_itl_ms, - "median_itl_ms": metrics.median_itl_ms, - "std_itl_ms": metrics.std_itl_ms, - "p99_itl_ms": metrics.p99_itl_ms, "input_lens": [output.prompt_len for output in outputs], "output_lens": actual_output_lens, "ttfts": [output.ttft for output in outputs], @@ -503,6 +428,74 @@ async def benchmark( "generated_texts": [output.generated_text for output in outputs], "errors": [output.error for output in outputs], } + + print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", + benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + print("{:<40} {:<10}".format("Total generated tokens:", + metrics.total_output)) + print("{:<40} {:<10.2f}".format("Request throughput (req/s):", + metrics.request_throughput)) + print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):", + metrics.input_throughput)) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", + metrics.output_throughput)) + + if "ttft" in selected_percentile_metrics: + print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-')) + print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms)) + print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms)) + result["mean_ttft_ms"] = metrics.mean_ttft_ms + result["median_ttft_ms"] = metrics.median_ttft_ms + result["std_ttft_ms"] = metrics.std_ttft_ms + for p, value in metrics.percentiles_ttft_ms: + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} TTFT (ms):", value)) + result[f"p{p_word}_ttft_ms"] = value + + if "tpot" in selected_percentile_metrics: + print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 
1st token)', + n=50, + c='-')) + print("{:<40} {:<10.2f}".format("Median TPOT (ms):", metrics.median_tpot_ms)) + print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms)) + result["mean_tpot_ms"] = metrics.mean_tpot_ms + result["median_tpot_m"] = metrics.median_tpot_ms + result["std_tpot_ms"] = metrics.std_tpot_ms + for p, value in metrics.percentiles_tpot_ms: + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} TPOT (ms):", value)) + result[f"p{p_word}_tpot_ms"] = value + + if "itl" in selected_percentile_metrics: + print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-')) + print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms)) + print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms)) + result["mean_itl_ms"] = metrics.mean_itl_ms, + result["median_itl_ms"] = metrics.median_itl_ms, + result["std_itl_ms"] = metrics.std_itl_ms, + for p, value in metrics.percentiles_itl_ms: + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} ITL (ms):", value)) + result[f"p{p_word}_itl_ms"] = value + + if "etel" in selected_percentile_metrics: + print("{s:{c}^{n}}".format(s='End-to-end Latency', n=50, c='-')) + print("{:<40} {:<10.2f}".format("Mean ETEL (ms):", metrics.mean_etel_ms)) + print("{:<40} {:<10.2f}".format("Median ETEL (ms):", + metrics.median_etel_ms)) + result["mean_etel_ms"] = metrics.mean_etel_ms, + result["median_etel_ms"] = metrics.median_etel_ms, + result["std_etel_ms"] = metrics.std_etel_ms, + for p, value in metrics.percentiles_etel_ms: + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} ETEL (ms):", value)) + result[f"p{p_word}_etel_ms"] = value + + print("=" * 50) + return result @@ -601,6 +594,8 @@ def main(args: argparse.Namespace): request_rate=args.request_rate, disable_tqdm=args.disable_tqdm, profile=args.profile, + selected_percentile_metrics=args.percentile_metrics.split(","), + selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], )) # Save config and results to json @@ -816,6 +811,24 @@ def main(args: argparse.Namespace): "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" " format.", ) + parser.add_argument( + "--percentile-metrics", + type=str, + default="ttft,tpot,itl", + help="Comma-seperated list of selected metrics to report percentils. " + "This argument specifies the metrics to report percentiles. " + "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"etel\". " + "Default value is \"ttft,tpot,itl\"." + ) + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-seperated list of percentiles for selected metrics. " + "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " + "Default value is \"99\". 
" + "Use \"--percentile-metrics\" to select metrics.", + ) args = parser.parse_args() main(args) From 6899f0cb33d026433efb16bfc651abd72207bdb7 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Sat, 24 Aug 2024 01:06:28 +0000 Subject: [PATCH 3/6] format --- benchmarks/benchmark_serving.py | 47 ++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 992c1980eaeaa..b97f1dee6f527 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -61,15 +61,15 @@ class BenchmarkMetrics: mean_ttft_ms: float median_ttft_ms: float std_ttft_ms: float - percentiles_ttft_ms: List[Tuple[float,float]] + percentiles_ttft_ms: List[Tuple[float, float]] mean_tpot_ms: float median_tpot_ms: float std_tpot_ms: float - percentiles_tpot_ms: List[Tuple[float,float]] + percentiles_tpot_ms: List[Tuple[float, float]] mean_itl_ms: float median_itl_ms: float std_itl_ms: float - percentiles_itl_ms: List[Tuple[float,float]] + percentiles_itl_ms: List[Tuple[float, float]] # ETEL stands for end-to-end latency per request. # It is the time taken on the client side from sending # a request to receiving a complete response. @@ -289,19 +289,23 @@ def calculate_metrics( 1000, # ttfts is empty if streaming is not supported by backend std_ttft_ms=np.std(ttfts or 0) * 1000, median_ttft_ms=np.median(ttfts or 0) * 1000, - percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles], + percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) + for p in selected_percentiles], mean_tpot_ms=np.mean(tpots or 0) * 1000, std_tpot_ms=np.std(tpots or 0) * 1000, median_tpot_ms=np.median(tpots or 0) * 1000, - percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles], + percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) + for p in selected_percentiles], mean_itl_ms=np.mean(itls or 0) * 1000, std_itl_ms=np.std(itls or 0) * 1000, median_itl_ms=np.median(itls or 0) * 1000, - percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles], + percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) + for p in selected_percentiles], mean_etel_ms=np.median(etels or 0) * 1000, std_etel_ms=np.std(etels or 0) * 1000, median_etel_ms=np.mean(etels or 0) * 1000, - percentiles_etel_ms=[(p, np.percentile(etels or 0, p) * 1000) for p in selected_percentiles], + percentiles_etel_ms=[(p, np.percentile(etels or 0, p) * 1000) + for p in selected_percentiles], ) return metrics, actual_output_lens @@ -445,8 +449,10 @@ async def benchmark( if "ttft" in selected_percentile_metrics: print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-')) - print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms)) - print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms)) + print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", + metrics.mean_ttft_ms)) + print("{:<40} {:<10.2f}".format("Median TTFT (ms):", + metrics.median_ttft_ms)) result["mean_ttft_ms"] = metrics.mean_ttft_ms result["median_ttft_ms"] = metrics.median_ttft_ms result["std_ttft_ms"] = metrics.std_ttft_ms @@ -457,10 +463,12 @@ async def benchmark( if "tpot" in selected_percentile_metrics: print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 
1st token)', - n=50, - c='-')) - print("{:<40} {:<10.2f}".format("Median TPOT (ms):", metrics.median_tpot_ms)) - print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms)) + n=50, + c='-')) + print("{:<40} {:<10.2f}".format("Median TPOT (ms):", + metrics.median_tpot_ms)) + print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", + metrics.mean_tpot_ms)) result["mean_tpot_ms"] = metrics.mean_tpot_ms result["median_tpot_m"] = metrics.median_tpot_ms result["std_tpot_ms"] = metrics.std_tpot_ms @@ -472,7 +480,8 @@ async def benchmark( if "itl" in selected_percentile_metrics: print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-')) print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms)) - print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms)) + print("{:<40} {:<10.2f}".format("Median ITL (ms):", + metrics.median_itl_ms)) result["mean_itl_ms"] = metrics.mean_itl_ms, result["median_itl_ms"] = metrics.median_itl_ms, result["std_itl_ms"] = metrics.std_itl_ms, @@ -483,7 +492,8 @@ async def benchmark( if "etel" in selected_percentile_metrics: print("{s:{c}^{n}}".format(s='End-to-end Latency', n=50, c='-')) - print("{:<40} {:<10.2f}".format("Mean ETEL (ms):", metrics.mean_etel_ms)) + print("{:<40} {:<10.2f}".format("Mean ETEL (ms):", + metrics.mean_etel_ms)) print("{:<40} {:<10.2f}".format("Median ETEL (ms):", metrics.median_etel_ms)) result["mean_etel_ms"] = metrics.mean_etel_ms, @@ -595,7 +605,9 @@ def main(args: argparse.Namespace): disable_tqdm=args.disable_tqdm, profile=args.profile, selected_percentile_metrics=args.percentile_metrics.split(","), - selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], + selected_percentiles=[ + float(p) for p in args.metric_percentiles.split(",") + ], )) # Save config and results to json @@ -818,8 +830,7 @@ def main(args: argparse.Namespace): help="Comma-seperated list of selected metrics to report percentils. " "This argument specifies the metrics to report percentiles. " "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"etel\". " - "Default value is \"ttft,tpot,itl\"." 
- ) + "Default value is \"ttft,tpot,itl\".") parser.add_argument( "--metric-percentiles", type=str, From e3ca8f6fbcbdd2f79cd3ffc9dc457fc3daa7c9f8 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Sat, 24 Aug 2024 01:33:48 +0000 Subject: [PATCH 4/6] put repeated lines to function --- benchmarks/benchmark_serving.py | 116 +++++++++++++------------------- 1 file changed, 47 insertions(+), 69 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index b97f1dee6f527..433aeb3cf2d41 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -417,6 +417,20 @@ async def benchmark( selected_percentiles=selected_percentiles, ) + print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", + benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + print("{:<40} {:<10}".format("Total generated tokens:", + metrics.total_output)) + print("{:<40} {:<10.2f}".format("Request throughput (req/s):", + metrics.request_throughput)) + print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):", + metrics.input_throughput)) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", + metrics.output_throughput)) + result = { "duration": benchmark_duration, "completed": metrics.completed, @@ -433,76 +447,40 @@ async def benchmark( "errors": [output.error for output in outputs], } - print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) - print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", - benchmark_duration)) - print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - print("{:<40} {:<10}".format("Total generated tokens:", - metrics.total_output)) - print("{:<40} {:<10.2f}".format("Request throughput (req/s):", - metrics.request_throughput)) - print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):", - metrics.input_throughput)) - print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", - metrics.output_throughput)) - - if "ttft" in selected_percentile_metrics: - print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-')) - print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", - metrics.mean_ttft_ms)) - print("{:<40} {:<10.2f}".format("Median TTFT (ms):", - metrics.median_ttft_ms)) - result["mean_ttft_ms"] = metrics.mean_ttft_ms - result["median_ttft_ms"] = metrics.median_ttft_ms - result["std_ttft_ms"] = metrics.std_ttft_ms - for p, value in metrics.percentiles_ttft_ms: - p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} TTFT (ms):", value)) - result[f"p{p_word}_ttft_ms"] = value - - if "tpot" in selected_percentile_metrics: - print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 
1st token)', - n=50, - c='-')) - print("{:<40} {:<10.2f}".format("Median TPOT (ms):", - metrics.median_tpot_ms)) - print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", - metrics.mean_tpot_ms)) - result["mean_tpot_ms"] = metrics.mean_tpot_ms - result["median_tpot_m"] = metrics.median_tpot_ms - result["std_tpot_ms"] = metrics.std_tpot_ms - for p, value in metrics.percentiles_tpot_ms: - p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} TPOT (ms):", value)) - result[f"p{p_word}_tpot_ms"] = value - - if "itl" in selected_percentile_metrics: - print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-')) - print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms)) - print("{:<40} {:<10.2f}".format("Median ITL (ms):", - metrics.median_itl_ms)) - result["mean_itl_ms"] = metrics.mean_itl_ms, - result["median_itl_ms"] = metrics.median_itl_ms, - result["std_itl_ms"] = metrics.std_itl_ms, - for p, value in metrics.percentiles_itl_ms: - p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} ITL (ms):", value)) - result[f"p{p_word}_itl_ms"] = value - - if "etel" in selected_percentile_metrics: - print("{s:{c}^{n}}".format(s='End-to-end Latency', n=50, c='-')) - print("{:<40} {:<10.2f}".format("Mean ETEL (ms):", - metrics.mean_etel_ms)) - print("{:<40} {:<10.2f}".format("Median ETEL (ms):", - metrics.median_etel_ms)) - result["mean_etel_ms"] = metrics.mean_etel_ms, - result["median_etel_ms"] = metrics.median_etel_ms, - result["std_etel_ms"] = metrics.std_etel_ms, - for p, value in metrics.percentiles_etel_ms: + def process_one_metric( + # E.g., "ttft" + metric_attribute_name: str, + # E.g., "TTFT" + metric_name: str, + # E.g., "Time to First Token" + metric_header: str, + ): + if metric_attribute_name not in selected_percentile_metrics: + return + print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) + print("{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"))) + print("{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"))) + result[f"mean_{metric_attribute_name}_ms"] = getattr( + metrics, f"mean_{metric_attribute_name}_ms") + result[f"median_{metric_attribute_name}_ms"] = getattr( + metrics, f"median_{metric_attribute_name}_ms") + result[f"std_{metric_attribute_name}_ms"] = getattr( + metrics, f"std_{metric_attribute_name}_ms") + for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} ETEL (ms):", value)) - result[f"p{p_word}_etel_ms"] = value + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", + value)) + result[f"p{p_word}_{metric_attribute_name}_ms"] = value + + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric("tpot", "TPOT", + "Time per Output Token (excl. 
1st token)")
+    process_one_metric("itl", "ITL", "Inter-token Latency")
+    process_one_metric("etel", "ETEL", "End-to-end Latency")
 
     print("=" * 50)
 

From 2e48973cbbb807a37c7d8b986149d464bd1e175d Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Sat, 24 Aug 2024 03:11:10 +0000
Subject: [PATCH 5/6] format

---
 benchmarks/benchmark_serving.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 433aeb3cf2d41..93ced812fcda8 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -455,6 +455,8 @@ def process_one_metric(
         # E.g., "Time to First Token"
         metric_header: str,
     ):
+        # This function prints and adds statistics of the specified
+        # metric.
         if metric_attribute_name not in selected_percentile_metrics:
             return
         print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
@@ -470,7 +472,8 @@ def process_one_metric(
             metrics, f"median_{metric_attribute_name}_ms")
         result[f"std_{metric_attribute_name}_ms"] = getattr(
             metrics, f"std_{metric_attribute_name}_ms")
-        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
+        for p, value in getattr(metrics,
+                                f"percentiles_{metric_attribute_name}_ms"):
             p_word = str(int(p)) if int(p) == p else str(p)
             print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
                                             value))

From e494450e17167fd86fb8fbb84f2e0e2c18bcea68 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Sat, 24 Aug 2024 04:22:17 +0000
Subject: [PATCH 6/6] etel -> e2el

---
 benchmarks/benchmark_serving.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 93ced812fcda8..e38ceaa222956 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -70,13 +70,13 @@ class BenchmarkMetrics:
     median_itl_ms: float
     std_itl_ms: float
     percentiles_itl_ms: List[Tuple[float, float]]
-    # ETEL stands for end-to-end latency per request.
+    # E2EL stands for end-to-end latency per request.
     # It is the time taken on the client side from sending
     # a request to receiving a complete response.
-    mean_etel_ms: float
-    median_etel_ms: float
-    std_etel_ms: float
-    percentiles_etel_ms: List[Tuple[float, float]]
+    mean_e2el_ms: float
+    median_e2el_ms: float
+    std_e2el_ms: float
+    percentiles_e2el_ms: List[Tuple[float, float]]
 
 
 def sample_sharegpt_requests(
@@ -251,7 +251,7 @@ def calculate_metrics(
     itls: List[float] = []
     tpots: List[float] = []
     ttfts: List[float] = []
-    etels: List[float] = []
+    e2els: List[float] = []
    for i in range(len(outputs)):
        if outputs[i].success:
            # We use the tokenizer to count the number of output tokens for all
@@ -268,7 +268,7 @@ def calculate_metrics(
                              (outputs[i].latency - outputs[i].ttft) /
                              (output_len - 1))
            itls += outputs[i].itl
            ttfts.append(outputs[i].ttft)
-            etels.append(outputs[i].latency)
+            e2els.append(outputs[i].latency)
            completed += 1
        else:
            actual_output_lens.append(0)
@@ -301,10 +301,10 @@ def calculate_metrics(
        median_itl_ms=np.median(itls or 0) * 1000,
        percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
                            for p in selected_percentiles],
-        mean_etel_ms=np.median(etels or 0) * 1000,
-        std_etel_ms=np.std(etels or 0) * 1000,
-        median_etel_ms=np.mean(etels or 0) * 1000,
-        percentiles_etel_ms=[(p, np.percentile(etels or 0, p) * 1000)
+        mean_e2el_ms=np.mean(e2els or 0) * 1000,
+        std_e2el_ms=np.std(e2els or 0) * 1000,
+        median_e2el_ms=np.median(e2els or 0) * 1000,
+        percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
                             for p in selected_percentiles],
    )
 
@@ -483,7 +483,7 @@ def process_one_metric(
    process_one_metric("tpot", "TPOT",
                       "Time per Output Token (excl. 1st token)")
    process_one_metric("itl", "ITL", "Inter-token Latency")
-    process_one_metric("etel", "ETEL", "End-to-end Latency")
+    process_one_metric("e2el", "E2EL", "End-to-end Latency")
 
    print("=" * 50)
 
@@ -810,7 +810,7 @@ def main(args: argparse.Namespace):
        default="ttft,tpot,itl",
        help="Comma-seperated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
-        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"etel\". "
+        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
        "Default value is \"ttft,tpot,itl\".")
    parser.add_argument(
        "--metric-percentiles",