
Commit 7c94e65

mchen644 committed Oct 25, 2024
2 parents: 48233e9 + 52946c0

Showing 9 changed files with 304 additions and 219 deletions.
.gitignore (2 changes: 1 addition, 1 deletion)
@@ -191,4 +191,4 @@ hip_compat.h
*.txt

tags
-benchmarks/1_serving_benchmark.sh
+*1_serving_benchmark.sh
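(The new pattern matches any file whose name ends in `1_serving_benchmark.sh`, wherever it sits in the tree, so renamed or copied variants of the benchmark script are presumably meant to be ignored as well, not just the one under `benchmarks/`.)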
benchmarks/1_serving_benchmark.sh (8 changes: 4 additions, 4 deletions)
@@ -47,8 +47,8 @@ max_serving_time=1200
# request_rates[1]=1.0
# request_rates[1]=2.0
# request_rates[2]=10.0
-# request_rates[3]=10.0
-request_rates[4]=20.0
+request_rates[4]=10.0
+request_rates[5]=20.0
# request_rates[5]=50.0
# request_rates[5]=30.0
# request_rates[5]=50.0
@@ -57,7 +57,7 @@ request_rates[4]=20.0
# request_rates=(2.0)
swap_out_partial_rates=(0.5)
waiting_iter_base=(0.1)
-gpu_devices=1
+gpu_devices=3
for i in {0..0}; do
for waiting_iter in "${waiting_iter_base[@]}"; do
for swap_out_partial_rate in "${swap_out_partial_rates[@]}"; do
@@ -68,7 +68,7 @@ for i in {0..0}; do
swap_policy=${element[1]}
# tmux new-session -s "api_server" -d bash start_server.sh $gpu_devices $model_name $swap_space $preemption_mode $policy $max_tokens $iter_theshold $max_num_seqs $swap_policy $swap_out_partial_rate $gpu_memory_utilization $waiting_iter

-CUDA_VISIBLE_DEVICES=$gpu_devices taskset -c 40-41 python3 -m vllm.entrypoints.openai.api_server \
+CUDA_VISIBLE_DEVICES=$gpu_devices taskset -c 21-22 python3 -m vllm.entrypoints.openai.api_server \
--model $model_name --swap-space $swap_space --preemption-mode $preemption_mode --scheduler-policy $policy \
--enable-chunked-prefill --max-num-batched-tokens $max_tokens --iter-threshold $iter_theshold --max-num-seqs $max_num_seqs --swap-out-tokens-policy $swap_policy --swap-out-partial-rate $swap_out_partial_rate --execution-budget $iter_theshold \
--tensor-parallel-size 1 --gpu-memory-utilization $gpu_memory_utilization --disable-sliding-window --waiting-iter-base $waiting_iter --disable-log-requests --max-serving-time $max_serving_time >api_server_${policy}_${swap_policy}.log 2>&1 &
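For context, the command above launches vLLM's OpenAI-compatible API server pinned to one GPU and two CPU cores. A minimal client sketch against that server might look like the following, assuming vLLM's default port 8000 and a placeholder model name (neither is fixed by this diff; `$model_name` is set elsewhere in the script):

```python
# Minimal client sketch for the server launched above.
# Assumptions: default port 8000; "facebook/opt-125m" is a placeholder for $model_name.
import json
import urllib.request

payload = {
    "model": "facebook/opt-125m",
    "prompt": "San Francisco is",
    "max_tokens": 16,
}
req = urllib.request.Request(
    "http://localhost:8000/v1/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    # The OpenAI-compatible endpoint returns completions under choices[].text.
    print(json.load(resp)["choices"][0]["text"])
```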
benchmarks/backend_request_func.py (3 changes: 0 additions, 3 deletions)
@@ -143,7 +143,6 @@ async def async_request_trt_llm(

data = json.loads(chunk)
output.generated_text += data["text_output"]
-print(data['text_output'])
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
@@ -361,7 +360,6 @@ async def async_request_openai_chat_completions(
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
-print(chunk_bytes)
continue

chunk = remove_prefix(chunk_bytes.decode("utf-8"),
@@ -392,7 +390,6 @@ async def async_request_openai_chat_completions(
output.success = True
output.latency = latency
else:
-print(response.reason)
output.error = response.reason or ""
output.success = False
except Exception:
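All three deletions in this file are debug prints inside the streaming-response handlers. As a simplified sketch (not the file's exact code), the pattern those handlers implement is: read each streamed chunk, skip keep-alive blanks, strip the SSE `data: ` framing, accumulate the decoded text, and stamp time-to-first-token once:

```python
# Simplified sketch of the streaming pattern above. `output` stands in for the
# benchmark's per-request result object (generated_text and ttft fields assumed).
import json
import time

def consume_chunk(chunk_bytes: bytes, output, start_time: float) -> None:
    chunk = chunk_bytes.strip()
    if not chunk:
        return  # empty keep-alive line between SSE events
    # Strip the "data: " SSE framing before parsing (str.removeprefix, Python 3.9+;
    # the file itself uses a remove_prefix helper).
    data = json.loads(chunk.decode("utf-8").removeprefix("data: "))
    output.generated_text += data["text_output"]  # TRT-LLM-style field, as above
    if output.ttft == 0.0:
        output.ttft = time.perf_counter() - start_time  # first token arrived
```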
benchmarks/benchmark_serving.py (2 changes: 0 additions, 2 deletions)
@@ -295,8 +295,6 @@ def calculate_metrics(
tpots = []
ttfts = []
latencies = []
print("length of the input_requests: ", len(input_requests))
print("length of the outputs: ", len(outputs))
for i in range(len(outputs)):
if outputs[i].success:
# We use the tokenizer to count the number of output tokens for all
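For orientation, `calculate_metrics` fills the `tpots`, `ttfts`, and `latencies` lists from the successful requests and then aggregates them. A minimal sketch of such an aggregation step follows; the exact statistics the benchmark reports are assumed here, since they are outside this hunk:

```python
# Sketch of aggregating the per-request lists filled in the loop above.
import numpy as np

def summarize(ttfts: list, tpots: list, latencies: list) -> dict:
    # Assumed report shape: means/medians/percentiles in milliseconds.
    return {
        "mean_ttft_ms": float(np.mean(ttfts)) * 1000,
        "median_ttft_ms": float(np.median(ttfts)) * 1000,
        "p99_ttft_ms": float(np.percentile(ttfts, 99)) * 1000,
        "mean_tpot_ms": float(np.mean(tpots)) * 1000,
        "p99_latency_ms": float(np.percentile(latencies, 99)) * 1000,
    }
```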
benchmarks/result/analysis/result_analysis.py (64 changes: 31 additions, 33 deletions)
@@ -1,6 +1,6 @@
import marimo

__generated_with = "0.9.6"
__generated_with = "0.7.12"
app = marimo.App(width="full")
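(Note that `__generated_with` moves backwards, from 0.9.6 to 0.7.12, which suggests the notebook was re-saved with an older marimo release; that would also account for the purely mechanical `return (x,)` to `return x,` changes in the cell returns throughout the rest of this file.)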


@@ -39,12 +39,12 @@ def __(mo):

@app.cell
def __(base_dir, os):
_date = "20241011"
_counters = [1447]
date = "20241024"
counters = [1522]
e2e_result_dir_names = [
-    os.path.join(base_dir, _date, str(counter)) for counter in _counters
+    os.path.join(base_dir, date, str(counter)) for counter in counters
]
-return (e2e_result_dir_names,)
+return counters, date, e2e_result_dir_names


@app.cell
@@ -96,7 +96,7 @@ def add_num_annotation(ax, rotation=0):
textcoords="offset points",
rotation=rotation,
)
-return (add_num_annotation,)
+return add_num_annotation,


@app.cell
@@ -106,7 +106,7 @@ def get_tp_ratio(df):
min_result = df["output_throughput"].min()
df["output_throughput"] = df["output_throughput"] / 1
return df
-return (get_tp_ratio,)
+return get_tp_ratio,


@app.cell
@@ -150,7 +150,7 @@ def e2e_result(
group_keys=False,
)
.apply(lambda row: get_tp_ratio(row))
.drop(columns=["swap_policies", "request_rates"])
.drop(columns=["swap_policies"])
.reset_index()
)
sns.set_style(style="whitegrid")
@@ -356,21 +356,16 @@ def get_metric_ratio(df):
var_name="Metric",
value_name="Value",
)
-_long_df = (
-    _long_df.groupby(
-        ["Metric", "request_rate"],
-        group_keys=False,
-    )
-    .apply(lambda row: get_metric_ratio(row))
-    .drop(columns=["Metric", "request_rate"])
-    .reset_index()
-)
print(_long_df)
+_long_df = _long_df.groupby(
+    ["Metric", "request_rate"],
+    group_keys=False,
+).apply(lambda row: get_metric_ratio(row))
_long_df[["metric_name", "metric_type"]] = _long_df["Metric"].apply(
lambda row: pd.Series(
[row.split("_", 2)[0].capitalize(), row.split("_", 2)[1].upper()]
)
)
print(_long_df["Ratio"].max())
# _long_df = _long_df[_long_df["metric_name"] == "P99"]
show_legend = True

@@ -422,7 +417,7 @@ def __(e2e_result_dfs, np, pd):
inplace=True,
)
request_level_result = pd.concat([request_level_result, _tmp_df], axis=0)
-return (request_level_result,)
+return request_level_result,


@app.cell
@@ -461,7 +456,7 @@ def get_p99_ratio(df):
plt.legend(title="")
plt.grid(alpha=0.3, linestyle="--")
plt.ylabel("P99 ITL")
-return (get_p99_ratio,)
+return get_p99_ratio,


@app.cell
@@ -500,7 +495,7 @@ def get_max_mean_ttft_ratio(df):
plt.legend(title="")
plt.grid(alpha=0.3, linestyle="--")
plt.ylabel("Median TTFT")
-return (get_max_mean_ttft_ratio,)
+return get_max_mean_ttft_ratio,


@app.cell(hide_code=True)
@@ -510,13 +505,11 @@ def __(mo):


@app.cell
-def __(base_dir, os):
-    _date = "results"
-    _counters = [512]
+def __(base_dir, counters, date, os):
execute_result_dir_names = [
-    os.path.join(base_dir, _date, str(counter)) for counter in _counters
+    os.path.join(base_dir, date, str(counter)) for counter in counters
]
-return (execute_result_dir_names,)
+return execute_result_dir_names,


@app.cell
@@ -642,7 +635,7 @@ def __(execute_result_dir_names, os, pd):
execute_result_dfs["FCFS"] = _detailed_result_df
elif "tfittradeoff" in _file:
execute_result_dfs["TFITTradeoff"] = _detailed_result_df
-return (execute_result_dfs,)
+return execute_result_dfs,


@app.cell
@@ -658,7 +651,7 @@ def __(base_dir, os):
detailed_result_dir_names = [
os.path.join(base_dir, _date, str(counter)) for counter in _counters
]
-return (detailed_result_dir_names,)
+return detailed_result_dir_names,


@app.cell
@@ -723,7 +716,7 @@ def __(detailed_result_dfs, plt, sns):
plt.tight_layout(pad=0, w_pad=0.1, h_pad=0.1)
plt.savefig("100_qps.pdf")
plt.show()
-return (jointplot_ax,)
+return jointplot_ax,


@app.cell
@@ -822,7 +815,7 @@ def plot_perf_gpu_resource_line(source_df, perf_name, window_size=5):
perf_per_tier.set_yscale("log")
gpu_resource_per_iter.set_yscale("log")
plt.show()
-return (plot_perf_gpu_resource_line,)
+return plot_perf_gpu_resource_line,


@app.cell
@@ -880,7 +873,7 @@ def plot_perf_gpu_resource_heatmap(source_df, perf_name, bucket_size=0.05):
plot_perf_gpu_resource_heatmap(
request_rate_gpu_resource, "throughput iter", 0.1
)
-return (plot_perf_gpu_resource_heatmap,)
+return plot_perf_gpu_resource_heatmap,


@app.cell(disabled=True, hide_code=True)
@@ -909,7 +902,7 @@ def __():


@app.cell
-def __(add_num_annotation, detailed_result_dfs, plt, sns):
+def __(detailed_result_dfs, plt, sns):
detailed_mean_result = (
detailed_result_dfs.groupby(["schedule_policy"])
.max()
Expand All @@ -934,4 +927,9 @@ def __(add_num_annotation, detailed_result_dfs, plt, sns):
x="metric",
)
# plt.yscale("log")
plt.legend(title="")
plt.legend(title="")
return ax, detailed_mean_result


if __name__ == "__main__":
app.run()
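(The `if __name__ == "__main__": app.run()` entry point is marimo's standard generated file layout: the notebook can be executed directly as a script with `python result_analysis.py`, or opened interactively with `marimo edit result_analysis.py`.)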
[4 more changed files in this commit are not shown above]
