
Commit 7c94e65

mchen644 committed Oct 25, 2024
2 parents: 48233e9 + 52946c0

Showing 9 changed files with 304 additions and 219 deletions.
.gitignore (2 changes: 1 addition, 1 deletion)
@@ -191,4 +191,4 @@ hip_compat.h
*.txt

tags
-benchmarks/1_serving_benchmark.sh
+*1_serving_benchmark.sh
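(The new pattern matches any file whose name ends in `1_serving_benchmark.sh`, wherever it sits in the tree, so renamed or copied variants of the benchmark script are presumably meant to be ignored as well, not just the one under `benchmarks/`.)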
benchmarks/1_serving_benchmark.sh (8 changes: 4 additions, 4 deletions)
@@ -47,8 +47,8 @@ max_serving_time=1200
# request_rates[1]=1.0
# request_rates[1]=2.0
# request_rates[2]=10.0
-# request_rates[3]=10.0
-request_rates[4]=20.0
+request_rates[4]=10.0
+request_rates[5]=20.0
# request_rates[5]=50.0
# request_rates[5]=30.0
# request_rates[5]=50.0
@@ -57,7 +57,7 @@ request_rates[4]=20.0
# request_rates=(2.0)
swap_out_partial_rates=(0.5)
waiting_iter_base=(0.1)
-gpu_devices=1
+gpu_devices=3
for i in {0..0}; do
for waiting_iter in "${waiting_iter_base[@]}"; do
for swap_out_partial_rate in "${swap_out_partial_rates[@]}"; do
@@ -68,7 +68,7 @@ for i in {0..0}; do
swap_policy=${element[1]}
# tmux new-session -s "api_server" -d bash start_server.sh $gpu_devices $model_name $swap_space $preemption_mode $policy $max_tokens $iter_theshold $max_num_seqs $swap_policy $swap_out_partial_rate $gpu_memory_utilization $waiting_iter

-CUDA_VISIBLE_DEVICES=$gpu_devices taskset -c 40-41 python3 -m vllm.entrypoints.openai.api_server \
+CUDA_VISIBLE_DEVICES=$gpu_devices taskset -c 21-22 python3 -m vllm.entrypoints.openai.api_server \
--model $model_name --swap-space $swap_space --preemption-mode $preemption_mode --scheduler-policy $policy \
--enable-chunked-prefill --max-num-batched-tokens $max_tokens --iter-threshold $iter_theshold --max-num-seqs $max_num_seqs --swap-out-tokens-policy $swap_policy --swap-out-partial-rate $swap_out_partial_rate --execution-budget $iter_theshold \
--tensor-parallel-size 1 --gpu-memory-utilization $gpu_memory_utilization --disable-sliding-window --waiting-iter-base $waiting_iter --disable-log-requests --max-serving-time $max_serving_time >api_server_${policy}_${swap_policy}.log 2>&1 &
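For context, the command above launches vLLM's OpenAI-compatible API server pinned to one GPU and two CPU cores. A minimal client sketch against that server might look like the following, assuming vLLM's default port 8000 and a placeholder model name (neither is fixed by this diff; `$model_name` is set elsewhere in the script):

```python
# Minimal client sketch for the server launched above.
# Assumptions: default port 8000; "facebook/opt-125m" is a placeholder for $model_name.
import json
import urllib.request

payload = {
    "model": "facebook/opt-125m",
    "prompt": "San Francisco is",
    "max_tokens": 16,
}
req = urllib.request.Request(
    "http://localhost:8000/v1/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    # The OpenAI-compatible endpoint returns completions under choices[].text.
    print(json.load(resp)["choices"][0]["text"])
```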
benchmarks/backend_request_func.py (3 changes: 0 additions, 3 deletions)
@@ -143,7 +143,6 @@ async def async_request_trt_llm(

data = json.loads(chunk)
output.generated_text += data["text_output"]
-print(data['text_output'])
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
@@ -361,7 +360,6 @@ async def async_request_openai_chat_completions(
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
-print(chunk_bytes)
continue

chunk = remove_prefix(chunk_bytes.decode("utf-8"),
@@ -392,7 +390,6 @@ async def async_request_openai_chat_completions(
output.success = True
output.latency = latency
else:
-print(response.reason)
output.error = response.reason or ""
output.success = False
except Exception:
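All three deletions in this file are debug prints inside the streaming-response handlers. As a simplified sketch (not the file's exact code), the pattern those handlers implement is: read each streamed chunk, skip keep-alive blanks, strip the SSE `data: ` framing, accumulate the decoded text, and stamp time-to-first-token once:

```python
# Simplified sketch of the streaming pattern above. `output` stands in for the
# benchmark's per-request result object (generated_text and ttft fields assumed).
import json
import time

def consume_chunk(chunk_bytes: bytes, output, start_time: float) -> None:
    chunk = chunk_bytes.strip()
    if not chunk:
        return  # empty keep-alive line between SSE events
    # Strip the "data: " SSE framing before parsing (str.removeprefix, Python 3.9+;
    # the file itself uses a remove_prefix helper).
    data = json.loads(chunk.decode("utf-8").removeprefix("data: "))
    output.generated_text += data["text_output"]  # TRT-LLM-style field, as above
    if output.ttft == 0.0:
        output.ttft = time.perf_counter() - start_time  # first token arrived
```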
benchmarks/benchmark_serving.py (2 changes: 0 additions, 2 deletions)
@@ -295,8 +295,6 @@ def calculate_metrics(
tpots = []
ttfts = []
latencies = []
print("length of the input_requests: ", len(input_requests))
print("length of the outputs: ", len(outputs))
for i in range(len(outputs)):
if outputs[i].success:
# We use the tokenizer to count the number of output tokens for all
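For orientation, `calculate_metrics` fills the `tpots`, `ttfts`, and `latencies` lists from the successful requests and then aggregates them. A minimal sketch of such an aggregation step follows; the exact statistics the benchmark reports are assumed here, since they are outside this hunk:

```python
# Sketch of aggregating the per-request lists filled in the loop above.
import numpy as np

def summarize(ttfts: list, tpots: list, latencies: list) -> dict:
    # Assumed report shape: means/medians/percentiles in milliseconds.
    return {
        "mean_ttft_ms": float(np.mean(ttfts)) * 1000,
        "median_ttft_ms": float(np.median(ttfts)) * 1000,
        "p99_ttft_ms": float(np.percentile(ttfts, 99)) * 1000,
        "mean_tpot_ms": float(np.mean(tpots)) * 1000,
        "p99_latency_ms": float(np.percentile(latencies, 99)) * 1000,
    }
```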
benchmarks/result/analysis/result_analysis.py (64 changes: 31 additions, 33 deletions)
@@ -1,6 +1,6 @@
import marimo

__generated_with = "0.9.6"
__generated_with = "0.7.12"
app = marimo.App(width="full")
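(Note that `__generated_with` moves backwards, from 0.9.6 to 0.7.12, which suggests the notebook was re-saved with an older marimo release; that would also account for the purely mechanical `return (x,)` to `return x,` changes in the cell returns throughout the rest of this file.)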


@@ -39,12 +39,12 @@ def __(mo):

@app.cell
def __(base_dir, os):
_date = "20241011"
_counters = [1447]
date = "20241024"
counters = [1522]
e2e_result_dir_names = [
-    os.path.join(base_dir, _date, str(counter)) for counter in _counters
+    os.path.join(base_dir, date, str(counter)) for counter in counters
]
-return (e2e_result_dir_names,)
+return counters, date, e2e_result_dir_names


@app.cell
@@ -96,7 +96,7 @@ def add_num_annotation(ax, rotation=0):
textcoords="offset points",
rotation=rotation,
)
-return (add_num_annotation,)
+return add_num_annotation,


@app.cell
@@ -106,7 +106,7 @@ def get_tp_ratio(df):
min_result = df["output_throughput"].min()
df["output_throughput"] = df["output_throughput"] / 1
return df
-return (get_tp_ratio,)
+return get_tp_ratio,


@app.cell
@@ -150,7 +150,7 @@ def e2e_result(
group_keys=False,
)
.apply(lambda row: get_tp_ratio(row))
.drop(columns=["swap_policies", "request_rates"])
.drop(columns=["swap_policies"])
.reset_index()
)
sns.set_style(style="whitegrid")
@@ -356,21 +356,16 @@ def get_metric_ratio(df):
var_name="Metric",
value_name="Value",
)
-_long_df = (
-    _long_df.groupby(
-        ["Metric", "request_rate"],
-        group_keys=False,
-    )
-    .apply(lambda row: get_metric_ratio(row))
-    .drop(columns=["Metric", "request_rate"])
-    .reset_index()
-)
print(_long_df)
+_long_df = _long_df.groupby(
+    ["Metric", "request_rate"],
+    group_keys=False,
+).apply(lambda row: get_metric_ratio(row))
_long_df[["metric_name", "metric_type"]] = _long_df["Metric"].apply(
lambda row: pd.Series(
[row.split("_", 2)[0].capitalize(), row.split("_", 2)[1].upper()]
)
)
print(_long_df["Ratio"].max())
# _long_df = _long_df[_long_df["metric_name"] == "P99"]
show_legend = True

@@ -422,7 +417,7 @@ def __(e2e_result_dfs, np, pd):
inplace=True,
)
request_level_result = pd.concat([request_level_result, _tmp_df], axis=0)
-return (request_level_result,)
+return request_level_result,


@app.cell
@@ -461,7 +456,7 @@ def get_p99_ratio(df):
plt.legend(title="")
plt.grid(alpha=0.3, linestyle="--")
plt.ylabel("P99 ITL")
-return (get_p99_ratio,)
+return get_p99_ratio,


@app.cell
@@ -500,7 +495,7 @@ def get_max_mean_ttft_ratio(df):
plt.legend(title="")
plt.grid(alpha=0.3, linestyle="--")
plt.ylabel("Median TTFT")
-return (get_max_mean_ttft_ratio,)
+return get_max_mean_ttft_ratio,


@app.cell(hide_code=True)
@@ -510,13 +505,11 @@ def __(mo):


@app.cell
-def __(base_dir, os):
-    _date = "results"
-    _counters = [512]
+def __(base_dir, counters, date, os):
execute_result_dir_names = [
-    os.path.join(base_dir, _date, str(counter)) for counter in _counters
+    os.path.join(base_dir, date, str(counter)) for counter in counters
]
-return (execute_result_dir_names,)
+return execute_result_dir_names,


@app.cell
@@ -642,7 +635,7 @@ def __(execute_result_dir_names, os, pd):
execute_result_dfs["FCFS"] = _detailed_result_df
elif "tfittradeoff" in _file:
execute_result_dfs["TFITTradeoff"] = _detailed_result_df
-return (execute_result_dfs,)
+return execute_result_dfs,


@app.cell
@@ -658,7 +651,7 @@ def __(base_dir, os):
detailed_result_dir_names = [
os.path.join(base_dir, _date, str(counter)) for counter in _counters
]
-return (detailed_result_dir_names,)
+return detailed_result_dir_names,


@app.cell
@@ -723,7 +716,7 @@ def __(detailed_result_dfs, plt, sns):
plt.tight_layout(pad=0, w_pad=0.1, h_pad=0.1)
plt.savefig("100_qps.pdf")
plt.show()
-return (jointplot_ax,)
+return jointplot_ax,


@app.cell
@@ -822,7 +815,7 @@ def plot_perf_gpu_resource_line(source_df, perf_name, window_size=5):
perf_per_tier.set_yscale("log")
gpu_resource_per_iter.set_yscale("log")
plt.show()
-return (plot_perf_gpu_resource_line,)
+return plot_perf_gpu_resource_line,


@app.cell
@@ -880,7 +873,7 @@ def plot_perf_gpu_resource_heatmap(source_df, perf_name, bucket_size=0.05):
plot_perf_gpu_resource_heatmap(
request_rate_gpu_resource, "throughput iter", 0.1
)
-return (plot_perf_gpu_resource_heatmap,)
+return plot_perf_gpu_resource_heatmap,


@app.cell(disabled=True, hide_code=True)
@@ -909,7 +902,7 @@ def __():


@app.cell
-def __(add_num_annotation, detailed_result_dfs, plt, sns):
+def __(detailed_result_dfs, plt, sns):
detailed_mean_result = (
detailed_result_dfs.groupby(["schedule_policy"])
.max()
Expand All @@ -934,4 +927,9 @@ def __(add_num_annotation, detailed_result_dfs, plt, sns):
x="metric",
)
# plt.yscale("log")
plt.legend(title="")
plt.legend(title="")
return ax, detailed_mean_result


if __name__ == "__main__":
app.run()
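(The `if __name__ == "__main__": app.run()` entry point is marimo's standard generated file layout: the notebook can be executed directly as a script with `python result_analysis.py`, or opened interactively with `marimo edit result_analysis.py`.)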
[4 more changed files in this commit are not shown above]
