Skip to content

Commit

Permalink
[EP Perf] Add concurrency test (#19804)
Browse files Browse the repository at this point in the history
### Description
<!-- Describe your changes. -->
* Add a concurrency test to the EP Perf CI panel (implemented with onnx_test_runner)
  * Model: the FasterRCNN-10 model within the CI image
  * The `-c` param is configurable via the CI panel when kicking off CI tasks
  * Test inputs/outputs are auto-replicated according to the `-c` param
* By default, the model test is executed for 100 iterations (adding ~2 min
to the overall T4 CI task load)

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
To monitor potential concurrency issues in ORT-TRT.
  • Loading branch information
yf711 authored Mar 15, 2024
1 parent 42399df commit 0b2a75b
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 25 deletions.
23 changes: 22 additions & 1 deletion onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@

set -x

while getopts p:o:l:s: parameter
while getopts p:o:l:s:c: parameter
do case "${parameter}"
in
p) WORKSPACE=${OPTARG};;
o) ORT_BINARY_PATH=${OPTARG};;
l) BUILD_ORT_LATEST=${OPTARG};;
s) ORT_SOURCE=${OPTARG};;
c) CONCURRENCY=${OPTARG};;
esac
done

Expand Down Expand Up @@ -104,6 +105,26 @@ fi

mv valgrind.log result

# Concurrency Test
# Runs onnx_test_runner against FasterRCNN-10 with the TensorRT EP, using
# ${CONCURRENCY} concurrent runs (-c) repeated 100 times (-r), and moves the
# captured log into the result folder.
FRCNN_FOLDER="/data/ep-perf-models/onnx-zoo-models/FasterRCNN-10/"

# Fall back to a single run when -c was not supplied, so the runner never
# receives an empty -c argument.
CONCURRENCY="${CONCURRENCY:-1}"

# -p: do not fail when the folder is left over from a previous run
mkdir -p FasterRCNN-10/
cp -r "${FRCNN_FOLDER}/test_data_set_0" "${FRCNN_FOLDER}/faster_rcnn_R_50_FPN_1x.onnx" ./FasterRCNN-10/

# replicate test inputs: one test_data_set_<i> folder per concurrent run
for (( i=1; i<CONCURRENCY; i++ )); do
    # Drop any stale replica first; cp -r into an existing directory would
    # nest test_data_set_0 inside it instead of refreshing its contents.
    rm -rf "./FasterRCNN-10/test_data_set_$i"
    cp -r "./FasterRCNN-10/test_data_set_0/" "./FasterRCNN-10/test_data_set_$i/"
done

# Run symbolic shape inference on the model in place before handing it to the runner
pip install onnx requests packaging
python "${ORT_SOURCE}/onnxruntime/python/tools/symbolic_shape_infer.py" \
    --input="./FasterRCNN-10/faster_rcnn_R_50_FPN_1x.onnx" \
    --output="./FasterRCNN-10/faster_rcnn_R_50_FPN_1x.onnx" \
    --auto_merge

# Capture both stdout and stderr so the full runner output ends up in the log
"${ORT_SOURCE}/build/Linux/Release/onnx_test_runner" -e tensorrt -c "${CONCURRENCY}" -r 100 ./FasterRCNN-10/ > concurrency_test.log 2>&1
mv concurrency_test.log result

# Run AddressSanitizer
ASAN_OPTIONS=${ASAN_OPTIONS} ./onnx_memtest

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
set -x

# Parse Arguments
while getopts w:d:p:l: parameter
while getopts w:d:p:l:c: parameter
do case "${parameter}"
in
w) WORKSPACE=${OPTARG};; # workspace folder of onnxruntime
d) DOCKER_IMAGE=${OPTARG};; # docker image:"trt-ep-mem-test" docker image is already pre-built on perf machine
p) MEM_TEST_DIR=${OPTARG};; # mem test dir
l) BUILD_ORT_LATEST=${OPTARG};; # whether to build latest ORT
c) CONCURRENCY=${OPTARG};;
esac
done

Expand All @@ -24,4 +25,4 @@ then
BUILD_ORT_LATEST="true"
fi

docker run --rm --gpus all -v $MEM_TEST_DIR:$DOCKER_MEM_TEST_DIR -v /data/ep-perf-models:/data/ep-perf-models $DOCKER_IMAGE /bin/bash $DOCKER_MEM_TEST_DIR'run.sh' -p $DOCKER_MEM_TEST_DIR -o $DOCKER_ORT_LIBS -s $DOCKER_ORT_SOURCE -l $BUILD_ORT_LATEST
docker run --rm --gpus all -v $MEM_TEST_DIR:$DOCKER_MEM_TEST_DIR -v /data/ep-perf-models:/data/ep-perf-models $DOCKER_IMAGE /bin/bash $DOCKER_MEM_TEST_DIR'run.sh' -p $DOCKER_MEM_TEST_DIR -o $DOCKER_ORT_LIBS -s $DOCKER_ORT_SOURCE -l $BUILD_ORT_LATEST -c $CONCURRENCY
61 changes: 50 additions & 11 deletions onnxruntime/python/tools/tensorrt/perf/post.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import argparse
import csv
import datetime
import os
import sys
Expand Down Expand Up @@ -419,10 +420,11 @@ def main():
upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0)

try:
# Load EP Perf test results from /result
result_file = args.report_folder

folders = os.listdir(result_file)
os.chdir(result_file)
result_perf_test_path = os.path.join(result_file, "result")
folders = os.listdir(result_perf_test_path)
os.chdir(result_perf_test_path)

tables = [
fail_name,
Expand All @@ -445,26 +447,26 @@ def main():
for model_group in folders:
os.chdir(model_group)
csv_filenames = os.listdir()
for csv in csv_filenames:
table = pd.read_csv(csv)
if session_name in csv:
for csv_file in csv_filenames:
table = pd.read_csv(csv_file)
if session_name in csv_file:
table_results[session_name] = pd.concat(
[table_results[session_name], get_session(table, model_group)], ignore_index=True
)
elif specs_name in csv:
elif specs_name in csv_file:
table_results[specs_name] = pd.concat(
[
table_results[specs_name],
get_specs(table, args.branch, args.commit_hash, args.commit_datetime),
],
ignore_index=True,
)
elif fail_name in csv:
elif fail_name in csv_file:
table_results[fail_name] = pd.concat(
[table_results[fail_name], get_failures(table, model_group)],
ignore_index=True,
)
elif latency_name in csv:
elif latency_name in csv_file:
table_results[memory_name] = pd.concat(
[table_results[memory_name], get_memory(table, model_group)],
ignore_index=True,
Expand All @@ -474,11 +476,11 @@ def main():
[table_results[latency_name], get_latency(table, model_group)],
ignore_index=True,
)
elif status_name in csv:
elif status_name in csv_file:
table_results[status_name] = pd.concat(
[table_results[status_name], get_status(table, model_group)], ignore_index=True
)
elif op_metrics_name in csv:
elif op_metrics_name in csv_file:
table = table.assign(Group=model_group)
table_results[op_metrics_name] = pd.concat(
[table_results[op_metrics_name], table], ignore_index=True
Expand Down Expand Up @@ -512,6 +514,43 @@ def main():
args.commit_datetime,
)

# Load concurrency test results.
# The result_mem_test folder may be missing (presumably when the memory /
# concurrency test step did not run -- confirm against the pipeline), so only
# enter it when it exists instead of letting os.chdir abort the whole upload.
result_mem_test_path = os.path.join(result_file, "result_mem_test")
log_path = "concurrency_test.log"
mem_test_dir_found = os.path.isdir(result_mem_test_path)
if mem_test_dir_found:
    os.chdir(result_mem_test_path)

if mem_test_dir_found and os.path.exists(log_path):
    print("Generating concurrency test report")
    with open(log_path) as log_file:
        log_content = log_file.read()

    # The text following the "Failed Test Cases:" marker lists the failures.
    # If the marker is absent the log is incomplete (e.g. the runner stopped
    # before printing its summary); record that as a failed run rather than
    # raising IndexError.
    log_sections = log_content.split("Failed Test Cases:")
    if len(log_sections) < 2:
        print("'Failed Test Cases:' section not found in concurrency test log; marking as failed")
        passed = 0
    elif log_sections[1].strip() == "":
        # passed = 1 if no failed test cases
        passed = 1
    else:
        passed = 0

    # Round-trip through a CSV file so the table is loaded the same way as
    # the other result tables (via pd.read_csv).
    csv_path = "concurrency_test.csv"
    with open(csv_path, "w", newline="") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["Passed", "Log"])
        csv_writer.writerow([passed, log_content])

    db_table_name = "ep_concurrencytest_record"
    table = pd.read_csv(csv_path)
    write_table(
        ingest_client,
        args.database,
        table,
        db_table_name,
        upload_time,
        identifier,
        args.branch,
        args.commit_hash,
        args.commit_datetime,
    )

except BaseException as e:
print(str(e))
sys.exit(1)
Expand Down
5 changes: 0 additions & 5 deletions onnxruntime/test/onnx/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -341,11 +341,6 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
logging_level = ORT_LOGGING_LEVEL_VERBOSE;
}

if (concurrent_session_runs > 1 && repeat_count > 1) {
fprintf(stderr, "when you use '-r [repeat]', please set '-c' to 1\n");
usage();
return -1;
}
argc -= optind;
argv += optind;
if (argc < 1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,15 @@ parameters:
- "partner-models"

- name: MemTest
displayName: Run Memory Test
displayName: Run Memory Test and Concurrency Test
type: boolean
default: true

- name: ConcurrencyTest
displayName: Specifies the number of concurrency model test to invoke simultaneously
type: string
default: 2

- name: TrtEPOptions
displayName: TensorRT EP options
type: object
Expand Down Expand Up @@ -107,8 +112,8 @@ jobs:
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build'

- ${{ if eq(parameters.MemTest, true) }}:
- script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/ -w /code/ -l false'
displayName: 'Run Memory Test'
- script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/ -w /code/ -l false -c ${{ parameters.ConcurrencyTest }}'
displayName: 'Run Memory Test and Concurrency Test'
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/'

- ${{ each option in parameters.ModelGroups }}:
Expand Down Expand Up @@ -152,16 +157,16 @@ jobs:
displayName: 'Check and Install Azure CLI'
- task: AzureCLI@2
displayName: 'Azure CLI Post to Dashboard'
displayName: 'Post EP Perf Results to Dashboard'
inputs:
azureSubscription: AIInfraBuildOnnxRuntimeOSS
scriptLocation: inlineScript
scriptType: bash
inlineScript: |
short_hash=$(git rev-parse --short HEAD) &&
commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) &&
python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact/result -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
- template: templates/component-governance-component-detection-steps.yml
parameters :
condition : 'succeeded'
Expand Down

0 comments on commit 0b2a75b

Please sign in to comment.