From 0b2a75b274e45c7a510bfdae9071a97a69e75618 Mon Sep 17 00:00:00 2001
From: Yifan Li <109183385+yf711@users.noreply.github.com>
Date: Fri, 15 Mar 2024 23:41:21 +0900
Subject: [PATCH] [EP Perf] Add concurrency test (#19804)

### Description
* Add a concurrency test to the EP Perf CI panel (implemented with onnx_test_runner)
* Model: the FasterRCNN-10 model bundled in the CI image
* The `-c` parameter is configurable from the CI panel when kicking off CI tasks
* Test inputs/outputs are replicated automatically to match the `-c` parameter
* By default, the model test runs for 100 iterations (adding ~2 min to the overall T4 CI task load)

### Motivation and Context
To monitor potential concurrency issues in ORT-TRT

---
 .../tools/tensorrt/perf/mem_test/run.sh       | 23 ++++++-
 .../perf/mem_test/run_mem_test_docker.sh      |  5 +-
 .../python/tools/tensorrt/perf/post.py        | 61 +++++++++++++++----
 onnxruntime/test/onnx/main.cc                 |  5 --
 ...linux-gpu-tensorrt-daily-perf-pipeline.yml | 17 ++++--
 5 files changed, 86 insertions(+), 25 deletions(-)

diff --git a/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh b/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh
index dd53fe6127462..2cfdd39bc96aa 100755
--- a/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh
+++ b/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh
@@ -4,13 +4,14 @@
 set -x
 
-while getopts p:o:l:s: parameter
+while getopts p:o:l:s:c: parameter
 do case "${parameter}" in
 p) WORKSPACE=${OPTARG};;
 o) ORT_BINARY_PATH=${OPTARG};;
 l) BUILD_ORT_LATEST=${OPTARG};;
 s) ORT_SOURCE=${OPTARG};;
+c) CONCURRENCY=${OPTARG};;
 esac
 done
 
@@ -104,6 +105,26 @@ fi
 
 mv valgrind.log result
 
+# Concurrency Test
+FRCNN_FOLDER="/data/ep-perf-models/onnx-zoo-models/FasterRCNN-10/"
+
+mkdir FasterRCNN-10/
+cp -r ${FRCNN_FOLDER}/test_data_set_0 ${FRCNN_FOLDER}/faster_rcnn_R_50_FPN_1x.onnx ./FasterRCNN-10/
+
+# replicate test inputs
+for (( i=1; i<${CONCURRENCY}; i++ )); do
+    cp -r ./FasterRCNN-10/test_data_set_0 ./FasterRCNN-10/test_data_set_${i}
+done
+
+./onnx_test_runner -e tensorrt -c ${CONCURRENCY} -r 100 ./FasterRCNN-10/ > concurrency_test.log 2>&1
+mv concurrency_test.log result
+
 # Run AddressSanitizer
 ASAN_OPTIONS=${ASAN_OPTIONS} ./onnx_memtest
 
diff --git a/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh b/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh
index 4e94c63ee6c25..a355e4cf5d365 100755
--- a/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh
+++ b/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh
@@ -3,13 +3,14 @@
 set -x
 
 # Parse Arguments
-while getopts w:d:p:l: parameter
+while getopts w:d:p:l:c: parameter
 do case "${parameter}" in
 w) WORKSPACE=${OPTARG};; # workspace folder of onnxruntime
 d) DOCKER_IMAGE=${OPTARG};; # docker image:"trt-ep-mem-test" docker image is already pre-built on perf machine
 p) MEM_TEST_DIR=${OPTARG};; # mem test dir
 l) BUILD_ORT_LATEST=${OPTARG};; # whether to build latest ORT
+c) CONCURRENCY=${OPTARG};; # number of concurrent session runs for the concurrency test
 esac
 done
 
@@ -24,4 +25,4 @@ then
   BUILD_ORT_LATEST="true"
 fi
 
-docker run --rm --gpus all -v $MEM_TEST_DIR:$DOCKER_MEM_TEST_DIR -v /data/ep-perf-models:/data/ep-perf-models $DOCKER_IMAGE /bin/bash $DOCKER_MEM_TEST_DIR'run.sh' -p $DOCKER_MEM_TEST_DIR -o $DOCKER_ORT_LIBS -s $DOCKER_ORT_SOURCE -l $BUILD_ORT_LATEST
+docker run --rm --gpus all -v $MEM_TEST_DIR:$DOCKER_MEM_TEST_DIR -v /data/ep-perf-models:/data/ep-perf-models $DOCKER_IMAGE /bin/bash $DOCKER_MEM_TEST_DIR'run.sh' -p $DOCKER_MEM_TEST_DIR -o $DOCKER_ORT_LIBS -s $DOCKER_ORT_SOURCE -l $BUILD_ORT_LATEST -c $CONCURRENCY
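A note on the replication step in `run.sh` above: `test_data_set_0` is cloned once per concurrent session, so each of the `-c` session runs gets its own input/output set. The same logic as a minimal standalone Python sketch (the `FasterRCNN-10/` layout is assumed to match the CI image, and `replicate_test_inputs` is a hypothetical helper, not part of this patch):

```python
import shutil
from pathlib import Path

def replicate_test_inputs(model_dir: str, concurrency: int) -> None:
    """Clone test_data_set_0 into test_data_set_1..test_data_set_{N-1}
    so that each concurrent session run has its own test data set."""
    base = Path(model_dir) / "test_data_set_0"
    for i in range(1, concurrency):
        target = Path(model_dir) / f"test_data_set_{i}"
        if not target.exists():
            shutil.copytree(base, target)

replicate_test_inputs("./FasterRCNN-10", concurrency=2)
```

Copying rather than symlinking keeps the per-session data sets independent, which matches what the `cp -r` loop in the script does.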
diff --git a/onnxruntime/python/tools/tensorrt/perf/post.py b/onnxruntime/python/tools/tensorrt/perf/post.py
index 363fa3a96d283..df389ad572596 100644
--- a/onnxruntime/python/tools/tensorrt/perf/post.py
+++ b/onnxruntime/python/tools/tensorrt/perf/post.py
@@ -3,6 +3,7 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 import argparse
+import csv
 import datetime
 import os
 import sys
@@ -419,10 +420,11 @@ def main():
     upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0)
 
     try:
+        # Load EP Perf test results from /result
         result_file = args.report_folder
-
-        folders = os.listdir(result_file)
-        os.chdir(result_file)
+        result_perf_test_path = os.path.join(result_file, "result")
+        folders = os.listdir(result_perf_test_path)
+        os.chdir(result_perf_test_path)
 
         tables = [
             fail_name,
@@ -445,13 +447,13 @@
         for model_group in folders:
             os.chdir(model_group)
             csv_filenames = os.listdir()
-            for csv in csv_filenames:
-                table = pd.read_csv(csv)
-                if session_name in csv:
+            for csv_file in csv_filenames:
+                table = pd.read_csv(csv_file)
+                if session_name in csv_file:
                     table_results[session_name] = pd.concat(
                         [table_results[session_name], get_session(table, model_group)], ignore_index=True
                     )
-                elif specs_name in csv:
+                elif specs_name in csv_file:
                     table_results[specs_name] = pd.concat(
                         [
                             table_results[specs_name],
@@ -459,12 +461,12 @@
                         ],
                         ignore_index=True,
                     )
-                elif fail_name in csv:
+                elif fail_name in csv_file:
                     table_results[fail_name] = pd.concat(
                         [table_results[fail_name], get_failures(table, model_group)],
                         ignore_index=True,
                     )
-                elif latency_name in csv:
+                elif latency_name in csv_file:
                     table_results[memory_name] = pd.concat(
                         [table_results[memory_name], get_memory(table, model_group)],
                         ignore_index=True,
@@ -474,11 +476,11 @@
                         [table_results[latency_name], get_latency(table, model_group)],
                         ignore_index=True,
                     )
-                elif status_name in csv:
+                elif status_name in csv_file:
                     table_results[status_name] = pd.concat(
                         [table_results[status_name], get_status(table, model_group)], ignore_index=True
                     )
-                elif op_metrics_name in csv:
+                elif op_metrics_name in csv_file:
                     table = table.assign(Group=model_group)
                     table_results[op_metrics_name] = pd.concat(
                         [table_results[op_metrics_name], table], ignore_index=True
@@ -512,6 +514,43 @@
                 args.commit_datetime,
             )
 
+        # Load concurrency test results
+        result_mem_test_path = os.path.join(result_file, "result_mem_test")
+        os.chdir(result_mem_test_path)
+        log_path = "concurrency_test.log"
+        if os.path.exists(log_path):
+            print("Generating concurrency test report")
+            with open(log_path) as log_file:
+                log_content = log_file.read()
+
+            failed_cases_section = log_content.split("Failed Test Cases:")[1]
+
+            # passed = 1 if no failed test cases
+            if failed_cases_section.strip() == "":
+                passed = 1
+            else:
+                passed = 0
+
+            csv_path = "concurrency_test.csv"
+            with open(csv_path, "w", newline="") as csv_file:
+                csv_writer = csv.writer(csv_file)
+                csv_writer.writerow(["Passed", "Log"])
+                csv_writer.writerow([passed, log_content])
+
+            db_table_name = "ep_concurrencytest_record"
+            table = pd.read_csv(csv_path)
+            write_table(
+                ingest_client,
+                args.database,
+                table,
+                db_table_name,
+                upload_time,
+                identifier,
+                args.branch,
+                args.commit_hash,
+                args.commit_datetime,
+            )
+
     except BaseException as e:
         print(str(e))
         sys.exit(1)
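The pass/fail decision added to `post.py` above hinges on the `Failed Test Cases:` section that onnx_test_runner prints at the end of its log: an empty section counts as a pass. A minimal standalone sketch of that check (the function name is illustrative, not part of the patch):

```python
def concurrency_test_passed(log_content: str) -> int:
    """Return 1 if the 'Failed Test Cases:' section of the log is empty.

    Mirrors the check added to post.py; like that code, it raises
    IndexError if the marker is missing, which post.py's enclosing
    try/except turns into a non-zero exit.
    """
    failed_cases_section = log_content.split("Failed Test Cases:")[1]
    return 1 if failed_cases_section.strip() == "" else 0

# An empty failure section counts as a pass:
assert concurrency_test_passed("...summary...\nFailed Test Cases:\n") == 1
assert concurrency_test_passed("Failed Test Cases:\nFasterRCNN-10\n") == 0
```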
diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc
index 5a2104ffeb0da..9c2c24e3c337d 100644
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@@ -341,11 +341,6 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
     logging_level = ORT_LOGGING_LEVEL_VERBOSE;
   }
 
-  if (concurrent_session_runs > 1 && repeat_count > 1) {
-    fprintf(stderr, "when you use '-r [repeat]', please set '-c' to 1\n");
-    usage();
-    return -1;
-  }
   argc -= optind;
   argv += optind;
   if (argc < 1) {
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
index 15f558e6f9ef0..af2d722a6b90c 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
@@ -28,10 +28,15 @@ parameters:
   - "partner-models"
 
 - name: MemTest
-  displayName: Run Memory Test
+  displayName: Run Memory Test and Concurrency Test
   type: boolean
   default: true
 
+- name: ConcurrencyTest
+  displayName: Number of concurrent model tests to invoke simultaneously
+  type: string
+  default: 2
+
 - name: TrtEPOptions
   displayName: TensorRT EP options
   type: object
@@ -107,8 +112,8 @@ jobs:
         workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build'
 
     - ${{ if eq(parameters.MemTest, true) }}:
-      - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/ -w /code/ -l false'
-        displayName: 'Run Memory Test'
+      - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/ -w /code/ -l false -c ${{ parameters.ConcurrencyTest }}'
+        displayName: 'Run Memory Test and Concurrency Test'
         workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/'
 
     - ${{ each option in parameters.ModelGroups }}:
@@ -152,7 +157,7 @@ jobs:
       displayName: 'Check and Install Azure CLI'
 
     - task: AzureCLI@2
-      displayName: 'Azure CLI Post to Dashboard'
+      displayName: 'Post EP Perf Results to Dashboard'
      inputs:
        azureSubscription: AIInfraBuildOnnxRuntimeOSS
        scriptLocation: inlineScript
@@ -160,8 +165,8 @@ jobs:
        inlineScript: |
          short_hash=$(git rev-parse --short HEAD)  &&
          commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd)  &&
-          python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact/result -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
-
+          python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
+
     - template: templates/component-governance-component-detection-steps.yml
       parameters :
        condition : 'succeeded'
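For context, the `main.cc` change removes the guard that rejected `-c` together with `-r`, which is what allows the CI to run the 100-iteration test across multiple concurrent sessions. A sketch of reproducing the CI invocation locally (the paths and concurrency value are illustrative; it assumes `onnx_test_runner` and the replicated `FasterRCNN-10/` directory from `run.sh` are in the working directory):

```python
import subprocess

concurrency = 2  # CI default for the ConcurrencyTest parameter
result = subprocess.run(
    [
        "./onnx_test_runner",
        "-e", "tensorrt",        # TensorRT execution provider
        "-c", str(concurrency),  # concurrent session runs (now combinable with -r)
        "-r", "100",             # repeat each model test 100 times
        "./FasterRCNN-10/",
    ],
    capture_output=True,
    text=True,
)
# post.py treats an empty "Failed Test Cases:" section in this output as a pass.
print(result.stdout)
```

In CI the same output is redirected to `concurrency_test.log` and moved into `result_mem_test`, where `post.py` picks it up for the dashboard upload.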