From fe8e499c45edfe224984ea56caed95380a4b0795 Mon Sep 17 00:00:00 2001
From: David Yastremsky
Date: Mon, 1 May 2023 17:48:29 -0700
Subject: [PATCH 1/3] Add testing for batcher init failure, add wait for status check

---
 qa/L0_batch_custom/batch_custom_test.py     | 17 +++++++--
 qa/L0_batch_custom/test.sh                  | 36 +++++++++++++++++--
 qa/L0_optional_input/optional_input_test.py | 17 +++++++--
 .../trt_shape_tensor_test.py                | 17 +++++++--
 4 files changed, 79 insertions(+), 8 deletions(-)

diff --git a/qa/L0_batch_custom/batch_custom_test.py b/qa/L0_batch_custom/batch_custom_test.py
index 5f77fd3975..4250908c91 100644
--- a/qa/L0_batch_custom/batch_custom_test.py
+++ b/qa/L0_batch_custom/batch_custom_test.py
@@ -122,8 +122,21 @@ def check_response(self,
 
     def check_status(self, model_name, batch_exec, request_cnt, infer_cnt,
                      exec_count):
-        stats = self.triton_client_.get_inference_statistics(model_name, "1")
-        self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats")
+        # There is a time window between when responses are returned and statistics are updated.
+        # To prevent intermittent test failure during that window, wait up to 10 seconds for the
+        # inference statistics to be ready.
+        num_tries = 10
+        for i in range(num_tries):
+            stats = self.triton_client_.get_inference_statistics(
+                model_name, "1")
+            self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats")
+            actual_exec_cnt = stats.model_stats[0].execution_count
+            if actual_exec_cnt == exec_count:
+                break
+            print("WARNING: expect {} executions, got {} (attempt {})".format(
+                exec_count, actual_exec_cnt, i))
+            time.sleep(1)
+
         self.assertEqual(stats.model_stats[0].name, model_name,
                          "expect model stats for model {}".format(model_name))
         self.assertEqual(
diff --git a/qa/L0_batch_custom/test.sh b/qa/L0_batch_custom/test.sh
index a420ec9d9d..d207c97fde 100755
--- a/qa/L0_batch_custom/test.sh
+++ b/qa/L0_batch_custom/test.sh
@@ -102,7 +102,7 @@ cp -r backend/examples/batching_strategies/single_batching/build/libtriton_singl
 # Run a test to validate the single batching strategy example.
 # Then, run tests to validate the volume batching example being passed in via the backend dir, model dir, version dir, and model config.
 BACKEND_DIR="/opt/tritonserver/backends/onnxruntime"
-MODEL_DIR="models/$MODEL_NAME/"
+MODEL_DIR="models/$MODEL_NAME"
 VERSION_DIR="$MODEL_DIR/1/"
 
 test_types=('single_batching_backend' 'backend_directory' 'model_directory' 'version_directory' 'model_config')
@@ -151,10 +151,42 @@ for i in "${!test_setups[@]}"; do
     wait $SERVER_PID
 done
 
+# Test ModelBatchInitialize failure
+FILE_PATH="backend/examples/batching_strategies/volume_batching/src/volume_batching.cc"
+OLD_STRING="\*batcher = reinterpret_cast("
+NEW_STRING="return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_NOT_FOUND,\"Failure test case\");"
+
+sed -i "s/${OLD_STRING}/${NEW_STRING}/g" ${FILE_PATH}
+sed -i "s/new unsigned int(max_volume_bytes));//g" ${FILE_PATH}
+
+(cd backend/examples/batching_strategies/volume_batching &&
+    cd build &&
+    cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \
+        -DTRITON_CORE_REPO_TAG=$TRITON_CORE_REPO_TAG .. &&
+    make -j4 install)
+
+cp -r backend/examples/batching_strategies/volume_batching/build/libtriton_volumebatching.so models/${MODEL_NAME}/libtriton_volumebatching.so
+
+SERVER_LOG=${SERVER_LOG_BASE}_batching_init_failure
+
+run_server
+if [ "$SERVER_PID" != "0" ]; then
+    cat $SERVER_LOG
+    echo -e "\n***\n*** ModelBatchInit Error Test: unexpected successful server start $SERVER\n***"
+    kill_server
+    RET=1
+else
+    if [ `grep -c "Failure test case" $SERVER_LOG` != "1" ] || [ `grep -c "Not found" $SERVER_LOG` != "1" ]; then
+        cat $SERVER_LOG
+        echo -e "\n***\n*** ModelBatchInit Error Test: failed to find 1 \"Failure test case\" message and 1 \"Not found\" error type"
+        RET=1
+    fi
+fi
+
+
 
 if [ $RET -eq 0 ]; then
     echo -e "\n***\n*** Test Passed\n***"
 else
-    cat $CLIENT_LOG
     echo -e "\n***\n*** Test FAILED\n***"
 fi
 
diff --git a/qa/L0_optional_input/optional_input_test.py b/qa/L0_optional_input/optional_input_test.py
index 2543a198d6..c813146ecd 100644
--- a/qa/L0_optional_input/optional_input_test.py
+++ b/qa/L0_optional_input/optional_input_test.py
@@ -124,8 +124,21 @@ def check_response(self, thresholds, provided_inputs=("INPUT0", "INPUT1")):
         self.add_deferred_exception(ex)
 
     def check_status(self, model_name, batch_exec, request_cnt, infer_cnt):
-        stats = self.triton_client_.get_inference_statistics(model_name, "1")
-        self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats")
+        # There is a time window between when responses are returned and statistics are updated.
+        # To prevent intermittent test failure during that window, wait up to 10 seconds for the
+        # inference statistics to be ready.
+        num_tries = 10
+        for i in range(num_tries):
+            stats = self.triton_client_.get_inference_statistics(
+                model_name, "1")
+            self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats")
+            actual_exec_cnt = stats.model_stats[0].execution_count
+            if actual_exec_cnt == sum(batch_exec.values()):
+                break
+            print("WARNING: expect {} executions, got {} (attempt {})".format(
+                sum(batch_exec.values()), actual_exec_cnt, i))
+            time.sleep(1)
+
         self.assertEqual(stats.model_stats[0].name, model_name,
                          "expect model stats for model {}".format(model_name))
         self.assertEqual(
diff --git a/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py b/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py
index 8e87b8cc40..14609dbb94 100644
--- a/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py
+++ b/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py
@@ -130,8 +130,21 @@ def check_setup(self, model_name):
                          _max_queue_delay_ms * 1000)  # 10 secs
 
     def check_status(self, model_name, batch_exec, exec_cnt, infer_cnt):
-        stats = self.triton_client_.get_inference_statistics(model_name, "1")
-        self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats")
+        # There is a time window between when responses are returned and statistics are updated.
+        # To prevent intermittent test failure during that window, wait up to 10 seconds for the
+        # inference statistics to be ready.
+ num_tries = 10 + for i in range(num_tries): + stats = self.triton_client_.get_inference_statistics( + model_name, "1") + self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") + actual_exec_cnt = stats.model_stats[0].execution_count + if actual_exec_cnt == exec_cnt: + break + print("WARNING: expect {} executions, got {} (attempt {})".format( + exec_cnt, actual_exec_cnt, i)) + time.sleep(1) + self.assertEqual(stats.model_stats[0].name, model_name, "expect model stats for model {}".format(model_name)) self.assertEqual( From a2e2fc80dd59728589b14f0ee479eb574c5f94f8 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Mon, 1 May 2023 17:49:22 -0700 Subject: [PATCH 2/3] Formatting --- qa/L0_batch_custom/batch_custom_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/L0_batch_custom/batch_custom_test.py b/qa/L0_batch_custom/batch_custom_test.py index 4250908c91..3fb74cf25d 100644 --- a/qa/L0_batch_custom/batch_custom_test.py +++ b/qa/L0_batch_custom/batch_custom_test.py @@ -136,7 +136,7 @@ def check_status(self, model_name, batch_exec, request_cnt, infer_cnt, print("WARNING: expect {} executions, got {} (attempt {})".format( exec_count, actual_exec_cnt, i)) time.sleep(1) - + self.assertEqual(stats.model_stats[0].name, model_name, "expect model stats for model {}".format(model_name)) self.assertEqual( From 8c283e7ed44f3bcc44ae897b48045e4bdd3db7f6 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Tue, 2 May 2023 08:01:51 -0700 Subject: [PATCH 3/3] Change search string --- qa/L0_batch_custom/test.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/qa/L0_batch_custom/test.sh b/qa/L0_batch_custom/test.sh index d207c97fde..ca4b0e4b24 100755 --- a/qa/L0_batch_custom/test.sh +++ b/qa/L0_batch_custom/test.sh @@ -153,11 +153,10 @@ done # Test ModelBatchInitialize failure FILE_PATH="backend/examples/batching_strategies/volume_batching/src/volume_batching.cc" -OLD_STRING="\*batcher = reinterpret_cast(" +OLD_STRING="\/\/ Batcher will point to an unsigned integer representing the maximum" NEW_STRING="return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_NOT_FOUND,\"Failure test case\");" sed -i "s/${OLD_STRING}/${NEW_STRING}/g" ${FILE_PATH} -sed -i "s/new unsigned int(max_volume_bytes));//g" ${FILE_PATH} (cd backend/examples/batching_strategies/volume_batching && cd build && @@ -176,9 +175,9 @@ if [ "$SERVER_PID" != "0" ]; then kill_server RET=1 else - if [ `grep -c "Failure test case" $SERVER_LOG` != "1" ] || [ `grep -c "Not found" $SERVER_LOG` != "1" ]; then + if [ `grep -c "Failure test case" $SERVER_LOG` -lt 1 ] || [ `grep -c "Not found" $SERVER_LOG` -lt 1 ]; then cat $SERVER_LOG - echo -e "\n***\n*** ModelBatchInit Error Test: failed to find 1 \"Failure test case\" message and 1 \"Not found\" error type" + echo -e "\n***\n*** ModelBatchInit Error Test: failed to find \"Failure test case\" message and/or \"Not found\" error type" RET=1 fi fi
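Below is a minimal, standalone sketch of the retry pattern that the three check_status() changes above share, for readers who want to apply it outside these tests. The helper name, the server URL, and the model name are illustrative assumptions, not part of this patch; it assumes the tritonclient gRPC package is installed and a Triton server is reachable.

import time

import tritonclient.grpc as grpcclient


def wait_for_exec_count(client, model_name, expected_exec_cnt, num_tries=10):
    """Poll inference statistics until execution_count matches.

    Statistics are updated shortly after responses are returned, so poll up
    to num_tries times, one second apart, before giving up. Returns the last
    statistics response so the caller can run further assertions on it.
    """
    stats = None
    for i in range(num_tries):
        stats = client.get_inference_statistics(model_name, "1")
        actual_exec_cnt = stats.model_stats[0].execution_count
        if actual_exec_cnt == expected_exec_cnt:
            break
        print("WARNING: expect {} executions, got {} (attempt {})".format(
            expected_exec_cnt, actual_exec_cnt, i))
        time.sleep(1)
    return stats


# Example usage, assuming a local server with a model named "simple" loaded:
# client = grpcclient.InferenceServerClient("localhost:8001")
# stats = wait_for_exec_count(client, "simple", expected_exec_cnt=2)

Returning the last response rather than asserting inside the helper keeps the polling loop reusable: each test can continue with its own model-specific checks (batch statistics, request counts) on the final snapshot, exactly as check_status() does after its loop.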