From b5c2e38e3cfdac19d089d1254286aa714cb2b7b7 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Wed, 25 Oct 2023 15:15:41 -0700 Subject: [PATCH] Test Python BLS with different sizes of CUDA memory pool (#6276) * Test with different sizes of CUDA memory pool * Check the server log for error message * Improve debugging * Fix syntax --- qa/L0_backend_python/bls/test.sh | 138 +++++++++++++++++-------------- qa/python_models/bls/model.py | 24 +++--- 2 files changed, 87 insertions(+), 75 deletions(-) diff --git a/qa/L0_backend_python/bls/test.sh b/qa/L0_backend_python/bls/test.sh index 3975ab8c75..f3e7f29d52 100755 --- a/qa/L0_backend_python/bls/test.sh +++ b/qa/L0_backend_python/bls/test.sh @@ -34,7 +34,6 @@ source ../../common/util.sh TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} SERVER=${TRITON_DIR}/bin/tritonserver BACKEND_DIR=${TRITON_DIR}/backends -SERVER_ARGS="--model-repository=`pwd`/models --backend-directory=${BACKEND_DIR} --log-verbose=1" RET=0 # This variable is used to print out the correct server log for each sub-test. @@ -103,96 +102,109 @@ cp -r ${DATADIR}/qa_model_repository/libtorch_nobatch_float32_float32_float32/ . sed -i 's/libtorch_nobatch_float32_float32_float32/libtorch_cpu/' models/libtorch_cpu/config.pbtxt && \ echo "instance_group [ { kind: KIND_CPU} ]" >> models/libtorch_cpu/config.pbtxt -for TRIAL in non_decoupled decoupled ; do - export BLS_KIND=$TRIAL - SERVER_LOG="./bls_$TRIAL.inference_server.log" - - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi +# Test with different sizes of CUDA memory pool +for CUDA_MEMORY_POOL_SIZE_MB in 64 128 ; do + CUDA_MEMORY_POOL_SIZE_BYTES=$((CUDA_MEMORY_POOL_SIZE_MB * 1024 * 1024)) + SERVER_ARGS="--model-repository=`pwd`/models --backend-directory=${BACKEND_DIR} --log-verbose=1 --cuda-memory-pool-byte-size=0:${CUDA_MEMORY_POOL_SIZE_BYTES}" + for TRIAL in non_decoupled decoupled ; do + export BLS_KIND=$TRIAL + SERVER_LOG="./bls_$TRIAL.$CUDA_MEMORY_POOL_SIZE_MB.inference_server.log" + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi - set +e + set +e - export MODEL_NAME='bls' - python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** 'bls' $BLS_KIND test FAILED. \n***" - cat $CLIENT_LOG - RET=1 - SUB_TEST_RET=1 - else - check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + export MODEL_NAME='bls' + python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then + echo -e "\n***\n*** 'bls' $BLS_KIND test FAILED. \n***" cat $CLIENT_LOG - echo -e "\n***\n*** Test Result Verification Failed\n***" RET=1 SUB_TEST_RET=1 + else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + SUB_TEST_RET=1 + fi fi - fi - export MODEL_NAME='bls_memory' - python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** 'bls_memory' $BLS_KIND test FAILED. \n***" - cat $CLIENT_LOG - RET=1 - SUB_TEST_RET=1 - else - check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + export MODEL_NAME='bls_memory' + python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then + echo -e "\n***\n*** 'bls_memory' $BLS_KIND test FAILED. \n***" cat $CLIENT_LOG - echo -e "\n***\n*** Test Result Verification Failed\n***" RET=1 SUB_TEST_RET=1 + else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? 
-ne 0 ]; then
+                cat $CLIENT_LOG
+                echo -e "\n***\n*** Test Result Verification Failed\n***"
+                RET=1
+                SUB_TEST_RET=1
+            fi
         fi
-    fi
 
-    export MODEL_NAME='bls_memory_async'
-    python3 $CLIENT_PY >> $CLIENT_LOG 2>&1
-    if [ $? -ne 0 ]; then
-        echo -e "\n***\n*** 'bls_async_memory' $BLS_KIND test FAILED. \n***"
-        cat $CLIENT_LOG
-        RET=1
-        SUB_TEST_RET=1
-    else
-        check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
+        export MODEL_NAME='bls_memory_async'
+        python3 $CLIENT_PY >> $CLIENT_LOG 2>&1
         if [ $? -ne 0 ]; then
+            echo -e "\n***\n*** 'bls_async_memory' $BLS_KIND test FAILED. \n***"
             cat $CLIENT_LOG
-            echo -e "\n***\n*** Test Result Verification Failed\n***"
             RET=1
             SUB_TEST_RET=1
+        else
+            check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
+            if [ $? -ne 0 ]; then
+                cat $CLIENT_LOG
+                echo -e "\n***\n*** Test Result Verification Failed\n***"
+                RET=1
+                SUB_TEST_RET=1
+            fi
         fi
-    fi
 
-    export MODEL_NAME='bls_async'
-    python3 $CLIENT_PY >> $CLIENT_LOG 2>&1
-    if [ $? -ne 0 ]; then
-        echo -e "\n***\n*** 'bls_async' $BLS_KIND test FAILED. \n***"
-        cat $CLIENT_LOG
-        RET=1
-        SUB_TEST_RET=1
-    else
-        check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
+        export MODEL_NAME='bls_async'
+        python3 $CLIENT_PY >> $CLIENT_LOG 2>&1
         if [ $? -ne 0 ]; then
+            echo -e "\n***\n*** 'bls_async' $BLS_KIND test FAILED. \n***"
             cat $CLIENT_LOG
-            echo -e "\n***\n*** Test Result Verification Failed\n***"
             RET=1
             SUB_TEST_RET=1
+        else
+            check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
+            if [ $? -ne 0 ]; then
+                cat $CLIENT_LOG
+                echo -e "\n***\n*** Test Result Verification Failed\n***"
+                RET=1
+                SUB_TEST_RET=1
+            fi
         fi
-    fi
 
-    set -e
+        set -e
 
-    kill $SERVER_PID
-    wait $SERVER_PID
+        kill $SERVER_PID
+        wait $SERVER_PID
 
-    if [ $SUB_TEST_RET -eq 1 ]; then
-        cat $CLIENT_LOG
-        cat $SERVER_LOG
-    fi
+        if [ $SUB_TEST_RET -eq 1 ]; then
+            cat $CLIENT_LOG
+            cat $SERVER_LOG
+        fi
+
+        if [[ $CUDA_MEMORY_POOL_SIZE_MB -eq 128 ]]; then
+            if [ `grep -c "Failed to allocate memory from CUDA memory pool" $SERVER_LOG` != "0" ]; then
+                echo -e "\n***\n*** Expected to use CUDA memory pool for all tests when CUDA_MEMORY_POOL_SIZE_MB is 128 MB for 'bls' $BLS_KIND test\n***"
+                cat $SERVER_LOG
+                RET=1
+            fi
+        fi
+    done
 done
 
 # Test error handling when BLS is used in "initialize" or "finalize" function
diff --git a/qa/python_models/bls/model.py b/qa/python_models/bls/model.py
index 54c62f954c..69c0d2740b 100644
--- a/qa/python_models/bls/model.py
+++ b/qa/python_models/bls/model.py
@@ -220,7 +220,7 @@ def _send_bls_sequence_requests(self, correlation_id, is_decoupled):
             infer_request.flags(), pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START
         )
         infer_response = infer_request.exec()
-        self.assertFalse(infer_response.has_error())
+        self.assertFalse(infer_response.has_error(), infer_response.error())
         output = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT")
         self.assertFalse(output.is_cpu())
         output = from_dlpack(output.to_dlpack()).to("cpu").cpu().detach().numpy()
@@ -242,7 +242,7 @@ def _send_bls_sequence_requests(self, correlation_id, is_decoupled):
             next(infer_responses)
         else:
             infer_response = infer_request.exec()
-            self.assertFalse(infer_response.has_error())
+            self.assertFalse(infer_response.has_error(), infer_response.error())
 
         # The new output is the previous output + the current input
         expected_output = output[0] + i
@@ -275,7 +275,7 @@ def _send_bls_sequence_requests(self, correlation_id, is_decoupled):
         else:
             infer_response = infer_request.exec()
 
-        self.assertFalse(infer_response.has_error())
+        self.assertFalse(infer_response.has_error(), 
infer_response.error()) expected_output = output[0] + input.as_numpy()[0] output = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT") self.assertFalse(output.is_cpu()) @@ -345,7 +345,7 @@ def _get_gpu_bls_outputs(self, input0_pb, input1_pb, is_decoupled): else: infer_response = infer_request.exec() - self.assertFalse(infer_response.has_error()) + self.assertFalse(infer_response.has_error(), infer_response.error()) output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") output1 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT1") @@ -401,7 +401,7 @@ def test_zero_length_io(self): else: infer_response = infer_request.exec() - self.assertFalse(infer_response.has_error()) + self.assertFalse(infer_response.has_error(), infer_response.error()) output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") self.assertTrue(np.all(output0 == input0)) @@ -431,7 +431,7 @@ def test_bls_tensor_lifecycle(self): next(infer_responses) else: infer_response = infer_request.exec() - self.assertFalse(infer_response.has_error()) + self.assertFalse(infer_response.has_error(), infer_response.error()) output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") np.testing.assert_equal( @@ -472,7 +472,7 @@ def test_bls_tensor_lifecycle(self): else: infer_response = infer_request.exec() - self.assertFalse(infer_response.has_error()) + self.assertFalse(infer_response.has_error(), infer_response.error()) output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") output0_pytorch = from_dlpack(output0.to_dlpack()) @@ -638,7 +638,7 @@ def _test_response_iterator_square( expected_output_cnt = np.array([expected_output_cnt], dtype=np.int32) for infer_response in response_iterator: - self.assertFalse(infer_response.has_error()) + self.assertFalse(infer_response.has_error(), infer_response.error()) if len(infer_response.output_tensors()) > 0: output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT") self.assertIsNotNone(output0) @@ -671,7 +671,7 @@ def test_response_iterator(self): # case 1. Use Next() to get the next response first, then use # for-loop to get the remaining responses. infer_response = next(infer_responses) - self.assertFalse(infer_response.has_error()) + self.assertFalse(infer_response.has_error(), infer_response.error()) output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT") self.assertIsNotNone(output0) self.assertEqual(response_value, output0.as_numpy()) @@ -695,7 +695,7 @@ def test_response_iterator(self): # get the remaining responses. 
response_count = 0 for infer_response in infer_responses: - self.assertFalse(infer_response.has_error()) + self.assertFalse(infer_response.has_error(), infer_response.error()) output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT") self.assertIsNotNone(output0) self.assertEqual(response_value, output0.as_numpy()) @@ -705,7 +705,7 @@ def test_response_iterator(self): break infer_response = next(infer_responses) - self.assertFalse(infer_response.has_error()) + self.assertFalse(infer_response.has_error(), infer_response.error()) output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT") self.assertIsNotNone(output0) self.assertEqual(response_value, output0.as_numpy()) @@ -720,7 +720,7 @@ def test_response_iterator(self): infer_responses = infer_request.exec(decoupled=True) infer_response = next(infer_responses) - self.assertFalse(infer_response.has_error()) + self.assertFalse(infer_response.has_error(), infer_response.error()) output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT") self.assertIsNotNone(output0) self.assertEqual(response_value, output0.as_numpy())