From fe8e499c45edfe224984ea56caed95380a4b0795 Mon Sep 17 00:00:00 2001
From: David Yastremsky
Date: Mon, 1 May 2023 17:48:29 -0700
Subject: [PATCH 1/3] Add testing for batcher init failure, add wait for status check

---
 qa/L0_batch_custom/batch_custom_test.py     | 17 +++++++--
 qa/L0_batch_custom/test.sh                  | 36 +++++++++++++++++--
 qa/L0_optional_input/optional_input_test.py | 17 +++++++--
 .../trt_shape_tensor_test.py                | 17 +++++++--
 4 files changed, 79 insertions(+), 8 deletions(-)

diff --git a/qa/L0_batch_custom/batch_custom_test.py b/qa/L0_batch_custom/batch_custom_test.py
index 5f77fd3975..4250908c91 100644
--- a/qa/L0_batch_custom/batch_custom_test.py
+++ b/qa/L0_batch_custom/batch_custom_test.py
@@ -122,8 +122,21 @@ def check_response(self,
 
     def check_status(self, model_name, batch_exec, request_cnt, infer_cnt,
                      exec_count):
-        stats = self.triton_client_.get_inference_statistics(model_name, "1")
-        self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats")
+        # There is a time window between when responses are returned and statistics are updated.
+        # To prevent intermittent test failure during that window, wait up to 10 seconds for the
+        # inference statistics to be ready.
+        num_tries = 10
+        for i in range(num_tries):
+            stats = self.triton_client_.get_inference_statistics(
+                model_name, "1")
+            self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats")
+            actual_exec_cnt = stats.model_stats[0].execution_count
+            if actual_exec_cnt == exec_count:
+                break
+            print("WARNING: expect {} executions, got {} (attempt {})".format(
+                exec_count, actual_exec_cnt, i))
+            time.sleep(1)
+
         self.assertEqual(stats.model_stats[0].name, model_name,
                          "expect model stats for model {}".format(model_name))
         self.assertEqual(
diff --git a/qa/L0_batch_custom/test.sh b/qa/L0_batch_custom/test.sh
index a420ec9d9d..d207c97fde 100755
--- a/qa/L0_batch_custom/test.sh
+++ b/qa/L0_batch_custom/test.sh
@@ -102,7 +102,7 @@ cp -r backend/examples/batching_strategies/single_batching/build/libtriton_singl
 # Run a test to validate the single batching strategy example.
 # Then, run tests to validate the volume batching example being passed in via the backend dir, model dir, version dir, and model config.
 BACKEND_DIR="/opt/tritonserver/backends/onnxruntime"
-MODEL_DIR="models/$MODEL_NAME/"
+MODEL_DIR="models/$MODEL_NAME"
 VERSION_DIR="$MODEL_DIR/1/"
 
 test_types=('single_batching_backend' 'backend_directory' 'model_directory' 'version_directory' 'model_config')
@@ -151,10 +151,42 @@ for i in "${!test_setups[@]}"; do
     wait $SERVER_PID
 done
 
+# Test ModelBatchInitialize failure
+FILE_PATH="backend/examples/batching_strategies/volume_batching/src/volume_batching.cc"
+OLD_STRING="\*batcher = reinterpret_cast("
+NEW_STRING="return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_NOT_FOUND,\"Failure test case\");"
+
+sed -i "s/${OLD_STRING}/${NEW_STRING}/g" ${FILE_PATH}
+sed -i "s/new unsigned int(max_volume_bytes));//g" ${FILE_PATH}
+
+(cd backend/examples/batching_strategies/volume_batching &&
+    cd build &&
+    cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \
+        -DTRITON_CORE_REPO_TAG=$TRITON_CORE_REPO_TAG .. &&
+    make -j4 install)
+
+cp -r backend/examples/batching_strategies/volume_batching/build/libtriton_volumebatching.so models/${MODEL_NAME}/libtriton_volumebatching.so
+
+SERVER_LOG=${SERVER_LOG_BASE}_batching_init_failure
+
+run_server
+if [ "$SERVER_PID" != "0" ]; then
+    cat $SERVER_LOG
+    echo -e "\n***\n*** ModelBatchInit Error Test: unexpected successful server start $SERVER\n***"
+    kill_server
+    RET=1
+else
+    if [ `grep -c "Failure test case" $SERVER_LOG` != "1" ] || [ `grep -c "Not found" $SERVER_LOG` != "1" ]; then
+        cat $SERVER_LOG
+        echo -e "\n***\n*** ModelBatchInit Error Test: failed to find 1 \"Failure test case\" message and 1 \"Not found\" error type"
+        RET=1
+    fi
+fi
+
+
 
 if [ $RET -eq 0 ]; then
     echo -e "\n***\n*** Test Passed\n***"
 else
-    cat $CLIENT_LOG
     echo -e "\n***\n*** Test FAILED\n***"
 fi
 
diff --git a/qa/L0_optional_input/optional_input_test.py b/qa/L0_optional_input/optional_input_test.py
index 2543a198d6..c813146ecd 100644
--- a/qa/L0_optional_input/optional_input_test.py
+++ b/qa/L0_optional_input/optional_input_test.py
@@ -124,8 +124,21 @@ def check_response(self, thresholds, provided_inputs=("INPUT0", "INPUT1")):
         self.add_deferred_exception(ex)
 
     def check_status(self, model_name, batch_exec, request_cnt, infer_cnt):
-        stats = self.triton_client_.get_inference_statistics(model_name, "1")
-        self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats")
+        # There is a time window between when responses are returned and statistics are updated.
+        # To prevent intermittent test failure during that window, wait up to 10 seconds for the
+        # inference statistics to be ready.
+        num_tries = 10
+        for i in range(num_tries):
+            stats = self.triton_client_.get_inference_statistics(
+                model_name, "1")
+            self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats")
+            actual_exec_cnt = stats.model_stats[0].execution_count
+            if actual_exec_cnt == sum(batch_exec.values()):
+                break
+            print("WARNING: expect {} executions, got {} (attempt {})".format(
+                sum(batch_exec.values()), actual_exec_cnt, i))
+            time.sleep(1)
+
         self.assertEqual(stats.model_stats[0].name, model_name,
                          "expect model stats for model {}".format(model_name))
         self.assertEqual(
diff --git a/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py b/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py
index 8e87b8cc40..14609dbb94 100644
--- a/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py
+++ b/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py
@@ -130,8 +130,21 @@ def check_setup(self, model_name):
                          _max_queue_delay_ms * 1000)  # 10 secs
 
     def check_status(self, model_name, batch_exec, exec_cnt, infer_cnt):
-        stats = self.triton_client_.get_inference_statistics(model_name, "1")
-        self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats")
+        # There is a time window between when responses are returned and statistics are updated.
+        # To prevent intermittent test failure during that window, wait up to 10 seconds for the
+        # inference statistics to be ready.
+ num_tries = 10 + for i in range(num_tries): + stats = self.triton_client_.get_inference_statistics( + model_name, "1") + self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") + actual_exec_cnt = stats.model_stats[0].execution_count + if actual_exec_cnt == exec_cnt: + break + print("WARNING: expect {} executions, got {} (attempt {})".format( + exec_cnt, actual_exec_cnt, i)) + time.sleep(1) + self.assertEqual(stats.model_stats[0].name, model_name, "expect model stats for model {}".format(model_name)) self.assertEqual( From a2e2fc80dd59728589b14f0ee479eb574c5f94f8 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Mon, 1 May 2023 17:49:22 -0700 Subject: [PATCH 2/3] Formatting --- qa/L0_batch_custom/batch_custom_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/L0_batch_custom/batch_custom_test.py b/qa/L0_batch_custom/batch_custom_test.py index 4250908c91..3fb74cf25d 100644 --- a/qa/L0_batch_custom/batch_custom_test.py +++ b/qa/L0_batch_custom/batch_custom_test.py @@ -136,7 +136,7 @@ def check_status(self, model_name, batch_exec, request_cnt, infer_cnt, print("WARNING: expect {} executions, got {} (attempt {})".format( exec_count, actual_exec_cnt, i)) time.sleep(1) - + self.assertEqual(stats.model_stats[0].name, model_name, "expect model stats for model {}".format(model_name)) self.assertEqual( From 8c283e7ed44f3bcc44ae897b48045e4bdd3db7f6 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Tue, 2 May 2023 08:01:51 -0700 Subject: [PATCH 3/3] Change search string --- qa/L0_batch_custom/test.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/qa/L0_batch_custom/test.sh b/qa/L0_batch_custom/test.sh index d207c97fde..ca4b0e4b24 100755 --- a/qa/L0_batch_custom/test.sh +++ b/qa/L0_batch_custom/test.sh @@ -153,11 +153,10 @@ done # Test ModelBatchInitialize failure FILE_PATH="backend/examples/batching_strategies/volume_batching/src/volume_batching.cc" -OLD_STRING="\*batcher = reinterpret_cast(" +OLD_STRING="\/\/ Batcher will point to an unsigned integer representing the maximum" NEW_STRING="return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_NOT_FOUND,\"Failure test case\");" sed -i "s/${OLD_STRING}/${NEW_STRING}/g" ${FILE_PATH} -sed -i "s/new unsigned int(max_volume_bytes));//g" ${FILE_PATH} (cd backend/examples/batching_strategies/volume_batching && cd build && @@ -176,9 +175,9 @@ if [ "$SERVER_PID" != "0" ]; then kill_server RET=1 else - if [ `grep -c "Failure test case" $SERVER_LOG` != "1" ] || [ `grep -c "Not found" $SERVER_LOG` != "1" ]; then + if [ `grep -c "Failure test case" $SERVER_LOG` -lt 1 ] || [ `grep -c "Not found" $SERVER_LOG` -lt 1 ]; then cat $SERVER_LOG - echo -e "\n***\n*** ModelBatchInit Error Test: failed to find 1 \"Failure test case\" message and 1 \"Not found\" error type" + echo -e "\n***\n*** ModelBatchInit Error Test: failed to find \"Failure test case\" message and/or \"Not found\" error type" RET=1 fi fi
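Below is a minimal, standalone sketch of the retry pattern that the three check_status() changes above share, for readers who want to apply it outside these tests. The helper name, the server URL, and the model name are illustrative assumptions, not part of this patch; it assumes the tritonclient gRPC package is installed and a Triton server is reachable.

import time

import tritonclient.grpc as grpcclient


def wait_for_exec_count(client, model_name, expected_exec_cnt, num_tries=10):
    """Poll inference statistics until execution_count matches.

    Statistics are updated shortly after responses are returned, so poll up
    to num_tries times, one second apart, before giving up. Returns the last
    statistics response so the caller can run further assertions on it.
    """
    stats = None
    for i in range(num_tries):
        stats = client.get_inference_statistics(model_name, "1")
        actual_exec_cnt = stats.model_stats[0].execution_count
        if actual_exec_cnt == expected_exec_cnt:
            break
        print("WARNING: expect {} executions, got {} (attempt {})".format(
            expected_exec_cnt, actual_exec_cnt, i))
        time.sleep(1)
    return stats


# Example usage, assuming a local server with a model named "simple" loaded:
# client = grpcclient.InferenceServerClient("localhost:8001")
# stats = wait_for_exec_count(client, "simple", expected_exec_cnt=2)

Returning the last response rather than asserting inside the helper keeps the polling loop reusable: each test can continue with its own model-specific checks (batch statistics, request counts) on the final snapshot, exactly as check_status() does after its loop.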