Add testing for Python backend request rescheduling #6509

Merged
merged 10 commits on Nov 9, 2023
Changes from 8 commits
1 change: 1 addition & 0 deletions qa/L0_backend_python/python_unittest.py
@@ -68,6 +68,7 @@ def test_python_unittest(self):
model_name == "bls"
or model_name == "bls_memory"
or model_name == "bls_memory_async"
or model_name == "bls_request_rescheduling"
):
# For these tests, the memory region size will be grown. Because of
# this we need to use the shared memory probe only on the later
111 changes: 111 additions & 0 deletions qa/L0_backend_python/request_rescheduling/grpc_endpoint_test.py
@@ -0,0 +1,111 @@
#!/usr/bin/env python
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import sys

sys.path.append("../../common")

# GRPC streaming helpers.
import queue
import unittest
from functools import partial

import numpy as np
import test_util as tu
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException


class UserData:
    def __init__(self):
        self._completed_requests = queue.Queue()


def callback(user_data, result, error):
    if error:
        user_data._completed_requests.put(error)
    else:
        user_data._completed_requests.put(result)


class GRPCENDPOINTTest(tu.TestResultCollector):
    def test_grpc_decoupled(self, sequence_id=0, sequence_start=False):
        user_data = UserData()
        with grpcclient.InferenceServerClient("localhost:8001") as triton_client:
            # Reload the model to reset the flag
            triton_client.unload_model("generative_sequence")
            triton_client.load_model("generative_sequence")

            triton_client.start_stream(callback=partial(callback, user_data))
            inputs = []
            inputs.append(grpcclient.InferInput("IN", [1], "INT32"))
            inputs[0].set_data_from_numpy(np.array([3], dtype=np.int32))

            triton_client.async_stream_infer(
                model_name="generative_sequence",
                inputs=inputs,
                sequence_id=sequence_id,
                sequence_start=sequence_start,
            )
            res_count = 3
            while res_count > 0:
                data_item = user_data._completed_requests.get()
                res_count -= 1
                if type(data_item) == InferenceServerException:
                    raise data_item
                else:
                    self.assertEqual(res_count, data_item.as_numpy("OUT")[0])
            self.assertEqual(0, res_count)

    def test_grpc_non_decoupled(self, sequence_id=0, sequence_start=False):
        with grpcclient.InferenceServerClient("localhost:8001") as triton_client:
            # Reload the model to reset the flag
            triton_client.unload_model("request_rescheduling_addsub")
            triton_client.load_model("request_rescheduling_addsub")

            inputs = []
            inputs.append(grpcclient.InferInput("INPUT0", [16], "FP32"))
            inputs.append(grpcclient.InferInput("INPUT1", [16], "FP32"))
            input0_val = np.random.randn(*[16]).astype(np.float32)
            input1_val = np.random.randn(*[16]).astype(np.float32)
            inputs[0].set_data_from_numpy(input0_val)
            inputs[1].set_data_from_numpy(input1_val)

            results = triton_client.infer(
                model_name="request_rescheduling_addsub",
                inputs=inputs,
            )

            output0_data = results.as_numpy("OUTPUT0")
            output1_data = results.as_numpy("OUTPUT1")

            self.assertTrue(np.array_equal(output0_data, input0_val + input1_val))
            self.assertTrue(np.array_equal(output1_data, input0_val - input1_val))


if __name__ == "__main__":
    unittest.main()
116 changes: 116 additions & 0 deletions qa/L0_backend_python/request_rescheduling/test.sh
@@ -0,0 +1,116 @@
#!/bin/bash
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CLIENT_PY=../python_unittest.py
CLIENT_LOG="./request_rescheduling_client.log"
EXPECTED_NUM_TESTS="1"
TEST_RESULT_FILE='test_results.txt'
source ../../common/util.sh

TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}
SERVER=${TRITON_DIR}/bin/tritonserver
BACKEND_DIR=${TRITON_DIR}/backends

RET=0

rm -fr *.log ./models *.txt

mkdir -p models/bls_request_rescheduling/1/
cp ../../python_models/bls_request_rescheduling/model.py models/bls_request_rescheduling/1/
cp ../../python_models/bls_request_rescheduling/config.pbtxt models/bls_request_rescheduling

mkdir -p models/request_rescheduling_addsub/1/
cp ../../python_models/request_rescheduling_addsub/model.py models/request_rescheduling_addsub/1/
cp ../../python_models/request_rescheduling_addsub/config.pbtxt models/request_rescheduling_addsub

mkdir -p models/generative_sequence/1/
cp ../../python_models/generative_sequence/model.py models/generative_sequence/1/
cp ../../python_models/generative_sequence/config.pbtxt models/generative_sequence

mkdir -p models/wrong_return_type/1/
cp ../../python_models/wrong_return_type/model.py models/wrong_return_type/1/
cp ../../python_models/wrong_return_type/config.pbtxt models/wrong_return_type

SERVER_LOG="./request_rescheduling_server.log"
SERVER_ARGS="--model-repository=`pwd`/models --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --load-model=* --log-verbose=1"

run_server
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
exit 1
fi

export MODEL_NAME='bls_request_rescheduling'

set +e
python3 $CLIENT_PY >> $CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
    echo -e "\n***\n*** bls_request_rescheduling test FAILED. \n***"
    cat $CLIENT_LOG
    RET=1
else
    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
    if [ $? -ne 0 ]; then
        cat $CLIENT_LOG
        echo -e "\n***\n*** Test Result Verification Failed\n***"
        RET=1
    fi
fi
set -e

GRPC_TEST_PY=./grpc_endpoint_test.py
EXPECTED_NUM_TESTS="2"

set +e
python3 $GRPC_TEST_PY >> $CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
    echo -e "\n***\n*** GRPC Endpoint test FAILED. \n***"
    cat $CLIENT_LOG
    RET=1
else
    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
    if [ $? -ne 0 ]; then
        cat $CLIENT_LOG
        echo -e "\n***\n*** Test Result Verification Failed\n***"
        RET=1
    fi
fi
set -e

kill $SERVER_PID
wait $SERVER_PID


if [ $RET -eq 1 ]; then
    cat $SERVER_LOG
    echo -e "\n***\n*** Request Rescheduling test FAILED. \n***"
else
    echo -e "\n***\n*** Request Rescheduling test PASSED. \n***"
fi

exit $RET
2 changes: 1 addition & 1 deletion qa/L0_backend_python/test.sh
@@ -423,7 +423,7 @@ if [ "$TEST_JETSON" == "0" ]; then
fi
fi

SUBTESTS="lifecycle restart model_control examples argument_validation logging custom_metrics"
SUBTESTS="lifecycle restart model_control examples argument_validation logging custom_metrics request_rescheduling"
for TEST in ${SUBTESTS}; do
# Run each subtest in a separate virtual environment to avoid conflicts
# between dependencies.
24 changes: 12 additions & 12 deletions qa/python_models/bls/model.py
@@ -220,7 +220,7 @@ def _send_bls_sequence_requests(self, correlation_id, is_decoupled):
infer_request.flags(), pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START
)
infer_response = infer_request.exec()
self.assertFalse(infer_response.has_error(), infer_response.error())
self.assertFalse(infer_response.has_error())
output = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT")
self.assertFalse(output.is_cpu())
output = from_dlpack(output.to_dlpack()).to("cpu").cpu().detach().numpy()
@@ -242,7 +242,7 @@ def _send_bls_sequence_requests(self, correlation_id, is_decoupled):
next(infer_responses)
else:
infer_response = infer_request.exec()
self.assertFalse(infer_response.has_error(), infer_response.error())
self.assertFalse(infer_response.has_error())

# The new output is the previous output + the current input
expected_output = output[0] + i
@@ -275,7 +275,7 @@ def _send_bls_sequence_requests(self, correlation_id, is_decoupled):
else:
infer_response = infer_request.exec()

self.assertFalse(infer_response.has_error(), infer_response.error())
self.assertFalse(infer_response.has_error())
Member

I think having the error string would make it easier to debug in case the test fails

Contributor Author

The correct call to print the error string would be infer_response.error().message(). However, calling infer_response.error().message() here throws an error when infer_response carries no error object, so these second arguments are removed.
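
A guarded variant could keep the error message in the failure report without hitting that crash. A minimal sketch, assuming only the has_error()/error().message() behavior described above; the assert_no_error helper is hypothetical and not part of this PR:

# Hypothetical helper (sketch only, not part of this PR): build the message
# only when the response actually carries an error, so a successful response
# never dereferences a missing error object.
def assert_no_error(test_case, infer_response):
    error_msg = None
    if infer_response.has_error():
        # Safe here: has_error() is True, so error() returns an error object.
        error_msg = infer_response.error().message()
    test_case.assertFalse(infer_response.has_error(), error_msg)

Each call site would then read assert_no_error(self, infer_response) instead of the bare assertFalse.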

expected_output = output[0] + input.as_numpy()[0]
output = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT")
self.assertFalse(output.is_cpu())
@@ -345,7 +345,7 @@ def _get_gpu_bls_outputs(self, input0_pb, input1_pb, is_decoupled):
else:
infer_response = infer_request.exec()

self.assertFalse(infer_response.has_error(), infer_response.error())
self.assertFalse(infer_response.has_error())

output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0")
output1 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT1")
@@ -401,7 +401,7 @@ def test_zero_length_io(self):
else:
infer_response = infer_request.exec()

self.assertFalse(infer_response.has_error(), infer_response.error())
self.assertFalse(infer_response.has_error())

output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0")
self.assertTrue(np.all(output0 == input0))
@@ -439,7 +439,7 @@ def bls_tensor_lifecycle_helper(self):
next(infer_responses)
else:
infer_response = infer_request.exec()
self.assertFalse(infer_response.has_error(), infer_response.error())
self.assertFalse(infer_response.has_error())

output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0")
np.testing.assert_equal(
@@ -497,7 +497,7 @@ def bls_tensor_lifecycle_helper(self):
else:
infer_response = infer_request.exec()

self.assertFalse(infer_response.has_error(), infer_response.error())
self.assertFalse(infer_response.has_error())

output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0")
output0_pytorch = from_dlpack(output0.to_dlpack())
@@ -677,7 +677,7 @@ def _test_response_iterator_square(
expected_output_cnt = np.array([expected_output_cnt], dtype=np.int32)

for infer_response in response_iterator:
self.assertFalse(infer_response.has_error(), infer_response.error())
self.assertFalse(infer_response.has_error())
if len(infer_response.output_tensors()) > 0:
output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT")
self.assertIsNotNone(output0)
@@ -710,7 +710,7 @@ def test_response_iterator(self):
# case 1. Use Next() to get the next response first, then use
# for-loop to get the remaining responses.
infer_response = next(infer_responses)
self.assertFalse(infer_response.has_error(), infer_response.error())
self.assertFalse(infer_response.has_error())
output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT")
self.assertIsNotNone(output0)
self.assertEqual(response_value, output0.as_numpy())
@@ -734,7 +734,7 @@ def test_response_iterator(self):
# get the remaining responses.
response_count = 0
for infer_response in infer_responses:
self.assertFalse(infer_response.has_error(), infer_response.error())
self.assertFalse(infer_response.has_error())
output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT")
self.assertIsNotNone(output0)
self.assertEqual(response_value, output0.as_numpy())
@@ -744,7 +744,7 @@ def test_response_iterator(self):
break

infer_response = next(infer_responses)
self.assertFalse(infer_response.has_error(), infer_response.error())
self.assertFalse(infer_response.has_error())
output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT")
self.assertIsNotNone(output0)
self.assertEqual(response_value, output0.as_numpy())
@@ -759,7 +759,7 @@ def test_response_iterator(self):
infer_responses = infer_request.exec(decoupled=True)

infer_response = next(infer_responses)
self.assertFalse(infer_response.has_error(), infer_response.error())
self.assertFalse(infer_response.has_error())
output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT")
self.assertIsNotNone(output0)
self.assertEqual(response_value, output0.as_numpy())
38 changes: 38 additions & 0 deletions qa/python_models/bls_request_rescheduling/config.pbtxt
@@ -0,0 +1,38 @@
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

name: "bls_request_rescheduling"
backend: "python"

output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ 16 ]
  }
]

instance_group [{ kind: KIND_CPU }]