triton-inference-server · kthui · Jun 15, 2023 · Jun 1, 2023 · Jun 5, 2023 · Jun 9, 2023
diff --git a/qa/L0_model_update/instance_update_test.py b/qa/L0_model_update/instance_update_test.py
@@ -339,7 +339,7 @@ def test_infer_while_updating(self):
         self.__unload_model()
 
     # Test instance resource requirement increase
-    @unittest.skipUnless(os.environ["RATE_LIMIT_MODE"] == "execution_count",
+    @unittest.skipUnless("execution_count" in os.environ["RATE_LIMIT_MODE"],
                          "Rate limiter precondition not met for this test")
     def test_instance_resource_increase(self):
         # Load model
@@ -368,6 +368,70 @@ def infer():
         # Unload model
         self.__unload_model()
 
+    # Test instance resource requirement increase above explicit resource.
+    # Only runs in the mode where test.sh starts the server with
+    # --rate-limit-resource=R1:10, i.e. an explicit limit of 10 on R1.
+    @unittest.skipUnless(os.environ["RATE_LIMIT_MODE"] ==
+                         "execution_count_with_explicit_resource",
+                         "Rate limiter precondition not met for this test")
+    def test_instance_resource_increase_above_explicit(self):
+        # Load model
+        self.__load_model(
+            1,
+            "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 2\n}\n]\n}\n}"
+        )
+        # Increase resource requirement above the explicit limit; the update
+        # is expected to be rejected
+        with self.assertRaises(InferenceServerException):
+            self.__update_instance_count(
+                0, 0,
+                "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 32\n}\n]\n}\n}"
+            )
+        # Correct the resource requirement to match the explicit resource
+        self.__update_instance_count(
+            1, 1,
+            "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 10\n}\n]\n}\n}"
+        )
+        # Unload model
+        self.__unload_model()
+
+    # Test instance resource requirement decrease
+    @unittest.skipUnless("execution_count" in os.environ["RATE_LIMIT_MODE"],
+                         "Rate limiter precondition not met for this test")
+    def test_instance_resource_decrease(self):
+        # Load model
+        self.__load_model(
+            1,
+            "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 4\n}\n]\n}\n}"
+        )
+        # Decrease resource requirement
+        self.__update_instance_count(
+            1, 1,
+            "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 3\n}\n]\n}\n}"
+        )
+        # Unload model
+        self.__unload_model()
+        # The resource count of 3 is unique across this entire test, so check
+        # the server output to make sure it is printed, which ensures the
+        # max resource is actually decreased.
+        time.sleep(1)  # make sure the log file is updated
+        log_path = os.path.join(
+            os.environ["MODEL_LOG_DIR"], "instance_update_test.rate_limit_" +
+            os.environ["RATE_LIMIT_MODE"] + ".server.log")
+        with open(log_path, mode="r", encoding="utf-8", errors="strict") as f:
+            server_log = f.read()
+        # Match the count as a whole number: a plain substring check for
+        # "Count: 3" would also match "Count: 32", which is the count
+        # requested by test_instance_resource_increase_above_explicit.
+        r1_count_3 = r"Resource: R1\t Count: 3(?!\d)"
+        if os.environ["RATE_LIMIT_MODE"] == "execution_count":
+            # Make sure the previous max resource limit of 4 is reduced to 3
+            # when no explicit limit is set.
+            self.assertRegex(server_log, r1_count_3)
+        else:
+            # Make sure the max resource limit is never set to 3 when
+            # explicit limit of 10 is set.
+            self.assertNotRegex(server_log, r1_count_3)
+
     # Test for instance update on direct sequence scheduling
     @unittest.skip("Sequence will not continue after update [FIXME: DLIS-4820]")
     def test_instance_update_on_direct_sequence_scheduling(self):

diff --git a/qa/L0_model_update/test.sh b/qa/L0_model_update/test.sh
@@ -55,15 +55,20 @@ function setup_models() {
 
 RET=0
 
-# Test model instance update with and without rate limiting enabled
-for RATE_LIMIT_MODE in "off" "execution_count"; do
+# Test model instance update with rate limiting off, on, and on with an explicit resource limit
+for RATE_LIMIT_MODE in "off" "execution_count" "execution_count_with_explicit_resource"; do
+
+    RATE_LIMIT_ARGS="--rate-limit=$RATE_LIMIT_MODE"
+    if [ "$RATE_LIMIT_MODE" == "execution_count_with_explicit_resource" ]; then
+        RATE_LIMIT_ARGS="--rate-limit=execution_count --rate-limit-resource=R1:10"
+    fi
 
     export RATE_LIMIT_MODE=$RATE_LIMIT_MODE
     TEST_LOG="instance_update_test.rate_limit_$RATE_LIMIT_MODE.log"
     SERVER_LOG="./instance_update_test.rate_limit_$RATE_LIMIT_MODE.server.log"
 
     setup_models
-    SERVER_ARGS="--model-repository=models --model-control-mode=explicit --rate-limit=$RATE_LIMIT_MODE --log-verbose=2"
+    SERVER_ARGS="--model-repository=models --model-control-mode=explicit $RATE_LIMIT_ARGS --log-verbose=2"
     run_server
     if [ "$SERVER_PID" == "0" ]; then
         echo -e "\n***\n*** Failed to start $SERVER\n***"
@@ -83,6 +88,17 @@ for RATE_LIMIT_MODE in "off" "execution_count"; do
     kill $SERVER_PID
     wait $SERVER_PID
 
+    # The server log must never contain the sentinel text below. grep exits
+    # non-zero when nothing matches, so errexit is turned off around it.
+    set +e
+    grep "Should not print this" "$SERVER_LOG"
+    if [ $? -eq 0 ]; then
+        echo -e "\n***\n*** Found \"Should not print this\" on \"$SERVER_LOG\"\n***"
+        cat "$SERVER_LOG"
+        RET=1
+    fi
+    set -e
+
 done
 
 if [ $RET -eq 0 ]; then