triton-inference-server · GuanLuo · Nov 1, 2023 · Oct 24, 2023 · Oct 24, 2023 · Oct 24, 2023
diff --git a/Dockerfile.QA b/Dockerfile.QA
@@ -132,6 +132,8 @@ RUN mkdir -p qa/common && \
     mkdir qa/L0_query/models/query/1 && \
     cp tritonbuild/tritonserver/backends/query/libtriton_query.so qa/L0_query/models/query/1/. && \
     cp bin/query_test qa/L0_query/. && \
+    mkdir qa/L0_generative_sequence/models/generative_sequence/1 && \
+    cp tritonbuild/tritonserver/backends/generative_sequence/libtriton_generative_sequence.so qa/L0_generative_sequence/models/generative_sequence/1/. && \
     cp bin/register_api_test qa/L0_register/. && \
     cp bin/async_work_queue_test qa/L0_async_work_queue/. && \
     cp tritonbuild/tritonserver/backends/implicit_state/libtriton_implicit_state.so \

diff --git a/build.py b/build.py
@@ -2051,6 +2051,7 @@ def cibase_build(
         "sequence",
         "dyna_sequence",
         "distributed_addsub",
+        "generative_sequence",
     ):
         be_install_dir = os.path.join(repo_install_dir, "backends", be)
         if target_platform() == "windows":

diff --git a/qa/L0_generative_sequence/generative_sequence_e2e.py b/qa/L0_generative_sequence/generative_sequence_e2e.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+
+sys.path.append("../common")
+
+import json
+
+# GRPC streaming helpers..
+import queue
+import unittest
+from functools import partial
+
+import numpy as np
+import requests
+import sseclient
+import test_util as tu
+import tritonclient.grpc as grpcclient
+from tritonclient.utils import InferenceServerException
+
+MODEL_CONFIG_BASE = """
+{{
+"backend": "generative_sequence",
+"max_batch_size": 4,
+"input" : [
+  {{
+    "name": "INPUT",
+    "data_type": "TYPE_INT32",
+    "dims": [ 1 ]
+  }}
+],
+"output" : [
+  {{
+    "name": "OUTPUT",
+    "data_type": "TYPE_INT32",
+    "dims": [ 1 ]
+  }}
+],
+"model_transaction_policy" : {{
+  "decoupled": true
+}},
+{},
+"instance_group" : [{{ "kind": "KIND_CPU" }}]
+}}
+"""
+
+
+class UserData:
+    def __init__(self):
+        self._completed_requests = queue.Queue()
+
+
+def callback(user_data, result, error):
+    if error:
+        user_data._completed_requests.put(error)
+    else:
+        user_data._completed_requests.put(result)
+
+
+class GenerativeSequenceTest(tu.TestResultCollector):
+    def setUp(self):
+        # Always make sure the original config is used
+        with grpcclient.InferenceServerClient("localhost:8001") as triton_client:
+            triton_client.load_model("generative_sequence")
+
+    def test_generate_stream(self):
+        headers = {"Accept": "text/event-stream"}
+        url = "http://localhost:8000/v2/models/generative_sequence/generate_stream"
+        inputs = {"INPUT": 2}
+        res = requests.post(url, data=json.dumps(inputs), headers=headers)
+        res.raise_for_status()
+        client = sseclient.SSEClient(res)
+        res_count = 2
+        for event in client.events():
+            res_count -= 1
+            data = json.loads(event.data)
+            self.assertIn("OUTPUT", data)
+            self.assertEqual(res_count, data["OUTPUT"])
+        self.assertEqual(0, res_count)
+
+    def test_grpc_stream(self, sequence_id=0, sequence_start=False):
+        user_data = UserData()
+        with grpcclient.InferenceServerClient("localhost:8001") as triton_client:
+            triton_client.start_stream(callback=partial(callback, user_data))
+            inputs = []
+            inputs.append(grpcclient.InferInput("INPUT", [1, 1], "INT32"))
+            inputs[0].set_data_from_numpy(np.array([[2]], dtype=np.int32))
+
+            triton_client.async_stream_infer(
+                model_name="generative_sequence",
+                inputs=inputs,
+                sequence_id=sequence_id,
+                sequence_start=sequence_start,
+            )
+            res_count = 2
+            while res_count > 0:
+                data_item = user_data._completed_requests.get()
+                res_count -= 1
+                if type(data_item) == InferenceServerException:
+                    raise data_item
+                else:
+                    self.assertEqual(res_count, data_item.as_numpy("OUTPUT")[0][0])
+            self.assertEqual(0, res_count)
+
+    def test_unsupported_sequence_scheduler(self):
+        # Override model config with scheduler settings that do not support
+        # request rescheduling.
+        configs = [
+            r'"sequence_batching" : { "direct" : {}, "generative_sequence" : false }',
+            r'"sequence_batching" : { "oldest" : {}, "generative_sequence" : false }',
+        ]
+        sid = 1
+        for sc in configs:
+            with grpcclient.InferenceServerClient("localhost:8001") as triton_client:
+                triton_client.load_model(
+                    "generative_sequence", config=MODEL_CONFIG_BASE.format(sc)
+                )
+            with self.assertRaises(InferenceServerException) as context:
+                # Without specifying 'generative_sequence : true', the sequence
+                # batcher expects sequence parameters to be provided explicitly
+                self.test_grpc_stream(sequence_id=sid, sequence_start=True)
+            sid += 1
+            print(str(context.exception))
+            self.assertTrue(
+                "Request is released with TRITONSERVER_REQUEST_RELEASE_RESCHEDULE"
+                in str(context.exception)
+            )
+
+    def test_unsupported_dynamic_scheduler(self):
+        # Override model config with scheduler settings that do not support
+        # request rescheduling.
+        configs = [
+            r'"dynamic_batching" : {}',
+        ]
+        for sc in configs:
+            with grpcclient.InferenceServerClient("localhost:8001") as triton_client:
+                triton_client.load_model(
+                    "generative_sequence", config=MODEL_CONFIG_BASE.format(sc)
+                )
+            with self.assertRaises(InferenceServerException) as context:
+                self.test_grpc_stream()
+            print(str(context.exception))
+            self.assertTrue(
+                "Request is released with TRITONSERVER_REQUEST_RELEASE_RESCHEDULE"
+                in str(context.exception)
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/qa/L0_generative_sequence/models/generative_sequence/config.pbtxt b/qa/L0_generative_sequence/models/generative_sequence/config.pbtxt
@@ -0,0 +1,48 @@
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+backend: "generative_sequence"
+max_batch_size: 4
+input [
+  {
+    name: "INPUT"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  }
+]
+output [
+  {
+    name: "OUTPUT"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  }
+]
+model_transaction_policy {
+  decoupled: True
+}
+sequence_batching {
+  generative_sequence : true
+}
+instance_group [{ kind: KIND_CPU }]
diff --git a/qa/L0_generative_sequence/test.sh b/qa/L0_generative_sequence/test.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION}
+if [ "$#" -ge 1 ]; then
+    REPO_VERSION=$1
+fi
+if [ -z "$REPO_VERSION" ]; then
+    echo -e "Repository version must be specified"
+    echo -e "\n***\n*** Test Failed\n***"
+    exit 1
+fi
+if [ ! -z "$TEST_REPO_ARCH" ]; then
+    REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH}
+fi
+
+source ../common/util.sh
+
+RET=0
+
+CLIENT_LOG="./generative_sequence_client.log"
+TEST_PY=./generative_sequence_e2e.py
+EXPECTED_NUM_TESTS="4"
+TEST_RESULT_FILE='test_results.txt'
+
+
+export CUDA_VISIBLE_DEVICES=0
+
+rm -fr *.log
+
+pip install sseclient-py
+
+SERVER=/opt/tritonserver/bin/tritonserver
+SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=EXPLICIT"
+SERVER_LOG="./inference_server.log"
+run_server
+if [ "$SERVER_PID" == "0" ]; then
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+fi
+
+set +e
+python $TEST_PY >>$CLIENT_LOG 2>&1
+if [ $? -ne 0 ]; then
+    RET=1
+else
+    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
+    if [ $? -ne 0 ]; then
+        cat $CLIENT_LOG
+        echo -e "\n***\n*** Test Result Verification Failed\n***"
+        RET=1
+    fi
+fi
+set -e
+
+kill $SERVER_PID
+wait $SERVER_PID
+
+if [ $RET -eq 0 ]; then
+    echo -e "\n***\n*** Test Passed\n***"
+else
+    cat $CLIENT_LOG
+    cat $SERVER_LOG
+    echo -e "\n***\n*** Test FAILED\n***"
+fi
+
+exit $RET
diff --git a/qa/L0_http/generate_endpoint_test.py b/qa/L0_http/generate_endpoint_test.py
@@ -107,7 +107,7 @@ def check_sse_responses(self, res, expected_res):
                 self.assertIn(key, data)
                 self.assertEqual(value, data[key])
             res_count += 1
-        self.assertTrue(len(expected_res), res_count)
+        self.assertEqual(len(expected_res), res_count)
         # Make sure there is no message in the wrong form
         for remaining in client._read():
             self.assertTrue(

diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -101,6 +101,7 @@ add_subdirectory(repoagent/relocation_repoagent repoagent/relocation_repoagent)
 
 add_subdirectory(distributed_addsub distributed_addsub)
 add_subdirectory(dyna_sequence dyna_sequence)
+add_subdirectory(generative_sequence generative_sequence)
 add_subdirectory(implicit_state implicit_state)
 add_subdirectory(query_backend query_backend)