triton-inference-server · rmccorm4 · Sep 26, 2023 · Sep 26, 2023 · Sep 29, 2023 · Sep 30, 2023
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@
 __pycache__
 tmp
 *.log
+test_results*.txt
diff --git a/qa/L0_http/llm_models/vllm_proxy/1/model.py b/qa/L0_http/llm_models/vllm_proxy/1/model.py
@@ -0,0 +1,91 @@
+# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import json
+
+import triton_python_backend_utils as pb_utils
+import numpy as np
+
+
+class TritonPythonModel:
+    """Echo-style Python backend model used by the generate endpoint tests.
+
+    The model copies its "PROMPT" input into the "TEXT" output. Whether it
+    runs in decoupled mode is read from the model configuration when the
+    model is initialized.
+    """
+
+    def initialize(self, args):
+        """Parse the JSON model config and record the decoupled setting.
+
+        `self.decoupled` is None when the config has no
+        `model_transaction_policy.decoupled` entry.
+        """
+        config = json.loads(args["model_config"])
+        self.model_config = config
+        policy = config.get("model_transaction_policy", {})
+        self.decoupled = policy.get("decoupled")
+        print(f"{self.decoupled=}")
+
+    def execute(self, requests):
+        """Dispatch the requests to the handler matching the configured mode."""
+        handler = self.exec_decoupled if self.decoupled else self.exec
+        return handler(requests)
+
+    def exec(self, requests):
+        """Handle requests in non-decoupled mode and return one response each.
+
+        A request whose "STREAM" input is truthy gets an error response.
+        Otherwise "TEXT" is "PROMPT" repeated "REPETITION" times along axis 1
+        (once when the request has no "REPETITION" parameter).
+        """
+        responses = []
+        for request in requests:
+            parameters = json.loads(request.parameters())
+            repetitions = parameters.get("REPETITION", 1)
+
+            prompt = pb_utils.get_input_tensor_by_name(request, "PROMPT").as_numpy()
+            stream_flags = pb_utils.get_input_tensor_by_name(
+                request, "STREAM"
+            ).as_numpy()
+
+            if stream_flags.flatten()[0]:
+                # Streaming is rejected outside decoupled mode.
+                error = pb_utils.TritonError("STREAM only supported in decoupled mode")
+                response = pb_utils.InferenceResponse(error=error)
+            else:
+                repeated = np.repeat(prompt, repetitions, axis=1)
+                response = pb_utils.InferenceResponse([pb_utils.Tensor("TEXT", repeated)])
+            responses.append(response)
+        return responses
+
+    def exec_decoupled(self, requests):
+        """Handle requests in decoupled mode through each response sender.
+
+        With "STREAM" truthy, the same response is sent "REPETITION" times
+        (once when the request has no "REPETITION" parameter), followed by a
+        send that carries only the final flag. Otherwise a single response is
+        sent together with the final flag. Always returns None.
+        """
+        for request in requests:
+            parameters = json.loads(request.parameters())
+            repetitions = parameters.get("REPETITION", 1)
+
+            sender = request.get_response_sender()
+            prompt = pb_utils.get_input_tensor_by_name(request, "PROMPT").as_numpy()
+            stream_flags = pb_utils.get_input_tensor_by_name(
+                request, "STREAM"
+            ).as_numpy()
+            response = pb_utils.InferenceResponse([pb_utils.Tensor("TEXT", prompt)])
+            final = pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
+
+            if not stream_flags.flatten()[0]:
+                # Stream disabled: one response that also completes the request.
+                sender.send(response, flags=final)
+                continue
+
+            # Stream enabled: just send multiple copies of the response.
+            # FIXME: Could split up response string into tokens, but this is simpler for now.
+            for _ in range(repetitions):
+                sender.send(response)
+            sender.send(None, flags=final)
+        return None
diff --git a/qa/L0_http/llm_models/vllm_proxy/config.pbtxt b/qa/L0_http/llm_models/vllm_proxy/config.pbtxt
@@ -0,0 +1,62 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# Model configuration for the vllm_proxy test model served by the Python backend.
+# NOTE(review): model.py for this model does not call vLLM; the vLLM wording in
+# the comments below presumably mirrors a real vLLM model config -- confirm.
+backend: "python"
+
+# Disabling batching in Triton, let vLLM handle the batching on its own.
+max_batch_size: 0
+
+# model.py reads this flag in initialize() to choose between its decoupled and
+# non-decoupled execution paths.
+model_transaction_policy {
+  decoupled: True
+}
+
+# model.py looks these inputs up by name ("PROMPT" and "STREAM").
+input [
+  {
+    name: "PROMPT"
+    data_type: TYPE_STRING
+    dims: [ 1, 1 ]
+  },
+  {
+    name: "STREAM"
+    data_type: TYPE_BOOL
+    dims: [ 1, 1 ]
+  }
+]
+
+# model.py returns the prompt under this output name.
+output [
+  {
+    name: "TEXT"
+    data_type: TYPE_STRING
+    dims: [ 1, -1 ]
+  }
+]
+
+# The usage of device is deferred to the vLLM engine
+instance_group [
+  {
+    count: 1
+    kind: KIND_MODEL
+  }
+]
diff --git a/qa/L0_http/llm_test.py b/qa/L0_http/llm_test.py
@@ -0,0 +1,165 @@
+#!/usr/bin/python3
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+
+sys.path.append("../common")
+
+import json
+import unittest
+
+import requests
+import test_util as tu
+
+
+class HttpTest(tu.TestResultCollector):
+    """HTTP tests for the generate and generate_stream endpoints.
+
+    Requests are sent to a server expected at localhost:8000 that serves the
+    "vllm_proxy" model.
+    """
+
+    def _get_infer_url(self, model_name, route):
+        """Return the URL of `route` for `model_name` on the local server."""
+        return f"http://localhost:8000/v2/models/{model_name}/{route}"
+
+    # def _simple_infer(self, model_name, inputs, expected_outputs):
+    #     headers = {"Content-Type": "application/json"}
+    #     url = self._get_infer_url(model_name, "infer")
+    #     r = requests.post(url, data=json.dumps(inputs), headers=headers)
+    #     r.raise_for_status()
+
+    #     content = r.json()
+    #     print(content)
+
+    #     self.assertEqual(content["model_name"], model_name)
+    #     self.assertIn("outputs", content)
+    #     self.assertEqual(content["outputs"], expected_outputs)
+
+    # def _simple_generate_stream(self, model_name, inputs, expected_outputs):
+    #     import sseclient
+
+    #     headers = {"Accept": "text/event-stream"}
+    #     url = self._get_infer_url(model_name, "generate_stream")
+    #     # stream=True used to indicate response can be iterated over
+    #     r = requests.post(url, data=json.dumps(inputs), headers=headers, stream=True)
+
+    #     # Validate SSE format
+    #     print(r.headers)
+    #     self.assertIn("Content-Type", r.headers)
+    #     # FIXME: Clarify correct header here.
+    #     # self.assertEqual(r.headers['Content-Type'], 'text/event-stream')
+    #     self.assertEqual(r.headers["Content-Type"], "text/event-stream; charset=utf-8")
+
+    #     # SSE format (data: []) is hard to parse, use helper library for simplicity
+    #     client = sseclient.SSEClient(r)
+    #     tokens = []
+    #     for i, event in enumerate(client.events()):
+    #         # End of event stream
+    #         if event.data == "[DONE]":
+    #             continue
+
+    #         # Parse event data, join events into a single response
+    #         data = json.loads(event.data)
+    #         print(f"Event {i}:", data)
+    #         if "TEXT" not in data:
+    #             print("FIXME: EXPECTED OUTPUT FIELD NOT FOUND")
+    #         else:
+    #             tokens.append(data["TEXT"])
+    #     print("TOKENS:", tokens)
+
+    # def test_infer(self):
+    #     model_name = "onnx_zero_1_object"
+    #     parameters = {}
+
+    #     # Setup text-based input
+    #     input0_data = ["hello"]
+    #     input0 = {
+    #         "name": "INPUT0",
+    #         "datatype": "BYTES",
+    #         "shape": [1, 1],
+    #         "data": input0_data,
+    #     }
+    #     inputs = {"inputs": [input0], "parameters": parameters}
+    #     # Identity model, output should match input
+    #     expected_outputs = [
+    #         {
+    #             "name": "OUTPUT0",
+    #             "datatype": "BYTES",
+    #             "shape": [1, 1],
+    #             "data": input0_data,
+    #         }
+    #     ]
+    #     self._simple_infer(model_name, inputs, expected_outputs)
+
+    def test_generate(self):
+        """A non-streaming generate request returns the prompt as JSON."""
+        model_name = "vllm_proxy"
+        # Setup text-based input
+        text = "hello world"
+        inputs = {"PROMPT": [text], "STREAM": False}
+
+        url = self._get_infer_url(model_name, "generate")
+        r = requests.post(url, data=json.dumps(inputs))
+
+        r.raise_for_status()
+
+        self.assertIn("Content-Type", r.headers)
+        self.assertIn("application/json", r.headers["Content-Type"])
+
+        data = r.json()
+        self.assertIn("TEXT", data)
+        self.assertEqual(text, data["TEXT"])
+
+    def test_generate_stream(self):
+        """A streaming request yields exactly `rep_count` events echoing the prompt."""
+        model_name = "vllm_proxy"
+        # Setup text-based input
+        text = "hello world"
+        rep_count = 3
+        inputs = {"PROMPT": [text], "STREAM": True, "REPETITION": rep_count}
+
+        # Imported locally: sseclient is only used by this test.
+        import sseclient
+
+        headers = {"Accept": "text/event-stream"}
+        url = self._get_infer_url(model_name, "generate_stream")
+        # stream=True used to indicate response can be iterated over
+        r = requests.post(url, data=json.dumps(inputs), headers=headers, stream=True)
+
+        r.raise_for_status()
+
+        # Validate SSE format
+        self.assertIn("Content-Type", r.headers)
+        self.assertIn("text/event-stream", r.headers["Content-Type"])
+
+        # SSE format (data: []) is hard to parse, use helper library for simplicity
+        client = sseclient.SSEClient(r)
+        res_count = 0
+        for i, event in enumerate(client.events()):
+            # Parse event data and check that each event echoes the prompt
+            data = json.loads(event.data)
+            print(f"Event {i}:", data)
+            self.assertIn("TEXT", data)
+            self.assertEqual(text, data["TEXT"])
+            res_count += 1
+        # The number of received events must match the requested repetitions.
+        self.assertEqual(rep_count, res_count)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/qa/L0_http/test.sh b/qa/L0_http/test.sh
@@ -629,6 +629,46 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID
 
+### LLM REST API Endpoint Tests ###
+# Starts a server on the models under ./llm_models, runs llm_test.py against
+# it, records any failure in RET, then stops the server.
+
+# Helper library to parse SSE events
+# https://github.com/mpetazzoni/sseclient
+pip install sseclient-py
+
+# Server settings and log files dedicated to this section.
+SERVER_ARGS="--model-repository=`pwd`/llm_models"
+SERVER_LOG="./inference_server_llm_test.log"
+CLIENT_LOG="./llm_test.log"
+run_server
+# A SERVER_PID of "0" after run_server is treated as a failed server start.
+if [ "$SERVER_PID" == "0" ]; then
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+fi
+
+## Python Unit Tests
+# llm_test.py defines two tests: test_generate and test_generate_stream.
+TEST_RESULT_FILE='test_results.txt'
+PYTHON_TEST=llm_test.py
+EXPECTED_NUM_TESTS=2
+# Let commands fail without aborting so failures can be logged and stored in RET.
+set +e
+python3 $PYTHON_TEST >$CLIENT_LOG 2>&1
+if [ $? -ne 0 ]; then
+    cat $CLIENT_LOG
+    RET=1
+else
+    # Presumably verifies the result file against the expected test count;
+    # check_test_results is defined outside this hunk -- confirm.
+    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
+    if [ $? -ne 0 ]; then
+        cat $CLIENT_LOG
+        echo -e "\n***\n*** Test Result Verification Failed\n***"
+        RET=1
+    fi
+fi
+set -e
+
+# Stop the server started for this section.
+kill $SERVER_PID
+wait $SERVER_PID
+
+###
+
 if [ $RET -eq 0 ]; then
     echo -e "\n***\n*** Test Passed\n***"
 else

diff --git a/qa/L0_trace/test.sh b/qa/L0_trace/test.sh
@@ -81,24 +81,6 @@ cp -r $DATADIR/$MODELBASE $MODELSDIR/simple && \
 RET=0
 
 # Helpers =======================================
-function assert_curl_success {
-  message="${1}"
-  if [ "$code" != "200" ]; then
-    cat ./curl.out
-    echo -e "\n***\n*** ${message} : line ${BASH_LINENO}\n***"
-    RET=1
-  fi
-}
-
-function assert_curl_failure {
-  message="${1}"
-  if [ "$code" != "400" ]; then
-    cat ./curl.out
-    echo -e "\n***\n*** ${message} : line ${BASH_LINENO}\n***"
-    RET=1
-  fi
-}
-
 function get_global_trace_setting {
   rm -f ./curl.out
   set +e
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,3 +5,4 @@ @@
     __pycache__
     tmp
     *.log
+    test_results*.txt