diff --git a/Dockerfile.QA b/Dockerfile.QA index cba4cc703f..9e23a97672 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -132,6 +132,8 @@ RUN mkdir -p qa/common && \ mkdir qa/L0_query/models/query/1 && \ cp tritonbuild/tritonserver/backends/query/libtriton_query.so qa/L0_query/models/query/1/. && \ cp bin/query_test qa/L0_query/. && \ + mkdir qa/L0_generative_sequence/models/generative_sequence/1 && \ + cp tritonbuild/tritonserver/backends/generative_sequence/libtriton_generative_sequence.so qa/L0_generative_sequence/models/generative_sequence/1/. && \ cp bin/register_api_test qa/L0_register/. && \ cp bin/async_work_queue_test qa/L0_async_work_queue/. && \ cp tritonbuild/tritonserver/backends/implicit_state/libtriton_implicit_state.so \ diff --git a/build.py b/build.py index 9c5b61645b..5931ed5f7c 100755 --- a/build.py +++ b/build.py @@ -2051,6 +2051,7 @@ def cibase_build( "sequence", "dyna_sequence", "distributed_addsub", + "generative_sequence", ): be_install_dir = os.path.join(repo_install_dir, "backends", be) if target_platform() == "windows": diff --git a/qa/L0_generative_sequence/generative_sequence_e2e.py b/qa/L0_generative_sequence/generative_sequence_e2e.py new file mode 100755 index 0000000000..68b6ca6622 --- /dev/null +++ b/qa/L0_generative_sequence/generative_sequence_e2e.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import json + +# GRPC streaming helpers.. 
+import queue +import unittest +from functools import partial + +import numpy as np +import requests +import sseclient +import test_util as tu +import tritonclient.grpc as grpcclient +from tritonclient.utils import InferenceServerException + +MODEL_CONFIG_BASE = """ +{{ +"backend": "generative_sequence", +"max_batch_size": 4, +"input" : [ + {{ + "name": "INPUT", + "data_type": "TYPE_INT32", + "dims": [ 1 ] + }} +], +"output" : [ + {{ + "name": "OUTPUT", + "data_type": "TYPE_INT32", + "dims": [ 1 ] + }} +], +"model_transaction_policy" : {{ + "decoupled": true +}}, +{}, +"instance_group" : [{{ "kind": "KIND_CPU" }}] +}} +""" + + +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + + +def callback(user_data, result, error): + if error: + user_data._completed_requests.put(error) + else: + user_data._completed_requests.put(result) + + +class GenerativeSequenceTest(tu.TestResultCollector): + def setUp(self): + # Always make sure the original config is used + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: + triton_client.load_model("generative_sequence") + + def test_generate_stream(self): + headers = {"Accept": "text/event-stream"} + url = "http://localhost:8000/v2/models/generative_sequence/generate_stream" + inputs = {"INPUT": 2} + res = requests.post(url, data=json.dumps(inputs), headers=headers) + res.raise_for_status() + client = sseclient.SSEClient(res) + res_count = 2 + for event in client.events(): + res_count -= 1 + data = json.loads(event.data) + self.assertIn("OUTPUT", data) + self.assertEqual(res_count, data["OUTPUT"]) + self.assertEqual(0, res_count) + + def test_grpc_stream(self, sequence_id=0, sequence_start=False): + user_data = UserData() + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: + triton_client.start_stream(callback=partial(callback, user_data)) + inputs = [] + inputs.append(grpcclient.InferInput("INPUT", [1, 1], "INT32")) + inputs[0].set_data_from_numpy(np.array([[2]], dtype=np.int32)) + + triton_client.async_stream_infer( + model_name="generative_sequence", + inputs=inputs, + sequence_id=sequence_id, + sequence_start=sequence_start, + ) + res_count = 2 + while res_count > 0: + data_item = user_data._completed_requests.get() + res_count -= 1 + if type(data_item) == InferenceServerException: + raise data_item + else: + self.assertEqual(res_count, data_item.as_numpy("OUTPUT")[0][0]) + self.assertEqual(0, res_count) + + def test_unsupported_sequence_scheduler(self): + # Override model config with scheduler settings that do not support + # request rescheduling. + configs = [ + r'"sequence_batching" : { "direct" : {}, "generative_sequence" : false }', + r'"sequence_batching" : { "oldest" : {}, "generative_sequence" : false }', + ] + sid = 1 + for sc in configs: + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: + triton_client.load_model( + "generative_sequence", config=MODEL_CONFIG_BASE.format(sc) + ) + with self.assertRaises(InferenceServerException) as context: + # Without specifying 'generative_sequence : true', the sequence + # batcher expects sequence parameters to be provided explicitly + self.test_grpc_stream(sequence_id=sid, sequence_start=True) + sid += 1 + print(str(context.exception)) + self.assertTrue( + "Request is released with TRITONSERVER_REQUEST_RELEASE_RESCHEDULE" + in str(context.exception) + ) + + def test_unsupported_dynamic_scheduler(self): + # Override model config with scheduler settings that do not support + # request rescheduling. 
+ configs = [ + r'"dynamic_batching" : {}', + ] + for sc in configs: + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: + triton_client.load_model( + "generative_sequence", config=MODEL_CONFIG_BASE.format(sc) + ) + with self.assertRaises(InferenceServerException) as context: + self.test_grpc_stream() + print(str(context.exception)) + self.assertTrue( + "Request is released with TRITONSERVER_REQUEST_RELEASE_RESCHEDULE" + in str(context.exception) + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_generative_sequence/models/generative_sequence/config.pbtxt b/qa/L0_generative_sequence/models/generative_sequence/config.pbtxt new file mode 100644 index 0000000000..123edc2962 --- /dev/null +++ b/qa/L0_generative_sequence/models/generative_sequence/config.pbtxt @@ -0,0 +1,48 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +backend: "generative_sequence" +max_batch_size: 4 +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +model_transaction_policy { + decoupled: True +} +sequence_batching { + generative_sequence : true +} +instance_group [{ kind: KIND_CPU }] diff --git a/qa/L0_generative_sequence/test.sh b/qa/L0_generative_sequence/test.sh new file mode 100755 index 0000000000..f9212901d3 --- /dev/null +++ b/qa/L0_generative_sequence/test.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +source ../common/util.sh + +RET=0 + +CLIENT_LOG="./generative_sequence_client.log" +TEST_PY=./generative_sequence_e2e.py +EXPECTED_NUM_TESTS="4" +TEST_RESULT_FILE='test_results.txt' + + +export CUDA_VISIBLE_DEVICES=0 + +rm -fr *.log + +pip install sseclient-py + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=EXPLICIT" +SERVER_LOG="./inference_server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TEST_PY >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_http/generate_endpoint_test.py b/qa/L0_http/generate_endpoint_test.py index f426d07536..1fb3dddcc6 100755 --- a/qa/L0_http/generate_endpoint_test.py +++ b/qa/L0_http/generate_endpoint_test.py @@ -107,7 +107,7 @@ def check_sse_responses(self, res, expected_res): self.assertIn(key, data) self.assertEqual(value, data[key]) res_count += 1 - self.assertTrue(len(expected_res), res_count) + self.assertEqual(len(expected_res), res_count) # Make sure there is no message in the wrong form for remaining in client._read(): self.assertTrue( diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 25049624f8..6dcd3f8f09 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -101,6 +101,7 @@ add_subdirectory(repoagent/relocation_repoagent repoagent/relocation_repoagent) add_subdirectory(distributed_addsub distributed_addsub) add_subdirectory(dyna_sequence dyna_sequence) +add_subdirectory(generative_sequence generative_sequence) add_subdirectory(implicit_state implicit_state) add_subdirectory(query_backend query_backend) diff --git a/src/test/generative_sequence/CMakeLists.txt b/src/test/generative_sequence/CMakeLists.txt new file mode 100644 index 0000000000..174b40f6b6 --- /dev/null +++ b/src/test/generative_sequence/CMakeLists.txt @@ -0,0 +1,118 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+cmake_minimum_required(VERSION 3.17)
+
+project(tritongenerativesequencebackend LANGUAGES C CXX)
+
+#
+# libtriton_generative_sequence.so
+# Shared library implementing the Triton Sequence Backend API
+#
+configure_file(src/libtriton_generative_sequence.ldscript libtriton_generative_sequence.ldscript COPYONLY)
+
+add_library(
+  triton-generative-sequence-backend SHARED
+  src/generative_sequence.cc
+)
+
+add_library(
+  TritonGenerativeSequenceBackend::triton-generative-sequence-backend ALIAS triton-generative-sequence-backend
+)
+
+target_compile_features(triton-generative-sequence-backend PRIVATE cxx_std_11)
+target_compile_options(
+  triton-generative-sequence-backend PRIVATE
+  $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
+    -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror>
+)
+
+target_link_libraries(
+  triton-generative-sequence-backend
+  PRIVATE
+    triton-backend-utils    # from repo-backend
+    triton-core-serverapi   # from repo-core
+    triton-core-backendapi  # from repo-core
+    triton-core-serverstub  # from repo-core
+)
+
+set_target_properties(
+  triton-generative-sequence-backend PROPERTIES
+  POSITION_INDEPENDENT_CODE ON
+  OUTPUT_NAME triton_generative_sequence
+  LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_generative_sequence.ldscript
+  LINK_FLAGS "-Wl,--version-script libtriton_generative_sequence.ldscript"
+)
+
+#
+# Install
+#
+include(GNUInstallDirs)
+set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonGenerativeSequenceBackend)
+
+install(
+  TARGETS
+    triton-generative-sequence-backend
+  EXPORT
+    triton-generative-sequence-backend-targets
+  LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/generative_sequence
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/generative_sequence
+)
+
+install(
+  EXPORT
+    triton-generative-sequence-backend-targets
+  FILE
+    TritonGenerativeSequenceBackendTargets.cmake
+  NAMESPACE
+    TritonGenerativeSequenceBackend::
+  DESTINATION
+    ${INSTALL_CONFIGDIR}
+)
+
+include(CMakePackageConfigHelpers)
+configure_package_config_file(
+  ${CMAKE_CURRENT_LIST_DIR}/cmake/TritonGenerativeSequenceBackendConfig.cmake.in
+  ${CMAKE_CURRENT_BINARY_DIR}/TritonGenerativeSequenceBackendConfig.cmake
+  INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
+)
+
+install(
+  FILES
+    ${CMAKE_CURRENT_BINARY_DIR}/TritonGenerativeSequenceBackendConfig.cmake
+  DESTINATION ${INSTALL_CONFIGDIR}
+)
+
+#
+# Export from build tree
+#
+export(
+  EXPORT triton-generative-sequence-backend-targets
+  FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonGenerativeSequenceBackendTargets.cmake
+  NAMESPACE TritonGenerativeSequenceBackend::
+)
+
+export(PACKAGE TritonGenerativeSequenceBackend)
diff --git a/src/test/generative_sequence/cmake/TritonGenerativeSequenceBackendConfig.cmake.in b/src/test/generative_sequence/cmake/TritonGenerativeSequenceBackendConfig.cmake.in
new file mode 100644
index 0000000000..cb3b10bbd1
--- /dev/null
+++ b/src/test/generative_sequence/cmake/TritonGenerativeSequenceBackendConfig.cmake.in
@@ -0,0 +1,39 @@
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include(CMakeFindDependencyMacro)
+
+get_filename_component(
+  TRITONSEQUENCEBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
+)
+
+list(APPEND CMAKE_MODULE_PATH ${TRITONSEQUENCEBACKEND_CMAKE_DIR})
+
+if(NOT TARGET TritonGenerativeSequenceBackend::triton-generative-sequence-backend)
+  include("${TRITONSEQUENCEBACKEND_CMAKE_DIR}/TritonGenerativeSequenceBackendTargets.cmake")
+endif()
+
+set(TRITONSEQUENCEBACKEND_LIBRARIES TritonGenerativeSequenceBackend::triton-generative-sequence-backend)
diff --git a/src/test/generative_sequence/src/generative_sequence.cc b/src/test/generative_sequence/src/generative_sequence.cc
new file mode 100644
index 0000000000..960a13fcbd
--- /dev/null
+++ b/src/test/generative_sequence/src/generative_sequence.cc
@@ -0,0 +1,580 @@
+// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of NVIDIA CORPORATION nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <algorithm>
+#include <limits>
+#include <unordered_map>
+
+#include "triton/backend/backend_common.h"
+#include "triton/backend/backend_model.h"
+#include "triton/backend/backend_model_instance.h"
+
+namespace triton { namespace backend { namespace generative_sequence {
+
+
+// Simple generative sequence backend that demonstrates the use of the
+// TRITONSERVER_REQUEST_RELEASE_RESCHEDULE flag to generatively produce
+// sequence responses.
+//
+// The backend supports models that take 1 input tensor, an INT32 [ 1 ]
+// input named "INPUT"; and produce an output tensor "OUTPUT" with the same
+// shape as the input tensor. The input value indicates the total number of
+// responses to be generated and the output value indicates the number of
+// remaining responses. For example, if the request input has value 2,
+// the backend will:
+//   - Send a response with value 1.
+//   - Release the request with the RESCHEDULE flag.
+//   - When executed again on the same request, send the last response with
+//     value 0.
+//   - Release the request with the ALL flag.
+//
+
+#define GUARDED_RESPOND_IF_ERROR(RESPONSES, IDX, X)                     \
+  do {                                                                  \
+    if ((RESPONSES)[IDX] != nullptr) {                                  \
+      TRITONSERVER_Error* err__ = (X);                                  \
+      if (err__ != nullptr) {                                           \
+        LOG_IF_ERROR(                                                   \
+            TRITONBACKEND_ResponseSend(                                 \
+                (RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \
+                err__),                                                 \
+            "failed to send error response");                           \
+        (RESPONSES)[IDX] = nullptr;                                     \
+        TRITONSERVER_ErrorDelete(err__);                                \
+      }                                                                 \
+    }                                                                   \
+  } while (false)
+
+//
+// ModelState
+//
+// State associated with a model that is using this backend. An object
+// of this class is created and associated with each
+// TRITONBACKEND_Model.
+//
+class ModelState : public BackendModel {
+ public:
+  static TRITONSERVER_Error* Create(
+      TRITONBACKEND_Model* triton_model, ModelState** state);
+  virtual ~ModelState() = default;
+
+ private:
+  ModelState(TRITONBACKEND_Model* triton_model);
+};
+
+TRITONSERVER_Error*
+ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
+{
+  try {
+    *state = new ModelState(triton_model);
+  }
+  catch (const BackendModelException& ex) {
+    RETURN_ERROR_IF_TRUE(
+        ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,
+        std::string("unexpected nullptr in BackendModelException"));
+    RETURN_IF_ERROR(ex.err_);
+  }
+
+  return nullptr;  // success
+}
+
+ModelState::ModelState(TRITONBACKEND_Model* triton_model)
+    : BackendModel(triton_model)
+{
+}
+
+//
+// ModelInstanceState
+//
+// State associated with a model instance. An object of this class is
+// created and associated with each TRITONBACKEND_ModelInstance.
+//
+class ModelInstanceState : public BackendModelInstance {
+ public:
+  static TRITONSERVER_Error* Create(
+      ModelState* model_state,
+      TRITONBACKEND_ModelInstance* triton_model_instance,
+      ModelInstanceState** state);
+  virtual ~ModelInstanceState() = default;
+
+  // Get the state of the model that corresponds to this instance.
+  ModelState* StateForModel() const { return model_state_; }
+
+  // Return the output value on receiving a request, initializing the
+  // remainder if the corrid hasn't been recorded.
+  int32_t GetOutput(uint64_t corrid, int32_t init_value);
+
+ private:
+  ModelInstanceState(
+      ModelState* model_state,
+      TRITONBACKEND_ModelInstance* triton_model_instance);
+
+  ModelState* model_state_;
+
+  // A map from correlation ID to the remaining responses.
+  std::unordered_map<uint64_t, int32_t> remainders_;
+};
+
+TRITONSERVER_Error*
+ModelInstanceState::Create(
+    ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance,
+    ModelInstanceState** state)
+{
+  try {
+    *state = new ModelInstanceState(model_state, triton_model_instance);
+  }
+  catch (const BackendModelInstanceException& ex) {
+    RETURN_ERROR_IF_TRUE(
+        ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,
+        std::string("unexpected nullptr in BackendModelInstanceException"));
+    RETURN_IF_ERROR(ex.err_);
+  }
+
+  return nullptr;  // success
+}
+
+ModelInstanceState::ModelInstanceState(
+    ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance)
+    : BackendModelInstance(model_state, triton_model_instance),
+      model_state_(model_state)
+{
+}
+
+int32_t
+ModelInstanceState::GetOutput(uint64_t corrid, int32_t init_value)
+{
+  auto it = remainders_.find(corrid);
+  if (it == remainders_.end()) {
+    it = remainders_.emplace(corrid, init_value).first;
+  }
+  auto res = --it->second;
+  if (res <= 0) {
+    remainders_.erase(it);
+  }
+  return res;
+}
+
+/////////////
+
+extern "C" {
+
+// Implementing TRITONBACKEND_Initialize is optional. The backend
+// should initialize any global state that is intended to be shared
+// across all models and model instances that use the backend.
+TRITONSERVER_Error*
+TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
+{
+  const char* cname;
+  RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname));
+  std::string name(cname);
+
+  LOG_MESSAGE(
+      TRITONSERVER_LOG_INFO,
+      (std::string("TRITONBACKEND_Initialize: ") + name).c_str());
+
+  // We should check the backend API version that Triton supports
+  // vs. what this backend was compiled against.
+  uint32_t api_version_major, api_version_minor;
+  RETURN_IF_ERROR(
+      TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor));
+
+  if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) ||
+      (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_UNSUPPORTED,
+        "triton backend API version does not support this backend");
+  }
+
+  return nullptr;  // success
+}
+
+// Implementing TRITONBACKEND_ModelInitialize is optional. The backend
+// should initialize any state that is intended to be shared across
+// all instances of the model.
+TRITONSERVER_Error*
+TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
+{
+  const char* cname;
+  RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname));
+  std::string name(cname);
+
+  uint64_t version;
+  RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version));
+
+  LOG_MESSAGE(
+      TRITONSERVER_LOG_INFO,
+      (std::string("TRITONBACKEND_ModelInitialize: ") + name + " (version " +
+       std::to_string(version) + ")")
+          .c_str());
+
+  // With each model we create a ModelState object and associate it
+  // with the TRITONBACKEND_Model.
+  ModelState* model_state;
+  RETURN_IF_ERROR(ModelState::Create(model, &model_state));
+  RETURN_IF_ERROR(
+      TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state)));
+
+  return nullptr;  // success
+}
+
+// Implementing TRITONBACKEND_ModelFinalize is optional unless state
+// is set using TRITONBACKEND_ModelSetState. The backend must free
+// this state and perform any other cleanup.
+TRITONSERVER_Error*
+TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)
+{
+  void* vstate;
+  RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate));
+  ModelState* model_state = reinterpret_cast<ModelState*>(vstate);
+
+  LOG_MESSAGE(
+      TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: delete model state");
+
+  delete model_state;
+
+  return nullptr;  // success
+}
+
+// Implementing TRITONBACKEND_ModelInstanceInitialize is optional. The
+// backend should initialize any state that is required for a model
+// instance.
+TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
+{
+  const char* cname;
+  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname));
+  std::string name(cname);
+
+  int32_t device_id;
+  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id));
+  TRITONSERVER_InstanceGroupKind kind;
+  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceKind(instance, &kind));
+
+  LOG_MESSAGE(
+      TRITONSERVER_LOG_INFO,
+      (std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (" +
+       TRITONSERVER_InstanceGroupKindString(kind) + " device " +
+       std::to_string(device_id) + ")")
+          .c_str());
+
+  // The instance can access the corresponding model as well... here
+  // we get the model and from that get the model's state.
+  TRITONBACKEND_Model* model;
+  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model));
+
+  void* vmodelstate;
+  RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate));
+  ModelState* model_state = reinterpret_cast<ModelState*>(vmodelstate);
+
+  // With each instance we create a ModelInstanceState object and
+  // associate it with the TRITONBACKEND_ModelInstance.
+  ModelInstanceState* instance_state;
+  RETURN_IF_ERROR(
+      ModelInstanceState::Create(model_state, instance, &instance_state));
+  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(
+      instance, reinterpret_cast<void*>(instance_state)));
+
+  // Because this backend requires that input and output be in CPU
+  // memory, we fail if a GPU instance is requested.
+  RETURN_ERROR_IF_FALSE(
+      instance_state->Kind() == TRITONSERVER_INSTANCEGROUPKIND_CPU,
+      TRITONSERVER_ERROR_INVALID_ARG,
+      std::string("'generative_sequence' backend only supports CPU instances"));
+
+  return nullptr;  // success
+}
+
+// Implementing TRITONBACKEND_ModelInstanceFinalize is optional unless
+// state is set using TRITONBACKEND_ModelInstanceSetState. The backend
+// must free this state and perform any other cleanup.
+TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
+{
+  void* vstate;
+  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));
+  ModelInstanceState* instance_state =
+      reinterpret_cast<ModelInstanceState*>(vstate);
+
+  LOG_MESSAGE(
+      TRITONSERVER_LOG_INFO,
+      "TRITONBACKEND_ModelInstanceFinalize: delete instance state");
+
+  delete instance_state;
+
+  return nullptr;  // success
+}
+
+// Implementing TRITONBACKEND_ModelInstanceExecute is required.
+TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceExecute(
+    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
+    const uint32_t request_count)
+{
+  // Triton will not call this function simultaneously for the same
+  // 'instance'. But since this backend could be used by multiple
+  // instances from multiple models the implementation needs to handle
+  // multiple calls to this function at the same time (with different
+  // 'instance' objects). Suggested practice for this is to use only
+  // function-local and model-instance-specific state (obtained from
+  // 'instance'), which is what we do here.
+  ModelInstanceState* instance_state;
+  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(
+      instance, reinterpret_cast<void**>(&instance_state)));
+  ModelState* model_state = instance_state->StateForModel();
+
+  // This backend specifies BLOCKING execution policy. That means that
+  // we should not return from this function until execution is
+  // complete. Triton will automatically release 'instance' on return
+  // from this function so that it is again available to be used for
+  // another call to TRITONBACKEND_ModelInstanceExecute.
+
+  LOG_MESSAGE(
+      TRITONSERVER_LOG_INFO,
+      (std::string("model ") + model_state->Name() + ", instance " +
+       instance_state->Name() + ", executing " + std::to_string(request_count) +
+       " requests")
+          .c_str());
+
+  bool supports_batching = false;
+  RETURN_IF_ERROR(model_state->SupportsFirstDimBatching(&supports_batching));
+
+  // 'responses' is initialized with the response objects below and
+  // if/when an error response is sent the corresponding entry in
+  // 'responses' is set to nullptr to indicate that that response has
+  // already been sent.
+  std::vector<TRITONBACKEND_Response*> responses;
+  responses.reserve(request_count);
+
+  // Create a single response object for each request. If something
+  // goes wrong when attempting to create the response objects just
+  // fail all of the requests by returning an error.
+  for (uint32_t r = 0; r < request_count; ++r) {
+    TRITONBACKEND_Request* request = requests[r];
+
+    TRITONBACKEND_Response* response;
+    RETURN_IF_ERROR(TRITONBACKEND_ResponseNew(&response, request));
+    responses.push_back(response);
+  }
+
+  // The way we collect these batch timestamps is not entirely
+  // accurate. Normally, in a performant backend you would execute all
+  // the requests at the same time, and so there would be a single
+  // compute-start / compute-end time-range. But here we execute each
+  // request separately so there is no single range. As a result we
+  // just show the entire execute time as being the compute time as
+  // well.
+  uint64_t min_exec_start_ns = std::numeric_limits<uint64_t>::max();
+  uint64_t max_exec_end_ns = 0;
+  uint64_t total_batch_size = 0;
+
+  // After this point we take ownership of 'requests', which means
+  // that a response must be sent for every request. If something does
+  // go wrong in processing a particular request then we send an error
+  // response just for the specific request.
+
+  // For simplicity we just process each request separately... in
+  // general a backend should try to operate on the entire batch of
+  // requests at the same time for improved performance.
+  std::vector<int32_t> start_buffer, end_buffer, ready_buffer, corrid_buffer,
+      input_buffer;
+  for (uint32_t r = 0; r < request_count; ++r) {
+    ++total_batch_size;
+
+    uint64_t exec_start_ns = 0;
+    SET_TIMESTAMP(exec_start_ns);
+    min_exec_start_ns = std::min(min_exec_start_ns, exec_start_ns);
+
+    TRITONBACKEND_Request* request = requests[r];
+
+    uint64_t correlation_id = 0;
+    GUARDED_RESPOND_IF_ERROR(
+        responses, r,
+        TRITONBACKEND_RequestCorrelationId(request, &correlation_id));
+    // If an error response was sent for the above then display an error
+    // message and move on to next request.
+    if (responses[r] == nullptr) {
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_ERROR,
+          (std::string("request ") + std::to_string(r) +
+           ": failed to read request input/output counts, error response "
+           "sent")
+              .c_str());
+      continue;
+    }
+
+    TRITONBACKEND_Input* input = nullptr;
+    GUARDED_RESPOND_IF_ERROR(
+        responses, r, TRITONBACKEND_RequestInput(request, "INPUT", &input));
+    if (responses[r] == nullptr) {
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_ERROR,
+          (std::string("request ") + std::to_string(r) +
+           ": failed to read input 'INPUT', error response sent")
+              .c_str());
+      continue;
+    }
+
+    const void* input_buffer = nullptr;
+    uint64_t buffer_byte_size = 0;
+    TRITONSERVER_MemoryType input_memory_type = TRITONSERVER_MEMORY_CPU;
+    int64_t input_memory_type_id = 0;
+    GUARDED_RESPOND_IF_ERROR(
+        responses, r,
+        TRITONBACKEND_InputBuffer(
+            input, 0 /* input_buffer_count */, &input_buffer, &buffer_byte_size,
+            &input_memory_type, &input_memory_type_id));
+    if ((responses[r] == nullptr) ||
+        (input_memory_type == TRITONSERVER_MEMORY_GPU)) {
+      GUARDED_RESPOND_IF_ERROR(
+          responses, r,
+          TRITONSERVER_ErrorNew(
+              TRITONSERVER_ERROR_UNSUPPORTED,
+              "failed to get input buffer in CPU memory"));
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_ERROR,
+          (std::string("request ") + std::to_string(r) +
+           ": failed to get input buffer in CPU memory, error "
+           "response sent")
+              .c_str());
+      continue;
+    }
+
+    const int32_t init_value = *reinterpret_cast<const int32_t*>(input_buffer);
+    auto output_value = instance_state->GetOutput(correlation_id, init_value);
+
+    TRITONBACKEND_Response* response = responses[r];
+
+    // The output shape is [1, 1] if the model configuration supports
+    // batching, or just [1] if the model configuration does not
+    // support batching.
+    std::vector<int64_t> shape;
+    if (supports_batching) {
+      shape.push_back(1);
+    }
+    shape.push_back(1);
+
+    TRITONBACKEND_Output* output;
+    GUARDED_RESPOND_IF_ERROR(
+        responses, r,
+        TRITONBACKEND_ResponseOutput(
+            response, &output, "OUTPUT", TRITONSERVER_TYPE_INT32, shape.data(),
+            shape.size()));
+    if (responses[r] == nullptr) {
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_ERROR,
+          (std::string("request ") + std::to_string(r) +
+           ": failed to create response output, error response sent")
+              .c_str());
+      continue;
+    }
+
+    // Step 2. Get the output buffer. We request a buffer in CPU
+    // memory but we have to handle any returned type. If we get
+    // back a buffer in GPU memory we just fail the request.
+    void* output_buffer;
+    TRITONSERVER_MemoryType output_memory_type = TRITONSERVER_MEMORY_CPU;
+    int64_t output_memory_type_id = 0;
+    GUARDED_RESPOND_IF_ERROR(
+        responses, r,
+        TRITONBACKEND_OutputBuffer(
+            output, &output_buffer, sizeof(int32_t), &output_memory_type,
+            &output_memory_type_id));
+    if ((responses[r] == nullptr) ||
+        (output_memory_type == TRITONSERVER_MEMORY_GPU)) {
+      GUARDED_RESPOND_IF_ERROR(
+          responses, r,
+          TRITONSERVER_ErrorNew(
+              TRITONSERVER_ERROR_UNSUPPORTED,
+              "failed to create output buffer in CPU memory"));
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_ERROR,
+          (std::string("request ") + std::to_string(r) +
+           ": failed to create output buffer in CPU memory, error "
+           "response sent")
+              .c_str());
+      continue;
+    }
+
+    reinterpret_cast<int32_t*>(output_buffer)[0] = output_value;
+
+    // Set response flag and request flag correctly based on whether this
+    // is the last response of the sequence.
+    uint32_t res_flag =
+        (output_value <= 0) ? TRITONSERVER_RESPONSE_COMPLETE_FINAL : 0;
+    uint32_t req_flag = (output_value <= 0)
+                            ?
TRITONSERVER_REQUEST_RELEASE_ALL + : TRITONSERVER_REQUEST_RELEASE_RESCHEDULE; + + uint64_t exec_end_ns = 0; + SET_TIMESTAMP(exec_end_ns); + max_exec_end_ns = std::max(max_exec_end_ns, exec_end_ns); + + // Release the request first as the testing backend may be configured to + // receive error on request release, in such a case, the error will be + // propagated back through error response. + auto err = TRITONBACKEND_RequestRelease(request, req_flag); + if (err) { + // Release request with ALL flag + LOG_IF_ERROR( + TRITONBACKEND_RequestRelease( + request, TRITONSERVER_REQUEST_RELEASE_ALL), + "failed releasing request"); + res_flag = TRITONSERVER_RESPONSE_COMPLETE_FINAL; + } + + // Send all the responses that haven't already been sent because of + // an earlier error. + if (responses[r] != nullptr) { + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend(responses[r], res_flag, err), + "failed sending response"); + } + TRITONSERVER_ErrorDelete(err); + + // Report statistics for each request. + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportStatistics( + instance_state->TritonModelInstance(), request, + (responses[r] != nullptr) /* success */, exec_start_ns, + exec_start_ns, exec_end_ns, exec_end_ns), + "failed reporting request statistics"); + } + + // Report the entire batch statistics. + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportBatchStatistics( + instance_state->TritonModelInstance(), total_batch_size, + min_exec_start_ns, min_exec_start_ns, max_exec_end_ns, + max_exec_end_ns), + "failed reporting batch request statistics"); + + return nullptr; // success +} + +} // extern "C" + +}}} // namespace triton::backend::generative_sequence diff --git a/src/test/generative_sequence/src/libtriton_generative_sequence.ldscript b/src/test/generative_sequence/src/libtriton_generative_sequence.ldscript new file mode 100644 index 0000000000..00ee877745 --- /dev/null +++ b/src/test/generative_sequence/src/libtriton_generative_sequence.ldscript @@ -0,0 +1,30 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+{ + global: + TRITONBACKEND_*; + local: *; +};
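
For reference, a minimal sketch of driving the new generative_sequence test model by hand, mirroring what qa/L0_generative_sequence/generative_sequence_e2e.py exercises through the generate_stream endpoint. It assumes a local tritonserver started as in test.sh (HTTP on localhost:8000) with the model loaded; per the backend description above, sending INPUT=3 should stream OUTPUT values 2, 1, 0.

    import json

    import requests
    import sseclient

    # The generate_stream endpoint emits one SSE event per generated response.
    url = "http://localhost:8000/v2/models/generative_sequence/generate_stream"
    headers = {"Accept": "text/event-stream"}

    # INPUT is the total number of responses; OUTPUT counts down to 0.
    res = requests.post(url, data=json.dumps({"INPUT": 3}), headers=headers)
    res.raise_for_status()

    for event in sseclient.SSEClient(res).events():
        data = json.loads(event.data)
        print("OUTPUT =", data["OUTPUT"])  # expected: 2, 1, 0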