From 451f2bb640d1e303c88a15d3ec2c5d954e2462e6 Mon Sep 17 00:00:00 2001 From: GuanLuo Date: Tue, 24 Oct 2023 03:43:50 -0700 Subject: [PATCH 1/7] Add testing backend and test --- .../generative_sequence_e2e.py | 98 +++ .../models/generative_sequence/config.pbtxt | 47 ++ qa/L0_generative_sequence/test.sh | 92 +++ src/test/CMakeLists.txt | 3 +- src/test/generative_sequence/CMakeLists.txt | 118 ++++ ...onGenerativeSequenceBackendConfig.cmake.in | 39 ++ .../src/generative_sequence.cc | 583 ++++++++++++++++++ .../libtriton_generative_sequence.ldscript | 30 + 8 files changed, 1009 insertions(+), 1 deletion(-) create mode 100755 qa/L0_generative_sequence/generative_sequence_e2e.py create mode 100644 qa/L0_generative_sequence/models/generative_sequence/config.pbtxt create mode 100755 qa/L0_generative_sequence/test.sh create mode 100644 src/test/generative_sequence/CMakeLists.txt create mode 100644 src/test/generative_sequence/cmake/TritonGenerativeSequenceBackendConfig.cmake.in create mode 100644 src/test/generative_sequence/src/generative_sequence.cc create mode 100644 src/test/generative_sequence/src/libtriton_generative_sequence.ldscript diff --git a/qa/L0_generative_sequence/generative_sequence_e2e.py b/qa/L0_generative_sequence/generative_sequence_e2e.py new file mode 100755 index 0000000000..96d660b951 --- /dev/null +++ b/qa/L0_generative_sequence/generative_sequence_e2e.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest + +import json +import requests +import sseclient +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +from tritonclient.utils import InferenceServerException + +# GRPC streaming helpers.. 
+import queue +from functools import partial +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + +def callback(user_data, result, error): + if error: + user_data._completed_requests.put(error) + else: + user_data._completed_requests.put(result) + + +class GenerativeSequenceTest(tu.TestResultCollector): + def test_generate_stream(self): + headers = {"Accept": "text/event-stream"} + url = "http://localhost:8000/v2/models/generative_sequence/generate_stream" + inputs = {"INPUT": 2} + res = requests.post( + url, + data=json.dumps(inputs), + headers=headers + ) + res.raise_for_status() + client = sseclient.SSEClient(res) + res_count = 2 + for event in client.events(): + res_count -= 1 + data = json.loads(event.data) + self.assertIn("OUTPUT", data) + self.assertEqual(res_count, data["OUTPUT"]) + self.assertEqual(0, res_count) + + def test_grpc_stream(self): + user_data = UserData() + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: + triton_client.start_stream( + callback=partial(callback, user_data) + ) + inputs = [] + inputs.append(grpcclient.InferInput("INPUT", [1,1], "INT32")) + inputs[0].set_data_from_numpy(np.array([[2]], dtype=np.int32)) + + triton_client.async_stream_infer(model_name="generative_sequence", inputs=inputs) + res_count = 2 + while res_count > 0: + data_item = user_data._completed_requests.get() + res_count -= 1 + if type(data_item) == InferenceServerException: + raise data_item + else: + self.assertEqual(res_count, data_item.as_numpy("OUTPUT")[0][0]) + self.assertEqual(0, res_count) + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_generative_sequence/models/generative_sequence/config.pbtxt b/qa/L0_generative_sequence/models/generative_sequence/config.pbtxt new file mode 100644 index 0000000000..b48f7e8e3e --- /dev/null +++ b/qa/L0_generative_sequence/models/generative_sequence/config.pbtxt @@ -0,0 +1,47 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+backend: "generative_sequence" +max_batch_size: 4 +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +model_transaction_policy { + decoupled: True +} +sequence_batching { + generative_sequence : true +} \ No newline at end of file diff --git a/qa/L0_generative_sequence/test.sh b/qa/L0_generative_sequence/test.sh new file mode 100755 index 0000000000..2d2df5224e --- /dev/null +++ b/qa/L0_generative_sequence/test.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +source ../common/util.sh + +RET=0 + +CLIENT_LOG="./generative_sequence_client.log" +TEST_PY=./generative_sequence_e2e.py +EXPECTED_NUM_TESTS="2" +TEST_RESULT_FILE='test_results.txt' + + +export CUDA_VISIBLE_DEVICES=0 + +rm -fr *.log + +pip install sseclient-py + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TEST_PY >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 25049624f8..6dcd3f8f09 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -101,6 +101,7 @@ add_subdirectory(repoagent/relocation_repoagent repoagent/relocation_repoagent) add_subdirectory(distributed_addsub distributed_addsub) add_subdirectory(dyna_sequence dyna_sequence) +add_subdirectory(generative_sequence generative_sequence) add_subdirectory(implicit_state implicit_state) add_subdirectory(query_backend query_backend) diff --git a/src/test/generative_sequence/CMakeLists.txt b/src/test/generative_sequence/CMakeLists.txt new file mode 100644 index 0000000000..174b40f6b6 --- /dev/null +++ b/src/test/generative_sequence/CMakeLists.txt @@ -0,0 +1,118 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +cmake_minimum_required(VERSION 3.17) + +project(tritongenerativesequencebackend LANGUAGES C CXX) + +# +# libtriton_generative_sequence.so +# Shared library implementing the Triton Sequence Backend API +# +configure_file(src/libtriton_generative_sequence.ldscript libtriton_generative_sequence.ldscript COPYONLY) + +add_library( + triton-generative-sequence-backend SHARED + src/generative_sequence.cc +) + +add_library( + TritonGenerativeSequenceBackend::triton-generative-sequence-backend ALIAS triton-generative-sequence-backend +) + +target_compile_features(triton-generative-sequence-backend PRIVATE cxx_std_11) +target_compile_options( + triton-generative-sequence-backend PRIVATE + $<$,$,$>: + -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> +) + +target_link_libraries( + triton-generative-sequence-backend + PRIVATE + triton-backend-utils # from repo-backend + triton-core-serverapi # from repo-core + triton-core-backendapi # from repo-core + triton-core-serverstub # from repo-core +) + +set_target_properties( + triton-generative-sequence-backend PROPERTIES + POSITION_INDEPENDENT_CODE ON + OUTPUT_NAME triton_generative_sequence + LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_generative_sequence.ldscript + LINK_FLAGS "-Wl,--version-script libtriton_generative_sequence.ldscript" +) + +# +# Install +# +include(GNUInstallDirs) +set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonGenerativeSequenceBackend) + +install( + TARGETS + triton-generative-sequence-backend + EXPORT + triton-generative-sequence-backend-targets + LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/generative_sequence + ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/generative_sequence +) + +install( + EXPORT + triton-generative-sequence-backend-targets + FILE + TritonGenerativeSequenceBackendTargets.cmake + NAMESPACE + TritonGenerativeSequenceBackend:: + DESTINATION + ${INSTALL_CONFIGDIR} +) + +include(CMakePackageConfigHelpers) +configure_package_config_file( + ${CMAKE_CURRENT_LIST_DIR}/cmake/TritonGenerativeSequenceBackendConfig.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/TritonGenerativeSequenceBackendConfig.cmake + INSTALL_DESTINATION ${INSTALL_CONFIGDIR} +) + +install( + FILES + ${CMAKE_CURRENT_BINARY_DIR}/TritonGenerativeSequenceBackendConfig.cmake + DESTINATION ${INSTALL_CONFIGDIR} +) + +# +# Export from build tree +# +export( + EXPORT triton-generative-sequence-backend-targets + FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonGenerativeSequenceBackendTargets.cmake + NAMESPACE TritonGenerativeSequenceBackend:: +) + +export(PACKAGE TritonGenerativeSequenceBackend) diff --git a/src/test/generative_sequence/cmake/TritonGenerativeSequenceBackendConfig.cmake.in b/src/test/generative_sequence/cmake/TritonGenerativeSequenceBackendConfig.cmake.in new file mode 100644 index 0000000000..28e8e7ed0a --- /dev/null +++ b/src/test/generative_sequence/cmake/TritonGenerativeSequenceBackendConfig.cmake.in @@ -0,0 +1,39 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +include(CMakeFindDependencyMacro) + +get_filename_component( + TRITONSEQUENCEBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH +) + +list(APPEND CMAKE_MODULE_PATH ${TRITONSEQUENCEBACKEND_CMAKE_DIR}) + +if(NOT TARGET TritonGenerativeSequenceBackend::triton-sequence-backend) + include("${TRITONSEQUENCEBACKEND_CMAKE_DIR}/TritonGenerativeSequenceBackendTargets.cmake") +endif() + +set(TRITONSEQUENCEBACKEND_LIBRARIES TritonGenerativeSequenceBackend::triton-sequence-backend) \ No newline at end of file diff --git a/src/test/generative_sequence/src/generative_sequence.cc b/src/test/generative_sequence/src/generative_sequence.cc new file mode 100644 index 0000000000..35d358c8c7 --- /dev/null +++ b/src/test/generative_sequence/src/generative_sequence.cc @@ -0,0 +1,583 @@ +// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include +#include +#include + +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" + +namespace triton { namespace backend { namespace generative_sequence { + + +// Simple generative sequence backend that demonstrates the TRITONBACKEND API for a +// blocking backend. A blocking backend completes execution of the +// inference before returning from TRITONBACKEND_ModelInstanceExecute. +// +// The backend supports models that take 5 input tensors, three INT32 [ 1 ] +// control values, one UINT64 [ 1 ] correlation ID control, and one +// variable-size INT32 [ -1 ] value input; and produces an output +// tensor with the same shape as the input tensor. The input tensors +// must be named "START", "END", "READY", "CORRID" and "INPUT". The +// output tensor must be named "OUTPUT". +// +// The model maintains an INT32 accumulator for each sequence which +// is updated based on the control values in "START", "END", "READY" +// and "CORRID": +// +// READY=0, START=x, END=x: Ignore value input, do not change +// accumulator value. +// +// READY=1, START=1, END=x: Start accumulating. Set accumulator +// equal to sum of INPUT tensor elements. +// +// READY=1, START=0, END=x: Add INPUT tensor elements to +// accumulator. +// +// In addition to the above, when END=1 CORRID is added to the accumulator. +// +// When READY=1, the accumulator is returned in every element of the +// OUTPUT tensor. +// + +#define GUARDED_RESPOND_IF_ERROR(RESPONSES, IDX, X) \ + do { \ + if ((RESPONSES)[IDX] != nullptr) { \ + TRITONSERVER_Error* err__ = (X); \ + if (err__ != nullptr) { \ + LOG_IF_ERROR( \ + TRITONBACKEND_ResponseSend( \ + (RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ + err__), \ + "failed to send error response"); \ + (RESPONSES)[IDX] = nullptr; \ + TRITONSERVER_ErrorDelete(err__); \ + } \ + } \ + } while (false) + +// +// ModelState +// +// State associated with a model that is using this backend. An object +// of this class is created and associated with each +// TRITONBACKEND_Model. +// +class ModelState : public BackendModel { + public: + static TRITONSERVER_Error* Create( + TRITONBACKEND_Model* triton_model, ModelState** state); + virtual ~ModelState() = default; + + private: + ModelState(TRITONBACKEND_Model* triton_model); +}; + +TRITONSERVER_Error* +ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) +{ + try { + *state = new ModelState(triton_model); + } + catch (const BackendModelException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelException")); + RETURN_IF_ERROR(ex.err_); + } + + return nullptr; // success +} + +ModelState::ModelState(TRITONBACKEND_Model* triton_model) + : BackendModel(triton_model) +{ +} + +// +// ModelInstanceState +// +// State associated with a model instance. An object of this class is +// created and associated with each TRITONBACKEND_ModelInstance. +// +class ModelInstanceState : public BackendModelInstance { + public: + static TRITONSERVER_Error* Create( + ModelState* model_state, + TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state); + virtual ~ModelInstanceState() = default; + + // Get the state of the model that corresponds to this instance. + ModelState* StateForModel() const { return model_state_; } + + // return output value on receiving request, initalize remainder + // if the corrid hasn't been recorded. 
+ int32_t GetOutput(uint64_t corrid, int32_t init_value); + + private: + ModelInstanceState( + ModelState* model_state, + TRITONBACKEND_ModelInstance* triton_model_instance); + + ModelState* model_state_; + + // A map from correlation ID to the remaining responses. + std::unordered_map remainders_; +}; + +TRITONSERVER_Error* +ModelInstanceState::Create( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state) +{ + try { + *state = new ModelInstanceState(model_state, triton_model_instance); + } + catch (const BackendModelInstanceException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelInstanceException")); + RETURN_IF_ERROR(ex.err_); + } + + return nullptr; // success +} + +ModelInstanceState::ModelInstanceState( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) + : BackendModelInstance(model_state, triton_model_instance), + model_state_(model_state) +{ +} + +int32_t +ModelInstanceState::GetOutput(uint64_t corrid, int32_t init_value) +{ + auto it = remainders_.find(corrid); + if (it == remainders_.end()) { + it = remainders_.emplace(corrid, init_value).first; + } + auto res = --it->second; + if (res <= 0) { + remainders_.erase(it); + } + return res; +} + +///////////// + +extern "C" { + +// Implementing TRITONBACKEND_Initialize is optional. The backend +// should initialize any global state that is intended to be shared +// across all models and model instances that use the backend. +TRITONSERVER_Error* +TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) +{ + const char* cname; + RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname)); + std::string name(cname); + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("TRITONBACKEND_Initialize: ") + name).c_str()); + + // We should check the backend API version that Triton supports + // vs. what this backend was compiled against. + uint32_t api_version_major, api_version_minor; + RETURN_IF_ERROR( + TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor)); + + if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) || + (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + "triton backend API version does not support this backend"); + } + + return nullptr; // success +} + +// Implementing TRITONBACKEND_ModelInitialize is optional. The backend +// should initialize any state that is intended to be shared across +// all instances of the model. +TRITONSERVER_Error* +TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) +{ + const char* cname; + RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname)); + std::string name(cname); + + uint64_t version; + RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version)); + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("TRITONBACKEND_ModelInitialize: ") + name + " (version " + + std::to_string(version) + ")") + .c_str()); + + // With each model we create a ModelState object and associate it + // with the TRITONBACKEND_Model. + ModelState* model_state; + RETURN_IF_ERROR(ModelState::Create(model, &model_state)); + RETURN_IF_ERROR( + TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); + + return nullptr; // success +} + +// Implementing TRITONBACKEND_ModelFinalize is optional unless state +// is set using TRITONBACKEND_ModelSetState. The backend must free +// this state and perform any other cleanup. 
+TRITONSERVER_Error* +TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) +{ + void* vstate; + RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); + ModelState* model_state = reinterpret_cast(vstate); + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: delete model state"); + + delete model_state; + + return nullptr; // success +} + +// Implementing TRITONBACKEND_ModelInstanceInitialize is optional. The +// backend should initialize any state that is required for a model +// instance. +TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) +{ + const char* cname; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname)); + std::string name(cname); + + int32_t device_id; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id)); + TRITONSERVER_InstanceGroupKind kind; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceKind(instance, &kind)); + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (" + + TRITONSERVER_InstanceGroupKindString(kind) + " device " + + std::to_string(device_id) + ")") + .c_str()); + + // The instance can access the corresponding model as well... here + // we get the model and from that get the model's state. + TRITONBACKEND_Model* model; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); + + void* vmodelstate; + RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); + ModelState* model_state = reinterpret_cast(vmodelstate); + + // With each instance we create a ModelInstanceState object and + // associate it with the TRITONBACKEND_ModelInstance. + ModelInstanceState* instance_state; + RETURN_IF_ERROR( + ModelInstanceState::Create(model_state, instance, &instance_state)); + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState( + instance, reinterpret_cast(instance_state))); + + // Because this backend just copies IN -> OUT and requires that + // input and output be in CPU memory, we fail if a GPU instances is + // requested. + RETURN_ERROR_IF_FALSE( + instance_state->Kind() == TRITONSERVER_INSTANCEGROUPKIND_CPU, + TRITONSERVER_ERROR_INVALID_ARG, + std::string("'generative_sequence' backend only supports CPU instances")); + + return nullptr; // success +} + +// Implementing TRITONBACKEND_ModelInstanceFinalize is optional unless +// state is set using TRITONBACKEND_ModelInstanceSetState. The backend +// must free this state and perform any other cleanup. +TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) +{ + void* vstate; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); + ModelInstanceState* instance_state = + reinterpret_cast(vstate); + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); + + delete instance_state; + + return nullptr; // success +} + +// Implementing TRITONBACKEND_ModelInstanceExecute is required. +TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceExecute( + TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, + const uint32_t request_count) +{ + // Triton will not call this function simultaneously for the same + // 'instance'. But since this backend could be used by multiple + // instances from multiple models the implementation needs to handle + // multiple calls to this function at the same time (with different + // 'instance' objects). 
Suggested practice for this is to use only + // function-local and model-instance-specific state (obtained from + // 'instance'), which is what we do here. + ModelInstanceState* instance_state; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState( + instance, reinterpret_cast(&instance_state))); + ModelState* model_state = instance_state->StateForModel(); + + // This backend specifies BLOCKING execution policy. That means that + // we should not return from this function until execution is + // complete. Triton will automatically release 'instance' on return + // from this function so that it is again available to be used for + // another call to TRITONBACKEND_ModelInstanceExecute. + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("model ") + model_state->Name() + ", instance " + + instance_state->Name() + ", executing " + std::to_string(request_count) + + " requests") + .c_str()); + + bool supports_batching = false; + RETURN_IF_ERROR(model_state->SupportsFirstDimBatching(&supports_batching)); + + // 'responses' is initialized with the response objects below and + // if/when an error response is sent the corresponding entry in + // 'responses' is set to nullptr to indicate that that response has + // already been sent. + std::vector responses; + responses.reserve(request_count); + + // Create a single response object for each request. If something + // goes wrong when attempting to create the response objects just + // fail all of the requests by returning an error. + for (uint32_t r = 0; r < request_count; ++r) { + TRITONBACKEND_Request* request = requests[r]; + + TRITONBACKEND_Response* response; + RETURN_IF_ERROR(TRITONBACKEND_ResponseNew(&response, request)); + responses.push_back(response); + } + + // The way we collect these batch timestamps is not entirely + // accurate. Normally, in a performant backend you would execute all + // the requests at the same time, and so there would be a single + // compute-start / compute-end time-range. But here we execute each + // request separately so there is no single range. As a result we + // just show the entire execute time as being the compute time as + // well. + uint64_t min_exec_start_ns = std::numeric_limits::max(); + uint64_t max_exec_end_ns = 0; + uint64_t total_batch_size = 0; + + // After this point we take ownership of 'requests', which means + // that a response must be sent for every request. If something does + // go wrong in processing a particular request then we send an error + // response just for the specific request. + + // For simplicity we just process each request separately... in + // general a backend should try to operate on the entire batch of + // requests at the same time for improved performance. + std::vector start_buffer, end_buffer, ready_buffer, corrid_buffer, + input_buffer; + for (uint32_t r = 0; r < request_count; ++r) { + ++total_batch_size; + + uint64_t exec_start_ns = 0; + SET_TIMESTAMP(exec_start_ns); + min_exec_start_ns = std::min(min_exec_start_ns, exec_start_ns); + + TRITONBACKEND_Request* request = requests[r]; + + uint64_t correlation_id = 0; + GUARDED_RESPOND_IF_ERROR( + responses, r, + TRITONBACKEND_RequestCorrelationId(request, &correlation_id)); + // If an error response was sent for the above then display an error + // message and move on to next request. 
+ if (responses[r] == nullptr) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string("request ") + std::to_string(r) + + ": failed to read request input/output counts, error response " + "sent") + .c_str()); + continue; + } + + TRITONBACKEND_Input* input = nullptr; + GUARDED_RESPOND_IF_ERROR( + responses, r, TRITONBACKEND_RequestInput(request, "INPUT", &input)); + if (responses[r] == nullptr) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string("request ") + std::to_string(r) + + ": failed to read input 'INPUT', error response sent") + .c_str()); + continue; + } + + const void* input_buffer = nullptr; + uint64_t buffer_byte_size = 0; + TRITONSERVER_MemoryType input_memory_type = TRITONSERVER_MEMORY_CPU; + int64_t input_memory_type_id = 0; + GUARDED_RESPOND_IF_ERROR( + responses, r, + TRITONBACKEND_InputBuffer( + input, 0 /* input_buffer_count */, &input_buffer, &buffer_byte_size, + &input_memory_type, &input_memory_type_id)); + if ((responses[r] == nullptr) || + (input_memory_type == TRITONSERVER_MEMORY_GPU)) { + GUARDED_RESPOND_IF_ERROR( + responses, r, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + "failed to get input buffer in CPU memory")); + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string("request ") + std::to_string(r) + + ": failed to get input buffer in CPU memory, error " + "response sent") + .c_str()); + continue; + } + + const int32_t init_value = *reinterpret_cast(input_buffer); + auto output_value = instance_state->GetOutput(correlation_id, init_value); + + TRITONBACKEND_Response* response = responses[r]; + + // The output shape is [1, 1] if the model + // configuration supports batching, or just + // [1] if the model configuration does not + // support batching. + std::vector shape; + if (supports_batching) { + shape.push_back(1); + } + shape.push_back(1); + + TRITONBACKEND_Output* output; + GUARDED_RESPOND_IF_ERROR( + responses, r, + TRITONBACKEND_ResponseOutput( + response, &output, "OUTPUT", TRITONSERVER_TYPE_INT32, shape.data(), + shape.size())); + if (responses[r] == nullptr) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string("request ") + std::to_string(r) + + ": failed to create response output, error response sent") + .c_str()); + continue; + } + + // Step 2. Get the output buffer. We request a buffer in CPU + // memory but we have to handle any returned type. If we get + // back a buffer in GPU memory we just fail the request. + void* output_buffer; + TRITONSERVER_MemoryType output_memory_type = TRITONSERVER_MEMORY_CPU; + int64_t output_memory_type_id = 0; + GUARDED_RESPOND_IF_ERROR( + responses, r, + TRITONBACKEND_OutputBuffer( + output, &output_buffer, sizeof(int32_t), &output_memory_type, + &output_memory_type_id)); + if ((responses[r] == nullptr) || + (output_memory_type == TRITONSERVER_MEMORY_GPU)) { + GUARDED_RESPOND_IF_ERROR( + responses, r, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + "failed to create output buffer in CPU memory")); + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string("request ") + std::to_string(r) + + ": failed to create output buffer in CPU memory, error " + "response sent") + .c_str()); + continue; + } + + reinterpret_cast(output_buffer)[0] = output_value; + + // Set response flag and request flag correctly based on whether this + // is the last response of the sequence. + uint32_t res_flag = (output_value <= 0) ? TRITONSERVER_RESPONSE_COMPLETE_FINAL : 0; + uint32_t req_flag = (output_value <= 0) ? 
TRITONSERVER_REQUEST_RELEASE_ALL : TRITONSERVER_REQUEST_RELEASE_RESCHEDULE; + + uint64_t exec_end_ns = 0; + SET_TIMESTAMP(exec_end_ns); + max_exec_end_ns = std::max(max_exec_end_ns, exec_end_ns); + + // Send all the responses that haven't already been sent because of + // an earlier error. + if (responses[r] != nullptr) { + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + responses[r], res_flag, + nullptr /* success */), + "failed sending response"); + } + + // Report statistics for each request. + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportStatistics( + instance_state->TritonModelInstance(), request, + (responses[r] != nullptr) /* success */, exec_start_ns, + exec_start_ns, exec_end_ns, exec_end_ns), + "failed reporting request statistics"); + + LOG_IF_ERROR( + TRITONBACKEND_RequestRelease(request, req_flag), + "failed releasing request"); + } + + // Report the entire batch statistics. + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportBatchStatistics( + instance_state->TritonModelInstance(), total_batch_size, + min_exec_start_ns, min_exec_start_ns, max_exec_end_ns, + max_exec_end_ns), + "failed reporting batch request statistics"); + + return nullptr; // success +} + +} // extern "C" + +}}} // namespace triton::backend::generative_sequence diff --git a/src/test/generative_sequence/src/libtriton_generative_sequence.ldscript b/src/test/generative_sequence/src/libtriton_generative_sequence.ldscript new file mode 100644 index 0000000000..621f391c8d --- /dev/null +++ b/src/test/generative_sequence/src/libtriton_generative_sequence.ldscript @@ -0,0 +1,30 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +{ + global: + TRITONBACKEND_*; + local: *; +}; \ No newline at end of file From 0be53e87e6ed2ccf161e0259dedcb91e751a12ec Mon Sep 17 00:00:00 2001 From: GuanLuo Date: Tue, 24 Oct 2023 10:46:06 -0700 Subject: [PATCH 2/7] Add test to build / CI. 
Minor fix on L0_http --- Dockerfile.QA | 2 ++ build.py | 1 + qa/L0_http/generate_endpoint_test.py | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Dockerfile.QA b/Dockerfile.QA index cba4cc703f..9e23a97672 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -132,6 +132,8 @@ RUN mkdir -p qa/common && \ mkdir qa/L0_query/models/query/1 && \ cp tritonbuild/tritonserver/backends/query/libtriton_query.so qa/L0_query/models/query/1/. && \ cp bin/query_test qa/L0_query/. && \ + mkdir qa/L0_generative_sequence/models/generative_sequence/1 && \ + cp tritonbuild/tritonserver/backends/generative_sequence/libtriton_generative_sequence.so qa/L0_generative_sequence/models/generative_sequence/1/. && \ cp bin/register_api_test qa/L0_register/. && \ cp bin/async_work_queue_test qa/L0_async_work_queue/. && \ cp tritonbuild/tritonserver/backends/implicit_state/libtriton_implicit_state.so \ diff --git a/build.py b/build.py index 9c5b61645b..5931ed5f7c 100755 --- a/build.py +++ b/build.py @@ -2051,6 +2051,7 @@ def cibase_build( "sequence", "dyna_sequence", "distributed_addsub", + "generative_sequence", ): be_install_dir = os.path.join(repo_install_dir, "backends", be) if target_platform() == "windows": diff --git a/qa/L0_http/generate_endpoint_test.py b/qa/L0_http/generate_endpoint_test.py index f426d07536..1fb3dddcc6 100755 --- a/qa/L0_http/generate_endpoint_test.py +++ b/qa/L0_http/generate_endpoint_test.py @@ -107,7 +107,7 @@ def check_sse_responses(self, res, expected_res): self.assertIn(key, data) self.assertEqual(value, data[key]) res_count += 1 - self.assertTrue(len(expected_res), res_count) + self.assertEqual(len(expected_res), res_count) # Make sure there is no message in the wrong form for remaining in client._read(): self.assertTrue( From dd3f14f439474564357ac7efaf13ab7b52f43fd1 Mon Sep 17 00:00:00 2001 From: GuanLuo Date: Tue, 24 Oct 2023 10:55:13 -0700 Subject: [PATCH 3/7] Format. Update backend documentation --- .../generative_sequence_e2e.py | 34 +++++------ .../src/generative_sequence.cc | 60 ++++++++----------- 2 files changed, 41 insertions(+), 53 deletions(-) diff --git a/qa/L0_generative_sequence/generative_sequence_e2e.py b/qa/L0_generative_sequence/generative_sequence_e2e.py index 96d660b951..78d1942e5c 100755 --- a/qa/L0_generative_sequence/generative_sequence_e2e.py +++ b/qa/L0_generative_sequence/generative_sequence_e2e.py @@ -29,23 +29,26 @@ sys.path.append("../common") +import json + +# GRPC streaming helpers.. +import queue import unittest +from functools import partial -import json +import numpy as np import requests import sseclient -import numpy as np import test_util as tu import tritonclient.grpc as grpcclient from tritonclient.utils import InferenceServerException -# GRPC streaming helpers.. 
-import queue -from functools import partial + class UserData: def __init__(self): self._completed_requests = queue.Queue() + def callback(user_data, result, error): if error: user_data._completed_requests.put(error) @@ -58,11 +61,7 @@ def test_generate_stream(self): headers = {"Accept": "text/event-stream"} url = "http://localhost:8000/v2/models/generative_sequence/generate_stream" inputs = {"INPUT": 2} - res = requests.post( - url, - data=json.dumps(inputs), - headers=headers - ) + res = requests.post(url, data=json.dumps(inputs), headers=headers) res.raise_for_status() client = sseclient.SSEClient(res) res_count = 2 @@ -72,18 +71,18 @@ def test_generate_stream(self): self.assertIn("OUTPUT", data) self.assertEqual(res_count, data["OUTPUT"]) self.assertEqual(0, res_count) - + def test_grpc_stream(self): user_data = UserData() with grpcclient.InferenceServerClient("localhost:8001") as triton_client: - triton_client.start_stream( - callback=partial(callback, user_data) - ) + triton_client.start_stream(callback=partial(callback, user_data)) inputs = [] - inputs.append(grpcclient.InferInput("INPUT", [1,1], "INT32")) + inputs.append(grpcclient.InferInput("INPUT", [1, 1], "INT32")) inputs[0].set_data_from_numpy(np.array([[2]], dtype=np.int32)) - - triton_client.async_stream_infer(model_name="generative_sequence", inputs=inputs) + + triton_client.async_stream_infer( + model_name="generative_sequence", inputs=inputs + ) res_count = 2 while res_count > 0: data_item = user_data._completed_requests.get() @@ -94,5 +93,6 @@ def test_grpc_stream(self): self.assertEqual(res_count, data_item.as_numpy("OUTPUT")[0][0]) self.assertEqual(0, res_count) + if __name__ == "__main__": unittest.main() diff --git a/src/test/generative_sequence/src/generative_sequence.cc b/src/test/generative_sequence/src/generative_sequence.cc index 35d358c8c7..117fc9a46c 100644 --- a/src/test/generative_sequence/src/generative_sequence.cc +++ b/src/test/generative_sequence/src/generative_sequence.cc @@ -35,34 +35,20 @@ namespace triton { namespace backend { namespace generative_sequence { -// Simple generative sequence backend that demonstrates the TRITONBACKEND API for a -// blocking backend. A blocking backend completes execution of the -// inference before returning from TRITONBACKEND_ModelInstanceExecute. +// Simple generative sequence backend that demonstrates the use of +// TRITONSERVER_REQUEST_RELEASE_RESCHEDULE flag to generatively produce +// sequence response. // -// The backend supports models that take 5 input tensors, three INT32 [ 1 ] -// control values, one UINT64 [ 1 ] correlation ID control, and one -// variable-size INT32 [ -1 ] value input; and produces an output -// tensor with the same shape as the input tensor. The input tensors -// must be named "START", "END", "READY", "CORRID" and "INPUT". The -// output tensor must be named "OUTPUT". -// -// The model maintains an INT32 accumulator for each sequence which -// is updated based on the control values in "START", "END", "READY" -// and "CORRID": -// -// READY=0, START=x, END=x: Ignore value input, do not change -// accumulator value. -// -// READY=1, START=1, END=x: Start accumulating. Set accumulator -// equal to sum of INPUT tensor elements. -// -// READY=1, START=0, END=x: Add INPUT tensor elements to -// accumulator. -// -// In addition to the above, when END=1 CORRID is added to the accumulator. -// -// When READY=1, the accumulator is returned in every element of the -// OUTPUT tensor. 
+// The backend supports models that take 1 input tensor, an INT32 [ 1 ] +// input named "INPUT"; and produces an output tensor "OUTPUT" with the same +// shape as the input tensor. The input value indicates the total number of +// responses to be generated and the output value indicates the number of +// remaining responses. For example, if the request input has value 2, +// the backend will: +// - Send a response with value 1. +// - Release request with RESCHEDULE flag. +// - When execute on the same request, send the last response with value 0. +// - Release request with ALL flag. // #define GUARDED_RESPOND_IF_ERROR(RESPONSES, IDX, X) \ @@ -136,7 +122,7 @@ class ModelInstanceState : public BackendModelInstance { // Get the state of the model that corresponds to this instance. ModelState* StateForModel() const { return model_state_; } - // return output value on receiving request, initalize remainder + // return output value on receiving request, initialize remainder // if the corrid hasn't been recorded. int32_t GetOutput(uint64_t corrid, int32_t init_value); @@ -501,7 +487,7 @@ TRITONBACKEND_ModelInstanceExecute( LOG_MESSAGE( TRITONSERVER_LOG_ERROR, (std::string("request ") + std::to_string(r) + - ": failed to create response output, error response sent") + ": failed to create response output, error response sent") .c_str()); continue; } @@ -527,8 +513,8 @@ TRITONBACKEND_ModelInstanceExecute( LOG_MESSAGE( TRITONSERVER_LOG_ERROR, (std::string("request ") + std::to_string(r) + - ": failed to create output buffer in CPU memory, error " - "response sent") + ": failed to create output buffer in CPU memory, error " + "response sent") .c_str()); continue; } @@ -537,9 +523,12 @@ TRITONBACKEND_ModelInstanceExecute( // Set response flag and request flag correctly based on whether this // is the last response of the sequence. - uint32_t res_flag = (output_value <= 0) ? TRITONSERVER_RESPONSE_COMPLETE_FINAL : 0; - uint32_t req_flag = (output_value <= 0) ? TRITONSERVER_REQUEST_RELEASE_ALL : TRITONSERVER_REQUEST_RELEASE_RESCHEDULE; - + uint32_t res_flag = + (output_value <= 0) ? TRITONSERVER_RESPONSE_COMPLETE_FINAL : 0; + uint32_t req_flag = (output_value <= 0) + ? 
TRITONSERVER_REQUEST_RELEASE_ALL + : TRITONSERVER_REQUEST_RELEASE_RESCHEDULE; + uint64_t exec_end_ns = 0; SET_TIMESTAMP(exec_end_ns); max_exec_end_ns = std::max(max_exec_end_ns, exec_end_ns); @@ -549,8 +538,7 @@ TRITONBACKEND_ModelInstanceExecute( if (responses[r] != nullptr) { LOG_IF_ERROR( TRITONBACKEND_ResponseSend( - responses[r], res_flag, - nullptr /* success */), + responses[r], res_flag, nullptr /* success */), "failed sending response"); } From a44819456a1ac8d903c441c9d9e298253c8a850a Mon Sep 17 00:00:00 2001 From: GuanLuo Date: Wed, 25 Oct 2023 00:59:30 -0700 Subject: [PATCH 4/7] Fix up --- .../models/generative_sequence/config.pbtxt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/qa/L0_generative_sequence/models/generative_sequence/config.pbtxt b/qa/L0_generative_sequence/models/generative_sequence/config.pbtxt index b48f7e8e3e..123edc2962 100644 --- a/qa/L0_generative_sequence/models/generative_sequence/config.pbtxt +++ b/qa/L0_generative_sequence/models/generative_sequence/config.pbtxt @@ -44,4 +44,5 @@ model_transaction_policy { } sequence_batching { generative_sequence : true -} \ No newline at end of file +} +instance_group [{ kind: KIND_CPU }] From fd9dc5ac04e2c618d45d78358488238d8b12c12c Mon Sep 17 00:00:00 2001 From: GuanLuo Date: Mon, 30 Oct 2023 01:17:35 -0700 Subject: [PATCH 5/7] Address comment --- .../cmake/TritonGenerativeSequenceBackendConfig.cmake.in | 2 +- .../src/libtriton_generative_sequence.ldscript | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/generative_sequence/cmake/TritonGenerativeSequenceBackendConfig.cmake.in b/src/test/generative_sequence/cmake/TritonGenerativeSequenceBackendConfig.cmake.in index 28e8e7ed0a..cb3b10bbd1 100644 --- a/src/test/generative_sequence/cmake/TritonGenerativeSequenceBackendConfig.cmake.in +++ b/src/test/generative_sequence/cmake/TritonGenerativeSequenceBackendConfig.cmake.in @@ -36,4 +36,4 @@ if(NOT TARGET TritonGenerativeSequenceBackend::triton-sequence-backend) include("${TRITONSEQUENCEBACKEND_CMAKE_DIR}/TritonGenerativeSequenceBackendTargets.cmake") endif() -set(TRITONSEQUENCEBACKEND_LIBRARIES TritonGenerativeSequenceBackend::triton-sequence-backend) \ No newline at end of file +set(TRITONSEQUENCEBACKEND_LIBRARIES TritonGenerativeSequenceBackend::triton-sequence-backend) diff --git a/src/test/generative_sequence/src/libtriton_generative_sequence.ldscript b/src/test/generative_sequence/src/libtriton_generative_sequence.ldscript index 621f391c8d..00ee877745 100644 --- a/src/test/generative_sequence/src/libtriton_generative_sequence.ldscript +++ b/src/test/generative_sequence/src/libtriton_generative_sequence.ldscript @@ -27,4 +27,4 @@ global: TRITONBACKEND_*; local: *; -}; \ No newline at end of file +}; From b172f56bd3feab8f6240964aecc3f9f9fd349460 Mon Sep 17 00:00:00 2001 From: GuanLuo Date: Mon, 30 Oct 2023 03:40:30 -0700 Subject: [PATCH 6/7] Add negative testing --- .../generative_sequence_e2e.py | 81 ++++++++++++++++++- qa/L0_generative_sequence/test.sh | 4 +- .../src/generative_sequence.cc | 21 +++-- 3 files changed, 96 insertions(+), 10 deletions(-) diff --git a/qa/L0_generative_sequence/generative_sequence_e2e.py b/qa/L0_generative_sequence/generative_sequence_e2e.py index 78d1942e5c..6362062205 100755 --- a/qa/L0_generative_sequence/generative_sequence_e2e.py +++ b/qa/L0_generative_sequence/generative_sequence_e2e.py @@ -43,6 +43,32 @@ import tritonclient.grpc as grpcclient from tritonclient.utils import InferenceServerException +MODEL_CONFIG_BASE = """ +{{ 
+"backend": "generative_sequence", +"max_batch_size": 4, +"input" : [ + {{ + "name": "INPUT", + "data_type": "TYPE_INT32", + "dims": [ 1 ] + }} +], +"output" : [ + {{ + "name": "OUTPUT", + "data_type": "TYPE_INT32", + "dims": [ 1 ] + }} +], +"model_transaction_policy" : {{ + "decoupled": true +}}, +{}, +"instance_group" : [{{ "kind": "KIND_CPU" }}] +}} +""" + class UserData: def __init__(self): @@ -57,6 +83,11 @@ def callback(user_data, result, error): class GenerativeSequenceTest(tu.TestResultCollector): + def setUp(self): + # Always make sure the original config is used + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: + triton_client.load_model("generative_sequence") + def test_generate_stream(self): headers = {"Accept": "text/event-stream"} url = "http://localhost:8000/v2/models/generative_sequence/generate_stream" @@ -72,7 +103,7 @@ def test_generate_stream(self): self.assertEqual(res_count, data["OUTPUT"]) self.assertEqual(0, res_count) - def test_grpc_stream(self): + def test_grpc_stream(self, sequence_id=0, sequence_start=False): user_data = UserData() with grpcclient.InferenceServerClient("localhost:8001") as triton_client: triton_client.start_stream(callback=partial(callback, user_data)) @@ -81,7 +112,10 @@ def test_grpc_stream(self): inputs[0].set_data_from_numpy(np.array([[2]], dtype=np.int32)) triton_client.async_stream_infer( - model_name="generative_sequence", inputs=inputs + model_name="generative_sequence", + inputs=inputs, + sequence_id=sequence_id, + sequence_start=sequence_start, ) res_count = 2 while res_count > 0: @@ -93,6 +127,49 @@ def test_grpc_stream(self): self.assertEqual(res_count, data_item.as_numpy("OUTPUT")[0][0]) self.assertEqual(0, res_count) + def test_unsupported_sequence_scheduler(self): + # Override model config with scheduler settings that do not support + # request rescheduling. + configs = [ + r'"sequence_batching" : { "direct" : {}, "generative_sequence" : false }', + r'"sequence_batching" : { "oldest" : {}, "generative_sequence" : false }', + ] + sid = 1 + for sc in configs: + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: + triton_client.load_model( + "generative_sequence", config=MODEL_CONFIG_BASE.format(sc) + ) + with self.assertRaises(InferenceServerException) as context: + # Without specifying 'generative_sequence : true', the sequence + # batcher expects sequence paramters to be provided explicitly + self.test_grpc_stream(sequence_id=sid, sequence_start=True) + sid += 1 + print(str(context.exception)) + self.assertTrue( + "Request is released with TRITONSERVER_REQUEST_RELEASE_RESCHEDULE" + in str(context.exception) + ) + + def test_unsupported_dynamic_scheduler(self): + # Override model config with scheduler settings that do not support + # request rescheduling. 
+ configs = [ + r'"dynamic_batching" : {}', + ] + for sc in configs: + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: + triton_client.load_model( + "generative_sequence", config=MODEL_CONFIG_BASE.format(sc) + ) + with self.assertRaises(InferenceServerException) as context: + self.test_grpc_stream() + print(str(context.exception)) + self.assertTrue( + "Request is released with TRITONSERVER_REQUEST_RELEASE_RESCHEDULE" + in str(context.exception) + ) + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_generative_sequence/test.sh b/qa/L0_generative_sequence/test.sh index 2d2df5224e..f9212901d3 100755 --- a/qa/L0_generative_sequence/test.sh +++ b/qa/L0_generative_sequence/test.sh @@ -44,7 +44,7 @@ RET=0 CLIENT_LOG="./generative_sequence_client.log" TEST_PY=./generative_sequence_e2e.py -EXPECTED_NUM_TESTS="2" +EXPECTED_NUM_TESTS="4" TEST_RESULT_FILE='test_results.txt' @@ -55,7 +55,7 @@ rm -fr *.log pip install sseclient-py SERVER=/opt/tritonserver/bin/tritonserver -SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=EXPLICIT" SERVER_LOG="./inference_server.log" run_server if [ "$SERVER_PID" == "0" ]; then diff --git a/src/test/generative_sequence/src/generative_sequence.cc b/src/test/generative_sequence/src/generative_sequence.cc index 117fc9a46c..960a13fcbd 100644 --- a/src/test/generative_sequence/src/generative_sequence.cc +++ b/src/test/generative_sequence/src/generative_sequence.cc @@ -533,14 +533,27 @@ TRITONBACKEND_ModelInstanceExecute( SET_TIMESTAMP(exec_end_ns); max_exec_end_ns = std::max(max_exec_end_ns, exec_end_ns); + // Release the request first as the testing backend may be configured to + // receive error on request release, in such a case, the error will be + // propagated back through error response. + auto err = TRITONBACKEND_RequestRelease(request, req_flag); + if (err) { + // Release request with ALL flag + LOG_IF_ERROR( + TRITONBACKEND_RequestRelease( + request, TRITONSERVER_REQUEST_RELEASE_ALL), + "failed releasing request"); + res_flag = TRITONSERVER_RESPONSE_COMPLETE_FINAL; + } + // Send all the responses that haven't already been sent because of // an earlier error. if (responses[r] != nullptr) { LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - responses[r], res_flag, nullptr /* success */), + TRITONBACKEND_ResponseSend(responses[r], res_flag, err), "failed sending response"); } + TRITONSERVER_ErrorDelete(err); // Report statistics for each request. LOG_IF_ERROR( @@ -549,10 +562,6 @@ TRITONBACKEND_ModelInstanceExecute( (responses[r] != nullptr) /* success */, exec_start_ns, exec_start_ns, exec_end_ns, exec_end_ns), "failed reporting request statistics"); - - LOG_IF_ERROR( - TRITONBACKEND_RequestRelease(request, req_flag), - "failed releasing request"); } // Report the entire batch statistics. 
From 7d2f4ae08c0dfbf8182c187a5b5a15ae37a6ee47 Mon Sep 17 00:00:00 2001 From: GuanLuo Date: Mon, 30 Oct 2023 03:47:32 -0700 Subject: [PATCH 7/7] Fix up --- qa/L0_generative_sequence/generative_sequence_e2e.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/L0_generative_sequence/generative_sequence_e2e.py b/qa/L0_generative_sequence/generative_sequence_e2e.py index 6362062205..68b6ca6622 100755 --- a/qa/L0_generative_sequence/generative_sequence_e2e.py +++ b/qa/L0_generative_sequence/generative_sequence_e2e.py @@ -142,7 +142,7 @@ def test_unsupported_sequence_scheduler(self): ) with self.assertRaises(InferenceServerException) as context: # Without specifying 'generative_sequence : true', the sequence - # batcher expects sequence paramters to be provided explicitly + # batcher expects sequence parameters to be provided explicitly self.test_grpc_stream(sequence_id=sid, sequence_start=True) sid += 1 print(str(context.exception))
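
For reference, the request-rescheduling contract that the backend in this series relies on can be summarized in a few lines. The Python sketch below is illustrative only and is not part of the patch series: it restates what ModelInstanceState::GetOutput() and the response/release flag selection in TRITONBACKEND_ModelInstanceExecute() do, using a plain dict in place of the backend's remainders_ map. The flag names are the real TRITONSERVER constants referenced in the C++ code; the function and variable names in the sketch are hypothetical.

# Illustrative sketch only -- mirrors the per-request bookkeeping of the
# generative_sequence test backend, not a Triton API.
RESPONSE_COMPLETE_FINAL = "TRITONSERVER_RESPONSE_COMPLETE_FINAL"
RELEASE_ALL = "TRITONSERVER_REQUEST_RELEASE_ALL"
RELEASE_RESCHEDULE = "TRITONSERVER_REQUEST_RELEASE_RESCHEDULE"

remainders = {}  # correlation ID -> remaining responses (like remainders_)

def execute_once(corrid, init_value):
    # Seed the counter the first time this correlation ID is seen, then decrement.
    if corrid not in remainders:
        remainders[corrid] = init_value
    remainders[corrid] -= 1
    remaining = remainders[corrid]
    if remaining <= 0:
        # Last response of the sequence: mark it final and release the request for good.
        del remainders[corrid]
        return remaining, RESPONSE_COMPLETE_FINAL, RELEASE_ALL
    # More responses to produce: send a non-final response and ask Triton to
    # re-enqueue the same request by releasing it with the RESCHEDULE flag.
    return remaining, None, RELEASE_RESCHEDULE

# With INPUT = 2, as in the tests above, the same request is executed twice:
print(execute_once(1, 2))  # (1, None, RELEASE_RESCHEDULE) -> first response OUTPUT = 1
print(execute_once(1, 2))  # (0, RESPONSE_COMPLETE_FINAL, RELEASE_ALL) -> last response OUTPUT = 0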