triton-inference-server · kthui · Jun 6, 2024 · May 31, 2024 · May 31, 2024 · Jun 3, 2024
diff --git a/README.md b/README.md
@@ -479,6 +479,12 @@ Upon return from the execute function all tensor data associated with the
 InferenceRequest objects passed to the function are deleted, and so
 InferenceRequest objects should not be retained by the Python model.
 
+Starting from 24.06, models may choose to send the response using the
+`InferenceResponseSender` as illustrated in [Decoupled mode](#decoupled-mode).
+Since the model is in default mode, it must send exactly one response per
+request. The `pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL` flag must be sent
+either with the response or as a flag-only response afterward.
+
 #### Error Handling
 
 In case one of the requests has an error, you can use the `TritonError` object

diff --git a/src/infer_request.cc b/src/infer_request.cc
@@ -74,7 +74,7 @@ InferRequest::InferRequest(
   pb_cancel_ =
       std::make_shared<PbCancel>(response_factory_address_, request_address_);
   response_sender_ = std::make_shared<ResponseSender>(
-      request_address_, response_factory_address_,
+      request_address_, response_factory_address_, nullptr /* is_decoupled */,
       Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_);
 #endif
 }
@@ -272,7 +272,8 @@ InferRequest::SaveToSharedMemory(std::unique_ptr<SharedMemoryManager>& shm_pool)
 std::unique_ptr<InferRequest>
 InferRequest::LoadFromSharedMemory(
     std::unique_ptr<SharedMemoryManager>& shm_pool,
-    bi::managed_external_buffer::handle_t request_handle, bool open_cuda_handle)
+    bi::managed_external_buffer::handle_t request_handle, bool open_cuda_handle,
+    bool const* is_model_decoupled)
 {
   AllocatedSharedMemory<char> infer_request_shm =
       shm_pool->Load<char>(request_handle);
@@ -328,7 +329,7 @@ InferRequest::LoadFromSharedMemory(
   return std::unique_ptr<InferRequest>(new InferRequest(
       infer_request_shm, request_id_shm, correlation_id_shm,
       requested_output_names_shm, model_name_shm, input_tensors, parameters_shm,
-      infer_trace_shm));
+      infer_trace_shm, is_model_decoupled));
 }
 
 InferRequest::InferRequest(
@@ -339,7 +340,8 @@ InferRequest::InferRequest(
     std::unique_ptr<PbString>& model_name_shm,
     std::vector<std::shared_ptr<PbTensor>>& input_tensors,
     std::unique_ptr<PbString>& parameters_shm,
-    std::unique_ptr<InferenceTrace>& infer_trace_shm)
+    std::unique_ptr<InferenceTrace>& infer_trace_shm,
+    bool const* is_model_decoupled)
     : infer_request_shm_(std::move(infer_request_shm)),
       request_id_shm_(std::move(request_id_shm)),
       requested_output_names_shm_(std::move(requested_output_names_shm)),
@@ -387,7 +389,7 @@ InferRequest::InferRequest(
   pb_cancel_ =
       std::make_shared<PbCancel>(response_factory_address_, request_address_);
   response_sender_ = std::make_shared<ResponseSender>(
-      request_address_, response_factory_address_,
+      request_address_, response_factory_address_, is_model_decoupled,
       Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_);
 #endif
 }
@@ -402,13 +404,6 @@ InferRequest::IsCancelled()
 std::shared_ptr<ResponseSender>
 InferRequest::GetResponseSender()
 {
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
-  if (!stub->IsDecoupled()) {
-    throw PythonBackendException(
-        "'get_response_sender' function must be called only when the model is "
-        "using the decoupled transaction policy.");
-  }
-
   return response_sender_;
 }
 

diff --git a/src/infer_request.h b/src/infer_request.h
@@ -118,7 +118,7 @@ class InferRequest {
   static std::unique_ptr<InferRequest> LoadFromSharedMemory(
       std::unique_ptr<SharedMemoryManager>& shm_pool,
       bi::managed_external_buffer::handle_t request_handle,
-      bool open_cuda_handle);
+      bool open_cuda_handle, bool const* is_model_decoupled);
 
   /// Disallow copying the inference request object.
   DISALLOW_COPY_AND_ASSIGN(InferRequest);
@@ -135,7 +135,8 @@ class InferRequest {
       std::unique_ptr<PbString>& model_name_shm,
       std::vector<std::shared_ptr<PbTensor>>& input_tensors,
       std::unique_ptr<PbString>& parameters_shm,
-      std::unique_ptr<InferenceTrace>& infer_trace_shm);
+      std::unique_ptr<InferenceTrace>& infer_trace_shm,
+      bool const* is_model_decoupled);
 
   std::string request_id_;
   CorrelationId correlation_id_;