
Commit caca2d5

Fix BLS decoupled segfault and hang (triton-inference-server#325) (triton-inference-server#326)

* Store InferPayload using the address of the object managed by the shared_ptr

* Fix hang

* Release GIL before sending message to the other process

* Release GIL in the beginning
krishung5 authored Dec 8, 2023
1 parent ffbac67 commit caca2d5
Showing 4 changed files with 17 additions and 4 deletions.
8 changes: 7 additions & 1 deletion src/infer_request.cc
@@ -442,6 +442,13 @@ InferRequest::GetResponseSender()
 std::shared_ptr<InferResponse>
 InferRequest::Exec(const bool is_decoupled)
 {
+// Release the GIL. This avoids a potential deadlock situation in the parent
+// process, where every thread in the thread pool is indirectly waiting for a
+// function in the stub process that acquires the GIL. Meanwhile, the current
+// thread, which holds the GIL, is also waiting for the parent side to have
+// the next available thread to pick up the job during resource contention.
+py::gil_scoped_release release;
+
 // BLS should not be used in "initialize" or "finalize" function.
 std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
 if (!stub->IsInitialized() || stub->IsFinalizing()) {
@@ -465,7 +472,6 @@ InferRequest::Exec(const bool is_decoupled)
 });

 try {
-py::gil_scoped_release release;
 ipc_message = IPCMessage::Create(shm_pool, true /* inline_response */);
 bool has_exception = false;
 PythonBackendException pb_exception(std::string{});
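For reference, here is a minimal sketch (not Triton code) of the pybind11 pattern this change relies on: py::gil_scoped_release drops the GIL when it is constructed and re-acquires it when it goes out of scope, so declaring it at the top of Exec keeps the GIL released for the entire call, including the blocking exchange with the stub process. The module name gil_demo and the wait primitives standing in for the IPC handshake are illustrative assumptions.

// Minimal sketch of releasing the GIL around a blocking wait; not Triton code.
#include <pybind11/pybind11.h>

#include <chrono>
#include <condition_variable>
#include <mutex>

namespace py = pybind11;

// Hypothetical stand-ins for the IPC handshake with the stub process.
static std::mutex ipc_mutex;
static std::condition_variable ipc_cv;
static bool reply_ready = false;

void blocking_exec()
{
  // Drop the GIL for the rest of this scope. Any Python-side work that needs
  // the GIL can now run while this thread blocks below.
  py::gil_scoped_release release;

  std::unique_lock<std::mutex> lock(ipc_mutex);
  ipc_cv.wait_for(lock, std::chrono::seconds(1), [] { return reply_ready; });
  // The GIL is re-acquired automatically when `release` is destroyed.
}

PYBIND11_MODULE(gil_demo, m)
{
  m.def("blocking_exec", &blocking_exec);
}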
4 changes: 2 additions & 2 deletions src/python_be.cc
@@ -752,7 +752,7 @@ ModelInstanceState::ExecuteBLSRequest(
 if (is_decoupled && (infer_response->Id() != nullptr)) {
 // Need to manage the lifetime of InferPayload object for bls
 // decoupled responses.
-infer_payload_[reinterpret_cast<void*>(&infer_payload)] =
+infer_payload_[reinterpret_cast<intptr_t>(infer_payload.get())] =
 infer_payload;
 }

@@ -943,7 +943,7 @@ ModelInstanceState::ProcessBLSCleanupRequest(
 reinterpret_cast<CleanupMessage*>(cleanup_request_message.data_.get());

 void* id = cleanup_message_ptr->id;
-infer_payload_.erase(id);
+infer_payload_.erase(reinterpret_cast<intptr_t>(id));

 {
 bi::scoped_lock<bi::interprocess_mutex> lock{*(message->ResponseMutex())};
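As a side note, the sketch below (not Triton code) shows the keying scheme the change above adopts: the map key is derived from the pointer returned by shared_ptr::get(), i.e. the address of the managed object, rather than the address of the local shared_ptr handle, so the same id can be recomputed later from the raw pointer carried in the cleanup message. The Payload type and the registry/function names are illustrative.

// Minimal sketch of a registry keyed by the address of the managed object.
#include <cassert>
#include <cstdint>
#include <memory>
#include <unordered_map>

struct Payload {};

static std::unordered_map<intptr_t, std::shared_ptr<Payload>> registry;

intptr_t RegisterPayload(const std::shared_ptr<Payload>& payload)
{
  // payload.get() is stable for the lifetime of the managed object, so the
  // key can be recomputed from a raw pointer sent across a process boundary.
  // By contrast, &payload is the address of this particular handle (often a
  // stack variable) and is not a reliable key.
  intptr_t id = reinterpret_cast<intptr_t>(payload.get());
  registry[id] = payload;  // keeps the object alive until cleanup
  return id;
}

void CleanupPayload(void* id)
{
  // The id round-trips as a void* in the cleanup message and is converted
  // back to the map's key type before erasing.
  registry.erase(reinterpret_cast<intptr_t>(id));
}

int main()
{
  auto payload = std::make_shared<Payload>();
  intptr_t id = RegisterPayload(payload);
  assert(registry.count(id) == 1);
  CleanupPayload(reinterpret_cast<void*>(id));
  assert(registry.empty());
  return 0;
}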
2 changes: 1 addition & 1 deletion src/python_be.h
@@ -286,7 +286,7 @@ class ModelInstanceState : public BackendModelInstance {
 std::unique_ptr<IPCMessage> received_message_;
 std::vector<std::future<void>> futures_;
 std::unique_ptr<boost::asio::thread_pool> thread_pool_;
-std::unordered_map<void*, std::shared_ptr<InferPayload>> infer_payload_;
+std::unordered_map<intptr_t, std::shared_ptr<InferPayload>> infer_payload_;
 std::unique_ptr<RequestExecutor> request_executor_;
 std::mutex response_factory_map_mutex_;
 std::unordered_map<intptr_t, TRITONBACKEND_ResponseFactory*>
7 changes: 7 additions & 0 deletions src/response_sender.cc
@@ -50,6 +50,13 @@ void
 ResponseSender::Send(
 std::shared_ptr<InferResponse> infer_response, const uint32_t flags)
 {
+// Release the GIL. This avoids a potential deadlock situation in the parent
+// process, where every thread in the thread pool is indirectly waiting for a
+// function in the stub process that acquires the GIL. Meanwhile, the current
+// thread, which holds the GIL, is also waiting for the parent side to have
+// the next available thread to pick up the job during resource contention.
+py::gil_scoped_release release;
+
 if (closed_) {
 throw PythonBackendException(
 "Unable to send response. Response sender has been closed.");
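To illustrate the hang the comment describes, the sketch below (not Triton code) uses a plain std::mutex as a stand-in for the GIL: a waiter blocks on a reply that can only be produced by code that also needs that lock. Releasing the lock before waiting, which is what the gil_scoped_release at the top of Send achieves for the GIL, lets the reply path make progress. All names here are illustrative.

// Minimal sketch of the deadlock shape; std::mutex stands in for the GIL.
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

static std::mutex gil;  // stand-in for the Python GIL
static std::mutex reply_mutex;
static std::condition_variable reply_cv;
static bool reply_ready = false;

void reply_path()
{
  // Needs the "GIL" before it can produce the reply.
  std::lock_guard<std::mutex> hold(gil);
  {
    std::lock_guard<std::mutex> lock(reply_mutex);
    reply_ready = true;
  }
  reply_cv.notify_one();
}

void sender()
{
  // Analogous to a Send() call that blocks until a reply arrives.
  std::unique_lock<std::mutex> hold(gil);
  hold.unlock();  // the fix: do not hold the "GIL" while waiting
  std::unique_lock<std::mutex> lock(reply_mutex);
  reply_cv.wait(lock, [] { return reply_ready; });
  std::cout << "reply received\n";
}

int main()
{
  std::thread s(sender), r(reply_path);
  s.join();
  r.join();
  return 0;
}

Without the unlock, sender would wait forever while holding the lock that reply_path needs, which is the shape of the hang these changes remove.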
