From 4159726d5277fe6b2b4521e4d23579029886bf68 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Thu, 28 Mar 2024 14:50:49 -0700
Subject: [PATCH 01/13] Add async decoupled execute

---
 src/pb_stub.cc | 83 +++++++++++++++++++++++++++++++++++++++++++++++---
 src/pb_stub.h  | 12 +++++++-
 2 files changed, 89 insertions(+), 6 deletions(-)

diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index a9a910a1..13ce7d7a 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -533,6 +533,8 @@ Stub::Initialize(bi::managed_external_buffer::handle_t map_handle)
       c_python_backend_utils.attr("InferenceResponse"));
   c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get());
 
+  async_event_loop_ = py::none();
+
   py::object TritonPythonModel = sys.attr("TritonPythonModel");
   deserialize_bytes_ = python_backend_utils.attr("deserialize_bytes_tensor");
   serialize_bytes_ = python_backend_utils.attr("serialize_byte_tensor");
@@ -690,11 +692,18 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr)
     py::object execute_return =
         model_instance_.attr("execute")(py_request_list);
 
-    if (!py::isinstance<py::none>(execute_return)) {
-      throw PythonBackendException(
-          "Python model '" + name_ +
-          "' is using the decoupled mode and the execute function must "
-          "return None.");
+    bool is_coroutine = py::module::import("asyncio")
+                            .attr("iscoroutine")(execute_return)
+                            .cast<bool>();
+    if (is_coroutine) {
+      RunCoroutine(execute_return);
+    } else {
+      if (!py::isinstance<py::none>(execute_return)) {
+        throw PythonBackendException(
+            "Python model '" + name_ +
+            "' is using the decoupled mode and the execute function must "
+            "return None.");
+      }
     }
   }
 }
@@ -870,6 +879,60 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr)
   }
 }
 
+py::object
+Stub::GetAsyncEventLoop()
+{
+  if (py::isinstance<py::none>(async_event_loop_)) {
+    // Create the event loop if not already.
+    async_event_loop_ = py::module_::import("asyncio").attr("new_event_loop")();
+    py::object py_thread =
+        py::module_::import("threading")
+            .attr("Thread")(
+                "target"_a = async_event_loop_.attr("run_forever"),
+                "daemon"_a = true);
+    py_thread.attr("start")();
+  }
+  return async_event_loop_;
+}
+
+py::object
+Stub::RunCoroutine(py::object coroutine)
+{
+  py::object loop = GetAsyncEventLoop();
+  py::object py_future = py::module_::import("asyncio").attr(
+      "run_coroutine_threadsafe")(coroutine, loop);
+
+  {
+    std::lock_guard<std::mutex> lock(async_event_futures_mu_);
+
+    std::shared_ptr<std::future<void>> shared_future(new std::future<void>());
+    std::future<void> c_future = std::async(
+        std::launch::async, [this, shared_future, py_future]() mutable {
+          {
+            py::gil_scoped_acquire gil_acquire;
+            try {
+              py_future.attr("result")();
+            }
+            catch (const PythonBackendException& pb_exception) {
+              LOG_ERROR << pb_exception.what();
+            }
+            catch (const py::error_already_set& error) {
+              LOG_ERROR << error.what();
+            }
+            py_future = py::none();
+          }
+          {
+            std::lock_guard<std::mutex> lock(async_event_futures_mu_);
+            async_event_futures_.erase(shared_future);
+          }
+        });
+    *shared_future = std::move(c_future);
+    async_event_futures_.emplace(std::move(shared_future));
+  }
+
+  return py::none();
+}
+
 void
 Stub::UpdateHealth()
 {
@@ -881,6 +944,14 @@ void
 Stub::Finalize()
 {
   finalizing_ = true;
+  // Stop async event loop if created.
+  if (!py::isinstance<py::none>(async_event_loop_)) {
+    if (!async_event_futures_.empty()) {
+      LOG_ERROR << "Finalizing stub with " << async_event_futures_.size()
+                << " ongoing coroutines";
+    }
+    async_event_loop_.attr("stop")();
+  }
   // Call finalize if exists.
   if (initialized_ && py::hasattr(model_instance_, "finalize")) {
     try {
@@ -943,6 +1014,8 @@ Stub::~Stub()
 {
   {
     py::gil_scoped_acquire acquire;
+    async_event_futures_.clear();
+    async_event_loop_ = py::none();
     model_instance_ = py::none();
   }
   stub_instance_.reset();

diff --git a/src/pb_stub.h b/src/pb_stub.h
index a51f25f5..1b11c439 100644
--- a/src/pb_stub.h
+++ b/src/pb_stub.h
@@ -1,4 +1,4 @@
-// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -31,6 +31,9 @@
 #include
 #include
+#include
+#include
+#include
 
 #include "infer_request.h"
 #include "infer_response.h"
@@ -255,6 +258,10 @@ class Stub {
   void ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr);
 
+  py::object GetAsyncEventLoop();
+
+  py::object RunCoroutine(py::object coroutine);
+
   /// Get the memory manager message queue
   std::unique_ptr<MessageQueue<intptr_t>>& MemoryManagerQueue();
@@ -363,6 +370,9 @@ class Stub {
   py::object model_instance_;
   py::object deserialize_bytes_;
   py::object serialize_bytes_;
+  py::object async_event_loop_;
+  std::unordered_set<std::shared_ptr<std::future<void>>> async_event_futures_;
+  std::mutex async_event_futures_mu_;
   std::unique_ptr<MessageQueue<bi::managed_external_buffer::handle_t>>
       stub_message_queue_;
   std::unique_ptr<MessageQueue<bi::managed_external_buffer::handle_t>>

From 42ed090668175c04a9a0f144a24ffd45562a4d11 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Mon, 1 Apr 2024 15:21:45 -0700
Subject: [PATCH 02/13] Enable decoupled bls async exec

---
 src/pb_stub.cc | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index 13ce7d7a..42122b79 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -884,7 +884,9 @@ Stub::GetAsyncEventLoop()
 {
   if (py::isinstance<py::none>(async_event_loop_)) {
     // Create the event loop if not already.
-    async_event_loop_ = py::module_::import("asyncio").attr("new_event_loop")();
+    py::module asyncio = py::module_::import("asyncio");
+    async_event_loop_ = asyncio.attr("new_event_loop")();
+    asyncio.attr("set_event_loop")(async_event_loop_);
     py::object py_thread =
         py::module_::import("threading")
             .attr("Thread")(
                 "target"_a = async_event_loop_.attr("run_forever"),
                 "daemon"_a = true);
     py_thread.attr("start")();
@@ -1802,11 +1804,6 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
       [](std::shared_ptr<InferRequest>& infer_request, const bool decoupled) {
         std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
-        if (stub->IsDecoupled()) {
-          throw PythonBackendException(
-              "Async BLS request execution is not support in the decoupled "
-              "API.");
-        }
         py::object loop =
             py::module_::import("asyncio").attr("get_running_loop")();
         py::cpp_function callback = [&stub, infer_request, decoupled]() {

From 985c5a24173a307f575824df0cc39d5bc7c5a277 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Tue, 2 Apr 2024 22:56:31 -0700
Subject: [PATCH 03/13] Improve handling for async execute future object

---
 src/pb_stub.cc | 53 ++++++++++++++++++++++----------------------------
 src/pb_stub.h  |  4 ++--
 2 files changed, 25 insertions(+), 32 deletions(-)

diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index 42122b79..f35db5e6 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -904,33 +904,30 @@ Stub::RunCoroutine(py::object coroutine)
   py::object py_future = py::module_::import("asyncio").attr(
       "run_coroutine_threadsafe")(coroutine, loop);
 
-  {
-    std::lock_guard<std::mutex> lock(async_event_futures_mu_);
-
-    std::shared_ptr<std::future<void>> shared_future(new std::future<void>());
-    std::future<void> c_future = std::async(
-        std::launch::async, [this, shared_future, py_future]() mutable {
-          {
-            py::gil_scoped_acquire gil_acquire;
-            try {
-              py_future.attr("result")();
-            }
-            catch (const PythonBackendException& pb_exception) {
-              LOG_ERROR << pb_exception.what();
-            }
-            catch (const py::error_already_set& error) {
-              LOG_ERROR << error.what();
-            }
-            py_future = py::none();
+  std::shared_ptr<std::future<void>> shared_future(new std::future<void>());
+  std::future<void> c_future = std::async(
+      std::launch::async, [this, shared_future, py_future]() mutable {
+        {
+          py::gil_scoped_acquire gil_acquire;
+          try {
+            py_future.attr("result")();
           }
-          {
-            std::lock_guard<std::mutex> lock(async_event_futures_mu_);
-            async_event_futures_.erase(shared_future);
+          catch (const PythonBackendException& pb_exception) {
+            LOG_ERROR << pb_exception.what();
           }
-        });
-    *shared_future = std::move(c_future);
-    async_event_futures_.emplace(std::move(shared_future));
-  }
+          catch (const py::error_already_set& error) {
+            LOG_ERROR << error.what();
+          }
+          py_future = py::none();
+        }
+        std::vector<std::shared_ptr<std::future<void>>> empty;
+        {
+          std::lock_guard<std::mutex> lock(async_event_futures_mu_);
+          done_async_event_futures_.swap(empty);
+          done_async_event_futures_.emplace_back(std::move(shared_future));
+        }
+      });
+  *shared_future = std::move(c_future);
 
   return py::none();
 }
@@ -948,10 +945,6 @@ Stub::Finalize()
   finalizing_ = true;
   // Stop async event loop if created.
   if (!py::isinstance<py::none>(async_event_loop_)) {
-    if (!async_event_futures_.empty()) {
-      LOG_ERROR << "Finalizing stub with " << async_event_futures_.size()
-                << " ongoing coroutines";
-    }
     async_event_loop_.attr("stop")();
   }
   // Call finalize if exists.
@@ -1016,7 +1009,7 @@ Stub::~Stub()
 {
   {
     py::gil_scoped_acquire acquire;
-    async_event_futures_.clear();
+    done_async_event_futures_.clear();
     async_event_loop_ = py::none();
     model_instance_ = py::none();
   }

diff --git a/src/pb_stub.h b/src/pb_stub.h
index 1b11c439..0d933c7e 100644
--- a/src/pb_stub.h
+++ b/src/pb_stub.h
@@ -33,7 +33,7 @@
 #include
 #include
 #include
-#include
+#include
 
 #include "infer_request.h"
 #include "infer_response.h"
@@ -371,7 +371,7 @@ class Stub {
   py::object deserialize_bytes_;
   py::object serialize_bytes_;
   py::object async_event_loop_;
-  std::unordered_set<std::shared_ptr<std::future<void>>> async_event_futures_;
+  std::vector<std::shared_ptr<std::future<void>>> done_async_event_futures_;
   std::mutex async_event_futures_mu_;
   std::unique_ptr<MessageQueue<bi::managed_external_buffer::handle_t>>
       stub_message_queue_;
   std::unique_ptr<MessageQueue<bi::managed_external_buffer::handle_t>>

From 11a63d68bbeea88ca384f576158f0fe002c6ffd1 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Thu, 4 Apr 2024 14:29:33 -0700
Subject: [PATCH 04/13] Add docs for async execute for decoupled model

---
 README.md | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 93fd212f..74c8d8df 100644
--- a/README.md
+++ b/README.md
@@ -620,9 +620,22 @@ full power of what can be achieved from decoupled API. Read
 [Decoupled Backends and Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md)
 for more details on how to host a decoupled model.
 
-##### Known Issues
-
-* Currently, decoupled Python models can not make async infer requests.
+##### Async Execute
+
+Starting from 24.04, `async def execute(self, requests):` is supported for
+decoupled Python models. Its coroutine will be executed by an AsyncIO event loop
+shared with requests executing in a model instance. The next request for the
+model instance can start executing while the current request is waiting.
+
+This is useful for minimizing the number of model instances for models that
+spend the majority of their time waiting, given requests can be executed
+"concurrently" by AsyncIO. To take full advantage of the "concurrency", it is
+vital for the async execute function to not block the event loop from making
+progress while it is waiting, e.g. downloading over the network.
+
+Limitations:
+* The server/backend do not control how many requests can be executed
+"concurrently" by a model instance.
 
 #### Request Rescheduling

From b26d5a09eb20bce57806cd96be895aeea72beedc Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Thu, 4 Apr 2024 14:52:57 -0700
Subject: [PATCH 05/13] Fix link on docs

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 74c8d8df..c7e6e6af 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ any C++ code.
 - [Request Cancellation Handling](#request-cancellation-handling)
 - [Decoupled mode](#decoupled-mode)
   - [Use Cases](#use-cases)
-  - [Known Issues](#known-issues)
+  - [Async Execute](#async-execute)
 - [Request Rescheduling](#request-rescheduling)
 - [`finalize`](#finalize)
 - [Model Config File](#model-config-file)

From c50d0cc6056febb1f972053a5631efed5ea46da1 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Thu, 4 Apr 2024 14:55:29 -0700
Subject: [PATCH 06/13] Improve docs wording

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index c7e6e6af..d057d793 100644
--- a/README.md
+++ b/README.md
@@ -624,8 +624,8 @@ for more details on how to host a decoupled model.
 Starting from 24.04, `async def execute(self, requests):` is supported for
 decoupled Python models. Its coroutine will be executed by an AsyncIO event loop
-shared with requests executing in a model instance. The next request for the
-model instance can start executing while the current request is waiting.
+shared with requests executing in the same model instance. The next request for
+the model instance can start executing while the current request is waiting.
 
 This is useful for minimizing the number of model instances for models that
 spend the majority of their time waiting, given requests can be executed

From 75584745bb77abaa1572ad44b7b30d97d1206283 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Thu, 4 Apr 2024 20:23:33 -0700
Subject: [PATCH 07/13] Improve destruction steps for async execute future object

---
 src/pb_stub.cc | 9 ++++-----
 src/pb_stub.h  | 5 ++---
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index f35db5e6..7a7b9ac5 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -920,12 +920,11 @@ Stub::RunCoroutine(py::object coroutine)
           }
           py_future = py::none();
         }
-        std::vector<std::shared_ptr<std::future<void>>> empty;
         {
-          std::lock_guard<std::mutex> lock(async_event_futures_mu_);
-          done_async_event_futures_.swap(empty);
-          done_async_event_futures_.emplace_back(std::move(shared_future));
+          std::lock_guard<std::mutex> lock(async_event_future_mu_);
+          prev_done_async_event_future_.swap(shared_future);
         }
+        shared_future.reset();
       });
   *shared_future = std::move(c_future);
@@ -1007,9 +1006,9 @@ Stub::~Stub()
   }
 #endif
 
+  prev_done_async_event_future_.reset();
   {
     py::gil_scoped_acquire acquire;
-    done_async_event_futures_.clear();
     async_event_loop_ = py::none();
     model_instance_ = py::none();
   }

diff --git a/src/pb_stub.h b/src/pb_stub.h
index 0d933c7e..ab3cde88 100644
--- a/src/pb_stub.h
+++ b/src/pb_stub.h
@@ -33,7 +33,6 @@
 #include
 #include
 #include
-#include
 
 #include "infer_request.h"
 #include "infer_response.h"
@@ -371,8 +370,8 @@ class Stub {
   py::object deserialize_bytes_;
   py::object serialize_bytes_;
   py::object async_event_loop_;
-  std::vector<std::shared_ptr<std::future<void>>> done_async_event_futures_;
-  std::mutex async_event_futures_mu_;
+  std::shared_ptr<std::future<void>> prev_done_async_event_future_;
+  std::mutex async_event_future_mu_;
   std::unique_ptr<MessageQueue<bi::managed_external_buffer::handle_t>>
       stub_message_queue_;
   std::unique_ptr<MessageQueue<bi::managed_external_buffer::handle_t>>

From 7a7aa5d24566f0077a27fdac7b696a1ca597c9ca Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Fri, 5 Apr 2024 12:33:33 -0700
Subject: [PATCH 08/13] Piggy back on GIL for protection

---
 src/pb_stub.cc | 3 ---
 src/pb_stub.h  | 1 -
 2 files changed, 4 deletions(-)

diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index 7a7b9ac5..f99c2e2c 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -919,9 +919,6 @@ Stub::RunCoroutine(py::object coroutine)
             LOG_ERROR << error.what();
           }
           py_future = py::none();
-        }
-        {
-          std::lock_guard<std::mutex> lock(async_event_future_mu_);
           prev_done_async_event_future_.swap(shared_future);
         }
         shared_future.reset();

diff --git a/src/pb_stub.h b/src/pb_stub.h
index ab3cde88..cf3528d9 100644
--- a/src/pb_stub.h
+++ b/src/pb_stub.h
@@ -371,7 +371,6 @@ class Stub {
   py::object serialize_bytes_;
   py::object async_event_loop_;
   std::shared_ptr<std::future<void>> prev_done_async_event_future_;
-  std::mutex async_event_future_mu_;
   std::unique_ptr<MessageQueue<bi::managed_external_buffer::handle_t>>
       stub_message_queue_;
   std::unique_ptr<MessageQueue<bi::managed_external_buffer::handle_t>>

From 00f0cb8873651fd0d93bb7273727178e6814d9ab Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Fri, 5 Apr 2024 13:21:42 -0700
Subject: [PATCH 09/13] Document model should not modify event loop
---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d057d793..719d3a98 100644
--- a/README.md
+++ b/README.md
@@ -633,7 +633,9 @@ spend the majority of their time waiting, given requests can be executed
 vital for the async execute function to not block the event loop from making
 progress while it is waiting, e.g. downloading over the network.
 
-Limitations:
+Notes:
+* The model should not modify the running event loop, as this might cause
+unexpected issues.
 * The server/backend do not control how many requests can be executed
 "concurrently" by a model instance.

From 68b4439338e6381beda5b14595d7f3c51b3f0bd6 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Mon, 8 Apr 2024 14:37:59 -0700
Subject: [PATCH 10/13] Use Python add_done_callback

---
 src/pb_stub.cc | 59 +++++++++++++++++++++++++++++---------------------
 src/pb_stub.h  |  5 +----
 2 files changed, 35 insertions(+), 29 deletions(-)

diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index f99c2e2c..cb3c41f4 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -104,6 +104,30 @@ PyDefaultArgumentToMutableType(const py::object& argument)
       std::string(py::str(argument.get_type())));
 }
 
+void
+AsyncEventFutureDoneCallback(const py::object& py_future)
+{
+  // TODO: Why does using `py_future.result()` with an error hang on exit?
+  try {
+    py::object exception = py_future.attr("exception")();
+    if (!py::isinstance<py::none>(exception)) {
+      std::string err_msg = "";
+      py::list traceback =
+          py::module_::import("traceback").attr("format_exception")(exception);
+      for (py::handle line : traceback) {
+        err_msg += py::str(line);
+      }
+      LOG_ERROR << err_msg;
+    }
+  }
+  catch (const PythonBackendException& pb_exception) {
+    LOG_ERROR << pb_exception.what();
+  }
+  catch (const py::error_already_set& error) {
+    LOG_ERROR << error.what();
+  }
+}
+
 void
 Stub::Instantiate(
     int64_t shm_growth_size, int64_t shm_default_size,
@@ -897,35 +921,15 @@ Stub::GetAsyncEventLoop()
   return async_event_loop_;
 }
 
-py::object
+void
 Stub::RunCoroutine(py::object coroutine)
 {
   py::object loop = GetAsyncEventLoop();
   py::object py_future = py::module_::import("asyncio").attr(
       "run_coroutine_threadsafe")(coroutine, loop);
-
-  std::shared_ptr<std::future<void>> shared_future(new std::future<void>());
-  std::future<void> c_future = std::async(
-      std::launch::async, [this, shared_future, py_future]() mutable {
-        {
-          py::gil_scoped_acquire gil_acquire;
-          try {
-            py_future.attr("result")();
-          }
-          catch (const PythonBackendException& pb_exception) {
-            LOG_ERROR << pb_exception.what();
-          }
-          catch (const py::error_already_set& error) {
-            LOG_ERROR << error.what();
-          }
-          py_future = py::none();
-          prev_done_async_event_future_.swap(shared_future);
-        }
-        shared_future.reset();
-      });
-  *shared_future = std::move(c_future);
-
-  return py::none();
+  py_future.attr("add_done_callback")(
+      py::module_::import("c_python_backend_utils")
+          .attr("async_event_future_done_callback"));
 }
 
 void
@@ -1003,7 +1007,6 @@ Stub::~Stub()
   }
 #endif
 
-  prev_done_async_event_future_.reset();
   {
     py::gil_scoped_acquire acquire;
     async_event_loop_ = py::none();
     model_instance_ = py::none();
   }
@@ -1919,6 +1922,12 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
       "is_model_ready", &IsModelReady, py::arg("model_name").none(false),
       py::arg("model_version").none(false) = "");
 
+  // This function is not part of the public API for Python backend. This is
+  // only used for internal callbacks.
+  module.def(
+      "async_event_future_done_callback", &AsyncEventFutureDoneCallback,
+      py::arg("py_future").none(false));
+
   // This class is not part of the public API for Python backend. This is only
   // used for internal testing purposes.
   py::class_<SharedMemoryManager>(module, "SharedMemory")

diff --git a/src/pb_stub.h b/src/pb_stub.h
index cf3528d9..c9462fd0 100644
--- a/src/pb_stub.h
+++ b/src/pb_stub.h
@@ -31,8 +31,6 @@
 #include
 #include
-#include
-#include
 
 #include "infer_request.h"
 #include "infer_response.h"
@@ -259,7 +257,7 @@ class Stub {
 
   py::object GetAsyncEventLoop();
 
-  py::object RunCoroutine(py::object coroutine);
+  void RunCoroutine(py::object coroutine);
 
   /// Get the memory manager message queue
   std::unique_ptr<MessageQueue<intptr_t>>& MemoryManagerQueue();
@@ -370,7 +368,6 @@ class Stub {
   py::object deserialize_bytes_;
   py::object serialize_bytes_;
   py::object async_event_loop_;
-  std::shared_ptr<std::future<void>> prev_done_async_event_future_;
   std::unique_ptr<MessageQueue<bi::managed_external_buffer::handle_t>>
       stub_message_queue_;
   std::unique_ptr<MessageQueue<bi::managed_external_buffer::handle_t>>

From 954222cb384c9b11ca64305deb969913458a8c45 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Mon, 8 Apr 2024 20:08:00 -0700
Subject: [PATCH 11/13] Protect infer_payload_

---
 src/python_be.cc | 2 ++
 src/python_be.h  | 1 +
 2 files changed, 3 insertions(+)

diff --git a/src/python_be.cc b/src/python_be.cc
index b688fdfd..b95fb715 100644
--- a/src/python_be.cc
+++ b/src/python_be.cc
@@ -768,6 +768,7 @@ ModelInstanceState::ExecuteBLSRequest(
           if (is_decoupled && (infer_response->Id() != nullptr)) {
             // Need to manage the lifetime of InferPayload object for bls
             // decoupled responses.
+            std::lock_guard<std::mutex> lock(infer_payload_mu_);
             infer_payload_[reinterpret_cast<intptr_t>(infer_payload.get())] =
                 infer_payload;
           }
@@ -961,6 +962,7 @@ ModelInstanceState::ProcessCleanupRequest(
   intptr_t id = reinterpret_cast<intptr_t>(cleanup_message_ptr->id);
   if (message->Command() == PYTHONSTUB_BLSDecoupledInferPayloadCleanup) {
     // Remove the InferPayload object from the map.
+    std::lock_guard<std::mutex> lock(infer_payload_mu_);
     infer_payload_.erase(id);
   } else if (message->Command() == PYTHONSTUB_DecoupledResponseFactoryCleanup) {
     // Delete response factory

diff --git a/src/python_be.h b/src/python_be.h
index 4430767c..9618204c 100644
--- a/src/python_be.h
+++ b/src/python_be.h
@@ -296,6 +296,7 @@ class ModelInstanceState : public BackendModelInstance {
   std::vector<std::future<void>> futures_;
   std::unique_ptr<boost::asio::thread_pool> thread_pool_;
   std::unordered_map<intptr_t, std::shared_ptr<InferPayload>> infer_payload_;
+  std::mutex infer_payload_mu_;
   std::unique_ptr<RequestExecutor> request_executor_;

 public:

From e81386df7ab8c651b5b1d3e46e5e310b6e4d45cc Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Tue, 9 Apr 2024 19:35:35 -0700
Subject: [PATCH 12/13] Use traceback API that supports Python 3.8 and 3.9

---
 src/pb_stub.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index cb3c41f4..b12e249d 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -112,8 +112,10 @@ AsyncEventFutureDoneCallback(const py::object& py_future)
     py::object exception = py_future.attr("exception")();
     if (!py::isinstance<py::none>(exception)) {
       std::string err_msg = "";
-      py::list traceback =
-          py::module_::import("traceback").attr("format_exception")(exception);
+      py::object traceback = py::module_::import("traceback")
+                                 .attr("TracebackException")
+                                 .attr("from_exception")(exception)
+                                 .attr("format")();
       for (py::handle line : traceback) {
         err_msg += py::str(line);
       }

From 2b387c5db241b5c16e07a0f89fd0757d9752083b Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Wed, 10 Apr 2024 10:07:00 -0700
Subject: [PATCH 13/13] Update docs

---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index dca8447b..7f9c7027 100644
--- a/README.md
+++ b/README.md
@@ -629,15 +629,15 @@ the model instance can start executing while the current request is waiting.
 
 This is useful for minimizing the number of model instances for models that
 spend the majority of their time waiting, given requests can be executed
-"concurrently" by AsyncIO. To take full advantage of the "concurrency", it is
-vital for the async execute function to not block the event loop from making
-progress while it is waiting, e.g. downloading over the network.
+concurrently by AsyncIO. To take full advantage of the concurrency, it is vital
+for the async execute function to not block the event loop from making progress
+while it is waiting, e.g. downloading over the network.
 
 Notes:
 * The model should not modify the running event loop, as this might cause
 unexpected issues.
-* The server/backend do not control how many requests can be executed
-"concurrently" by a model instance.
+* The server/backend do not control how many requests are added to the event
+loop by a model instance.
 
 #### Request Rescheduling
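To make the feature this series adds concrete, below is a minimal sketch of a decoupled Python model using the async execute support described in the README changes. It is illustrative only: the "OUT" tensor name and the `asyncio.sleep` stand-in for real awaitable work are assumptions, not part of the patches, while `get_response_sender`, `InferenceResponse`, and `TRITONSERVER_RESPONSE_COMPLETE_FINAL` are the existing decoupled-mode API of `triton_python_backend_utils`.

```python
# model.py - illustrative sketch, not part of the patch series.
import asyncio

import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    async def execute(self, requests):
        # While this coroutine is suspended on an await, the shared event
        # loop can start executing the next requests for this instance.
        await asyncio.sleep(5)  # stand-in for non-blocking work, e.g. I/O
        for request in requests:
            sender = request.get_response_sender()
            out = pb_utils.Tensor("OUT", np.array([1], np.float32))
            sender.send(
                pb_utils.InferenceResponse(output_tensors=[out]),
                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
            )
        # Decoupled execute must return None, so return nothing here.
```

Because every coroutine for a model instance shares one event loop, a blocking call such as `time.sleep(5)` in place of the `await` would stall all in-flight requests on that instance, which is exactly the caveat the README wording above spells out.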