Add ability to update existing schedulers #207

Merged: 36 commits, Jul 24, 2023

Commits (36)
ae751e0
Add scheduler update function
kthui May 15, 2023
7150225
[WIP] Add sequence scheduler update
kthui May 17, 2023
a6ee8c4
Enable sequence model update
kthui May 18, 2023
d416a6f
[WIP cont] Add sequence scheduler update
kthui May 19, 2023
7efb91f
[WIP] Oldest sequence batch flush queued request
kthui May 20, 2023
5b5f7e8
Refactor code
kthui May 22, 2023
31195b3
Use a different way of updating instance group setting
kthui May 23, 2023
4880732
Remove lock on dynamic scheduler enqueue
kthui May 24, 2023
b7bf5ff
Remove lock returned from scheduler update
kthui Jun 1, 2023
210a713
Scheduler interface update
kthui Jun 13, 2023
833d8a7
[WIP] Replace batcher id with model instance pointer
kthui Jun 14, 2023
b3307a3
Fix issues
kthui Jun 15, 2023
a52fec7
[WIP] Enable batcher reuse
kthui Jun 23, 2023
046862d
Minor fixes
kthui Jun 23, 2023
e28ef39
[WIP] Do not block unchanged batchers from infering
kthui Jun 26, 2023
7ff5a14
Clean up and format
kthui Jun 26, 2023
db20a62
Use reinterpret_cast for pointer type conversion
kthui Jun 28, 2023
b45f4ae
Destruct scheduler before instances on model destructor
kthui Jun 28, 2023
1164713
Use dynamic cast for model downcast on model lifecycle
kthui Jul 6, 2023
f60e175
Refactor model reuse code
kthui Jul 11, 2023
1684117
Use slot count when batcher is created during removal
kthui Jul 11, 2023
27630a4
Improve direct batcher destruction logic
kthui Jul 11, 2023
169870b
Add more descriptions and improve function naming
kthui Jul 13, 2023
93f4610
Remove batcher pending sequence completion after update return
kthui Jul 14, 2023
5524945
[WIP] Async destruction of batchers
kthui Jul 15, 2023
00d0ef1
Async destruction of resources
kthui Jul 17, 2023
98cf6d8
Address pre-commit action
kthui Jul 18, 2023
7d47625
Use struct for instance setting
kthui Jul 19, 2023
a21e40e
Adjust host policy init logic
kthui Jul 19, 2023
a05d4f9
Move verify model load gpu fraction to helper function
kthui Jul 19, 2023
20ca4bb
Minor comment improvement
kthui Jul 19, 2023
6cdd4eb
Simplify steps before starting background threads
kthui Jul 20, 2023
f3338a9
Improve iterator variable naming
kthui Jul 20, 2023
db77e37
Group set thread priority into a function
kthui Jul 20, 2023
d875d79
Use atomic bool for thread exit vars
kthui Jul 20, 2023
b6a8a63
Simplify pending removal to removed steps
kthui Jul 20, 2023
218 changes: 181 additions & 37 deletions src/backend_model.cc
@@ -241,21 +241,19 @@ TritonModel::Create(
}

// Create or update the model instances for this model.
RETURN_IF_ERROR(TritonModelInstance::SetInstances(
raw_local_model, backend_cmdline_config_map, host_policy_map,
model_config));
RETURN_IF_ERROR(local_model->CommitInstances());

RETURN_IF_ERROR(local_model->SetConfiguredScheduler());
std::vector<std::shared_ptr<TritonModelInstance>> added_instances,
removed_instances;
RETURN_IF_ERROR(local_model->PrepareInstances(
model_config, &added_instances, &removed_instances));
RETURN_IF_ERROR(local_model->SetConfiguredScheduler(added_instances));
local_model->CommitInstances();

*model = std::move(local_model);
return Status::Success;
}

Status
TritonModel::UpdateInstanceGroup(
const inference::ModelConfig& new_model_config,
std::unique_lock<std::mutex>* caller_lock)
TritonModel::UpdateInstanceGroup(const inference::ModelConfig& new_model_config)
{
// Generate normalized model config with new instance group.
inference::ModelConfig model_config = config_;
@@ -268,25 +266,26 @@ TritonModel::UpdateInstanceGroup(
&model_config));
RETURN_IF_ERROR(ValidateInstanceGroup(model_config, min_compute_capability_));

// Update the instances to the new config.
caller_lock->unlock(); // allow inference while creating instances
Status status = TritonModelInstance::SetInstances(
this, backend_cmdline_config_map_, host_policy_map_, model_config);
caller_lock->lock();
// Prepare the new instances on the new config.
std::vector<std::shared_ptr<TritonModelInstance>> added_instances,
removed_instances;
Status status =
PrepareInstances(model_config, &added_instances, &removed_instances);
if (!status.IsOk()) {
ClearBackgroundInstances();
return status;
}

// Update the scheduler.
status = UpdateConfiguredScheduler(added_instances, removed_instances);
if (!status.IsOk()) {
// Remove any pending instances if created.
bg_instances_.clear();
bg_passive_instances_.clear();
ClearBackgroundInstances();
return status;
}

// At this point, the new model config is ready but not yet written into this
// object. The 'caller_lock' is held, so 'model_lifecycle' will pause any new
// inference request. It is safe to move forward and commit the change.
config_.mutable_instance_group()->Swap(model_config.mutable_instance_group());
RETURN_IF_ERROR(CommitInstances());
// Only model owned dynamic batch scheduler can be updated currently, so there
// is no need to update the scheduler.
// Commit the instance update.
CommitInstances();
*config_.mutable_instance_group() = model_config.instance_group();

return Status::Success;
}
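
Note: as a minimal caller-side sketch (not part of this diff), the new single-argument UpdateInstanceGroup() could be driven as follows; the model pointer, the Config() accessor, and the specific field mutation are illustrative assumptions.

// Illustrative only: build a config whose instance_group differs from the
// current one and hand it to the model. UpdateInstanceGroup() prepares the
// new instances in the background, updates the scheduler, then commits.
inference::ModelConfig new_config = model->Config();  // assumed accessor
new_config.mutable_instance_group(0)->set_count(4);   // e.g. scale from 2 to 4

Status status = model->UpdateInstanceGroup(new_config);
if (!status.IsOk()) {
  // On failure the background instances are cleared and the model keeps
  // serving with its previous instances and scheduler.
  LOG_ERROR << "instance update failed: " << status.Message();
}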
@@ -394,27 +393,146 @@ TritonModel::IndexInstances() const
}

Status
TritonModel::RegisterInstance(
std::shared_ptr<TritonModelInstance>&& instance, const bool passive)
TritonModel::PrepareInstances(
const inference::ModelConfig& model_config,
std::vector<std::shared_ptr<TritonModelInstance>>* added_instances,
std::vector<std::shared_ptr<TritonModelInstance>>* removed_instances)
{
if (passive) {
bg_passive_instances_.emplace_back(std::move(instance));
} else {
bg_instances_.emplace_back(std::move(instance));
added_instances->clear();
removed_instances->clear();

std::unordered_map<
TritonModelInstance::Signature,
std::vector<std::shared_ptr<TritonModelInstance>>>
existing_instances = IndexInstances();

// Iterates over all the requested instances on the model config, and decides
// if each requested instance can reuse an existing instance or a new instance
// is needed.
for (const auto& group : model_config.instance_group()) {
std::vector<std::string> profile_names;
for (const auto& profile_name : group.profile()) {
profile_names.push_back(profile_name);
}
std::vector<TritonModelInstance::SecondaryDevice> secondary_devices;
for (const auto& secondary_device : group.secondary_devices()) {
secondary_devices.emplace_back(
inference::
ModelInstanceGroup_SecondaryDevice_SecondaryDeviceKind_Name(
secondary_device.kind()),
secondary_device.device_id());
}
for (int32_t c = 0; c < group.count(); ++c) {
std::string instance_name{group.name() + "_" + std::to_string(c)};
const bool passive = group.passive();
struct InstanceSetting {
InstanceSetting(
const std::string& policy_name, TRITONSERVER_InstanceGroupKind kind,
int32_t device_id,
const inference::ModelRateLimiter* rate_limiter_config)
: policy_name_(policy_name), kind_(kind), device_id_(device_id),
rate_limiter_config_(rate_limiter_config)
{
}
const std::string policy_name_;
const TRITONSERVER_InstanceGroupKind kind_;
const int32_t device_id_;
const inference::ModelRateLimiter* rate_limiter_config_;
};
std::vector<InstanceSetting> instance_settings;
if (group.kind() == inference::ModelInstanceGroup::KIND_CPU) {
instance_settings.emplace_back(
group.host_policy().empty() ? "cpu" : group.host_policy(),
TRITONSERVER_INSTANCEGROUPKIND_CPU, 0 /* device_id */,
&group.rate_limiter());
} else if (group.kind() == inference::ModelInstanceGroup::KIND_GPU) {
for (const int32_t device_id : group.gpus()) {
instance_settings.emplace_back(
group.host_policy().empty() ? ("gpu_" + std::to_string(device_id))
: group.host_policy(),
TRITONSERVER_INSTANCEGROUPKIND_GPU, device_id,
&group.rate_limiter());
}
} else if (group.kind() == inference::ModelInstanceGroup::KIND_MODEL) {
instance_settings.emplace_back(
group.host_policy().empty() ? "model" : group.host_policy(),
TRITONSERVER_INSTANCEGROUPKIND_MODEL, 0 /* device_id */,
&group.rate_limiter());
} else {
return Status(
Status::Code::INVALID_ARG,
std::string("instance_group kind ") +
ModelInstanceGroup_Kind_Name(group.kind()) + " not supported");
}
for (const auto& is : instance_settings) {
// All the information for the requested instance is ready. Create a
// signature that identifies the requested instance.
const TritonModelInstance::Signature signature(group, is.device_id_);

// Check if the requested instance can reuse an existing instance.
if (!TritonModelInstance::ShareBackendThread(
DeviceBlocking(), is.kind_)) {
auto itr = existing_instances.find(signature);
if (itr != existing_instances.end() && !itr->second.empty()) {
auto existing_instance = itr->second.back();
itr->second.pop_back();
LOG_VERBOSE(2) << "Re-using model instance named '"
<< existing_instance->Name() << "' with device id '"
<< existing_instance->DeviceId() << "'";
RegisterBackgroundInstance(std::move(existing_instance), passive);

continue;
}
}

// The requested instance did not match an existing instance. Create a
// new instance.
std::shared_ptr<TritonModelInstance> new_instance;
LOG_VERBOSE(2) << "Creating model instance named '" << instance_name
<< "' with device id '" << is.device_id_ << "'";
RETURN_IF_ERROR(TritonModelInstance::CreateInstance(
this, instance_name, signature, is.kind_, is.device_id_,
profile_names, passive, is.policy_name_, *is.rate_limiter_config_,
secondary_devices, &new_instance));
added_instances->push_back(new_instance);
RegisterBackgroundInstance(std::move(new_instance), passive);
}
}
}

// Any existing instances not reused will be removed.
for (auto pair : existing_instances) {
removed_instances->insert(
removed_instances->end(), pair.second.begin(), pair.second.end());
}

return Status::Success;
}
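
For readers skimming the diff, the reuse bookkeeping above can be reduced to the following self-contained sketch; the Signature and Instance types here are placeholders standing in for TritonModelInstance and its Signature, not the real API.

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

using Signature = std::string;   // placeholder for TritonModelInstance::Signature
struct Instance { Signature sig; };

void PrepareSketch(
    const std::vector<Signature>& requested,
    std::unordered_map<Signature, std::vector<std::shared_ptr<Instance>>>
        existing,  // indexed foreground instances, consumed as matches are found
    std::vector<std::shared_ptr<Instance>>* added,
    std::vector<std::shared_ptr<Instance>>* removed,
    std::vector<std::shared_ptr<Instance>>* background)
{
  for (const auto& sig : requested) {
    auto itr = existing.find(sig);
    if (itr != existing.end() && !itr->second.empty()) {
      // A matching existing instance is reused rather than recreated.
      background->push_back(itr->second.back());
      itr->second.pop_back();
    } else {
      // No match: a brand new instance is both "added" and "background".
      auto fresh = std::make_shared<Instance>(Instance{sig});
      added->push_back(fresh);
      background->push_back(fresh);
    }
  }
  // Whatever was not reused is reported for removal.
  for (auto& pair : existing) {
    removed->insert(removed->end(), pair.second.begin(), pair.second.end());
  }
}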

Status
void
TritonModel::CommitInstances()
{
instances_.swap(bg_instances_);
passive_instances_.swap(bg_passive_instances_);
ClearBackgroundInstances();
}

void
TritonModel::RegisterBackgroundInstance(
std::shared_ptr<TritonModelInstance>&& instance, const bool passive)
{
if (passive) {
bg_passive_instances_.emplace_back(std::move(instance));
} else {
bg_instances_.emplace_back(std::move(instance));
}
}

void
TritonModel::ClearBackgroundInstances()
{
bg_instances_.clear();
bg_passive_instances_.clear();

return Status::Success;
}
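
Taken together, CommitInstances() and ClearBackgroundInstances() above amount to a swap-and-clear; a hypothetical illustration of the state transition, with instance names invented for the example:

// Before CommitInstances():
//   instances_     = { old_0, old_1 }   // still serving requests
//   bg_instances_  = { old_0, new_2 }   // old_0 reused, new_2 freshly created
//
// After the swap and ClearBackgroundInstances():
//   instances_     = { old_0, new_2 }   // the new live set
//   bg_instances_  = {}                 // the reference to old_1 is dropped here,
//                                       // so the removed instance is destroyed
//                                       // once no other holder remains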

std::vector<std::shared_ptr<TritonModelInstance>>
@@ -483,7 +601,8 @@ TritonModel::UpdateModelConfig(
}

Status
TritonModel::SetConfiguredScheduler()
TritonModel::SetConfiguredScheduler(
const std::vector<std::shared_ptr<TritonModelInstance>>& new_instances)
{
std::unique_ptr<Scheduler> scheduler;

@@ -513,7 +632,7 @@ TritonModel::SetConfiguredScheduler()
"sequence batcher, using default batching strategy";
}
RETURN_IF_ERROR(SequenceBatchScheduler::Create(
this, enforce_equal_shape_tensors, &scheduler));
this, new_instances, enforce_equal_shape_tensors, &scheduler));
} else if (config_.has_dynamic_batching()) {
// Dynamic batcher
RETURN_IF_ERROR(DynamicBatchScheduler::Create(
@@ -539,6 +658,29 @@
return SetScheduler(std::move(scheduler));
}

Status
TritonModel::UpdateConfiguredScheduler(
const std::vector<std::shared_ptr<TritonModelInstance>>& added_instances,
const std::vector<std::shared_ptr<TritonModelInstance>>& removed_instances)
{
if (config_.has_sequence_batching()) {
SequenceBatchScheduler* sched =
dynamic_cast<SequenceBatchScheduler*>(scheduler_.get());
if (sched == nullptr) {
return Status(
Status::Code::INTERNAL,
"Unable to downcast from 'Scheduler' to 'SequenceBatchScheduler' "
"during scheduler update");
}
return sched->Update(added_instances, removed_instances);
}

// Non-sequence scheduler does not need to be updated, because other
// schedulers do not require the information on model instances to function,
// and only interact with the rate limiter.
return Status::Success;
}
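
The sequence path above hinges on a checked downcast from the stored Scheduler; a standalone sketch of the same guard pattern, with a hypothetical scheduler hierarchy standing in for the real classes:

#include <memory>

struct Scheduler {
  virtual ~Scheduler() = default;
};
struct SequenceScheduler : Scheduler {
  void Update() { /* adjust per-instance batchers for the new instance set */ }
};

// Returns false when the stored scheduler is not actually a SequenceScheduler,
// mirroring the INTERNAL error returned above instead of risking undefined
// behavior through an unchecked static_cast.
bool TryUpdate(const std::unique_ptr<Scheduler>& scheduler) {
  auto* seq = dynamic_cast<SequenceScheduler*>(scheduler.get());
  if (seq == nullptr) {
    return false;
  }
  seq->Update();
  return true;
}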

Status
TritonModel::SetBatchingStrategy(const std::string& batch_libpath)
{
@@ -629,12 +771,14 @@ TritonModel::~TritonModel()
// Clear library handles.
ClearHandles();

// Explicitly delete/finalize the scheduler before the model instances.
scheduler_.reset(nullptr);

// Explicitly delete/finalize all model instances before finalizing
// the model itself.
instances_.clear();
passive_instances_.clear();
bg_instances_.clear();
bg_passive_instances_.clear();
ClearBackgroundInstances();

// Unregister itself from the rate limiter. Note this should happen
// after all instances are destructed. Destrucing instances ensures
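An aside on the destructor change above: resetting the scheduler before clearing the instances avoids shutdown-time dispatch into freed instances. A self-contained sketch of the ordering, with placeholder types; the real members and their declaration order may differ.

#include <memory>
#include <vector>

struct Instance {};        // placeholder for TritonModelInstance
struct Scheduler {         // placeholder; may reference instances while running
  std::vector<Instance*> targets;
};

class ModelSketch {
 public:
  ~ModelSketch() {
    // Finalize the scheduler first so nothing can dispatch to an instance
    // that is about to go away, then release the instances themselves.
    scheduler_.reset();
    instances_.clear();
  }

 private:
  std::unique_ptr<Scheduler> scheduler_;
  std::vector<std::shared_ptr<Instance>> instances_;
};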
61 changes: 39 additions & 22 deletions src/backend_model.h
@@ -80,10 +80,15 @@ class TritonModel : public Model {
TRITONSERVER_Message* updated_config_message);
// Return the underlying backend.
const std::shared_ptr<TritonBackend>& Backend() const { return backend_; }
// Return the foreground instances, excluding passive instances.
const std::vector<std::shared_ptr<TritonModelInstance>>& Instances() const
// Return the backend command line config map.
const triton::common::BackendCmdlineConfigMap& BackendConfigMap() const
{
return instances_;
return backend_cmdline_config_map_;
}
// Return the host policy command line config map.
const triton::common::HostPolicyCmdlineConfigMap& HostPolicyMap() const
{
return host_policy_map_;
}

// True if different instances should be grouped by device; false otherwise.
@@ -96,21 +101,8 @@ class TritonModel : public Model {
void* State() { return state_; }
void SetState(void* state) { state_ = state; }

// Return all foreground instances indexed by its respective signature.
std::unordered_map<
TritonModelInstance::Signature,
std::vector<std::shared_ptr<TritonModelInstance>>>
IndexInstances() const;
// Register new instances into the background.
Status RegisterInstance(
std::shared_ptr<TritonModelInstance>&& instance, const bool passive);

// Update instance group. 'caller_lock' will be released when creating new
// instances and re-held when returning, to allow atomic switch over to the
// new instances.
Status UpdateInstanceGroup(
const inference::ModelConfig& new_model_config,
std::unique_lock<std::mutex>* caller_lock);
// Update instance group.
Status UpdateInstanceGroup(const inference::ModelConfig& new_model_config);

// Custom batching function getters.
TritonModelBatchInclFn_t ModelBatchInclFn() const { return batch_incl_fn_; }
@@ -130,8 +122,27 @@
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const triton::common::HostPolicyCmdlineConfigMap& host_policy_map);

// Prepare the next set of instances on the background. Returns the instances
// that will be added and removed if the next set of instances is to be
// committed.
Status PrepareInstances(
const inference::ModelConfig& model_config,
std::vector<std::shared_ptr<TritonModelInstance>>* added_instances,
std::vector<std::shared_ptr<TritonModelInstance>>* removed_instances);
// Replace the foreground instances with background instances.
Status CommitInstances();
void CommitInstances();

// Return all foreground instances indexed by its respective signature.
std::unordered_map<
TritonModelInstance::Signature,
std::vector<std::shared_ptr<TritonModelInstance>>>
IndexInstances() const;

// Add a new instance into the background.
void RegisterBackgroundInstance(
std::shared_ptr<TritonModelInstance>&& instance, const bool passive);
// Clear all background instances.
void ClearBackgroundInstances();

// Gets the execution policy setting from the backend.
Status GetExecutionPolicy(const inference::ModelConfig& model_config);
@@ -157,9 +168,15 @@
return res;
}

// Set the scheduler based on the model configuration and foreground
// 'instances'.
Status SetConfiguredScheduler();
// Set the scheduler based on the model configuration and the provided
// instances.
Status SetConfiguredScheduler(
const std::vector<std::shared_ptr<TritonModelInstance>>& new_instances);
// Update the set scheduler to the new set of instances.
Status UpdateConfiguredScheduler(
const std::vector<std::shared_ptr<TritonModelInstance>>& added_instances,
const std::vector<std::shared_ptr<TritonModelInstance>>&
removed_instances);

// Set the batching strategy, if custom functions provided by user.
// This function should only be called with the dynamic batcher.