Add BackendAttribute for parallel model instance loading #235

Merged
merged 9 commits on Jul 26, 2023
22 changes: 21 additions & 1 deletion include/triton/core/tritonbackend.h
@@ -94,7 +94,7 @@ struct TRITONBACKEND_Batcher;
/// }
///
#define TRITONBACKEND_API_VERSION_MAJOR 1
#define TRITONBACKEND_API_VERSION_MINOR 13
#define TRITONBACKEND_API_VERSION_MINOR 14

/// Get the TRITONBACKEND API version supported by Triton. This value
/// can be compared against the TRITONBACKEND_API_VERSION_MAJOR and
@@ -1480,6 +1480,26 @@ TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
const TRITONSERVER_InstanceGroupKind kind, const uint64_t count,
const uint64_t* device_ids, const uint64_t id_count);

/// Sets whether or not the backend supports concurrently loading multiple
/// TRITONBACKEND_ModelInstances in a thread-safe manner.
///
/// Most backends are thread-safe for parallel execution of model instances as
/// that is the primary use of concurrency in backends. However, not all
/// backends are thread-safe when initializing or finalizing model instances. In
/// order for Triton to know that it can safely load instances concurrently, the
/// backend needs to opt in by setting this backend attribute to true. By
/// default, this attribute is false and calls to the
/// TRITONBACKEND_ModelInstanceInitialize function will be made serially. If
/// this attribute is set to true, then Triton will make calls to
/// TRITONBACKEND_ModelInstanceInitialize concurrently.
///
/// \param backend_attributes The backend attributes object.
/// \param enabled Whether or not the backend supports loading model instances
/// in parallel.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendAttributeSetParallelInstanceLoading(
TRITONBACKEND_BackendAttribute* backend_attributes, bool enabled);

/// TRITONBACKEND Batching
///
/// API to add custom batching strategy
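A minimal sketch of how a backend might opt in, assuming the existing optional TRITONBACKEND_GetBackendAttribute entry point that Triton calls (when a backend implements it) to query backend attributes. This example is illustrative only and is not part of the diff:

// Illustrative backend source, not part of this PR.
#include "triton/core/tritonbackend.h"

extern "C" {

TRITONSERVER_Error*
TRITONBACKEND_GetBackendAttribute(
    TRITONBACKEND_Backend* backend,
    TRITONBACKEND_BackendAttribute* backend_attributes)
{
  // Opt in only if this backend's TRITONBACKEND_ModelInstanceInitialize /
  // TRITONBACKEND_ModelInstanceFinalize are thread-safe; Triton may then
  // invoke them concurrently while loading instances.
  return TRITONBACKEND_BackendAttributeSetParallelInstanceLoading(
      backend_attributes, true /* enabled */);
}

}  // extern "C"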
1 change: 1 addition & 0 deletions src/backend_manager.cc
@@ -120,6 +120,7 @@ TritonBackend::UpdateAttributes()
if (!latest.preferred_groups_.empty()) {
attributes_.preferred_groups_ = latest.preferred_groups_;
}
attributes_.parallel_instance_loading_ = latest.parallel_instance_loading_;
return Status::Success;
}

8 changes: 7 additions & 1 deletion src/backend_manager.h
@@ -47,9 +47,15 @@ namespace triton { namespace core {
class TritonBackend {
public:
struct Attribute {
Attribute() : exec_policy_(TRITONBACKEND_EXECUTION_BLOCKING) {}
Attribute()
: exec_policy_(TRITONBACKEND_EXECUTION_BLOCKING),
parallel_instance_loading_(false)
{
}
TRITONBACKEND_ExecutionPolicy exec_policy_;
std::vector<inference::ModelInstanceGroup> preferred_groups_;
// Whether the backend supports loading model instances in parallel
bool parallel_instance_loading_;
};
typedef TRITONSERVER_Error* (*TritonModelInitFn_t)(
TRITONBACKEND_Model* model);
10 changes: 10 additions & 0 deletions src/backend_model.cc
@@ -1531,6 +1531,16 @@ TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
return nullptr;
}


TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendAttributeSetParallelInstanceLoading(
TRITONBACKEND_BackendAttribute* backend_attributes, bool enabled)
{
auto ba = reinterpret_cast<TritonBackend::Attribute*>(backend_attributes);
ba->parallel_instance_loading_ = enabled;
return nullptr;
}

} // extern C

}} // namespace triton::core
120 changes: 82 additions & 38 deletions src/backend_model_instance.cc
@@ -26,6 +26,8 @@

#include "backend_model_instance.h"

#include <future>

#ifndef _WIN32
#include <sys/resource.h>
#include <sys/syscall.h>
@@ -204,6 +206,17 @@ TritonModelInstance::SetInstances(
existing_instances = model->IndexInstances();

static triton::common::HostPolicyCmdlineConfig empty_host_policy;
std::vector<std::future<Status>> creation_results;

// Deferred will be lazily evaluated when the result is requested. Since the
// creation_results are requested serially below, this is equivalent to making
// the calls serially.
auto launch_policy = std::launch::deferred;
// If the backend supports it, std::launch::async will allow the calls to be
// made concurrently.
if (model->Backend()->BackendAttributes().parallel_instance_loading_) {
launch_policy = std::launch::async;
}

for (const auto& group : model_config.instance_group()) {
std::vector<std::string> profile_names;
@@ -252,6 +265,7 @@ TritonModelInstance::SetInstances(
for (const auto& is : instance_setting) {
const auto& kind = std::get<1>(is);
const auto& id = std::get<2>(is);
const auto rate_limiter_config_ptr = std::get<3>(is);

const Signature signature(group, id);
// Check if an existing instance can be re-used.
@@ -271,53 +285,83 @@

// No matching instance for re-using, so create the instance.
const std::string& policy_name = std::get<0>(is);
const triton::common::HostPolicyCmdlineConfig* host_policy;
const triton::common::HostPolicyCmdlineConfig* host_policy =
&empty_host_policy;
const auto policy_it = host_policy_map.find(policy_name);
if (policy_it != host_policy_map.end()) {
host_policy = &policy_it->second;
} else {
host_policy = &empty_host_policy;
}
std::shared_ptr<TritonModelInstance> new_instance;
RETURN_IF_ERROR(SetNumaConfigOnThread(*host_policy));
auto err = CreateInstance(
model, instance_name, signature, kind, id, profile_names, passive,
policy_name, *host_policy, *(std::get<3>(is)), secondary_devices,
&new_instance);
RETURN_IF_ERROR(ResetNumaMemoryPolicy());
RETURN_IF_ERROR(err);
RETURN_IF_ERROR(
model->RegisterInstance(std::move(new_instance), passive));

// When deploying on GPU, we want to make sure the GPU memory usage
// is within allowed range, otherwise, stop the creation to ensure
// there is sufficient GPU memory for other use.
// We check the usage after loading the instance to better enforcing
// the limit. If we check before loading, we may create instance
// that occupies the rest of available memory which against the purpose
if (kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
size_t free, total;
double memory_limit;
RETURN_IF_ERROR(GetDeviceMemoryInfo(id, &free, &total));
RETURN_IF_ERROR(BackendConfigurationModelLoadGpuFraction(
backend_cmdline_config_map, id, &memory_limit));
const size_t allow = total * memory_limit;
const size_t used = total - free;
if (used > allow) {
return Status(
Status::Code::UNAVAILABLE,
std::string("can not create model '") + instance_name +
"': memory limit set for " +
TRITONSERVER_InstanceGroupKindString(kind) + " " +
std::to_string(id) +
" has exceeded, model loading is rejected.");
}

// Note std::async can raise an exception on failure to start threads
try {
// Note that the local variables should be captured by value
creation_results.emplace_back(std::async(
launch_policy,
[host_policy, model, instance_name, signature, kind, id,
profile_names, passive, policy_name, rate_limiter_config_ptr,
secondary_devices, &backend_cmdline_config_map]() {
std::shared_ptr<TritonModelInstance> new_instance;
RETURN_IF_ERROR(SetNumaConfigOnThread(*host_policy));
// NOTE [thread-safety]: CreateInstance can modify bg_instances
// via SetBackendThread
auto err = CreateInstance(
model, instance_name, signature, kind, id, profile_names,
passive, policy_name, *host_policy,
*rate_limiter_config_ptr, secondary_devices, &new_instance);
RETURN_IF_ERROR(ResetNumaMemoryPolicy());
RETURN_IF_ERROR(err);
// NOTE [thread-safety]: RegisterInstance modifies bg/bg_passive
// instances
RETURN_IF_ERROR(
model->RegisterInstance(std::move(new_instance), passive));

// When deploying on GPU, we want to make sure the GPU memory
// usage is within allowed range, otherwise, stop the creation
// to ensure there is sufficient GPU memory for other use. We
// check the usage after loading the instance to better
// enforce the limit. If we check before loading, we may
// create an instance that occupies the rest of the available
// memory, which defeats the purpose.
if (kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
size_t free, total;
double memory_limit;
RETURN_IF_ERROR(GetDeviceMemoryInfo(id, &free, &total));
RETURN_IF_ERROR(BackendConfigurationModelLoadGpuFraction(
backend_cmdline_config_map, id, &memory_limit));
const size_t allow = total * memory_limit;
const size_t used = total - free;
if (used > allow) {
return Status(
Status::Code::UNAVAILABLE,
std::string("can not create model '") + instance_name +
"': memory limit set for " +
TRITONSERVER_InstanceGroupKindString(kind) + " " +
std::to_string(id) +
" has exceeded, model loading is rejected.");
}
}

return Status::Success;
}));
}
catch (const std::exception& ex) {
return Status(
Status::Code::INTERNAL,
"ERROR: Failed to create instance: " + std::string(ex.what()));
}
}
}
}

return Status::Success;
auto res = Status::Success;
for (auto& cr : creation_results) {
auto lres = cr.get();
if (!lres.IsOk()) {
LOG_ERROR << "ERROR: Failed to create instance: " << lres.Message();
res = lres;
}
}
return res;
}

Status
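For context, a standalone sketch of the std::launch pattern used in SetInstances above (illustrative only; the names below are invented for the example, not Triton code): with std::launch::deferred each task runs lazily on the thread that calls get(), so draining the futures in order is effectively serial, while std::launch::async allows the tasks to run concurrently.

// Illustrative sketch only.
#include <future>
#include <iostream>
#include <vector>

int main()
{
  const bool parallel_instance_loading = true;  // stand-in for the backend attribute
  const std::launch launch_policy = parallel_instance_loading
                                        ? std::launch::async
                                        : std::launch::deferred;

  std::vector<std::future<int>> creation_results;
  for (int i = 0; i < 4; ++i) {
    // With std::launch::async this may start running on another thread now;
    // with std::launch::deferred nothing runs until get() is called below.
    creation_results.emplace_back(std::async(launch_policy, [i]() {
      return i * i;  // stand-in for the per-instance creation work
    }));
  }

  // Drain the futures serially; in Triton this is where per-instance errors
  // are collected and reported.
  for (auto& result : creation_results) {
    std::cout << result.get() << "\n";
  }
  return 0;
}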
18 changes: 12 additions & 6 deletions src/rate_limiter.cc
@@ -389,12 +389,18 @@ RateLimiter::InitializePayloadQueues(const TritonModelInstance* instance)
}
payload_queue = payload_queues_[instance->Model()].get();
}
if (payload_queue->specific_queues_.find(instance) ==
payload_queue->specific_queues_.end()) {
payload_queue->specific_queues_.emplace(
instance,
new InstanceQueue(
config.max_batch_size(), max_queue_delay_microseconds * 1000));
{
// NOTE: payload_queue can have a data race because instance->Model()
// is the same for multiple instances of the same model, so protect it
// when creating model instances in parallel.
std::lock_guard<std::mutex> lk(payload_queue->mu_);
if (payload_queue->specific_queues_.find(instance) ==
payload_queue->specific_queues_.end()) {
payload_queue->specific_queues_.emplace(
instance,
new InstanceQueue(
config.max_batch_size(), max_queue_delay_microseconds * 1000));
}
}
}

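A standalone sketch of the guarded lazy-insertion pattern used above (illustrative only; InstanceQueue and PayloadQueue here are simplified stand-ins, not the real Triton types): when several instances of the same model are initialized in parallel, first-time insertion into the shared per-model map must happen under the queue's mutex.

// Illustrative sketch only.
#include <map>
#include <memory>
#include <mutex>

struct InstanceQueue {};  // placeholder for the real per-instance queue

struct PayloadQueue {
  std::mutex mu_;
  std::map<const void*, std::unique_ptr<InstanceQueue>> specific_queues_;
};

// Safe to call concurrently for instances that share the same PayloadQueue.
void AddSpecificQueue(PayloadQueue* payload_queue, const void* instance)
{
  std::lock_guard<std::mutex> lk(payload_queue->mu_);
  if (payload_queue->specific_queues_.find(instance) ==
      payload_queue->specific_queues_.end()) {
    payload_queue->specific_queues_.emplace(
        instance, std::make_unique<InstanceQueue>());
  }
}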
5 changes: 5 additions & 0 deletions src/tritonserver_stub.cc
@@ -1015,6 +1015,11 @@ TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup()
{
}

TRITONAPI_DECLSPEC void
TRITONBACKEND_BackendAttributeSetParallelInstanceLoading()
{
}

TRITONAPI_DECLSPEC void
TRITONCACHE_ApiVersion()
{