From 4de7df1bb171ea8d111b6b2dd87c16f2d51dbf53 Mon Sep 17 00:00:00 2001 From: Michal Moskal Date: Fri, 3 Jan 2025 17:51:50 +0000 Subject: [PATCH] enable automatic batch size and max tokens tuning --- llgtrt/src/config.rs | 10 ++++++++++ llgtrt/src/startup.rs | 2 ++ trtllm-c/main.cpp | 11 ++++++----- trtllm-c/tlc.h | 4 ++++ 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/llgtrt/src/config.rs b/llgtrt/src/config.rs index 0b97a2e..a2660f4 100644 --- a/llgtrt/src/config.rs +++ b/llgtrt/src/config.rs @@ -39,6 +39,14 @@ pub struct TrtLlmRuntimeConfig { /// Host memory to use for KV cache pub kv_cache_host_memory_megabytes: usize, + + /// Control automatic tuning of batch size + /// Defaults to true (unlike trtllm) + pub enable_batch_size_tuning: bool, + + /// Control automatic tuning of max num tokens + /// Defaults to true (unlike trtllm) + pub enable_max_num_tokens_tuning: bool, } impl Default for TrtLlmRuntimeConfig { @@ -52,6 +60,8 @@ impl Default for TrtLlmRuntimeConfig { enable_kv_cache_reuse: true, kv_cache_free_gpu_mem_fraction: 0.9, kv_cache_host_memory_megabytes: 0, + enable_batch_size_tuning: true, + enable_max_num_tokens_tuning: true, } } } diff --git a/llgtrt/src/startup.rs b/llgtrt/src/startup.rs index 6841219..0dcdcdd 100644 --- a/llgtrt/src/startup.rs +++ b/llgtrt/src/startup.rs @@ -135,6 +135,8 @@ pub async fn run_server(mut cli_config: CliConfig) -> anyhow::Result<()> { set_field!(enable_chunked_context); set_field!(enable_kv_cache_reuse); + set_field!(enable_batch_size_tuning); + set_field!(enable_max_num_tokens_tuning); set_field!(max_batch_size); set_field!(max_num_tokens); set_field!(max_queue_size); diff --git a/trtllm-c/main.cpp b/trtllm-c/main.cpp index 829c1ab..bb4bbba 100644 --- a/trtllm-c/main.cpp +++ b/trtllm-c/main.cpp @@ -92,11 +92,12 @@ TlcStatus tlc_init(TlcInitParams const* params, TlcExecutor** res) : std::nullopt, ep->kv_cache_host_memory_bytes, ep->kv_cache_onboard_blocks); - auto schedulerConfig - = 
tle::SchedulerConfig(ep->guaranteed_no_evict ? tle::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT - : tle::CapacitySchedulerPolicy::kMAX_UTILIZATION - // tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED // default? - ); + tle::DynamicBatchConfig dynamicBatchConfig(ep->enable_batch_size_tuning, ep->enable_max_num_tokens_tuning); + + auto policy = ep->guaranteed_no_evict ? tle::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT + : tle::CapacitySchedulerPolicy::kMAX_UTILIZATION; + auto chunking = tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED; // default? + auto schedulerConfig = tle::SchedulerConfig(policy, chunking, dynamicBatchConfig); executorConfig.setKvCacheConfig(kvConfig); executorConfig.setSchedulerConfig(schedulerConfig); diff --git a/trtllm-c/tlc.h b/trtllm-c/tlc.h index 579f23a..a1b1f7c 100644 --- a/trtllm-c/tlc.h +++ b/trtllm-c/tlc.h @@ -75,6 +75,10 @@ extern "C" int32_t sink_token_length; // defaults to false (prefix caching) bool enable_kv_cache_reuse; + + // both default to false in trtllm; llgtrt's TrtLlmRuntimeConfig sets them to true + bool enable_batch_size_tuning; + bool enable_max_num_tokens_tuning; } TlcEngineParams; typedef struct