From 1b98693326cf164bf8d82d88a86927d6aa5ba0fc Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Thu, 6 Jun 2024 09:33:07 +0700
Subject: [PATCH] feat: enable caching by default

---
 README.md           | 3 ++-
 src/llama_engine.cc | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index e346674..57e2b1a 100644
--- a/README.md
+++ b/README.md
@@ -145,4 +145,5 @@ Table of parameters
 |`model_type` | String | Model type we want to use: llm or embedding, default value is llm|
 |`model_alias`| String | Used as model_id if specified in request, mandatory in loadmodel|
 |`model` | String | Used as model_id if specified in request, mandatory in chat/embedding request|
-|`flash_attn` | Boolean| To enable Flash Attention, default is false|
\ No newline at end of file
+|`flash_attn` | Boolean| To enable Flash Attention, default is false|
+|`cache_type` | String| KV cache type: f16, q8_0, q4_0, default is f16|
\ No newline at end of file
diff --git a/src/llama_engine.cc b/src/llama_engine.cc
index 4c89379..0bac1cd 100644
--- a/src/llama_engine.cc
+++ b/src/llama_engine.cc
@@ -374,7 +374,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
   }
 
   server_map_[model_id].caching_enabled =
-      jsonBody->get("caching_enabled", false).asBool();
+      jsonBody->get("caching_enabled", true).asBool();
   server_map_[model_id].user_prompt =
       jsonBody->get("user_prompt", "USER: ").asString();
   server_map_[model_id].ai_prompt =
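
Illustrative note (not part of the patch): the sketch below shows, under the assumption that the loadmodel body is parsed with jsoncpp as in the hunk above, what the flipped default means for clients: a request body that omits "caching_enabled" now resolves to true, so callers must send "caching_enabled": false to opt out. The "f16" fallback for "cache_type" mirrors the README row added above; the standalone main() scaffolding and field values are hypothetical.

// Minimal, self-contained sketch using jsoncpp (Json::Value::get with a
// default value), mirroring the pattern changed in LoadModelImpl. The
// surrounding program is illustrative only, not engine code.
#include <iostream>
#include <memory>
#include <json/json.h>

int main() {
  auto json_body = std::make_shared<Json::Value>();
  (*json_body)["model"] = "example-model";  // "caching_enabled" omitted

  // With this patch the fallback is true: caching stays on unless the
  // request explicitly carries "caching_enabled": false.
  bool caching_enabled = json_body->get("caching_enabled", true).asBool();

  // Assumed default for the new README parameter; the engine-side handling
  // of "cache_type" is not shown in this diff.
  std::string cache_type = json_body->get("cache_type", "f16").asString();

  std::cout << "caching_enabled=" << std::boolalpha << caching_enabled
            << " cache_type=" << cache_type << std::endl;
  return 0;
}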