From 1b98693326cf164bf8d82d88a86927d6aa5ba0fc Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Thu, 6 Jun 2024 09:33:07 +0700
Subject: [PATCH] feat: enable caching by default

---
 README.md           | 3 ++-
 src/llama_engine.cc | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index e346674..57e2b1a 100644
--- a/README.md
+++ b/README.md
@@ -145,4 +145,5 @@ Table of parameters
 |`model_type` | String | Model type we want to use: llm or embedding, default value is llm|
 |`model_alias`| String | Used as model_id if specified in request, mandatory in loadmodel|
 |`model` | String | Used as model_id if specified in request, mandatory in chat/embedding request|
-|`flash_attn` | Boolean| To enable Flash Attention, default is false|
\ No newline at end of file
+|`flash_attn` | Boolean| To enable Flash Attention, default is false|
+|`cache_type` | String| KV cache type: f16, q8_0, q4_0, default is f16|
\ No newline at end of file
diff --git a/src/llama_engine.cc b/src/llama_engine.cc
index 4c89379..0bac1cd 100644
--- a/src/llama_engine.cc
+++ b/src/llama_engine.cc
@@ -374,7 +374,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
   }
 
   server_map_[model_id].caching_enabled =
-      jsonBody->get("caching_enabled", false).asBool();
+      jsonBody->get("caching_enabled", true).asBool();
   server_map_[model_id].user_prompt =
       jsonBody->get("user_prompt", "USER: ").asString();
   server_map_[model_id].ai_prompt =
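
Illustrative note (not part of the patch): the sketch below shows, under the assumption that the loadmodel body is parsed with jsoncpp as in the hunk above, what the flipped default means for clients: a request body that omits "caching_enabled" now resolves to true, so callers must send "caching_enabled": false to opt out. The "f16" fallback for "cache_type" mirrors the README row added above; the standalone main() scaffolding and field values are hypothetical.

// Minimal, self-contained sketch using jsoncpp (Json::Value::get with a
// default value), mirroring the pattern changed in LoadModelImpl. The
// surrounding program is illustrative only, not engine code.
#include <iostream>
#include <memory>
#include <json/json.h>

int main() {
  auto json_body = std::make_shared<Json::Value>();
  (*json_body)["model"] = "example-model";  // "caching_enabled" omitted

  // With this patch the fallback is true: caching stays on unless the
  // request explicitly carries "caching_enabled": false.
  bool caching_enabled = json_body->get("caching_enabled", true).asBool();

  // Assumed default for the new README parameter; the engine-side handling
  // of "cache_type" is not shown in this diff.
  std::string cache_type = json_body->get("cache_type", "f16").asString();

  std::cout << "caching_enabled=" << std::boolalpha << caching_enabled
            << " cache_type=" << cache_type << std::endl;
  return 0;
}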