From 8cfadd38997c9b3f0c1778bab87ca40eae2d07fe Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Fri, 3 Jan 2025 13:13:17 +0700
Subject: [PATCH 1/3] fix: add ctx_shift parameter (#357)

* fix: add ctx_shift parameter

* chore: readme

---------

Co-authored-by: vansangpfiev
---
 README.md           |  1 +
 src/llama_engine.cc | 25 ++++++++++---------------
 2 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index ce67774..284ce52 100644
--- a/README.md
+++ b/README.md
@@ -148,3 +148,4 @@ Table of parameters
 |`flash_attn` | Boolean| To enable Flash Attention, default is true|
 |`cache_type` | String| KV cache type: f16, q8_0, q4_0, default is f16|
 |`use_mmap` | Boolean| To enable mmap, default is true|
+|`ctx_shift` | Boolean| To enable context shift, default is true|
diff --git a/src/llama_engine.cc b/src/llama_engine.cc
index 5560645..762d7e7 100644
--- a/src/llama_engine.cc
+++ b/src/llama_engine.cc
@@ -270,24 +270,18 @@ std::string CreateReturnJson(const std::string& id, const std::string& model,
 }
 
 const std::vector<ggml_type> kv_cache_types = {
-    GGML_TYPE_F32,
-    GGML_TYPE_F16,
-    GGML_TYPE_BF16,
-    GGML_TYPE_Q8_0,
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q4_1,
-    GGML_TYPE_IQ4_NL,
-    GGML_TYPE_Q5_0,
-    GGML_TYPE_Q5_1,
+    GGML_TYPE_F32,    GGML_TYPE_F16,  GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,   GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
 };
 
-ggml_type kv_cache_type_from_str(const std::string & s) {
-    for (const auto & type : kv_cache_types) {
-        if (ggml_type_name(type) == s) {
-            return type;
-        }
+ggml_type kv_cache_type_from_str(const std::string& s) {
+  for (const auto& type : kv_cache_types) {
+    if (ggml_type_name(type) == s) {
+      return type;
     }
-    throw std::runtime_error("Unsupported cache type: " + s);
+  }
+  throw std::runtime_error("Unsupported cache type: " + s);
 }
 
 }  // namespace
@@ -611,6 +605,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
     }
   }
 
+  params.ctx_shift = json_body->get("ctx_shift", true).asBool();
   params.n_gpu_layers =
       json_body->get("ngl", 300)
           .asInt();  // change from 100 -> 300 since llama 3.1 has 292 gpu layers

From 234143024c04aaf76545b41c8850ac695b5e8a77 Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Fri, 3 Jan 2025 13:22:55 +0700
Subject: [PATCH 2/3] chore: down log level (#358)

Co-authored-by: vansangpfiev
---
 src/llama_engine.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama_engine.cc b/src/llama_engine.cc
index 762d7e7..b967b71 100644
--- a/src/llama_engine.cc
+++ b/src/llama_engine.cc
@@ -287,7 +287,7 @@ ggml_type kv_cache_type_from_str(const std::string& s) {
 }  // namespace
 
 void LlamaEngine::Load(EngineLoadOption opts) {
-  LOG_INFO << "Loading engine..";
+  LOG_DEBUG << "Loading engine..";
 
   LOG_DEBUG << "Is custom engine path: " << opts.is_custom_engine_path;
   LOG_DEBUG << "Engine path: " << opts.engine_path.string();

From 44412ee83a7d017353db41e0baeda03f4226235f Mon Sep 17 00:00:00 2001
From: jan-service-account <136811300+jan-service-account@users.noreply.github.com>
Date: Fri, 3 Jan 2025 14:12:00 +0700
Subject: [PATCH 3/3] Update submodule to latest release b4406 (#356)

Co-authored-by: github-actions[bot]
---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 0827b2c..0da5d86 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit 0827b2c1da299805288abbd556d869318f2b121e
+Subproject commit 0da5d860266c6928b8c9408efbd264ae59fedda6
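
Note on PATCH 1/3: the new `ctx_shift` field is read from the JSON request body with a default of `true`, matching the README row added above. Below is a minimal, self-contained sketch of that parsing pattern; it assumes jsoncpp is available, and the request body and `main()` wrapper are hypothetical illustrations, not part of the engine.

```cpp
// Sketch only: parses a hypothetical load-model request the way the patch does,
// using jsoncpp's Json::Value::get(key, default) so missing keys fall back to defaults.
#include <json/json.h>

#include <iostream>
#include <sstream>
#include <string>

int main() {
  // Hypothetical request body; only ctx_shift is new in this patch series.
  const std::string request = R"({
    "ngl": 300,
    "cache_type": "q8_0",
    "ctx_shift": false
  })";

  Json::CharReaderBuilder builder;
  Json::Value body;
  std::string errs;
  std::istringstream iss(request);
  if (!Json::parseFromStream(builder, iss, &body, &errs)) {
    std::cerr << "parse error: " << errs << '\n';
    return 1;
  }

  // Same defaulting pattern as LoadModelImpl in the patch: absent keys take the
  // documented defaults (ctx_shift=true, ngl=300, cache_type="f16").
  const bool ctx_shift = body.get("ctx_shift", true).asBool();
  const int ngl = body.get("ngl", 300).asInt();
  const std::string cache_type = body.get("cache_type", "f16").asString();

  std::cout << "ctx_shift=" << std::boolalpha << ctx_shift << " ngl=" << ngl
            << " cache_type=" << cache_type << '\n';
  return 0;
}
```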
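
For the KV-cache portion of the same patch, `kv_cache_type_from_str` is only reformatted: it still scans the supported ggml types and throws on anything else. The sketch below illustrates that lookup pattern with plain strings standing in for the ggml enum so it compiles without ggml; the real function returns a `ggml_type` and compares against `ggml_type_name`, as shown in the diff.

```cpp
// Sketch of the lookup pattern: linear scan over the supported cache-type names,
// throw on anything unsupported. Plain strings stand in for ggml_type here.
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

namespace {
const std::vector<std::string> kSupportedCacheTypes = {
    "f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "iq4_nl", "q5_0", "q5_1",
};

std::string CacheTypeFromStr(const std::string& s) {
  for (const auto& type : kSupportedCacheTypes) {
    if (type == s) {
      return type;
    }
  }
  throw std::runtime_error("Unsupported cache type: " + s);
}
}  // namespace

int main() {
  std::cout << CacheTypeFromStr("q8_0") << '\n';  // prints q8_0
  try {
    CacheTypeFromStr("q2_k");  // not in the list above
  } catch (const std::runtime_error& e) {
    std::cout << e.what() << '\n';  // Unsupported cache type: q2_k
  }
  return 0;
}
```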