diff --git a/README.md b/README.md
index ce67774..284ce52 100644
--- a/README.md
+++ b/README.md
@@ -148,3 +148,4 @@ Table of parameters
 |`flash_attn` | Boolean| To enable Flash Attention, default is true|
 |`cache_type` | String| KV cache type: f16, q8_0, q4_0, default is f16|
 |`use_mmap` | Boolean| To enable mmap, default is true|
+|`ctx_shift` | Boolean| To enable context shift, default is true|
diff --git a/llama.cpp b/llama.cpp
index 0827b2c..0da5d86 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit 0827b2c1da299805288abbd556d869318f2b121e
+Subproject commit 0da5d860266c6928b8c9408efbd264ae59fedda6
diff --git a/src/llama_engine.cc b/src/llama_engine.cc
index 05393eb..43d6fe3 100644
--- a/src/llama_engine.cc
+++ b/src/llama_engine.cc
@@ -712,6 +712,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
     }
   }
 
+  params.ctx_shift = json_body->get("ctx_shift", true).asBool();
   params.n_gpu_layers =
       json_body->get("ngl", 300)
           .asInt();  // change from 100 -> 300 since llama 3.1 has 292 gpu layers
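
For context, here is a minimal standalone sketch (not part of the diff above) of how the new `ctx_shift` flag behaves when read with jsoncpp's `get`-with-default, as `LoadModelImpl` does: if the request body omits the field, context shift stays enabled (default `true`), and a client has to send `"ctx_shift": false` to opt out. The `main` wrapper and printed output are illustrative only.

```cpp
// Sketch only: mirrors the jsoncpp parsing used in LoadModelImpl for "ctx_shift".
#include <json/json.h>
#include <iostream>

int main() {
  Json::Value body;  // request body with "ctx_shift" omitted
  bool ctx_shift = body.get("ctx_shift", true).asBool();
  std::cout << ctx_shift << "\n";  // prints 1: context shift defaults to enabled

  body["ctx_shift"] = false;  // client explicitly disables context shift
  ctx_shift = body.get("ctx_shift", true).asBool();
  std::cout << ctx_shift << "\n";  // prints 0: context shift disabled
  return 0;
}
```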