From b0670db34f7a9ef7087ccc0247699383df6a08a0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 29 Sep 2023 15:47:21 +0300 Subject: [PATCH 1/4] llama : fix session saving/loading --- examples/chat-persistent.sh | 8 ++-- llama.cpp | 93 ++++++++++++++++++++++++------------- llama.h | 2 +- 3 files changed, 65 insertions(+), 38 deletions(-) diff --git a/examples/chat-persistent.sh b/examples/chat-persistent.sh index e0c251e5b03cc..22f5b83d3da06 100755 --- a/examples/chat-persistent.sh +++ b/examples/chat-persistent.sh @@ -9,7 +9,7 @@ if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then exit 1 fi -MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}" +MODEL="${MODEL:-./models/llama-13b/ggml-model-q4_0.gguf}" PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}" USER_NAME="${USER_NAME:-User}" AI_NAME="${AI_NAME:-ChatLLaMa}" @@ -61,9 +61,9 @@ fi if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then echo 'Prompt cache does not exist, building...' - # Default batch_size to 8 here for better user feedback during initial prompt processing + # Default batch_size to 64 here for better user feedback during initial prompt processing ./main 2>>"$LOG" \ - --batch_size 8 \ + --batch_size 64 \ "${OPTS[@]}" \ --prompt-cache "$PROMPT_CACHE_FILE" \ --file "$CUR_PROMPT_FILE" \ @@ -132,7 +132,7 @@ while read -e line; do # HACK get num tokens from debug message # TODO get both messages in one go if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" || - ! sample_time_msg="$( tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then + ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then echo >&2 "Couldn't get number of tokens from ./main output!" exit 1 fi diff --git a/llama.cpp b/llama.cpp index 666acc2127532..37c517028b16f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7044,16 +7044,6 @@ struct llama_data_file_context : llama_data_context { * */ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { - // TODO: does not support multi-sequence states - { - const auto & kv_self = ctx->kv_self; - for (uint32_t i = 0; i < kv_self.head; ++i) { - GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i); - GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1); - GGML_ASSERT(kv_self.cells[i].has_seq_id(0)); - } - } - // copy rng { std::stringstream rng_ss; @@ -7106,36 +7096,38 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const auto & hparams = ctx->model.hparams; const auto & cparams = ctx->cparams; - const int n_layer = hparams.n_layer; - const int n_embd = hparams.n_embd_gqa(); - const int n_ctx = cparams.n_ctx; + const auto n_layer = hparams.n_layer; + const auto n_embd = hparams.n_embd_gqa(); + const auto n_ctx = cparams.n_ctx; - const size_t kv_size = kv_self.buf.size; - const int kv_ntok = kv_self.head; + const size_t kv_buf_size = kv_self.buf.size; + const uint32_t kv_head = kv_self.head; + const uint32_t kv_size = kv_self.size; - data_ctx->write(&kv_size, sizeof(kv_size)); - data_ctx->write(&kv_ntok, sizeof(kv_ntok)); + data_ctx->write(&kv_buf_size, sizeof(kv_buf_size)); + data_ctx->write(&kv_head, sizeof(kv_head)); + data_ctx->write(&kv_size, sizeof(kv_size)); - if (kv_size) { + if (kv_buf_size) { const size_t elt_size = ggml_element_size(kv_self.k); ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); ggml_cgraph gf{}; - ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); + ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer); std::vector kout3d_data(ggml_nbytes(kout3d), 0); kout3d->data = kout3d_data.data(); - ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); + ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer); std::vector vout3d_data(ggml_nbytes(vout3d), 0); vout3d->data = vout3d_data.data(); ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, - n_embd, kv_ntok, n_layer, + n_embd, kv_head, n_layer, elt_size*n_embd, elt_size*n_embd*n_ctx, 0); ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, - kv_ntok, n_embd, n_layer, + kv_head, n_embd, n_layer, elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); @@ -7149,6 +7141,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat data_ctx->write(kout3d_data.data(), kout3d_data.size()); data_ctx->write(vout3d_data.data(), vout3d_data.size()); } + + for (uint32_t i = 0; i < kv_size; ++i) { + const auto & cell = kv_self.cells[i]; + + const llama_pos pos = cell.pos; + const size_t seq_id_size = cell.seq_id.size(); + + data_ctx->write(&pos, sizeof(pos)); + data_ctx->write(&seq_id_size, sizeof(seq_id_size)); + + for (auto seq_id : cell.seq_id) { + data_ctx->write(&seq_id, sizeof(seq_id)); + } + } } } @@ -7220,34 +7226,36 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { const int n_embd = hparams.n_embd_gqa(); const int n_ctx = cparams.n_ctx; - size_t kv_size; - int kv_ntok; + size_t kv_buf_size; + uint32_t kv_head; + uint32_t kv_size; - memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size); - memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok); + memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size); + memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head); + memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size); - if (kv_size) { - GGML_ASSERT(kv_self.buf.size == kv_size); + if (kv_buf_size) { + GGML_ASSERT(kv_self.buf.size == kv_buf_size); const size_t elt_size = ggml_element_size(kv_self.k); ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); ggml_cgraph gf{}; - ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); + ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer); kin3d->data = (void *) inp; inp += ggml_nbytes(kin3d); - ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); + ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer); vin3d->data = (void *) inp; inp += ggml_nbytes(vin3d); ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, - n_embd, kv_ntok, n_layer, + n_embd, kv_head, n_layer, elt_size*n_embd, elt_size*n_embd*n_ctx, 0); ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, - kv_ntok, n_embd, n_layer, + kv_head, n_embd, n_layer, elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); @@ -7257,8 +7265,27 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_free(cpy_ctx); } - ctx->kv_self.head = kv_ntok; + ctx->kv_self.head = kv_head; ctx->kv_self.size = kv_size; + + ctx->kv_self.cells.resize(kv_size); + + for (uint32_t i = 0; i < kv_size; ++i) { + llama_pos pos; + size_t seq_id_size; + + memcpy(&pos, inp, sizeof(pos)); inp += sizeof(pos); + memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size); + + ctx->kv_self.cells[i].pos = pos; + + llama_seq_id seq_id; + + for (size_t j = 0; j < seq_id_size; ++j) { + memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id); + ctx->kv_self.cells[i].seq_id.insert(seq_id); + } + } } const size_t nread = inp - src; diff --git a/llama.h b/llama.h index 96ff1f09c76db..a5190a8a5ad41 100644 --- a/llama.h +++ b/llama.h @@ -42,7 +42,7 @@ #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN -#define LLAMA_SESSION_VERSION 1 +#define LLAMA_SESSION_VERSION 2 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. From 0f332a91042dd11f565bc5d04ae68727a72a41aa Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 2 Oct 2023 16:42:14 +0300 Subject: [PATCH 2/4] llama : temp fix for clearing "future" tokens from the KV cache --- llama.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/llama.cpp b/llama.cpp index f2f89ee1eb733..c8fb977027174 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7478,6 +7478,25 @@ void llama_batch_free(struct llama_batch batch) { int llama_decode( struct llama_context * ctx, struct llama_batch batch) { + // TODO: temporary solution to auto clear "future" tokens from the cache + // ref: https://github.com/ggerganov/llama.cpp/pull/3400 + if (batch.pos) { + std::map seq_min_pos; + for (int i = 0; i < batch.n_tokens; i++) { + if (seq_min_pos.count(batch.seq_id[i]) == 0) { + seq_min_pos[batch.seq_id[i]] = batch.pos[i]; + } else { + seq_min_pos[batch.seq_id[i]] = std::min(seq_min_pos[batch.seq_id[i]], batch.pos[i]); + } + } + + for (auto & kv : seq_min_pos) { + llama_kv_cache_seq_rm(ctx->kv_self, kv.first, kv.second, ctx->cparams.n_ctx); + } + } else { + llama_kv_cache_seq_rm(ctx->kv_self, batch.all_seq_id, batch.all_pos_0, ctx->cparams.n_ctx); + } + const int ret = llama_decode_internal(*ctx, batch); if (ret < 0) { LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); From 337120cc0d933bd6870bb7acfd880fade2d26b6c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 3 Oct 2023 18:29:22 +0300 Subject: [PATCH 3/4] llama : fix handling of "future" tokens when loading sessions --- examples/main/main.cpp | 3 ++ examples/parallel/parallel.cpp | 2 +- examples/server/server.cpp | 2 +- examples/speculative/speculative.cpp | 6 +-- llama.cpp | 60 ++++++++++++---------------- llama.h | 8 ++++ 6 files changed, 41 insertions(+), 40 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 3a4ed3f7814f8..7367ae3625ce2 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -543,6 +543,9 @@ int main(int argc, char ** argv) { if (i > 0) { embd.erase(embd.begin(), embd.begin() + i); } + + // remove any "future" tokens that we might have inherited from the session from the KV cache + llama_kv_cache_tokens_rm(ctx, n_past, -1); } // evaluate tokens in batches diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 0434ded234b18..ffd7b1db4abdd 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -332,7 +332,7 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. keep the system prompt in the cache - llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, n_ctx); + llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1); const auto t_main_end = ggml_time_us(); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6dda5e36b7438..921eb5da4812d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -448,7 +448,7 @@ struct llama_server_context n_past = common_part(embd, prompt_tokens); // since #3228 we now have to manually manage the KV cache - llama_kv_cache_seq_rm(ctx, 0, n_past, params.n_ctx); + llama_kv_cache_seq_rm(ctx, 0, n_past, -1); embd = prompt_tokens; if (n_past == num_prompt_tokens) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index c5e5b234f0f5c..75a2e5e22d046 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -172,7 +172,7 @@ int main(int argc, char ** argv) { LOG("out of drafted tokens\n"); } - llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, n_ctx); + llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0)); ++n_past_dft; @@ -257,7 +257,7 @@ int main(int argc, char ** argv) { } // evaluate the drafted token on the draft model - llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, n_ctx); + llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, -1); llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0)); ++n_past_cur; @@ -267,7 +267,7 @@ int main(int argc, char ** argv) { } // evaluate the target model on the drafted tokens - llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_ctx); + llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, -1); llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0)); ++n_past_tgt; diff --git a/llama.cpp b/llama.cpp index c8fb977027174..c9d5fc37aa606 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1281,8 +1281,8 @@ static bool llama_kv_cache_init( // find an empty slot of size "n_tokens" in the cache // updates the cache head static bool llama_kv_cache_find_slot( - struct llama_kv_cache & cache, - const struct llama_batch & batch) { + struct llama_kv_cache & cache, + const struct llama_batch & batch) { const uint32_t n_ctx = cache.size; const uint32_t n_tokens = batch.n_tokens; @@ -1350,10 +1350,13 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, } static void llama_kv_cache_seq_rm( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1) { + struct llama_kv_cache & cache, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + if (p0 < 0) p0 = 0; + if (p1 < 0) p1 = std::numeric_limits::max(); + for (uint32_t i = 0; i < cache.size; ++i) { if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { cache.cells[i].seq_id.erase(seq_id); @@ -1365,11 +1368,14 @@ static void llama_kv_cache_seq_rm( } static void llama_kv_cache_seq_cp( - struct llama_kv_cache & cache, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { + struct llama_kv_cache & cache, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + if (p0 < 0) p0 = 0; + if (p1 < 0) p1 = std::numeric_limits::max(); + for (uint32_t i = 0; i < cache.size; ++i) { if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { cache.cells[i].seq_id.insert(seq_id_dst); @@ -1387,11 +1393,14 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id } static void llama_kv_cache_seq_shift( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta) { + struct llama_kv_cache & cache, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + if (p0 < 0) p0 = 0; + if (p1 < 0) p1 = std::numeric_limits::max(); + for (uint32_t i = 0; i < cache.size; ++i) { if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { cache.cells[i].pos += delta; @@ -7478,25 +7487,6 @@ void llama_batch_free(struct llama_batch batch) { int llama_decode( struct llama_context * ctx, struct llama_batch batch) { - // TODO: temporary solution to auto clear "future" tokens from the cache - // ref: https://github.com/ggerganov/llama.cpp/pull/3400 - if (batch.pos) { - std::map seq_min_pos; - for (int i = 0; i < batch.n_tokens; i++) { - if (seq_min_pos.count(batch.seq_id[i]) == 0) { - seq_min_pos[batch.seq_id[i]] = batch.pos[i]; - } else { - seq_min_pos[batch.seq_id[i]] = std::min(seq_min_pos[batch.seq_id[i]], batch.pos[i]); - } - } - - for (auto & kv : seq_min_pos) { - llama_kv_cache_seq_rm(ctx->kv_self, kv.first, kv.second, ctx->cparams.n_ctx); - } - } else { - llama_kv_cache_seq_rm(ctx->kv_self, batch.all_seq_id, batch.all_pos_0, ctx->cparams.n_ctx); - } - const int ret = llama_decode_internal(*ctx, batch); if (ret < 0) { LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); diff --git a/llama.h b/llama.h index 4564d570eff4c..c4333d652873e 100644 --- a/llama.h +++ b/llama.h @@ -330,12 +330,16 @@ extern "C" { "avoid using this, it will be removed in the future, instead - count the tokens in user code"); // Remove all tokens data of cells in [c0, c1) + // c0 < -1 : [0, c1] + // c1 < -1 : [c0, inf) LLAMA_API void llama_kv_cache_tokens_rm( struct llama_context * ctx, int32_t c0, int32_t c1); // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) + // p0 < -1 : [0, p1] + // p1 < -1 : [p0, inf) LLAMA_API void llama_kv_cache_seq_rm( struct llama_context * ctx, llama_seq_id seq_id, @@ -344,6 +348,8 @@ extern "C" { // Copy all tokens that belong to the specified sequence to another sequence // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence + // p0 < -1 : [0, p1] + // p1 < -1 : [p0, inf) LLAMA_API void llama_kv_cache_seq_cp( struct llama_context * ctx, llama_seq_id seq_id_src, @@ -358,6 +364,8 @@ extern "C" { // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) // If the KV cache is RoPEd, the KV data is updated accordingly + // p0 < -1 : [0, p1] + // p1 < -1 : [p0, inf) LLAMA_API void llama_kv_cache_seq_shift( struct llama_context * ctx, llama_seq_id seq_id, From 5418932b718033f12ec75cbdedf0be1d2c4deaf2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 3 Oct 2023 21:01:45 +0300 Subject: [PATCH 4/4] llama : fix comments for llama_kv_cache API --- llama.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llama.h b/llama.h index c4333d652873e..cf52cb6d6081a 100644 --- a/llama.h +++ b/llama.h @@ -330,16 +330,16 @@ extern "C" { "avoid using this, it will be removed in the future, instead - count the tokens in user code"); // Remove all tokens data of cells in [c0, c1) - // c0 < -1 : [0, c1] - // c1 < -1 : [c0, inf) + // c0 < 0 : [0, c1] + // c1 < 0 : [c0, inf) LLAMA_API void llama_kv_cache_tokens_rm( struct llama_context * ctx, int32_t c0, int32_t c1); // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) - // p0 < -1 : [0, p1] - // p1 < -1 : [p0, inf) + // p0 < 0 : [0, p1] + // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_rm( struct llama_context * ctx, llama_seq_id seq_id, @@ -348,8 +348,8 @@ extern "C" { // Copy all tokens that belong to the specified sequence to another sequence // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence - // p0 < -1 : [0, p1] - // p1 < -1 : [p0, inf) + // p0 < 0 : [0, p1] + // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_cp( struct llama_context * ctx, llama_seq_id seq_id_src, @@ -364,8 +364,8 @@ extern "C" { // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) // If the KV cache is RoPEd, the KV data is updated accordingly - // p0 < -1 : [0, p1] - // p1 < -1 : [p0, inf) + // p0 < 0 : [0, p1] + // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_shift( struct llama_context * ctx, llama_seq_id seq_id,