From b7496dea9be938c7d51b9ffbc1f07246481db66b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 11 Sep 2024 09:15:30 +0200 Subject: [PATCH 01/11] chore(deps): Bump yarl from 1.9.7 to 1.11.0 in /examples/langchain/langchainpy-localai-example (#3501) chore(deps): Bump yarl Bumps [yarl](https://github.com/aio-libs/yarl) from 1.9.7 to 1.11.0. - [Release notes](https://github.com/aio-libs/yarl/releases) - [Changelog](https://github.com/aio-libs/yarl/blob/master/CHANGES.rst) - [Commits](https://github.com/aio-libs/yarl/compare/v1.9.7...v1.11.0) --- updated-dependencies: - dependency-name: yarl dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- examples/langchain/langchainpy-localai-example/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/langchain/langchainpy-localai-example/requirements.txt b/examples/langchain/langchainpy-localai-example/requirements.txt index 0c7be91791ff..753230059bcd 100644 --- a/examples/langchain/langchainpy-localai-example/requirements.txt +++ b/examples/langchain/langchainpy-localai-example/requirements.txt @@ -30,4 +30,4 @@ tqdm==4.66.5 typing-inspect==0.9.0 typing_extensions==4.12.2 urllib3==2.2.2 -yarl==1.9.7 +yarl==1.11.0 From a7ac2f7bb0800b13b2cb2f305528ead6db9fd695 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 11 Sep 2024 09:15:52 +0200 Subject: [PATCH 02/11] chore(deps): Bump llama-index from 0.11.4 to 0.11.7 in /examples/chainlit (#3516) chore(deps): Bump llama-index in /examples/chainlit Bumps [llama-index](https://github.com/run-llama/llama_index) from 0.11.4 to 0.11.7. - [Release notes](https://github.com/run-llama/llama_index/releases) - [Changelog](https://github.com/run-llama/llama_index/blob/main/CHANGELOG.md) - [Commits](https://github.com/run-llama/llama_index/compare/v0.11.4...v0.11.7) --- updated-dependencies: - dependency-name: llama-index dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- examples/chainlit/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/chainlit/requirements.txt b/examples/chainlit/requirements.txt index 8654ea9959a5..69212e28e102 100644 --- a/examples/chainlit/requirements.txt +++ b/examples/chainlit/requirements.txt @@ -1,4 +1,4 @@ -llama_index==0.11.4 +llama_index==0.11.7 requests==2.32.3 weaviate_client==4.6.7 transformers From e35d8169b1cc1848cc339cb2525b3e6d42a1393b Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Thu, 12 Sep 2024 08:52:27 +0200 Subject: [PATCH 03/11] chore: :arrow_up: Update ggerganov/whisper.cpp to `a551933542d956ae84634937acd2942eb40efaaf` (#3534) :arrow_up: Update ggerganov/whisper.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index fe05dc1a5741..e0e15cfb1c4f 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp -WHISPER_CPP_VERSION?=5caa19240d55bfd6ee316d50fbad32c6e9c39528 +WHISPER_CPP_VERSION?=a551933542d956ae84634937acd2942eb40efaaf # bert.cpp version BERT_REPO?=https://github.com/go-skynet/go-bert.cpp From d51444d606e1c616c396e37d7413a8a562714cb6 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 12 Sep 2024 20:55:27 +0200 Subject: [PATCH 04/11] chore(deps): update llama.cpp (#3497) * Apply llava patch Signed-off-by: Ettore Di Giacinto --- Makefile | 2 +- backend/cpp/llama/grpc-server.cpp | 107 +++++++++++------------ backend/cpp/llama/patches/01-llava.patch | 13 +++ backend/cpp/llama/prepare.sh | 7 ++ backend/cpp/llama/utils.hpp | 27 ------ 5 files changed, 70 insertions(+), 86 deletions(-) create mode 100644 backend/cpp/llama/patches/01-llava.patch diff --git a/Makefile b/Makefile index e0e15cfb1c4f..3d9ea5921d6b 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=815b1fb20a53e439882171757825bacb1350de04 +CPPLLAMA_VERSION?=e6b7801bd189d102d901d3e72035611a25456ef1 # go-rwkv version RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index e1b6f868b2e8..a46b4ee0a335 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -17,11 +17,10 @@ #include "common.h" #include "json.hpp" #include "llama.h" -#include "grammar-parser.h" #include "backend.pb.h" #include "backend.grpc.pb.h" #include "utils.hpp" - +#include "sampling.h" // include std::regex #include #include @@ -203,8 +202,8 @@ struct llama_client_slot std::string stopping_word; // sampling - struct llama_sampling_params sparams; - llama_sampling_context *ctx_sampling = nullptr; + struct gpt_sampler_params sparams; + gpt_sampler *ctx_sampling = nullptr; int32_t ga_i = 0; // group-attention state int32_t ga_n = 1; // group-attention factor @@ -619,7 +618,7 @@ struct llama_server_context bool launch_slot_with_data(llama_client_slot* &slot, json data) { slot_params default_params; - 
llama_sampling_params default_sparams; + gpt_sampler_params default_sparams; slot->params.stream = json_value(data, "stream", false); slot->params.cache_prompt = json_value(data, "cache_prompt", false); @@ -628,7 +627,7 @@ struct llama_server_context slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p); slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); - slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); + slot->sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); @@ -641,7 +640,7 @@ struct llama_server_context slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); - slot->params.seed = json_value(data, "seed", default_params.seed); + slot->sparams.seed = json_value(data, "seed", default_sparams.seed); slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); @@ -665,6 +664,7 @@ struct llama_server_context slot->params.input_prefix = ""; } + if (data.count("input_suffix") != 0) { slot->params.input_suffix = data["input_suffix"]; @@ -683,6 +683,10 @@ struct llama_server_context slot->prompt = ""; } + if (json_value(data, "ignore_eos", false)) { + slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY}); + } + /* slot->sparams.penalty_prompt_tokens.clear(); slot->sparams.use_penalty_prompt_tokens = false; const auto &penalty_prompt = data.find("penalty_prompt"); @@ -718,14 +722,10 @@ struct llama_server_context slot->sparams.use_penalty_prompt_tokens = true; } } + */ slot->sparams.logit_bias.clear(); - if (json_value(data, "ignore_eos", false)) - { - slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY; - } - const auto &logit_bias = data.find("logit_bias"); if (logit_bias != data.end() && logit_bias->is_array()) { @@ -753,7 +753,7 @@ struct llama_server_context llama_token tok = el[0].get(); if (tok >= 0 && tok < n_vocab) { - slot->sparams.logit_bias[tok] = bias; + slot->sparams.logit_bias.push_back({tok, bias}); } } else if (el[0].is_string()) @@ -761,13 +761,13 @@ struct llama_server_context auto toks = llama_tokenize(model, el[0].get(), false); for (auto tok : toks) { - slot->sparams.logit_bias[tok] = bias; + slot->sparams.logit_bias.push_back({tok, bias}); } } } } } - + slot->params.antiprompt.clear(); const auto &stop = data.find("stop"); @@ -781,24 +781,22 @@ struct llama_server_context } } } - - const auto &samplers_sequence = data.find("samplers"); - if (samplers_sequence != data.end() && samplers_sequence->is_array()) - { + + const auto & samplers = data.find("samplers"); + if (samplers != data.end() && samplers->is_array()) { std::vector sampler_names; - for (const auto &sampler_name : *samplers_sequence) - { - if (sampler_name.is_string()) - { - sampler_names.emplace_back(sampler_name); + for (const auto & name : *samplers) { + if (name.is_string()) 
{ + sampler_names.emplace_back(name); + } } - } - slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false); + slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false); } else { - slot->sparams.samplers_sequence = default_sparams.samplers_sequence; + slot->sparams.samplers = default_sparams.samplers; } + if (multimodal) { @@ -875,10 +873,10 @@ struct llama_server_context if (slot->ctx_sampling != nullptr) { - llama_sampling_free(slot->ctx_sampling); + gpt_sampler_free(slot->ctx_sampling); } - slot->ctx_sampling = llama_sampling_init(slot->sparams); - llama_set_rng_seed(ctx, slot->params.seed); + slot->ctx_sampling = gpt_sampler_init(model, slot->sparams); + //llama_set_rng_seed(ctx, slot->params.seed); slot->command = LOAD_PROMPT; all_slots_are_idle = false; @@ -888,7 +886,7 @@ struct llama_server_context {"task_id", slot->task_id}, }); - LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str()); + // LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str()); return true; } @@ -1006,11 +1004,13 @@ struct llama_server_context slot.generated_text += token_str; slot.has_next_token = true; +/* if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) { // we can change penalty_prompt_tokens because it is always created from scratch each request slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok); } + */ // check if there is incomplete UTF-8 character at the end bool incomplete = false; @@ -1144,13 +1144,11 @@ struct llama_server_context json get_formated_generation(llama_client_slot &slot) { - const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); - const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && - eos_bias->second < 0.0f && std::isinf(eos_bias->second); - std::vector samplers_sequence; - for (const auto &sampler_type : slot.sparams.samplers_sequence) + std::vector samplers; + samplers.reserve(slot.sparams.samplers.size()); + for (const auto & sampler : slot.sparams.samplers) { - samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type)); + samplers.emplace_back(gpt_sampler_type_to_str(sampler)); } return json { @@ -1165,13 +1163,11 @@ struct llama_server_context {"top_p", slot.sparams.top_p}, {"min_p", slot.sparams.min_p}, {"tfs_z", slot.sparams.tfs_z}, - {"typical_p", slot.sparams.typical_p}, + {"typical_p", slot.sparams.typ_p}, {"repeat_last_n", slot.sparams.penalty_last_n}, {"repeat_penalty", slot.sparams.penalty_repeat}, {"presence_penalty", slot.sparams.penalty_present}, {"frequency_penalty", slot.sparams.penalty_freq}, - {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens}, - {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens}, {"mirostat", slot.sparams.mirostat}, {"mirostat_tau", slot.sparams.mirostat_tau}, {"mirostat_eta", slot.sparams.mirostat_eta}, @@ -1179,13 +1175,13 @@ struct llama_server_context {"stop", slot.params.antiprompt}, {"n_predict", slot.params.n_predict}, {"n_keep", params.n_keep}, - {"ignore_eos", ignore_eos}, + {"ignore_eos", slot.sparams.ignore_eos}, {"stream", slot.params.stream}, - {"logit_bias", slot.sparams.logit_bias}, + // {"logit_bias", slot.sparams.logit_bias}, {"n_probs", slot.sparams.n_probs}, {"min_keep", slot.sparams.min_keep}, {"grammar", slot.sparams.grammar}, - {"samplers", samplers_sequence} + {"samplers", samplers} }; } @@ -1714,7 +1710,7 @@ struct llama_server_context if (!slot.params.cache_prompt) { - llama_sampling_reset(slot.ctx_sampling); + 
gpt_sampler_reset(slot.ctx_sampling); slot.n_past = 0; slot.n_past_se = 0; @@ -1726,7 +1722,7 @@ struct llama_server_context // push the prompt into the sampling context (do not apply grammar) for (auto &token : prompt_tokens) { - llama_sampling_accept(slot.ctx_sampling, ctx, token, false); + gpt_sampler_accept(slot.ctx_sampling, token, false); } slot.n_past = common_part(slot.cache_tokens, prompt_tokens); @@ -1934,9 +1930,9 @@ struct llama_server_context } completion_token_output result; - const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i); + const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i); - llama_sampling_accept(slot.ctx_sampling, ctx, id, true); + gpt_sampler_accept(slot.ctx_sampling, id, true); slot.n_decoded += 1; if (slot.n_decoded == 1) @@ -1946,19 +1942,14 @@ struct llama_server_context metrics.on_prompt_eval(slot); } - llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false }; result.tok = id; + const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling); - const int32_t n_probs = slot.sparams.n_probs; - if (slot.sparams.temp <= 0 && n_probs > 0) - { - // for llama_sample_token_greedy we need to sort candidates - llama_sample_softmax(ctx, &cur_p); - } - - for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) - { - result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); + for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) { + result.probs.push_back({ + cur_p->data[i].id, + i >= cur_p->size ? 0.0f : cur_p->data[i].p, + }); } if (!process_token(result, slot)) diff --git a/backend/cpp/llama/patches/01-llava.patch b/backend/cpp/llama/patches/01-llava.patch new file mode 100644 index 000000000000..fa122da257cd --- /dev/null +++ b/backend/cpp/llama/patches/01-llava.patch @@ -0,0 +1,13 @@ +diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp +index 342042ff..224db9b5 100644 +--- a/examples/llava/clip.cpp ++++ b/examples/llava/clip.cpp +@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima + struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); + int* patches_data = (int*)malloc(ggml_nbytes(patches)); + for (int i = 0; i < num_patches; i++) { +- patches_data[i] = i + 1; ++ patches_data[i] = i; + } + ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); + free(patches_data); \ No newline at end of file diff --git a/backend/cpp/llama/prepare.sh b/backend/cpp/llama/prepare.sh index 6c00f27caa38..4c8393b908d7 100644 --- a/backend/cpp/llama/prepare.sh +++ b/backend/cpp/llama/prepare.sh @@ -1,5 +1,12 @@ #!/bin/bash +## Patches +## Apply patches from the `patches` directory +for patch in $(ls patches); do + echo "Applying patch $patch" + patch -d llama.cpp/ -p1 < patches/$patch +done + cp -r CMakeLists.txt llama.cpp/examples/grpc-server/ cp -r grpc-server.cpp llama.cpp/examples/grpc-server/ cp -rfv json.hpp llama.cpp/examples/grpc-server/ diff --git a/backend/cpp/llama/utils.hpp b/backend/cpp/llama/utils.hpp index c5dafbf0f9ce..198b6f265957 100644 --- a/backend/cpp/llama/utils.hpp +++ b/backend/cpp/llama/utils.hpp @@ -480,31 +480,4 @@ static inline std::vector base64_decode(const std::string & encoded_str } return ret; -} - -// -// random string / id -// - -static std::string random_string() -{ - static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - - std::random_device rd; - std::mt19937 generator(rd()); - 
- std::string result(32, ' '); - - for (int i = 0; i < 32; ++i) { - result[i] = str[generator() % str.size()]; - } - - return result; -} - -static std::string gen_chatcmplid() -{ - std::stringstream chatcmplid; - chatcmplid << "chatcmpl-" << random_string(); - return chatcmplid.str(); } \ No newline at end of file From cf747bcdeccaef7ef2128b3e0329da76871f83d7 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 13 Sep 2024 13:27:36 +0200 Subject: [PATCH 05/11] feat: extract output with regexes from LLMs (#3491) * feat: extract output with regexes from LLMs This changeset adds `extract_regex` to the LLM config. It is a list of regexes that can match output and will be used to re-extract text from the LLM output. This is particularly useful for LLMs which output their final results inside tags. Signed-off-by: Ettore Di Giacinto * Add tests, enhance output in case of configuration error Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto --- core/backend/backend_suite_test.go | 13 ++++ core/backend/llm.go | 28 +++++++- core/backend/llm_test.go | 109 +++++++++++++++++++++++++++++ core/config/backend_config.go | 1 + core/http/endpoints/openai/chat.go | 8 +-- 5 files changed, 154 insertions(+), 5 deletions(-) create mode 100644 core/backend/backend_suite_test.go create mode 100644 core/backend/llm_test.go diff --git a/core/backend/backend_suite_test.go b/core/backend/backend_suite_test.go new file mode 100644 index 000000000000..541c91f6be79 --- /dev/null +++ b/core/backend/backend_suite_test.go @@ -0,0 +1,13 @@ +package backend_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestBackend(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Backend test suite") +} diff --git a/core/backend/llm.go b/core/backend/llm.go index 72c4ad9f0380..2b4564a886fe 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -9,6 +9,8 @@ import ( "sync" "unicode/utf8" + "github.com/rs/zerolog/log" + "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/schema" @@ -181,13 +183,37 @@ func Finetune(config config.BackendConfig, input, prediction string) string { mu.Lock() reg, ok := cutstrings[c] if !ok { - cutstrings[c] = regexp.MustCompile(c) + r, err := regexp.Compile(c) + if err != nil { + log.Fatal().Err(err).Msg("failed to compile regex") + } + cutstrings[c] = r reg = cutstrings[c] } mu.Unlock() prediction = reg.ReplaceAllString(prediction, "") } + // extract results from the response which can be for instance inside XML tags + var predResult string + for _, r := range config.ExtractRegex { + mu.Lock() + reg, ok := cutstrings[r] + if !ok { + regex, err := regexp.Compile(r) + if err != nil { + log.Fatal().Err(err).Msg("failed to compile regex") + } + cutstrings[r] = regex + reg = regex + } + mu.Unlock() + predResult += reg.FindString(prediction) + } + if predResult != "" { + prediction = predResult + } + for _, c := range config.TrimSpace { prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c)) } diff --git a/core/backend/llm_test.go b/core/backend/llm_test.go new file mode 100644 index 000000000000..f7630702e2a0 --- /dev/null +++ b/core/backend/llm_test.go @@ -0,0 +1,109 @@ +package backend_test + +import ( + . "github.com/mudler/LocalAI/core/backend" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/schema" + + . "github.com/onsi/ginkgo/v2" + .
"github.com/onsi/gomega" +) + +var _ = Describe("LLM tests", func() { + Context("Finetune LLM output", func() { + var ( + testConfig config.BackendConfig + input string + prediction string + result string + ) + + BeforeEach(func() { + testConfig = config.BackendConfig{ + PredictionOptions: schema.PredictionOptions{ + Echo: false, + }, + LLMConfig: config.LLMConfig{ + Cutstrings: []string{`<.*?>`}, // Example regex for removing XML tags + ExtractRegex: []string{`(.*?)`}, // Example regex to extract from tags + TrimSpace: []string{" ", "\n"}, + TrimSuffix: []string{".", "!"}, + }, + } + }) + + Context("when echo is enabled", func() { + BeforeEach(func() { + testConfig.Echo = true + input = "Hello" + prediction = "World" + }) + + It("should prepend input to prediction", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("HelloWorld")) + }) + }) + + Context("when echo is disabled", func() { + BeforeEach(func() { + testConfig.Echo = false + input = "Hello" + prediction = "World" + }) + + It("should not modify the prediction with input", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("World")) + }) + }) + + Context("when cutstrings regex is applied", func() { + BeforeEach(func() { + input = "" + prediction = "
Hello
World" + }) + + It("should remove substrings matching cutstrings regex", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("Hello World")) + }) + }) + + Context("when extract regex is applied", func() { + BeforeEach(func() { + input = "" + prediction = "42" + }) + + It("should extract substrings matching the extract regex", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("42")) + }) + }) + + Context("when trimming spaces", func() { + BeforeEach(func() { + input = "" + prediction = " Hello World " + }) + + It("should trim spaces from the prediction", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("Hello World")) + }) + }) + + Context("when trimming suffixes", func() { + BeforeEach(func() { + input = "" + prediction = "Hello World." + }) + + It("should trim suffixes from the prediction", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("Hello World")) + }) + }) + }) +}) diff --git a/core/config/backend_config.go b/core/config/backend_config.go index b83e1a986666..027e18a4a599 100644 --- a/core/config/backend_config.go +++ b/core/config/backend_config.go @@ -126,6 +126,7 @@ type LLMConfig struct { Grammar string `yaml:"grammar"` StopWords []string `yaml:"stopwords"` Cutstrings []string `yaml:"cutstrings"` + ExtractRegex []string `yaml:"extract_regex"` TrimSpace []string `yaml:"trimspace"` TrimSuffix []string `yaml:"trimsuffix"` diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index a979b7bca33d..8144bdcd3341 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -68,9 +68,9 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup textContentToReturn = functions.ParseTextContent(result, config.FunctionsConfig) result = functions.CleanupLLMResult(result, config.FunctionsConfig) - results := functions.ParseFunctionCall(result, config.FunctionsConfig) + functionResults := functions.ParseFunctionCall(result, config.FunctionsConfig) log.Debug().Msgf("Text content to return: %s", textContentToReturn) - noActionToRun := len(results) > 0 && results[0].Name == noAction || len(results) == 0 + noActionToRun := len(functionResults) > 0 && functionResults[0].Name == noAction || len(functionResults) == 0 switch { case noActionToRun: @@ -83,7 +83,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup } responses <- initialMessage - result, err := handleQuestion(config, req, ml, startupOptions, results, result, prompt) + result, err := handleQuestion(config, req, ml, startupOptions, functionResults, result, prompt) if err != nil { log.Error().Err(err).Msg("error handling question") return @@ -105,7 +105,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup responses <- resp default: - for i, ss := range results { + for i, ss := range functionResults { name, args := ss.Name, ss.Arguments initialMessage := schema.OpenAIResponse{ From 7fe6d0ad2be25e31fe38439182911d74ec2c569f Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 13 Sep 2024 19:19:26 +0200 Subject: [PATCH 06/11] chore(gosec): fix CI (#3537) downgrade to latest known version of the gosec action Signed-off-by: Ettore Di Giacinto --- .github/workflows/secscan.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/secscan.yaml b/.github/workflows/secscan.yaml index d9743d9e36ae..db9db586947d 100644 --- 
a/.github/workflows/secscan.yaml +++ b/.github/workflows/secscan.yaml @@ -18,7 +18,7 @@ jobs: if: ${{ github.actor != 'dependabot[bot]' }} - name: Run Gosec Security Scanner if: ${{ github.actor != 'dependabot[bot]' }} - uses: securego/gosec@master + uses: securego/gosec@v2.21.0 with: # we let the report trigger content trigger a failure using the GitHub Security features. args: '-no-fail -fmt sarif -out results.sarif ./...' From 5213e79f5c3059616e461176564c97cca481e367 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 13 Sep 2024 19:48:54 +0200 Subject: [PATCH 07/11] models(gallery): add azure_dusk-v0.2-iq-imatrix (#3538) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 4939820df83e..86e8133883a1 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -658,6 +658,23 @@ - filename: Mahou-1.3-llama3.1-8B.Q4_K_M.gguf sha256: 88bfdca2f6077d789d3e0f161d19711aa208a6d9a02cce96a2276c69413b3594 uri: huggingface://mradermacher/Mahou-1.3-llama3.1-8B-GGUF/Mahou-1.3-llama3.1-8B.Q4_K_M.gguf +- !!merge <<: *llama31 + name: "azure_dusk-v0.2-iq-imatrix" + # chatml + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/n3-g_YTk3FY-DBzxXd28E.png + urls: + - https://huggingface.co/Lewdiculous/Azure_Dusk-v0.2-GGUF-IQ-Imatrix + description: | + "Following up on Crimson_Dawn-v0.2 we have Azure_Dusk-v0.2! Training on Mistral-Nemo-Base-2407 this time I've added significantly more data, as well as trained using RSLoRA as opposed to regular LoRA. Another key change is training on ChatML as opposed to Mistral Formatting." + by Author. + overrides: + parameters: + model: Azure_Dusk-v0.2-Q4_K_M-imat.gguf + files: + - filename: Azure_Dusk-v0.2-Q4_K_M-imat.gguf + sha256: c03a670c00976d14c267a0322374ed488b2a5f4790eb509136ca4e75cbc10cf4 + uri: huggingface://Lewdiculous/Azure_Dusk-v0.2-GGUF-IQ-Imatrix/Azure_Dusk-v0.2-Q4_K_M-imat.gguf - &deepseek ## Deepseek url: "github:mudler/LocalAI/gallery/deepseek.yaml@master" From 925315ab5cd4f09e32cba08c5ca23bb9b215ba98 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 13 Sep 2024 19:49:14 +0200 Subject: [PATCH 08/11] models(gallery): add mn-12b-lyra-v4-iq-imatrix (#3539) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 86e8133883a1..b70a9122ac20 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1212,6 +1212,23 @@ - filename: Pantheon-RP-1.6-12b-Nemo-Q4_K_M.gguf sha256: cf3465c183bf4ecbccd1b6b480f687e0160475b04c87e2f1e5ebc8baa0f4c7aa uri: huggingface://bartowski/Pantheon-RP-1.6-12b-Nemo-GGUF/Pantheon-RP-1.6-12b-Nemo-Q4_K_M.gguf +- !!merge <<: *mistral03 + name: "mn-12b-lyra-v4-iq-imatrix" + icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/dVoru83WOpwVjMlgZ_xhA.png + #chatml + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + urls: + - https://huggingface.co/Lewdiculous/MN-12B-Lyra-v4-GGUF-IQ-Imatrix + description: | + A finetune of Mistral Nemo by Sao10K. + Uses the ChatML prompt format. 
+ overrides: + parameters: + model: MN-12B-Lyra-v4-Q4_K_M-imat.gguf + files: + - filename: MN-12B-Lyra-v4-Q4_K_M-imat.gguf + sha256: 1989123481ca1936c8a2cbe278ff5d1d2b0ae63dbdc838bb36a6d7547b8087b3 + uri: huggingface://Lewdiculous/MN-12B-Lyra-v4-GGUF-IQ-Imatrix/MN-12B-Lyra-v4-Q4_K_M-imat.gguf - &mudler ### START mudler's LocalAI specific-models url: "github:mudler/LocalAI/gallery/mudler.yaml@master" From cbfab81c3599c8c2051209234edf0a3259b5efaa Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 13 Sep 2024 21:49:18 +0200 Subject: [PATCH 09/11] models(gallery): add datagemma models (#3540) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index b70a9122ac20..188576b26bc5 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1756,6 +1756,34 @@ - filename: Athena-codegemma-2-2b-it.Q4_K_M.gguf sha256: 59ce17023438b0da603dd211c7d39f78e7acac4108258ac0818a97a4ca7d64e3 uri: huggingface://mradermacher/Athena-codegemma-2-2b-it-GGUF/Athena-codegemma-2-2b-it.Q4_K_M.gguf +- !!merge <<: *gemma + name: "datagemma-rag-27b-it" + urls: + - https://huggingface.co/google/datagemma-rag-27b-it + - https://huggingface.co/bartowski/datagemma-rag-27b-it-GGUF + description: | + DataGemma is a series of fine-tuned Gemma 2 models used to help LLMs access and incorporate reliable public statistical data from Data Commons into their responses. DataGemma RAG is used with Retrieval Augmented Generation, where it is trained to take a user query and generate natural language queries that can be understood by Data Commons' existing natural language interface. More information can be found in this research paper. + overrides: + parameters: + model: datagemma-rag-27b-it-Q4_K_M.gguf + files: + - filename: datagemma-rag-27b-it-Q4_K_M.gguf + sha256: 3dfcf51b05e3f0ab0979ad194de350edea71cb14444efa0a9f2ef5bfc80753f8 + uri: huggingface://bartowski/datagemma-rag-27b-it-GGUF/datagemma-rag-27b-it-Q4_K_M.gguf +- !!merge <<: *gemma + name: "datagemma-rig-27b-it" + urls: + - https://huggingface.co/google/datagemma-rig-27b-it + - https://huggingface.co/bartowski/datagemma-rig-27b-it-GGUF + description: | + DataGemma is a series of fine-tuned Gemma 2 models used to help LLMs access and incorporate reliable public statistical data from Data Commons into their responses. DataGemma RIG is used in the retrieval interleaved generation approach (based off of tool-use approaches), where it is trained to annotate a response with natural language queries to Data Commons’ existing natural language interface wherever there are statistics. More information can be found in this research paper. 
+ overrides: + parameters: + model: datagemma-rig-27b-it-Q4_K_M.gguf + files: + - filename: datagemma-rig-27b-it-Q4_K_M.gguf + sha256: a6738ffbb49b6c46d220e2793df85c0538e9ac72398e32a0914ee5e55c3096ad + uri: huggingface://bartowski/datagemma-rig-27b-it-GGUF/datagemma-rig-27b-it-Q4_K_M.gguf - &llama3 url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master" icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png From 25e7661de268bb9cd31622fdb05052cf26ac4e9f Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 13 Sep 2024 21:52:13 +0200 Subject: [PATCH 10/11] chore(exllama): drop exllama backend (#3536) * chore(exllama): drop exllama backend For polishing and cleaning up it now makes sense to drop exllama, which is completely unmaintained and only supported the llamav1 architecture (nowadays it's superseded by llamav2). Signed-off-by: Ettore Di Giacinto * chore(gosec): fix CI downgrade to latest known version of the gosec action Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto --- Dockerfile | 5 +- Makefile | 13 +- backend/python/exllama/.gitignore | 1 - backend/python/exllama/Makefile | 25 --- backend/python/exllama/README.md | 5 - backend/python/exllama/backend.py | 159 ------------------ backend/python/exllama/install.sh | 13 -- backend/python/exllama/requirements-cpu.txt | 3 - .../python/exllama/requirements-cublas11.txt | 4 - .../python/exllama/requirements-cublas12.txt | 3 - backend/python/exllama/requirements.txt | 4 - backend/python/exllama/run.sh | 7 - backend/python/exllama/test.sh | 6 - 13 files changed, 3 insertions(+), 245 deletions(-) delete mode 100644 backend/python/exllama/.gitignore delete mode 100644 backend/python/exllama/Makefile delete mode 100644 backend/python/exllama/README.md delete mode 100755 backend/python/exllama/backend.py delete mode 100755 backend/python/exllama/install.sh delete mode 100644 backend/python/exllama/requirements-cpu.txt delete mode 100644 backend/python/exllama/requirements-cublas11.txt delete mode 100644 backend/python/exllama/requirements-cublas12.txt delete mode 100644 backend/python/exllama/requirements.txt delete mode 100755 backend/python/exllama/run.sh delete mode 100755 backend/python/exllama/test.sh diff --git a/Dockerfile b/Dockerfile index b86cc7061d8e..f08cb9a03b2a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ ARG TARGETARCH ARG TARGETVARIANT ENV DEBIAN_FRONTEND=noninteractive -ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh" +ENV
EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh" RUN apt-get update && \ @@ -418,9 +418,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG ; fi && \ if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ make -C backend/python/transformers-musicgen \ - ; fi && \ - if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ - make -C backend/python/exllama \ ; fi RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ diff --git a/Makefile b/Makefile index 3d9ea5921d6b..a3f0ffd05c38 100644 --- a/Makefile +++ b/Makefile @@ -534,10 +534,10 @@ protogen-go-clean: $(RM) bin/* .PHONY: protogen-python -protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen +protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen .PHONY: protogen-python-clean -protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean +protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean .PHONY: autogptq-protogen autogptq-protogen: @@ -571,14 +571,6 @@ diffusers-protogen: diffusers-protogen-clean: $(MAKE) -C backend/python/diffusers protogen-clean -.PHONY: exllama-protogen -exllama-protogen: - $(MAKE) -C backend/python/exllama protogen - -.PHONY: exllama-protogen-clean -exllama-protogen-clean: - $(MAKE) -C backend/python/exllama protogen-clean - .PHONY: exllama2-protogen exllama2-protogen: $(MAKE) -C backend/python/exllama2 protogen @@ -675,7 +667,6 @@ prepare-extra-conda-environments: protogen-python $(MAKE) -C backend/python/parler-tts $(MAKE) -C 
backend/python/vall-e-x $(MAKE) -C backend/python/openvoice - $(MAKE) -C backend/python/exllama $(MAKE) -C backend/python/exllama2 prepare-test-extra: protogen-python diff --git a/backend/python/exllama/.gitignore b/backend/python/exllama/.gitignore deleted file mode 100644 index 1d3a06547c70..000000000000 --- a/backend/python/exllama/.gitignore +++ /dev/null @@ -1 +0,0 @@ -source \ No newline at end of file diff --git a/backend/python/exllama/Makefile b/backend/python/exllama/Makefile deleted file mode 100644 index e6a678810c3f..000000000000 --- a/backend/python/exllama/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -export CONDA_ENV_PATH = "exllama.yml" - -.PHONY: exllama -exllama: protogen - bash install.sh ${CONDA_ENV_PATH} - -.PHONY: run -run: protogen - @echo "Running exllama..." - bash run.sh - @echo "exllama run." - -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - -.PHONY: protogen-clean -protogen-clean: - $(RM) backend_pb2_grpc.py backend_pb2.py - -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto - -.PHONY: clean -clean: protogen-clean - $(RM) -r venv source __pycache__ \ No newline at end of file diff --git a/backend/python/exllama/README.md b/backend/python/exllama/README.md deleted file mode 100644 index f9ed5e9fbdb7..000000000000 --- a/backend/python/exllama/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Creating a separate environment for the exllama project - -``` -make exllama -``` \ No newline at end of file diff --git a/backend/python/exllama/backend.py b/backend/python/exllama/backend.py deleted file mode 100755 index 58d1392c5ee4..000000000000 --- a/backend/python/exllama/backend.py +++ /dev/null @@ -1,159 +0,0 @@ -#!/usr/bin/env python3 -import grpc -from concurrent import futures -import time -import backend_pb2 -import backend_pb2_grpc -import argparse -import signal -import sys -import os, glob - -from pathlib import Path -import torch -import torch.nn.functional as F -from torch import version as torch_version - -from source.tokenizer import ExLlamaTokenizer -from source.generator import ExLlamaGenerator -from source.model import ExLlama, ExLlamaCache, ExLlamaConfig - -_ONE_DAY_IN_SECONDS = 60 * 60 * 24 - -# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 -MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) - -# Implement the BackendServicer class with the service methods -class BackendServicer(backend_pb2_grpc.BackendServicer): - def generate(self,prompt, max_new_tokens): - self.generator.end_beam_search() - - # Tokenizing the input - ids = self.generator.tokenizer.encode(prompt) - - self.generator.gen_begin_reuse(ids) - initial_len = self.generator.sequence[0].shape[0] - has_leading_space = False - decoded_text = '' - for i in range(max_new_tokens): - token = self.generator.gen_single_token() - if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): - has_leading_space = True - - decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) - if has_leading_space: - decoded_text = ' ' + decoded_text - - if token.item() == self.generator.tokenizer.eos_token_id: - break - return decoded_text - def Health(self, request, context): - return backend_pb2.Reply(message=bytes("OK", 'utf-8')) - def LoadModel(self, request, context): - try: - # https://github.com/turboderp/exllama/blob/master/example_cfg.py - model_directory = request.ModelFile - - # Locate files we need within that 
directory - tokenizer_path = os.path.join(model_directory, "tokenizer.model") - model_config_path = os.path.join(model_directory, "config.json") - st_pattern = os.path.join(model_directory, "*.safetensors") - model_path = glob.glob(st_pattern)[0] - - # Create config, model, tokenizer and generator - - config = ExLlamaConfig(model_config_path) # create config from config.json - config.model_path = model_path # supply path to model weights file - if (request.ContextSize): - config.max_seq_len = request.ContextSize # override max sequence length - config.max_attention_size = request.ContextSize**2 # Should be set to context_size^2. - # https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163 - - # Set Rope scaling. - if (request.RopeFreqScale): - # Alpha value for Rope scaling. - # Higher value increases context but adds perplexity. - # alpha_value and compress_pos_emb are mutually exclusive. - # https://github.com/turboderp/exllama/issues/115 - config.alpha_value = request.RopeFreqScale - config.calculate_rotary_embedding_base() - - model = ExLlama(config) # create ExLlama instance and load the weights - tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file - - cache = ExLlamaCache(model, batch_size = 2) # create cache for inference - generator = ExLlamaGenerator(model, tokenizer, cache) # create generator - - self.generator= generator - self.model = model - self.tokenizer = tokenizer - self.cache = cache - except Exception as err: - return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") - return backend_pb2.Result(message="Model loaded successfully", success=True) - - def Predict(self, request, context): - penalty = 1.15 - if request.Penalty != 0.0: - penalty = request.Penalty - self.generator.settings.token_repetition_penalty_max = penalty - self.generator.settings.temperature = request.Temperature - self.generator.settings.top_k = request.TopK - self.generator.settings.top_p = request.TopP - - tokens = 512 - if request.Tokens != 0: - tokens = request.Tokens - - if self.cache.batch_size == 1: - del self.cache - self.cache = ExLlamaCache(self.model, batch_size=2) - self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache) - - t = self.generate(request.Prompt, tokens) - - # Remove prompt from response if present - if request.Prompt in t: - t = t.replace(request.Prompt, "") - - return backend_pb2.Result(message=bytes(t, encoding='utf-8')) - - def PredictStream(self, request, context): - # Implement PredictStream RPC - #for reply in some_data_generator(): - # yield reply - # Not implemented yet - return self.Predict(request, context) - - -def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) - backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) - server.add_insecure_port(address) - server.start() - print("Server started. Listening on: " + address, file=sys.stderr) - - # Define the signal handler function - def signal_handler(sig, frame): - print("Received termination signal. 
Shutting down...") - server.stop(0) - sys.exit(0) - - # Set the signal handlers for SIGINT and SIGTERM - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - - try: - while True: - time.sleep(_ONE_DAY_IN_SECONDS) - except KeyboardInterrupt: - server.stop(0) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run the gRPC server.") - parser.add_argument( - "--addr", default="localhost:50051", help="The address to bind the server to." - ) - args = parser.parse_args() - - serve(args.addr) \ No newline at end of file diff --git a/backend/python/exllama/install.sh b/backend/python/exllama/install.sh deleted file mode 100755 index d33c435600d0..000000000000 --- a/backend/python/exllama/install.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -e - -LIMIT_TARGETS="cublas" - -source $(dirname $0)/../common/libbackend.sh - -installRequirements - -git clone https://github.com/turboderp/exllama $MY_DIR/source -uv pip install ${BUILD_ISOLATION_FLAG} --requirement ${MY_DIR}/source/requirements.txt - -cp -v ./*py $MY_DIR/source/ diff --git a/backend/python/exllama/requirements-cpu.txt b/backend/python/exllama/requirements-cpu.txt deleted file mode 100644 index bbcdc8cda704..000000000000 --- a/backend/python/exllama/requirements-cpu.txt +++ /dev/null @@ -1,3 +0,0 @@ -transformers -accelerate -torch \ No newline at end of file diff --git a/backend/python/exllama/requirements-cublas11.txt b/backend/python/exllama/requirements-cublas11.txt deleted file mode 100644 index 1dfb5b9854d2..000000000000 --- a/backend/python/exllama/requirements-cublas11.txt +++ /dev/null @@ -1,4 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cu118 -torch -transformers -accelerate \ No newline at end of file diff --git a/backend/python/exllama/requirements-cublas12.txt b/backend/python/exllama/requirements-cublas12.txt deleted file mode 100644 index 1ec544cd1438..000000000000 --- a/backend/python/exllama/requirements-cublas12.txt +++ /dev/null @@ -1,3 +0,0 @@ -torch -transformers -accelerate \ No newline at end of file diff --git a/backend/python/exllama/requirements.txt b/backend/python/exllama/requirements.txt deleted file mode 100644 index b9c192d5d304..000000000000 --- a/backend/python/exllama/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -grpcio==1.66.1 -protobuf -certifi -setuptools \ No newline at end of file diff --git a/backend/python/exllama/run.sh b/backend/python/exllama/run.sh deleted file mode 100755 index 63119689d27a..000000000000 --- a/backend/python/exllama/run.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -LIMIT_TARGETS="cublas" -BACKEND_FILE="${MY_DIR}/source/backend.py" - -source $(dirname $0)/../common/libbackend.sh - -startBackend $@ \ No newline at end of file diff --git a/backend/python/exllama/test.sh b/backend/python/exllama/test.sh deleted file mode 100755 index 6940b0661df2..000000000000 --- a/backend/python/exllama/test.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -e - -source $(dirname $0)/../common/libbackend.sh - -runUnittests From cabb1602e84535e1957412d63d5f2a3ad80c589b Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 14 Sep 2024 00:05:38 +0200 Subject: [PATCH 11/11] chore: :arrow_up: Update ggerganov/llama.cpp to `feff4aa8461da7c432d144c11da4802e41fef3cf` (#3542) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler 
<2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a3f0ffd05c38..9ba109b03bbb 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=e6b7801bd189d102d901d3e72035611a25456ef1 +CPPLLAMA_VERSION?=feff4aa8461da7c432d144c11da4802e41fef3cf # go-rwkv version RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp