From b7496dea9be938c7d51b9ffbc1f07246481db66b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 11 Sep 2024 09:15:30 +0200 Subject: [PATCH 01/11] chore(deps): Bump yarl from 1.9.7 to 1.11.0 in /examples/langchain/langchainpy-localai-example (#3501) chore(deps): Bump yarl Bumps [yarl](https://github.com/aio-libs/yarl) from 1.9.7 to 1.11.0. - [Release notes](https://github.com/aio-libs/yarl/releases) - [Changelog](https://github.com/aio-libs/yarl/blob/master/CHANGES.rst) - [Commits](https://github.com/aio-libs/yarl/compare/v1.9.7...v1.11.0) --- updated-dependencies: - dependency-name: yarl dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- examples/langchain/langchainpy-localai-example/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/langchain/langchainpy-localai-example/requirements.txt b/examples/langchain/langchainpy-localai-example/requirements.txt index 0c7be91791ff..753230059bcd 100644 --- a/examples/langchain/langchainpy-localai-example/requirements.txt +++ b/examples/langchain/langchainpy-localai-example/requirements.txt @@ -30,4 +30,4 @@ tqdm==4.66.5 typing-inspect==0.9.0 typing_extensions==4.12.2 urllib3==2.2.2 -yarl==1.9.7 +yarl==1.11.0 From a7ac2f7bb0800b13b2cb2f305528ead6db9fd695 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 11 Sep 2024 09:15:52 +0200 Subject: [PATCH 02/11] chore(deps): Bump llama-index from 0.11.4 to 0.11.7 in /examples/chainlit (#3516) chore(deps): Bump llama-index in /examples/chainlit Bumps [llama-index](https://github.com/run-llama/llama_index) from 0.11.4 to 0.11.7. - [Release notes](https://github.com/run-llama/llama_index/releases) - [Changelog](https://github.com/run-llama/llama_index/blob/main/CHANGELOG.md) - [Commits](https://github.com/run-llama/llama_index/compare/v0.11.4...v0.11.7) --- updated-dependencies: - dependency-name: llama-index dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- examples/chainlit/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/chainlit/requirements.txt b/examples/chainlit/requirements.txt index 8654ea9959a5..69212e28e102 100644 --- a/examples/chainlit/requirements.txt +++ b/examples/chainlit/requirements.txt @@ -1,4 +1,4 @@ -llama_index==0.11.4 +llama_index==0.11.7 requests==2.32.3 weaviate_client==4.6.7 transformers From e35d8169b1cc1848cc339cb2525b3e6d42a1393b Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Thu, 12 Sep 2024 08:52:27 +0200 Subject: [PATCH 03/11] chore: :arrow_up: Update ggerganov/whisper.cpp to `a551933542d956ae84634937acd2942eb40efaaf` (#3534) :arrow_up: Update ggerganov/whisper.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index fe05dc1a5741..e0e15cfb1c4f 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp -WHISPER_CPP_VERSION?=5caa19240d55bfd6ee316d50fbad32c6e9c39528 +WHISPER_CPP_VERSION?=a551933542d956ae84634937acd2942eb40efaaf # bert.cpp version BERT_REPO?=https://github.com/go-skynet/go-bert.cpp From d51444d606e1c616c396e37d7413a8a562714cb6 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 12 Sep 2024 20:55:27 +0200 Subject: [PATCH 04/11] chore(deps): update llama.cpp (#3497) * Apply llava patch Signed-off-by: Ettore Di Giacinto --- Makefile | 2 +- backend/cpp/llama/grpc-server.cpp | 107 +++++++++++------------ backend/cpp/llama/patches/01-llava.patch | 13 +++ backend/cpp/llama/prepare.sh | 7 ++ backend/cpp/llama/utils.hpp | 27 ------ 5 files changed, 70 insertions(+), 86 deletions(-) create mode 100644 backend/cpp/llama/patches/01-llava.patch diff --git a/Makefile b/Makefile index e0e15cfb1c4f..3d9ea5921d6b 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=815b1fb20a53e439882171757825bacb1350de04 +CPPLLAMA_VERSION?=e6b7801bd189d102d901d3e72035611a25456ef1 # go-rwkv version RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index e1b6f868b2e8..a46b4ee0a335 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -17,11 +17,10 @@ #include "common.h" #include "json.hpp" #include "llama.h" -#include "grammar-parser.h" #include "backend.pb.h" #include "backend.grpc.pb.h" #include "utils.hpp" - +#include "sampling.h" // include std::regex #include #include @@ -203,8 +202,8 @@ struct llama_client_slot std::string stopping_word; // sampling - struct llama_sampling_params sparams; - llama_sampling_context *ctx_sampling = nullptr; + struct gpt_sampler_params sparams; + gpt_sampler *ctx_sampling = nullptr; int32_t ga_i = 0; // group-attention state int32_t ga_n = 1; // group-attention factor @@ -619,7 +618,7 @@ struct llama_server_context bool launch_slot_with_data(llama_client_slot* &slot, json data) { slot_params default_params; - 
llama_sampling_params default_sparams; + gpt_sampler_params default_sparams; slot->params.stream = json_value(data, "stream", false); slot->params.cache_prompt = json_value(data, "cache_prompt", false); @@ -628,7 +627,7 @@ struct llama_server_context slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p); slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); - slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); + slot->sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); @@ -641,7 +640,7 @@ struct llama_server_context slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); - slot->params.seed = json_value(data, "seed", default_params.seed); + slot->sparams.seed = json_value(data, "seed", default_sparams.seed); slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); @@ -665,6 +664,7 @@ struct llama_server_context slot->params.input_prefix = ""; } + if (data.count("input_suffix") != 0) { slot->params.input_suffix = data["input_suffix"]; @@ -683,6 +683,10 @@ struct llama_server_context slot->prompt = ""; } + if (json_value(data, "ignore_eos", false)) { + slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY}); + } + /* slot->sparams.penalty_prompt_tokens.clear(); slot->sparams.use_penalty_prompt_tokens = false; const auto &penalty_prompt = data.find("penalty_prompt"); @@ -718,14 +722,10 @@ struct llama_server_context slot->sparams.use_penalty_prompt_tokens = true; } } + */ slot->sparams.logit_bias.clear(); - if (json_value(data, "ignore_eos", false)) - { - slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY; - } - const auto &logit_bias = data.find("logit_bias"); if (logit_bias != data.end() && logit_bias->is_array()) { @@ -753,7 +753,7 @@ struct llama_server_context llama_token tok = el[0].get(); if (tok >= 0 && tok < n_vocab) { - slot->sparams.logit_bias[tok] = bias; + slot->sparams.logit_bias.push_back({tok, bias}); } } else if (el[0].is_string()) @@ -761,13 +761,13 @@ struct llama_server_context auto toks = llama_tokenize(model, el[0].get(), false); for (auto tok : toks) { - slot->sparams.logit_bias[tok] = bias; + slot->sparams.logit_bias.push_back({tok, bias}); } } } } } - + slot->params.antiprompt.clear(); const auto &stop = data.find("stop"); @@ -781,24 +781,22 @@ struct llama_server_context } } } - - const auto &samplers_sequence = data.find("samplers"); - if (samplers_sequence != data.end() && samplers_sequence->is_array()) - { + + const auto & samplers = data.find("samplers"); + if (samplers != data.end() && samplers->is_array()) { std::vector sampler_names; - for (const auto &sampler_name : *samplers_sequence) - { - if (sampler_name.is_string()) - { - sampler_names.emplace_back(sampler_name); + for (const auto & name : *samplers) { + if (name.is_string()) 
{ + sampler_names.emplace_back(name); + } } - } - slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false); + slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false); } else { - slot->sparams.samplers_sequence = default_sparams.samplers_sequence; + slot->sparams.samplers = default_sparams.samplers; } + if (multimodal) { @@ -875,10 +873,10 @@ struct llama_server_context if (slot->ctx_sampling != nullptr) { - llama_sampling_free(slot->ctx_sampling); + gpt_sampler_free(slot->ctx_sampling); } - slot->ctx_sampling = llama_sampling_init(slot->sparams); - llama_set_rng_seed(ctx, slot->params.seed); + slot->ctx_sampling = gpt_sampler_init(model, slot->sparams); + //llama_set_rng_seed(ctx, slot->params.seed); slot->command = LOAD_PROMPT; all_slots_are_idle = false; @@ -888,7 +886,7 @@ struct llama_server_context {"task_id", slot->task_id}, }); - LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str()); + // LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str()); return true; } @@ -1006,11 +1004,13 @@ struct llama_server_context slot.generated_text += token_str; slot.has_next_token = true; +/* if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) { // we can change penalty_prompt_tokens because it is always created from scratch each request slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok); } + */ // check if there is incomplete UTF-8 character at the end bool incomplete = false; @@ -1144,13 +1144,11 @@ struct llama_server_context json get_formated_generation(llama_client_slot &slot) { - const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); - const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && - eos_bias->second < 0.0f && std::isinf(eos_bias->second); - std::vector samplers_sequence; - for (const auto &sampler_type : slot.sparams.samplers_sequence) + std::vector samplers; + samplers.reserve(slot.sparams.samplers.size()); + for (const auto & sampler : slot.sparams.samplers) { - samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type)); + samplers.emplace_back(gpt_sampler_type_to_str(sampler)); } return json { @@ -1165,13 +1163,11 @@ struct llama_server_context {"top_p", slot.sparams.top_p}, {"min_p", slot.sparams.min_p}, {"tfs_z", slot.sparams.tfs_z}, - {"typical_p", slot.sparams.typical_p}, + {"typical_p", slot.sparams.typ_p}, {"repeat_last_n", slot.sparams.penalty_last_n}, {"repeat_penalty", slot.sparams.penalty_repeat}, {"presence_penalty", slot.sparams.penalty_present}, {"frequency_penalty", slot.sparams.penalty_freq}, - {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens}, - {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens}, {"mirostat", slot.sparams.mirostat}, {"mirostat_tau", slot.sparams.mirostat_tau}, {"mirostat_eta", slot.sparams.mirostat_eta}, @@ -1179,13 +1175,13 @@ struct llama_server_context {"stop", slot.params.antiprompt}, {"n_predict", slot.params.n_predict}, {"n_keep", params.n_keep}, - {"ignore_eos", ignore_eos}, + {"ignore_eos", slot.sparams.ignore_eos}, {"stream", slot.params.stream}, - {"logit_bias", slot.sparams.logit_bias}, + // {"logit_bias", slot.sparams.logit_bias}, {"n_probs", slot.sparams.n_probs}, {"min_keep", slot.sparams.min_keep}, {"grammar", slot.sparams.grammar}, - {"samplers", samplers_sequence} + {"samplers", samplers} }; } @@ -1714,7 +1710,7 @@ struct llama_server_context if (!slot.params.cache_prompt) { - llama_sampling_reset(slot.ctx_sampling); + 
gpt_sampler_reset(slot.ctx_sampling); slot.n_past = 0; slot.n_past_se = 0; @@ -1726,7 +1722,7 @@ struct llama_server_context // push the prompt into the sampling context (do not apply grammar) for (auto &token : prompt_tokens) { - llama_sampling_accept(slot.ctx_sampling, ctx, token, false); + gpt_sampler_accept(slot.ctx_sampling, token, false); } slot.n_past = common_part(slot.cache_tokens, prompt_tokens); @@ -1934,9 +1930,9 @@ struct llama_server_context } completion_token_output result; - const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i); + const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i); - llama_sampling_accept(slot.ctx_sampling, ctx, id, true); + gpt_sampler_accept(slot.ctx_sampling, id, true); slot.n_decoded += 1; if (slot.n_decoded == 1) @@ -1946,19 +1942,14 @@ struct llama_server_context metrics.on_prompt_eval(slot); } - llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false }; result.tok = id; + const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling); - const int32_t n_probs = slot.sparams.n_probs; - if (slot.sparams.temp <= 0 && n_probs > 0) - { - // for llama_sample_token_greedy we need to sort candidates - llama_sample_softmax(ctx, &cur_p); - } - - for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) - { - result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); + for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) { + result.probs.push_back({ + cur_p->data[i].id, + i >= cur_p->size ? 0.0f : cur_p->data[i].p, + }); } if (!process_token(result, slot)) diff --git a/backend/cpp/llama/patches/01-llava.patch b/backend/cpp/llama/patches/01-llava.patch new file mode 100644 index 000000000000..fa122da257cd --- /dev/null +++ b/backend/cpp/llama/patches/01-llava.patch @@ -0,0 +1,13 @@ +diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp +index 342042ff..224db9b5 100644 +--- a/examples/llava/clip.cpp ++++ b/examples/llava/clip.cpp +@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima + struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); + int* patches_data = (int*)malloc(ggml_nbytes(patches)); + for (int i = 0; i < num_patches; i++) { +- patches_data[i] = i + 1; ++ patches_data[i] = i; + } + ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); + free(patches_data); \ No newline at end of file diff --git a/backend/cpp/llama/prepare.sh b/backend/cpp/llama/prepare.sh index 6c00f27caa38..4c8393b908d7 100644 --- a/backend/cpp/llama/prepare.sh +++ b/backend/cpp/llama/prepare.sh @@ -1,5 +1,12 @@ #!/bin/bash +## Patches +## Apply patches from the `patches` directory +for patch in $(ls patches); do + echo "Applying patch $patch" + patch -d llama.cpp/ -p1 < patches/$patch +done + cp -r CMakeLists.txt llama.cpp/examples/grpc-server/ cp -r grpc-server.cpp llama.cpp/examples/grpc-server/ cp -rfv json.hpp llama.cpp/examples/grpc-server/ diff --git a/backend/cpp/llama/utils.hpp b/backend/cpp/llama/utils.hpp index c5dafbf0f9ce..198b6f265957 100644 --- a/backend/cpp/llama/utils.hpp +++ b/backend/cpp/llama/utils.hpp @@ -480,31 +480,4 @@ static inline std::vector base64_decode(const std::string & encoded_str } return ret; -} - -// -// random string / id -// - -static std::string random_string() -{ - static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - - std::random_device rd; - std::mt19937 generator(rd()); - 
- std::string result(32, ' '); - - for (int i = 0; i < 32; ++i) { - result[i] = str[generator() % str.size()]; - } - - return result; -} - -static std::string gen_chatcmplid() -{ - std::stringstream chatcmplid; - chatcmplid << "chatcmpl-" << random_string(); - return chatcmplid.str(); } \ No newline at end of file From cf747bcdeccaef7ef2128b3e0329da76871f83d7 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 13 Sep 2024 13:27:36 +0200 Subject: [PATCH 05/11] feat: extract output with regexes from LLMs (#3491) * feat: extract output with regexes from LLMs This changeset adds `extract_regex` to the LLM config. It is a list of regexes that can match output and will be used to re-extract text from the LLM output. This is particularly useful for LLMs which output their final results inside tags. Signed-off-by: Ettore Di Giacinto * Add tests, enhance output in case of configuration error Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto --- core/backend/backend_suite_test.go | 13 ++++ core/backend/llm.go | 28 +++++++- core/backend/llm_test.go | 109 +++++++++++++++++++++++++++++ core/config/backend_config.go | 1 + core/http/endpoints/openai/chat.go | 8 +-- 5 files changed, 154 insertions(+), 5 deletions(-) create mode 100644 core/backend/backend_suite_test.go create mode 100644 core/backend/llm_test.go diff --git a/core/backend/backend_suite_test.go b/core/backend/backend_suite_test.go new file mode 100644 index 000000000000..541c91f6be79 --- /dev/null +++ b/core/backend/backend_suite_test.go @@ -0,0 +1,13 @@ +package backend_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestBackend(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Backend test suite") +} diff --git a/core/backend/llm.go b/core/backend/llm.go index 72c4ad9f0380..2b4564a886fe 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -9,6 +9,8 @@ import ( "sync" "unicode/utf8" + "github.com/rs/zerolog/log" + "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/schema" @@ -181,13 +183,37 @@ func Finetune(config config.BackendConfig, input, prediction string) string { mu.Lock() reg, ok := cutstrings[c] if !ok { - cutstrings[c] = regexp.MustCompile(c) + r, err := regexp.Compile(c) + if err != nil { + log.Fatal().Err(err).Msg("failed to compile regex") + } + cutstrings[c] = r reg = cutstrings[c] } mu.Unlock() prediction = reg.ReplaceAllString(prediction, "") } + // extract results from the response which can be for instance inside XML tags + var predResult string + for _, r := range config.ExtractRegex { + mu.Lock() + reg, ok := cutstrings[r] + if !ok { + regex, err := regexp.Compile(r) + if err != nil { + log.Fatal().Err(err).Msg("failed to compile regex") + } + cutstrings[r] = regex + reg = regex + } + mu.Unlock() + predResult += reg.FindString(prediction) + } + if predResult != "" { + prediction = predResult + } + for _, c := range config.TrimSpace { prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c)) } diff --git a/core/backend/llm_test.go b/core/backend/llm_test.go new file mode 100644 index 000000000000..f7630702e2a0 --- /dev/null +++ b/core/backend/llm_test.go @@ -0,0 +1,109 @@ +package backend_test + +import ( + . "github.com/mudler/LocalAI/core/backend" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/schema" + + . "github.com/onsi/ginkgo/v2" + .
"github.com/onsi/gomega" +) + +var _ = Describe("LLM tests", func() { + Context("Finetune LLM output", func() { + var ( + testConfig config.BackendConfig + input string + prediction string + result string + ) + + BeforeEach(func() { + testConfig = config.BackendConfig{ + PredictionOptions: schema.PredictionOptions{ + Echo: false, + }, + LLMConfig: config.LLMConfig{ + Cutstrings: []string{`<.*?>`}, // Example regex for removing XML tags + ExtractRegex: []string{`(.*?)`}, // Example regex to extract from tags + TrimSpace: []string{" ", "\n"}, + TrimSuffix: []string{".", "!"}, + }, + } + }) + + Context("when echo is enabled", func() { + BeforeEach(func() { + testConfig.Echo = true + input = "Hello" + prediction = "World" + }) + + It("should prepend input to prediction", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("HelloWorld")) + }) + }) + + Context("when echo is disabled", func() { + BeforeEach(func() { + testConfig.Echo = false + input = "Hello" + prediction = "World" + }) + + It("should not modify the prediction with input", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("World")) + }) + }) + + Context("when cutstrings regex is applied", func() { + BeforeEach(func() { + input = "" + prediction = "
Hello
World" + }) + + It("should remove substrings matching cutstrings regex", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("Hello World")) + }) + }) + + Context("when extract regex is applied", func() { + BeforeEach(func() { + input = "" + prediction = "42" + }) + + It("should extract substrings matching the extract regex", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("42")) + }) + }) + + Context("when trimming spaces", func() { + BeforeEach(func() { + input = "" + prediction = " Hello World " + }) + + It("should trim spaces from the prediction", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("Hello World")) + }) + }) + + Context("when trimming suffixes", func() { + BeforeEach(func() { + input = "" + prediction = "Hello World." + }) + + It("should trim suffixes from the prediction", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("Hello World")) + }) + }) + }) +}) diff --git a/core/config/backend_config.go b/core/config/backend_config.go index b83e1a986666..027e18a4a599 100644 --- a/core/config/backend_config.go +++ b/core/config/backend_config.go @@ -126,6 +126,7 @@ type LLMConfig struct { Grammar string `yaml:"grammar"` StopWords []string `yaml:"stopwords"` Cutstrings []string `yaml:"cutstrings"` + ExtractRegex []string `yaml:"extract_regex"` TrimSpace []string `yaml:"trimspace"` TrimSuffix []string `yaml:"trimsuffix"` diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index a979b7bca33d..8144bdcd3341 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -68,9 +68,9 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup textContentToReturn = functions.ParseTextContent(result, config.FunctionsConfig) result = functions.CleanupLLMResult(result, config.FunctionsConfig) - results := functions.ParseFunctionCall(result, config.FunctionsConfig) + functionResults := functions.ParseFunctionCall(result, config.FunctionsConfig) log.Debug().Msgf("Text content to return: %s", textContentToReturn) - noActionToRun := len(results) > 0 && results[0].Name == noAction || len(results) == 0 + noActionToRun := len(functionResults) > 0 && functionResults[0].Name == noAction || len(functionResults) == 0 switch { case noActionToRun: @@ -83,7 +83,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup } responses <- initialMessage - result, err := handleQuestion(config, req, ml, startupOptions, results, result, prompt) + result, err := handleQuestion(config, req, ml, startupOptions, functionResults, result, prompt) if err != nil { log.Error().Err(err).Msg("error handling question") return @@ -105,7 +105,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup responses <- resp default: - for i, ss := range results { + for i, ss := range functionResults { name, args := ss.Name, ss.Arguments initialMessage := schema.OpenAIResponse{ From 7fe6d0ad2be25e31fe38439182911d74ec2c569f Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 13 Sep 2024 19:19:26 +0200 Subject: [PATCH 06/11] chore(gosec): fix CI (#3537) downgrade to latest known version of the gosec action Signed-off-by: Ettore Di Giacinto --- .github/workflows/secscan.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/secscan.yaml b/.github/workflows/secscan.yaml index d9743d9e36ae..db9db586947d 100644 --- 
a/.github/workflows/secscan.yaml +++ b/.github/workflows/secscan.yaml @@ -18,7 +18,7 @@ jobs: if: ${{ github.actor != 'dependabot[bot]' }} - name: Run Gosec Security Scanner if: ${{ github.actor != 'dependabot[bot]' }} - uses: securego/gosec@master + uses: securego/gosec@v2.21.0 with: # we let the report trigger content trigger a failure using the GitHub Security features. args: '-no-fail -fmt sarif -out results.sarif ./...' From 5213e79f5c3059616e461176564c97cca481e367 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 13 Sep 2024 19:48:54 +0200 Subject: [PATCH 07/11] models(gallery): add azure_dusk-v0.2-iq-imatrix (#3538) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 4939820df83e..86e8133883a1 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -658,6 +658,23 @@ - filename: Mahou-1.3-llama3.1-8B.Q4_K_M.gguf sha256: 88bfdca2f6077d789d3e0f161d19711aa208a6d9a02cce96a2276c69413b3594 uri: huggingface://mradermacher/Mahou-1.3-llama3.1-8B-GGUF/Mahou-1.3-llama3.1-8B.Q4_K_M.gguf +- !!merge <<: *llama31 + name: "azure_dusk-v0.2-iq-imatrix" + # chatml + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/n3-g_YTk3FY-DBzxXd28E.png + urls: + - https://huggingface.co/Lewdiculous/Azure_Dusk-v0.2-GGUF-IQ-Imatrix + description: | + "Following up on Crimson_Dawn-v0.2 we have Azure_Dusk-v0.2! Training on Mistral-Nemo-Base-2407 this time I've added significantly more data, as well as trained using RSLoRA as opposed to regular LoRA. Another key change is training on ChatML as opposed to Mistral Formatting." + by Author. + overrides: + parameters: + model: Azure_Dusk-v0.2-Q4_K_M-imat.gguf + files: + - filename: Azure_Dusk-v0.2-Q4_K_M-imat.gguf + sha256: c03a670c00976d14c267a0322374ed488b2a5f4790eb509136ca4e75cbc10cf4 + uri: huggingface://Lewdiculous/Azure_Dusk-v0.2-GGUF-IQ-Imatrix/Azure_Dusk-v0.2-Q4_K_M-imat.gguf - &deepseek ## Deepseek url: "github:mudler/LocalAI/gallery/deepseek.yaml@master" From 925315ab5cd4f09e32cba08c5ca23bb9b215ba98 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 13 Sep 2024 19:49:14 +0200 Subject: [PATCH 08/11] models(gallery): add mn-12b-lyra-v4-iq-imatrix (#3539) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 86e8133883a1..b70a9122ac20 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1212,6 +1212,23 @@ - filename: Pantheon-RP-1.6-12b-Nemo-Q4_K_M.gguf sha256: cf3465c183bf4ecbccd1b6b480f687e0160475b04c87e2f1e5ebc8baa0f4c7aa uri: huggingface://bartowski/Pantheon-RP-1.6-12b-Nemo-GGUF/Pantheon-RP-1.6-12b-Nemo-Q4_K_M.gguf +- !!merge <<: *mistral03 + name: "mn-12b-lyra-v4-iq-imatrix" + icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/dVoru83WOpwVjMlgZ_xhA.png + #chatml + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + urls: + - https://huggingface.co/Lewdiculous/MN-12B-Lyra-v4-GGUF-IQ-Imatrix + description: | + A finetune of Mistral Nemo by Sao10K. + Uses the ChatML prompt format. 
+ overrides: + parameters: + model: MN-12B-Lyra-v4-Q4_K_M-imat.gguf + files: + - filename: MN-12B-Lyra-v4-Q4_K_M-imat.gguf + sha256: 1989123481ca1936c8a2cbe278ff5d1d2b0ae63dbdc838bb36a6d7547b8087b3 + uri: huggingface://Lewdiculous/MN-12B-Lyra-v4-GGUF-IQ-Imatrix/MN-12B-Lyra-v4-Q4_K_M-imat.gguf - &mudler ### START mudler's LocalAI specific-models url: "github:mudler/LocalAI/gallery/mudler.yaml@master" From cbfab81c3599c8c2051209234edf0a3259b5efaa Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 13 Sep 2024 21:49:18 +0200 Subject: [PATCH 09/11] models(gallery): add datagemma models (#3540) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index b70a9122ac20..188576b26bc5 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1756,6 +1756,34 @@ - filename: Athena-codegemma-2-2b-it.Q4_K_M.gguf sha256: 59ce17023438b0da603dd211c7d39f78e7acac4108258ac0818a97a4ca7d64e3 uri: huggingface://mradermacher/Athena-codegemma-2-2b-it-GGUF/Athena-codegemma-2-2b-it.Q4_K_M.gguf +- !!merge <<: *gemma + name: "datagemma-rag-27b-it" + urls: + - https://huggingface.co/google/datagemma-rag-27b-it + - https://huggingface.co/bartowski/datagemma-rag-27b-it-GGUF + description: | + DataGemma is a series of fine-tuned Gemma 2 models used to help LLMs access and incorporate reliable public statistical data from Data Commons into their responses. DataGemma RAG is used with Retrieval Augmented Generation, where it is trained to take a user query and generate natural language queries that can be understood by Data Commons' existing natural language interface. More information can be found in this research paper. + overrides: + parameters: + model: datagemma-rag-27b-it-Q4_K_M.gguf + files: + - filename: datagemma-rag-27b-it-Q4_K_M.gguf + sha256: 3dfcf51b05e3f0ab0979ad194de350edea71cb14444efa0a9f2ef5bfc80753f8 + uri: huggingface://bartowski/datagemma-rag-27b-it-GGUF/datagemma-rag-27b-it-Q4_K_M.gguf +- !!merge <<: *gemma + name: "datagemma-rig-27b-it" + urls: + - https://huggingface.co/google/datagemma-rig-27b-it + - https://huggingface.co/bartowski/datagemma-rig-27b-it-GGUF + description: | + DataGemma is a series of fine-tuned Gemma 2 models used to help LLMs access and incorporate reliable public statistical data from Data Commons into their responses. DataGemma RIG is used in the retrieval interleaved generation approach (based off of tool-use approaches), where it is trained to annotate a response with natural language queries to Data Commons’ existing natural language interface wherever there are statistics. More information can be found in this research paper. 
+ overrides: + parameters: + model: datagemma-rig-27b-it-Q4_K_M.gguf + files: + - filename: datagemma-rig-27b-it-Q4_K_M.gguf + sha256: a6738ffbb49b6c46d220e2793df85c0538e9ac72398e32a0914ee5e55c3096ad + uri: huggingface://bartowski/datagemma-rig-27b-it-GGUF/datagemma-rig-27b-it-Q4_K_M.gguf - &llama3 url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master" icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png From 25e7661de268bb9cd31622fdb05052cf26ac4e9f Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 13 Sep 2024 21:52:13 +0200 Subject: [PATCH 10/11] chore(exllama): drop exllama backend (#3536) * chore(exllama): drop exllama backend For polishing and cleaning up it now makes sense to drop exllama, which is completely unmaintained and only supported the llamav1 architecture (nowadays it's superseded by llamav2). Signed-off-by: Ettore Di Giacinto * chore(gosec): fix CI downgrade to latest known version of the gosec action Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto --- Dockerfile | 5 +- Makefile | 13 +- backend/python/exllama/.gitignore | 1 - backend/python/exllama/Makefile | 25 --- backend/python/exllama/README.md | 5 - backend/python/exllama/backend.py | 159 ------------------ backend/python/exllama/install.sh | 13 -- backend/python/exllama/requirements-cpu.txt | 3 - .../python/exllama/requirements-cublas11.txt | 4 - .../python/exllama/requirements-cublas12.txt | 3 - backend/python/exllama/requirements.txt | 4 - backend/python/exllama/run.sh | 7 - backend/python/exllama/test.sh | 6 - 13 files changed, 3 insertions(+), 245 deletions(-) delete mode 100644 backend/python/exllama/.gitignore delete mode 100644 backend/python/exllama/Makefile delete mode 100644 backend/python/exllama/README.md delete mode 100755 backend/python/exllama/backend.py delete mode 100755 backend/python/exllama/install.sh delete mode 100644 backend/python/exllama/requirements-cpu.txt delete mode 100644 backend/python/exllama/requirements-cublas11.txt delete mode 100644 backend/python/exllama/requirements-cublas12.txt delete mode 100644 backend/python/exllama/requirements.txt delete mode 100755 backend/python/exllama/run.sh delete mode 100755 backend/python/exllama/test.sh diff --git a/Dockerfile b/Dockerfile index b86cc7061d8e..f08cb9a03b2a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ ARG TARGETARCH ARG TARGETVARIANT ENV DEBIAN_FRONTEND=noninteractive -ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh" +ENV
EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh" RUN apt-get update && \ @@ -418,9 +418,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG ; fi && \ if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ make -C backend/python/transformers-musicgen \ - ; fi && \ - if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ - make -C backend/python/exllama \ ; fi RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ diff --git a/Makefile b/Makefile index 3d9ea5921d6b..a3f0ffd05c38 100644 --- a/Makefile +++ b/Makefile @@ -534,10 +534,10 @@ protogen-go-clean: $(RM) bin/* .PHONY: protogen-python -protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen +protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen .PHONY: protogen-python-clean -protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean +protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean .PHONY: autogptq-protogen autogptq-protogen: @@ -571,14 +571,6 @@ diffusers-protogen: diffusers-protogen-clean: $(MAKE) -C backend/python/diffusers protogen-clean -.PHONY: exllama-protogen -exllama-protogen: - $(MAKE) -C backend/python/exllama protogen - -.PHONY: exllama-protogen-clean -exllama-protogen-clean: - $(MAKE) -C backend/python/exllama protogen-clean - .PHONY: exllama2-protogen exllama2-protogen: $(MAKE) -C backend/python/exllama2 protogen @@ -675,7 +667,6 @@ prepare-extra-conda-environments: protogen-python $(MAKE) -C backend/python/parler-tts $(MAKE) -C 
backend/python/vall-e-x $(MAKE) -C backend/python/openvoice - $(MAKE) -C backend/python/exllama $(MAKE) -C backend/python/exllama2 prepare-test-extra: protogen-python diff --git a/backend/python/exllama/.gitignore b/backend/python/exllama/.gitignore deleted file mode 100644 index 1d3a06547c70..000000000000 --- a/backend/python/exllama/.gitignore +++ /dev/null @@ -1 +0,0 @@ -source \ No newline at end of file diff --git a/backend/python/exllama/Makefile b/backend/python/exllama/Makefile deleted file mode 100644 index e6a678810c3f..000000000000 --- a/backend/python/exllama/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -export CONDA_ENV_PATH = "exllama.yml" - -.PHONY: exllama -exllama: protogen - bash install.sh ${CONDA_ENV_PATH} - -.PHONY: run -run: protogen - @echo "Running exllama..." - bash run.sh - @echo "exllama run." - -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - -.PHONY: protogen-clean -protogen-clean: - $(RM) backend_pb2_grpc.py backend_pb2.py - -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto - -.PHONY: clean -clean: protogen-clean - $(RM) -r venv source __pycache__ \ No newline at end of file diff --git a/backend/python/exllama/README.md b/backend/python/exllama/README.md deleted file mode 100644 index f9ed5e9fbdb7..000000000000 --- a/backend/python/exllama/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Creating a separate environment for the exllama project - -``` -make exllama -``` \ No newline at end of file diff --git a/backend/python/exllama/backend.py b/backend/python/exllama/backend.py deleted file mode 100755 index 58d1392c5ee4..000000000000 --- a/backend/python/exllama/backend.py +++ /dev/null @@ -1,159 +0,0 @@ -#!/usr/bin/env python3 -import grpc -from concurrent import futures -import time -import backend_pb2 -import backend_pb2_grpc -import argparse -import signal -import sys -import os, glob - -from pathlib import Path -import torch -import torch.nn.functional as F -from torch import version as torch_version - -from source.tokenizer import ExLlamaTokenizer -from source.generator import ExLlamaGenerator -from source.model import ExLlama, ExLlamaCache, ExLlamaConfig - -_ONE_DAY_IN_SECONDS = 60 * 60 * 24 - -# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 -MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) - -# Implement the BackendServicer class with the service methods -class BackendServicer(backend_pb2_grpc.BackendServicer): - def generate(self,prompt, max_new_tokens): - self.generator.end_beam_search() - - # Tokenizing the input - ids = self.generator.tokenizer.encode(prompt) - - self.generator.gen_begin_reuse(ids) - initial_len = self.generator.sequence[0].shape[0] - has_leading_space = False - decoded_text = '' - for i in range(max_new_tokens): - token = self.generator.gen_single_token() - if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): - has_leading_space = True - - decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) - if has_leading_space: - decoded_text = ' ' + decoded_text - - if token.item() == self.generator.tokenizer.eos_token_id: - break - return decoded_text - def Health(self, request, context): - return backend_pb2.Reply(message=bytes("OK", 'utf-8')) - def LoadModel(self, request, context): - try: - # https://github.com/turboderp/exllama/blob/master/example_cfg.py - model_directory = request.ModelFile - - # Locate files we need within that 
directory - tokenizer_path = os.path.join(model_directory, "tokenizer.model") - model_config_path = os.path.join(model_directory, "config.json") - st_pattern = os.path.join(model_directory, "*.safetensors") - model_path = glob.glob(st_pattern)[0] - - # Create config, model, tokenizer and generator - - config = ExLlamaConfig(model_config_path) # create config from config.json - config.model_path = model_path # supply path to model weights file - if (request.ContextSize): - config.max_seq_len = request.ContextSize # override max sequence length - config.max_attention_size = request.ContextSize**2 # Should be set to context_size^2. - # https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163 - - # Set Rope scaling. - if (request.RopeFreqScale): - # Alpha value for Rope scaling. - # Higher value increases context but adds perplexity. - # alpha_value and compress_pos_emb are mutually exclusive. - # https://github.com/turboderp/exllama/issues/115 - config.alpha_value = request.RopeFreqScale - config.calculate_rotary_embedding_base() - - model = ExLlama(config) # create ExLlama instance and load the weights - tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file - - cache = ExLlamaCache(model, batch_size = 2) # create cache for inference - generator = ExLlamaGenerator(model, tokenizer, cache) # create generator - - self.generator= generator - self.model = model - self.tokenizer = tokenizer - self.cache = cache - except Exception as err: - return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") - return backend_pb2.Result(message="Model loaded successfully", success=True) - - def Predict(self, request, context): - penalty = 1.15 - if request.Penalty != 0.0: - penalty = request.Penalty - self.generator.settings.token_repetition_penalty_max = penalty - self.generator.settings.temperature = request.Temperature - self.generator.settings.top_k = request.TopK - self.generator.settings.top_p = request.TopP - - tokens = 512 - if request.Tokens != 0: - tokens = request.Tokens - - if self.cache.batch_size == 1: - del self.cache - self.cache = ExLlamaCache(self.model, batch_size=2) - self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache) - - t = self.generate(request.Prompt, tokens) - - # Remove prompt from response if present - if request.Prompt in t: - t = t.replace(request.Prompt, "") - - return backend_pb2.Result(message=bytes(t, encoding='utf-8')) - - def PredictStream(self, request, context): - # Implement PredictStream RPC - #for reply in some_data_generator(): - # yield reply - # Not implemented yet - return self.Predict(request, context) - - -def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) - backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) - server.add_insecure_port(address) - server.start() - print("Server started. Listening on: " + address, file=sys.stderr) - - # Define the signal handler function - def signal_handler(sig, frame): - print("Received termination signal. 
Shutting down...") - server.stop(0) - sys.exit(0) - - # Set the signal handlers for SIGINT and SIGTERM - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - - try: - while True: - time.sleep(_ONE_DAY_IN_SECONDS) - except KeyboardInterrupt: - server.stop(0) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run the gRPC server.") - parser.add_argument( - "--addr", default="localhost:50051", help="The address to bind the server to." - ) - args = parser.parse_args() - - serve(args.addr) \ No newline at end of file diff --git a/backend/python/exllama/install.sh b/backend/python/exllama/install.sh deleted file mode 100755 index d33c435600d0..000000000000 --- a/backend/python/exllama/install.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -e - -LIMIT_TARGETS="cublas" - -source $(dirname $0)/../common/libbackend.sh - -installRequirements - -git clone https://github.com/turboderp/exllama $MY_DIR/source -uv pip install ${BUILD_ISOLATION_FLAG} --requirement ${MY_DIR}/source/requirements.txt - -cp -v ./*py $MY_DIR/source/ diff --git a/backend/python/exllama/requirements-cpu.txt b/backend/python/exllama/requirements-cpu.txt deleted file mode 100644 index bbcdc8cda704..000000000000 --- a/backend/python/exllama/requirements-cpu.txt +++ /dev/null @@ -1,3 +0,0 @@ -transformers -accelerate -torch \ No newline at end of file diff --git a/backend/python/exllama/requirements-cublas11.txt b/backend/python/exllama/requirements-cublas11.txt deleted file mode 100644 index 1dfb5b9854d2..000000000000 --- a/backend/python/exllama/requirements-cublas11.txt +++ /dev/null @@ -1,4 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cu118 -torch -transformers -accelerate \ No newline at end of file diff --git a/backend/python/exllama/requirements-cublas12.txt b/backend/python/exllama/requirements-cublas12.txt deleted file mode 100644 index 1ec544cd1438..000000000000 --- a/backend/python/exllama/requirements-cublas12.txt +++ /dev/null @@ -1,3 +0,0 @@ -torch -transformers -accelerate \ No newline at end of file diff --git a/backend/python/exllama/requirements.txt b/backend/python/exllama/requirements.txt deleted file mode 100644 index b9c192d5d304..000000000000 --- a/backend/python/exllama/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -grpcio==1.66.1 -protobuf -certifi -setuptools \ No newline at end of file diff --git a/backend/python/exllama/run.sh b/backend/python/exllama/run.sh deleted file mode 100755 index 63119689d27a..000000000000 --- a/backend/python/exllama/run.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -LIMIT_TARGETS="cublas" -BACKEND_FILE="${MY_DIR}/source/backend.py" - -source $(dirname $0)/../common/libbackend.sh - -startBackend $@ \ No newline at end of file diff --git a/backend/python/exllama/test.sh b/backend/python/exllama/test.sh deleted file mode 100755 index 6940b0661df2..000000000000 --- a/backend/python/exllama/test.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -e - -source $(dirname $0)/../common/libbackend.sh - -runUnittests From cabb1602e84535e1957412d63d5f2a3ad80c589b Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 14 Sep 2024 00:05:38 +0200 Subject: [PATCH 11/11] chore: :arrow_up: Update ggerganov/llama.cpp to `feff4aa8461da7c432d144c11da4802e41fef3cf` (#3542) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler 
<2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a3f0ffd05c38..9ba109b03bbb 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=e6b7801bd189d102d901d3e72035611a25456ef1 +CPPLLAMA_VERSION?=feff4aa8461da7c432d144c11da4802e41fef3cf # go-rwkv version RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp