diff --git a/.github/workflows/secscan.yaml b/.github/workflows/secscan.yaml index d9743d9e36ae..db9db586947d 100644 --- a/.github/workflows/secscan.yaml +++ b/.github/workflows/secscan.yaml @@ -18,7 +18,7 @@ jobs: if: ${{ github.actor != 'dependabot[bot]' }} - name: Run Gosec Security Scanner if: ${{ github.actor != 'dependabot[bot]' }} - uses: securego/gosec@master + uses: securego/gosec@v2.21.0 with: # we let the report trigger content trigger a failure using the GitHub Security features. args: '-no-fail -fmt sarif -out results.sarif ./...' diff --git a/Dockerfile b/Dockerfile index b86cc7061d8e..f08cb9a03b2a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ ARG TARGETARCH ARG TARGETVARIANT ENV DEBIAN_FRONTEND=noninteractive -ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh" +ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh" RUN apt-get update && \ @@ -418,9 +418,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG ; fi && \ if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ make -C backend/python/transformers-musicgen \ - ; fi && \ - if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ - make -C backend/python/exllama \ ; fi RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ diff --git a/Makefile b/Makefile index fe05dc1a5741..9ba109b03bbb 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=815b1fb20a53e439882171757825bacb1350de04 +CPPLLAMA_VERSION?=feff4aa8461da7c432d144c11da4802e41fef3cf # go-rwkv version RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp @@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp -WHISPER_CPP_VERSION?=5caa19240d55bfd6ee316d50fbad32c6e9c39528 +WHISPER_CPP_VERSION?=a551933542d956ae84634937acd2942eb40efaaf # bert.cpp version BERT_REPO?=https://github.com/go-skynet/go-bert.cpp @@ -534,10 +534,10 @@ protogen-go-clean: $(RM) bin/* .PHONY: protogen-python -protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen +protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen .PHONY: protogen-python-clean -protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean +protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean .PHONY: autogptq-protogen autogptq-protogen: @@ -571,14 +571,6 @@ diffusers-protogen: diffusers-protogen-clean: $(MAKE) -C backend/python/diffusers protogen-clean -.PHONY: exllama-protogen -exllama-protogen: - $(MAKE) -C backend/python/exllama protogen - -.PHONY: exllama-protogen-clean -exllama-protogen-clean: - $(MAKE) -C backend/python/exllama protogen-clean - .PHONY: exllama2-protogen exllama2-protogen: $(MAKE) -C backend/python/exllama2 protogen @@ -675,7 +667,6 @@ prepare-extra-conda-environments: protogen-python $(MAKE) -C backend/python/parler-tts $(MAKE) -C backend/python/vall-e-x $(MAKE) -C backend/python/openvoice - $(MAKE) -C backend/python/exllama $(MAKE) -C backend/python/exllama2 prepare-test-extra: protogen-python diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index e1b6f868b2e8..a46b4ee0a335 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -17,11 +17,10 @@ #include "common.h" #include "json.hpp" #include "llama.h" -#include "grammar-parser.h" #include "backend.pb.h" #include "backend.grpc.pb.h" #include "utils.hpp" - +#include "sampling.h" // include std::regex #include #include @@ -203,8 +202,8 @@ struct llama_client_slot std::string stopping_word; // sampling - struct llama_sampling_params sparams; - llama_sampling_context *ctx_sampling = nullptr; + struct gpt_sampler_params sparams; + gpt_sampler *ctx_sampling = nullptr; int32_t ga_i = 0; // group-attention state int32_t ga_n = 1; // group-attention factor @@ -619,7 +618,7 @@ struct llama_server_context bool launch_slot_with_data(llama_client_slot* &slot, json data) { slot_params default_params; - llama_sampling_params default_sparams; + gpt_sampler_params default_sparams; slot->params.stream = json_value(data, "stream", false); slot->params.cache_prompt = json_value(data, "cache_prompt", false); @@ -628,7 +627,7 @@ struct llama_server_context slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p); slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); - slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); + slot->sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); @@ -641,7 +640,7 @@ struct llama_server_context slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); - slot->params.seed = json_value(data, "seed", default_params.seed); + slot->sparams.seed = json_value(data, "seed", default_sparams.seed); slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); @@ -665,6 +664,7 @@ struct llama_server_context slot->params.input_prefix = ""; } + if (data.count("input_suffix") != 0) { slot->params.input_suffix = data["input_suffix"]; @@ -683,6 +683,10 @@ struct llama_server_context slot->prompt = ""; } + if (json_value(data, "ignore_eos", false)) { + slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY}); + } + /* slot->sparams.penalty_prompt_tokens.clear(); slot->sparams.use_penalty_prompt_tokens = false; const auto &penalty_prompt = data.find("penalty_prompt"); @@ -718,14 +722,10 @@ struct llama_server_context slot->sparams.use_penalty_prompt_tokens = true; } } + */ slot->sparams.logit_bias.clear(); - if (json_value(data, "ignore_eos", false)) - { - slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY; - } - const auto &logit_bias = data.find("logit_bias"); if (logit_bias != data.end() && logit_bias->is_array()) { @@ -753,7 +753,7 @@ struct llama_server_context llama_token tok = el[0].get(); if (tok >= 0 && tok < n_vocab) { - slot->sparams.logit_bias[tok] = bias; + slot->sparams.logit_bias.push_back({tok, bias}); } } else if (el[0].is_string()) @@ -761,13 +761,13 @@ struct llama_server_context auto toks = llama_tokenize(model, el[0].get(), false); for (auto tok : toks) { - slot->sparams.logit_bias[tok] = bias; + slot->sparams.logit_bias.push_back({tok, bias}); } } } } } - + slot->params.antiprompt.clear(); const auto &stop = data.find("stop"); @@ -781,24 +781,22 @@ struct llama_server_context } } } - - const auto &samplers_sequence = data.find("samplers"); - if (samplers_sequence != data.end() && samplers_sequence->is_array()) - { + + const auto & samplers = data.find("samplers"); + if (samplers != data.end() && samplers->is_array()) { std::vector sampler_names; - for (const auto &sampler_name : *samplers_sequence) - { - if (sampler_name.is_string()) - { - sampler_names.emplace_back(sampler_name); + for (const auto & name : *samplers) { + if (name.is_string()) { + sampler_names.emplace_back(name); + } } - } - slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false); + slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false); } else { - slot->sparams.samplers_sequence = default_sparams.samplers_sequence; + slot->sparams.samplers = default_sparams.samplers; } + if (multimodal) { @@ -875,10 +873,10 @@ struct llama_server_context if (slot->ctx_sampling != nullptr) { - llama_sampling_free(slot->ctx_sampling); + gpt_sampler_free(slot->ctx_sampling); } - slot->ctx_sampling = llama_sampling_init(slot->sparams); - llama_set_rng_seed(ctx, slot->params.seed); + slot->ctx_sampling = gpt_sampler_init(model, slot->sparams); + //llama_set_rng_seed(ctx, slot->params.seed); slot->command = LOAD_PROMPT; all_slots_are_idle = false; @@ -888,7 +886,7 @@ struct llama_server_context {"task_id", slot->task_id}, }); - LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str()); + // LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str()); return true; } @@ -1006,11 +1004,13 @@ struct llama_server_context slot.generated_text += token_str; slot.has_next_token = true; +/* if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) { // we can change penalty_prompt_tokens because it is always created from scratch each request slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok); } + */ // check if there is incomplete UTF-8 character at the end bool incomplete = false; @@ -1144,13 +1144,11 @@ struct llama_server_context json get_formated_generation(llama_client_slot &slot) { - const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); - const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && - eos_bias->second < 0.0f && std::isinf(eos_bias->second); - std::vector samplers_sequence; - for (const auto &sampler_type : slot.sparams.samplers_sequence) + std::vector samplers; + samplers.reserve(slot.sparams.samplers.size()); + for (const auto & sampler : slot.sparams.samplers) { - samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type)); + samplers.emplace_back(gpt_sampler_type_to_str(sampler)); } return json { @@ -1165,13 +1163,11 @@ struct llama_server_context {"top_p", slot.sparams.top_p}, {"min_p", slot.sparams.min_p}, {"tfs_z", slot.sparams.tfs_z}, - {"typical_p", slot.sparams.typical_p}, + {"typical_p", slot.sparams.typ_p}, {"repeat_last_n", slot.sparams.penalty_last_n}, {"repeat_penalty", slot.sparams.penalty_repeat}, {"presence_penalty", slot.sparams.penalty_present}, {"frequency_penalty", slot.sparams.penalty_freq}, - {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens}, - {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens}, {"mirostat", slot.sparams.mirostat}, {"mirostat_tau", slot.sparams.mirostat_tau}, {"mirostat_eta", slot.sparams.mirostat_eta}, @@ -1179,13 +1175,13 @@ struct llama_server_context {"stop", slot.params.antiprompt}, {"n_predict", slot.params.n_predict}, {"n_keep", params.n_keep}, - {"ignore_eos", ignore_eos}, + {"ignore_eos", slot.sparams.ignore_eos}, {"stream", slot.params.stream}, - {"logit_bias", slot.sparams.logit_bias}, + // {"logit_bias", slot.sparams.logit_bias}, {"n_probs", slot.sparams.n_probs}, {"min_keep", slot.sparams.min_keep}, {"grammar", slot.sparams.grammar}, - {"samplers", samplers_sequence} + {"samplers", samplers} }; } @@ -1714,7 +1710,7 @@ struct llama_server_context if (!slot.params.cache_prompt) { - llama_sampling_reset(slot.ctx_sampling); + gpt_sampler_reset(slot.ctx_sampling); slot.n_past = 0; slot.n_past_se = 0; @@ -1726,7 +1722,7 @@ struct llama_server_context // push the prompt into the sampling context (do not apply grammar) for (auto &token : prompt_tokens) { - llama_sampling_accept(slot.ctx_sampling, ctx, token, false); + gpt_sampler_accept(slot.ctx_sampling, token, false); } slot.n_past = common_part(slot.cache_tokens, prompt_tokens); @@ -1934,9 +1930,9 @@ struct llama_server_context } completion_token_output result; - const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i); + const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i); - llama_sampling_accept(slot.ctx_sampling, ctx, id, true); + gpt_sampler_accept(slot.ctx_sampling, id, true); slot.n_decoded += 1; if (slot.n_decoded == 1) @@ -1946,19 +1942,14 @@ struct llama_server_context metrics.on_prompt_eval(slot); } - llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false }; result.tok = id; + const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling); - const int32_t n_probs = slot.sparams.n_probs; - if (slot.sparams.temp <= 0 && n_probs > 0) - { - // for llama_sample_token_greedy we need to sort candidates - llama_sample_softmax(ctx, &cur_p); - } - - for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) - { - result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); + for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) { + result.probs.push_back({ + cur_p->data[i].id, + i >= cur_p->size ? 0.0f : cur_p->data[i].p, + }); } if (!process_token(result, slot)) diff --git a/backend/cpp/llama/patches/01-llava.patch b/backend/cpp/llama/patches/01-llava.patch new file mode 100644 index 000000000000..fa122da257cd --- /dev/null +++ b/backend/cpp/llama/patches/01-llava.patch @@ -0,0 +1,13 @@ +diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp +index 342042ff..224db9b5 100644 +--- a/examples/llava/clip.cpp ++++ b/examples/llava/clip.cpp +@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima + struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); + int* patches_data = (int*)malloc(ggml_nbytes(patches)); + for (int i = 0; i < num_patches; i++) { +- patches_data[i] = i + 1; ++ patches_data[i] = i; + } + ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); + free(patches_data); \ No newline at end of file diff --git a/backend/cpp/llama/prepare.sh b/backend/cpp/llama/prepare.sh index 6c00f27caa38..4c8393b908d7 100644 --- a/backend/cpp/llama/prepare.sh +++ b/backend/cpp/llama/prepare.sh @@ -1,5 +1,12 @@ #!/bin/bash +## Patches +## Apply patches from the `patches` directory +for patch in $(ls patches); do + echo "Applying patch $patch" + patch -d llama.cpp/ -p1 < patches/$patch +done + cp -r CMakeLists.txt llama.cpp/examples/grpc-server/ cp -r grpc-server.cpp llama.cpp/examples/grpc-server/ cp -rfv json.hpp llama.cpp/examples/grpc-server/ diff --git a/backend/cpp/llama/utils.hpp b/backend/cpp/llama/utils.hpp index c5dafbf0f9ce..198b6f265957 100644 --- a/backend/cpp/llama/utils.hpp +++ b/backend/cpp/llama/utils.hpp @@ -480,31 +480,4 @@ static inline std::vector base64_decode(const std::string & encoded_str } return ret; -} - -// -// random string / id -// - -static std::string random_string() -{ - static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - - std::random_device rd; - std::mt19937 generator(rd()); - - std::string result(32, ' '); - - for (int i = 0; i < 32; ++i) { - result[i] = str[generator() % str.size()]; - } - - return result; -} - -static std::string gen_chatcmplid() -{ - std::stringstream chatcmplid; - chatcmplid << "chatcmpl-" << random_string(); - return chatcmplid.str(); } \ No newline at end of file diff --git a/backend/python/exllama/.gitignore b/backend/python/exllama/.gitignore deleted file mode 100644 index 1d3a06547c70..000000000000 --- a/backend/python/exllama/.gitignore +++ /dev/null @@ -1 +0,0 @@ -source \ No newline at end of file diff --git a/backend/python/exllama/Makefile b/backend/python/exllama/Makefile deleted file mode 100644 index e6a678810c3f..000000000000 --- a/backend/python/exllama/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -export CONDA_ENV_PATH = "exllama.yml" - -.PHONY: exllama -exllama: protogen - bash install.sh ${CONDA_ENV_PATH} - -.PHONY: run -run: protogen - @echo "Running exllama..." - bash run.sh - @echo "exllama run." - -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - -.PHONY: protogen-clean -protogen-clean: - $(RM) backend_pb2_grpc.py backend_pb2.py - -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto - -.PHONY: clean -clean: protogen-clean - $(RM) -r venv source __pycache__ \ No newline at end of file diff --git a/backend/python/exllama/README.md b/backend/python/exllama/README.md deleted file mode 100644 index f9ed5e9fbdb7..000000000000 --- a/backend/python/exllama/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Creating a separate environment for the exllama project - -``` -make exllama -``` \ No newline at end of file diff --git a/backend/python/exllama/backend.py b/backend/python/exllama/backend.py deleted file mode 100755 index 58d1392c5ee4..000000000000 --- a/backend/python/exllama/backend.py +++ /dev/null @@ -1,159 +0,0 @@ -#!/usr/bin/env python3 -import grpc -from concurrent import futures -import time -import backend_pb2 -import backend_pb2_grpc -import argparse -import signal -import sys -import os, glob - -from pathlib import Path -import torch -import torch.nn.functional as F -from torch import version as torch_version - -from source.tokenizer import ExLlamaTokenizer -from source.generator import ExLlamaGenerator -from source.model import ExLlama, ExLlamaCache, ExLlamaConfig - -_ONE_DAY_IN_SECONDS = 60 * 60 * 24 - -# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 -MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) - -# Implement the BackendServicer class with the service methods -class BackendServicer(backend_pb2_grpc.BackendServicer): - def generate(self,prompt, max_new_tokens): - self.generator.end_beam_search() - - # Tokenizing the input - ids = self.generator.tokenizer.encode(prompt) - - self.generator.gen_begin_reuse(ids) - initial_len = self.generator.sequence[0].shape[0] - has_leading_space = False - decoded_text = '' - for i in range(max_new_tokens): - token = self.generator.gen_single_token() - if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): - has_leading_space = True - - decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) - if has_leading_space: - decoded_text = ' ' + decoded_text - - if token.item() == self.generator.tokenizer.eos_token_id: - break - return decoded_text - def Health(self, request, context): - return backend_pb2.Reply(message=bytes("OK", 'utf-8')) - def LoadModel(self, request, context): - try: - # https://github.com/turboderp/exllama/blob/master/example_cfg.py - model_directory = request.ModelFile - - # Locate files we need within that directory - tokenizer_path = os.path.join(model_directory, "tokenizer.model") - model_config_path = os.path.join(model_directory, "config.json") - st_pattern = os.path.join(model_directory, "*.safetensors") - model_path = glob.glob(st_pattern)[0] - - # Create config, model, tokenizer and generator - - config = ExLlamaConfig(model_config_path) # create config from config.json - config.model_path = model_path # supply path to model weights file - if (request.ContextSize): - config.max_seq_len = request.ContextSize # override max sequence length - config.max_attention_size = request.ContextSize**2 # Should be set to context_size^2. - # https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163 - - # Set Rope scaling. - if (request.RopeFreqScale): - # Alpha value for Rope scaling. - # Higher value increases context but adds perplexity. - # alpha_value and compress_pos_emb are mutually exclusive. - # https://github.com/turboderp/exllama/issues/115 - config.alpha_value = request.RopeFreqScale - config.calculate_rotary_embedding_base() - - model = ExLlama(config) # create ExLlama instance and load the weights - tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file - - cache = ExLlamaCache(model, batch_size = 2) # create cache for inference - generator = ExLlamaGenerator(model, tokenizer, cache) # create generator - - self.generator= generator - self.model = model - self.tokenizer = tokenizer - self.cache = cache - except Exception as err: - return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") - return backend_pb2.Result(message="Model loaded successfully", success=True) - - def Predict(self, request, context): - penalty = 1.15 - if request.Penalty != 0.0: - penalty = request.Penalty - self.generator.settings.token_repetition_penalty_max = penalty - self.generator.settings.temperature = request.Temperature - self.generator.settings.top_k = request.TopK - self.generator.settings.top_p = request.TopP - - tokens = 512 - if request.Tokens != 0: - tokens = request.Tokens - - if self.cache.batch_size == 1: - del self.cache - self.cache = ExLlamaCache(self.model, batch_size=2) - self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache) - - t = self.generate(request.Prompt, tokens) - - # Remove prompt from response if present - if request.Prompt in t: - t = t.replace(request.Prompt, "") - - return backend_pb2.Result(message=bytes(t, encoding='utf-8')) - - def PredictStream(self, request, context): - # Implement PredictStream RPC - #for reply in some_data_generator(): - # yield reply - # Not implemented yet - return self.Predict(request, context) - - -def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) - backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) - server.add_insecure_port(address) - server.start() - print("Server started. Listening on: " + address, file=sys.stderr) - - # Define the signal handler function - def signal_handler(sig, frame): - print("Received termination signal. Shutting down...") - server.stop(0) - sys.exit(0) - - # Set the signal handlers for SIGINT and SIGTERM - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - - try: - while True: - time.sleep(_ONE_DAY_IN_SECONDS) - except KeyboardInterrupt: - server.stop(0) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run the gRPC server.") - parser.add_argument( - "--addr", default="localhost:50051", help="The address to bind the server to." - ) - args = parser.parse_args() - - serve(args.addr) \ No newline at end of file diff --git a/backend/python/exllama/install.sh b/backend/python/exllama/install.sh deleted file mode 100755 index d33c435600d0..000000000000 --- a/backend/python/exllama/install.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -e - -LIMIT_TARGETS="cublas" - -source $(dirname $0)/../common/libbackend.sh - -installRequirements - -git clone https://github.com/turboderp/exllama $MY_DIR/source -uv pip install ${BUILD_ISOLATION_FLAG} --requirement ${MY_DIR}/source/requirements.txt - -cp -v ./*py $MY_DIR/source/ diff --git a/backend/python/exllama/requirements-cpu.txt b/backend/python/exllama/requirements-cpu.txt deleted file mode 100644 index bbcdc8cda704..000000000000 --- a/backend/python/exllama/requirements-cpu.txt +++ /dev/null @@ -1,3 +0,0 @@ -transformers -accelerate -torch \ No newline at end of file diff --git a/backend/python/exllama/requirements-cublas11.txt b/backend/python/exllama/requirements-cublas11.txt deleted file mode 100644 index 1dfb5b9854d2..000000000000 --- a/backend/python/exllama/requirements-cublas11.txt +++ /dev/null @@ -1,4 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cu118 -torch -transformers -accelerate \ No newline at end of file diff --git a/backend/python/exllama/requirements-cublas12.txt b/backend/python/exllama/requirements-cublas12.txt deleted file mode 100644 index 1ec544cd1438..000000000000 --- a/backend/python/exllama/requirements-cublas12.txt +++ /dev/null @@ -1,3 +0,0 @@ -torch -transformers -accelerate \ No newline at end of file diff --git a/backend/python/exllama/requirements.txt b/backend/python/exllama/requirements.txt deleted file mode 100644 index b9c192d5d304..000000000000 --- a/backend/python/exllama/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -grpcio==1.66.1 -protobuf -certifi -setuptools \ No newline at end of file diff --git a/backend/python/exllama/run.sh b/backend/python/exllama/run.sh deleted file mode 100755 index 63119689d27a..000000000000 --- a/backend/python/exllama/run.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -LIMIT_TARGETS="cublas" -BACKEND_FILE="${MY_DIR}/source/backend.py" - -source $(dirname $0)/../common/libbackend.sh - -startBackend $@ \ No newline at end of file diff --git a/backend/python/exllama/test.sh b/backend/python/exllama/test.sh deleted file mode 100755 index 6940b0661df2..000000000000 --- a/backend/python/exllama/test.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -e - -source $(dirname $0)/../common/libbackend.sh - -runUnittests diff --git a/core/backend/backend_suite_test.go b/core/backend/backend_suite_test.go new file mode 100644 index 000000000000..541c91f6be79 --- /dev/null +++ b/core/backend/backend_suite_test.go @@ -0,0 +1,13 @@ +package backend_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestBackend(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Backend test suite") +} diff --git a/core/backend/llm.go b/core/backend/llm.go index 72c4ad9f0380..2b4564a886fe 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -9,6 +9,8 @@ import ( "sync" "unicode/utf8" + "github.com/rs/zerolog/log" + "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/schema" @@ -181,13 +183,37 @@ func Finetune(config config.BackendConfig, input, prediction string) string { mu.Lock() reg, ok := cutstrings[c] if !ok { - cutstrings[c] = regexp.MustCompile(c) + r, err := regexp.Compile(c) + if err != nil { + log.Fatal().Err(err).Msg("failed to compile regex") + } + cutstrings[c] = r reg = cutstrings[c] } mu.Unlock() prediction = reg.ReplaceAllString(prediction, "") } + // extract results from the response which can be for instance inside XML tags + var predResult string + for _, r := range config.ExtractRegex { + mu.Lock() + reg, ok := cutstrings[r] + if !ok { + regex, err := regexp.Compile(r) + if err != nil { + log.Fatal().Err(err).Msg("failed to compile regex") + } + cutstrings[r] = regex + reg = regex + } + mu.Unlock() + predResult += reg.FindString(prediction) + } + if predResult != "" { + prediction = predResult + } + for _, c := range config.TrimSpace { prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c)) } diff --git a/core/backend/llm_test.go b/core/backend/llm_test.go new file mode 100644 index 000000000000..f7630702e2a0 --- /dev/null +++ b/core/backend/llm_test.go @@ -0,0 +1,109 @@ +package backend_test + +import ( + . "github.com/mudler/LocalAI/core/backend" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/schema" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("LLM tests", func() { + Context("Finetune LLM output", func() { + var ( + testConfig config.BackendConfig + input string + prediction string + result string + ) + + BeforeEach(func() { + testConfig = config.BackendConfig{ + PredictionOptions: schema.PredictionOptions{ + Echo: false, + }, + LLMConfig: config.LLMConfig{ + Cutstrings: []string{`<.*?>`}, // Example regex for removing XML tags + ExtractRegex: []string{`(.*?)`}, // Example regex to extract from tags + TrimSpace: []string{" ", "\n"}, + TrimSuffix: []string{".", "!"}, + }, + } + }) + + Context("when echo is enabled", func() { + BeforeEach(func() { + testConfig.Echo = true + input = "Hello" + prediction = "World" + }) + + It("should prepend input to prediction", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("HelloWorld")) + }) + }) + + Context("when echo is disabled", func() { + BeforeEach(func() { + testConfig.Echo = false + input = "Hello" + prediction = "World" + }) + + It("should not modify the prediction with input", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("World")) + }) + }) + + Context("when cutstrings regex is applied", func() { + BeforeEach(func() { + input = "" + prediction = "
Hello
World" + }) + + It("should remove substrings matching cutstrings regex", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("Hello World")) + }) + }) + + Context("when extract regex is applied", func() { + BeforeEach(func() { + input = "" + prediction = "42" + }) + + It("should extract substrings matching the extract regex", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("42")) + }) + }) + + Context("when trimming spaces", func() { + BeforeEach(func() { + input = "" + prediction = " Hello World " + }) + + It("should trim spaces from the prediction", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("Hello World")) + }) + }) + + Context("when trimming suffixes", func() { + BeforeEach(func() { + input = "" + prediction = "Hello World." + }) + + It("should trim suffixes from the prediction", func() { + result = Finetune(testConfig, input, prediction) + Expect(result).To(Equal("Hello World")) + }) + }) + }) +}) diff --git a/core/config/backend_config.go b/core/config/backend_config.go index b83e1a986666..027e18a4a599 100644 --- a/core/config/backend_config.go +++ b/core/config/backend_config.go @@ -126,6 +126,7 @@ type LLMConfig struct { Grammar string `yaml:"grammar"` StopWords []string `yaml:"stopwords"` Cutstrings []string `yaml:"cutstrings"` + ExtractRegex []string `yaml:"extract_regex"` TrimSpace []string `yaml:"trimspace"` TrimSuffix []string `yaml:"trimsuffix"` diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index a979b7bca33d..8144bdcd3341 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -68,9 +68,9 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup textContentToReturn = functions.ParseTextContent(result, config.FunctionsConfig) result = functions.CleanupLLMResult(result, config.FunctionsConfig) - results := functions.ParseFunctionCall(result, config.FunctionsConfig) + functionResults := functions.ParseFunctionCall(result, config.FunctionsConfig) log.Debug().Msgf("Text content to return: %s", textContentToReturn) - noActionToRun := len(results) > 0 && results[0].Name == noAction || len(results) == 0 + noActionToRun := len(functionResults) > 0 && functionResults[0].Name == noAction || len(functionResults) == 0 switch { case noActionToRun: @@ -83,7 +83,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup } responses <- initialMessage - result, err := handleQuestion(config, req, ml, startupOptions, results, result, prompt) + result, err := handleQuestion(config, req, ml, startupOptions, functionResults, result, prompt) if err != nil { log.Error().Err(err).Msg("error handling question") return @@ -105,7 +105,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup responses <- resp default: - for i, ss := range results { + for i, ss := range functionResults { name, args := ss.Name, ss.Arguments initialMessage := schema.OpenAIResponse{ diff --git a/examples/chainlit/requirements.txt b/examples/chainlit/requirements.txt index 8654ea9959a5..69212e28e102 100644 --- a/examples/chainlit/requirements.txt +++ b/examples/chainlit/requirements.txt @@ -1,4 +1,4 @@ -llama_index==0.11.4 +llama_index==0.11.7 requests==2.32.3 weaviate_client==4.6.7 transformers diff --git a/examples/langchain/langchainpy-localai-example/requirements.txt b/examples/langchain/langchainpy-localai-example/requirements.txt index 0c7be91791ff..753230059bcd 100644 --- a/examples/langchain/langchainpy-localai-example/requirements.txt +++ b/examples/langchain/langchainpy-localai-example/requirements.txt @@ -30,4 +30,4 @@ tqdm==4.66.5 typing-inspect==0.9.0 typing_extensions==4.12.2 urllib3==2.2.2 -yarl==1.9.7 +yarl==1.11.0 diff --git a/gallery/index.yaml b/gallery/index.yaml index 4939820df83e..188576b26bc5 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -658,6 +658,23 @@ - filename: Mahou-1.3-llama3.1-8B.Q4_K_M.gguf sha256: 88bfdca2f6077d789d3e0f161d19711aa208a6d9a02cce96a2276c69413b3594 uri: huggingface://mradermacher/Mahou-1.3-llama3.1-8B-GGUF/Mahou-1.3-llama3.1-8B.Q4_K_M.gguf +- !!merge <<: *llama31 + name: "azure_dusk-v0.2-iq-imatrix" + # chatml + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/n3-g_YTk3FY-DBzxXd28E.png + urls: + - https://huggingface.co/Lewdiculous/Azure_Dusk-v0.2-GGUF-IQ-Imatrix + description: | + "Following up on Crimson_Dawn-v0.2 we have Azure_Dusk-v0.2! Training on Mistral-Nemo-Base-2407 this time I've added significantly more data, as well as trained using RSLoRA as opposed to regular LoRA. Another key change is training on ChatML as opposed to Mistral Formatting." + by Author. + overrides: + parameters: + model: Azure_Dusk-v0.2-Q4_K_M-imat.gguf + files: + - filename: Azure_Dusk-v0.2-Q4_K_M-imat.gguf + sha256: c03a670c00976d14c267a0322374ed488b2a5f4790eb509136ca4e75cbc10cf4 + uri: huggingface://Lewdiculous/Azure_Dusk-v0.2-GGUF-IQ-Imatrix/Azure_Dusk-v0.2-Q4_K_M-imat.gguf - &deepseek ## Deepseek url: "github:mudler/LocalAI/gallery/deepseek.yaml@master" @@ -1195,6 +1212,23 @@ - filename: Pantheon-RP-1.6-12b-Nemo-Q4_K_M.gguf sha256: cf3465c183bf4ecbccd1b6b480f687e0160475b04c87e2f1e5ebc8baa0f4c7aa uri: huggingface://bartowski/Pantheon-RP-1.6-12b-Nemo-GGUF/Pantheon-RP-1.6-12b-Nemo-Q4_K_M.gguf +- !!merge <<: *mistral03 + name: "mn-12b-lyra-v4-iq-imatrix" + icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/dVoru83WOpwVjMlgZ_xhA.png + #chatml + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + urls: + - https://huggingface.co/Lewdiculous/MN-12B-Lyra-v4-GGUF-IQ-Imatrix + description: | + A finetune of Mistral Nemo by Sao10K. + Uses the ChatML prompt format. + overrides: + parameters: + model: MN-12B-Lyra-v4-Q4_K_M-imat.gguf + files: + - filename: MN-12B-Lyra-v4-Q4_K_M-imat.gguf + sha256: 1989123481ca1936c8a2cbe278ff5d1d2b0ae63dbdc838bb36a6d7547b8087b3 + uri: huggingface://Lewdiculous/MN-12B-Lyra-v4-GGUF-IQ-Imatrix/MN-12B-Lyra-v4-Q4_K_M-imat.gguf - &mudler ### START mudler's LocalAI specific-models url: "github:mudler/LocalAI/gallery/mudler.yaml@master" @@ -1722,6 +1756,34 @@ - filename: Athena-codegemma-2-2b-it.Q4_K_M.gguf sha256: 59ce17023438b0da603dd211c7d39f78e7acac4108258ac0818a97a4ca7d64e3 uri: huggingface://mradermacher/Athena-codegemma-2-2b-it-GGUF/Athena-codegemma-2-2b-it.Q4_K_M.gguf +- !!merge <<: *gemma + name: "datagemma-rag-27b-it" + urls: + - https://huggingface.co/google/datagemma-rag-27b-it + - https://huggingface.co/bartowski/datagemma-rag-27b-it-GGUF + description: | + DataGemma is a series of fine-tuned Gemma 2 models used to help LLMs access and incorporate reliable public statistical data from Data Commons into their responses. DataGemma RAG is used with Retrieval Augmented Generation, where it is trained to take a user query and generate natural language queries that can be understood by Data Commons' existing natural language interface. More information can be found in this research paper. + overrides: + parameters: + model: datagemma-rag-27b-it-Q4_K_M.gguf + files: + - filename: datagemma-rag-27b-it-Q4_K_M.gguf + sha256: 3dfcf51b05e3f0ab0979ad194de350edea71cb14444efa0a9f2ef5bfc80753f8 + uri: huggingface://bartowski/datagemma-rag-27b-it-GGUF/datagemma-rag-27b-it-Q4_K_M.gguf +- !!merge <<: *gemma + name: "datagemma-rig-27b-it" + urls: + - https://huggingface.co/google/datagemma-rig-27b-it + - https://huggingface.co/bartowski/datagemma-rig-27b-it-GGUF + description: | + DataGemma is a series of fine-tuned Gemma 2 models used to help LLMs access and incorporate reliable public statistical data from Data Commons into their responses. DataGemma RIG is used in the retrieval interleaved generation approach (based off of tool-use approaches), where it is trained to annotate a response with natural language queries to Data Commons’ existing natural language interface wherever there are statistics. More information can be found in this research paper. + overrides: + parameters: + model: datagemma-rig-27b-it-Q4_K_M.gguf + files: + - filename: datagemma-rig-27b-it-Q4_K_M.gguf + sha256: a6738ffbb49b6c46d220e2793df85c0538e9ac72398e32a0914ee5e55c3096ad + uri: huggingface://bartowski/datagemma-rig-27b-it-GGUF/datagemma-rig-27b-it-Q4_K_M.gguf - &llama3 url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master" icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png