feat(llama.cpp): add distributed llama.cpp inferencing (#2324)
* feat(llama.cpp): support distributed llama.cpp

Signed-off-by: Ettore Di Giacinto <[email protected]>

* feat: allow tweaking how chat messages are merged together

Signed-off-by: Ettore Di Giacinto <[email protected]>

* refactor

Signed-off-by: Ettore Di Giacinto <[email protected]>

* Makefile: register to ALL_GRPC_BACKENDS

Signed-off-by: Ettore Di Giacinto <[email protected]>

* refactoring: allow disabling auto-detection of backends

Signed-off-by: Ettore Di Giacinto <[email protected]>

* minor fixups

Signed-off-by: mudler <[email protected]>

* feat: add cmd to start rpc-server from llama.cpp

Signed-off-by: mudler <[email protected]>

* ci: add ccache

Signed-off-by: mudler <[email protected]>

---------

Signed-off-by: Ettore Di Giacinto <[email protected]>
Signed-off-by: mudler <[email protected]>
mudler authored May 14, 2024
1 parent 2990966 commit c89271b
Showing 11 changed files with 220 additions and 80 deletions.
5 changes: 5 additions & 0 deletions .env
@@ -71,6 +71,11 @@
### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
# LLAMACPP_PARALLEL=1

### Define a list of GRPC Servers for llama-cpp workers to distribute the load
# https://github.com/ggerganov/llama.cpp/pull/6829
# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
# LLAMACPP_GRPC_SERVERS=""

### Enable to run parallel requests
# LOCALAI_PARALLEL_REQUESTS=true

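For illustration only (not part of the diff), a hypothetical value with placeholder addresses; the comma-separated host:port format follows the llama.cpp RPC example linked above:

# LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052"
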
4 changes: 2 additions & 2 deletions .github/workflows/release.yaml
@@ -29,7 +29,7 @@ jobs:
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg protobuf-compiler
sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
- name: Install CUDA Dependencies
run: |
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
@@ -86,7 +86,7 @@ jobs:
cache: false
- name: Dependencies
run: |
sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler
sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
- name: Build stablediffusion
1 change: 1 addition & 0 deletions Dockerfile
@@ -19,6 +19,7 @@ ARG GO_TAGS="stablediffusion tinydream tts"
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ccache \
ca-certificates \
cmake \
curl \
17 changes: 15 additions & 2 deletions Makefile
@@ -5,7 +5,7 @@ BINARY_NAME=local-ai

# llama.cpp versions
GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=dc685be46622a8fabfd57cfa804237c8f15679b8
CPPLLAMA_VERSION?=4f0263633b40e94e8b69fd6e7e4395cfedfd5c12

# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -158,6 +158,8 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -314,7 +316,7 @@ build: prepare backend-assets grpcs ## Build the project
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./

build-minimal:
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp" GO_TAGS=none $(MAKE) build
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=none $(MAKE) build

build-api:
BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build
@@ -691,6 +693,17 @@ backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda

backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-grpc
$(MAKE) -C backend/cpp/llama-grpc purge
$(info ${GREEN}I llama-cpp build info:grpc${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc

backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
mkdir -p backend-assets/util/
cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server

backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
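
As a rough usage sketch (not part of the diff), both new assets can be produced with one make invocation, since the rpc-server target depends on the grpc one; this assumes the usual llama.cpp build prerequisites are already in place:

make backend-assets/util/llama-cpp-rpc-server

This builds the backend/cpp/llama-grpc copy with -DLLAMA_RPC=ON, copies the resulting grpc-server to backend-assets/grpc/llama-cpp-grpc, and copies llama.cpp's rpc-server binary to backend-assets/util/llama-cpp-rpc-server.
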
6 changes: 6 additions & 0 deletions backend/cpp/llama/grpc-server.cpp
@@ -2217,6 +2217,12 @@ static void params_parse(const backend::ModelOptions* request,
} else {
params.n_parallel = 1;
}

const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS");
if (llama_grpc_servers != NULL) {
params.rpc_servers = std::string(llama_grpc_servers);
}

// TODO: Add yarn

if (!request->tensorsplit().empty()) {
9 changes: 5 additions & 4 deletions core/cli/cli.go
@@ -13,8 +13,9 @@ type Context struct {
var CLI struct {
Context `embed:""`

Run RunCMD `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
Models ModelsCMD `cmd:"" help:"Manage LocalAI models and definitions"`
TTS TTSCMD `cmd:"" help:"Convert text to speech"`
Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
Run RunCMD `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
Models ModelsCMD `cmd:"" help:"Manage LocalAI models and definitions"`
TTS TTSCMD `cmd:"" help:"Convert text to speech"`
Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
LLAMACPPWorker LLAMACPPWorkerCMD `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
}
37 changes: 37 additions & 0 deletions core/cli/llamacppworker.go
@@ -0,0 +1,37 @@
package cli

import (
"os"
"syscall"

"github.com/go-skynet/LocalAI/pkg/assets"
"github.com/rs/zerolog/log"
)

type LLAMACPPWorkerCMD struct {
Args []string `arg:"" optional:"" name:"models" help:"Worker arguments: host port"`
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
}

func (r *LLAMACPPWorkerCMD) Run(ctx *Context) error {
// Extract files from the embedded FS
err := assets.ExtractFiles(ctx.BackendAssets, r.BackendAssetsPath)
log.Debug().Msgf("Extracting backend assets files to %s", r.BackendAssetsPath)
if err != nil {
log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
}

return syscall.Exec(
assets.ResolvePath(
r.BackendAssetsPath,
"util",
"llama-cpp-rpc-server",
),
append([]string{
assets.ResolvePath(
r.BackendAssetsPath,
"util",
"llama-cpp-rpc-server",
)}, r.Args...),
os.Environ())
}
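
A minimal usage sketch, not taken from the diff: the subcommand name is assumed from how the CLI parser derives it from LLAMACPPWorkerCMD, the host/port positional arguments follow the command's own help text, and they are forwarded verbatim to the extracted rpc-server binary (all values below are placeholders):

# on each worker machine
local-ai llamacpp-worker <host> <port>

# on the main instance, point the llama.cpp backend at the workers
LLAMACPP_GRPC_SERVERS="<host>:<port>" local-ai run
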
33 changes: 27 additions & 6 deletions core/config/backend_config.go
@@ -93,6 +93,8 @@ type Diffusers struct {
ControlNet string `yaml:"control_net"`
}

// LLMConfig is a struct that holds the configuration that are
// generic for most of the LLM backends.
type LLMConfig struct {
SystemPrompt string `yaml:"system_prompt"`
TensorSplit string `yaml:"tensor_split"`
@@ -144,20 +146,39 @@ type LLMConfig struct {
YarnBetaSlow float32 `yaml:"yarn_beta_slow"`
}

// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
type AutoGPTQ struct {
ModelBaseName string `yaml:"model_base_name"`
Device string `yaml:"device"`
Triton bool `yaml:"triton"`
UseFastTokenizer bool `yaml:"use_fast_tokenizer"`
}

// TemplateConfig is a struct that holds the configuration of the templating system
type TemplateConfig struct {
Chat string `yaml:"chat"`
ChatMessage string `yaml:"chat_message"`
Completion string `yaml:"completion"`
Edit string `yaml:"edit"`
Functions string `yaml:"function"`
UseTokenizerTemplate bool `yaml:"use_tokenizer_template"`
// Chat is the template used in the chat completion endpoint
Chat string `yaml:"chat"`

// ChatMessage is the template used for chat messages
ChatMessage string `yaml:"chat_message"`

// Completion is the template used for completion requests
Completion string `yaml:"completion"`

// Edit is the template used for edit completion requests
Edit string `yaml:"edit"`

// Functions is the template used when tools are present in the client requests
Functions string `yaml:"function"`

// UseTokenizerTemplate is a flag that indicates if the tokenizer template should be used.
// Note: this is mostly consumed for backends such as vllm and transformers
// that can use the tokenizers specified in the JSON config files of the models
UseTokenizerTemplate bool `yaml:"use_tokenizer_template"`

// JoinChatMessagesByCharacter is a string that will be used to join chat messages together.
// It defaults to \n
JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"`
}

func (c *BackendConfig) SetFunctionCallString(s string) {
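
Purely as an illustration (not part of the diff), a model definition could opt into the new setting roughly like this, assuming the TemplateConfig section is exposed under the model YAML's template key; the name and value are placeholders:

name: my-model
template:
  join_chat_messages_by_character: ""   # join rendered chat messages with no separator instead of the default "\n"
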
7 changes: 6 additions & 1 deletion core/http/endpoints/openai/chat.go
@@ -349,7 +349,12 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
mess = append(mess, content)
}

predInput = strings.Join(mess, "\n")
joinCharacter := "\n"
if config.TemplateConfig.JoinChatMessagesByCharacter != nil {
joinCharacter = *config.TemplateConfig.JoinChatMessagesByCharacter
}

predInput = strings.Join(mess, joinCharacter)
log.Debug().Msgf("Prompt (before templating): %s", predInput)

templateFile := ""
6 changes: 5 additions & 1 deletion pkg/assets/extract.go
@@ -8,6 +8,10 @@ import (
"path/filepath"
)

func ResolvePath(dir string, paths ...string) string {
return filepath.Join(append([]string{dir, "backend-assets"}, paths...)...)
}

func ExtractFiles(content embed.FS, extractDir string) error {
// Create the target directory if it doesn't exist
err := os.MkdirAll(extractDir, 0750)
@@ -39,7 +43,7 @@ func ExtractFiles(content embed.FS, extractDir string) error {
}

// Create the file in the target directory
err = os.WriteFile(targetFile, fileData, 0600)
err = os.WriteFile(targetFile, fileData, 0700)
if err != nil {
return fmt.Errorf("failed to write file: %v", err)
}
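
For clarity, a small self-contained sketch that mirrors the new ResolvePath helper and shows the path the worker command ends up executing; the base directory is just the default of the new LOCALAI_BACKEND_ASSETS_PATH flag:

package main

import (
	"fmt"
	"path/filepath"
)

// resolvePath mirrors assets.ResolvePath: it joins dir, "backend-assets" and the given segments.
func resolvePath(dir string, paths ...string) string {
	return filepath.Join(append([]string{dir, "backend-assets"}, paths...)...)
}

func main() {
	// Default backend assets path joined with the rpc-server location used by the worker command.
	fmt.Println(resolvePath("/tmp/localai/backend_data", "util", "llama-cpp-rpc-server"))
	// Output: /tmp/localai/backend_data/backend-assets/util/llama-cpp-rpc-server
}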