diff --git a/crates/llama-cpp-bindings/src/engine.cc b/crates/llama-cpp-bindings/src/engine.cc
index 2a762b5d1298..7b3f60369b93 100644
--- a/crates/llama-cpp-bindings/src/engine.cc
+++ b/crates/llama-cpp-bindings/src/engine.cc
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include <mutex>
 #include
 #include

@@ -126,6 +127,8 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
   }

   rust::Vec<StepOutput> step() override {
+    std::lock_guard<std::mutex> guard(g_mutex_);
+
     auto* ctx = ctx_.get();
     auto n_vocab = llama_n_vocab(llama_get_model(ctx));

@@ -275,8 +278,15 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
   std::unordered_set<uint32_t> stopped_requests_;

   uint32_t parallelism_;
+
+  // llama.cpp is not thread safe
+  // FIXME(meng): remove the mutex once https://github.com/ggerganov/llama.cpp/issues/3960 is fixed
+  // and integrated to tabby's fork.
+  static std::mutex g_mutex_;
 };

+std::mutex TextInferenceEngineImpl::g_mutex_;
+
 static int g_llama_cpp_log_level = 0;
 static void llama_log_callback(ggml_log_level level, const char * text, void * user_data) {
   (void)user_data;
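
For context, a minimal standalone sketch of the pattern this patch applies: a single class-level `static std::mutex` taken via `std::lock_guard` at the top of the hot call, so no two threads can be inside a non-thread-safe library at the same time, even through different engine instances. `FakeLlama`, `Engine`, and `main` below are hypothetical stand-ins for illustration, not tabby or llama.cpp code.

```cpp
// Sketch of the static-mutex serialization pattern (hypothetical types, not tabby code).
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

namespace {

// Stand-in for library state that must never be touched by two threads at once.
struct FakeLlama {
  int steps = 0;
  std::string decode() { return "token-" + std::to_string(++steps); }
};

class Engine {
 public:
  std::string step() {
    // Same idea as the patch: one process-wide mutex shared by all instances,
    // released automatically when `guard` goes out of scope.
    std::lock_guard<std::mutex> guard(g_mutex_);
    return llama_.decode();
  }

 private:
  FakeLlama llama_;
  static std::mutex g_mutex_;  // declared in the class, defined once below
};

std::mutex Engine::g_mutex_;

}  // namespace

int main() {
  Engine a, b;  // distinct engines still share the single static mutex
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back([&, i] {
      Engine& e = (i % 2 == 0) ? a : b;
      for (int j = 0; j < 3; ++j) {
        std::string out = e.step();  // calls into FakeLlama are fully serialized
        // Output may interleave across threads, but there is no data race here.
        std::cout << "thread " << i << ": " << out << "\n";
      }
    });
  }
  for (auto& t : workers) t.join();
  return 0;
}
```

Whichever thread enters `step()` first holds the lock for the whole call, so the concurrent interleavings behind the thread-safety issue tracked in ggerganov/llama.cpp#3960 cannot occur; the trade-off is that parallel requests are serialized until the upstream fix is integrated into tabby's fork, which is exactly what the FIXME in the patch records.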