diff --git a/Dockerfile b/Dockerfile
index 566e03bc6265..2f2bcafa9f06 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -303,7 +303,7 @@ RUN make prepare
 ## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
 ## (both will use CUDA or hipblas for the actual computation)
 RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
-        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
+        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
     else \
         make build; \
     fi
diff --git a/Makefile b/Makefile
index 9c4f3778225a..e3c280396943 100644
--- a/Makefile
+++ b/Makefile
@@ -186,6 +186,7 @@ endif
 ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
@@ -699,6 +700,13 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
 
+backend-assets/grpc/llama-cpp-avx512: backend-assets/grpc backend/cpp/llama/llama.cpp
+	cp -rf backend/cpp/llama backend/cpp/llama-avx512
+	$(MAKE) -C backend/cpp/llama-avx512 purge
+	$(info ${GREEN}I llama-cpp build info:avx512${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx512" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-avx512/grpc-server backend-assets/grpc/llama-cpp-avx512
+
 backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-avx
 	$(MAKE) -C backend/cpp/llama-avx purge
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go
index 9fc0c18c089d..ace72fa3d342 100644
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -48,6 +48,7 @@ const (
 	LLamaCPP = "llama-cpp"
 
 	LLamaCPPAVX2     = "llama-cpp-avx2"
+	LLamaCPPAVX512   = "llama-cpp-avx512"
 	LLamaCPPAVX      = "llama-cpp-avx"
 	LLamaCPPFallback = "llama-cpp-fallback"
 	LLamaCPPCUDA     = "llama-cpp-cuda"
@@ -68,6 +69,7 @@ const (
 
 var llamaCPPVariants = []string{
 	LLamaCPPAVX2,
+	LLamaCPPAVX512,
 	LLamaCPPAVX,
 	LLamaCPPFallback,
 	LLamaCPPCUDA,
@@ -268,6 +270,12 @@ func selectGRPCProcessByHostCapabilities(backend, assetDir string, f16 bool) str
 			log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
 			selectedProcess = p
 		}
+	} else if xsysinfo.HasCPUCaps(cpuid.AVX512F) {
+		p := backendPath(assetDir, LLamaCPPAVX512)
+		if _, err := os.Stat(p); err == nil {
+			log.Info().Msgf("[%s] attempting to load with AVX512 variant", backend)
+			selectedProcess = p
+		}
 	} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
 		p := backendPath(assetDir, LLamaCPPAVX)
 		if _, err := os.Stat(p); err == nil {
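
For reference, below is a minimal, illustrative Go sketch (not part of the change) of the host-capability probe behind the new branch, written directly against the klauspost/cpuid/v2 package that provides the cpuid.AVX512F flag used above. It mirrors the check order in the patched selectGRPCProcessByHostCapabilities: AVX2 first, then AVX512F, then AVX, then the fallback. The file name and printed strings are made up for the example.

// cpucaps.go (illustrative only): report which llama-cpp variant this
// host would match, using the same priority order as the patched
// selectGRPCProcessByHostCapabilities: AVX2, then AVX512F, then AVX.
package main

import (
	"fmt"

	"github.com/klauspost/cpuid/v2"
)

func main() {
	switch {
	case cpuid.CPU.Supports(cpuid.AVX2):
		fmt.Println("host matches the llama-cpp-avx2 variant")
	case cpuid.CPU.Supports(cpuid.AVX512F):
		fmt.Println("host matches the llama-cpp-avx512 variant")
	case cpuid.CPU.Supports(cpuid.AVX):
		fmt.Println("host matches the llama-cpp-avx variant")
	default:
		fmt.Println("host would fall back to llama-cpp-fallback")
	}
}

Note that because the AVX2 check comes first in the else-if chain, a CPU that supports both AVX2 and AVX-512 still loads the AVX2 build; the new llama-cpp-avx512 binary is only selected when AVX512F is reported without AVX2.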