ggml backends interface, ggml-cuda refactor #2239

Closed · wants to merge 21 commits
8 changes: 4 additions & 4 deletions .github/workflows/build.yml
@@ -308,13 +308,13 @@ jobs:
path: |
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip

windows-latest-cmake-cublas:
windows-latest-cmake-cuda:
runs-on: windows-latest

strategy:
matrix:
cuda: ['12.1.0', '11.7.1']
build: ['cublas']
build: ['cuda']

steps:
- name: Clone
@@ -333,7 +333,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON
cmake --build . --config Release

- name: Get commit hash
@@ -395,7 +395,7 @@ jobs:
- macOS-latest-make
- macOS-latest-cmake
- windows-latest-cmake
- windows-latest-cmake-cublas
- windows-latest-cmake-cuda

steps:
- name: Download artifacts
10 changes: 5 additions & 5 deletions CMakeLists.txt
@@ -67,7 +67,7 @@ endif()
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
@@ -239,18 +239,18 @@ if (LLAMA_K_QUANTS)
endif()
endif()

if (LLAMA_CUBLAS)
if (LLAMA_CUDA)
cmake_minimum_required(VERSION 3.17)

find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")
message(STATUS "CUDA found")

enable_language(CUDA)

set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

add_compile_definitions(GGML_USE_CUBLAS)
add_compile_definitions(GGML_USE_CUDA)
if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
@@ -280,7 +280,7 @@ if (LLAMA_CUBLAS)
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

else()
message(WARNING "cuBLAS not found")
message(WARNING "CUDA not found")
endif()
endif()

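With the rename, code that previously keyed off GGML_USE_CUBLAS now has to check GGML_USE_CUDA. The fragment below is a minimal sketch of that downstream pattern, assuming the usual guarded include of ggml-cuda.h; it is illustrative and not an excerpt from this PR.

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"   // declarations for the CUDA backend
#endif

// Reports whether this binary was configured with -DLLAMA_CUDA=ON,
// which is what defines GGML_USE_CUDA during compilation.
static bool built_with_cuda(void) {
#ifdef GGML_USE_CUDA
    return true;
#else
    return false;
#endif
}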
30 changes: 24 additions & 6 deletions Makefile
@@ -55,6 +55,12 @@ else
CXXFLAGS += -DNDEBUG
endif

ifdef LLAMA_SANITIZE
CFLAGS += -g -fsanitize=$(LLAMA_SANITIZE) -fno-omit-frame-pointer
CXXFLAGS += -g -fsanitize=$(LLAMA_SANITIZE) -fno-omit-frame-pointer
LDFLAGS += -g -fsanitize=$(LLAMA_SANITIZE)
endif

ifdef LLAMA_SERVER_VERBOSE
CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
endif
@@ -163,13 +169,17 @@ ifdef LLAMA_BLIS
LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS

ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
ifdef LLAMA_CUDA
CFLAGS += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler
NVCCV := $(shell $(NVCC) --version | tail -n 1)
ifdef LLAMA_DEBUG
NVCCFLAGS += -lineinfo
endif # LLAMA_DEBUG
ifdef CUDA_DOCKER_ARCH
NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
else
@@ -198,10 +208,9 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif

ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-cuda-kern.h ggml-cuda-quant.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS
endif # LLAMA_CUDA

ifdef LLAMA_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
@@ -275,6 +284,9 @@ $(info I CXXFLAGS: $(CXXFLAGS))
$(info I LDFLAGS: $(LDFLAGS))
$(info I CC: $(CCV))
$(info I CXX: $(CXXV))
ifdef LLAMA_CUDA
$(info I NVCC: $(NVCCV))
endif # LLAMA_CUDA
$(info )

#
@@ -284,6 +296,12 @@ $(info )
ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@

# temporary, probably will be added to ggml.c
ggml-backend.o: ggml-backend.c ggml-backend.h ggml.h
$(CC) $(CFLAGS) -c $< -o $@

OBJS += ggml-backend.o

llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@

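The new ggml-backend.o object (marked as temporary, with a note that it may later move into ggml.c) holds the backend abstraction that gives this PR its title. Its real interface lives in ggml-backend.h within the PR; the sketch below only illustrates the general function-pointer style such an interface tends to take, with invented names that should not be read as the PR's actual API.

#include <cstddef>

struct ggml_cgraph;  // defined in ggml.h

// Hypothetical backend vtable, for illustration only: each backend (CPU, CUDA, ...)
// would supply its own callbacks, and the core dispatches through them instead of
// calling cuBLAS-specific entry points directly.
struct example_backend_interface {
    const char * (*get_name)     (void * backend_ctx);
    void *       (*alloc_buffer) (void * backend_ctx, size_t size);
    void         (*free_buffer)  (void * backend_ctx, void * buffer);
    void         (*graph_compute)(void * backend_ctx, struct ggml_cgraph * graph);
};

Seen this way, renaming LLAMA_CUBLAS to LLAMA_CUDA is not cosmetic: the build flag now selects a backend rather than a single BLAS library.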
18 changes: 9 additions & 9 deletions examples/common.cpp
@@ -327,24 +327,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.n_gpu_layers = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU support\n");
#endif
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
params.main_gpu = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
fprintf(stderr, "warning: llama.cpp was compiled without CUDA. It is not possible to set a main GPU.\n");
#endif
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
std::string arg_next = argv[i];

// split string by , and /
@@ -361,14 +361,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
}
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUDA
} else if (arg == "--low-vram" || arg == "-lv") {
#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
params.low_vram = true;
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
#endif // GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without CUDA. It is not possible to set lower vram usage.\n");
#endif // GGML_USE_CUDA
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--mtest") {
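All of the touched options follow the same compile-time pattern: the flag is honored when GGML_USE_CUDA is defined and degrades to a warning otherwise. A self-contained sketch of that pattern follows; example_params and set_main_gpu are stand-ins invented for the example, not the real gpt_params / gpt_params_parse code.

#include <cstdio>
#include <string>

struct example_params { int main_gpu = 0; };  // trimmed stand-in for gpt_params

// Honors --main-gpu only in CUDA builds, mirroring the warning text used above.
static void set_main_gpu(example_params & params, const std::string & value) {
#ifdef GGML_USE_CUDA
    params.main_gpu = std::stoi(value);
#else
    (void) params;
    (void) value;
    fprintf(stderr, "warning: llama.cpp was compiled without CUDA. It is not possible to set a main GPU.\n");
#endif
}

Building with -DGGML_USE_CUDA (set automatically by the CMake and Makefile changes above when CUDA is enabled) flips the behaviour without changing the call site.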