Skip to content

Commit

Permalink
metal : enable shader debugging (cmake option) (ggml-org#4705)
Browse files Browse the repository at this point in the history
* ggml : disable fast-math for Metal (cmake build only)

ggml-ci

* metal : fix Metal API debug warnings

* cmake : add -fno-inline for Metal build (ggml-org#4545)

* metal : fix API debug warnings

* metal : fix compile warnings

* metal : use uint64_t for strides

* cmake : rename option to LLAMA_METAL_SHADER_DEBUG

* metal : fix mat-vec Q8_0 kernel for BS > 1

* metal : normalize mat-vec kernel signatures

* cmake : respect LLAMA_QKK_64 option

* metal : fix mat-vec Q4_K kernel for QK_K == 64

ggml-ci
  • Loading branch information
ggerganov authored and hodlen committed Apr 1, 2024
1 parent cac3eee commit 977ab13
Show file tree
Hide file tree
Showing 5 changed files with 329 additions and 230 deletions.
34 changes: 31 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ option(LLAMA_HIP_UMA "llama: use HIP unified memory arch
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

Expand Down Expand Up @@ -154,9 +155,9 @@ if (APPLE AND LLAMA_ACCELERATE)
endif()

if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)

message(STATUS "Metal framework found")
set(GGML_HEADERS_METAL ggml-metal.h)
Expand All @@ -173,6 +174,33 @@ if (LLAMA_METAL)
# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)

if (LLAMA_METAL_SHADER_DEBUG)
# custom command to do the following:
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
# xcrun -sdk macosx metallib ggml-metal.air -o ggml.metallib
#
# note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
# disabling fast math is needed in order to pass tests/test-backend-ops
# note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
set(XC_FLAGS -fno-fast-math -fno-inline -g)
if (LLAMA_QKK_64)
set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
endif()

add_custom_command(
OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml.metallib
COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml.metallib
DEPENDS ggml-metal.metal
COMMENT "Compiling Metal kernels"
)

add_custom_target(
ggml-metal ALL
DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml.metallib
)
endif()

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
Expand Down
14 changes: 10 additions & 4 deletions ci/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`

CMAKE_EXTRA=""

if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
fi

## helpers

# download a file if it does not exist or if it is outdated
Expand Down Expand Up @@ -81,8 +87,8 @@ function gg_run_ctest_debug {

set -e

(time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log

Expand All @@ -109,8 +115,8 @@ function gg_run_ctest_release {

set -e

(time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
Expand Down
28 changes: 19 additions & 9 deletions ggml-metal.m
Original file line number Diff line number Diff line change
Expand Up @@ -257,13 +257,14 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
#endif
NSError * error = nil;
NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
NSString * libPath = [bundle pathForResource:@"ggml" ofType:@"metallib"];
if (libPath != nil) {
// pre-compiled library found
NSURL * libURL = [NSURL fileURLWithPath:libPath];
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
} else {
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
GGML_METAL_LOG_INFO("%s: ggml.metallib not found, loading from source\n", __func__);

NSString * sourcePath;
NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
Expand Down Expand Up @@ -291,6 +292,13 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
options = [MTLCompileOptions new];
options.preprocessorMacros = @{ @"QK_K" : @(64) };
#endif
// try to disable fast-math
// NOTE: this seems to have no effect whatsoever
// instead, in order to disable fast-math, we have to build ggml.metallib from the command line
// using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
// and go through the "pre-compiled library found" path above
//[options setFastMathEnabled:false];

ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
}

Expand Down Expand Up @@ -1230,7 +1238,7 @@ void ggml_metal_graph_compute(
// not sure how to avoid this
// TODO: make a simpler cpy_bytes kernel

const int nth = MIN(1024, ne00);
const int nth = MIN((int) ctx->pipeline_cpy_f32_f32.maxTotalThreadsPerThreadgroup, ne00);

[encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
Expand Down Expand Up @@ -1285,7 +1293,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
[encoder setBytes:&offs length:sizeof(offs) atIndex:27];

const int nth = MIN(1024, ne0);
const int nth = MIN((int) ctx->pipeline_add.maxTotalThreadsPerThreadgroup, ne00);

[encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
Expand Down Expand Up @@ -1785,8 +1793,9 @@ void ggml_metal_graph_compute(
[encoder setBytes:&r3 length:sizeof(r3) atIndex:17];
[encoder setBytes:&idx length:sizeof(idx) atIndex:18];
// TODO: how to make this an array? read Metal docs
for (int j = 0; j < n_as; ++j) {
struct ggml_tensor * src_cur = dst->src[2 + j];
for (int j = 0; j < 8; ++j) {
// NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];

size_t offs_src_cur = 0;
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
Expand Down Expand Up @@ -1909,8 +1918,9 @@ void ggml_metal_graph_compute(
[encoder setBytes:&r3 length:sizeof(r3) atIndex:21];
[encoder setBytes:&idx length:sizeof(idx) atIndex:22];
// TODO: how to make this an array? read Metal docs
for (int j = 0; j < n_as; ++j) {
struct ggml_tensor * src_cur = dst->src[2 + j];
for (int j = 0; j < 8; ++j) {
// NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];

size_t offs_src_cur = 0;
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
Expand Down Expand Up @@ -2229,7 +2239,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
[encoder setBytes:&sf length:sizeof(sf) atIndex:18];

const int nth = MIN(1024, ne0);
const int nth = MIN((int) ctx->pipeline_upscale_f32.maxTotalThreadsPerThreadgroup, ne0);

[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
Expand Down
Loading

0 comments on commit 977ab13

Please sign in to comment.