diff --git a/.gitignore b/.gitignore index 62b6b8b1ab250..b485665da3dc4 100644 --- a/.gitignore +++ b/.gitignore @@ -72,6 +72,7 @@ models-mnt /train-text-from-scratch /tokenize /vdot +/merge /common/build-info.cpp arm_neon.h compile_commands.json diff --git a/Makefile b/Makefile index 4f26c0463fcd8..7d1d5c83da4ba 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,9 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \ - speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o + simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \ + speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o \ + merge # Binaries only useful for tests TEST_TARGETS = \ @@ -704,6 +705,10 @@ quantize: examples/quantize/quantize.cpp build-info.o ggml. $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +merge: examples/merge/merge.cpp examples/merge/parser.hpp build-info.o ggml.o llama.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/examples/merge/CMakeLists.txt b/examples/merge/CMakeLists.txt new file mode 100644 index 0000000000000..787ea86c34722 --- /dev/null +++ b/examples/merge/CMakeLists.txt @@ -0,0 +1,6 @@ +set(TARGET merge) +add_executable(${TARGET} merge.cpp parser.hpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) +target_include_directories(${TARGET} PRIVATE ../../common) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/merge/config.example.txt b/examples/merge/config.example.txt new file mode 100644 index 0000000000000..d2ec9f2329b6b --- /dev/null +++ b/examples/merge/config.example.txt @@ -0,0 +1,123 @@ +# GGUF merge instructions +# +# Lines starting with "#" are comments +# Empty lines are ignored +# The "output layer" instruction adds a new layer to the output model +# Merge instructions have the format: target (space) verb (space) parameters +# Supported verbs: +# - linear: merge linearly, parameters: source_layer,source_layer,scale,scale +# - slerp: spherical linear interpolation, parameters: source_layer,source_layer,t +# - copy: copy from one model, parameters: source_model,source_layer + + +######################### +# Example: + +# This is the first layer of the output model: +# For all tensors, we want slerp(model[0].layer[0], model[1].layer[0], 0.1), +# except for the "attn_output" tensor, for which we want t=0.5 instead of t=0.1 + +output layer 0 +all slerp 0,0,0.1 +attn_output slerp 0,0,0.5 + +# For the next layer, we want: model[0].layer[1]*0.6 + model[1].layer[1]*0.4, +# except for the "attn_output" tensor, for which we want to use slerp with t=0.9 + +output layer 1 +all linear 1,1,0.6,0.4 +attn_output slerp 1,1,0.9 + +# For the next layer, we want to copy from
model[0].layer[2] + +output layer 2 +all copy 0,2 + +output layer 3 +all copy 0,3 + +# For next layer, we want to copy from model[1].layer[4] + +output layer 4 +all copy 1,4 + +output layer 5 +all copy 1,5 + +output layer 6 +all linear 6,6,0.1,0.9 + +output layer 7 +all linear 7,7,0.1,0.9 + +output layer 8 +all linear 8,8,0.1,0.9 + +output layer 9 +all linear 9,9,0.1,0.9 + +output layer 10 +all linear 10,10,0.1,0.9 + +output layer 11 +all linear 11,11,0.1,0.9 + +output layer 12 +all linear 12,12,0.1,0.9 + +output layer 13 +all linear 13,13,0.3333,0.6666 + +output layer 14 +all linear 14,14,0.3333,0.6666 + +output layer 15 +all linear 15,15,0.3333,0.6666 + +output layer 16 +all linear 16,16,0.3333,0.6666 + +output layer 17 +all linear 17,17,0.3333,0.6666 + +output layer 18 +all linear 18,18,0.3333,0.6666 + +output layer 19 +all linear 19,19,0.3333,0.6666 + +output layer 20 +all slerp 20,20,0.8 + +output layer 21 +all slerp 21,21,0.8 + +output layer 22 +all slerp 22,22,0.8 + +output layer 23 +all slerp 23,23,0.8 + +output layer 24 +all slerp 24,24,0.8 + +output layer 25 +all slerp 25,25,0.8 + +output layer 26 +all slerp 26,26,0.8 + +output layer 27 +all slerp 27,27,0.8 + +output layer 28 +all slerp 28,28,0.8 + +output layer 29 +all slerp 29,29,0.8 + +output layer 30 +all slerp 30,30,0.8 + +output layer 31 +all slerp 31,31,0.8 diff --git a/examples/merge/merge.cpp b/examples/merge/merge.cpp new file mode 100644 index 0000000000000..06b8de486f4ff --- /dev/null +++ b/examples/merge/merge.cpp @@ -0,0 +1,127 @@ +#include "common.h" +#include "llama.h" +#include "parser.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +static const size_t n_models = 2; // hard-limited to 2 input models for now + +struct merge_params { + std::string config_path = "config.txt"; + std::vector model_paths; + std::string output_path = "ggml-merged-f16.gguf"; + bool only_list_tensors_name = false; + bool dry_run = false; +}; + +[[noreturn]] +static void usage(const char * executable, int exit_code) { + struct merge_params defaults; + printf("usage: %s -c CONFIG_FILE -o OUTPUT_FILE -m MODEL_PATH -m MODEL_PATH ...\n\n", executable); + printf("\n"); + printf("Merging multiple models, inspired by mergekit.\n"); + printf("For more details, see \"config.example.txt\" file.\n"); + printf("\n"); + printf("NOTE:\n"); + printf("- Only support merging 2 models.\n"); + printf("- The embedding and output layers of the first model will be used.\n"); + printf("- Currently, we accept both quantized and non-quantized models as input. The output model will be re-quantized into the same format of the first model.\n"); + printf("\n"); + printf("Options:\n"); + printf(" -h, --help Show this help message and exit\n"); + printf(" -c, --config CONFIG_FILE Path to config file, in CSV format (default: %s)\n", defaults.config_path.c_str()); + printf(" -m, --model MODEL_PATH Path to model. 
This option can be repeated multiple times and must be specified in the right order.\n"); + printf(" -o, --output OUTPUT_FILE Path to the output model (default: %s)\n", defaults.output_path.c_str()); + printf(" --dry-run Only print out list of parsed and exit, useful for debugging\n"); + printf(" --print-list-tensor Only print out list of tensors of the input model, useful for debugging (only one model is accepted)\n"); + printf("\n"); + printf("Example: ./merge -c config.txt -o output.gguf -m model_a.gguf -m model_b.gguf\n"); + exit(exit_code); +} + +int main(int argc, char ** argv) { + bool invalid_param = false; + struct merge_params params; + + std::string arg; + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg == "-h" || arg == "--help") { + usage(argv[0], 0); + } else if (arg == "-c" || arg == "--config") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.config_path = argv[i]; + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model_paths.push_back(argv[i]); + } else if (arg == "-o" || arg == "--output") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.output_path = argv[i]; + } else if (arg == "--print-list-tensor") { + params.only_list_tensors_name = true; + } else if (arg == "--dry-run") { + params.dry_run = true; + } + } + + try { + if (invalid_param) { + usage(argv[0], 1); + throw std::invalid_argument("error: invalid parameter for argument: " + arg); + } else if (!params.only_list_tensors_name && params.model_paths.size() < 2) { + throw std::invalid_argument("error: require at least 2 models"); + } + + if (params.only_list_tensors_name) { + if (params.model_paths.size() != 1) { + throw std::invalid_argument("error: we can only list tensors of one single model"); + } + print_model_tensors_name(params.model_paths[0]); + return 0; // exit now + } + + size_t n_layers = 0; + auto instructions = parse_config(params.config_path, params.model_paths[0], n_layers); + + if (params.dry_run) { + return 0; + } + + std::vector p_model_paths; + for (auto & m : params.model_paths) { + p_model_paths.push_back(m.data()); + } + struct llama_merge_config config{ + { + params.model_paths[0].c_str(), + params.model_paths[1].c_str(), + }, + instructions.data(), + instructions.size(), + n_layers, + params.output_path.c_str(), + }; + + llama_merge_models(&config); + } catch (const std::exception & ex) { + std::cerr << ex.what() << "\n\n"; + } + + return 0; +} diff --git a/examples/merge/parser.hpp b/examples/merge/parser.hpp new file mode 100644 index 0000000000000..64f7d0e607887 --- /dev/null +++ b/examples/merge/parser.hpp @@ -0,0 +1,293 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// trim whitespace from the beginning and end of a string +static std::string str_trim(const std::string & str) { + size_t start = 0; + size_t end = str.size(); + while (start < end && isspace(str[start])) { + start += 1; + } + while (end > start && isspace(str[end - 1])) { + end -= 1; + } + return str.substr(start, end - start); +} + +inline std::vector str_split(std::string str, const std::string & delimiter) { + size_t pos = 0; + std::string token; + std::vector output; + while ((pos = str.find(delimiter)) != std::string::npos) { + token = str.substr(0, pos); + output.push_back(token); + str.erase(0, pos + delimiter.length()); + } + output.push_back(str); // the rest + return output; +} + 
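+// Illustration only (not part of the parser): given a config line such as
+// "all slerp 0,0,0.1", the helpers above are expected to behave roughly as:
+//   str_trim("  all slerp 0,0,0.1  ")   -> "all slerp 0,0,0.1"
+//   str_split("all slerp 0,0,0.1", " ") -> {"all", "slerp", "0,0,0.1"}
+//   str_split("0,0,0.1", ",")           -> {"0", "0", "0.1"}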
+ +///////////////////////////////// + +// dump the list of tensor names of the input model +static std::vector<std::string> get_list_tensors_name(std::string & model_path) { + llama_model_params model_params = llama_model_default_params(); + llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params); + size_t n_tensors = llama_get_all_tensors_name(model, nullptr, 0); + std::vector<const char *> list(n_tensors, nullptr); + llama_get_all_tensors_name(model, list.data(), list.size()); + // copy the result + std::vector<std::string> results; + for (auto & name : list) { + results.push_back(std::string(name)); + } + llama_free_model(model); + return results; +} + +static void print_model_tensors_name(std::string & model_path) { + auto tensors = get_list_tensors_name(model_path); + std::cout << "\n\n===================\n"; + std::cout << "Total number of tensors: " << tensors.size() << "\n"; + for (size_t i = 0; i < tensors.size(); i++) { + char buf[128]; + sprintf(buf, "%4zu: %s", i, tensors[i].c_str()); + std::cout << buf << "\n"; + } +} + +///////////////////////////////// + +// get the layer index from a tensor name, for example "blk.x.attn_norm.weight" +// returns -1 if the tensor does not belong to a layer +static int get_i_layer(std::string tensor_name) { + int i_layer = -1; + return sscanf(tensor_name.c_str(), "blk.%d.", &i_layer) == 1 ? i_layer : -1; +} + +static void print_inst(struct llama_merge_inst inst) { + std::cout << "Output: " << inst.name << "\n"; + switch (inst.method) { + case LLAMA_MERGE_LINEAR: + std::cout << " Linear\n"; + std::cout << " Model A: " << inst.scales[0] << " * " << inst.srcs[0] << "\n"; + std::cout << " Model B: " << inst.scales[1] << " * " << inst.srcs[1] << "\n"; + break; + case LLAMA_MERGE_SLERP: + std::cout << " SLERP\n"; + std::cout << " t=" << inst.t << "\n"; + std::cout << " Model A: " << inst.srcs[0] << "\n"; + std::cout << " Model B: " << inst.srcs[1] << "\n"; + break; + case LLAMA_MERGE_COPY: + // the source may come from either model; print whichever side is set + std::cout << " Copy from model " << (strlen(inst.srcs[0]) > 0 ? "A: " : "B: ") << (strlen(inst.srcs[0]) > 0 ? inst.srcs[0] : inst.srcs[1]) << "\n"; + break; + case LLAMA_MERGE_REPEAT: + std::cout << " Repeat from output model: " << inst.srcs[0] << "\n"; + break; + default: + break; + } +} + +static std::vector<struct llama_merge_inst> parse_config(std::string & config_path, std::string & model_path, size_t & n_layers) { + std::vector<struct llama_merge_inst> instructions; + + // read file + std::ifstream file(config_path); + if (!file.is_open()) { + throw std::runtime_error("Unable to open merge config file"); + } + std::ostringstream content; + content << file.rdbuf(); // Read the entire file into the stringstream + auto lines = str_split(content.str(), "\n"); + file.close(); + + // get the list of input tensors + auto inp_names = get_list_tensors_name(model_path); + std::set<std::string> units; // name of units, for example "attn_output" + for (auto & name : inp_names) { + int il = get_i_layer(name); + if (il < 0) { + // non-layer tensor, only copy + struct llama_merge_inst ins; + ins.method = LLAMA_MERGE_COPY; + strcpy(ins.name, name.c_str()); + strcpy(ins.srcs[0], name.c_str()); // always take the first model + strcpy(ins.srcs[1], ""); + instructions.push_back(ins); + } else { + // tensor belongs to a layer + auto parts = str_split(name, "."); + units.insert(parts[2]); + } + } + + std::cout << "List of units:\n"; + for (auto & u : units) std::cout << u << "\n"; + std::cout << "\n"; + + // process line by line, one line is one layer + std::unordered_map<std::string, struct llama_merge_inst> layer; // map unit name to instruction + bool is_layer_empty = true; + int i_layer = -1; + auto get_tensor_name = [&](int layer, std::string unit) { + return
"blk." + std::to_string(layer) + "." + unit + ".weight"; + }; + auto push_output_layer = [&]() { + if (!is_layer_empty) { + for (auto & it : layer) { + instructions.push_back(it.second); + } + } + layer.clear(); + is_layer_empty = true; + }; + auto new_output_layer = [&]() { + layer.clear(); + for (auto & u : units) { + struct llama_merge_inst ins; + strcpy(ins.name, get_tensor_name(i_layer, u).c_str()); + layer[u] = ins; + } + }; + + auto raise_err = [&](size_t i_line, std::string message) { + std::stringstream ss; + ss << "Parse error: (line " << i_line + 1 << ") " << message; + throw std::runtime_error(ss.str()); + }; + + for (size_t i_line = 0 ; i_line < lines.size(); i_line++) { + auto line = str_trim(lines[i_line]); + if (line.empty() || line.c_str()[0] == '#') { + continue; // skip empty line or comment + } + + auto parts = str_split(line, " "); + if (parts.size() != 3) { + raise_err(i_line, "does not follow format: \"target (space) verb (space) parameters\""); + } + + auto target = parts[0]; + auto verb = parts[1]; + auto params = str_split(parts[2], ","); + + if (target == "output" && verb == "layer") { + int il_curr = std::stoi(params[0]); + if (i_layer + 1 != il_curr) { + raise_err(i_line, "new layer number must be (last layer number + 1)"); + } + push_output_layer(); + i_layer = il_curr; + new_output_layer(); + continue; + } + + auto linear = [&](struct llama_merge_inst & ins, std::string unit) { + if (params.size() != 4) { + raise_err(i_line, "verb \"linear\" requires exactly 4 parameters"); + } + ins.method = LLAMA_MERGE_LINEAR; + int src0 = std::stoi(params[0]); + int src1 = std::stoi(params[1]); + strcpy(ins.srcs[0], get_tensor_name(src0, unit).c_str()); + strcpy(ins.srcs[1], get_tensor_name(src1, unit).c_str()); + ins.scales[0] = std::stof(params[2]); + ins.scales[1] = std::stof(params[3]); + is_layer_empty = false; + }; + + auto slerp = [&](struct llama_merge_inst & ins, std::string unit) { + if (params.size() != 3) { + raise_err(i_line, "verb \"slerp\" requires exactly 3 parameters"); + } + ins.method = LLAMA_MERGE_SLERP; + int src0 = std::stoi(params[0]); + int src1 = std::stoi(params[1]); + strcpy(ins.srcs[0], get_tensor_name(src0, unit).c_str()); + strcpy(ins.srcs[1], get_tensor_name(src1, unit).c_str()); + ins.t = std::stof(params[2]); + is_layer_empty = false; + }; + + /*auto repeat = [&](struct llama_merge_inst & ins, std::string unit) { + if (params.size() != 1) { + raise_err(i_line, "verb \"repeat\" requires exactly 1 parameter"); + } + ins.method = LLAMA_MERGE_REPEAT; + int src0 = std::stoi(params[0]); + strcpy(ins.srcs[0], get_tensor_name(src0, unit).c_str()); + is_layer_empty = false; + };*/ + + auto copy = [&](struct llama_merge_inst & ins, std::string unit) { + if (params.size() != 2) { + raise_err(i_line, "verb \"copy\" requires exactly 2 parameters"); + } + ins.method = LLAMA_MERGE_COPY; + int model = std::stoi(params[0]); + int layer = std::stoi(params[1]); + if (model == 0) { + strcpy(ins.srcs[0], get_tensor_name(layer, unit).c_str()); + strcpy(ins.srcs[1], ""); + } else if (model == 1) { + strcpy(ins.srcs[0], ""); + strcpy(ins.srcs[1], get_tensor_name(layer, unit).c_str()); + } else { + raise_err(i_line, "can only copy from model 0 or 1"); + } + is_layer_empty = false; + }; + + auto apply_verb = [&](struct llama_merge_inst & ins, std::string unit) { + if (verb == "linear") { + linear(ins, unit); + } else if (verb == "slerp") { + slerp(ins, unit); + } else if (verb == "repeat") { + // repeat(ins, unit); + raise_err(i_line, "repeat is currently not 
supported"); + } else if (verb == "copy") { + copy(ins, unit); + } else { + raise_err(i_line, "invalid verb: " + verb); + } + }; + + // TODO: what if user does not use "all"? we may miss some tensors? + if (target == "all") { + for (auto & u : units) { + apply_verb(layer[u], u); + } + } else { + if (units.find(target) == units.end()) { + raise_err(i_line, "unit " + target + " does not exist"); + } + apply_verb(layer[target], target); + } + } + push_output_layer(); + n_layers = i_layer + 1; + + // print all parsed instructions + std::cout << "Parsed instructions:\n"; + for (auto & ins : instructions) { + print_inst(ins); + } + std::cout << "---\n" << "Total output layers: " << n_layers << "\n"; + + return instructions; +} diff --git a/llama.cpp b/llama.cpp index 62699ce52e197..c786bc5778aa4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -86,6 +87,7 @@ #include #include #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -11309,6 +11311,341 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } +// TODO: remove this when #5830 is merged +static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, int64_t * hist_cur, const float * imatrix, std::vector & workers, const int nthread) { + std::mutex mutex; + int counter = 0; + size_t new_size = 0; + if (nthread < 2) { + // single-thread + return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur, imatrix); + } + auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size, + nrows, n_per_row, imatrix]() { + std::array local_hist = {}; + const int nrows_per_chunk = chunk_size / n_per_row; + size_t local_size = 0; + while (true) { + std::unique_lock lock(mutex); + int first_row = counter; counter += nrows_per_chunk; + if (first_row >= nrows) { + if (local_size > 0) { + for (int j=0; j> models; + std::vector> mls; + std::vector> buf_in; + std::vector> buf_out; + std::set ref_names; // list of ref_name per layer + std::vector output_tensors; + + // output file + struct gguf_context * ctx_out = gguf_init_empty(); + std::ofstream fout(config->output_path, std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + + // remember to call before exit + auto clean_up = [&]() { + fout.close(); + for (auto & tensor : output_tensors) { + free(tensor); + } + gguf_free(ctx_out); + }; + + // load the input models + static const size_t n_models = 2; + for (size_t i = 0; i < n_models; i++) { + auto model = std::unique_ptr(new llama_model()); + auto ml = std::unique_ptr(new llama_model_loader(config->model_paths[i], use_mmap, NULL)); + ml->init_mapping(false); + llm_load_arch(*ml, *model); + llm_load_hparams(*ml, *model); + + models.push_back(std::move(model)); + mls.push_back(std::move(ml)); + } + + // for verb copy, we want to get the source tensor + auto get_src_tensor_for_copy = [&](const struct llama_merge_inst ins, size_t & i_model) { + i_model = std::string(ins.srcs[0]).empty() ? 
1 : 0; + return mls[i_model]->get_tensor_meta(ins.srcs[i_model]); + }; + + // construct metadata + { + // copy the KV pairs from the input file + gguf_set_kv(ctx_out, mls[0]->ctx_gguf); + + // correct layer count for output model + std::stringstream ss; + ss << mls[0]->get_arch_name() << ".block_count"; + gguf_set_val_u32(ctx_out, ss.str().c_str(), config->n_layers); + LLAMA_LOG_INFO("====> Set new value of %s = %ld\n", ss.str().c_str(), config->n_layers); + + // populate metadata for output tensors + auto push_tensor = [&](struct ggml_tensor * ref, const char * name) { + struct ggml_tensor * out_tensor = (struct ggml_tensor *) malloc(GGML_TENSOR_SIZE); + if (ref != nullptr) { + // copy metadata (shape, type,...) + memcpy(out_tensor, ref, GGML_TENSOR_SIZE); + } + ggml_set_name(out_tensor, name); + gguf_add_tensor(ctx_out, out_tensor); + output_tensors.push_back(out_tensor); + }; + for (size_t i = 0; i < config->n_insts; i++) { + const struct llama_merge_inst ins = config->insts[i]; + struct ggml_tensor * t0; + struct ggml_tensor * t1; + // TODO: reject non-requantize-able type (one that requires imatrix) + if (ins.method == LLAMA_MERGE_COPY) { + // simply copy from model A + size_t i_model; + t0 = get_src_tensor_for_copy(ins, i_model); + push_tensor(t0, ins.name); + } else if (ins.method == LLAMA_MERGE_LINEAR || ins.method == LLAMA_MERGE_SLERP) { + t0 = mls[0]->get_tensor_meta(ins.srcs[0]); + t1 = mls[1]->get_tensor_meta(ins.srcs[1]); + if (llama_format_tensor_shape(t0) != llama_format_tensor_shape(t1)) { + LLAMA_LOG_ERROR("some tensors does not have the same shape"); + clean_up(); + return -1; + } + push_tensor(t0, ins.name); + } else if (ins.method == LLAMA_MERGE_REPEAT) { + // TODO: in theory, we can point 2 tensors to the same offset, but here we're unable to do that, because offset is currently managed by gguf_add_tensor() + GGML_ASSERT(false); + /*int idx = nullptr; + std::string search_tensor(ins.srcs[0]); + for (auto & tensor : output_tensors) { + if (std::string(ggml_get_name(tensor)) == search_tensor) { + t0 = tensor; + break; + } + } + if (t0 == nullptr) { + LLAMA_LOG_ERROR("cannot find source tensor to repeat"); + clean_up(); + return -1; + } + push_tensor(t0, ins.name);*/ + } else { + GGML_ASSERT(false); // should never happen + } + } + + const size_t meta_size = gguf_get_meta_size(ctx_out); + + LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size); + + // placeholder for the meta data + ::zeros(fout, meta_size); + } + + // load tensor data into buffer + auto read_tensor_data = [&](struct ggml_tensor * tensor, llama_model_loader & ml, std::vector> & buf) -> size_t { + if (!ml.use_mmap) { + if (buf.size() < ggml_nbytes(tensor)) { + buf.resize(ggml_nbytes(tensor)); + } + tensor->data = buf.data(); + } + ml.load_data_for(tensor); + return ggml_nbytes(tensor); + }; + + size_t n_done = 0; + auto write_output_tensor = [&](const struct ggml_tensor * tensor, void * data) { + // write tensor data + padding + const size_t len = ggml_nbytes(tensor); + fout.write((const char *) data, len); + zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len); + n_done++; + LLAMA_LOG_INFO("[%4ld/%4ld] %36s - [%s], input type = %6s\n", + n_done, output_tensors.size(), + ggml_get_name(tensor), + llama_format_tensor_shape(tensor).c_str(), + ggml_type_name(tensor->type)); + }; + + // TODO: allow user to set n_threads + const int n_threads = std::thread::hardware_concurrency(); + std::vector workers; + workers.reserve(n_threads); + + // process instruction one by one + 
GGML_ASSERT(config->n_insts == output_tensors.size()); + for (size_t i = 0; i < config->n_insts; i++) { + const struct llama_merge_inst ins = config->insts[i]; + struct ggml_tensor * t0; + struct ggml_tensor * t1; + struct ggml_tensor * out_tensor = output_tensors[i]; + const size_t n_elements = ggml_nelements(out_tensor); + std::vector> in_buf0; + std::vector> f32_in_buf0; // dequant it internally + std::vector> in_buf1; + std::vector> f32_in_buf1; // dequant it internally + std::vector f32_out_buf(n_elements, 0.0); // do not resize! + std::vector out_buf(ggml_nbytes(out_tensor)); // do not resize! + const int n_per_row = out_tensor->ne[0]; + const int n_rows = n_elements / n_per_row; + + if (ins.method == LLAMA_MERGE_COPY) { + LLAMA_LOG_INFO("copy\n"); + size_t i_model; + t0 = get_src_tensor_for_copy(ins, i_model); + read_tensor_data(t0, *mls[i_model], in_buf0); + write_output_tensor(out_tensor, t0->data); + continue; + } + + // dequantize the tensor to FP32 + auto dequantize = [&](struct ggml_tensor * in_tensor, std::vector> & f32_in_buf) { + if (in_tensor->type != GGML_TYPE_F32) { + LLAMA_LOG_INFO("dequant "); + llama_convert_tensor_internal(in_tensor, f32_in_buf, workers, n_elements, n_threads); + } else { + // if we already have f32, just copy it + LLAMA_LOG_INFO("f32_copy "); + f32_in_buf.resize(n_elements); + memcpy((void *) f32_in_buf.data(), in_tensor->data, n_elements * sizeof(float)); + } + }; + + // load data and dequantize + if (ins.method == LLAMA_MERGE_LINEAR || ins.method == LLAMA_MERGE_SLERP) { + t0 = mls[0]->get_tensor_meta(ins.srcs[0]); + t1 = mls[1]->get_tensor_meta(ins.srcs[1]); + read_tensor_data(t0, *mls[0], in_buf0); + read_tensor_data(t1, *mls[1], in_buf1); + dequantize(t0, f32_in_buf0); + dequantize(t1, f32_in_buf1); + } + + if (ins.method == LLAMA_MERGE_LINEAR) { + LLAMA_LOG_INFO("linear "); + float * in0 = (float *) f32_in_buf0.data(); + float * in1 = (float *) f32_in_buf1.data(); + float * dest = (float *) f32_out_buf.data(); + for (size_t i = 0; i < n_elements; i++) { + dest[i] = in0[i] * ins.scales[0] + in1[i] * ins.scales[1]; + } + } + + if (ins.method == LLAMA_MERGE_SLERP) { + // Python code: https://gist.github.com/dvschultz/3af50c40df002da3b751efab1daddf2c + LLAMA_LOG_INFO("slerp "); + static const float dot_threshold = 0.9995; + auto lerp_row = [](float * in0, float * in1, float * out, size_t nelem, float t) { + for (size_t i = 0; i < nelem; i++) { + out[i] = in0[i] * (1.0 - t) + in1[i] * t; + } + }; + auto slerp_row = [&lerp_row](float * in0, float * in1, float * out, size_t nelem, float t) { + float norm0 = std::sqrt(std::inner_product(in0, in0 + nelem, in0, 0.0)); + float norm1 = std::sqrt(std::inner_product(in1, in1 + nelem, in1, 0.0)); + // Normalize the vectors to get the directions and angles + std::vector v0(nelem); + std::vector v1(nelem); + for (size_t i = 0; i < nelem; i++) { + v0[i] = in0[i] / norm0; + v1[i] = in1[i] / norm1; + } + // Dot product with the normalized vectors + float dot = std::inner_product(v0.begin(), v0.end(), v1.begin(), 0.0); + // If absolute value of dot product is almost 1, vectors are ~colineal, so use lerp + if (std::abs(dot) > dot_threshold) { + return lerp_row(in0, in1, out, nelem, t); + } + // Calculate initial angle between v0 and v1 + float theta_0 = std::acos(dot); + float sin_theta_0 = std::sin(theta_0); + // Angle at timestep t + float theta_t = theta_0 * t; + float sin_theta_t = std::sin(theta_t); + // Finish the slerp algorithm + float s0 = std::sin(theta_0 - theta_t) / sin_theta_0; + float s1 = sin_theta_t 
/ sin_theta_0; + for (size_t i = 0; i < nelem; i++) { + out[i] = in0[i] * s0 + in1[i] * s1; + } + }; + for (int r = 0; r < n_rows; r++) { + float * in0 = (float *) f32_in_buf0.data(); + float * in1 = (float *) f32_in_buf1.data(); + float * dest = (float *) f32_out_buf.data(); + size_t offset = n_per_row * r; + slerp_row(in0 + offset, in1 + offset, dest + offset, n_per_row, ins.t); + } + } + + // re-quantize it + { + LLAMA_LOG_INFO("requant\n"); + std::array hist_cur = {}; + static const int min_chunk_size = 32 * 512; + const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row); + size_t new_size = llama_tensor_quantize_internal( + out_tensor->type, + f32_out_buf.data(), + out_buf.data(), + chunk_size, + n_rows, + n_per_row, + hist_cur.data(), // unused for now + nullptr, + workers, + n_threads); + GGML_ASSERT(new_size == out_buf.size()); + } + + LLAMA_LOG_INFO("===> INPUT %f %f %f\n", f32_in_buf0[0].value, f32_in_buf0[1].value, f32_in_buf0[2].value); + LLAMA_LOG_INFO("===> OUTPUT %f %f %f\n", f32_out_buf[0], f32_out_buf[1], f32_out_buf[2]); + + write_output_tensor(out_tensor, out_buf.data()); + } + + // go back to beginning of file and write the updated meta data + { + fout.seekp(0); + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.write((const char *) data.data(), data.size()); + LLAMA_LOG_INFO("===> Written metadata size = %ld bytes\n", data.size()); + } + + clean_up(); + return 0; +} + static int llama_apply_lora_from_file_internal( const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads ) { @@ -12175,6 +12512,18 @@ uint64_t llama_model_n_params(const struct llama_model * model) { return nparams; } +int32_t llama_get_all_tensors_name(struct llama_model * model, const char ** name_arr, size_t arr_size) { + size_t i = 0; + for (const auto & it : model->tensors_by_name) { + if (i == arr_size) { + break; + } + name_arr[i] = it.first.c_str(); + i++; + } + return model->tensors_by_name.size(); +} + struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) { auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(), [name](const std::pair & it) { diff --git a/llama.h b/llama.h index 4d0ebe37d3f9b..b2ff920da4a89 100644 --- a/llama.h +++ b/llama.h @@ -327,6 +327,33 @@ extern "C" { const char * content; } llama_chat_message; + enum llama_merge_method { + LLAMA_MERGE_LINEAR, + LLAMA_MERGE_SLERP, + LLAMA_MERGE_REPEAT, // doesn't work for now + LLAMA_MERGE_COPY, + }; + + // instruction for merging tensors (model merge) + struct llama_merge_inst { + char name[GGML_MAX_NAME]; // name of output tensor + enum llama_merge_method method; + // we only support 2 models for now + char srcs[2][GGML_MAX_NAME]; // name of input tensors. 
if method == copy, only one src is non-empty + float scales[2]; // for linear method + float t; // for slerp method + }; + + // merge models + struct llama_merge_config { + // we only support 2 models for now + const char * model_paths[2]; + const struct llama_merge_inst * insts; + const size_t n_insts; + const size_t n_layers; // number of output layers + const char * output_path; + }; + // Helpers for getting default parameters LLAMA_API struct llama_model_params llama_model_default_params(void); LLAMA_API struct llama_context_params llama_context_default_params(void); @@ -405,6 +432,9 @@ extern "C" { // Returns the total number of parameters in the model LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model); + // Get a list of model tensor name, returns number of elements + LLAMA_API int32_t llama_get_all_tensors_name(struct llama_model * model, const char ** name_arr, size_t arr_size); + // Get a llama model tensor LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name); @@ -414,6 +444,10 @@ extern "C" { const char * fname_out, const llama_model_quantize_params * params); + // Merge multiple models, inspired by mergekit + LLAMA_API int32_t llama_merge_models( + const struct llama_merge_config * config); + // Apply a LoRA adapter to a loaded model // path_base_model is the path to a higher quality model to use as a base for // the layers modified by the adapter. Can be NULL to use the current loaded model.
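For reference, below is a minimal sketch of driving the new llama_merge_models() API directly, without the example's config parser. It is illustrative only and not part of the patch: the model and output paths are placeholders, the tensor names simply follow the "blk.<layer>.<unit>.weight" convention used by parser.hpp, and a real merge would also need instructions for the remaining tensors of every layer as well as the non-layer tensors (which the example tool generates automatically).

#include "llama.h"
#include <cstring>

int main() {
    llama_merge_inst insts[2] = {};

    // output blk.0.attn_output.weight = 0.5 * model A + 0.5 * model B
    std::strcpy(insts[0].name,    "blk.0.attn_output.weight");
    insts[0].method = LLAMA_MERGE_LINEAR;
    std::strcpy(insts[0].srcs[0], "blk.0.attn_output.weight");
    std::strcpy(insts[0].srcs[1], "blk.0.attn_output.weight");
    insts[0].scales[0] = 0.5f;
    insts[0].scales[1] = 0.5f;

    // output blk.1.attn_output.weight = slerp(model A, model B, t = 0.3)
    std::strcpy(insts[1].name,    "blk.1.attn_output.weight");
    insts[1].method = LLAMA_MERGE_SLERP;
    std::strcpy(insts[1].srcs[0], "blk.1.attn_output.weight");
    std::strcpy(insts[1].srcs[1], "blk.1.attn_output.weight");
    insts[1].t = 0.3f;

    const llama_merge_config config = {
        { "model_a.gguf", "model_b.gguf" }, // placeholder input models
        insts,
        2,                                  // n_insts
        2,                                  // n_layers of the output model
        "merged.gguf",                      // placeholder output path
    };
    return llama_merge_models(&config);
}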