Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Add model merge example #5741

Draft
wants to merge 17 commits into
base: master
Choose a base branch
from
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ models-mnt
/train-text-from-scratch
/tokenize
/vdot
/merge
/common/build-info.cpp
arm_neon.h
compile_commands.json
Expand Down
9 changes: 7 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o \
merge

# Binaries only useful for tests
TEST_TARGETS = \
Expand Down Expand Up @@ -704,6 +705,10 @@ quantize: examples/quantize/quantize.cpp build-info.o ggml.
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

# Build the `merge` example. parser.hpp is a prerequisite so edits retrigger the
# build, but it must not reach the link line: filter out %.hpp as well as %.h
# (the original pattern only matched %.h, passing parser.hpp to the linker).
merge: examples/merge/merge.cpp examples/merge/parser.hpp build-info.o ggml.o llama.o $(OBJS)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
Expand Down
6 changes: 6 additions & 0 deletions examples/merge/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Build configuration for the `merge` example (merges two GGUF models
# according to a merge-instruction config file; see config.example.txt).
set(TARGET merge)
# parser.hpp is listed so IDEs/builds track it; only merge.cpp is compiled.
add_executable(${TARGET} merge.cpp parser.hpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
# common.h is pulled from the shared examples helper directory.
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
123 changes: 123 additions & 0 deletions examples/merge/config.example.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# GGUF merge instructions
#
# Lines starting with "#" are comments
# Empty lines will be ignored
# The "output layer" instruction is to add a new layer for output model
# Merge instruction is in format: target (space) verb (space) parameters
# Supported verbs:
# - linear: weighted sum of the two source layers, parameters: source_layer,source_layer,scale,scale
# - slerp: spherical linear interpolation, parameters: source_layer,source_layer,t
# - copy: copy one layer unchanged, parameters: source_model,source_layer


#########################
# Example:

# This is the first layer of output model:
# For all tensors, we want slerp(model[0].layer[0], model[1].layer[0], 0.1)
# Except for the "attn_output" tensor, for which we want t=0.5 instead of t=0.1

output layer 0
all slerp 0,0,0.1
attn_output slerp 0,0,0.5

# For next layer, we want: model[0].layer[1]*0.6 + model[1].layer[1]*0.4
# Except for "attn_output" tensor that we want to use slerp with t=0.9

output layer 1
all linear 1,1,0.6,0.4
attn_output slerp 1,1,0.9

# For next layer, we want to copy from model[0].layer[2]

output layer 2
all copy 0,2

output layer 3
all copy 0,3

# For next layer, we want to copy from model[1].layer[4]

output layer 4
all copy 1,4

output layer 5
all copy 1,5

output layer 6
all linear 6,6,0.1,0.9

output layer 7
all linear 7,7,0.1,0.9

output layer 8
all linear 8,8,0.1,0.9

output layer 9
all linear 9,9,0.1,0.9

output layer 10
all linear 10,10,0.1,0.9

output layer 11
all linear 11,11,0.1,0.9

output layer 12
all linear 12,12,0.1,0.9

output layer 13
all linear 13,13,0.3333,0.6666

output layer 14
all linear 14,14,0.3333,0.6666

output layer 15
all linear 15,15,0.3333,0.6666

output layer 16
all linear 16,16,0.3333,0.6666

output layer 17
all linear 17,17,0.3333,0.6666

output layer 18
all linear 18,18,0.3333,0.6666

output layer 19
all linear 19,19,0.3333,0.6666

output layer 20
all slerp 20,20,0.8

output layer 21
all slerp 21,21,0.8

output layer 22
all slerp 22,22,0.8

output layer 23
all slerp 23,23,0.8

output layer 24
all slerp 24,24,0.8

output layer 25
all slerp 25,25,0.8

output layer 26
all slerp 26,26,0.8

output layer 27
all slerp 27,27,0.8

output layer 28
all slerp 28,28,0.8

output layer 29
all slerp 29,29,0.8

output layer 30
all slerp 30,30,0.8

output layer 31
all slerp 31,31,0.8
127 changes: 127 additions & 0 deletions examples/merge/merge.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#include "common.h"
#include "llama.h"
#include "parser.hpp"

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

static const size_t n_models = 2; // hard-limited to 2 input models for now

// Command-line options for the merge tool, pre-filled with their defaults.
struct merge_params {
    std::string config_path = "config.txt";            // merge-instruction file (see config.example.txt)
    std::vector<std::string> model_paths;              // input models, in command-line order
    std::string output_path = "ggml-merged-f16.gguf";  // destination of the merged model
    bool only_list_tensors_name = false;               // --print-list-tensor: dump tensor names and exit
    bool dry_run                = false;               // --dry-run: parse the config, then exit
};

// Print the help text to stdout and terminate the process with `exit_code`.
// A default-constructed merge_params supplies the default values shown.
[[noreturn]]
static void usage(const char * executable, int exit_code) {
    struct merge_params defaults;
    printf("usage: %s -c CONFIG_FILE -o OUTPUT_FILE -m MODEL_PATH -m MODEL_PATH ...\n\n", executable);
    printf("\n");
    printf("Merging multiple models, inspired by mergekit.\n");
    printf("For more details, see \"config.example.txt\" file.\n");
    printf("\n");
    printf("NOTE:\n");
    printf("- Only support merging 2 models.\n");
    printf("- The embedding and output layers of the first model will be used.\n");
    printf("- Currently, we accept both quantized and non-quantized models as input. The output model will be re-quantized into the same format of the first model.\n");
    printf("\n");
    printf("Options:\n");
    printf(" -h, --help Show this help message and exit\n");
    printf(" -c, --config CONFIG_FILE Path to config file, in CSV format (default: %s)\n", defaults.config_path.c_str());
    printf(" -m, --model MODEL_PATH Path to model. This option can be repeated multiple times and must be specified in the right order.\n");
    printf(" -o, --output OUTPUT_FILE Path to the output model (default: %s)\n", defaults.output_path.c_str());
    // fixed: original text read "Only print out list of parsed and exit" (missing noun)
    printf(" --dry-run Only print out the list of parsed instructions and exit, useful for debugging\n");
    printf(" --print-list-tensor Only print out list of tensors of the input model, useful for debugging (only one model is accepted)\n");
    printf("\n");
    printf("Example: ./merge -c config.txt -o output.gguf -m model_a.gguf -m model_b.gguf\n");
    exit(exit_code);
}

int main(int argc, char ** argv) {
    bool invalid_param = false;
    struct merge_params params;

    // Parse command-line arguments. `arg` survives the loop so that an
    // invalid argument can be reported by name below.
    std::string arg;
    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        if (arg == "-h" || arg == "--help") {
            usage(argv[0], 0);
        } else if (arg == "-c" || arg == "--config") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.config_path = argv[i];
        } else if (arg == "-m" || arg == "--model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.model_paths.push_back(argv[i]);
        } else if (arg == "-o" || arg == "--output") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.output_path = argv[i];
        } else if (arg == "--print-list-tensor") {
            params.only_list_tensors_name = true;
        } else if (arg == "--dry-run") {
            params.dry_run = true;
        } else {
            // reject unknown arguments instead of silently ignoring them
            invalid_param = true;
            break;
        }
    }

    try {
        if (invalid_param) {
            // report the offending argument BEFORE calling usage():
            // usage() is [[noreturn]], so anything after it never runs
            // (the original threw after usage(), losing the message)
            fprintf(stderr, "error: invalid parameter for argument: %s\n\n", arg.c_str());
            usage(argv[0], 1);
        }
        if (!params.only_list_tensors_name && params.model_paths.size() < 2) {
            throw std::invalid_argument("error: require at least 2 models");
        }

        if (params.only_list_tensors_name) {
            if (params.model_paths.size() != 1) {
                throw std::invalid_argument("error: we can only list tensors of one single model");
            }
            print_model_tensors_name(params.model_paths[0]);
            return 0; // exit now
        }

        // n_layers is filled in by the parser from the "output layer" lines
        size_t n_layers = 0;
        auto instructions = parse_config(params.config_path, params.model_paths[0], n_layers);

        if (params.dry_run) {
            return 0;
        }

        // only the first two models are used; extras are ignored
        struct llama_merge_config config{
            {
                params.model_paths[0].c_str(),
                params.model_paths[1].c_str(),
            },
            instructions.data(),
            instructions.size(),
            n_layers,
            params.output_path.c_str(),
        };

        llama_merge_models(&config);
    } catch (const std::exception & ex) {
        fprintf(stderr, "%s\n\n", ex.what());
        return 1; // signal failure to the caller (original returned 0 on error)
    }

    return 0;
}
Loading
Loading