Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Add model merge example #5741

Draft
wants to merge 17 commits into
base: master
Choose a base branch
from
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ models-mnt
/train-text-from-scratch
/tokenize
/vdot
/merge
/common/build-info.cpp
arm_neon.h
compile_commands.json
Expand Down
9 changes: 7 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o \
merge

# Binaries only useful for tests
TEST_TARGETS = \
Expand Down Expand Up @@ -704,6 +705,10 @@ quantize: examples/quantize/quantize.cpp build-info.o ggml.
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

# Build the `merge` example. parser.hpp is a prerequisite so edits retrigger the
# build, but it must not reach the link line: filter out %.hpp as well as %.h
# (the original pattern only matched %.h, passing parser.hpp to the linker).
merge: examples/merge/merge.cpp examples/merge/parser.hpp build-info.o ggml.o llama.o $(OBJS)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
Expand Down
6 changes: 6 additions & 0 deletions examples/merge/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Build configuration for the `merge` example (merges two GGUF models
# according to a merge-instruction config file; see config.example.txt).
set(TARGET merge)
# parser.hpp is listed so IDEs/builds track it; only merge.cpp is compiled.
add_executable(${TARGET} merge.cpp parser.hpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
# common.h is pulled from the shared examples helper directory.
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
123 changes: 123 additions & 0 deletions examples/merge/config.example.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# GGUF merge instructions
#
# Lines starting with "#" are comments
# Empty lines will be ignored
# The "output layer" instruction is to add a new layer for output model
# Merge instruction is in format: target (space) verb (space) parameters
# Supported verbs:
# - linear: weighted sum of the two source layers, parameters: source_layer,source_layer,scale,scale
# - slerp: spherical linear interpolation, parameters: source_layer,source_layer,t
# - copy: copy one layer unchanged, parameters: source_model,source_layer


#########################
# Example:

# This is the first layer of output model:
# For all tensors, we want slerp(model[0].layer[0], model[1].layer[0], 0.1)
# Except for the "attn_output" tensor, for which we want t=0.5 instead of t=0.1

output layer 0
all slerp 0,0,0.1
attn_output slerp 0,0,0.5

# For next layer, we want: model[0].layer[1]*0.6 + model[1].layer[1]*0.4
# Except for "attn_output" tensor that we want to use slerp with t=0.9

output layer 1
all linear 1,1,0.6,0.4
attn_output slerp 1,1,0.9

# For next layer, we want to copy from model[0].layer[2]

output layer 2
all copy 0,2

output layer 3
all copy 0,3

# For next layer, we want to copy from model[1].layer[4]

output layer 4
all copy 1,4

output layer 5
all copy 1,5

output layer 6
all linear 6,6,0.1,0.9

output layer 7
all linear 7,7,0.1,0.9

output layer 8
all linear 8,8,0.1,0.9

output layer 9
all linear 9,9,0.1,0.9

output layer 10
all linear 10,10,0.1,0.9

output layer 11
all linear 11,11,0.1,0.9

output layer 12
all linear 12,12,0.1,0.9

output layer 13
all linear 13,13,0.3333,0.6666

output layer 14
all linear 14,14,0.3333,0.6666

output layer 15
all linear 15,15,0.3333,0.6666

output layer 16
all linear 16,16,0.3333,0.6666

output layer 17
all linear 17,17,0.3333,0.6666

output layer 18
all linear 18,18,0.3333,0.6666

output layer 19
all linear 19,19,0.3333,0.6666

output layer 20
all slerp 20,20,0.8

output layer 21
all slerp 21,21,0.8

output layer 22
all slerp 22,22,0.8

output layer 23
all slerp 23,23,0.8

output layer 24
all slerp 24,24,0.8

output layer 25
all slerp 25,25,0.8

output layer 26
all slerp 26,26,0.8

output layer 27
all slerp 27,27,0.8

output layer 28
all slerp 28,28,0.8

output layer 29
all slerp 29,29,0.8

output layer 30
all slerp 30,30,0.8

output layer 31
all slerp 31,31,0.8
127 changes: 127 additions & 0 deletions examples/merge/merge.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#include "common.h"
#include "llama.h"
#include "parser.hpp"

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

static const size_t n_models = 2; // hard-limited to 2 input models for now

// Command-line options for the merge tool, pre-filled with their defaults.
struct merge_params {
    std::string config_path = "config.txt";            // merge-instruction file (see config.example.txt)
    std::vector<std::string> model_paths;              // input models, in command-line order
    std::string output_path = "ggml-merged-f16.gguf";  // destination of the merged model
    bool only_list_tensors_name = false;               // --print-list-tensor: dump tensor names and exit
    bool dry_run                = false;               // --dry-run: parse the config, then exit
};

// Print the help text to stdout and terminate the process with `exit_code`.
// A default-constructed merge_params supplies the default values shown.
[[noreturn]]
static void usage(const char * executable, int exit_code) {
    struct merge_params defaults;
    printf("usage: %s -c CONFIG_FILE -o OUTPUT_FILE -m MODEL_PATH -m MODEL_PATH ...\n\n", executable);
    printf("\n");
    printf("Merging multiple models, inspired by mergekit.\n");
    printf("For more details, see \"config.example.txt\" file.\n");
    printf("\n");
    printf("NOTE:\n");
    printf("- Only support merging 2 models.\n");
    printf("- The embedding and output layers of the first model will be used.\n");
    printf("- Currently, we accept both quantized and non-quantized models as input. The output model will be re-quantized into the same format of the first model.\n");
    printf("\n");
    printf("Options:\n");
    printf(" -h, --help Show this help message and exit\n");
    printf(" -c, --config CONFIG_FILE Path to config file, in CSV format (default: %s)\n", defaults.config_path.c_str());
    printf(" -m, --model MODEL_PATH Path to model. This option can be repeated multiple times and must be specified in the right order.\n");
    printf(" -o, --output OUTPUT_FILE Path to the output model (default: %s)\n", defaults.output_path.c_str());
    // fixed: original text read "Only print out list of parsed and exit" (missing noun)
    printf(" --dry-run Only print out the list of parsed instructions and exit, useful for debugging\n");
    printf(" --print-list-tensor Only print out list of tensors of the input model, useful for debugging (only one model is accepted)\n");
    printf("\n");
    printf("Example: ./merge -c config.txt -o output.gguf -m model_a.gguf -m model_b.gguf\n");
    exit(exit_code);
}

int main(int argc, char ** argv) {
    bool invalid_param = false;
    struct merge_params params;

    // Parse command-line arguments. `arg` survives the loop so that an
    // invalid argument can be reported by name below.
    std::string arg;
    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        if (arg == "-h" || arg == "--help") {
            usage(argv[0], 0);
        } else if (arg == "-c" || arg == "--config") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.config_path = argv[i];
        } else if (arg == "-m" || arg == "--model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.model_paths.push_back(argv[i]);
        } else if (arg == "-o" || arg == "--output") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.output_path = argv[i];
        } else if (arg == "--print-list-tensor") {
            params.only_list_tensors_name = true;
        } else if (arg == "--dry-run") {
            params.dry_run = true;
        } else {
            // reject unknown arguments instead of silently ignoring them
            invalid_param = true;
            break;
        }
    }

    try {
        if (invalid_param) {
            // report the offending argument BEFORE calling usage():
            // usage() is [[noreturn]], so anything after it never runs
            // (the original threw after usage(), losing the message)
            fprintf(stderr, "error: invalid parameter for argument: %s\n\n", arg.c_str());
            usage(argv[0], 1);
        }
        if (!params.only_list_tensors_name && params.model_paths.size() < 2) {
            throw std::invalid_argument("error: require at least 2 models");
        }

        if (params.only_list_tensors_name) {
            if (params.model_paths.size() != 1) {
                throw std::invalid_argument("error: we can only list tensors of one single model");
            }
            print_model_tensors_name(params.model_paths[0]);
            return 0; // exit now
        }

        // n_layers is filled in by the parser from the "output layer" lines
        size_t n_layers = 0;
        auto instructions = parse_config(params.config_path, params.model_paths[0], n_layers);

        if (params.dry_run) {
            return 0;
        }

        // only the first two models are used; extras are ignored
        struct llama_merge_config config{
            {
                params.model_paths[0].c_str(),
                params.model_paths[1].c_str(),
            },
            instructions.data(),
            instructions.size(),
            n_layers,
            params.output_path.c_str(),
        };

        llama_merge_models(&config);
    } catch (const std::exception & ex) {
        fprintf(stderr, "%s\n\n", ex.what());
        return 1; // signal failure to the caller (original returned 0 on error)
    }

    return 0;
}
Loading
Loading