Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

llama : refactor model loading code #2620

Merged
merged 29 commits into from
Aug 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
a82e3a4
llama : style formatting + remove helper methods
ggerganov Aug 15, 2023
66ce19a
llama : fix quantization using gguf tool
ggerganov Aug 15, 2023
c9c0b75
llama : simplify gguf_file_saver
ggerganov Aug 15, 2023
6e29ed5
llama : fix method names
ggerganov Aug 15, 2023
5c85332
llama : simplify write_header()
ggerganov Aug 15, 2023
9574f41
llama : no need to pass full file loader to the file saver
ggerganov Aug 15, 2023
da424b6
llama : gguf_file_saver write I32
ggerganov Aug 15, 2023
2d87c9c
llama : refactor tensor names (#2622)
monatis Aug 15, 2023
5cb9d9a
gguf : initial write API (not tested yet)
ggerganov Aug 15, 2023
85ebfb8
gguf : write to file API (not tested)
ggerganov Aug 15, 2023
f6ecd15
gguf : initial write API ready + example
ggerganov Aug 15, 2023
4463965
gguf : fix header write
ggerganov Aug 15, 2023
c9b2f7f
gguf : fixes + simplify example + add ggml_nbytes_pad()
ggerganov Aug 15, 2023
35177d7
gguf : minor
ggerganov Aug 15, 2023
4ef5e79
llama : replace gguf_file_saver with new gguf write API
ggerganov Aug 15, 2023
f7a6aa9
gguf : streaming support when writing files
ggerganov Aug 15, 2023
1751bd4
gguf : remove obsolete write methods
ggerganov Aug 15, 2023
2906d54
gguf : remove obsolete gguf_get_arr_xxx API
ggerganov Aug 15, 2023
6c3f824
llama : simplify gguf_file_loader
ggerganov Aug 15, 2023
a02b809
llama : move hparams and vocab from gguf_file_loader to llama_model_l…
ggerganov Aug 15, 2023
afd135a
llama : merge gguf-util.h in llama.cpp
ggerganov Aug 15, 2023
f477fb0
llama : reorder definitions in .cpp to match .h
ggerganov Aug 15, 2023
23248d7
llama : minor simplifications
ggerganov Aug 15, 2023
5339b85
llama : refactor llama_model_loader (WIP)
ggerganov Aug 15, 2023
31fb56e
llama : fix shape prints
ggerganov Aug 16, 2023
c1fe0ab
llama : fix Windows build + fix norm_rms_eps key
ggerganov Aug 16, 2023
f634b29
llama : throw error on missing KV pairs in model metadata
ggerganov Aug 16, 2023
e524750
llama : improve printing + log meta data
ggerganov Aug 16, 2023
6823899
llama : switch print order of meta data
ggerganov Aug 16, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ OBJS += ggml-alloc.o
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@

gguf-llama.o: gguf-llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h gguf-llama.h gguf-util.h
gguf-llama.o: gguf-llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h gguf-llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@

common.o: examples/common.cpp examples/common.h
Expand Down
2 changes: 1 addition & 1 deletion convert-llama-h5-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def count_model_parts(dir_model: str) -> int:
toktype = 1 # defualt to normal token type
if tokenizer.is_unknown(i): toktype = 2
if tokenizer.is_control(i): toktype = 3

# TODO: How to determine if a token is user defined?
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = 4
Expand Down
7 changes: 5 additions & 2 deletions examples/gguf/gguf-llama-simple.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@ int main(int argc, char ** argv) {
// tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
// example, we will just stop the loop once this cache is full or once an end of stream is detected.

while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
const int n_gen = std::min(32, max_context_size);

while (llama_get_kv_cache_token_count(ctx) < n_gen) {
// evaluate the transformer

if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
Expand Down Expand Up @@ -114,13 +116,14 @@ int main(int argc, char ** argv) {

// push this new token for next evaluation
tokens_list.push_back(new_token_id);

}

llama_free(ctx);
llama_free_model(model);

llama_backend_free();

fprintf(stderr, "\n\n");

return 0;
}
248 changes: 30 additions & 218 deletions examples/gguf/gguf.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#include "ggml.h"
#include "gguf-util.h"
#include "gguf-llama.h"

#include <cstdio>
Expand All @@ -21,133 +20,22 @@ static std::string to_string(const T & val) {
return ss.str();
}

// Write a string to `fout` as a 32-bit length prefix followed by the raw
// characters (no terminating NUL is written).
void gguf_ex_write_str(std::ofstream & fout, const std::string & val) {
    // the on-disk length field is an int32_t, so the size is narrowed here
    const int32_t len = val.size();

    fout.write(reinterpret_cast<const char *>(&len), sizeof(len));
    fout.write(val.data(), len);
}

// Write a single 32-bit signed integer to `fout` in the host's native byte layout.
void gguf_ex_write_i32(std::ofstream & fout, int32_t val) {
    const char * bytes = reinterpret_cast<const char *>(&val);
    fout.write(bytes, sizeof(int32_t));
}

// Write an unsigned 64-bit value to `fout` in the host's native byte layout.
//
// The parameter type is size_t for compatibility with existing callers, but
// the value is widened to uint64_t before writing so that exactly 8 bytes are
// always emitted, including on targets where size_t is only 32 bits wide.
void gguf_ex_write_u64(std::ofstream & fout, size_t val) {
    const uint64_t val64 = val;
    fout.write(reinterpret_cast<const char *>(&val64), sizeof(val64));
}

// Write one key/value pair to `fout`: the key (via gguf_ex_write_str), then the
// raw bytes of the type tag, then the raw bytes of the value itself.
// The written parameter is also echoed to stdout.
template<typename T>
void gguf_ex_write_val(std::ofstream & fout, const std::string & key, enum gguf_type type, const T & val) {
    gguf_ex_write_str(fout, key);

    const char * type_bytes = reinterpret_cast<const char *>(&type);
    const char * val_bytes  = reinterpret_cast<const char *>(&val);

    fout.write(type_bytes, sizeof(type));
    fout.write(val_bytes,  sizeof(val));

    const std::string printed = to_string(val);
    fprintf(stdout, "%s: write param: %s = %s\n", __func__, key.c_str(), printed.c_str());
}

// String specialization: the value is not written as raw object bytes but as a
// 32-bit length followed by the characters. The key and the type tag are
// written the same way as in the generic version.
template<>
void gguf_ex_write_val<std::string>(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::string & val) {
    gguf_ex_write_str(fout, key);
    fout.write(reinterpret_cast<const char *>(&type), sizeof(type));

    // length-prefixed payload
    const int32_t len = val.size();
    fout.write(reinterpret_cast<const char *>(&len), sizeof(len));
    fout.write(val.data(), len);

    fprintf(stdout, "%s: write param: %s = %s\n", __func__, key.c_str(), val.c_str());
}

// Write an array-valued key/value pair to `fout`.
// Layout, in write order: key string, GGUF_TYPE_ARRAY tag, element type tag,
// 32-bit element count, then the elements as one contiguous block of raw bytes.
// The array contents are also echoed to stdout as a comma-separated list.
template<typename T>
void gguf_ex_write_arr(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::vector<T> & val) {
    gguf_ex_write_str(fout, key);

    // marks the value as an array; the element type follows
    const enum gguf_type array_tag = GGUF_TYPE_ARRAY;
    fout.write(reinterpret_cast<const char *>(&array_tag), sizeof(array_tag));

    const int32_t count = val.size();
    fout.write(reinterpret_cast<const char *>(&type),  sizeof(type));
    fout.write(reinterpret_cast<const char *>(&count), sizeof(count));
    fout.write(reinterpret_cast<const char *>(val.data()), count * sizeof(T));

    fprintf(stdout, "%s: write param: %s = [", __func__, key.c_str());
    for (int idx = 0; idx < count; ++idx) {
        // separator goes between elements, never after the last one
        if (idx > 0) {
            fprintf(stdout, ", ");
        }
        fprintf(stdout, "%s", to_string(val[idx]).c_str());
    }
    fprintf(stdout, "]\n");
}

// String-array specialization: the header (key, GGUF_TYPE_ARRAY tag, element
// type tag, 32-bit count) matches the generic version, but each element is
// written individually as a 32-bit length followed by its characters.
// The array contents are also echoed to stdout as a comma-separated list.
template<>
void gguf_ex_write_arr<std::string>(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::vector<std::string> & val) {
    gguf_ex_write_str(fout, key);

    // marks the value as an array; the element type follows
    const enum gguf_type array_tag = GGUF_TYPE_ARRAY;
    fout.write(reinterpret_cast<const char *>(&array_tag), sizeof(array_tag));

    const int32_t count = val.size();
    fout.write(reinterpret_cast<const char *>(&type),  sizeof(type));
    fout.write(reinterpret_cast<const char *>(&count), sizeof(count));

    for (int idx = 0; idx < count; ++idx) {
        const std::string & item = val[idx];

        const int32_t item_len = item.size();
        fout.write(reinterpret_cast<const char *>(&item_len), sizeof(item_len));
        fout.write(item.data(), item_len);
    }

    fprintf(stdout, "%s: write param: %s = [", __func__, key.c_str());
    for (int idx = 0; idx < count; ++idx) {
        // separator goes between elements, never after the last one
        if (idx > 0) {
            fprintf(stdout, ", ");
        }
        fprintf(stdout, "%s", val[idx].c_str());
    }
    fprintf(stdout, "]\n");
}

bool gguf_ex_write(const std::string & fname) {
std::ofstream fout(fname.c_str(), std::ios::binary);

{
const int32_t magic = GGUF_MAGIC;
fout.write((const char *) &magic, sizeof(magic));
}

{
const int32_t version = GGUF_VERSION;
fout.write((const char *) &version, sizeof(version));
}

// NOTE: these have to match the output below!
const int n_tensors = 10;
const int n_kv = 12;

fout.write((const char*) &n_tensors, sizeof(n_tensors));
fout.write((const char*) &n_kv, sizeof(n_kv));

fprintf(stdout, "%s: write header\n", __func__);

// kv data
{
gguf_ex_write_val< uint8_t>(fout, "some.parameter.uint8", GGUF_TYPE_UINT8, 0x12);
gguf_ex_write_val< int8_t>(fout, "some.parameter.int8", GGUF_TYPE_INT8, -0x13);
gguf_ex_write_val<uint16_t>(fout, "some.parameter.uint16", GGUF_TYPE_UINT16, 0x1234);
gguf_ex_write_val< int16_t>(fout, "some.parameter.int16", GGUF_TYPE_INT16, -0x1235);
gguf_ex_write_val<uint32_t>(fout, "some.parameter.uint32", GGUF_TYPE_UINT32, 0x12345678);
gguf_ex_write_val< int32_t>(fout, "some.parameter.int32", GGUF_TYPE_INT32, -0x12345679);

gguf_ex_write_val<float> (fout, "some.parameter.float32", GGUF_TYPE_FLOAT32, 0.123456789f);
gguf_ex_write_val<bool> (fout, "some.parameter.bool", GGUF_TYPE_BOOL, true);

gguf_ex_write_val<std::string>(fout, "some.parameter.string", GGUF_TYPE_STRING, "hello world");

gguf_ex_write_arr<int16_t> (fout, "some.parameter.arr.i16", GGUF_TYPE_INT16, { 1, 2, 3, 4, });
gguf_ex_write_arr<float> (fout, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, { 3.145f, 2.718f, 1.414f, });
gguf_ex_write_arr<std::string>(fout, "some.parameter.arr.str", GGUF_TYPE_STRING, { "hello", "world", "!" });
}

uint64_t offset_tensor = 0;
struct gguf_context * ctx = gguf_init_empty();

gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
gguf_set_val_i8 (ctx, "some.parameter.int8", -0x13);
gguf_set_val_u16 (ctx, "some.parameter.uint16", 0x1234);
gguf_set_val_i16 (ctx, "some.parameter.int16", -0x1235);
gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678);
gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679);
gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f);
gguf_set_val_bool(ctx, "some.parameter.bool", true);
gguf_set_val_str (ctx, "some.parameter.string", "hello world");

gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16, std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);

struct ggml_init_params params = {
/*.mem_size =*/ 128ull*1024ull*1024ull,
Expand All @@ -157,6 +45,8 @@ bool gguf_ex_write(const std::string & fname) {

struct ggml_context * ctx_data = ggml_init(params);

const int n_tensors = 10;

// tensor infos
for (int i = 0; i < n_tensors; ++i) {
const std::string name = "tensor_" + to_string(i);
Expand All @@ -178,58 +68,15 @@ bool gguf_ex_write(const std::string & fname) {
}
}

fprintf(stdout, "%s: tensor: %s, %d dims, ne = [", __func__, name.c_str(), n_dims);
for (int j = 0; j < 4; ++j) {
fprintf(stdout, "%s%3d", j == 0 ? "" : ", ", (int) cur->ne[j]);
}
fprintf(stdout, "], offset_tensor = %6" PRIu64 "\n", offset_tensor);

gguf_ex_write_str(fout, name);
gguf_ex_write_i32(fout, n_dims);
for (int j = 0; j < n_dims; ++j) {
gguf_ex_write_i32(fout, cur->ne[j]);
}
gguf_ex_write_i32(fout, cur->type);
gguf_ex_write_u64(fout, offset_tensor);

offset_tensor += GGML_PAD(ggml_nbytes(cur), GGUF_DEFAULT_ALIGNMENT);
}

const uint64_t offset_data = GGML_PAD((uint64_t) fout.tellp(), GGUF_DEFAULT_ALIGNMENT);

fprintf(stdout, "%s: data offset = %" PRIu64 "\n", __func__, offset_data);

{
const size_t pad = offset_data - fout.tellp();

for (size_t j = 0; j < pad; ++j) {
fout.put(0);
}
}

for (int i = 0; i < n_tensors; ++i) {
fprintf(stdout, "%s: writing tensor %d data\n", __func__, i);

const std::string name = "tensor_" + to_string(i);

struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str());

fout.write((const char *) cur->data, ggml_nbytes(cur));

{
const size_t pad = GGML_PAD(ggml_nbytes(cur), GGUF_DEFAULT_ALIGNMENT) - ggml_nbytes(cur);

for (size_t j = 0; j < pad; ++j) {
fout.put(0);
}
}
gguf_add_tensor(ctx, cur);
}

fout.close();
gguf_write_to_file(ctx, fname.c_str(), false);

fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());

ggml_free(ctx_data);
gguf_free(ctx);

return true;
}
Expand Down Expand Up @@ -345,8 +192,16 @@ bool gguf_ex_read_1(const std::string & fname) {

struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n",
__func__, i, cur->n_dims, cur->name, cur->data);
fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);

// print first 10 elements
const float * data = (const float *) cur->data;

printf("%s data[:10] : ", name);
for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
printf("%f ", data[j]);
}
printf("\n\n");

// check data
{
Expand All @@ -369,48 +224,6 @@ bool gguf_ex_read_1(const std::string & fname) {
return true;
}

// read just the tensor info and mmap the data in user code
bool gguf_ex_read_2(const std::string & fname) {
struct ggml_context * ctx_data = NULL;

struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_data,
};

struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);

struct gguf_file file(fname.c_str(), "rb");
gguf_mmap data_mmap(&file, 0, false);

const int n_tensors = gguf_get_n_tensors(ctx);

for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);

struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

cur->data = static_cast<char *>(data_mmap.addr) + offset;

// print first 10 elements
const float * data = (const float *) cur->data;

printf("%s data[:10] : ", name);
for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
printf("%f ", data[j]);
}
printf("\n\n");
}

fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));

ggml_free(ctx_data);
gguf_free(ctx);

return true;
}

int main(int argc, char ** argv) {
if (argc < 3) {
fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
Expand All @@ -427,7 +240,6 @@ int main(int argc, char ** argv) {
} else if (mode == "r") {
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
GGML_ASSERT(gguf_ex_read_2(fname) && "failed to read gguf file");
} else if (mode == "q") {
llama_model_quantize_params params = llama_model_quantize_default_params();
llama_model_quantize(fname.c_str(), "quant.gguf", &params);
Expand Down
Loading