MPT : clone wte to output at load time

cebtenzzre committed Nov 23, 2023
1 parent 1d19f80 commit 69c505e

Showing 4 changed files with 36 additions and 9 deletions.
convert-hf-to-gguf.py: 5 changes (0 additions, 5 deletions)
@@ -462,11 +462,6 @@ def write_tensors(self):
 
             self.gguf_writer.add_tensor(new_name, data)
 
-            # note: MPT output is tied to (same as) wte in original model;
-            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
-            if new_name == "token_embd.weight":
-                self.gguf_writer.add_tensor("output.weight", data)
-
 
 class BaichuanModel(Model):
     def set_vocab(self):
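Note: with the duplication removed from the converter, a freshly converted MPT GGUF contains only token_embd.weight on disk; output.weight now exists only in memory, created by the loader change further below. A minimal sketch of how this could be checked against a converted file, using only the public gguf API (the file name is a placeholder):

#include <stdio.h>
#include "ggml.h"

int main(void) {
    // placeholder path to a freshly converted MPT model
    const char * fname = "mpt-7b.gguf";

    // read metadata only: no ggml_context, no tensor data
    struct gguf_init_params params = {
        /*.no_alloc      = */ true,
        /*.ctx           = */ NULL,
        /*.extra_tensors = */ 0,
    };
    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (!ctx) {
        fprintf(stderr, "failed to open %s\n", fname);
        return 1;
    }

    // written by the converter as before
    printf("token_embd.weight index: %d\n", gguf_find_tensor(ctx, "token_embd.weight"));
    // no longer written by the converter; expected to be -1 (not found)
    printf("output.weight index:     %d\n", gguf_find_tensor(ctx, "output.weight"));

    gguf_free(ctx);
    return 0;
}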
ggml.c: 9 changes (7 additions, 2 deletions)
@@ -18278,10 +18278,11 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         // the ggml_tensor structs to the appropriate locations in the binary blob
 
         // compute the exact size needed for the new ggml_context
+        int n_tensors = ctx->header.n_tensors + params.extra_tensors;
         const size_t mem_size =
             params.no_alloc ?
-            (ctx->header.n_tensors    )*ggml_tensor_overhead() :
-            (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
+            (n_tensors    )*ggml_tensor_overhead() :
+            (n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
 
         struct ggml_init_params pdata = {
             .mem_size = mem_size,
@@ -18590,6 +18591,10 @@ size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].offset;
 }
 
+void gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset) {
+    ctx->infos[i].offset = offset;
+}
+
 char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].name.data;
 }
ggml.h: 2 changes (2 additions, 0 deletions)
@@ -2010,6 +2010,7 @@ extern "C" {
 
         // if not NULL, create a ggml_context and allocate the tensor data in it
         struct ggml_context ** ctx;
+        int extra_tensors;
     };
 
     GGML_API struct gguf_context * gguf_init_empty(void);
@@ -2053,6 +2054,7 @@ extern "C" {
     GGML_API int    gguf_get_n_tensors    (const struct gguf_context * ctx);
     GGML_API int    gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
     GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
+    GGML_API void   gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset);
     GGML_API char * gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
 
     // overrides existing values or adds a new one
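The two additions above work together: extra_tensors makes gguf_init_from_file size the metadata ggml_context with room for tensors the caller will create after parsing, and gguf_set_tensor_offset lets the caller point such a tensor's GGUF entry at data already present in the file. A rough sketch of the reservation side, under the assumption that exactly one tensor will be added later (as the llama.cpp loader below does):

#include "ggml.h"

// sketch: open a GGUF file for metadata only, leaving room in the returned
// ggml_context for one extra ggml_tensor struct created after parsing
static struct gguf_context * open_with_slack(const char * fname, struct ggml_context ** ctx_meta) {
    struct gguf_init_params params = {
        /*.no_alloc      = */ true,      // tensor structs only, no data
        /*.ctx           = */ ctx_meta,  // create a ggml_context holding them
        /*.extra_tensors = */ 1,         // counted into the context's mem_size
    };
    return gguf_init_from_file(fname, params);
}

Without the slack, a later ggml_new_tensor in that context would overflow its memory pool, since mem_size was previously computed from the tensor count in the file alone.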
llama.cpp: 29 changes (27 additions, 2 deletions)
@@ -1792,8 +1792,9 @@ struct llama_model_loader {
 
     llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
         struct gguf_init_params params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ &ctx_meta,
+            /*.no_alloc      = */ true,
+            /*.ctx           = */ &ctx_meta,
+            /*.extra_tensors = */ 1,
         };
 
         ctx_gguf = gguf_init_from_file(fname.c_str(), params);
@@ -2100,6 +2101,25 @@ struct llama_model_loader {
             done_size += ggml_nbytes(cur);
         }
     }
+
+    // must be called before calc_sizes
+    void clone_tensor(const char * src_name, const char * dst_name) {
+        int src_idx = gguf_find_tensor(ctx_gguf, src_name);
+        GGML_ASSERT(src_idx >= 0);
+
+        struct ggml_tensor * src = ggml_get_tensor(ctx_meta, src_name);
+        size_t src_offset = gguf_get_tensor_offset(ctx_gguf, src_idx);
+
+        struct ggml_tensor * cur = ggml_new_tensor(ctx_meta, src->type, src->n_dims, src->ne);
+        GGML_ASSERT(cur);
+
+        ggml_set_name(cur, dst_name);
+        gguf_add_tensor(ctx_gguf, cur);
+        gguf_set_tensor_offset(ctx_gguf, n_tensors, src_offset);
+        n_tensors++;
+        n_elements += ggml_nelements(cur);
+        n_bytes += ggml_nbytes(cur);
+    }
 };
 
 //
@@ -2666,6 +2686,11 @@ static void llm_load_tensors(
 
     model.n_gpu_layers = n_gpu_layers;
 
+    // MPT output is tied to (same as) wte in original model
+    if (model.arch == LLM_ARCH_MPT) {
+        ml.clone_tensor("token_embd.weight", "output.weight");
+    }
+
     size_t ctx_size;
     size_t mmapped_size;
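After clone_tensor runs, the GGUF context holds two tensor entries that share one file offset, so output.weight costs nothing extra on disk and, when mmap is used, should resolve to the same mapped weights as token_embd.weight. A hypothetical sanity check that could sit right after the ml.clone_tensor call above:

// hypothetical check, not part of the commit
const int src = gguf_find_tensor(ml.ctx_gguf, "token_embd.weight");
const int dst = gguf_find_tensor(ml.ctx_gguf, "output.weight");
GGML_ASSERT(src >= 0 && dst >= 0);
// both entries point at the same data in the file
GGML_ASSERT(gguf_get_tensor_offset(ml.ctx_gguf, src) == gguf_get_tensor_offset(ml.ctx_gguf, dst));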
