Revert "MPT : clone wte to output at load time"
It seems like upstream isn't interested in this change for the time
being [1], and we are going to break compatibility with Nomic's previous
conversion of MPT because of changes to the BPE tokenizer [2], so let's
remove this change to minimize the diff.

This reverts commit 69c505e.

[1] ggerganov#3626
[2] ggerganov#3252
cebtenzzre committed Dec 1, 2023
1 parent 7e598c1 commit f494be0
Showing 4 changed files with 9 additions and 36 deletions.
5 changes: 5 additions & 0 deletions convert-hf-to-gguf.py
@@ -462,6 +462,11 @@ def write_tensors(self):

             self.gguf_writer.add_tensor(new_name, data)

+            # note: MPT output is tied to (same as) wte in original model;
+            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
+            if new_name == "token_embd.weight":
+                self.gguf_writer.add_tensor("output.weight", data)
+

 class BaichuanModel(Model):
     def set_vocab(self):
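
For context: the restored hunk above duplicates MPT's tied embedding when the GGUF file is written, instead of cloning it at load time. A minimal standalone sketch of the same idea, using the gguf Python package that convert-hf-to-gguf.py builds on (file name, tensor shape, and zero weights are illustrative, not from this commit):

import numpy as np
import gguf

# MPT ties the output projection to wte (the token embedding), so the
# converter writes the same data under two names.
wte = np.zeros((16, 8), dtype=np.float32)  # tiny illustrative shape

writer = gguf.GGUFWriter("mpt-demo.gguf", "mpt")
writer.add_architecture()
writer.add_tensor("token_embd.weight", wte)
writer.add_tensor("output.weight", wte)  # duplicate of the embedding

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
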
9 changes: 2 additions & 7 deletions ggml.c
@@ -18278,11 +18278,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
         // the ggml_tensor structs to the appropriate locations in the binary blob

         // compute the exact size needed for the new ggml_context
-        int n_tensors = ctx->header.n_tensors + params.extra_tensors;
         const size_t mem_size =
             params.no_alloc ?
-            (n_tensors    )*ggml_tensor_overhead() :
-            (n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
+            (ctx->header.n_tensors    )*ggml_tensor_overhead() :
+            (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;

         struct ggml_init_params pdata = {
             .mem_size   = mem_size,
@@ -18591,10 +18590,6 @@ size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].offset;
 }

-void gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset) {
-    ctx->infos[i].offset = offset;
-}
-
 char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].name.data;
 }
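
The restored sizing logic is easier to see in isolation. A plain-Python restatement of the expression, as an aid only (tensor_overhead stands in for whatever ggml_tensor_overhead() returns):

def gguf_ctx_mem_size(n_tensors: int, data_size: int, no_alloc: bool,
                      tensor_overhead: int) -> int:
    # With no_alloc, the ggml_context holds only tensor metadata.
    if no_alloc:
        return n_tensors * tensor_overhead
    # Otherwise one extra tensor wraps the whole data blob, hence the +1.
    return (n_tensors + 1) * tensor_overhead + data_size

The reverted extra_tensors field existed only so llama.cpp could reserve room for the output.weight clone it was about to add; with load-time cloning gone, ctx->header.n_tensors is exact again.
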
2 changes: 0 additions & 2 deletions ggml.h
@@ -2010,7 +2010,6 @@ extern "C" {

         // if not NULL, create a ggml_context and allocate the tensor data in it
         struct ggml_context ** ctx;
-        int extra_tensors;
     };

     GGML_API struct gguf_context * gguf_init_empty(void);
@@ -2054,7 +2053,6 @@ extern "C" {
     GGML_API int    gguf_get_n_tensors    (const struct gguf_context * ctx);
     GGML_API int    gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
     GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
-    GGML_API void   gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset);
     GGML_API char * gguf_get_tensor_name  (const struct gguf_context * ctx, int i);

     // overrides existing values or adds a new one
29 changes: 2 additions & 27 deletions llama.cpp
@@ -1817,9 +1817,8 @@ struct llama_model_loader {

     llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
         struct gguf_init_params params = {
-            /*.no_alloc      = */ true,
-            /*.ctx           = */ &ctx_meta,
-            /*.extra_tensors = */ 1,
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx_meta,
         };

         ctx_gguf = gguf_init_from_file(fname.c_str(), params);
@@ -2129,25 +2128,6 @@ struct llama_model_loader {
             done_size += ggml_nbytes(cur);
         }
     }
-
-    // must be called before calc_sizes
-    void clone_tensor(const char * src_name, const char * dst_name) {
-        int src_idx = gguf_find_tensor(ctx_gguf, src_name);
-        GGML_ASSERT(src_idx >= 0);
-
-        struct ggml_tensor * src = ggml_get_tensor(ctx_meta, src_name);
-        size_t src_offset = gguf_get_tensor_offset(ctx_gguf, src_idx);
-
-        struct ggml_tensor * cur = ggml_new_tensor(ctx_meta, src->type, src->n_dims, src->ne);
-        GGML_ASSERT(cur);
-
-        ggml_set_name(cur, dst_name);
-        gguf_add_tensor(ctx_gguf, cur);
-        gguf_set_tensor_offset(ctx_gguf, n_tensors, src_offset);
-        n_tensors++;
-        n_elements += ggml_nelements(cur);
-        n_bytes += ggml_nbytes(cur);
-    }
 };

 //
@@ -2714,11 +2694,6 @@ static void llm_load_tensors(

     model.n_gpu_layers = n_gpu_layers;

-    // MPT output is tied to (same as) wte in original model
-    if (model.arch == LLM_ARCH_MPT) {
-        ml.clone_tensor("token_embd.weight", "output.weight");
-    }
-
     size_t ctx_size;
     size_t mmapped_size;

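
With clone_tensor removed, llama.cpp no longer synthesizes output.weight for MPT at load time, so a converted file must already contain both tensor names. A quick sanity check, sketched with gguf-py's reader (file path illustrative; assumes a gguf-py version that ships GGUFReader):

from gguf import GGUFReader

reader = GGUFReader("mpt-demo.gguf")
names = {t.name for t in reader.tensors}
# Both names must be in the file itself now that the loader no longer
# clones token_embd.weight into output.weight.
assert {"token_embd.weight", "output.weight"} <= names
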
