Commit

Some random ctx/eps stuff I pulled out from my fork
eousphoros committed Mar 13, 2023
1 parent 2a20f48 commit e0213e0
Showing 6 changed files with 60 additions and 34 deletions.
6 changes: 3 additions & 3 deletions Makefile
@@ -30,9 +30,9 @@ endif
# Compile flags
#

-CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
-LDFLAGS =
+CFLAGS = -I. -O2 -DNDEBUG -std=c11 -flto -fPIC
+CXXFLAGS = -I. -I./examples -O2 -DNDEBUG -std=c++11 -flto -fPIC
+LDFLAGS = -flto -fPIC

# OS specific
# TODO: support Windows
6 changes: 1 addition & 5 deletions convert-pth-to-ggml.py
@@ -127,10 +127,6 @@ def get_n_parts(dim):
name = k
shape = v.shape

-# skip layers.X.attention.inner_attention.rope.freqs
-if name[-5:] == "freqs":
-    continue

print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)

#data = tf.train.load_variable(dir_model, name).squeeze()
@@ -169,7 +165,7 @@ def get_n_parts(dim):
data.tofile(fout)

# I hope this deallocates the memory ..
-model = None
+del model

fout.close()

26 changes: 5 additions & 21 deletions ggml.c
@@ -2145,6 +2145,7 @@ struct ggml_context {
bool mem_buffer_owned;

int n_objects;
+float_t eps;

struct ggml_object * objects_begin;
struct ggml_object * objects_end;
@@ -2159,26 +2160,6 @@ struct ggml_context_container {
struct ggml_context context;
};

-//
-// compute types
-//
-
-enum ggml_task_type {
-GGML_TASK_INIT = 0,
-GGML_TASK_COMPUTE,
-GGML_TASK_FINALIZE,
-};
-
-struct ggml_compute_params {
-enum ggml_task_type type;
-
-int ith, nth;
-
-// work buffer for all threads
-size_t wsize;
-void * wdata;
-};

//
// ggml state
//
@@ -2422,6 +2403,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
/*.n_objects =*/ 0,
+/*.eps =*/ params.eps,
/*.objects_begin =*/ NULL,
/*.objects_end =*/ NULL,
/*.scratch =*/ { 0, 0, NULL, },
@@ -5335,7 +5317,8 @@ static void ggml_compute_forward_norm_f32(
const size_t nb2 = dst->nb[2];
const size_t nb3 = dst->nb[3];

-const ggml_float eps = 1e-5f; // TODO: make this a parameter
+// if params->eps is zero, use default of 1e-6 otherwise use params->eps
+const float eps = params->eps == 0.0f ? 1e-6f : params->eps;

// TODO: optimize
for (int i03 = 0; i03 < ne03; i03++) {
@@ -9378,6 +9361,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
/*.type =*/ GGML_TASK_INIT,
/*.ith =*/ 0,
/*.nth =*/ node->n_tasks,
+/*.eps =*/ ctx->eps,
/*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
/*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
};
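
As a minimal sketch of how the new eps value is plumbed through ggml (illustrative only, not part of the commit; field names and the fallback logic follow the diff above), a caller sets it once at context creation and the norm kernel picks it up per compute step:

    // eps enters through ggml_init_params and is stored on the context by ggml_init()
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.eps        =*/ 1e-6f,   // 1e-5f for the 7B/13B checkpoints, per the main.cpp comments below
        /*.mem_buffer =*/ NULL,
    };
    struct ggml_context * ctx = ggml_init(ip);   // ctx->eps = ip.eps
    // ggml_graph_compute() then copies ctx->eps into each node's ggml_compute_params,
    // and ggml_compute_forward_norm_f32() reads it with a zero fallback:
    //     const float eps = params->eps == 0.0f ? 1e-6f : params->eps;
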
34 changes: 33 additions & 1 deletion ggml.h
@@ -314,9 +314,36 @@ struct ggml_scratch {
struct ggml_init_params {
// memory pool
size_t mem_size; // bytes

+// eps
+float eps;
+
+// work buffer
void * mem_buffer; // if NULL, memory will be allocated internally
};

+//
+// compute types
+//
+
+enum ggml_task_type {
+GGML_TASK_INIT = 0,
+GGML_TASK_COMPUTE,
+GGML_TASK_FINALIZE,
+};
+
+struct ggml_compute_params {
+enum ggml_task_type type;
+
+int ith, nth;
+
+float eps;
+
+// work buffer for all threads
+size_t wsize;
+void * wdata;
+};

void ggml_time_init(void); // call this once at the beginning of the program
int64_t ggml_time_ms(void);
int64_t ggml_time_us(void);
@@ -477,7 +504,6 @@ struct ggml_tensor * ggml_silu(
struct ggml_tensor * a);

// normalize along rows
-// TODO: eps is hardcoded to 1e-5 for now
struct ggml_tensor * ggml_norm(
struct ggml_context * ctx,
struct ggml_tensor * a);
@@ -585,6 +611,12 @@ struct ggml_tensor * ggml_rope(
int n_past,
int n_dims,
int mode);

+static void ggml_compute_forward_rope(
+const struct ggml_compute_params * params,
+const struct ggml_tensor * src0,
+const struct ggml_tensor * src1,
+struct ggml_tensor * dst);

// padding = 1
// TODO: we don't support extra parameters for now
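
Note that ggml_norm keeps its two-argument signature, so the epsilon is not passed per call; it comes from the context created with the ggml_init_params shown above. A hypothetical caller (tensor name and size are made up for illustration) would still write:

    // the epsilon used inside the norm is the one stored on ctx, not an argument here
    struct ggml_tensor * x      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);
    struct ggml_tensor * x_norm = ggml_norm(ctx, x);
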
20 changes: 17 additions & 3 deletions main.cpp
@@ -62,6 +62,7 @@ struct llama_layer {
struct ggml_tensor * w1;
struct ggml_tensor * w2;
struct ggml_tensor * w3;

};

struct llama_model {
@@ -72,6 +73,9 @@ struct llama_model {
struct ggml_tensor * norm;
struct ggml_tensor * output;

+// rope frequencies
+struct ggml_tensor * rope_freqs;

std::vector<llama_layer> layers;

// key + value memory
@@ -215,7 +219,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

-ctx_size += (5 + 10*n_layer)*256; // object overhead
+ctx_size += (5 + 10*n_layer)*hparams.n_ctx; // object overhead

printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
@@ -224,6 +228,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
{
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
+/*.eps =*/ 1e-6, // change to 1e-5 for 7/13B models
/*.mem_buffer =*/ NULL,
};

@@ -286,6 +291,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
}

+model.rope_freqs = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
+model.tensors["rope.freqs"] = model.rope_freqs;

}

// key + value memory
@@ -543,7 +552,8 @@ bool llama_eval(

const int d_key = n_embd/n_head;

-static size_t buf_size = 512u*1024*1024;
+// allocate memory
+static size_t buf_size = n_ctx*1024*1024;
static void * buf = malloc(buf_size);

if (mem_per_token > 0 && mem_per_token*N > buf_size) {
@@ -561,6 +571,7 @@

struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
+/*.eps =*/ 1e-6, // change to 1e-5 for 7/13B models
/*.mem_buffer =*/ buf,
};

@@ -603,6 +614,7 @@
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
}

+// Apply rotary embeddings
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
struct ggml_tensor * Q =
ggml_permute(ctx0,
@@ -633,6 +645,7 @@
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
);

+// Scoring
// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

@@ -658,6 +671,7 @@
KQV_merged,
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

+// attention.wo(output)
// projection (no bias)
cur = ggml_mul_mat(ctx0,
model.layers[il].wo,
@@ -795,7 +809,7 @@ int main(int argc, char ** argv) {
{
const int64_t t_start_us = ggml_time_us();

-if (!llama_model_load(params.model, model, vocab, 512)) { // TODO: set context from user input ??
+if (!llama_model_load(params.model, model, vocab, 1024)) { // TODO: set context from user input ??
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
return 1;
}
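
For scale, assuming the 1024 now passed to llama_model_load reaches llama_eval as n_ctx, the scratch buffer change works out roughly as follows (illustrative arithmetic, not code from the commit):

    // before: fixed size            512u*1024*1024  -> 512 MiB
    // after:  scales with context   n_ctx*1024*1024 -> 1024*1024*1024 bytes = 1 GiB for n_ctx == 1024
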
2 changes: 1 addition & 1 deletion quantize.cpp
@@ -299,7 +299,7 @@ int main(int argc, char ** argv) {

// needed to initialize f16 tables
{
-struct ggml_init_params params = { 0, NULL };
+struct ggml_init_params params = { 0, 1e-6, NULL };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
