
New simplified llama.h API, and GPU offloading for control vectors
vgel committed Mar 12, 2024
1 parent c82301c commit 2d69bf8
Showing 4 changed files with 282 additions and 250 deletions.
195 changes: 171 additions & 24 deletions common/common.cpp
@@ -1393,32 +1393,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (layer_start == 0) layer_start = 1;
         if (layer_end == 0) layer_end = 31;

-        struct llama_control_vector * vector = nullptr;
-
-        for (const auto& t : params.control_vectors) {
-            std::string path;
-            float strength;
-            std::tie(path, strength) = t;
-
-            fprintf(stderr, "%s: loading control vector from %s\n", __func__, path.c_str());
-            struct llama_control_vector * temp = llama_control_vector_load(path.c_str());
-            if (temp == nullptr) {
-                fprintf(stderr, "%s: error: failed to load control vector from %s\n", __func__, path.c_str());
-                llama_free(lctx);
-                llama_free_model(model);
-                return std::make_tuple(nullptr, nullptr);
-            }
-            llama_control_vector_scale(temp, strength);
-
-            if (vector == nullptr) {
-                vector = temp;
-            } else {
-                llama_control_vector_add(vector, temp);
-                llama_control_vector_free(temp);
-            }
+        std::vector<float> control_vector;
+        int n_embd;
+        std::tie(control_vector, n_embd) = llama_control_vector_load(params.control_vectors);
+        if (n_embd == -1) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }

-        llama_apply_control_vector(lctx, vector, layer_start, layer_end);
+        int err = llama_control_vector_apply(lctx,
+                                             control_vector.data(),
+                                             control_vector.size(),
+                                             n_embd,
+                                             layer_start,
+                                             layer_end);
+        if (err) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
     }

     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
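The llama.h half of this change is not rendered on this page (only two of the four changed files are shown), but the shape of the new, simplified API can be inferred from the call above. A sketch of the entry point as seen from this call site; the authoritative declaration is in the llama.h diff:

// Sketch inferred from the call site above, not copied from llama.h.
// Applies a control vector to layers [il_start, il_end] of the context.
// data holds n_embd floats per layer, concatenated starting at layer 1;
// an all-zero row (used by the loader for missing layers) leaves that
// layer unchanged. Returns 0 on success.
int32_t llama_control_vector_apply(
        struct llama_context * lctx,
        const float * data,
        size_t len,
        int32_t n_embd,
        int32_t il_start,
        int32_t il_end);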
@@ -1937,3 +1931,156 @@ void llama_embd_normalize(const float * inp, float * out, int n) {
    }
}

//
// Control vector utils
//

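// Load a single control vector GGUF and scale each direction tensor by `strength`.
// Two passes over the file: a metadata-only pass (no_alloc) to validate tensor
// types and shapes and to size the allocation, then a full pass that loads the data.
// Returns {flattened vector data, n_embd}, or {empty, -1} on error.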
static std::tuple<std::vector<float>, int> llama_control_vector_load_one(const std::string & path, float strength) {
    int n_tensors;
    size_t n_bytes = 0;
    uint32_t max_direction_layer = 0;
    int n_embd = -1;

    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
    {
        struct ggml_init_params meta_params = {
            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
            /* .mem_buffer = */ nullptr,
            /* .no_alloc   = */ true,
        };
        ggml_context * meta_ctx = ggml_init(meta_params);
        struct gguf_init_params meta_gguf_params = {
            /* .no_alloc = */ true,
            /* .ctx      = */ &meta_ctx,
        };
        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(path.c_str(), meta_gguf_params);
        if (!meta_ctx_gguf) {
            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, path.c_str());
            ggml_free(meta_ctx);
            return std::make_tuple(std::vector<float>(), -1);
        }

        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
        for (int i = 0; i < n_tensors; i++) {
            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);

            // split on '.'
            size_t dotpos = name.find('.');
            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
                try {
                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
                    if (layer == 0) {
                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str());
                        ggml_free(meta_ctx);
                        gguf_free(meta_ctx_gguf);
                        return std::make_tuple(std::vector<float>(), -1);
                    }
                    if (layer > max_direction_layer) {
                        max_direction_layer = layer;
                    }
                } catch (...) {
                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str());
                    ggml_free(meta_ctx);
                    gguf_free(meta_ctx_gguf);
                    return std::make_tuple(std::vector<float>(), -1);
                }
            }

            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str());
                ggml_free(meta_ctx);
                gguf_free(meta_ctx_gguf);
                return std::make_tuple(std::vector<float>(), -1);
            }
            if (n_embd == -1) {
                n_embd = ggml_nelements(tensor_meta);
            } else if (ggml_nelements(tensor_meta) != n_embd) {
                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, path.c_str());
                ggml_free(meta_ctx);
                gguf_free(meta_ctx_gguf);
                return std::make_tuple(std::vector<float>(), -1);
            }
            n_bytes += ggml_nbytes(tensor_meta);
        }
        ggml_free(meta_ctx);
        gguf_free(meta_ctx_gguf);
    }

    if (n_tensors == 0) {
        fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, path.c_str());
        return std::make_tuple(std::vector<float>(), -1);
    }

    // load and scale tensors into final control vector context
    struct ggml_init_params ggml_params = {
        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(ggml_params);

    struct gguf_init_params params = {
        /* .no_alloc = */ false,
        /* .ctx      = */ &ctx,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(path.c_str(), params);
    if (!ctx_gguf) {
        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, path.c_str());
        ggml_free(ctx);
        return std::make_tuple(std::vector<float>(), -1);
    }

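    // flatten into a single buffer: one n_embd-sized row per layer, starting
    // at "direction.1"; layers with no direction tensor get a zero-filled row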
    std::vector<float> vector;
    for (uint32_t i = 1; i <= max_direction_layer; i++) {
        std::string name = "direction." + std::to_string(i);
        ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
        if (tensor) {
            const float * data = (const float *) tensor->data;
            for (int j = 0; j < n_embd; j++) {
                vector.push_back(data[j] * strength);
            }
        } else {
            vector.insert(vector.end(), n_embd, 0.0f); // as a filler
        }
    }

    // the tensor data has been copied out, so both contexts can be released
    gguf_free(ctx_gguf);
    ggml_free(ctx);

    return std::make_tuple(vector, n_embd);
}

std::tuple<std::vector<float>, int> llama_control_vector_load(const std::vector<std::tuple<std::string, float>> & vectors) {
    std::vector<float> vector;
    int n_embd = -1;

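    // sum the vectors element-wise; each file's directions were already scaled
    // by its strength in llama_control_vector_load_one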
    for (const auto& pair : vectors) {
        std::string path;
        float strength;
        std::tie(path, strength) = pair;

        std::vector<float> v;
        int v_n_embd;
        std::tie(v, v_n_embd) = llama_control_vector_load_one(path, strength);

        if (v_n_embd == -1) {
            return std::make_tuple(std::vector<float>(), -1);
        }
        if (n_embd != -1 && (n_embd != v_n_embd || v.size() != vector.size())) {
            fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, path.c_str());
            return std::make_tuple(std::vector<float>(), -1);
        }

        if (n_embd == -1) {
            vector = std::move(v);
            n_embd = v_n_embd;
        } else {
            for (size_t i = 0; i < vector.size(); i++) {
                vector[i] += v[i];
            }
        }
    }

    if (n_embd == -1) {
        fprintf(stderr, "%s: no vectors passed\n", __func__);
    }
    return std::make_tuple(vector, n_embd);
}
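For reference, the file format consumed by the loader above is a GGUF whose tensors are 1-D f32 vectors of length n_embd, named "direction.<layer>" with layer indices starting at 1. A minimal sketch of producing such a file with ggml's gguf writer; the helper name and the zero-valued directions are illustrative, not part of this commit (real directions come from a control-vector training tool):

#include <algorithm>
#include <string>

#include "ggml.h"

// Illustrative helper: emits an all-zero control vector in the layout
// parsed by llama_control_vector_load_one.
static void write_control_vector_gguf(const char * fname, int n_embd, int n_layers) {
    struct ggml_init_params ip = {
        /* .mem_size   = */ ggml_tensor_overhead() * (size_t) n_layers
                            + (size_t) n_layers * n_embd * sizeof(float),
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx  = ggml_init(ip);
    struct gguf_context * gctx = gguf_init_empty();

    for (int il = 1; il <= n_layers; il++) { // the loader rejects "direction.0"
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
        ggml_set_name(t, ("direction." + std::to_string(il)).c_str());
        std::fill_n((float *) t->data, n_embd, 0.0f);
        gguf_add_tensor(gctx, t);
    }

    gguf_write_to_file(gctx, fname, /* only_meta = */ false);
    gguf_free(gctx);
    ggml_free(ctx);
}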
9 changes: 9 additions & 0 deletions common/common.h
@@ -270,3 +270,12 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40

void llama_embd_normalize(const float * inp, float * out, int n);

//
// Control vector utils
//

// Load control vectors from a list of {path, strength} pairs, scale each by its strength, and sum them.
// Returns a tuple of {concatenated vector data (n_embd x n_layer), n_embd}.
// On error, returns a tuple of {empty, -1}.
std::tuple<std::vector<float>, int> llama_control_vector_load(
        const std::vector<std::tuple<std::string, float>> & vectors);
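A short usage sketch (file names are hypothetical; a negative strength steers away from a direction rather than toward it):

#include <string>
#include <tuple>
#include <vector>

#include "common.h"

void load_example() {
    std::vector<std::tuple<std::string, float>> files = {
        { "happy.gguf",    1.0f },  // hypothetical control vector files
        { "paranoid.gguf", -0.4f }, // subtracts this direction
    };

    std::vector<float> data;
    int n_embd;
    std::tie(data, n_embd) = llama_control_vector_load(files);
    if (n_embd == -1) {
        return; // a file was missing, malformed, or sized inconsistently
    }
    // data now holds n_embd floats per layer, ready for llama_control_vector_apply
}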