Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LoRA support #820

Merged
merged 15 commits into from
Apr 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions convert-lora-to-ggml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import json
import os
import re
import struct
import sys
from typing import Any, Dict, Sequence, TextIO

import torch

from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType

HF_SUBLAYER_TO_GGML = {
"self_attn.q_proj": "attention.wq",
"self_attn.k_proj": "attention.wk",
"self_attn.v_proj": "attention.wv",
"self_attn.o_proj": "attention.wo",
"mlp.gate_proj": "feed_forward.w1",
"mlp.down_proj": "feed_forward.w2",
"mlp.up_proj": "feed_forward.w3",
"input_layernorm": "attention_norm",
"post_attention_layernorm": "ffn_norm",
# "norm": "norm",
# "embed_tokens": "tok_embeddings",
# "lm_head": "output",
}


def translate_tensor_name(t: str) -> str:
match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
if match:
nn = match.group(1)
sub_layer = match.group(2)
lora_type = match.group(3)

sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
if sub_layer_renamed is None:
print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
sys.exit(1)

output_string = (
f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
)
return output_string
else:
print(f"Error: unrecognized tensor {t}")
sys.exit(1)


def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
fout.write(b"ggla"[::-1]) # magic (ggml lora)
fout.write(struct.pack("i", 1)) # file version
fout.write(struct.pack("ii", params["r"], params["lora_alpha"]))


def write_tensor_header(
self, name: str, shape: Sequence[int], data_type: DataType
) -> None:
sname = name.encode("utf-8")
fout.write(
struct.pack(
"iii",
len(shape),
len(sname),
DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]],
)
)
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
fout.write(sname)
fout.seek((fout.tell() + 31) & -32)


if len(sys.argv) != 2:
print(f"Usage: python {sys.argv[0]} <path>")
print(
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
)
sys.exit(1)

input_json = os.path.join(sys.argv[1], "adapter_config.json")
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

model = torch.load(input_model, map_location="cpu")

with open(input_json, "r") as f:
params = json.load(f)

if params["peft_type"] != "LORA":
print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
sys.exit(1)

if params["fan_in_fan_out"] == True:
print("Error: param fan_in_fan_out is not supported")
sys.exit(1)

if params["bias"] is not None and params["bias"] != "none":
print("Error: param bias is not supported")
sys.exit(1)

# TODO: these seem to be layers that have been trained but without lora.
# doesn't seem widely used but eventually should be supported
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
print("Error: param modules_to_save is not supported")
sys.exit(1)

with open(output_path, "wb") as fout:
fout.truncate()

write_file_header(fout, params)
for k, v in model.items():
if k.endswith("lora_A.weight"):
if v.dtype != torch.float16 and v.dtype != torch.float32:
v = v.float()
v = v.T
else:
v = v.float()

t = v.numpy()
tname = translate_tensor_name(k)
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
write_tensor_header(fout, tname, t.shape, t.dtype)
t.tofile(fout)

print(f"Converted {input_json} and {input_model} to {output_path}")
15 changes: 15 additions & 0 deletions examples/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.model = argv[i];
} else if (arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_adapter = argv[i];
params.use_mmap = false;
} else if (arg == "--lora-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_base = argv[i];
} else if (arg == "-i" || arg == "--interactive") {
params.interactive = true;
} else if (arg == "--embedding") {
Expand Down Expand Up @@ -242,6 +255,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
}
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");
Expand Down
7 changes: 4 additions & 3 deletions examples/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,12 @@ struct gpt_params {

std::string model = "models/lamma-7B/ggml-model.bin"; // model path
std::string prompt = "";
std::string input_prefix = ""; // string to prefix user inputs with


std::string input_prefix = ""; // string to prefix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter

bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
Expand Down
11 changes: 11 additions & 0 deletions examples/main/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,17 @@ int main(int argc, char ** argv) {
}
}

if (!params.lora_adapter.empty()) {
int err = llama_apply_lora_from_file(ctx,
slaren marked this conversation as resolved.
Show resolved Hide resolved
params.lora_adapter.c_str(),
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
return 1;
}
}

// print system information
{
fprintf(stderr, "\n");
Expand Down
11 changes: 11 additions & 0 deletions examples/perplexity/perplexity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,17 @@ int main(int argc, char ** argv) {
}
}

if (!params.lora_adapter.empty()) {
int err = llama_apply_lora_from_file(ctx,
params.lora_adapter.c_str(),
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
return 1;
}
}

// print system information
{
fprintf(stderr, "\n");
Expand Down
Loading