Support all LLaMA models + change Q4_0 quantization storage
ggerganov committed Mar 11, 2023
1 parent 5f2f970 commit 007a8f6
Showing 5 changed files with 399 additions and 200 deletions.
44 changes: 38 additions & 6 deletions README.md
@@ -17,12 +17,11 @@ The main goal is to run the model using 4-bit quantization on a MacBook.

This was hacked in an evening - I have no idea if it works correctly.

-So far, I've tested just the 7B model.
-Here is a typical run:
Here is a typical run using LLaMA-7B:

```java
-make -j && ./main -m ../LLaMA-4bit/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
I llama.cpp build info:
I UNAME_S: Darwin
I UNAME_P: arm
I UNAME_M: arm64
@@ -34,7 +33,7 @@ I CXX: Apple clang version 14.0.0 (clang-1400.0.29.202)

make: Nothing to be done for `default'.
main: seed = 1678486056
-llama_model_load: loading model from '../LLaMA-4bit/7B/ggml-model-q4_0.bin' - please wait ...
llama_model_load: loading model from './models/7B/ggml-model-q4_0.bin' - please wait ...
llama_model_load: n_vocab = 32000
llama_model_load: n_ctx = 512
llama_model_load: n_embd = 4096
@@ -110,6 +109,8 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8

## Usage

Here are the steps for the LLaMA-7B model:

```bash
# build this repo
git clone https://github.com/ggerganov/llama.cpp
@@ -133,9 +134,40 @@ python3 convert-pth-to-ggml.py models/7B/ 1
./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
```

For the bigger models, there are a few extra quantization steps. For example, converting LLaMA-13B to FP16 format produces 2 ggml files instead of one:

```bash
ggml-model-f16.bin
ggml-model-f16.bin.1
```

You need to quantize each of them separately like this:

```bash
./quantize ./models/13B/ggml-model-f16.bin ./models/13B/ggml-model-q4_0.bin 2
./quantize ./models/13B/ggml-model-f16.bin.1 ./models/13B/ggml-model-q4_0.bin.1 2
```
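
If you would rather not run one `./quantize` command per part, the following is a minimal convenience sketch (my own illustration, not part of this repo) that loops over the parts, assuming the `ggml-model-f16.bin`, `ggml-model-f16.bin.1`, ... naming convention shown above:

```python
# quantize_all.py - illustrative sketch, not part of this repo.
# Runs ./quantize once per FP16 part found in a model directory,
# following the part naming convention used by the converter.
import os
import subprocess
import sys

model_dir = sys.argv[1]  # e.g. ./models/13B
part = 0
while True:
    suffix = "" if part == 0 else "." + str(part)
    src = os.path.join(model_dir, "ggml-model-f16.bin" + suffix)
    dst = os.path.join(model_dir, "ggml-model-q4_0.bin" + suffix)
    if not os.path.exists(src):
        break
    subprocess.run(["./quantize", src, dst, "2"], check=True)
    part += 1
print("quantized", part, "part(s)")
```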

Everything else is the same. Simply run:

```bash
./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 128
```

The number of files generated for each model is as follows:

```
7B -> 1 file
13B -> 2 files
33B -> 4 files
65B -> 8 files
```

When running the larger models, make sure you have enough disk space to store all the intermediate files.
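
A rough pre-flight check along these lines (again only an illustrative sketch, not part of this repo) reports how much space the converted files for a model already occupy:

```python
# disk_check.py - illustrative sketch, not part of this repo.
# Sums the sizes of all ggml files already produced for one model directory.
import glob
import os
import sys

model_dir = sys.argv[1]  # e.g. ./models/65B
paths = glob.glob(os.path.join(model_dir, "ggml-model-*"))
total = sum(os.path.getsize(p) for p in paths)
print("%d ggml file(s), %.1f GB total" % (len(paths), total / 1024**3))
```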

## Limitations

-- Currently, only LLaMA-7B is supported since I haven't figured out how to merge the tensors of the bigger models. However, in theory, you should be able to run 65B on a 64GB MacBook
- Not sure if my tokenizer is correct. There are a few places where we might have a mistake:
- https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/convert-pth-to-ggml.py#L79-L87
- https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/utils.h#L65-L69
172 changes: 99 additions & 73 deletions convert-pth-to-ggml.py
@@ -33,12 +33,23 @@

# output in the same directory as the model
dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"

fname_hparams = sys.argv[1] + "/params.json"
-fname_model = sys.argv[1] + "/consolidated.00.pth"
fname_tokenizer = sys.argv[1] + "/../tokenizer.model"

def get_n_parts(dim):
    if dim == 4096:
        return 1
    elif dim == 5120:
        return 2
    elif dim == 6656:
        return 4
    elif dim == 8192:
        return 8
    else:
        print("Invalid dim: " + str(dim))
        sys.exit(1)

# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
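
The dimensions checked by `get_n_parts` above map one-to-one onto the released model sizes (7B, 13B, 33B, 65B), matching the part counts listed in the README. As a quick sanity check, assuming the function as defined in this diff:

```python
# sanity check for get_n_parts, mirroring the part counts in the README
assert get_n_parts(4096) == 1  # LLaMA-7B
assert get_n_parts(5120) == 2  # LLaMA-13B
assert get_n_parts(6656) == 4  # LLaMA-33B
assert get_n_parts(8192) == 8  # LLaMA-65B
```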
@@ -61,76 +72,91 @@

hparams.update({"vocab_size": tokenizer.vocab_size()})

n_parts = get_n_parts(hparams["dim"])

print(hparams)
print('n_parts = ', n_parts)

-model = torch.load(fname_model, map_location="cpu")
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["dim"]))
-fout.write(struct.pack("i", hparams["multiple_of"]))
-fout.write(struct.pack("i", hparams["n_heads"]))
-fout.write(struct.pack("i", hparams["n_layers"]))
-fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
-fout.write(struct.pack("i", ftype))
-
-# Is this correct??
-for i in range(32000):
-    # TODO: this is probably wrong - not sure how this tokenizer works
-    text = tokenizer.decode([29889, i]).encode('utf-8')
-    # remove the first byte (it's always '.')
-    text = text[1:]
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for k, v in model.items():
-    name = k
-    shape = v.shape
-
-    # skip layers.X.attention.inner_attention.rope.freqs
-    if name[-5:] == "freqs":
-        continue
-
-    print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
-
-    #data = tf.train.load_variable(dir_model, name).squeeze()
-    data = v.numpy().squeeze()
-    n_dims = len(data.shape);
-
-    # for efficiency - transpose some matrices
-    # "model/h.*/attn/c_attn/w"
-    # "model/h.*/attn/c_proj/w"
-    # "model/h.*/mlp/c_fc/w"
-    # "model/h.*/mlp/c_proj/w"
-    #if name[-14:] == "/attn/c_attn/w" or \
-    #   name[-14:] == "/attn/c_proj/w" or \
-    #   name[-11:] == "/mlp/c_fc/w" or \
-    #   name[-13:] == "/mlp/c_proj/w":
-    #    print(" Transposing")
-    #    data = data.transpose()
-
-    dshape = data.shape
-
-    # default type is fp16
-    ftype_cur = 1
-    if ftype == 0 or n_dims == 1:
-        print(" Converting to float32")
-        data = data.astype(np.float32)
-        ftype_cur = 0
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
-    fout.write(str);
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")
for p in range(n_parts):
    print('Processing part ', p)

    #fname_model = sys.argv[1] + "/consolidated.00.pth"
    fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth"
    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
    if (p > 0):
        fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p)

    model = torch.load(fname_model, map_location="cpu")

    fout = open(fname_out, "wb")

    fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
    fout.write(struct.pack("i", hparams["vocab_size"]))
    fout.write(struct.pack("i", hparams["dim"]))
    fout.write(struct.pack("i", hparams["multiple_of"]))
    fout.write(struct.pack("i", hparams["n_heads"]))
    fout.write(struct.pack("i", hparams["n_layers"]))
    fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
    fout.write(struct.pack("i", ftype))

    # Is this correct??
    for i in range(32000):
        # TODO: this is probably wrong - not sure how this tokenizer works
        text = tokenizer.decode([29889, i]).encode('utf-8')
        # remove the first byte (it's always '.')
        text = text[1:]
        fout.write(struct.pack("i", len(text)))
        fout.write(text)

    for k, v in model.items():
        name = k
        shape = v.shape

        # skip layers.X.attention.inner_attention.rope.freqs
        if name[-5:] == "freqs":
            continue

        print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)

        #data = tf.train.load_variable(dir_model, name).squeeze()
        data = v.numpy().squeeze()
        n_dims = len(data.shape);

        # for efficiency - transpose some matrices
        # "model/h.*/attn/c_attn/w"
        # "model/h.*/attn/c_proj/w"
        # "model/h.*/mlp/c_fc/w"
        # "model/h.*/mlp/c_proj/w"
        #if name[-14:] == "/attn/c_attn/w" or \
        #   name[-14:] == "/attn/c_proj/w" or \
        #   name[-11:] == "/mlp/c_fc/w" or \
        #   name[-13:] == "/mlp/c_proj/w":
        #    print(" Transposing")
        #    data = data.transpose()

        dshape = data.shape

        # default type is fp16
        ftype_cur = 1
        if ftype == 0 or n_dims == 1:
            print(" Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

        # header
        sname = name.encode('utf-8')
        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
        for i in range(n_dims):
            fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
        fout.write(sname);

        # data
        data.tofile(fout)

    # I hope this deallocates the memory ..
    model = None

    fout.close()

    print("Done. Output file: " + fname_out + ", (part ", p, ")")
    print("")