Support all LLaMA models + change Q4_0 quantization storage
ggerganov committed Mar 11, 2023
1 parent 5f2f970 commit 007a8f6
Showing 5 changed files with 399 additions and 200 deletions.
44 changes: 38 additions & 6 deletions README.md
@@ -17,12 +17,11 @@ The main goal is to run the model using 4-bit quantization on a MacBook.

This was hacked in an evening - I have no idea if it works correctly.

-So far, I've tested just the 7B model.
-Here is a typical run:
Here is a typical run using LLaMA-7B:

```java
-make -j && ./main -m ../LLaMA-4bit/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
I llama.cpp build info:
I UNAME_S: Darwin
I UNAME_P: arm
I UNAME_M: arm64
@@ -34,7 +33,7 @@ I CXX: Apple clang version 14.0.0 (clang-1400.0.29.202)

make: Nothing to be done for `default'.
main: seed = 1678486056
-llama_model_load: loading model from '../LLaMA-4bit/7B/ggml-model-q4_0.bin' - please wait ...
llama_model_load: loading model from './models/7B/ggml-model-q4_0.bin' - please wait ...
llama_model_load: n_vocab = 32000
llama_model_load: n_ctx = 512
llama_model_load: n_embd = 4096
@@ -110,6 +109,8 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8

## Usage

Here are the steps for the LLaMA-7B model:

```bash
# build this repo
git clone https://github.com/ggerganov/llama.cpp
@@ -133,9 +134,40 @@ python3 convert-pth-to-ggml.py models/7B/ 1
./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
```

For the bigger models, there are a few extra quantization steps. For example, converting LLaMA-13B to FP16 format produces 2 ggml files instead of one:

```bash
ggml-model-f16.bin
ggml-model-f16.bin.1
```

You need to quantize each of them separately like this:

```bash
./quantize ./models/13B/ggml-model-f16.bin ./models/13B/ggml-model-q4_0.bin 2
./quantize ./models/13B/ggml-model-f16.bin.1 ./models/13B/ggml-model-q4_0.bin.1 2
```
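
If you would rather not run one `./quantize` command per part, the following is a minimal convenience sketch (my own illustration, not part of this repo) that loops over the parts, assuming the `ggml-model-f16.bin`, `ggml-model-f16.bin.1`, ... naming convention shown above:

```python
# quantize_all.py - illustrative sketch, not part of this repo.
# Runs ./quantize once per FP16 part found in a model directory,
# following the part naming convention used by the converter.
import os
import subprocess
import sys

model_dir = sys.argv[1]  # e.g. ./models/13B
part = 0
while True:
    suffix = "" if part == 0 else "." + str(part)
    src = os.path.join(model_dir, "ggml-model-f16.bin" + suffix)
    dst = os.path.join(model_dir, "ggml-model-q4_0.bin" + suffix)
    if not os.path.exists(src):
        break
    subprocess.run(["./quantize", src, dst, "2"], check=True)
    part += 1
print("quantized", part, "part(s)")
```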

Everything else is the same. Simply run:

```bash
./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 128
```

The number of files generated for each model is as follows:

```
7B -> 1 file
13B -> 2 files
33B -> 4 files
65B -> 8 files
```

When running the larger models, make sure you have enough disk space to store all the intermediate files.
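
A rough pre-flight check along these lines (again only an illustrative sketch, not part of this repo) reports how much space the converted files for a model already occupy:

```python
# disk_check.py - illustrative sketch, not part of this repo.
# Sums the sizes of all ggml files already produced for one model directory.
import glob
import os
import sys

model_dir = sys.argv[1]  # e.g. ./models/65B
paths = glob.glob(os.path.join(model_dir, "ggml-model-*"))
total = sum(os.path.getsize(p) for p in paths)
print("%d ggml file(s), %.1f GB total" % (len(paths), total / 1024**3))
```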

## Limitations

-- Currently, only LLaMA-7B is supported since I haven't figured out how to merge the tensors of the bigger models. However, in theory, you should be able to run 65B on a 64GB MacBook
- Not sure if my tokenizer is correct. There are a few places where we might have a mistake:
- https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/convert-pth-to-ggml.py#L79-L87
- https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/utils.h#L65-L69
172 changes: 99 additions & 73 deletions convert-pth-to-ggml.py
@@ -33,12 +33,23 @@

# output in the same directory as the model
dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"

fname_hparams = sys.argv[1] + "/params.json"
-fname_model = sys.argv[1] + "/consolidated.00.pth"
fname_tokenizer = sys.argv[1] + "/../tokenizer.model"

def get_n_parts(dim):
    if dim == 4096:
        return 1
    elif dim == 5120:
        return 2
    elif dim == 6656:
        return 4
    elif dim == 8192:
        return 8
    else:
        print("Invalid dim: " + str(dim))
        sys.exit(1)

# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
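
The dimensions checked by `get_n_parts` above map one-to-one onto the released model sizes (7B, 13B, 33B, 65B), matching the part counts listed in the README. As a quick sanity check, assuming the function as defined in this diff:

```python
# sanity check for get_n_parts, mirroring the part counts in the README
assert get_n_parts(4096) == 1  # LLaMA-7B
assert get_n_parts(5120) == 2  # LLaMA-13B
assert get_n_parts(6656) == 4  # LLaMA-33B
assert get_n_parts(8192) == 8  # LLaMA-65B
```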
@@ -61,76 +72,91 @@

hparams.update({"vocab_size": tokenizer.vocab_size()})

n_parts = get_n_parts(hparams["dim"])

print(hparams)
print('n_parts = ', n_parts)

-model = torch.load(fname_model, map_location="cpu")
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["dim"]))
-fout.write(struct.pack("i", hparams["multiple_of"]))
-fout.write(struct.pack("i", hparams["n_heads"]))
-fout.write(struct.pack("i", hparams["n_layers"]))
-fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
-fout.write(struct.pack("i", ftype))
-
-# Is this correct??
-for i in range(32000):
-    # TODO: this is probably wrong - not sure how this tokenizer works
-    text = tokenizer.decode([29889, i]).encode('utf-8')
-    # remove the first byte (it's always '.')
-    text = text[1:]
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for k, v in model.items():
-    name = k
-    shape = v.shape
-
-    # skip layers.X.attention.inner_attention.rope.freqs
-    if name[-5:] == "freqs":
-        continue
-
-    print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
-
-    #data = tf.train.load_variable(dir_model, name).squeeze()
-    data = v.numpy().squeeze()
-    n_dims = len(data.shape);
-
-    # for efficiency - transpose some matrices
-    # "model/h.*/attn/c_attn/w"
-    # "model/h.*/attn/c_proj/w"
-    # "model/h.*/mlp/c_fc/w"
-    # "model/h.*/mlp/c_proj/w"
-    #if name[-14:] == "/attn/c_attn/w" or \
-    #   name[-14:] == "/attn/c_proj/w" or \
-    #   name[-11:] == "/mlp/c_fc/w" or \
-    #   name[-13:] == "/mlp/c_proj/w":
-    #    print(" Transposing")
-    #    data = data.transpose()
-
-    dshape = data.shape
-
-    # default type is fp16
-    ftype_cur = 1
-    if ftype == 0 or n_dims == 1:
-        print(" Converting to float32")
-        data = data.astype(np.float32)
-        ftype_cur = 0
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
-    fout.write(str);
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")
for p in range(n_parts):
    print('Processing part ', p)

    #fname_model = sys.argv[1] + "/consolidated.00.pth"
    fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth"
    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
    if (p > 0):
        fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p)

    model = torch.load(fname_model, map_location="cpu")

    fout = open(fname_out, "wb")

    fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
    fout.write(struct.pack("i", hparams["vocab_size"]))
    fout.write(struct.pack("i", hparams["dim"]))
    fout.write(struct.pack("i", hparams["multiple_of"]))
    fout.write(struct.pack("i", hparams["n_heads"]))
    fout.write(struct.pack("i", hparams["n_layers"]))
    fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
    fout.write(struct.pack("i", ftype))

    # Is this correct??
    for i in range(32000):
        # TODO: this is probably wrong - not sure how this tokenizer works
        text = tokenizer.decode([29889, i]).encode('utf-8')
        # remove the first byte (it's always '.')
        text = text[1:]
        fout.write(struct.pack("i", len(text)))
        fout.write(text)

    for k, v in model.items():
        name = k
        shape = v.shape

        # skip layers.X.attention.inner_attention.rope.freqs
        if name[-5:] == "freqs":
            continue

        print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)

        #data = tf.train.load_variable(dir_model, name).squeeze()
        data = v.numpy().squeeze()
        n_dims = len(data.shape);

        # for efficiency - transpose some matrices
        # "model/h.*/attn/c_attn/w"
        # "model/h.*/attn/c_proj/w"
        # "model/h.*/mlp/c_fc/w"
        # "model/h.*/mlp/c_proj/w"
        #if name[-14:] == "/attn/c_attn/w" or \
        #   name[-14:] == "/attn/c_proj/w" or \
        #   name[-11:] == "/mlp/c_fc/w" or \
        #   name[-13:] == "/mlp/c_proj/w":
        #    print(" Transposing")
        #    data = data.transpose()

        dshape = data.shape

        # default type is fp16
        ftype_cur = 1
        if ftype == 0 or n_dims == 1:
            print(" Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

        # header
        sname = name.encode('utf-8')
        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
        for i in range(n_dims):
            fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
        fout.write(sname);

        # data
        data.tofile(fout)

    # I hope this deallocates the memory ..
    model = None

    fout.close()

    print("Done. Output file: " + fname_out + ", (part ", p, ")")
    print("")