This repository was archived by the owner on Aug 30, 2024. It is now read-only.

[GPTQ Enhance] Support GPTQ for Baichuan2-13B & Falcon 7B & Phi-1.5 #169

Merged · 14 commits · Mar 15, 2024
5 changes: 4 additions & 1 deletion docs/gptq_and_awq.md
@@ -12,8 +12,11 @@ Validated GPTQ & AWQ models directly from HuggingFace:
* [Mixtral-8x7B-Instruct-v0.1-GPTQ](https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ) & [Mixtral-8x7B-Instruct-v0.1-AWQ](https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ)
* [Qwen-7B-Chat-GPTQ](https://huggingface.co/TheBloke/Qwen-7B-Chat-GPTQ) & [Qwen-7B-Chat-AWQ](https://huggingface.co/TheBloke/Qwen-7B-Chat-AWQ) & [Qwen1.5-7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4)
* [SOLAR-10.7B-v1.0-GPTQ](https://huggingface.co/TheBloke/SOLAR-10.7B-v1.0-GPTQ)
* [Baichuan2-13B-Chat-GPTQ](https://hf-mirror.com/TheBloke/Baichuan2-13B-Chat-GPTQ)
* [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b/tree/main)
* [onlinex/phi-1_5-gptq-4bit](https://hf-mirror.com/onlinex/phi-1_5-gptq-4bit)

-Please check more validated GPTQ & AWQ models in the list of [supported_models](./supported_models.md).
+For more details, please check the list of [supported_models](./supported_models.md).

## Examples

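A minimal loading sketch for these checkpoints (assumptions: the `intel_extension_for_transformers` front end exposes `AutoModelForCausalLM` with a `use_gptq` flag, as used elsewhere in these docs; the model id is one of the validated repos above):

```python
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM

model_name = "TheBloke/Baichuan2-13B-Chat-GPTQ"
prompt = "Once upon a time, there existed a little girl,"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
# use_gptq=True loads the GPTQ checkpoint directly, without re-quantizing it
model = AutoModelForCausalLM.from_pretrained(model_name, use_gptq=True, trust_remote_code=True)
outputs = model.generate(inputs, max_new_tokens=30)
```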
12 changes: 6 additions & 6 deletions docs/supported_models.md
@@ -235,13 +235,13 @@ Neural Speed supports the following models:
<td><a href="https://huggingface.co/baichuan-inc/Baichuan-13B-Chat" target="_blank" rel="noopener noreferrer">Baichuan-13B-Chat</a>,
<a href="https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat" target="_blank" rel="noopener noreferrer">Baichuan2-13B-Chat</a></td>
<td>✅</td>
-<td> </td>
-<td> </td>
-<td> </td>
<td>✅</td>
-<td> </td>
-<td> </td>
-<td> </td>
+<td>✅</td>
+<td>✅</td>
+<td>✅</td>
+<td>✅</td>
+<td>✅</td>
+<td>✅</td>
<td>4.33.1</td>
<td>4096</td>
</tr>
10 changes: 9 additions & 1 deletion neural_speed/__init__.py
@@ -24,7 +24,6 @@


class Model:

def __init__(self):
self.module = None
self.model = None
@@ -83,6 +82,15 @@ def get_model_type(model_config):
model_type = model_maps.get(model_config.model_type, model_config.model_type)
if model_type == "chatglm" and "chatglm2" in model_config._name_or_path:
model_type = "chatglm2"

# for TheBloke/falcon-40b-instruct-GPTQ & TheBloke/Falcon-7B-Instruct-GPTQ
if model_type == "RefinedWebModel" or model_type == "RefinedWeb":
model_type = "falcon"

# for TheBloke/phi-2-GPTQ
if model_type == "phi-msft":
model_type = "phi"

return model_type

def init(self,
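For reference, a quick sketch of the alias handling added above (assumes `get_model_type` is callable through the `Model` class as shown; `tiiuae/falcon-7b` ships a `config.json` whose `model_type` is one of the `RefinedWeb*` aliases, historically `RefinedWebModel`):

```python
from transformers import AutoConfig
from neural_speed import Model

# "RefinedWebModel" (Falcon-7B) and "RefinedWeb" (Falcon-40B) both resolve
# to the existing "falcon" path; "phi-msft" likewise resolves to "phi".
config = AutoConfig.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
print(Model.get_model_type(config))  # -> "falcon"
```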
11 changes: 10 additions & 1 deletion neural_speed/convert/__init__.py
@@ -18,7 +18,15 @@
from pathlib import Path
import subprocess

model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder", "whisper": "whisper", "qwen2": "qwen"}
model_maps = {
"gpt_neox": "gptneox",
"gpt_bigcode": "starcoder",
"whisper": "whisper",
"qwen2": "qwen",
"RefinedWebModel": "falcon",
"RefinedWeb": "falcon",
"phi-msft": "phi"
}


def convert_model(model, outfile, outtype="f32", model_hub="huggingface", use_quantized_model=False):
@@ -28,6 +36,7 @@ def convert_model(model, outfile, outtype="f32", model_hub="huggingface", use_qu
else:
from transformers import AutoConfig
config = AutoConfig.from_pretrained(model, trust_remote_code=True)

model_type = model_maps.get(config.model_type, config.model_type)

if use_quantized_model:
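A usage sketch for the quantized path (the output filename is illustrative; with `use_quantized_model=True`, `convert_model` folds `config.model_type` through `model_maps` and presumably routes to the matching `convert_quantized_*` script, such as the Baichuan one added below):

```python
from neural_speed.convert import convert_model

# Convert a GPTQ checkpoint directly into the NE format without re-quantizing.
convert_model("TheBloke/Baichuan2-13B-Chat-GPTQ",
              "baichuan2-13b-chat-gptq.bin",
              outtype="f32",
              use_quantized_model=True)
```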
83 changes: 83 additions & 0 deletions neural_speed/convert/common.py
@@ -516,3 +516,86 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head,
compute_dtype="int8")
dst.flatten()[:byte_size].tofile(fout)
print(f"converting {dst_name} qauntized tensor to bestla q4 block")


def convert_to_qx_bestla_tensor(src_name, dst_name, model, fout, q_config):
# unpack weight and repack into 3bits / 4bits BestLA format
import neural_speed.llama_cpp as cpp_model
if ".weight" in src_name:
src_name = src_name.replace(".weight", "")
qzeros = model[f"{src_name}.qzeros"]
zeros = qzeros_to_zeros(qzeros)
scales = model[f"{src_name}.scales"]
qweight = model[f"{src_name}.qweight"]

int_weight, gptq_scales, gptq_zeros = unpack_weight(qweight, scales, qzeros, q_config)
int_weight = int_weight.view(-1, int_weight.shape[-1])

# shuffle weight in GPTQ when act order is on
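    # With act-order, GPTQ quantizes columns following g_idx rather than
    # sequentially, so unpacked row i belongs to quantization group g_idx[i].
    # The loop below scatters each row back into a group-contiguous layout
    # (group g occupies rows g*group_size .. g*group_size + group_size - 1)
    # so BesTLA can repack per group.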
if 'desc_act' in q_config and q_config['desc_act']:
g_idx = model[f"{src_name}.g_idx"]
int_weight2 = int_weight.clone()
group_size = q_config['group_size']
group_dict = {}
for i in range(len(g_idx)):
group_idx = g_idx[i].item()
if group_idx not in group_dict:
target_idx = group_idx * group_size
group_dict[group_idx] = 0
else:
group_dict[group_idx] = group_dict[group_idx] + 1
target_idx = group_idx * group_size + group_dict[group_idx]
int_weight2[target_idx] = int_weight[i]
int_weight = int_weight2

    shape = int_weight.shape[::-1]
    n_dims = len(shape)
    name_bytes = dst_name.encode('utf-8')
    fout.write(struct.pack("iii", n_dims, len(name_bytes), GGML_QJBLAS_TYPE))
    for i in range(n_dims):
        fout.write(struct.pack("i", shape[n_dims - 1 - i]))
    fout.write(name_bytes)

    # INC stores signed int4 values as u4 (range 0..15) by adding an offset of 8;
    # BesTLA expects s4_clip, i.e. (-8..7) * 16, so we subtract the offset, multiply
    # by 16, and divide the scales by 16 to compensate. Int3 works the same way,
    # with offset 4 and multiplier 32.
weight_dtype = "int8"
if q_config['bits'] == 4:
int_weight = (int_weight - 8) * 16
gptq_scales = gptq_scales / 16
gptq_zeros = (gptq_zeros - 8) * 16
weight_dtype = "int4"
elif q_config['bits'] == 3:
int_weight = (int_weight - 4) * 32
gptq_scales = gptq_scales / 32
gptq_zeros = (gptq_zeros - 4) * 32
weight_dtype = "int3"
    else:
        raise ValueError(f"Unsupported q_config['bits']: {q_config['bits']}")

dst = np.zeros((int_weight.shape[0], int_weight.shape[1] * 4), dtype=np.int8)
int_weight = np.ascontiguousarray(int_weight.numpy())
gptq_scales = np.ascontiguousarray((gptq_scales.float()).numpy())
if q_config['sym']:
gptq_zeros = np.empty(0, dtype=np.int8)
else:
gptq_zeros = np.ascontiguousarray(gptq_zeros.numpy())
if 'desc_act' in q_config and q_config['desc_act']:
g_idx = np.ascontiguousarray(g_idx.numpy())
else:
g_idx = np.empty(0, dtype=np.int32)

# repack int weight in BesTLA format
byte_size = cpp_model.Model.np_bestla_qpack(int_weight,
gptq_scales,
gptq_zeros,
g_idx,
dst,
weight_dtype=weight_dtype,
group_size=q_config['group_size'],
alg="sym" if q_config['sym'] else "asym",
compute_dtype="int8")
dst.flatten()[:byte_size].tofile(fout)
print(f"convert_to_qx_bestla_tensor: {src_name:>40} -> {dst_name:<40} shape: {shape}, byte_size: {byte_size:<10}")
196 changes: 196 additions & 0 deletions neural_speed/convert/convert_quantized_baichuan.py
@@ -0,0 +1,196 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from common import *
from transformers import AutoTokenizer


def load_vocab_for_baichuan(path: Path) -> SentencePieceVocab:
# Be extra-friendly and accept either a file or a directory. Also, if it's
# a directory, it might be the model directory, and tokenizer.model might
# be in the parent of that.
if path.is_dir():
path2 = path / "tokenizer.model"
# Use `.parent` instead of /.. to handle the symlink case better.
path3 = path.parent / "tokenizer.model"
if path2.exists():
path = path2
elif path3.exists():
path = path3
else:
raise FileNotFoundError(
f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, \
pass the directory as --vocab-dir")
added_tokens_path = path.parent / "added_tokens.json"
print(f"Loading vocab file {path}")
return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)


def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub",
choices=["huggingface", "modelscope"],
default="huggingface",
help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

out_path = args.outfile.as_posix()
model_path = args.model.as_posix()

model, hparams, quantize_config = load_quantized_safetensors(model_path)
list_vars = model

print(hparams)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
fout = open(out_path, "wb")

# possible data types
# ftype == 0 -> float32, ftype == 1 -> float16
ftype = 0
if args.outtype == "f16":
ftype = 1

    # 1. write hparams
    ne_file_magic = 0x67676d66
    fout.write(struct.pack("i", ne_file_magic))  # magic: 'ggmf' in hex
    fout.write(struct.pack("i", 1))  # file version

fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["hidden_size"]))
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", hparams["num_attention_heads"]))
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", hparams["num_hidden_layers"]))
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", ftype))
fout.write(struct.pack("i", hparams["model_max_length"]))
fout.write(struct.pack("f", 0))
fout.write(struct.pack("f", 0))
fout.write(struct.pack("i", 0))

fout.write(struct.pack("i", 0)) # word_embed_proj_dim (for opt)
fout.write(struct.pack("i", 0)) # do_layer_norm_before (for opt)

fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", hparams["intermediate_size"]))
fout.write(struct.pack("i", 0)) # n_experts
fout.write(struct.pack("i", 0)) # n_expert_used
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0))

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1))

# 2. vocab
tokenizer_path = Path(tokenizer.vocab_file).parent
vocab = load_vocab_for_baichuan(Path(tokenizer_path))
counter = 0
for text, score in vocab.all_tokens():
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", score))
counter += 1

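    # pad out to vocab_size with zero-score entries (repeating the last token's
    # text) so the entry count always matches the vocab_size written in the header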
while counter < hparams["vocab_size"]:
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", 0))
counter += 1

    def convert_qwen_to_fp32_tensor(src_name, dst_name, model, fout):
        # GPTQ checkpoints store these unquantized tensors as torch.bfloat16 mostly.
if model[src_name].dtype == torch.float32:
data = model[src_name].squeeze().numpy()
else:
data = model[src_name].squeeze().to(torch.float32).numpy()
data = data.astype(np.float32)
shape = data.shape
n_dims = len(shape)
print("convert_qwen_to_fp32_tensor: %40s" % src_name + "-> %-40s" % dst_name + " shape: ", shape, " type: ",
data.dtype)

        # default type is fp32
        ftype_cur = 0
if ftype == 1 and n_dims > 1:
data = data.astype(np.float16)
ftype_cur = 1
else:
data = data.astype(np.float32)

# header
        name_bytes = src_name.encode('utf-8')
        fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype_cur))
        for i in range(n_dims):
            fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
        fout.write(name_bytes)

# data
data.tofile(fout)

    # 3. write tensors
convert_qwen_to_fp32_tensor("model.embed_tokens.weight", "model.embed_tokens.weight", list_vars, fout)
convert_qwen_to_fp32_tensor("model.norm.weight", "model.norm.weight", list_vars, fout)
convert_qwen_to_fp32_tensor("lm_head.weight", "lm_head.weight", list_vars, fout)

for i in range(hparams["num_hidden_layers"]):
prefix = "model.layers." + str(i)

convert_qwen_to_fp32_tensor(f"{prefix}.input_layernorm.weight", f"{prefix}.input_layernorm.weight", list_vars,
fout)
convert_qwen_to_fp32_tensor(f"{prefix}.post_attention_layernorm.weight",
f"{prefix}.post_attention_layernorm.weight", list_vars, fout)
# qkv GEMM
convert_to_qx_bestla_tensor(f"{prefix}.self_attn.W_pack.weight", f"{prefix}.self_attn.W_pack.weight", list_vars,
fout, quantize_config)
convert_to_qx_bestla_tensor(f"{prefix}.self_attn.o_proj.weight", f"{prefix}.self_attn.o_proj.weight", list_vars,
fout, quantize_config)

# ffn GEMM
convert_to_qx_bestla_tensor(f"{prefix}.mlp.gate_proj", f"{prefix}.mlp.gate_proj.weight", list_vars, fout,
quantize_config)
convert_to_qx_bestla_tensor(f"{prefix}.mlp.down_proj", f"{prefix}.mlp.down_proj.weight", list_vars, fout,
quantize_config)
convert_to_qx_bestla_tensor(f"{prefix}.mlp.up_proj", f"{prefix}.mlp.up_proj.weight", list_vars, fout,
quantize_config)

fout.close()
print(f"Success! saved as {out_path}")


if __name__ == '__main__':
main()
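For reference, a minimal invocation sketch (paths and filename are hypothetical; the positional argument is the directory containing the downloaded GPTQ safetensors checkpoint):

```python
# Hypothetical usage: convert a locally downloaded GPTQ checkpoint to NE format.
main([
    "--outfile", "baichuan2-13b-chat-gptq.bin",
    "--outtype", "f32",
    "/models/Baichuan2-13B-Chat-GPTQ",
])
```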