From eed9b3072f4db0feeb14788c254a4a56478e0da9 Mon Sep 17 00:00:00 2001
From: Zhenzhong1 <109137058+Zhenzhong1@users.noreply.github.com>
Date: Fri, 15 Mar 2024 10:45:20 +0800
Subject: [PATCH] [GPTQ Enhancement] Support GPTQ for Baichuan2-13B & Falcon 7B & Phi-1.5 (#169)

---
 docs/gptq_and_awq.md                          |   5 +-
 docs/supported_models.md                      |  12 +-
 neural_speed/__init__.py                      |  10 +-
 neural_speed/convert/__init__.py              |  11 +-
 neural_speed/convert/common.py                |  83 +++++
 .../convert/convert_quantized_baichuan.py     | 196 +++++++++++
 .../convert/convert_quantized_bloom.py        | 243 -------------
 .../convert/convert_quantized_falcon.py       | 174 +++++++++
 neural_speed/convert/convert_quantized_phi.py | 329 ++++++++++++++++++
 .../convert/convert_quantized_qwen.py         | 107 +-----
 10 files changed, 819 insertions(+), 351 deletions(-)
 create mode 100644 neural_speed/convert/convert_quantized_baichuan.py
 delete mode 100644 neural_speed/convert/convert_quantized_bloom.py
 create mode 100644 neural_speed/convert/convert_quantized_falcon.py
 create mode 100644 neural_speed/convert/convert_quantized_phi.py

diff --git a/docs/gptq_and_awq.md b/docs/gptq_and_awq.md
index d8dfbc43f..ef887c1c1 100644
--- a/docs/gptq_and_awq.md
+++ b/docs/gptq_and_awq.md
@@ -12,8 +12,11 @@ Validated GPTQ & AWQ models directly from the HuggingFace:
 * [Mixtral-8x7B-Instruct-v0.1-GPTQ](https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ) & [Mixtral-8x7B-Instruct-v0.1-AWQ](https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ)
 * [Qwen-7B-Chat-GPTQ](https://huggingface.co/TheBloke/Qwen-7B-Chat-GPTQ) & [Qwen-7B-Chat-AWQ](https://huggingface.co/TheBloke/Qwen-7B-Chat-AWQ) &
 * [Qwen1.5-7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4)
 * [SOLAR-10.7B-v1.0-GPTQ](https://huggingface.co/TheBloke/SOLAR-10.7B-v1.0-GPTQ)
+* [Baichuan2-13B-Chat-GPTQ](https://hf-mirror.com/TheBloke/Baichuan2-13B-Chat-GPTQ)
+* [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b/tree/main)
+* [onlinex/phi-1_5-gptq-4bit](https://hf-mirror.com/onlinex/phi-1_5-gptq-4bit)
-Please check more validated GPTQ & AWQ models in the list of [supported_models](./supported_models.md).
+For more details, please check the list of [supported_models](./supported_models.md).
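As a quick illustration, the `convert_model` helper in `neural_speed/convert/__init__.py` (whose model map this patch extends) can drive these quantized converters from Python. The sketch below is not part of the patch; the checkpoint directory and output filename are placeholders for a locally downloaded GPTQ model.

```python
# Minimal sketch: convert a local GPTQ checkpoint to the Neural Speed binary format.
# "./Baichuan2-13B-Chat-GPTQ" is a placeholder for a downloaded checkpoint directory.
from neural_speed.convert import convert_model

convert_model(
    "./Baichuan2-13B-Chat-GPTQ",       # directory containing the GPTQ safetensors + config
    "baichuan2-13b-chat-gptq-ne.bin",  # output file
    outtype="f32",
    use_quantized_model=True,          # dispatch to the convert_quantized_* scripts
)
```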
## Examples diff --git a/docs/supported_models.md b/docs/supported_models.md index 4aad26d29..7db9c6b77 100644 --- a/docs/supported_models.md +++ b/docs/supported_models.md @@ -235,13 +235,13 @@ Neural Speed supports the following models: Baichuan-13B-Chat, Baichuan2-13B-Chat ✅ - - - ✅ - - - + ✅ + ✅ + ✅ + ✅ + ✅ + ✅ 4.33.1 4096 diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py index dda41c270..8116b2086 100644 --- a/neural_speed/__init__.py +++ b/neural_speed/__init__.py @@ -24,7 +24,6 @@ class Model: - def __init__(self): self.module = None self.model = None @@ -83,6 +82,15 @@ def get_model_type(model_config): model_type = model_maps.get(model_config.model_type, model_config.model_type) if model_type == "chatglm" and "chatglm2" in model_config._name_or_path: model_type = "chatglm2" + + # for TheBloke/falcon-40b-instruct-GPTQ & TheBloke/Falcon-7B-Instruct-GPTQ + if model_type == "RefinedWebModel" or model_type == "RefinedWeb": + model_type = "falcon" + + # for TheBloke/phi-2-GPTQ + if model_type == "phi-msft": + model_type = "phi" + return model_type def init(self, diff --git a/neural_speed/convert/__init__.py b/neural_speed/convert/__init__.py index 18ce11490..3cc4f2301 100644 --- a/neural_speed/convert/__init__.py +++ b/neural_speed/convert/__init__.py @@ -18,7 +18,15 @@ from pathlib import Path import subprocess -model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder", "whisper": "whisper", "qwen2": "qwen"} +model_maps = { + "gpt_neox": "gptneox", + "gpt_bigcode": "starcoder", + "whisper": "whisper", + "qwen2": "qwen", + "RefinedWebModel": "falcon", + "RefinedWeb": "falcon", + "phi-msft": "phi" +} def convert_model(model, outfile, outtype="f32", model_hub="huggingface", use_quantized_model=False): @@ -28,6 +36,7 @@ def convert_model(model, outfile, outtype="f32", model_hub="huggingface", use_qu else: from transformers import AutoConfig config = AutoConfig.from_pretrained(model, trust_remote_code=True) + model_type = model_maps.get(config.model_type, config.model_type) if use_quantized_model: diff --git a/neural_speed/convert/common.py b/neural_speed/convert/common.py index fc0c7d1fc..d4e5f49cc 100644 --- a/neural_speed/convert/common.py +++ b/neural_speed/convert/common.py @@ -516,3 +516,86 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head, compute_dtype="int8") dst.flatten()[:byte_size].tofile(fout) print(f"converting {dst_name} qauntized tensor to bestla q4 block") + + +def convert_to_qx_bestla_tensor(src_name, dst_name, model, fout, q_config): + # unpack weight and repack into 3bits / 4bits BestLA format + import neural_speed.llama_cpp as cpp_model + if ".weight" in src_name: + src_name = src_name.replace(".weight", "") + qzeros = model[f"{src_name}.qzeros"] + zeros = qzeros_to_zeros(qzeros) + scales = model[f"{src_name}.scales"] + qweight = model[f"{src_name}.qweight"] + + int_weight, gptq_scales, gptq_zeros = unpack_weight(qweight, scales, qzeros, q_config) + int_weight = int_weight.view(-1, int_weight.shape[-1]) + + # shuffle weight in GPTQ when act order is on + if 'desc_act' in q_config and q_config['desc_act']: + g_idx = model[f"{src_name}.g_idx"] + int_weight2 = int_weight.clone() + group_size = q_config['group_size'] + group_dict = {} + for i in range(len(g_idx)): + group_idx = g_idx[i].item() + if group_idx not in group_dict: + target_idx = group_idx * group_size + group_dict[group_idx] = 0 + else: + group_dict[group_idx] = group_dict[group_idx] + 1 + target_idx = group_idx * group_size + group_dict[group_idx] + 
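+            # act-order (desc_act) stores rows in activation order; g_idx[i] gives the
+            # quantization group of row i, so the scatter below places row i back at
+            # group_idx * group_size + (rows of that group already seen), making each
+            # group's rows contiguous again before repacking.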
int_weight2[target_idx] = int_weight[i] + int_weight = int_weight2 + + # shape = int_weight.shape[::-1] + shape = int_weight.shape[::-1] + # write_header(fout, shape[::-1], dst_name, GGML_QJBLAS_TYPE) + n_dims = len(shape) + str = dst_name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), GGML_QJBLAS_TYPE)) + for i in range(n_dims): + fout.write(struct.pack("i", shape[n_dims - 1 - i])) + fout.write(str) + + # INC stores sig-int4 value as u4(range 0~15, they add a offset), + # BesTLA requires s4_clip((-8,7)*16), so we sub the offset and then mul 16. + # Int3 is the same as int4, but offset=4, mul scale==32. + weight_dtype = "int8" + if q_config['bits'] == 4: + int_weight = (int_weight - 8) * 16 + gptq_scales = gptq_scales / 16 + gptq_zeros = (gptq_zeros - 8) * 16 + weight_dtype = "int4" + elif q_config['bits'] == 3: + int_weight = (int_weight - 4) * 32 + gptq_scales = gptq_scales / 32 + gptq_zeros = (gptq_zeros - 4) * 32 + weight_dtype = "int3" + else: + ValueError(f"Unsupported q_config[bits]: {q_config['bits']}") + + dst = np.zeros((int_weight.shape[0], int_weight.shape[1] * 4), dtype=np.int8) + int_weight = np.ascontiguousarray(int_weight.numpy()) + gptq_scales = np.ascontiguousarray((gptq_scales.float()).numpy()) + if q_config['sym']: + gptq_zeros = np.empty(0, dtype=np.int8) + else: + gptq_zeros = np.ascontiguousarray(gptq_zeros.numpy()) + if 'desc_act' in q_config and q_config['desc_act']: + g_idx = np.ascontiguousarray(g_idx.numpy()) + else: + g_idx = np.empty(0, dtype=np.int32) + + # repack int weight in BesTLA format + byte_size = cpp_model.Model.np_bestla_qpack(int_weight, + gptq_scales, + gptq_zeros, + g_idx, + dst, + weight_dtype=weight_dtype, + group_size=q_config['group_size'], + alg="sym" if q_config['sym'] else "asym", + compute_dtype="int8") + dst.flatten()[:byte_size].tofile(fout) + print(f"convert_to_qx_bestla_tensor: {src_name:>40} -> {dst_name:<40} shape: {shape}, byte_size: {byte_size:<10}") diff --git a/neural_speed/convert/convert_quantized_baichuan.py b/neural_speed/convert/convert_quantized_baichuan.py new file mode 100644 index 000000000..22928a6bc --- /dev/null +++ b/neural_speed/convert/convert_quantized_baichuan.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import json +import sys +import re +import argparse +from common import * +from sentencepiece import SentencePieceProcessor +from transformers import AutoModelForCausalLM, AutoTokenizer + + +def load_vocab_for_baichuan(path: Path) -> SentencePieceVocab: + # Be extra-friendly and accept either a file or a directory. Also, if it's + # a directory, it might be the model directory, and tokenizer.model might + # be in the parent of that. + if path.is_dir(): + path2 = path / "tokenizer.model" + # Use `.parent` instead of /.. to handle the symlink case better. 
+ path3 = path.parent / "tokenizer.model" + if path2.exists(): + path = path2 + elif path3.exists(): + path = path3 + else: + raise FileNotFoundError( + f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, \ + pass the directory as --vocab-dir") + added_tokens_path = path.parent / "added_tokens.json" + print(f"Loading vocab file {path}") + return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) + + +def main(args_in: Optional[List[str]] = None) -> None: + parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") + parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", + choices=["huggingface", "modelscope"], + default="huggingface", + help="hub to load model") + parser.add_argument("model", type=Path, help="directory containing model file") + args = parser.parse_args(args_in) + + out_path = args.outfile.as_posix() + model_path = args.model.as_posix() + + model, hparams, quantize_config = load_quantized_safetensors(model_path) + list_vars = model + + print(hparams) + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + fout = open(out_path, "wb") + + # possible data types + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype = 0 + if args.outtype == "f16": + ftype = 1 + + # 1. write hparams + print(hparams) + ne_file_magic = 0x67676d66 + fout.write(struct.pack("i", ne_file_magic)) # magic: ne in hex + fout.write(struct.pack("i", 1)) + + fout.write(struct.pack("i", hparams["vocab_size"])) + fout.write(struct.pack("i", hparams["hidden_size"])) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", hparams["num_attention_heads"])) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", hparams["num_hidden_layers"])) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", ftype)) + fout.write(struct.pack("i", hparams["model_max_length"])) + fout.write(struct.pack("f", 0)) + fout.write(struct.pack("f", 0)) + fout.write(struct.pack("i", 0)) + + fout.write(struct.pack("i", 0)) # word_embed_proj_dim (for opt) + fout.write(struct.pack("i", 0)) # do_layer_norm_before (for opt) + + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", hparams["intermediate_size"])) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used + fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps + fout.write(struct.pack("f", 10000.0)) # freq_base + fout.write(struct.pack("f", 1.0)) # rope_factor + + fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled + fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings + fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) + + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) + fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) + fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) + + # 2. 
vocab + tokenizer_path = Path(tokenizer.vocab_file).parent + vocab = load_vocab_for_baichuan(Path(tokenizer_path)) + counter = 0 + for text, score in vocab.all_tokens(): + fout.write(struct.pack("i", len(text))) + fout.write(text) + fout.write(struct.pack("f", score)) + counter += 1 + + while counter < hparams["vocab_size"]: + fout.write(struct.pack("i", len(text))) + fout.write(text) + fout.write(struct.pack("f", 0)) + counter += 1 + + def convert_qwen_to_fp32_tensor(src_name, dst_name, model, fout): + # qwen-gptq is torch.bfloat16 mostly. + if model[src_name].dtype == torch.float32: + data = model[src_name].squeeze().numpy() + else: + data = model[src_name].squeeze().to(torch.float32).numpy() + data = data.astype(np.float32) + shape = data.shape + n_dims = len(shape) + print("convert_qwen_to_fp32_tensor: %40s" % src_name + "-> %-40s" % dst_name + " shape: ", shape, " type: ", + data.dtype) + + #ftype_cur = {torch.float16: 1, torch.float32: 0}[data.dtype] + # default type is fp32 + ftype_cur = 0 + if ftype == 1 and n_dims > 1: + data = data.astype(np.float16) + ftype_cur = 1 + else: + data = data.astype(np.float32) + + # header + # write_header(fout, shape, dst_name, ftype_cur) + str = src_name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(str) + + # data + data.tofile(fout) + + #3. write tensors + convert_qwen_to_fp32_tensor("model.embed_tokens.weight", "model.embed_tokens.weight", list_vars, fout) + convert_qwen_to_fp32_tensor("model.norm.weight", "model.norm.weight", list_vars, fout) + convert_qwen_to_fp32_tensor("lm_head.weight", "lm_head.weight", list_vars, fout) + + for i in range(hparams["num_hidden_layers"]): + prefix = "model.layers." + str(i) + + convert_qwen_to_fp32_tensor(f"{prefix}.input_layernorm.weight", f"{prefix}.input_layernorm.weight", list_vars, + fout) + convert_qwen_to_fp32_tensor(f"{prefix}.post_attention_layernorm.weight", + f"{prefix}.post_attention_layernorm.weight", list_vars, fout) + # qkv GEMM + convert_to_qx_bestla_tensor(f"{prefix}.self_attn.W_pack.weight", f"{prefix}.self_attn.W_pack.weight", list_vars, + fout, quantize_config) + convert_to_qx_bestla_tensor(f"{prefix}.self_attn.o_proj.weight", f"{prefix}.self_attn.o_proj.weight", list_vars, + fout, quantize_config) + + # ffn GEMM + convert_to_qx_bestla_tensor(f"{prefix}.mlp.gate_proj", f"{prefix}.mlp.gate_proj.weight", list_vars, fout, + quantize_config) + convert_to_qx_bestla_tensor(f"{prefix}.mlp.down_proj", f"{prefix}.mlp.down_proj.weight", list_vars, fout, + quantize_config) + convert_to_qx_bestla_tensor(f"{prefix}.mlp.up_proj", f"{prefix}.mlp.up_proj.weight", list_vars, fout, + quantize_config) + + fout.close() + print(f"Success! saved as {out_path}") + + +if __name__ == '__main__': + main() diff --git a/neural_speed/convert/convert_quantized_bloom.py b/neural_speed/convert/convert_quantized_bloom.py deleted file mode 100644 index a323019e8..000000000 --- a/neural_speed/convert/convert_quantized_bloom.py +++ /dev/null @@ -1,243 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import os -import numpy as np -import struct -from transformers import AutoTokenizer, TextStreamer, AutoConfig -from transformers import AutoModelForCausalLM -import json -import copy -from neural_compressor.adaptor.torch_utils.weight_only import quant_weight, quant_weight_w_scale -import intel_extension_for_transformers.llm.runtime.graph.chatglm2_cpp as cpp_model - -GGML_QK8_0 = 32 -GGML_QK4_0 = 32 -GGML_QK4_1 = 32 -GGML_QK5_0 = 32 -GGML_QK5_1 = 32 - - -def quantize_q4_0(tensor: torch.Tensor) -> torch.CharTensor: - # equivalent to ggml_quantize_q4_0 in ggml.c - # import pudb; pudb.set_trace() - assert tensor.shape[1] % GGML_QK4_0 == 0 - tensor = tensor.view(-1, GGML_QK4_0) - abs_max_indices = tensor.abs().max(dim=-1, keepdim=True).indices - max_values = torch.take_along_dim(tensor, abs_max_indices, dim=-1) - scale = max_values / -8 - tensor = (tensor / scale + 8).round().clamp(min=0, max=15).char() - # compress two int4 weights into an int8 - tensor = tensor[:, :16] | (tensor[:, 16:] << 4) - # add scale into each block - tensor = torch.cat((scale.half().view(torch.int8), tensor), dim=-1) - return tensor - - -def fetch_module(model, op_name): - """Get module with a given op name. - - Args: - model (object): the input model. - op_name (str): name of op. - - Returns: - module (object). - """ - module = model - name_list = op_name.split(".") - for name in name_list: - if hasattr(module, name): - module = getattr(module, name) - else: - module = module - return module - - -def extract_gptq(model, k, v): - print(f"Compressing {k}") - if v["dtype"] == "fp32": - return - else: - dtype = v["dtype"] - num_bits = v["bits"] - group_size = v["group_size"] - scheme = v["scheme"] - m = fetch_module(model, k) - m_weight = m.recover() - # import pdb; pdb.set_trace() - gptq_conf = gptq_config[k] - if "perm" in gptq_conf: - gptq_perm = torch.tensor(gptq_conf["perm"]) - fp32_weight = m_weight[:, gptq_perm] - else: - fp32_weight = m_weight - gptq_perm = None - gptq_scale = torch.tensor(gptq_conf["scale"]) - gptq_zp = None if scheme == "sym" else torch.tensor(gptq_conf["zero"]) - int_weight = quant_weight_w_scale(fp32_weight, gptq_scale, gptq_zp, group_size) - return int_weight.to(torch.int8), gptq_scale, gptq_zp - - -# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a significant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. 
- """ - bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - - cs = [chr(n) for n in cs] - - return dict(zip(bs, cs)) - - -model_name = "/mnt/disk1/data2/zhenweil/models/bloom/bloom-7b1" -prompt = "Once upon a time, a little girl" -tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) -config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) -inputs = tokenizer(prompt, return_tensors="pt").input_ids -streamer = TextStreamer(tokenizer) -model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True) - -gptq_model = "/mnt/disk1/data2/zhenweil/models/bloom/bloom-gptq/" -from neural_compressor.utils.pytorch import load - -new_model = load(gptq_model, copy.deepcopy(model), weight_only=True) -new_model_bk = copy.deepcopy(new_model) -from neural_compressor.model import Model as INCModel - -inc_model = INCModel(new_model) -qweight_config_path = gptq_model + "qconfig.json" -gptq_config_path = gptq_model + "gptq_config.json" -inc_model.export_compressed_model(qweight_config_path=qweight_config_path, gptq_config_path=gptq_config_path) - -with open(qweight_config_path, "r") as f: - weight_config = json.load(f) -with open(gptq_config_path, "r") as f: - gptq_config = json.load(f) - -list_vars = new_model_bk.state_dict() -f = open("bloom_gptq_q4.bin", "wb") - -# 1. write head and params -hparams = config.to_dict() -ftype = 0 -f.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex - -f.write(struct.pack("i", hparams["vocab_size"])) -f.write(struct.pack("i", hparams["hidden_size"])) -f.write(struct.pack("i", 1)) -f.write(struct.pack("i", hparams["n_head"])) -f.write(struct.pack("i", hparams.get("n_head_kv", 0))) # multi-query attention -f.write(struct.pack("i", hparams["n_layer"])) -f.write(struct.pack("i", 0)) -f.write(struct.pack("i", ftype)) -f.write(struct.pack("i", 0)) -f.write(struct.pack("f", 0)) -f.write(struct.pack("f", 0)) -f.write(struct.pack("i", 0)) -f.write(struct.pack("i", 0)) # word_embed_proj_dim (for opt) -f.write(struct.pack("i", 0)) # do_layer_norm_before (for opt) - -f.write(struct.pack("i", 0)) -f.write(struct.pack("i", 0)) -f.write(struct.pack("i", 0)) -f.write(struct.pack("i", 0)) # n_experts -f.write(struct.pack("i", 0)) # n_expert_used -f.write(struct.pack("f", 1e-6)) # rms norm eps -f.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled -f.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings -f.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) - -f.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) -f.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) -f.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) -f.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) - -# 2. vocab -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} -byte_encoder = bytes_to_unicode() -byte_decoder = {v: k for k, v in byte_encoder.items()} - -for i in range(hparams["vocab_size"]): - text = tokenizer.decode([i]).encode('utf-8') - f.write(struct.pack("i", len(text))) - f.write(text) - -# 3. 
write tensors -for name in list_vars.keys(): - src = name - if "query_key_value" in src: - q_d, k_d, v_d = list_vars[src].reshape(config.n_head, 3, -1).unbind(1) - list_vars[src] = torch.cat([q_d, k_d, v_d], dim=0).reshape_as(list_vars[src]) - - ftype_cur = 0 - if ".weight" in name and list_vars[name].dim() == 2: - ftype_cur = 2 # TODO(Zhenwei) support bestla - - data = list_vars[src].squeeze().numpy() - data = data.astype(np.float32) - - n_dims = len(data.shape) - print(name, n_dims, data.shape) - str = name.encode('utf-8') - f.write(struct.pack("iii", n_dims, len(str), ftype_cur)) - for i in range(n_dims): - f.write(struct.pack("i", data.shape[n_dims - 1 - i])) - f.write(str) - - if ".weight" in name and list_vars[name].dim() == 2: - # to quantize - k = name.replace(".weight", "") - if k in weight_config and weight_config[k]["dtype"] != "fp32": - print(f"bestla {k}") - int_weight, gptq_scale, gptq_zp = extract_gptq(new_model, k, weight_config[k]) - - tensor = int_weight.view(-1, 32) + 8 - tensor = tensor[:, :16] | (tensor[:, 16:] << 4) - gptq_scale = gptq_scale.view(-1, 1) - gptq_scale = torch.cat([gptq_scale, gptq_scale, gptq_scale, gptq_scale], dim=1).view(-1, 1) - tensor = torch.cat((gptq_scale.half().view(torch.int8), tensor), dim=-1) - if "query_key_value" in src: - q_d, k_d, v_d = tensor.reshape(config.n_head, 3, -1).unbind(1) - tensor = torch.cat([q_d, k_d, v_d], dim=0).reshape_as(tensor) - tensor.numpy().tofile(f) - - else: - print(f"q4_0 {k}") - tensor = quantize_q4_0(list_vars[name]) - tensor.numpy().tofile(f) - else: - # keep float32 - print(f"float {name}") - data.tofile(f) - # break -f.close() diff --git a/neural_speed/convert/convert_quantized_falcon.py b/neural_speed/convert/convert_quantized_falcon.py new file mode 100644 index 000000000..956b0a92b --- /dev/null +++ b/neural_speed/convert/convert_quantized_falcon.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
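+# Converts a GPTQ-quantized Falcon checkpoint (safetensors) to the Neural Speed NE
+# binary: writes the hparams header and vocab, keeps embeddings, layer norms and
+# lm_head in fp32, and repacks the GPTQ GEMM weights into BesTLA int4/int3 blocks
+# via convert_to_qx_bestla_tensor.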
+ +import os +import json +import sys +import re +import argparse +from common import * +from sentencepiece import SentencePieceProcessor +from transformers import AutoModelForCausalLM, AutoTokenizer + + +def main(args_in: Optional[List[str]] = None) -> None: + parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") + parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", + choices=["huggingface", "modelscope"], + default="huggingface", + help="hub to load model") + parser.add_argument("model", type=Path, help="directory containing model file") + args = parser.parse_args(args_in) + + out_path = args.outfile.as_posix() + model_path = args.model.as_posix() + + model, hparams, quantize_config = load_quantized_safetensors(model_path) + list_vars = model + + print(hparams) + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + fout = open(out_path, "wb") + + # possible data types + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype = 0 + if args.outtype == "f16": + ftype = 1 + + # 1. write hparams + n_head_kv = hparams.get("n_head_kv", 1) + n_head = hparams["n_head"] + head_dim = hparams["hidden_size"] // n_head + + fout.write(struct.pack("i", 0x67676d6c)) # magic: falcon in hex + + fout.write(struct.pack("i", hparams["vocab_size"])) + fout.write(struct.pack("i", hparams["hidden_size"])) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", n_head)) + fout.write(struct.pack("i", n_head_kv)) # multi-query attention + fout.write(struct.pack("i", hparams["n_layer"])) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", ftype)) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("f", 0)) + fout.write(struct.pack("f", 0)) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # word_embed_proj_dim (for opt) + fout.write(struct.pack("i", 0)) # do_layer_norm_before (for opt) + + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used + fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps + fout.write(struct.pack("f", 10000.0)) # freq_base + fout.write(struct.pack("f", 1.0)) # rope_factor + + fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled + fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings + fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) + + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) + fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) + fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) + + # 2. 
vocab + reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + byte_encoder = bytes_to_unicode() + byte_decoder = {v: k for k, v in byte_encoder.items()} + + for i in range(hparams["vocab_size"]): + text = bytearray([byte_decoder[c] for c in reverse_vocab[i]]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + + def convert_to_fp32_tensor(src_name, dst_name, model, fout): + # qwen-gptq is torch.bfloat16 mostly. + if model[src_name].dtype == torch.float32: + data = model[src_name].squeeze().numpy() + else: + data = model[src_name].squeeze().to(torch.float32).numpy() + data = data.astype(np.float32) + shape = data.shape + n_dims = len(shape) + print("convert_to_fp32_tensor: %45s" % src_name + "-> %-40s" % dst_name + " shape: ", shape, " type: ", + data.dtype) + + #ftype_cur = {torch.float16: 1, torch.float32: 0}[data.dtype] + # default type is fp32 + ftype_cur = 0 + if ftype == 1 and n_dims > 1: + data = data.astype(np.float16) + ftype_cur = 1 + else: + data = data.astype(np.float32) + + # header + # write_header(fout, shape, dst_name, ftype_cur) + str = src_name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(str) + + # data + data.tofile(fout) + + #3. write tensors + convert_to_fp32_tensor("transformer.word_embeddings.weight", "transformer.word_embeddings.weight", list_vars, fout) + convert_to_fp32_tensor("transformer.ln_f.weight", "transformer.ln_f.weight", list_vars, fout) + convert_to_fp32_tensor("transformer.ln_f.bias", "transformer.ln_f.bias", list_vars, fout) + convert_to_fp32_tensor("lm_head.weight", "lm_head.weight", list_vars, fout) + + for i in range(hparams["n_layer"]): + prefix = "transformer.h." + str(i) + + if n_head_kv == 1: + convert_to_fp32_tensor(f"{prefix}.input_layernorm.weight", f"{prefix}.input_layernorm.weight", list_vars, + fout) + convert_to_fp32_tensor(f"{prefix}.input_layernorm.bias", f"{prefix}.input_layernorm.bias", list_vars, fout) + elif n_head_kv == 8: + convert_to_fp32_tensor(f"{prefix}.ln_mlp.weight", f"{prefix}.ln_mlp.weight", list_vars, fout) + convert_to_fp32_tensor(f"{prefix}.ln_mlp.bias", f"{prefix}.ln_mlp.bias", list_vars, fout) + convert_to_fp32_tensor(f"{prefix}.ln_attn.weight", f"{prefix}.ln_attn.weight", list_vars, fout) + convert_to_fp32_tensor(f"{prefix}.ln_attn.bias", f"{prefix}.ln_attn.bias", list_vars, fout) + + # qkv GEMM + convert_to_qx_bestla_tensor(f"{prefix}.self_attention.query_key_value.weight", + f"{prefix}.self_attention.query_key_value.weight", list_vars, fout, quantize_config) + convert_to_qx_bestla_tensor(f"{prefix}.self_attention.dense.weight", f"{prefix}.self_attention.dense.weight", + list_vars, fout, quantize_config) + + # ffn GEMM + convert_to_qx_bestla_tensor(f"{prefix}.mlp.dense_h_to_4h", f"{prefix}.mlp.dense_h_to_4h.weight", list_vars, + fout, quantize_config) + convert_to_qx_bestla_tensor(f"{prefix}.mlp.dense_4h_to_h", f"{prefix}.mlp.dense_4h_to_h.weight", list_vars, + fout, quantize_config) + + fout.close() + print(f"Success! 
saved as {out_path}") + + +if __name__ == '__main__': + main() diff --git a/neural_speed/convert/convert_quantized_phi.py b/neural_speed/convert/convert_quantized_phi.py new file mode 100644 index 000000000..3c085cda2 --- /dev/null +++ b/neural_speed/convert/convert_quantized_phi.py @@ -0,0 +1,329 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import json +import sys +import re +import argparse +from common import * +from sentencepiece import SentencePieceProcessor +from transformers import AutoModelForCausalLM, AutoTokenizer + + +def convert_phi1_5_gptq_to_bestTLA(model_path, out_path, outtype, model, hparams, quantize_config): + list_vars = model + for name in list_vars.keys(): + print(name) + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + fout = open(out_path, "wb") + + # possible data types + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype = 0 + if outtype == "f16": + ftype = 1 + + # 1. write hparams + print(hparams) + ne_file_magic = 0x67676d66 + n_rot = int(hparams["partial_rotary_factor"] * hparams["hidden_size"] / hparams["num_attention_heads"]) + # n_rot = hparams['rotary_dim'] + fout.write(struct.pack("i", ne_file_magic)) # magic: ne in hex + fout.write(struct.pack("i", 1)) + fout.write(struct.pack("i", hparams["vocab_size"])) + fout.write(struct.pack("i", hparams["hidden_size"])) + fout.write(struct.pack("i", hparams["intermediate_size"])) # dummy data + fout.write(struct.pack("i", hparams["num_attention_heads"])) + fout.write(struct.pack("i", hparams["num_key_value_heads"])) # multi-query attention + fout.write(struct.pack("i", hparams["num_hidden_layers"])) + fout.write(struct.pack("i", n_rot)) + fout.write(struct.pack("i", ftype)) + fout.write(struct.pack("i", hparams["max_position_embeddings"])) + fout.write(struct.pack("f", 0.0)) + fout.write(struct.pack("f", 0.0)) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # word_embed_proj_dim (for opt) + fout.write(struct.pack("i", 0)) # do_layer_norm_before (for opt) + + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used + fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps + fout.write(struct.pack("f", 10000.0)) # freq_base + fout.write(struct.pack("f", 1.0)) # rope_factor + fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled + fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings + fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is 
not None else -1)) + fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) + + # 2. vocab + for i in range(hparams["vocab_size"]): + if i < tokenizer.vocab_size: + text = tokenizer.decode([i]).encode('utf-8') + fout.write(struct.pack("i", len(text))) + fout.write(text) + fout.write(struct.pack("f", 0.0 - i)) + else: + text = tokenizer.decode([tokenizer.vocab_size - 1]).encode('utf-8') + fout.write(struct.pack("i", len(text))) + fout.write(text) + fout.write(struct.pack("f", -10000)) + + def convert_qwen_to_fp32_tensor(src_name, dst_name, model, fout): + # qwen-gptq is torch.bfloat16 mostly. + if model[src_name].dtype == torch.float32: + data = model[src_name].squeeze().numpy() + else: + data = model[src_name].squeeze().to(torch.float32).numpy() + data = data.astype(np.float32) + shape = data.shape + n_dims = len(shape) + print("convert_qwen_to_fp32_tensor: %40s" % src_name + "-> %-40s" % dst_name + " shape: ", shape, " type: ", + data.dtype) + + #ftype_cur = {torch.float16: 1, torch.float32: 0}[data.dtype] + # default type is fp32 + ftype_cur = 0 + if ftype == 1 and n_dims > 1: + data = data.astype(np.float16) + ftype_cur = 1 + else: + data = data.astype(np.float32) + + # header + # write_header(fout, shape, dst_name, ftype_cur) + str = dst_name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(str) + + # data + data.tofile(fout) + + #3. write tensors + convert_qwen_to_fp32_tensor("model.embed_tokens.weight", "model.embed_tokens.weight", list_vars, fout) + convert_qwen_to_fp32_tensor("model.final_layernorm.weight", "model.final_layernorm.weight", list_vars, fout) + convert_qwen_to_fp32_tensor("model.final_layernorm.bias", "model.final_layernorm.bias", list_vars, fout) + convert_qwen_to_fp32_tensor("lm_head.weight", "lm_head.weight", list_vars, fout) + convert_qwen_to_fp32_tensor("lm_head.bias", "lm_head.bias", list_vars, fout) + + for i in range(hparams["num_hidden_layers"]): + prefix = "model.layers." + str(i) + renamed_prefix = "model.layers." 
+ str(i) + + convert_qwen_to_fp32_tensor(f"{prefix}.input_layernorm.weight", f"{renamed_prefix}.input_layernorm.weight", + list_vars, fout) + convert_qwen_to_fp32_tensor(f"{prefix}.input_layernorm.bias", f"{renamed_prefix}.input_layernorm.bias", + list_vars, fout) + + # qkv GEMM + convert_to_qx_bestla_tensor(f"{prefix}.self_attn.q_proj.weight", f"{prefix}.self_attn.q_proj.weight", list_vars, + fout, quantize_config) + convert_to_qx_bestla_tensor(f"{prefix}.self_attn.k_proj.weight", f"{prefix}.self_attn.k_proj.weight", list_vars, + fout, quantize_config) + convert_to_qx_bestla_tensor(f"{prefix}.self_attn.v_proj.weight", f"{prefix}.self_attn.v_proj.weight", list_vars, + fout, quantize_config) + convert_to_qx_bestla_tensor(f"{prefix}.self_attn.dense.weight", f"{prefix}.self_attn.dense.weight", list_vars, + fout, quantize_config) + + convert_qwen_to_fp32_tensor(f"{prefix}.self_attn.q_proj.bias", f"{prefix}.self_attn.q_proj.bias", list_vars, + fout) + convert_qwen_to_fp32_tensor(f"{prefix}.self_attn.k_proj.bias", f"{prefix}.self_attn.k_proj.bias", list_vars, + fout) + convert_qwen_to_fp32_tensor(f"{prefix}.self_attn.v_proj.bias", f"{prefix}.self_attn.v_proj.bias", list_vars, + fout) + convert_qwen_to_fp32_tensor(f"{prefix}.self_attn.dense.bias", f"{prefix}.self_attn.dense.bias", list_vars, fout) + + # ffn GEMM + convert_to_qx_bestla_tensor(f"{prefix}.mlp.fc1.weight", f"{renamed_prefix}.mlp.fc1.weight", list_vars, fout, + quantize_config) + convert_to_qx_bestla_tensor(f"{prefix}.mlp.fc2.weight", f"{renamed_prefix}.mlp.fc2.weight", list_vars, fout, + quantize_config) + + convert_qwen_to_fp32_tensor(f"{prefix}.mlp.fc1.bias", f"{renamed_prefix}.mlp.fc1.bias", list_vars, fout) + convert_qwen_to_fp32_tensor(f"{prefix}.mlp.fc2.bias", f"{renamed_prefix}.mlp.fc2.bias", list_vars, fout) + + fout.close() + print(f"Success! saved as {out_path}") + + +def convert_phi2_gptq_to_bestTLA(model_path, model, out_path, hparams, quantize_config): + list_vars = model + for name in list_vars.keys(): + print(name) + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + fout = open(out_path, "wb") + + # possible data types + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype = 0 + if outtype == "f16": + ftype = 1 + + # 1. 
write hparams + print(hparams) + ne_file_magic = 0x67676d66 + #n_rot = int(hparams["partial_rotary_factor"]*hparams["hidden_size"]/hparams["num_attention_heads"]) + n_rot = hparams['rotary_dim'] + fout.write(struct.pack("i", ne_file_magic)) # magic: ne in hex + fout.write(struct.pack("i", 1)) + fout.write(struct.pack("i", hparams["vocab_size"])) + fout.write(struct.pack("i", hparams["n_embd"])) + fout.write(struct.pack("i", hparams["n_embd"] * 4)) # dummy data + fout.write(struct.pack("i", hparams["n_head"])) + fout.write(struct.pack("i", hparams["n_head"])) # multi-query attention + fout.write(struct.pack("i", hparams["n_layer"])) + fout.write(struct.pack("i", n_rot)) + fout.write(struct.pack("i", ftype)) + fout.write(struct.pack("i", hparams["n_positions"])) + fout.write(struct.pack("f", 0.0)) + fout.write(struct.pack("f", 0.0)) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # word_embed_proj_dim (for opt) + fout.write(struct.pack("i", 0)) # do_layer_norm_before (for opt) + + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used + fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps + fout.write(struct.pack("f", 10000.0)) # freq_base + fout.write(struct.pack("f", 1.0)) # rope_factor + fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled + fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings + fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) + + # 2. vocab + for i in range(hparams["vocab_size"]): + if i < tokenizer.vocab_size: + text = tokenizer.decode([i]).encode('utf-8') + fout.write(struct.pack("i", len(text))) + fout.write(text) + fout.write(struct.pack("f", 0.0 - i)) + else: + text = tokenizer.decode([tokenizer.vocab_size - 1]).encode('utf-8') + fout.write(struct.pack("i", len(text))) + fout.write(text) + fout.write(struct.pack("f", -10000)) + + def convert_qwen_to_fp32_tensor(src_name, dst_name, model, fout): + # qwen-gptq is torch.bfloat16 mostly. + if model[src_name].dtype == torch.float32: + data = model[src_name].squeeze().numpy() + else: + data = model[src_name].squeeze().to(torch.float32).numpy() + data = data.astype(np.float32) + shape = data.shape + n_dims = len(shape) + print("convert_qwen_to_fp32_tensor: %40s" % src_name + "-> %-40s" % dst_name + " shape: ", shape, " type: ", + data.dtype) + + #ftype_cur = {torch.float16: 1, torch.float32: 0}[data.dtype] + # default type is fp32 + ftype_cur = 0 + if ftype == 1 and n_dims > 1: + data = data.astype(np.float16) + ftype_cur = 1 + else: + data = data.astype(np.float32) + + # header + # write_header(fout, shape, dst_name, ftype_cur) + str = dst_name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(str) + + # data + data.tofile(fout) + + #3. 
write tensors + convert_qwen_to_fp32_tensor("transformer.embd.wte.weight", "model.embed_tokens.weight", list_vars, fout) + convert_qwen_to_fp32_tensor("lm_head.ln.weight", "model.final_layernorm.weight", list_vars, fout) + convert_qwen_to_fp32_tensor("lm_head.ln.bias", "model.final_layernorm.bias", list_vars, fout) + convert_qwen_to_fp32_tensor("lm_head.linear.weight", "lm_head.weight", list_vars, fout) + convert_qwen_to_fp32_tensor("lm_head.linear.bias", "lm_head.bias", list_vars, fout) + + for i in range(hparams["n_layer"]): + prefix = "transformer.h." + str(i) + renamed_prefix = "model.layers." + str(i) + + convert_qwen_to_fp32_tensor(f"{prefix}.ln.weight", f"{renamed_prefix}.input_layernorm.weight", list_vars, fout) + convert_qwen_to_fp32_tensor(f"{prefix}.ln.bias", f"{renamed_prefix}.input_layernorm.bias", list_vars, fout) + + # qkv GEMM + convert_to_qx_bestla_tensor(f"{prefix}.mixer.Wqkv.weight", f"{renamed_prefix}.mixer.Wqkv.weight", list_vars, + fout, quantize_config) + convert_qwen_to_fp32_tensor(f"{prefix}.mixer.Wqkv.bias", f"{renamed_prefix}.mixer.Wqkv.bias", list_vars, fout) + + convert_to_qx_bestla_tensor(f"{prefix}.mixer.out_proj.weight", f"{renamed_prefix}.mixer.out_proj.weight", + list_vars, fout, quantize_config) + convert_qwen_to_fp32_tensor(f"{prefix}.mixer.out_proj.bias", f"{renamed_prefix}.mixer.out_proj.bias", list_vars, + fout) + + # ffn GEMM + convert_to_qx_bestla_tensor(f"{prefix}.mlp.fc1.weight", f"{renamed_prefix}.mlp.fc1.weight", list_vars, fout, + quantize_config) + convert_to_qx_bestla_tensor(f"{prefix}.mlp.fc2.weight", f"{renamed_prefix}.mlp.fc2.weight", list_vars, fout, + quantize_config) + + convert_qwen_to_fp32_tensor(f"{prefix}.mlp.fc1.bias", f"{renamed_prefix}.mlp.fc1.bias", list_vars, fout) + convert_qwen_to_fp32_tensor(f"{prefix}.mlp.fc2.bias", f"{renamed_prefix}.mlp.fc2.bias", list_vars, fout) + + fout.close() + print(f"Success! 
saved as {out_path}") + + +def main(args_in: Optional[List[str]] = None) -> None: + parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") + parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", + choices=["huggingface", "modelscope"], + default="huggingface", + help="hub to load model") + parser.add_argument("model", type=Path, help="directory containing model file") + args = parser.parse_args(args_in) + + out_path = args.outfile.as_posix() + model_path = args.model.as_posix() + + model, hparams, quantize_config = load_quantized_safetensors(model_path) + + if hparams['model_type'] == "phi": + convert_phi1_5_gptq_to_bestTLA(model_path, out_path, args.outtype, model, hparams, quantize_config) + elif hparams['model_type'] == "phi-msft": + convert_phi2_gptq_to_bestTLA(model_path, out_path, args.outtype, model, hparams, quantize_config) + + +if __name__ == '__main__': + main() diff --git a/neural_speed/convert/convert_quantized_qwen.py b/neural_speed/convert/convert_quantized_qwen.py index b57238862..02ded7622 100644 --- a/neural_speed/convert/convert_quantized_qwen.py +++ b/neural_speed/convert/convert_quantized_qwen.py @@ -21,115 +21,28 @@ import re import argparse from common import * - - -def convert_to_qx_bestla_tensor(src_name, dst_name, model, fout, q_config): - # unpack weight and repack into 3bits / 4bits BestLA format - import neural_speed.llama_cpp as cpp_model - if ".weight" in src_name: - src_name = src_name.replace(".weight", "") - qzeros = model[f"{src_name}.qzeros"] - zeros = qzeros_to_zeros(qzeros) - scales = model[f"{src_name}.scales"] - qweight = model[f"{src_name}.qweight"] - - int_weight, gptq_scales, gptq_zeros = unpack_weight(qweight, scales, qzeros, q_config) - int_weight = int_weight.view(-1, int_weight.shape[-1]) - - # shuffle weight in GPTQ when act order is on - if 'desc_act' in q_config and q_config['desc_act']: - g_idx = model[f"{src_name}.g_idx"] - int_weight2 = int_weight.clone() - group_size = q_config['group_size'] - group_dict = {} - for i in range(len(g_idx)): - group_idx = g_idx[i].item() - if group_idx not in group_dict: - target_idx = group_idx * group_size - group_dict[group_idx] = 0 - else: - group_dict[group_idx] = group_dict[group_idx] + 1 - target_idx = group_idx * group_size + group_dict[group_idx] - int_weight2[target_idx] = int_weight[i] - int_weight = int_weight2 - - # shape = int_weight.shape[::-1] - shape = int_weight.shape[::-1] - # write_header(fout, shape[::-1], dst_name, GGML_QJBLAS_TYPE) - n_dims = len(shape) - str = dst_name.encode('utf-8') - fout.write(struct.pack("iii", n_dims, len(str), GGML_QJBLAS_TYPE)) - for i in range(n_dims): - fout.write(struct.pack("i", shape[n_dims - 1 - i])) - fout.write(str) - - # INC stores sig-int4 value as u4(range 0~15, they add a offset), - # BesTLA requires s4_clip((-8,7)*16), so we sub the offset and then mul 16. - # Int3 is the same as int4, but offset=4, mul scale==32. 
- weight_dtype = "int8" - if q_config['bits'] == 4: - int_weight = (int_weight - 8) * 16 - gptq_scales = gptq_scales / 16 - gptq_zeros = (gptq_zeros - 8) * 16 - weight_dtype = "int4" - elif q_config['bits'] == 3: - int_weight = (int_weight - 4) * 32 - gptq_scales = gptq_scales / 32 - gptq_zeros = (gptq_zeros - 4) * 32 - weight_dtype = "int3" - else: - ValueError(f"Unsupported q_config[bits]: {q_config['bits']}") - - dst = np.zeros((int_weight.shape[0], int_weight.shape[1] * 4), dtype=np.int8) - int_weight = np.ascontiguousarray(int_weight.numpy()) - gptq_scales = np.ascontiguousarray((gptq_scales.float()).numpy()) - if q_config['sym']: - gptq_zeros = np.empty(0, dtype=np.int8) - else: - gptq_zeros = np.ascontiguousarray(gptq_zeros.numpy()) - if 'desc_act' in q_config and q_config['desc_act']: - g_idx = np.ascontiguousarray(g_idx.numpy()) - else: - g_idx = np.empty(0, dtype=np.int32) - - # repack int weight in BesTLA format - byte_size = cpp_model.Model.np_bestla_qpack(int_weight, - gptq_scales, - gptq_zeros, - g_idx, - dst, - weight_dtype=weight_dtype, - group_size=q_config['group_size'], - alg="sym" if q_config['sym'] else "asym", - compute_dtype="int8") - dst.flatten()[:byte_size].tofile(fout) - print(f"convert_to_qx_bestla_tensor: {src_name:>40} -> {dst_name:<40} shape: {shape}, byte_size: {byte_size:<10}") +from transformers import AutoModelForCausalLM, AutoTokenizer def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], - default="huggingface", help="hub to load model") + parser.add_argument("--model_hub", + choices=["huggingface", "modelscope"], + default="huggingface", + help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) out_path = args.outfile.as_posix() model_path = args.model.as_posix() - from transformers import AutoModelForCausalLM, AutoTokenizer - # QWEN-GPTQ & AWQ model, hparams, quantize_config = load_quantized_safetensors(model_path) list_vars = model print(hparams) - # orinal QWEN - # model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # hparams = model.config.to_dict() - # list_vars = model.state_dict() - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) f = open(out_path, "wb") @@ -140,11 +53,7 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 1 # 1. 
write hparams - # 0x67676d6c is unversioned ne - # 0x67676d66 is versioned ggmf (requires token scores) ne_file_magic = 0x67676d66 - #ne_file_version = 0x00000001 # v1 - f.write(struct.pack("i", ne_file_magic)) # magic: ne in hex f.write(struct.pack("i", 1)) @@ -168,9 +77,9 @@ def main(args_in: Optional[List[str]] = None) -> None: f.write(struct.pack("i", 0)) if hparams['model_type']=='qwen2': - fout.write(struct.pack("i", hparams["intermediate_size"])) + f.write(struct.pack("i", hparams["intermediate_size"])) else: - fout.write(struct.pack("i", int(hparams["intermediate_size"]/2))) + f.write(struct.pack("i", int(hparams["intermediate_size"]/2))) f.write(struct.pack("i", 0)) f.write(struct.pack("i", 0)) # n_experts f.write(struct.pack("i", 0)) # n_expert_used @@ -182,7 +91,7 @@ def main(args_in: Optional[List[str]] = None) -> None: f.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings f.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) - if hparams['model_type']=='qwen2': + if hparams['model_type'] == 'qwen2': f.write(struct.pack("i", hparams["bos_token_id"])) f.write(struct.pack("i", hparams["eos_token_id"])) else:
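Throughout these converters, `convert_to_qx_bestla_tensor` rewrites the unpacked GPTQ tensors before handing them to `np_bestla_qpack`: for 4-bit weights it subtracts the unsigned offset 8 and multiplies by 16, shifts the zero points the same way, and divides the scales by 16. This rewrite leaves the dequantized weights unchanged. The standalone NumPy check below (toy numbers, not part of the patch) illustrates the equivalence, assuming the usual asymmetric dequantization w = scale * (q - zero).

```python
import numpy as np

# Toy GPTQ-style int4 data: unsigned weights/zeros in 0..15, one scale per group.
q = np.array([[0, 3, 7, 15]], dtype=np.int32)   # unpacked u4 weights
zero = np.array([[8]], dtype=np.int32)          # u4 zero point
scale = np.array([[0.05]], dtype=np.float32)    # per-group scale

# Original dequantization: w = scale * (q - zero)
w_ref = scale * (q - zero)

# Rewrite used by convert_to_qx_bestla_tensor for bits == 4:
q_s4 = (q - 8) * 16        # s4_clip range: (-8..7) * 16
zero_s4 = (zero - 8) * 16
scale_s4 = scale / 16

w_new = scale_s4 * (q_s4 - zero_s4)

assert np.allclose(w_ref, w_new)   # same dequantized weights either way
print(w_ref, w_new)
```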