This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

[GPTQ Enhance] Support GPTQ & AWQ inference for QWENv1, v1.5 and Mixtral. #134

Merged
merged 25 commits into from
Mar 6, 2024
Changes from 21 commits
Commits
25 commits
d8a989e
add GPTQ Qwen
Zhenzhong1 Feb 21, 2024
7ac5689
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 21, 2024
2c31fb6
debugging
Zhenzhong1 Feb 21, 2024
3d61ea5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 21, 2024
904fb91
qwen fp32 inference ok
Zhenzhong1 Feb 21, 2024
b374934
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 21, 2024
af228cc
fixed the QWEN-GPTQ bfloat16 issue
Zhenzhong1 Feb 21, 2024
83f96bd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 21, 2024
e38116f
Merge branch 'main' into zhenzhong/QwenQuantize
Zhenzhong1 Mar 4, 2024
8be219a
qwen fp32 inference pass
Zhenzhong1 Mar 4, 2024
edaadb7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 4, 2024
e1af367
qwen GPTQ inference pass
Zhenzhong1 Mar 4, 2024
c240c64
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 4, 2024
04f1973
yapf
Zhenzhong1 Mar 4, 2024
e7ad670
Merge branch 'main' into zhenzhong/QwenQuantize
Zhenzhong1 Mar 5, 2024
cf82e89
mixtral GPTQ done & fixed mistral issues
Zhenzhong1 Mar 5, 2024
a4c5641
cleancode
Zhenzhong1 Mar 5, 2024
b4ad961
AWQ pass
Zhenzhong1 Mar 5, 2024
3fa8f7d
clang-format
Zhenzhong1 Mar 5, 2024
26a04d4
update doc
Zhenzhong1 Mar 5, 2024
6707336
QWEN2-GPTQ pass
Zhenzhong1 Mar 6, 2024
457fdc5
update docs
Zhenzhong1 Mar 6, 2024
98b5358
removed qwen2 scripts & clang-format
Zhenzhong1 Mar 6, 2024
3d9a9bd
fixed qwen convert issues.
Zhenzhong1 Mar 6, 2024
b817c56
fixed qwen convert issues.
Zhenzhong1 Mar 6, 2024
9 changes: 5 additions & 4 deletions docs/gptq_and_awq.md
@@ -6,11 +6,12 @@ Neural Speed supports multiple weight-only quantization algorithms, such as GPTQ
More algorithm details please check [GPTQ](https://arxiv.org/abs/2210.17323) and [AWQ](https://arxiv.org/abs/2306.00978).

Validated GPTQ & AWQ models directly from the HuggingFace:
* [Llama-2-7B-Chat-GPT](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ) & [Llama-2-13B-Chat-GPT](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ)
* [CodeLlama-7B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) & [CodeLlama-13B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GPTQ)
* [Llama-2-7B-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ) & [Llama-2-13B-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-13B-Chat-GPTQ) & [Llama-2-7B-AWQ](https://huggingface.co/TheBloke/Llama-2-7B-AWQ) & [Llama-2-13B-chat-AWQ](https://huggingface.co/TheBloke/Llama-2-13B-chat-AWQ)
* [CodeLlama-7B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) & [CodeLlama-13B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GPTQ) & [CodeLlama-7B-AWQ](https://huggingface.co/TheBloke/CodeLlama-7B-AWQ) & [CodeLlama-13B-AWQ](https://huggingface.co/TheBloke/CodeLlama-13B-AWQ)
* [Mistral-7B-Instruct-v0.1-GPTQ](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GPTQ) & [Mistral-7B-Instruct-v0.1-AWQ](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GPTQ)
* [Mixtral-8x7B-Instruct-v0.1-GPTQ](https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ) & [Mixtral-8x7B-Instruct-v0.1-AWQ](https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ)
* [Qwen-7B-Chat-GPTQ](https://huggingface.co/TheBloke/Qwen-7B-Chat-GPTQ) & [Qwen-7B-Chat-AWQ](https://huggingface.co/TheBloke/Qwen-7B-Chat-AWQ)
* [SOLAR-10.7B-v1.0-GPTQ](https://huggingface.co/TheBloke/SOLAR-10.7B-v1.0-GPTQ)
* [Llama-2-7B-AWQ](https://huggingface.co/TheBloke/Llama-2-7B-AWQ) & [Llama-2-13B-chat-AWQ](https://huggingface.co/TheBloke/Llama-2-13B-chat-AWQ)
* [CodeLlama-7B-AWQ](https://huggingface.co/TheBloke/CodeLlama-7B-AWQ) & [CodeLlama-13B-AWQ](https://huggingface.co/TheBloke/CodeLlama-13B-AWQ)

Please check more validated GPTQ & AWQ models in the list of [supported_models](./docs/supported_models.md).
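Editor's note: for context on how these validated checkpoints are consumed, a minimal inference sketch in the spirit of this doc follows. The `Model.init()` switches (`use_gptq`, `use_awq`) and the exact flag names are assumptions on my part and are not confirmed by the hunk shown here.

```python
# Minimal sketch, assuming neural_speed exposes Model.init() with GPTQ/AWQ switches;
# treat the flag names below as hypothetical.
from transformers import AutoTokenizer, TextStreamer
from neural_speed import Model

model_name = "TheBloke/Llama-2-7B-Chat-GPTQ"  # any validated GPTQ/AWQ repo from the list above
prompt = "Once upon a time, there existed a little girl,"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)

model = Model()
# The GPTQ checkpoint is unpacked and repacked into Neural Speed's int4 format on load;
# an AWQ checkpoint would use the corresponding use_awq switch instead.
model.init(model_name, weight_dtype="int4", compute_dtype="int8", use_gptq=True)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=100)
```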

16 changes: 8 additions & 8 deletions docs/supported_models.md
@@ -76,12 +76,12 @@ Neural Speed supports the following models:
<tr>
<td><a href="https://huggingface.co/EleutherAI/gpt-j-6b" target="_blank" rel="noopener noreferrer">GPT-J-6B</a></td>
<td>✅</td>
<td> </td>
<td> </td>
<td> </td>
<td>✅</td>
<td>✅</td>
<td> </td>
<td> </td>
<td>✅</td>
<td>✅</td>
<td>✅</td>
<td> </td>
<td>Latest</td>
</tr>
@@ -204,12 +204,12 @@ Neural Speed supports the following models:
<td><a href="https://huggingface.co/mistralai/Mistral-7B-v0.1" target="_blank" rel="noopener noreferrer">Mistral-7B</a>,
<a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1" target="_blank" rel="noopener noreferrer">Mixtral-8x7B</a></td>
<td>✅</td>
<td> </td>
<td> </td>
<td> </td>
<td>✅</td>
<td>✅</td>
<td> </td>
<td> </td>
<td>✅</td>
<td>✅</td>
<td>✅</td>
<td> </td>
<td>4.36.0 or newer</td>
</tr>
2 changes: 1 addition & 1 deletion neural_speed/__init__.py
@@ -66,7 +66,7 @@ def __import_package(self, model_type):
import neural_speed.qwen_cpp as cpp_model
elif model_type == "mistral":
import neural_speed.mistral_cpp as cpp_model
elif model_type == "qwen":
elif model_type == "qwen2":
import neural_speed.qwen_cpp as cpp_model
elif model_type == "phi":
import neural_speed.phi_cpp as cpp_model
110 changes: 75 additions & 35 deletions neural_speed/convert/common.py
@@ -16,13 +16,14 @@
# limitations under the License.

import torch
import os
from pathlib import Path
import numpy as np
import struct
import json
import warnings
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List,
Literal, Optional, Sequence, Tuple, TypeVar, Union)
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
from sentencepiece import SentencePieceProcessor # type: ignore

GGML_QK8_0 = 32
@@ -35,6 +36,7 @@
GGML_QK4_1_TYPE = 3
GGML_QJBLAS_TYPE = 19


# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
"""
@@ -57,6 +59,7 @@ def bytes_to_unicode():
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))


def quantize_q4_0(tensor: torch.Tensor) -> torch.CharTensor:
# equivalent to ggml_quantize_q4_0 in ggml.c
assert tensor.shape[1] % GGML_QK4_0 == 0
@@ -71,6 +74,7 @@ def quantize_q4_0(tensor: torch.Tensor) -> torch.CharTensor:
tensor = torch.cat((scale.half().view(torch.int8), tensor), dim=-1)
return tensor
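Editor's note: most of `quantize_q4_0` is collapsed in this view, so here is a self-contained round trip of the q4_0 block layout it writes (one fp16 scale stored as two int8 bytes, followed by 16 bytes carrying two 4-bit codes each). This is an illustration only; the scale convention is assumed from ggml's q4_0, and all names are mine.

```python
# Illustrative q4_0 round trip for a single 32-value block (not part of this PR).
import torch

QK4_0 = 32
x = torch.randn(1, QK4_0)

# Scale chosen so the max-magnitude value maps to code -8 (assumed ggml q4_0 convention).
amax_idx = x.abs().max(dim=-1, keepdim=True).indices
scale = torch.take_along_dim(x, amax_idx, dim=-1) / -8.0
q = (x / scale + 8).round().clamp(0, 15).char()              # 4-bit codes in [0, 15]
packed = q[:, :16] | (q[:, 16:] << 4)                        # two codes per int8 byte
block = torch.cat((scale.half().view(torch.int8), packed), dim=-1)  # 2 scale bytes + 16 data bytes

# Decode the block again: split the scale and the two nibbles back out.
dec_scale = block[:, :2].contiguous().view(torch.float16).float()
lo = (block[:, 2:] & 0x0F).float() - 8                       # first 16 values
hi = ((block[:, 2:] >> 4) & 0x0F).float() - 8                # last 16 values
x_hat = torch.cat((lo, hi), dim=-1) * dec_scale
print((x - x_hat).abs().max())                               # small quantization error
```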


def quantize_q4_1(tensor: torch.Tensor) -> torch.CharTensor:
# equivalent to ggml_quantize_q4_1 in ggml.c
assert tensor.shape[1] % GGML_QK4_1 == 0
@@ -85,6 +89,7 @@ def quantize_q4_1(tensor: torch.Tensor) -> torch.CharTensor:
tensor = torch.cat((scale.half().view(torch.int8), min_vals.half().view(torch.int8), tensor), dim=-1)
return tensor


class SentencePieceVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
@@ -154,8 +159,7 @@ def load_vocab(path: Path) -> SentencePieceVocab:
else:
raise FileNotFoundError(
f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, \
pass the directory as --vocab-dir"
)
pass the directory as --vocab-dir")
added_tokens_path = path.parent / "added_tokens.json"
print(f"Loading vocab file {path}")
return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
@@ -192,6 +196,7 @@ def qzeros_to_zeros(qzeros, bits=4):
col += 1
return zeros


def unpack_weight(qweight, scales, qzeros, q_config):
if "quant_method" not in q_config:
raise ValueError(f"Unsupported q_config without quant_method: {q_config}")
@@ -220,17 +225,18 @@ def unpack_gptq_weight_4bits(qweight, scales, qzeros, q_config):
wf = torch.tensor(list(range(0, s32_bits, bits)), dtype=torch.int32).unsqueeze(0)
zeros = torch.bitwise_right_shift(torch.unsqueeze(qzeros, 2).expand(-1, -1, 32 // bits),
wf.unsqueeze(0)).to(torch.int16 if bits == 8 else torch.int8)
torch.bitwise_and(zeros, (2 ** bits) - 1, out=zeros)
torch.bitwise_and(zeros, (2**bits) - 1, out=zeros)

zeros = zeros + 1
zeros = zeros.reshape(scales.shape)

weight = torch.bitwise_right_shift(torch.unsqueeze(qweight, 1).expand(-1, 32 // bits, -1),
wf.unsqueeze(-1)).to(torch.int16 if bits == 8 else torch.int8)
torch.bitwise_and(weight,(2 ** bits) - 1, out=weight)
torch.bitwise_and(weight, (2**bits) - 1, out=weight)

return weight, scales, zeros
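Editor's note: the shift-and-mask pattern above is easier to see on a toy example. The sketch below packs eight 4-bit codes into one int32 the way GPTQ `qweight`/`qzeros` tensors are laid out, then recovers them with the same `bitwise_right_shift` / `bitwise_and` pair; data and names are illustrative.

```python
# Pack eight 4-bit codes into a single int32 lane, then unpack them GPTQ-style.
import numpy as np
import torch

bits = 4
codes = np.arange(8, dtype=np.uint32) + 8                 # eight 4-bit codes: 8..15
packed = np.uint32(0)
for i, c in enumerate(codes):
    packed |= c << np.uint32(bits * i)                    # code i occupies bits [4i, 4i+4)

# GPTQ stores such lanes as signed int32; reinterpret the bit pattern accordingly.
qweight = torch.from_numpy(np.array([packed], dtype=np.uint32).view(np.int32))

# Shift each 4-bit lane down, then mask to 4 bits (mirrors unpack_gptq_weight_4bits).
wf = torch.arange(0, 32, bits, dtype=torch.int32)         # [0, 4, 8, ..., 28]
unpacked = torch.bitwise_right_shift(qweight.unsqueeze(-1), wf)
unpacked = torch.bitwise_and(unpacked, (2**bits) - 1)

print(unpacked.flatten().tolist())                        # [8, 9, 10, 11, 12, 13, 14, 15]
```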


def unpack_gptq_weight_3bits(qweight, scales, qzeros, q_config):
print("unpack_gptq_weight_3bits... ", end='')
group_size = q_config['group_size']
@@ -239,23 +245,23 @@ def unpack_gptq_weight_3bits(qweight, scales, qzeros, q_config):

assert bits == 3
# Int32 can only store 10 * 3bits data. This is the offset for each data.
wf = torch.tensor([[ i for i in range(0, s32_bits - bits, bits)]], dtype=torch.int32)
wf = torch.tensor([[i for i in range(0, s32_bits - bits, bits)]], dtype=torch.int32)
zeros = torch.bitwise_right_shift(torch.unsqueeze(qzeros, 2).expand(-1, -1, 32 // bits),
wf.unsqueeze(0)).to(torch.int16 if bits == 8 else torch.int8)
torch.bitwise_and(zeros, (2 ** bits) - 1, out=zeros)
torch.bitwise_and(zeros, (2**bits) - 1, out=zeros)

zeros = zeros + 1
zeros = zeros.reshape(zeros.shape[0], -1)
zeros = zeros[:,:scales.shape[1]]
zeros = zeros[:, :scales.shape[1]]

weight = torch.bitwise_right_shift(torch.unsqueeze(qweight, 1).expand(-1, 32 // bits, -1),
wf.unsqueeze(-1)).to(torch.int16 if bits == 8 else torch.int8)

weight = weight.reshape(-1, weight.shape[-1])
input_feature = group_size * scales.shape[0]
weight = weight[:input_feature,:]
weight = weight[:input_feature, :]

torch.bitwise_and(weight,(2 ** bits) - 1, out=weight)
torch.bitwise_and(weight, (2**bits) - 1, out=weight)

return weight, scales, zeros

Expand All @@ -271,12 +277,13 @@ def unpack_awq_weight(qweight, scales, qzeros, q_config):
for col in range(qweight.shape[1]):
for i in range(pack_num):
w_col = torch.bitwise_right_shift(qweight[:, col], 4 * order_map[i])
weight[:, col * pack_num + i] = torch.bitwise_and(w_col, (2 ** bits) - 1)
weight[:, col * pack_num + i] = torch.bitwise_and(w_col, (2**bits) - 1)
z_col = torch.bitwise_right_shift(qzeros[:, col], 4 * order_map[i])
zeros[:, col * pack_num + i] = torch.bitwise_and(z_col, (2 ** bits) - 1)
zeros[:, col * pack_num + i] = torch.bitwise_and(z_col, (2**bits) - 1)

return weight, scales, zeros


def write_header(fout, shape, dst_name, ftype_cur):
sname = dst_name.encode('utf-8')
fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
@@ -295,6 +302,31 @@ def find_quantized_model_file(model_path):
print(f"Detected model file {found[0]}")
return str(found[0])


def load_quantized_safetensors(model_path):
# load GPTQ & AWQ models, only for safetensors
from safetensors.torch import load_file
safetensors = []
for file in os.listdir(model_path):
if file.endswith(".safetensors"):
safetensors.append(file)

print(f"safetensors list = {safetensors}")
model = {}
for file in safetensors:
tmp = load_file(model_path + "/" + file)
if isinstance(tmp, dict):
model.update(tmp)

with open(model_path + '/config.json', "r", encoding="utf-8") as f:
config = json.load(f)

quantize_config = config["quantization_config"]
if "zero_point" in quantize_config:
quantize_config["sym"] = not quantize_config["zero_point"]
return model, config, config["quantization_config"]
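Editor's note: a possible way to exercise the new loader, with hypothetical paths and tensor names. The directory is assumed to hold `*.safetensors` shards plus a `config.json` carrying a `quantization_config` section, as GPTQ/AWQ checkpoints on the Hub do.

```python
# Hypothetical usage of load_quantized_safetensors (directory and tensor names are examples).
from neural_speed.convert.common import load_quantized_safetensors

model_path = "./Qwen-7B-Chat-GPTQ"  # e.g. a local snapshot of a GPTQ/AWQ checkpoint
model, config, quantize_config = load_quantized_safetensors(model_path)

print(quantize_config["bits"], quantize_config["group_size"], quantize_config.get("sym"))
qweight = model["transformer.h.0.attn.c_attn.qweight"]  # illustrative Qwen v1 tensor name
print(qweight.dtype, qweight.shape)                      # packed int32 weights
```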


def load_quantized_model(model_path):
input_path = find_quantized_model_file(model_path)
model = None
@@ -318,8 +350,9 @@ def load_quantized_model(model_path):
def convert_to_fp32_tensor(src_name, dst_name, model, fout):
v = model[src_name]
shape = v.shape
# print("Processing non-Q4 variable: " + src_name +
# " with shape: ", shape, " and type: ", v.dtype)
n_dims = len(shape)
print("Processing non-Q4 variable: " + src_name + " -> " + dst_name + " with shape: ", shape, " and type: ",
v.dtype, "data: ", v[:2, :2].tolist() if n_dims > 1 else v[:2].tolist())
v = v.to(torch.float32)

ftype_cur = {torch.float16: 1, torch.float32: 0}[v.dtype]
@@ -329,7 +362,8 @@ def convert_to_fp32_tensor(src_name, dst_name, model, fout):

# data
v.numpy().tofile(fout)
print(f"converting {dst_name} float tensor")
#print(f"converting {src_name} -> {dst_name} float tensor")


def convert_q4_tensor(src_name, dst_name, model, fout, q_config, n_head, n_head2=0, permute_func=None):
qzeros = model[f"{src_name}.qzeros"]
@@ -338,9 +372,9 @@ def convert_q4_tensor(src_name, dst_name, model, fout, q_config, n_head, n_head2
qweight = model[f"{src_name}.qweight"]
int_weight, gptq_scales, gptq_zeros = unpack_weight(qweight, scales, qzeros, q_config)

int_weight = int_weight.view(-1,int_weight.shape[-1]).t()
gptq_scales = gptq_scales.view(-1,gptq_scales.shape[-1]).t()
gptq_zeros = gptq_zeros.view(-1,gptq_zeros.shape[-1]).t()
int_weight = int_weight.view(-1, int_weight.shape[-1]).t()
gptq_scales = gptq_scales.view(-1, gptq_scales.shape[-1]).t()
gptq_zeros = gptq_zeros.view(-1, gptq_zeros.shape[-1]).t()

write_header(fout, int_weight.shape, dst_name, 2)
if permute_func:
Expand All @@ -350,12 +384,13 @@ def convert_q4_tensor(src_name, dst_name, model, fout, q_config, n_head, n_head2

tensor = int_weight.reshape(-1, 32) - 8
tensor = tensor[:, :16] | (tensor[:, 16:] << 4)
gptq_scale = gptq_scales.reshape(-1,1)
gptq_scale = gptq_scales.reshape(-1, 1)
# gptq_scale = torch.cat([gptq_scale,gptq_scale,gptq_scale,gptq_scale], dim=1).view(-1,1)
pack_tensor = torch.cat((gptq_scale.half().view(torch.int8), tensor), dim=-1)
pack_tensor.numpy().tofile(fout)
print(f"converting {dst_name} quantized tensor to ggml q4 block")


def convert_q4_1_tensor(src_name, dst_name, model, fout, q_config, n_head, n_head2=0, permute_func=None):
qzeros = model[f"{src_name}.qzeros"]
zeros = qzeros_to_zeros(qzeros)
@@ -364,9 +399,9 @@ def convert_q4_1_tensor(src_name, dst_name, model, fout, q_config, n_head, n_hea
qweight = model[f"{src_name}.qweight"]
int_weight, gptq_scales, gptq_zeros = unpack_weight(qweight, scales, qzeros, q_config)

int_weight = int_weight.view(-1,int_weight.shape[-1]).t()
gptq_scales = gptq_scales.view(-1,gptq_scales.shape[-1]).t()
gptq_zeros = gptq_zeros.view(-1,gptq_zeros.shape[-1]).t()
int_weight = int_weight.view(-1, int_weight.shape[-1]).t()
gptq_scales = gptq_scales.view(-1, gptq_scales.shape[-1]).t()
gptq_zeros = gptq_zeros.view(-1, gptq_zeros.shape[-1]).t()

write_header(fout, int_weight.shape, dst_name, 3)
if permute_func:
@@ -376,9 +411,9 @@ def convert_q4_1_tensor(src_name, dst_name, model, fout, q_config, n_head, n_hea

tensor = int_weight.reshape(-1, 32)
tensor = tensor[:, :16] | (tensor[:, 16:] << 4)
gptq_scale = gptq_scales.reshape(-1,1)
gptq_zeros = gptq_zeros.reshape(-1,1)
gptq_zeros = -gptq_scale*gptq_zeros
gptq_scale = gptq_scales.reshape(-1, 1)
gptq_zeros = gptq_zeros.reshape(-1, 1)
gptq_zeros = -gptq_scale * gptq_zeros
pack_tensor = torch.cat((gptq_scale.half().view(torch.int8), gptq_zeros.half().view(torch.int8), tensor), dim=-1)
pack_tensor.numpy().tofile(fout)
print(f"converting {dst_name} quantized tensor to ggml q4 1 block")
@@ -413,6 +448,7 @@ def convert_q4_f32_tensor(src_name, dst_name, model, fout, q_config, n_head, n_h

print(f"converting {dst_name} quantized tensor to fp32 tensor")


def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head, n_head_kv=0, permute_func=None):
# unpack weight and repack into jblas format
import neural_speed.llama_cpp as cpp_model
@@ -422,7 +458,7 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head,
qweight = model[f"{src_name}.qweight"]

int_weight, gptq_scales, gptq_zeros = unpack_weight(qweight, scales, qzeros, q_config)
int_weight = int_weight.view(-1,int_weight.shape[-1])
int_weight = int_weight.view(-1, int_weight.shape[-1])

# permute_func for llama-like model
if permute_func:
Expand All @@ -431,10 +467,10 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head,
gptq_zeros = permute_func(gptq_zeros.t(), n_head, n_head_kv).t().contiguous()

# shuffle weight in GPTQ when act order is on
if 'desc_act'in q_config and q_config['desc_act']:
if 'desc_act' in q_config and q_config['desc_act']:
g_idx = model[f"{src_name}.g_idx"]
int_weight2 = int_weight.clone()
group_size=q_config['group_size']
group_size = q_config['group_size']
group_dict = {}
for i in range(len(g_idx)):
group_idx = g_idx[i].item()
@@ -461,16 +497,20 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head,
gptq_zeros = np.empty(0, dtype=np.int8)
else:
gptq_zeros = np.ascontiguousarray(gptq_zeros.numpy())
if 'desc_act'in q_config and q_config['desc_act']:
if 'desc_act' in q_config and q_config['desc_act']:
g_idx = np.ascontiguousarray(g_idx.numpy())
else:
g_idx = np.empty(0, dtype=np.int32)

# pack int weight in bestla format
byte_size = cpp_model.Model.np_bestla_qpack(int_weight, gptq_scales, gptq_zeros, g_idx, dst,
weight_dtype="int4" if q_config['bits'] == 4 else "int8",
group_size=q_config['group_size'],
alg="sym" if q_config['sym'] else "asym",
compute_dtype="int8")
byte_size = cpp_model.Model.np_bestla_qpack(int_weight,
gptq_scales,
gptq_zeros,
g_idx,
dst,
weight_dtype="int4" if q_config['bits'] == 4 else "int8",
group_size=q_config['group_size'],
alg="sym" if q_config['sym'] else "asym",
compute_dtype="int8")
dst.flatten()[:byte_size].tofile(fout)
print(f"converting {dst_name} qauntized tensor to bestla q4 block")
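Editor's note: the act-order branch above is partially collapsed in this view. The idea is that with `desc_act` enabled, `g_idx[i]` records which quantization group row `i` belongs to, and rows are gathered back into contiguous `group_size` blocks so scales and zeros line up group by group before `np_bestla_qpack` is called. The toy sketch below is my reconstruction of that bookkeeping, not the exact elided code.

```python
# Toy sketch of g_idx-based regrouping when desc_act is on (illustrative only).
import torch

group_size = 4
g_idx = torch.tensor([1, 0, 0, 1, 2, 2, 0, 1, 2, 0, 1, 2])  # group id per row (toy data)
int_weight = torch.arange(len(g_idx)).unsqueeze(-1)          # 12 "rows", 1 column each

reordered = int_weight.clone()
fill_count = {}                                              # next free slot within each group
for i, g in enumerate(g_idx.tolist()):
    slot = fill_count.get(g, 0)
    fill_count[g] = slot + 1
    reordered[g * group_size + slot] = int_weight[i]

print(reordered.flatten().tolist())
# -> [1, 2, 6, 9, 0, 3, 7, 10, 4, 5, 8, 11]: group 0 rows first, then group 1, then group 2
```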
3 changes: 0 additions & 3 deletions neural_speed/convert/convert_quantized_mistral.py
@@ -161,9 +161,6 @@ def main(args_in: Optional[List[str]] = None) -> None:
f.write(struct.pack("f", config["rms_norm_eps"]))
f.write(struct.pack("f", config["rope_theta"] if "rope_theta" in config else 10000))
f.write(struct.pack("f", rope_scale))
f.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
f.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
f.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0))

f.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
f.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings