From d230bc6a02e800c2c967516a27341424f502e7e6 Mon Sep 17 00:00:00 2001
From: kalineid
Date: Mon, 22 Jul 2024 21:32:12 +0800
Subject: [PATCH] Add support to convert from EfficientQAT/GPTQv2/exllamav2
 weights to gguf

---
 convert-hf-to-gguf-t-mac.py | 76 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 1 deletion(-)

diff --git a/convert-hf-to-gguf-t-mac.py b/convert-hf-to-gguf-t-mac.py
index 84a47e47da7f7..85155c519db70 100644
--- a/convert-hf-to-gguf-t-mac.py
+++ b/convert-hf-to-gguf-t-mac.py
@@ -1483,6 +1483,33 @@ def real_quantize_tensor(w, n_bit=8, zero_point=True, q_group_size=-1):
     return w, scales, zeros
 
 
+def unpack_gptqv2(qweight, scales, qzeros):
+    """
+    Unpack GPTQv2 tensors.
+    Return T-MAC biased uint8 weights in [0, 2 ** bits), fp16 scales, biased fp16 zeros, bits and group_size.
+    """
+    assert qweight.dtype == "int32"
+    assert qzeros.dtype == "int32"
+
+    bits = 32 // (scales.shape[1] // qzeros.shape[1])
+    K = qweight.shape[0] * (32 // bits)
+    M = qweight.shape[1]
+    group_size = K // scales.shape[0]
+
+    # Unpack qweight
+    qweights = [(qweight >> bit_offset) & ((1 << bits) - 1) for bit_offset in range(0, 32, bits)]
+    w = np.stack(qweights, axis=1).reshape(K, M).T.astype("uint8")
+
+    scales = scales.T
+
+    # Unpack qzeros
+    zeros = [(qzeros >> bit_offset) & ((1 << bits) - 1) for bit_offset in range(0, 32, bits)]
+    zeros = np.stack(zeros, axis=-1).reshape(K // group_size, M).T.astype(scales.dtype)
+    zeros = (zeros - (2 ** (bits - 1))) * scales
+
+    return w, scales, zeros, bits, group_size
+
+
 @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
@@ -1528,11 +1555,58 @@ def write_tensors(self):
         n_kv_head = self.hparams.get("num_key_value_heads")
         n_experts = self.hparams.get("num_local_experts")
         experts = dict()
+
+        quant_dict = {}
+        # Store scales and qzeros in a dict to be preprocessed later;
+        # qweight is not stored here to save memory
+        for name, data_torch in self.get_tensors():
+            if name.endswith(".scales") or name.endswith(".qzeros"):
+                data = data_torch.numpy()
+                quant_dict[name] = data
+
         for name, data_torch in self.get_tensors():
             # we don't need these
             if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                 continue
 
+            # these are converted together with qweight below
+            if name.endswith(".scales") or name.endswith(".qzeros") or name.endswith(".g_idx"):
+                continue
+
+            if name.endswith(".qweight"):
+                qweight = data_torch.numpy()
+                scales = quant_dict[name.replace(".qweight", ".scales")]
+                qzeros = quant_dict[name.replace(".qweight", ".qzeros")]
+                w, scales, zeros, bits, group_size = unpack_gptqv2(qweight, scales, qzeros)
+                if name.endswith("q_proj.qweight"):
+                    w = permute(w, n_head, n_head)
+                    scales = permute(scales, n_head, n_head)
+                    zeros = permute(zeros, n_head, n_head)
+                if name.endswith("k_proj.qweight"):
+                    w = permute(w, n_head, n_kv_head)
+                    scales = permute(scales, n_head, n_kv_head)
+                    zeros = permute(zeros, n_head, n_kv_head)
+                data_shape = w.shape
+                new_name = tensor_map.get_name(name.replace(".qweight", ".weight"), try_suffixes=(".weight", ".bias"))
+
+                if self.ftype == LlamaFType.MOSTLY_I2:
+                    assert bits == 2, "Currently only 2-bit quantized models are supported; 4-bit support will be added soon."
+                    to_dtype = gguf.GGMLQuantizationType.I2
+                    data = preprocess_for_t_mac(w, scales, zeros, bits=bits)
+                else:
+                    to_dtype = gguf.GGMLQuantizationType.F32
+                    w = w.astype("float32").reshape(-1, group_size)
+                    scales = scales.astype("float32").reshape(-1, 1)
+                    zeros = zeros.astype("float32").reshape(-1, 1)
+                    data = (w - (zeros / scales + (2 ** (bits - 1)))) * scales
+                    if self.ftype == LlamaFType.MOSTLY_F16:
+                        to_dtype = gguf.GGMLQuantizationType.F16
+                        data = data.astype("float16")
+
+                logger.info(f"{new_name}, n_dims = {data_torch.ndim}, {data_torch.dtype} --> {to_dtype.name}")
+                self.gguf_writer.add_tensor(new_name, data, raw_shape=data_shape, raw_dtype=to_dtype)
+                continue
+
             old_dtype = data_torch.dtype
 
             # convert any unsupported data types to float32
@@ -3213,7 +3287,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--model-name", type=str, default=None, help="name of the model")
     parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
    parser.add_argument("--kcfg", type=str, default="", help="Path to T-MAC kcfg.ini")
-    parser.add_argument("--quant-type", type=str, default="bitnet", choices=["bitnet", "bitdistiller"])
+    parser.add_argument("--quant-type", type=str, default="bitnet", choices=["bitnet", "bitdistiller", "gptqv2"], help="quantization scheme of the source checkpoint")
     parser.add_argument("--group-size", type=int, default=128)
     return parser.parse_args()
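
Below is a round-trip sanity check for the bit-unpacking math in
unpack_gptqv2. It is a minimal sketch, assuming GPTQ-style little-endian
packing of (32 // bits) values per int32 along the K axis; the sizes and
variable names here are hypothetical:

    import numpy as np

    bits, K, M = 2, 256, 64
    elems = 32 // bits  # values packed per int32
    vals = np.random.randint(0, 2 ** bits, size=(K, M)).astype("uint32")

    # Pack like GPTQ: each int32 of qweight holds `elems` consecutive
    # K-dim entries, lowest bits first.
    qweight = np.zeros((K // elems, M), dtype="uint32")
    for i in range(elems):
        qweight |= vals.reshape(K // elems, elems, M)[:, i, :] << (i * bits)
    qweight = qweight.view("int32")

    # Unpack with the same expressions the patch uses; the mask makes the
    # arithmetic right shift on int32 safe.
    unpacked = [(qweight >> off) & ((1 << bits) - 1) for off in range(0, 32, bits)]
    w = np.stack(unpacked, axis=1).reshape(K, M).T.astype("uint8")

    assert np.array_equal(w, vals.T.astype("uint8"))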
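The F32 fallback path in write_tensors dequantizes with
(w - (zeros / scales + 2 ** (bits - 1))) * scales. Because unpack_gptqv2
returns zeros = (z_raw - 2 ** (bits - 1)) * scales, this reduces to the plain
GPTQv2 dequantization (w - z_raw) * scales. A toy check of that identity,
with hypothetical shapes:

    import numpy as np

    bits = 2
    w = np.random.randint(0, 2 ** bits, size=(4, 128)).astype("float32")
    z_raw = np.random.randint(0, 2 ** bits, size=(4, 1)).astype("float32")
    scales = np.random.rand(4, 1).astype("float32") + 0.1

    zeros = (z_raw - 2 ** (bits - 1)) * scales               # biased zeros, as returned
    lhs = (w - (zeros / scales + 2 ** (bits - 1))) * scales  # patch's fallback path
    rhs = (w - z_raw) * scales                               # canonical GPTQv2 dequant

    assert np.allclose(lhs, rhs)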
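With this patch, a GPTQv2 (or EfficientQAT / exllamav2-exported) checkpoint
can be converted by selecting the new --quant-type choice. A hypothetical
invocation; the paths are placeholders, and --outtype i2 / --outfile follow
the upstream convert script and may differ in your checkout:

    python convert-hf-to-gguf-t-mac.py /path/to/gptqv2-model \
        --outtype i2 \
        --quant-type gptqv2 \
        --kcfg /path/to/kcfg.ini \
        --outfile /path/to/ggml-model.i2.gguf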