diff --git a/neural_speed/convert/convert_mistral.py b/neural_speed/convert/convert_mistral.py
index ccc65bad7..a440f529d 100644
--- a/neural_speed/convert/convert_mistral.py
+++ b/neural_speed/convert/convert_mistral.py
@@ -317,7 +317,7 @@ def __repr__(self) -> str:
 
 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
     if n_head_kv is not None and n_head != n_head_kv:
-        n_head //= n_head_kv
+        n_head = n_head_kv
     return (weights.reshape(n_head_kv, 2, weights.shape[0] // n_head_kv // 2,
                             *weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape))
 
diff --git a/neural_speed/convert/convert_quantized_gptj.py b/neural_speed/convert/convert_quantized_gptj.py
index 5ce8f863e..059522398 100644
--- a/neural_speed/convert/convert_quantized_gptj.py
+++ b/neural_speed/convert/convert_quantized_gptj.py
@@ -23,12 +23,6 @@
 from transformers import AutoTokenizer
 
 
-def permute_func(weights, n_head: int, n_head_kv: int):
-    if n_head_kv is not None and n_head != n_head_kv:
-        n_head //= n_head_kv
-    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2,
-                            *weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape))
-
 def convert_to_qx_bestla_tensor(src_name, dst_name, model, fout, q_config):
     # unpack weight and repack into 3bits / 4bits BestLA format
     import neural_speed.llama_cpp as cpp_model
diff --git a/neural_speed/convert/convert_quantized_mistral.py b/neural_speed/convert/convert_quantized_mistral.py
index 8e154a8ca..7e25bd22b 100644
--- a/neural_speed/convert/convert_quantized_mistral.py
+++ b/neural_speed/convert/convert_quantized_mistral.py
@@ -22,12 +22,84 @@
 import argparse
 from common import *
 
+
 def permute_func(weights, n_head: int, n_head_kv: int):
     if n_head_kv is not None and n_head != n_head_kv:
         n_head = n_head_kv
-    return (weights.reshape(n_head_kv, 2, weights.shape[0] // n_head_kv // 2, *weights.shape[1:])
-            .swapaxes(1, 2)
-            .reshape(weights.shape))
+    return (weights.reshape(n_head_kv, 2, weights.shape[0] // n_head_kv // 2,
+                            *weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape))
+
+
+def convert_to_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head, n_head_kv=0, permute_func=None):
+    # unpack weight and repack into jblas format
+    import neural_speed.llama_cpp as cpp_model
+    if ".weight" in src_name:
+        src_name = src_name.replace(".weight", "")
+    qzeros = model[f"{src_name}.qzeros"]
+    zeros = qzeros_to_zeros(qzeros)
+    scales = model[f"{src_name}.scales"]
+    qweight = model[f"{src_name}.qweight"]
+
+    int_weight, gptq_scales, gptq_zeros = unpack_weight(qweight, scales, qzeros, q_config)
+    int_weight = int_weight.view(-1, int_weight.shape[-1])
+
+    # shuffle weight in GPTQ when act order is on
+    if 'desc_act' in q_config and q_config['desc_act']:
+        g_idx = model[f"{src_name}.g_idx"]
+        int_weight2 = int_weight.clone()
+        group_size = q_config['group_size']
+        group_dict = {}
+        for i in range(len(g_idx)):
+            group_idx = g_idx[i].item()
+            if group_idx not in group_dict:
+                target_idx = group_idx * group_size
+                group_dict[group_idx] = 0
+            else:
+                group_dict[group_idx] = group_dict[group_idx] + 1
+                target_idx = group_idx * group_size + group_dict[group_idx]
+            int_weight2[target_idx] = int_weight[i]
+        int_weight = int_weight2
+
+    # permute_func for llama-like model
+    if permute_func:
+        int_weight = permute_func(int_weight.t(), n_head, n_head_kv).t().contiguous()
+        gptq_scales = permute_func(gptq_scales.t(), n_head, n_head_kv).t().contiguous()
+        gptq_zeros = permute_func(gptq_zeros.t(), n_head, n_head_kv).t().contiguous()
+
+    shape = int_weight.shape
+    write_header(fout, shape[::-1], dst_name, GGML_QJBLAS_TYPE)
+
+    weight_dtype = "int8"
+    if q_config['bits'] == 4:
+        int_weight = (int_weight - 8) * 16
+        gptq_scales = gptq_scales / 16
+        gptq_zeros = (gptq_zeros - 8) * 16
+        weight_dtype = "int4"
+
+    dst = np.zeros((int_weight.shape[0], int_weight.shape[1] * 4), dtype=np.int8)
+    int_weight = np.ascontiguousarray(int_weight.numpy())
+    gptq_scales = np.ascontiguousarray((gptq_scales.float()).numpy())
+    if q_config['sym']:
+        gptq_zeros = np.empty(0, dtype=np.int8)
+    else:
+        gptq_zeros = np.ascontiguousarray(gptq_zeros.numpy())
+    if 'desc_act' in q_config and q_config['desc_act']:
+        g_idx = np.ascontiguousarray(g_idx.numpy())
+    else:
+        g_idx = np.empty(0, dtype=np.int32)
+
+    # pack int weight in bestla format
+    byte_size = cpp_model.Model.np_bestla_qpack(int_weight,
+                                                gptq_scales,
+                                                gptq_zeros,
+                                                g_idx,
+                                                dst,
+                                                weight_dtype=weight_dtype,
+                                                group_size=q_config['group_size'],
+                                                alg="sym" if q_config['sym'] else "asym",
+                                                compute_dtype="int8")
+    dst.flatten()[:byte_size].tofile(fout)
+    print(f"converting {dst_name} quantized tensor to bestla q4 block")
 
 
 def main(args_in: Optional[List[str]] = None) -> None:
@@ -49,13 +121,14 @@ def main(args_in: Optional[List[str]] = None) -> None:
     n_layer = config["num_hidden_layers"]
     n_head = config["num_attention_heads"]
     ffn_hidden_size = config["intermediate_size"]
-    rope_scale = 1
-    if "rope_scaling" in config and config["rope_scaling"] is not None:
-        rope_scale = config["rope_scaling"]["factor"] if "factor" in config["rope_scaling"] else 1
+
     # hardcoded:
     n_mult = 256
     # 1. write head and params
     f.write(b"ggjt"[::-1])  # magic
+    rope_scale = 1
+    if "rope_scaling" in config and config["rope_scaling"] is not None:
+        rope_scale = config["rope_scaling"]["factor"] if "factor" in config["rope_scaling"] else 1
 
     n_head = n_head
     n_head_kv = 8
@@ -63,12 +136,12 @@ def main(args_in: Optional[List[str]] = None) -> None:
         1,  # file version
         n_vocab,
         n_embd,
-        256, #hparams.n_mult,
+        256,  #hparams.n_mult,
         n_head,
-        n_head_kv, # n_head_kv (multi_query attention)
+        n_head_kv,  # n_head_kv (multi_query attention)
         n_layer,
         n_embd // n_head,  # rot (obsolete)
-        0, #file_type.value, # TODO
+        0,  #file_type.value, # TODO
     ]
     f.write(struct.pack("i" * len(values), *values))
 
@@ -92,6 +165,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
 
     f.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
     f.write(struct.pack("i", 0))  # params["rope_scaling"]["type"] =="yarn" else 0))
+    f.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
+    f.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
+    f.write(struct.pack("i", 0))  # params["rope_scaling"]["type"] =="yarn" else 0))
+
     # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
     # but bos_token_id = 1 in llama.cpp
     f.write(struct.pack("i", 1))
@@ -115,31 +192,41 @@ def main(args_in: Optional[List[str]] = None) -> None:
     convert_to_fp32_tensor("lm_head.weight", "output.weight", list_vars, f)
 
     for i in range(n_layer):
-        convert_q4_bestla_tensor(f"model.layers.{i}.self_attn.q_proj",
-                                 f"layers.{i}.attention.wq.weight", list_vars, f, quantize_config, n_head, n_head,
-                                 permute_func=permute_func)
-        convert_q4_bestla_tensor(f"model.layers.{i}.self_attn.k_proj",
-                                 f"layers.{i}.attention.wk.weight", list_vars, f, quantize_config, n_head, n_head_kv,
-                                 permute_func=permute_func)
-        convert_q4_bestla_tensor(f"model.layers.{i}.self_attn.v_proj",
-                                 f"layers.{i}.attention.wv.weight", list_vars, f, quantize_config, n_head)
-        convert_q4_bestla_tensor(f"model.layers.{i}.self_attn.o_proj",
-                                 f"layers.{i}.attention.wo.weight", list_vars, f, quantize_config, n_head)
-        convert_q4_bestla_tensor(f"model.layers.{i}.mlp.gate_proj",
-                                 f"layers.{i}.feed_forward.w1.weight", list_vars, f, quantize_config, n_head)
-        convert_q4_bestla_tensor(f"model.layers.{i}.mlp.down_proj",
-                                 f"layers.{i}.feed_forward.w2.weight", list_vars, f, quantize_config, n_head)
-        convert_q4_bestla_tensor(f"model.layers.{i}.mlp.up_proj",
-                                 f"layers.{i}.feed_forward.w3.weight", list_vars, f, quantize_config, n_head)
-
-        convert_to_fp32_tensor(f"model.layers.{i}.input_layernorm.weight",
-                               f"layers.{i}.attention_norm.weight", list_vars, f)
-        convert_to_fp32_tensor(f"model.layers.{i}.post_attention_layernorm.weight",
-                               f"layers.{i}.ffn_norm.weight", list_vars, f)
-
+        convert_to_q4_bestla_tensor(f"model.layers.{i}.self_attn.q_proj",
+                                    f"layers.{i}.attention.wq.weight",
+                                    list_vars,
+                                    f,
+                                    quantize_config,
+                                    n_head,
+                                    n_head,
+                                    permute_func=permute_func)
+        convert_to_q4_bestla_tensor(f"model.layers.{i}.self_attn.k_proj",
+                                    f"layers.{i}.attention.wk.weight",
+                                    list_vars,
+                                    f,
+                                    quantize_config,
+                                    n_head,
+                                    n_head_kv,
+                                    permute_func=permute_func)
+        convert_to_q4_bestla_tensor(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight", list_vars,
+                                    f, quantize_config, n_head)
+        convert_to_q4_bestla_tensor(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight", list_vars,
+                                    f, quantize_config, n_head)
+        convert_to_q4_bestla_tensor(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight", list_vars,
+                                    f, quantize_config, n_head)
+        convert_to_q4_bestla_tensor(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight", list_vars,
+                                    f, quantize_config, n_head)
+        convert_to_q4_bestla_tensor(f"model.layers.{i}.mlp.up_proj", f"layers.{i}.feed_forward.w3.weight", list_vars, f,
+                                    quantize_config, n_head)
+
+        convert_to_fp32_tensor(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight",
+                               list_vars, f)
+        convert_to_fp32_tensor(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight",
+                               list_vars, f)
     f.close()
     print(f"Success! saved as {out_path}")
 
+
 if __name__ == '__main__':
     main()
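
Note: below is an illustrative, self-contained NumPy sketch (toy shapes and made-up values, not taken from a real checkpoint) of the two row reorderings that the new convert_to_q4_bestla_tensor applies before packing: the llama-style Q/K permutation done through permute_func, and the GPTQ act-order (desc_act) shuffle, which, when every quantization group holds exactly group_size rows, reduces to a stable sort of the rows by g_idx.

import numpy as np


def permute_func(weights, n_head: int, n_head_kv: int):
    # same reshape/swapaxes pattern as the helper kept in convert_quantized_mistral.py
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head_kv, 2, weights.shape[0] // n_head_kv // 2,
                            *weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape))


# --- Q/K permutation (toy GQA geometry: 32 query heads, 8 KV heads, head_dim 4) ---
n_head, n_head_kv, head_dim = 32, 8, 4
out_features, in_features = n_head_kv * head_dim, 16
int_weight = np.arange(in_features * out_features).reshape(in_features, out_features)

# the converter permutes the transposed tensor and transposes back, as in the diff
permuted = permute_func(int_weight.T, n_head, n_head_kv).T
assert permuted.shape == int_weight.shape

# row order inside each KV head becomes [0, 2, 1, 3]: the two rotary halves interleaved
row_order = permute_func(np.arange(out_features).reshape(-1, 1), n_head, n_head_kv).ravel()
print(row_order[:head_dim])  # [0 2 1 3]

# --- act-order (desc_act) shuffle ---
# the loop from the diff places row i at group_idx * group_size + its rank inside that
# group; with every group holding exactly group_size rows, that is a stable sort by g_idx
group_size, rows = 4, 16
g_idx = np.repeat(np.arange(rows // group_size), group_size)
np.random.shuffle(g_idx)            # pretend act-order scrambled the rows
w = np.arange(rows).reshape(-1, 1)  # stand-in for the unpacked int weight

reordered = np.empty_like(w)
group_dict = {}
for i in range(rows):
    group_idx = int(g_idx[i])
    if group_idx not in group_dict:
        group_dict[group_idx] = 0
    else:
        group_dict[group_idx] += 1
    reordered[group_idx * group_size + group_dict[group_idx]] = w[i]

assert np.array_equal(reordered, w[np.argsort(g_idx, kind="stable")])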