From 7c85e27cd9477d3a8df894b5fd880f260e84e199 Mon Sep 17 00:00:00 2001 From: intellinjun Date: Tue, 27 Feb 2024 17:47:00 +0800 Subject: [PATCH 01/12] enable qwen1-5 Signed-off-by: intellinjun --- neural_speed/models/model_utils/gguf.h | 3 +- neural_speed/models/qwen/qwen.cpp | 46 +++++++++++++++++++------ neural_speed/models/qwen/qwen_utils.cpp | 16 +++++++-- 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/neural_speed/models/model_utils/gguf.h b/neural_speed/models/model_utils/gguf.h index 0018ec7d3..68a955a8d 100644 --- a/neural_speed/models/model_utils/gguf.h +++ b/neural_speed/models/model_utils/gguf.h @@ -231,6 +231,7 @@ enum llm_arch { LLM_ARCH_CHATGLM, LLM_ARCH_CHATGLM2, LLM_ARCH_PHI, + LLM_ARCH_QWEN2, LLM_ARCH_UNKNOWN, }; @@ -242,7 +243,7 @@ static std::map LLM_ARCH_NAMES = { {LLM_ARCH_PERSIMMON, "persimmon"}, {LLM_ARCH_REFACT, "refact"}, {LLM_ARCH_BLOOM, "bloom"}, {LLM_ARCH_STABLELM, "stablelm"}, {LLM_ARCH_QWEN, "qwen"}, {LLM_ARCH_CHATGLM, "chatglm"}, - {LLM_ARCH_CHATGLM2, "chatglm2"}, {LLM_ARCH_PHI, "phi"}}; + {LLM_ARCH_CHATGLM2, "chatglm2"}, {LLM_ARCH_PHI, "phi"}, {LLM_ARCH_QWEN2, "qwen2"}}; struct gguf_tensor_info { struct gguf_str name; diff --git a/neural_speed/models/qwen/qwen.cpp b/neural_speed/models/qwen/qwen.cpp index 837c09021..117f5f349 100644 --- a/neural_speed/models/qwen/qwen.cpp +++ b/neural_speed/models/qwen/qwen.cpp @@ -77,7 +77,7 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu model_context& lctx = *ctx; // static batching for now - const int N = inputs->n_tokens; + const int N = 2; const int n_past = inputs->n_past; const int n_total = inputs->n_total; const bool shift_roped_k = lctx.shift_roped_k; @@ -102,6 +102,7 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu const int n_vocab = hparams.n_vocab; const int n_rot = hparams.n_rot; const int head_dim = n_embd / n_head; + const int qwen_version = 2; auto& mem_per_token = lctx.mem_per_token; auto& buf_compute = lctx.buf_compute; @@ -119,7 +120,7 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu ne_cgraph gf = {}; gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 
1 : n_threads; - const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA; + const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA&&false; kv_cache_info_t kv_cache_info = {}; if (run_mha_reordered) { NE_ASSERT(("kv cache should be the same dtype", kv_self.v->type == NE_TYPE_BTLA)); @@ -143,8 +144,9 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu } struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N * batch_size); ne_set_name(embd, "embd"); + uint32_t input_tokens[2] = {14990, 1879}; for (int i = 0; i < batch_size; ++i) { - memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); + memcpy(static_cast(embd->data) + i * N, input_tokens, N * ne_element_size(embd)); } struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); @@ -164,20 +166,38 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu } // compute QKV - { - cur = ne_mul_mat(ctx0, model.layers[il].attn[0], cur); + struct ne_tensor* Qcur; + struct ne_tensor* Kcur; + struct ne_tensor* Vcur; - cur = ne_add(ctx0, ne_repeat(ctx0, model.layers[il].attn[1], cur), cur); - } + if(qwen_version == 1){ + + cur = ne_mul_mat(ctx0, model.layers[il].attn[0], cur); + + cur = ne_add(ctx0, ne_repeat(ctx0, model.layers[il].attn[1], cur), cur); size_t fused_qkv_row_nb = (3 * n_embd) * sizeof(float); - struct ne_tensor* Qcur = ne_cont(ctx0, ne_view_3d(ctx0, cur, head_dim, n_head, N, head_dim * sizeof(float), + Qcur = ne_cont(ctx0, ne_view_3d(ctx0, cur, head_dim, n_head, N, head_dim * sizeof(float), fused_qkv_row_nb, 0 * sizeof(float) * n_embd)); // head_dim, n_head, N --> head_dim, N, n_head - struct ne_tensor* Kcur = ne_cont(ctx0, ne_view_3d(ctx0, cur, head_dim, n_head, N, head_dim * sizeof(float), + Kcur = ne_cont(ctx0, ne_view_3d(ctx0, cur, head_dim, n_head, N, head_dim * sizeof(float), fused_qkv_row_nb, 1 * sizeof(float) * n_embd)); // head_dim, n_head, N --> N, head_dim, n_head - struct ne_tensor* Vcur = ne_cont(ctx0, ne_view_3d(ctx0, cur, head_dim, n_head, N, head_dim * sizeof(float), + Vcur = ne_cont(ctx0, ne_view_3d(ctx0, cur, head_dim, n_head, N, head_dim * sizeof(float), fused_qkv_row_nb, 2 * sizeof(float) * n_embd)); + } + + Qcur = ne_mul_mat(ctx0, model.layers[il].attn[0], cur); + Qcur = ne_add(ctx0, ne_repeat(ctx0, model.layers[il].attn[1], Qcur), Qcur); + Qcur = ne_reshape_3d(ctx0, Qcur,head_dim, n_head, N); + + Kcur = ne_mul_mat(ctx0, model.layers[il].attn[2], cur); + Kcur = ne_add(ctx0, ne_repeat(ctx0, model.layers[il].attn[3], Kcur), Kcur); + Kcur = ne_reshape_3d(ctx0, Kcur,head_dim, n_head, N); + + Vcur = ne_mul_mat(ctx0, model.layers[il].attn[4], cur); + Vcur = ne_add(ctx0, ne_repeat(ctx0, model.layers[il].attn[5], Vcur), Vcur); + Vcur = ne_reshape_3d(ctx0, Vcur,head_dim, n_head, N); + // using mode = 2 for GPT-NeoX mode Qcur = ne_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, 0, hparams.freq_base, hparams.freq_scale); @@ -300,7 +320,11 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu cur = ne_view_2d(ctx0, KQV_Out, n_embd, N, n_embd * ne_element_size(KQV_Out), 0); } // projection - { cur = ne_mul_mat(ctx0, model.layers[il].attn[2], cur); } + if(qwen_version == 1){ + cur = ne_mul_mat(ctx0, model.layers[il].attn[2], cur); + }else{ + cur = ne_mul_mat(ctx0, model.layers[il].attn[6], cur); + } } lctx.use_buf(ctx0, 1); diff --git a/neural_speed/models/qwen/qwen_utils.cpp b/neural_speed/models/qwen/qwen_utils.cpp index ae618a04b..fe03d18c0 100644 --- 
a/neural_speed/models/qwen/qwen_utils.cpp +++ b/neural_speed/models/qwen/qwen_utils.cpp @@ -52,7 +52,7 @@ void QWEN::init(const char* path_model, model_context* ctx, int n_gpu_layer_, bo model.hparams = ml->file_loaders.at(0)->hparams; model_file_version file_version = ml->file_loaders.at(0)->file_version; auto& hparams = model.hparams; - n_ff = hparams.ffn_hidden_size / 2; + n_ff = hparams.ffn_hidden_size; fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); @@ -117,9 +117,21 @@ void QWEN::load(model_context* ctx, model_progress_callback progress_callback, v layer.norm[1] = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend); // qkv GEMM + if (ml->verify_tensor(layers_i + ".attn_qkv.weight")){ layer.attn[0] = ml->get_tensor(layers_i + ".attn_qkv.weight", {n_embd, 3 * n_embd}, backend); layer.attn[1] = ml->get_tensor(layers_i + ".attn_qkv.bias", {3 * n_embd}, backend); layer.attn[2] = ml->get_tensor(layers_i + ".attn_output.weight", {n_embd, n_embd}, backend); + } + else { + layer.attn[0] = ml->get_tensor(layers_i + ".attn_q.weight", {n_embd, n_embd}, backend); + layer.attn[1] = ml->get_tensor(layers_i + ".attn_q.bias", {n_embd}, backend); + layer.attn[2] = ml->get_tensor(layers_i + ".attn_k.weight", {n_embd, n_embd}, backend); + layer.attn[3] = ml->get_tensor(layers_i + ".attn_k.bias", {n_embd}, backend); + layer.attn[4] = ml->get_tensor(layers_i + ".attn_v.weight", {n_embd, n_embd}, backend); + layer.attn[5] = ml->get_tensor(layers_i + ".attn_v.bias", {n_embd}, backend); + layer.attn[6] = ml->get_tensor(layers_i + ".attn_output.weight", {n_embd, n_embd}, backend); + } + // ffn GEMM layer.ffn[0] = ml->get_tensor(layers_i + ".ffn_up.weight", {n_embd, n_ff}, backend); @@ -201,5 +213,5 @@ void model_load_internal(const std::string& fname, model_archs arch, model_conte ms->init(fname.c_str(), ctx, n_gpu_layers, use_mmap, use_mlock, vocab_only); ms->load(ctx, progress_callback, progress_callback_user_data); model_context& lctx = *ctx; - lctx.support_bestla_kv = true; + lctx.support_bestla_kv = false; } From fa79bbcd55cb4e09cb085117a26ef44b1dab7146 Mon Sep 17 00:00:00 2001 From: intellinjun Date: Wed, 28 Feb 2024 11:35:39 +0800 Subject: [PATCH 02/12] enable qwen2 Signed-off-by: intellinjun --- neural_speed/convert/convert_qwen2.py | 174 ++++++++++++++++++++++++ neural_speed/models/model_utils/gguf.h | 14 +- neural_speed/models/qwen/qwen.cpp | 68 ++++----- neural_speed/models/qwen/qwen_utils.cpp | 63 ++++++--- 4 files changed, 261 insertions(+), 58 deletions(-) create mode 100644 neural_speed/convert/convert_qwen2.py diff --git a/neural_speed/convert/convert_qwen2.py b/neural_speed/convert/convert_qwen2.py new file mode 100644 index 000000000..f36b31eeb --- /dev/null +++ b/neural_speed/convert/convert_qwen2.py @@ -0,0 +1,174 @@ +# limitations under the License. 
+# Convert Hugging Face fine-tuned gpt-neox-like models to ne format +# +# Usage: +# +# python3 models/convert-h5-to-ne.py +# +# This script is similar to "convert-pt-to-ne.py" +# + +import io +import os +import sys +import struct +import json +import code +import torch +import numpy as np +from pathlib import Path +import argparse +from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, + Union) +from transformers import AutoModelForCausalLM, AutoTokenizer + + +# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def main(args_in: Optional[List[str]] = None) -> None: + parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") + parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file") + args = parser.parse_args(args_in) + + dir_model = args.model.as_posix() + fname_out = args.outfile.as_posix() + + # possible data types + # ftype == 0 -> float32 + # ftype == 1 -> float16 + ftype = 0 + if args.outtype == "f16": + ftype = 1 + + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + print("Loading model: ", dir_model) + model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True) + model.eval() + for p in model.parameters(): + p.requires_grad = False + hparams = model.config.to_dict() + print("Model loaded: ", dir_model) + + fout = open(fname_out, "wb") + + # 0x67676d6c is unversioned ne + # 0x67676d66 is versioned ggmf (requires token scores) + ne_file_magic = 0x67676d66 + #ne_file_version = 0x00000001 # v1 + + fout.write(struct.pack("i", ne_file_magic)) # magic: ne in hex + fout.write(struct.pack("i", 1)) + #import pdb;pdb.set_trace() + fout.write(struct.pack("i", hparams["vocab_size"])) + fout.write(struct.pack("i", hparams["hidden_size"])) + fout.write(struct.pack("i", hparams["intermediate_size"])) # dummy data + fout.write(struct.pack("i", hparams["num_attention_heads"])) + fout.write(struct.pack("i", 0)) # multi-query attention + fout.write(struct.pack("i", hparams["num_hidden_layers"])) + fout.write(struct.pack("i", int(hparams["hidden_size"]/hparams["num_attention_heads"]))) + fout.write(struct.pack("i", ftype)) + fout.write(struct.pack("i", hparams["max_position_embeddings"])) + fout.write(struct.pack("f", 0.0)) + fout.write(struct.pack("f", 0.0)) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # word_embed_proj_dim (for opt) 
+ fout.write(struct.pack("i", 0)) # do_layer_norm_before (for opt) + + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", hparams["intermediate_size"])) + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps + fout.write(struct.pack("f", 10000.0)) # freq_base + fout.write(struct.pack("f", 1.0)) # rope_factor + + fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled + fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings + fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) + + fout.write(struct.pack("i", hparams["bos_token_id"])) + fout.write(struct.pack("i", hparams["eos_token_id"])) + fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) + fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) + + for i in range(hparams["vocab_size"]): + if i < tokenizer.vocab_size: + text = tokenizer.decode([i]).encode('utf-8') + fout.write(struct.pack("i", len(text))) + fout.write(text) + fout.write(struct.pack("f", 0.0 - i)) + else: + text = tokenizer.decode([tokenizer.vocab_size - 1]).encode('utf-8') + fout.write(struct.pack("i", len(text))) + fout.write(text) + fout.write(struct.pack("f", -10000)) + + list_vars = model.state_dict() + + print(hparams) + + for name in list_vars.keys(): + # No gradients for these + list_vars[name].requires_grad = False + src = name + nn = name + + print(src, ' -> ', name) + data = list_vars[src].squeeze().numpy() + data = data.astype(np.float32) + + n_dims = len(data.shape) + print(name, n_dims, data.shape) + + # default type is fp32 + ftype_cur = 0 + if ftype == 1 and n_dims > 1: + print(" Converting to float16", data.shape, data[:3, :3].tolist()) + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32", data.shape, data[:3, :3].tolist() if n_dims > 1 else data[:3].tolist()) + data = data.astype(np.float32) + + # header + str = name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + print(str) + fout.write(str) + + # data + data.tofile(fout) + + fout.close() + + print("Done. 
Output file: " + fname_out) + print("") + + +if __name__ == '__main__': + main() + diff --git a/neural_speed/models/model_utils/gguf.h b/neural_speed/models/model_utils/gguf.h index 68a955a8d..b57d51b16 100644 --- a/neural_speed/models/model_utils/gguf.h +++ b/neural_speed/models/model_utils/gguf.h @@ -236,14 +236,12 @@ enum llm_arch { }; static std::map LLM_ARCH_NAMES = { - {LLM_ARCH_LLAMA, "llama"}, {LLM_ARCH_FALCON, "falcon"}, - {LLM_ARCH_GPT2, "gpt2"}, {LLM_ARCH_GPTJ, "gptj"}, - {LLM_ARCH_GPTNEOX, "gptneox"}, {LLM_ARCH_MPT, "mpt"}, - {LLM_ARCH_BAICHUAN, "baichuan"}, {LLM_ARCH_STARCODER, "starcoder"}, - {LLM_ARCH_PERSIMMON, "persimmon"}, {LLM_ARCH_REFACT, "refact"}, - {LLM_ARCH_BLOOM, "bloom"}, {LLM_ARCH_STABLELM, "stablelm"}, - {LLM_ARCH_QWEN, "qwen"}, {LLM_ARCH_CHATGLM, "chatglm"}, - {LLM_ARCH_CHATGLM2, "chatglm2"}, {LLM_ARCH_PHI, "phi"}, {LLM_ARCH_QWEN2, "qwen2"}}; + {LLM_ARCH_LLAMA, "llama"}, {LLM_ARCH_FALCON, "falcon"}, {LLM_ARCH_GPT2, "gpt2"}, + {LLM_ARCH_GPTJ, "gptj"}, {LLM_ARCH_GPTNEOX, "gptneox"}, {LLM_ARCH_MPT, "mpt"}, + {LLM_ARCH_BAICHUAN, "baichuan"}, {LLM_ARCH_STARCODER, "starcoder"}, {LLM_ARCH_PERSIMMON, "persimmon"}, + {LLM_ARCH_REFACT, "refact"}, {LLM_ARCH_BLOOM, "bloom"}, {LLM_ARCH_STABLELM, "stablelm"}, + {LLM_ARCH_QWEN, "qwen"}, {LLM_ARCH_CHATGLM, "chatglm"}, {LLM_ARCH_CHATGLM2, "chatglm2"}, + {LLM_ARCH_PHI, "phi"}, {LLM_ARCH_QWEN2, "qwen2"}}; struct gguf_tensor_info { struct gguf_str name; diff --git a/neural_speed/models/qwen/qwen.cpp b/neural_speed/models/qwen/qwen.cpp index 117f5f349..30a59b596 100644 --- a/neural_speed/models/qwen/qwen.cpp +++ b/neural_speed/models/qwen/qwen.cpp @@ -77,7 +77,7 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu model_context& lctx = *ctx; // static batching for now - const int N = 2; + const int N = inputs->n_tokens; const int n_past = inputs->n_past; const int n_total = inputs->n_total; const bool shift_roped_k = lctx.shift_roped_k; @@ -102,7 +102,12 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu const int n_vocab = hparams.n_vocab; const int n_rot = hparams.n_rot; const int head_dim = n_embd / n_head; - const int qwen_version = 2; + int qwen_version = 0; + if (hparams.max_seq_len == 8192) { + qwen_version = 1; + } else { + qwen_version = 2; + } auto& mem_per_token = lctx.mem_per_token; auto& buf_compute = lctx.buf_compute; @@ -120,7 +125,7 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu ne_cgraph gf = {}; gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 
1 : n_threads; - const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA&&false; + const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA && false; kv_cache_info_t kv_cache_info = {}; if (run_mha_reordered) { NE_ASSERT(("kv cache should be the same dtype", kv_self.v->type == NE_TYPE_BTLA)); @@ -144,9 +149,8 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu } struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N * batch_size); ne_set_name(embd, "embd"); - uint32_t input_tokens[2] = {14990, 1879}; for (int i = 0; i < batch_size; ++i) { - memcpy(static_cast(embd->data) + i * N, input_tokens, N * ne_element_size(embd)); + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); } struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); @@ -170,35 +174,33 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu struct ne_tensor* Kcur; struct ne_tensor* Vcur; - if(qwen_version == 1){ + if (qwen_version == 1) { + cur = ne_mul_mat(ctx0, model.layers[il].attn[0], cur); + + cur = ne_add(ctx0, ne_repeat(ctx0, model.layers[il].attn[1], cur), cur); + size_t fused_qkv_row_nb = (3 * n_embd) * sizeof(float); + Qcur = ne_cont(ctx0, ne_view_3d(ctx0, cur, head_dim, n_head, N, head_dim * sizeof(float), fused_qkv_row_nb, + 0 * sizeof(float) * n_embd)); + // head_dim, n_head, N --> head_dim, N, n_head + Kcur = ne_cont(ctx0, ne_view_3d(ctx0, cur, head_dim, n_head, N, head_dim * sizeof(float), fused_qkv_row_nb, + 1 * sizeof(float) * n_embd)); + // head_dim, n_head, N --> N, head_dim, n_head + Vcur = ne_cont(ctx0, ne_view_3d(ctx0, cur, head_dim, n_head, N, head_dim * sizeof(float), fused_qkv_row_nb, + 2 * sizeof(float) * n_embd)); + } else { + Qcur = ne_mul_mat(ctx0, model.layers[il].attn[0], cur); + Qcur = ne_add(ctx0, ne_repeat(ctx0, model.layers[il].attn[1], Qcur), Qcur); + Qcur = ne_reshape_3d(ctx0, Qcur, head_dim, n_head, N); - cur = ne_mul_mat(ctx0, model.layers[il].attn[0], cur); + Kcur = ne_mul_mat(ctx0, model.layers[il].attn[2], cur); + Kcur = ne_add(ctx0, ne_repeat(ctx0, model.layers[il].attn[3], Kcur), Kcur); + Kcur = ne_reshape_3d(ctx0, Kcur, head_dim, n_head, N); - cur = ne_add(ctx0, ne_repeat(ctx0, model.layers[il].attn[1], cur), cur); - size_t fused_qkv_row_nb = (3 * n_embd) * sizeof(float); - Qcur = ne_cont(ctx0, ne_view_3d(ctx0, cur, head_dim, n_head, N, head_dim * sizeof(float), - fused_qkv_row_nb, 0 * sizeof(float) * n_embd)); - // head_dim, n_head, N --> head_dim, N, n_head - Kcur = ne_cont(ctx0, ne_view_3d(ctx0, cur, head_dim, n_head, N, head_dim * sizeof(float), - fused_qkv_row_nb, 1 * sizeof(float) * n_embd)); - // head_dim, n_head, N --> N, head_dim, n_head - Vcur = ne_cont(ctx0, ne_view_3d(ctx0, cur, head_dim, n_head, N, head_dim * sizeof(float), - fused_qkv_row_nb, 2 * sizeof(float) * n_embd)); + Vcur = ne_mul_mat(ctx0, model.layers[il].attn[4], cur); + Vcur = ne_add(ctx0, ne_repeat(ctx0, model.layers[il].attn[5], Vcur), Vcur); + Vcur = ne_reshape_3d(ctx0, Vcur, head_dim, n_head, N); } - Qcur = ne_mul_mat(ctx0, model.layers[il].attn[0], cur); - Qcur = ne_add(ctx0, ne_repeat(ctx0, model.layers[il].attn[1], Qcur), Qcur); - Qcur = ne_reshape_3d(ctx0, Qcur,head_dim, n_head, N); - - Kcur = ne_mul_mat(ctx0, model.layers[il].attn[2], cur); - Kcur = ne_add(ctx0, ne_repeat(ctx0, model.layers[il].attn[3], Kcur), Kcur); - Kcur = ne_reshape_3d(ctx0, Kcur,head_dim, n_head, N); - - Vcur = ne_mul_mat(ctx0, model.layers[il].attn[4], cur); - Vcur = ne_add(ctx0, ne_repeat(ctx0, 
model.layers[il].attn[5], Vcur), Vcur); - Vcur = ne_reshape_3d(ctx0, Vcur,head_dim, n_head, N); - - // using mode = 2 for GPT-NeoX mode Qcur = ne_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, 0, hparams.freq_base, hparams.freq_scale); ne_set_name(Qcur, "Qcur"); @@ -320,9 +322,9 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu cur = ne_view_2d(ctx0, KQV_Out, n_embd, N, n_embd * ne_element_size(KQV_Out), 0); } // projection - if(qwen_version == 1){ - cur = ne_mul_mat(ctx0, model.layers[il].attn[2], cur); - }else{ + if (qwen_version == 1) { + cur = ne_mul_mat(ctx0, model.layers[il].attn[2], cur); + } else { cur = ne_mul_mat(ctx0, model.layers[il].attn[6], cur); } } diff --git a/neural_speed/models/qwen/qwen_utils.cpp b/neural_speed/models/qwen/qwen_utils.cpp index fe03d18c0..bce8e779f 100644 --- a/neural_speed/models/qwen/qwen_utils.cpp +++ b/neural_speed/models/qwen/qwen_utils.cpp @@ -53,12 +53,15 @@ void QWEN::init(const char* path_model, model_context* ctx, int n_gpu_layer_, bo model_file_version file_version = ml->file_loaders.at(0)->file_version; auto& hparams = model.hparams; n_ff = hparams.ffn_hidden_size; + if (hparams.max_seq_len == 8192) { + n_ff = n_ff / 2; + } fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); - fprintf(stderr, "%s: n_ff = %u\n", __func__, hparams.ffn_hidden_size / 2); + fprintf(stderr, "%s: n_ff = %u\n", __func__, hparams.ffn_hidden_size); fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size()); n_embd = hparams.n_embd; n_vocab = hparams.n_vocab; @@ -117,28 +120,26 @@ void QWEN::load(model_context* ctx, model_progress_callback progress_callback, v layer.norm[1] = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend); // qkv GEMM - if (ml->verify_tensor(layers_i + ".attn_qkv.weight")){ - layer.attn[0] = ml->get_tensor(layers_i + ".attn_qkv.weight", {n_embd, 3 * n_embd}, backend); - layer.attn[1] = ml->get_tensor(layers_i + ".attn_qkv.bias", {3 * n_embd}, backend); - layer.attn[2] = ml->get_tensor(layers_i + ".attn_output.weight", {n_embd, n_embd}, backend); - } - else { - layer.attn[0] = ml->get_tensor(layers_i + ".attn_q.weight", {n_embd, n_embd}, backend); - layer.attn[1] = ml->get_tensor(layers_i + ".attn_q.bias", {n_embd}, backend); - layer.attn[2] = ml->get_tensor(layers_i + ".attn_k.weight", {n_embd, n_embd}, backend); - layer.attn[3] = ml->get_tensor(layers_i + ".attn_k.bias", {n_embd}, backend); - layer.attn[4] = ml->get_tensor(layers_i + ".attn_v.weight", {n_embd, n_embd}, backend); - layer.attn[5] = ml->get_tensor(layers_i + ".attn_v.bias", {n_embd}, backend); - layer.attn[6] = ml->get_tensor(layers_i + ".attn_output.weight", {n_embd, n_embd}, backend); + if (ml->verify_tensor(layers_i + ".attn_qkv.weight")) { + layer.attn[0] = ml->get_tensor(layers_i + ".attn_qkv.weight", {n_embd, 3 * n_embd}, backend); + layer.attn[1] = ml->get_tensor(layers_i + ".attn_qkv.bias", {3 * n_embd}, backend); + layer.attn[2] = ml->get_tensor(layers_i + ".attn_output.weight", {n_embd, n_embd}, backend); + } else { + layer.attn[0] = ml->get_tensor(layers_i + ".attn_q.weight", {n_embd, n_embd}, backend); + layer.attn[1] = ml->get_tensor(layers_i + ".attn_q.bias", {n_embd}, backend); + layer.attn[2] = ml->get_tensor(layers_i + ".attn_k.weight", {n_embd, 
n_embd}, backend); + layer.attn[3] = ml->get_tensor(layers_i + ".attn_k.bias", {n_embd}, backend); + layer.attn[4] = ml->get_tensor(layers_i + ".attn_v.weight", {n_embd, n_embd}, backend); + layer.attn[5] = ml->get_tensor(layers_i + ".attn_v.bias", {n_embd}, backend); + layer.attn[6] = ml->get_tensor(layers_i + ".attn_output.weight", {n_embd, n_embd}, backend); } - // ffn GEMM layer.ffn[0] = ml->get_tensor(layers_i + ".ffn_up.weight", {n_embd, n_ff}, backend); layer.ffn[1] = ml->get_tensor(layers_i + ".ffn_gate.weight", {n_embd, n_ff}, backend); layer.ffn[2] = ml->get_tensor(layers_i + ".ffn_down.weight", {n_ff, n_embd}, backend); } - } else { + } else if (ml->verify_tensor("transformer.wte.weight")) { model.others[0] = ml->get_tensor("transformer.wte.weight", {n_embd, n_vocab}, NE_BACKEND_CPU); model.others[1] = ml->get_tensor("transformer.ln_f.weight", {n_embd}, NE_BACKEND_CPU); model.others[2] = ml->get_tensor("lm_head.weight", {n_embd, n_vocab}, NE_BACKEND_CPU); @@ -162,6 +163,34 @@ void QWEN::load(model_context* ctx, model_progress_callback progress_callback, v layer.ffn[1] = ml->get_tensor(layers_i + ".mlp.w2.weight", {n_embd, n_ff}, backend); layer.ffn[2] = ml->get_tensor(layers_i + ".mlp.c_proj.weight", {n_ff, n_embd}, backend); } + } else { + model.others[0] = ml->get_tensor("model.embed_tokens.weight", {n_embd, n_vocab}, NE_BACKEND_CPU); + model.others[1] = ml->get_tensor("model.norm.weight", {n_embd}, NE_BACKEND_CPU); + model.others[2] = ml->get_tensor("lm_head.weight", {n_embd, n_vocab}, NE_BACKEND_CPU); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ne_backend backend = static_cast(i) < i_gpu_start ? NE_BACKEND_CPU : MODEL_BACKEND_OFFLOAD; + auto& layer = model.layers[i]; + std::string layers_i = "model.layers." + std::to_string(i); + + // norm: cur = ln_1_g*cur + ln_1_b + layer.norm[0] = ml->get_tensor(layers_i + ".input_layernorm.weight", {n_embd}, backend); + layer.norm[1] = ml->get_tensor(layers_i + ".post_attention_layernorm.weight", {n_embd}, backend); + + // qkv GEMM + out proj GEMM + layer.attn[0] = ml->get_tensor(layers_i + ".self_attn.q_proj.weight", {n_embd, n_embd}, backend); + layer.attn[1] = ml->get_tensor(layers_i + ".self_attn.q_proj.bias", {n_embd}, backend); + layer.attn[2] = ml->get_tensor(layers_i + ".self_attn.k_proj.weight", {n_embd, n_embd}, backend); + layer.attn[3] = ml->get_tensor(layers_i + ".self_attn.k_proj.bias", {n_embd}, backend); + layer.attn[4] = ml->get_tensor(layers_i + ".self_attn.v_proj.weight", {n_embd, n_embd}, backend); + layer.attn[5] = ml->get_tensor(layers_i + ".self_attn.v_proj.bias", {n_embd}, backend); + layer.attn[6] = ml->get_tensor(layers_i + ".self_attn.o_proj.weight", {n_embd, n_embd}, backend); + + // ffn GEMM + layer.ffn[0] = ml->get_tensor(layers_i + ".mlp.up_proj.weight", {n_embd, n_ff}, backend); + layer.ffn[1] = ml->get_tensor(layers_i + ".mlp.gate_proj.weight", {n_embd, n_ff}, backend); + layer.ffn[2] = ml->get_tensor(layers_i + ".mlp.down_proj.weight", {n_ff, n_embd}, backend); + } } // print memory requirements @@ -192,7 +221,7 @@ class qwen_quant_layer : public quant_layer_base { public: quant_params_internal get_layer_config(std::string layername, std::vector ne, ne_type type) override { bool quantize = layername.rfind("weight") == layername.size() - 6; // ends with 'weight'? 
- if (layername == "transformer.wte.weight") { + if (layername == "transformer.wte.weight" || layername == "model.embed_tokens.weight") { // special layer process, can be loaded by config file return quant_params_internal(); // return q4_0 to cover the usage of getrow } From bf53ec3782e39749f134714b01c1de91811325ee Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 1 Mar 2024 03:32:59 +0000 Subject: [PATCH 03/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_speed/convert/convert_qwen2.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/neural_speed/convert/convert_qwen2.py b/neural_speed/convert/convert_qwen2.py index f36b31eeb..b06218012 100644 --- a/neural_speed/convert/convert_qwen2.py +++ b/neural_speed/convert/convert_qwen2.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # limitations under the License. # Convert Hugging Face fine-tuned gpt-neox-like models to ne format # @@ -171,4 +185,3 @@ def main(args_in: Optional[List[str]] = None) -> None: if __name__ == '__main__': main() - From a37d977feedc107cfb609f422a04aaa27a164fb3 Mon Sep 17 00:00:00 2001 From: intellinjun <105184542+intellinjun@users.noreply.github.com> Date: Fri, 1 Mar 2024 13:42:56 +0800 Subject: [PATCH 04/12] Update qwen_utils.cpp --- neural_speed/models/qwen/qwen_utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_speed/models/qwen/qwen_utils.cpp b/neural_speed/models/qwen/qwen_utils.cpp index bce8e779f..8e636318e 100644 --- a/neural_speed/models/qwen/qwen_utils.cpp +++ b/neural_speed/models/qwen/qwen_utils.cpp @@ -242,5 +242,5 @@ void model_load_internal(const std::string& fname, model_archs arch, model_conte ms->init(fname.c_str(), ctx, n_gpu_layers, use_mmap, use_mlock, vocab_only); ms->load(ctx, progress_callback, progress_callback_user_data); model_context& lctx = *ctx; - lctx.support_bestla_kv = false; + lctx.support_bestla_kv = true; } From 209059e4796fdd375e11a89d8b04d97fd93fd050 Mon Sep 17 00:00:00 2001 From: intellinjun <105184542+intellinjun@users.noreply.github.com> Date: Fri, 1 Mar 2024 15:12:39 +0800 Subject: [PATCH 05/12] Update qwen.cpp --- neural_speed/models/qwen/qwen.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_speed/models/qwen/qwen.cpp b/neural_speed/models/qwen/qwen.cpp index 30a59b596..2aa65eb2d 100644 --- a/neural_speed/models/qwen/qwen.cpp +++ b/neural_speed/models/qwen/qwen.cpp @@ -125,7 +125,7 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu ne_cgraph gf = {}; gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 
1 : n_threads; - const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA && false; + const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA; kv_cache_info_t kv_cache_info = {}; if (run_mha_reordered) { NE_ASSERT(("kv cache should be the same dtype", kv_self.v->type == NE_TYPE_BTLA)); From 54860b5fc27a14043405eb40fe01e3b1cadeade9 Mon Sep 17 00:00:00 2001 From: intellinjun Date: Fri, 1 Mar 2024 15:43:20 +0800 Subject: [PATCH 06/12] fusion convert qwen and qwen2 Signed-off-by: intellinjun --- neural_speed/convert/convert_qwen.py | 9 +- neural_speed/convert/convert_qwen2.py | 187 -------------------------- 2 files changed, 4 insertions(+), 192 deletions(-) delete mode 100644 neural_speed/convert/convert_qwen2.py diff --git a/neural_speed/convert/convert_qwen.py b/neural_speed/convert/convert_qwen.py index 900d8cfe8..e5fb4e0d3 100644 --- a/neural_speed/convert/convert_qwen.py +++ b/neural_speed/convert/convert_qwen.py @@ -100,9 +100,9 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", hparams["num_attention_heads"])) fout.write(struct.pack("i", 0)) # multi-query attention fout.write(struct.pack("i", hparams["num_hidden_layers"])) - fout.write(struct.pack("i", hparams["kv_channels"])) + fout.write(struct.pack("i", hparams["kv_channels"] if "kv_channels" in hparams else int(hparams["hidden_size"]/hparams["num_attention_heads"]))) fout.write(struct.pack("i", ftype)) - fout.write(struct.pack("i", hparams["seq_length"])) + fout.write(struct.pack("i", hparams["seq_length"] if "seq_length" in hparams else hparams["max_position_embeddings"])) fout.write(struct.pack("f", 0.0)) fout.write(struct.pack("f", 0.0)) fout.write(struct.pack("i", 0)) @@ -119,9 +119,8 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) - - fout.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>'])) - fout.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>'])) + fout.write(struct.pack("i", hparams["bos_token_id"] if hparams["bos_token_id"] else tokenizer.special_tokens['<|endoftext|>'])) + fout.write(struct.pack("i", hparams["eos_token_id"] if hparams["eos_token_id"] else tokenizer.special_tokens['<|endoftext|>'])) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) diff --git a/neural_speed/convert/convert_qwen2.py b/neural_speed/convert/convert_qwen2.py deleted file mode 100644 index b06218012..000000000 --- a/neural_speed/convert/convert_qwen2.py +++ /dev/null @@ -1,187 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# limitations under the License. 
-# Convert Hugging Face fine-tuned gpt-neox-like models to ne format -# -# Usage: -# -# python3 models/convert-h5-to-ne.py -# -# This script is similar to "convert-pt-to-ne.py" -# - -import io -import os -import sys -import struct -import json -import code -import torch -import numpy as np -from pathlib import Path -import argparse -from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, - Union) -from transformers import AutoModelForCausalLM, AutoTokenizer - - -# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a significant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - - -def main(args_in: Optional[List[str]] = None) -> None: - parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") - parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file") - args = parser.parse_args(args_in) - - dir_model = args.model.as_posix() - fname_out = args.outfile.as_posix() - - # possible data types - # ftype == 0 -> float32 - # ftype == 1 -> float16 - ftype = 0 - if args.outtype == "f16": - ftype = 1 - - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - print("Loading model: ", dir_model) - model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True) - model.eval() - for p in model.parameters(): - p.requires_grad = False - hparams = model.config.to_dict() - print("Model loaded: ", dir_model) - - fout = open(fname_out, "wb") - - # 0x67676d6c is unversioned ne - # 0x67676d66 is versioned ggmf (requires token scores) - ne_file_magic = 0x67676d66 - #ne_file_version = 0x00000001 # v1 - - fout.write(struct.pack("i", ne_file_magic)) # magic: ne in hex - fout.write(struct.pack("i", 1)) - #import pdb;pdb.set_trace() - fout.write(struct.pack("i", hparams["vocab_size"])) - fout.write(struct.pack("i", hparams["hidden_size"])) - fout.write(struct.pack("i", hparams["intermediate_size"])) # dummy data - fout.write(struct.pack("i", hparams["num_attention_heads"])) - fout.write(struct.pack("i", 0)) # multi-query attention - fout.write(struct.pack("i", hparams["num_hidden_layers"])) - fout.write(struct.pack("i", int(hparams["hidden_size"]/hparams["num_attention_heads"]))) - fout.write(struct.pack("i", ftype)) - fout.write(struct.pack("i", hparams["max_position_embeddings"])) - fout.write(struct.pack("f", 0.0)) - fout.write(struct.pack("f", 0.0)) - fout.write(struct.pack("i", 0)) - fout.write(struct.pack("i", 0)) # word_embed_proj_dim (for opt) 
- fout.write(struct.pack("i", 0)) # do_layer_norm_before (for opt) - - fout.write(struct.pack("i", 0)) - fout.write(struct.pack("i", hparams["intermediate_size"])) - fout.write(struct.pack("i", 0)) - fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps - fout.write(struct.pack("f", 10000.0)) # freq_base - fout.write(struct.pack("f", 1.0)) # rope_factor - - fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled - fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings - fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) - - fout.write(struct.pack("i", hparams["bos_token_id"])) - fout.write(struct.pack("i", hparams["eos_token_id"])) - fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) - fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) - - for i in range(hparams["vocab_size"]): - if i < tokenizer.vocab_size: - text = tokenizer.decode([i]).encode('utf-8') - fout.write(struct.pack("i", len(text))) - fout.write(text) - fout.write(struct.pack("f", 0.0 - i)) - else: - text = tokenizer.decode([tokenizer.vocab_size - 1]).encode('utf-8') - fout.write(struct.pack("i", len(text))) - fout.write(text) - fout.write(struct.pack("f", -10000)) - - list_vars = model.state_dict() - - print(hparams) - - for name in list_vars.keys(): - # No gradients for these - list_vars[name].requires_grad = False - src = name - nn = name - - print(src, ' -> ', name) - data = list_vars[src].squeeze().numpy() - data = data.astype(np.float32) - - n_dims = len(data.shape) - print(name, n_dims, data.shape) - - # default type is fp32 - ftype_cur = 0 - if ftype == 1 and n_dims > 1: - print(" Converting to float16", data.shape, data[:3, :3].tolist()) - data = data.astype(np.float16) - ftype_cur = 1 - else: - print(" Converting to float32", data.shape, data[:3, :3].tolist() if n_dims > 1 else data[:3].tolist()) - data = data.astype(np.float32) - - # header - str = name.encode('utf-8') - fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) - for i in range(n_dims): - fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) - print(str) - fout.write(str) - - # data - data.tofile(fout) - - fout.close() - - print("Done. 
Output file: " + fname_out) - print("") - - -if __name__ == '__main__': - main() From 51593c1fb8e69f46ce288bdf16528220d788e363 Mon Sep 17 00:00:00 2001 From: intellinjun Date: Fri, 1 Mar 2024 15:50:36 +0800 Subject: [PATCH 07/12] add comments Signed-off-by: intellinjun --- neural_speed/models/qwen/qwen_utils.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/neural_speed/models/qwen/qwen_utils.cpp b/neural_speed/models/qwen/qwen_utils.cpp index 8e636318e..2ea38492e 100644 --- a/neural_speed/models/qwen/qwen_utils.cpp +++ b/neural_speed/models/qwen/qwen_utils.cpp @@ -105,7 +105,7 @@ void QWEN::load(model_context* ctx, model_progress_callback progress_callback, v model.layers.resize(n_layer); size_t vram_total = 0; - if (ml->verify_tensor("token_embd.weight")) { + if (ml->verify_tensor("token_embd.weight")) { // gguf model.others[0] = ml->get_tensor("token_embd.weight", {n_embd, n_vocab}, NE_BACKEND_CPU); model.others[1] = ml->get_tensor("output_norm.weight", {n_embd}, NE_BACKEND_CPU); model.others[2] = ml->get_tensor("output.weight", {n_embd, n_vocab}, NE_BACKEND_CPU); @@ -124,7 +124,7 @@ void QWEN::load(model_context* ctx, model_progress_callback progress_callback, v layer.attn[0] = ml->get_tensor(layers_i + ".attn_qkv.weight", {n_embd, 3 * n_embd}, backend); layer.attn[1] = ml->get_tensor(layers_i + ".attn_qkv.bias", {3 * n_embd}, backend); layer.attn[2] = ml->get_tensor(layers_i + ".attn_output.weight", {n_embd, n_embd}, backend); - } else { + } else { // qwen2 gguf layer.attn[0] = ml->get_tensor(layers_i + ".attn_q.weight", {n_embd, n_embd}, backend); layer.attn[1] = ml->get_tensor(layers_i + ".attn_q.bias", {n_embd}, backend); layer.attn[2] = ml->get_tensor(layers_i + ".attn_k.weight", {n_embd, n_embd}, backend); @@ -139,7 +139,7 @@ void QWEN::load(model_context* ctx, model_progress_callback progress_callback, v layer.ffn[1] = ml->get_tensor(layers_i + ".ffn_gate.weight", {n_embd, n_ff}, backend); layer.ffn[2] = ml->get_tensor(layers_i + ".ffn_down.weight", {n_ff, n_embd}, backend); } - } else if (ml->verify_tensor("transformer.wte.weight")) { + } else if (ml->verify_tensor("transformer.wte.weight")) { // qwen1 bin model.others[0] = ml->get_tensor("transformer.wte.weight", {n_embd, n_vocab}, NE_BACKEND_CPU); model.others[1] = ml->get_tensor("transformer.ln_f.weight", {n_embd}, NE_BACKEND_CPU); model.others[2] = ml->get_tensor("lm_head.weight", {n_embd, n_vocab}, NE_BACKEND_CPU); @@ -163,7 +163,7 @@ void QWEN::load(model_context* ctx, model_progress_callback progress_callback, v layer.ffn[1] = ml->get_tensor(layers_i + ".mlp.w2.weight", {n_embd, n_ff}, backend); layer.ffn[2] = ml->get_tensor(layers_i + ".mlp.c_proj.weight", {n_ff, n_embd}, backend); } - } else { + } else { // qwen2 bin model.others[0] = ml->get_tensor("model.embed_tokens.weight", {n_embd, n_vocab}, NE_BACKEND_CPU); model.others[1] = ml->get_tensor("model.norm.weight", {n_embd}, NE_BACKEND_CPU); model.others[2] = ml->get_tensor("lm_head.weight", {n_embd, n_vocab}, NE_BACKEND_CPU); From 36c10a046ad4041cf5f174fcf27d0e96f864b54d Mon Sep 17 00:00:00 2001 From: intellinjun Date: Fri, 1 Mar 2024 17:15:25 +0800 Subject: [PATCH 08/12] fix format error Signed-off-by: intellinjun --- neural_speed/convert/convert_qwen.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/neural_speed/convert/convert_qwen.py b/neural_speed/convert/convert_qwen.py index e5fb4e0d3..702948156 100644 --- a/neural_speed/convert/convert_qwen.py +++ b/neural_speed/convert/convert_qwen.py @@ -100,9 
+100,11 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", hparams["num_attention_heads"])) fout.write(struct.pack("i", 0)) # multi-query attention fout.write(struct.pack("i", hparams["num_hidden_layers"])) - fout.write(struct.pack("i", hparams["kv_channels"] if "kv_channels" in hparams else int(hparams["hidden_size"]/hparams["num_attention_heads"]))) + fout.write(struct.pack("i", hparams["kv_channels"] if "kv_channels" in hparams + else int(hparams["hidden_size"]/hparams["num_attention_heads"]))) fout.write(struct.pack("i", ftype)) - fout.write(struct.pack("i", hparams["seq_length"] if "seq_length" in hparams else hparams["max_position_embeddings"])) + fout.write(struct.pack("i", hparams["seq_length"] if "seq_length" in hparams + else hparams["max_position_embeddings"])) fout.write(struct.pack("f", 0.0)) fout.write(struct.pack("f", 0.0)) fout.write(struct.pack("i", 0)) @@ -119,8 +121,10 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) - fout.write(struct.pack("i", hparams["bos_token_id"] if hparams["bos_token_id"] else tokenizer.special_tokens['<|endoftext|>'])) - fout.write(struct.pack("i", hparams["eos_token_id"] if hparams["eos_token_id"] else tokenizer.special_tokens['<|endoftext|>'])) + fout.write(struct.pack("i", hparams["bos_token_id"] if hparams["bos_token_id"] + else tokenizer.special_tokens['<|endoftext|>'])) + fout.write(struct.pack("i", hparams["eos_token_id"] if hparams["eos_token_id"] + else tokenizer.special_tokens['<|endoftext|>'])) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) From 5fdc7bc64bdf7002328d919b540252e5c631bd96 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 1 Mar 2024 09:15:59 +0000 Subject: [PATCH 09/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_speed/convert/convert_qwen.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/neural_speed/convert/convert_qwen.py b/neural_speed/convert/convert_qwen.py index 702948156..0c5d4fbd3 100644 --- a/neural_speed/convert/convert_qwen.py +++ b/neural_speed/convert/convert_qwen.py @@ -103,7 +103,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", hparams["kv_channels"] if "kv_channels" in hparams else int(hparams["hidden_size"]/hparams["num_attention_heads"]))) fout.write(struct.pack("i", ftype)) - fout.write(struct.pack("i", hparams["seq_length"] if "seq_length" in hparams + fout.write(struct.pack("i", hparams["seq_length"] if "seq_length" in hparams else hparams["max_position_embeddings"])) fout.write(struct.pack("f", 0.0)) fout.write(struct.pack("f", 0.0)) @@ -121,9 +121,9 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) - fout.write(struct.pack("i", hparams["bos_token_id"] if hparams["bos_token_id"] + fout.write(struct.pack("i", 
hparams["bos_token_id"] if hparams["bos_token_id"] else tokenizer.special_tokens['<|endoftext|>'])) - fout.write(struct.pack("i", hparams["eos_token_id"] if hparams["eos_token_id"] + fout.write(struct.pack("i", hparams["eos_token_id"] if hparams["eos_token_id"] else tokenizer.special_tokens['<|endoftext|>'])) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) From 3f159df29c64573989bf3fc3e4c2433269f5aceb Mon Sep 17 00:00:00 2001 From: intellinjun Date: Fri, 1 Mar 2024 17:19:26 +0800 Subject: [PATCH 10/12] add qwen-1_5 extension test Signed-off-by: intellinjun --- tests/model-test/cpp_graph_inference.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/model-test/cpp_graph_inference.sh b/tests/model-test/cpp_graph_inference.sh index e5e45b2da..177e6a712 100644 --- a/tests/model-test/cpp_graph_inference.sh +++ b/tests/model-test/cpp_graph_inference.sh @@ -155,6 +155,7 @@ model_name_map["qwen-7b"]="Qwen/Qwen-7B-Chat" model_name_map["magicoder"]="ise-uiuc/Magicoder-S-DS-6.7B" model_name_map["whisper"]="openai/whisper-tiny" model_name_map["phi2"]="microsoft/phi-2" +model_name_map["qwen-1_5"]="Qwen/Qwen1.5-7B-Chat" function main() { conda_env="$1" @@ -250,6 +251,10 @@ function main() { quant_script="./build/bin/quant_qwen" convert_script="${convert_script}/convert_qwen.py" infer_cmd="./build/bin/run_qwen" + elif [[ "${model}" == "qwen-1_5" ]]; then + quant_script="./build/bin/quant_qwen" + convert_script="${convert_script}/convert_qwen.py" + infer_cmd="./build/bin/run_qwen" elif [[ "${model}" == "magicoder" ]]; then quant_script="./build/bin/quant_llama" convert_script="${convert_script}/convert_llama.py" From 51ac439468042ce179e7ff691e49e2c3d0e89958 Mon Sep 17 00:00:00 2001 From: intellinjun Date: Mon, 4 Mar 2024 15:25:49 +0800 Subject: [PATCH 11/12] update support model Signed-off-by: intellinjun --- docs/supported_models.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/supported_models.md b/docs/supported_models.md index cab1d1f38..e4edb685b 100644 --- a/docs/supported_models.md +++ b/docs/supported_models.md @@ -182,11 +182,13 @@ Neural Speed supports the following models: Qwen-7B, - Qwen-14B + Qwen-14B, + Qwen1.5-7B, + Qwen1.5-0.5B ✅ - ✅ + ✅ Latest @@ -355,7 +357,8 @@ Neural Speed supports the following models: ✅ - Qwen-7B-Chat + Qwen-7B-Chat, + Qwen1.5-7B-Chat-GGUF ✅ ✅ ✅ From 95d0b6036096b9622c50f04587906747b8ed2439 Mon Sep 17 00:00:00 2001 From: intellinjun Date: Mon, 4 Mar 2024 15:54:48 +0800 Subject: [PATCH 12/12] update support model Signed-off-by: intellinjun --- docs/supported_models.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/supported_models.md b/docs/supported_models.md index e4edb685b..0434ed186 100644 --- a/docs/supported_models.md +++ b/docs/supported_models.md @@ -305,6 +305,14 @@ Neural Speed supports the following models: ✅ + + TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUFF, + ✅ + ✅ + ✅ + ✅ + + TheBloke/SOLAR-10.7B-Instruct-v1.0-GGUF ✅