diff --git a/docs/supported_models.md b/docs/supported_models.md index cab1d1f38..84c443010 100644 --- a/docs/supported_models.md +++ b/docs/supported_models.md @@ -171,14 +171,15 @@ Neural Speed supports the following models: 4.33.1 - Mistral-7B - ✅ - ✅ - ✅ - ✅ + Mistral-7B, + Mixtral-8x7B ✅ + + ✅ - 4.34.0 or newer + + + 4.36.0 or newer Qwen-7B, diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py index bd2a37633..c8ecab1f3 100644 --- a/neural_speed/__init__.py +++ b/neural_speed/__init__.py @@ -72,6 +72,8 @@ def __import_package(self, model_type): import neural_speed.phi_cpp as cpp_model elif model_type == "whisper": import neural_speed.whisper_cpp as cpp_model + elif model_type == "mixtral": + import neural_speed.mixtral_cpp as cpp_model else: raise TypeError("Unsupported model type {}!".format(model_type)) self.module = cpp_model diff --git a/neural_speed/application/CMakeLists.txt b/neural_speed/application/CMakeLists.txt index 3782e5e1d..46a3c44cb 100644 --- a/neural_speed/application/CMakeLists.txt +++ b/neural_speed/application/CMakeLists.txt @@ -67,6 +67,7 @@ compile_quant(quant_chatglm quant_model.cpp chatglm chatglm) compile_quant(quant_chatglm2 quant_model.cpp chatglm2 chatglm2) compile_quant(quant_baichuan quant_model.cpp baichuan baichuan) compile_quant(quant_mistral quant_model.cpp mistral llama) +compile_quant(quant_mixtral quant_model.cpp mixtral llama) compile_quant(quant_qwen quant_model.cpp qwen qwen) compile_quant(quant_phi quant_model.cpp phi phi) compile_quant(quant_whisper quant_whisper.cpp whisper whisper) @@ -93,6 +94,7 @@ set(mymap_mistral 14) set(mymap_qwen 15) set(mymap_phi 16) set(mymap_whisper 17) +set(mymap_mixtral 18) @@ -129,6 +131,7 @@ compile_run(run_baichuan main_run.cpp main_pybind.cpp baichuan baichuan) compile_run(run_mistral main_run.cpp main_pybind.cpp mistral llama) compile_run(run_qwen main_run.cpp main_pybind.cpp qwen qwen) compile_run(run_phi main_run.cpp main_pybind.cpp phi phi) +compile_run(run_mixtral main_run.cpp main_pybind.cpp mixtral llama) # speech recognition compile_run(run_whisper audio_run.cpp whisper_pybind.cpp whisper whisper) diff --git a/neural_speed/application/main_pybind.cpp b/neural_speed/application/main_pybind.cpp index 3da5aaaab..4d064854e 100644 --- a/neural_speed/application/main_pybind.cpp +++ b/neural_speed/application/main_pybind.cpp @@ -898,6 +898,10 @@ PYBIND11_MODULE(phi_cpp, m) PYBIND11_MODULE(whisper_cpp, m) +#elif MODEL_NAME_ID == 18 + +PYBIND11_MODULE(mixtral_cpp, m) + #endif { m.doc() = "cpp model python binding"; diff --git a/neural_speed/convert/convert_baichuan.py b/neural_speed/convert/convert_baichuan.py index d7fdca3af..9303df2c1 100644 --- a/neural_speed/convert/convert_baichuan.py +++ b/neural_speed/convert/convert_baichuan.py @@ -156,6 +156,8 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", hparams["intermediate_size"])) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor diff --git a/neural_speed/convert/convert_bloom.py b/neural_speed/convert/convert_bloom.py index bb7e8dd43..7bd263d52 100644 --- a/neural_speed/convert/convert_bloom.py +++ b/neural_speed/convert/convert_bloom.py @@ -99,6 +99,8 @@ def main(args_in: Optional[List[str]] = None) -> 
None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor diff --git a/neural_speed/convert/convert_chatglm.py b/neural_speed/convert/convert_chatglm.py index 92bb8734a..124d19ec3 100644 --- a/neural_speed/convert/convert_chatglm.py +++ b/neural_speed/convert/convert_chatglm.py @@ -393,6 +393,8 @@ def chatglm2_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): fout.write(struct.pack("i", hparams["multi_query_group_num"])) fout.write(struct.pack("i", hparams["ffn_hidden_size"])) fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used fout.write(struct.pack("f", hparams.get("layernorm_epsilon", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor diff --git a/neural_speed/convert/convert_dolly.py b/neural_speed/convert/convert_dolly.py index 61a0bb0ac..0d90c29fb 100644 --- a/neural_speed/convert/convert_dolly.py +++ b/neural_speed/convert/convert_dolly.py @@ -113,9 +113,14 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor + fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled + fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings + fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) diff --git a/neural_speed/convert/convert_falcon.py b/neural_speed/convert/convert_falcon.py index d1de3ce59..3c88850f5 100644 --- a/neural_speed/convert/convert_falcon.py +++ b/neural_speed/convert/convert_falcon.py @@ -107,6 +107,8 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor diff --git a/neural_speed/convert/convert_gptj.py b/neural_speed/convert/convert_gptj.py index 5b2b72aa3..d41b0cf5f 100644 --- a/neural_speed/convert/convert_gptj.py +++ b/neural_speed/convert/convert_gptj.py @@ -99,6 +99,8 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor diff --git 
a/neural_speed/convert/convert_gptneox.py b/neural_speed/convert/convert_gptneox.py index 6fc57d6c2..0dade0563 100644 --- a/neural_speed/convert/convert_gptneox.py +++ b/neural_speed/convert/convert_gptneox.py @@ -113,6 +113,8 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py index 41f405bc6..5470035db 100644 --- a/neural_speed/convert/convert_llama.py +++ b/neural_speed/convert/convert_llama.py @@ -1089,6 +1089,8 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None: self.fout.write(struct.pack("i", params.ffn_hidden_size)) self.fout.write(struct.pack("i", 0)) + self.fout.write(struct.pack("i", 0)) # n_experts + self.fout.write(struct.pack("i", 0)) # n_expert_used self.fout.write(struct.pack("f", params.rms_norm_eps)) self.fout.write(struct.pack("f", params.rope_theta)) self.fout.write(struct.pack("f", params.rope_scale)) diff --git a/neural_speed/convert/convert_mistral.py b/neural_speed/convert/convert_mistral.py index cc8e0b46b..ccc65bad7 100644 --- a/neural_speed/convert/convert_mistral.py +++ b/neural_speed/convert/convert_mistral.py @@ -1062,6 +1062,9 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None: self.fout.write(struct.pack("i", 0)) self.fout.write(struct.pack("i", params.ffn_hidden_size)) self.fout.write(struct.pack("i", 0)) + + self.fout.write(struct.pack("i", 0)) # n_experts + self.fout.write(struct.pack("i", 0)) # n_expert_used self.fout.write(struct.pack("f", params.rms_norm_eps)) self.fout.write(struct.pack("f", params.rope_theta)) self.fout.write(struct.pack("f", params.rope_scale)) diff --git a/neural_speed/convert/convert_mixtral.py b/neural_speed/convert/convert_mixtral.py new file mode 100644 index 000000000..1c8eded16 --- /dev/null +++ b/neural_speed/convert/convert_mixtral.py @@ -0,0 +1,1348 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
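+# convert_mixtral.py: convert a Mixtral-8x7B checkpoint (Hugging Face hub id or
+# local directory) into the NE/GGJT binary consumed by neural_speed. Compared
+# with convert_mistral.py, it additionally maps each layer's block_sparse_moe
+# expert weights (w1/w2/w3 for experts 0..7, plus the router gate) onto the
+# ffn_gate.{j}/ffn_down.{j}/ffn_up.{j} and ffn_gate_inp tensors, and writes
+# n_experts=8, n_expert_used=2 into the file header.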
+import argparse +import concurrent.futures +import copy +import enum +import faulthandler +import functools +import io +import itertools +import json +import math +import mmap +import pickle +import re +import signal +import struct +import sys +import zipfile +from abc import ABCMeta, abstractmethod +from dataclasses import dataclass +from pathlib import Path +from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, + Union) + +import numpy as np +from sentencepiece import SentencePieceProcessor # type: ignore +from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig + +if TYPE_CHECKING: + from typing_extensions import TypeAlias + +if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): + faulthandler.register(signal.SIGUSR1) + +NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]' + + +@dataclass(frozen=True) +class UnquantizedDataType: + name: str + + +DT_F16 = UnquantizedDataType('F16') +DT_F32 = UnquantizedDataType('F32') +DT_I32 = UnquantizedDataType('I32') +DT_BF16 = UnquantizedDataType('BF16') +DT_BOOL = UnquantizedDataType('BOOL') + + +@dataclass(frozen=True) +class QuantizedDataType: + groupsize: int + have_addends: bool + have_g_idx: bool + + +DT_Q4_0 = QuantizedDataType(groupsize=32, have_addends=False, have_g_idx=False) +DT_Q4_1 = QuantizedDataType(groupsize=32, have_addends=True, have_g_idx=False) + +DataType = Union[UnquantizedDataType, QuantizedDataType] + +DATA_TYPE_TO_FTYPE: Dict[DataType, int] = {DT_F32: 0, DT_F16: 1, DT_Q4_0: 2, DT_Q4_1: 3, DT_BOOL: 4} + +FTYPE_TO_DATA_TYPE: Dict[int, DataType] = \ + {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()} + +DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = { + DT_BF16: np.dtype(np.uint16), + DT_F16: np.dtype(np.float16), + DT_F32: np.dtype(np.float32), + DT_I32: np.dtype(np.int32), + DT_BOOL: np.dtype(np.bool_) +} + +NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \ + {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()} + + +class NEFileType(enum.Enum): + AllF32 = 0 + MostlyF16 = 1 # except 1d tensors + MostlyQ4_0 = 2 # except 1d tensors + MostlyQ4_1 = 3 # except 1d tensors + PerLayerIsQ4_1 = 4 # but tok_embeddings.weight and output.weight are F16 + + def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType: + if len(tensor.shape) == 1: + # 1D tensors are always F32. 
+ return DT_F32 + elif self == NEFileType.AllF32: + return DT_F32 + elif self == NEFileType.MostlyF16: + return DT_F16 + elif self == NEFileType.MostlyQ4_0: + return DT_Q4_0 + elif self == NEFileType.MostlyQ4_1: + return DT_Q4_1 + elif self == NEFileType.PerLayerIsQ4_1: + if name in ('output.weight', 'tok_embeddings.weight'): + return DT_F16 + else: + return DT_Q4_1 + else: + raise ValueError(self) + + +def make_tensors_list() -> List[str]: + ret = [ + 'tok_embeddings.weight', + 'norm.weight', + 'output.weight', + ] + for i in range(80): # maximum number of layer + ret += [ + f'layers.{i}.attention.wq.weight', + f'layers.{i}.attention.wk.weight', + f'layers.{i}.attention.wv.weight', + f'layers.{i}.attention.wo.weight', + f'layers.{i}.attention_norm.weight', + f'layers.{i}.ffn_norm.weight', + f'layers.{i}.ffn_gate_inp.weight', + ] + for j in range(8): + ret += [ + f'layers.{i}.ffn_gate.{j}.weight', + f'layers.{i}.ffn_down.{j}.weight', + f'layers.{i}.ffn_up.{j}.weight', + ] + return ret + + +TENSORS_LIST = make_tensors_list() +TENSORS_SET = set(TENSORS_LIST) + + +@dataclass +class Params: + n_vocab: int + n_embd: int + n_mult: int + n_head: int + n_layer: int + n_head_kv: int + ffn_hidden_size: int + rms_norm_eps: float + rope_theta: float + rope_scale: float + + @staticmethod + def guessed(model: 'LazyModel') -> 'Params': + n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model[ + "tok_embeddings.weight"].shape + + return Params( + n_vocab=n_vocab, + n_embd=n_embd, + n_mult=256, + n_head=n_embd // 128, + n_head_kv=n_embd // 128, + f_norm_eps=1e-5, + n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model), + ) + + @staticmethod + def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params': + config = json.load(open(config_path)) + + n_vocab = config["vocab_size"] + n_embd = config["hidden_size"] + n_layer = config["num_hidden_layers"] + n_head = config["num_attention_heads"] + n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head + ffn_hidden_size = config["intermediate_size"] + rms_norm_eps = config["rms_norm_eps"] + rope_theta = config["rope_theta"] if "rope_theta" in config else 10000 + rope_scale = 1 + if "rope_scaling" in config and config["rope_scaling"] is not None: + rope_scale = config["rope_scaling"]["factor"] if "factor" in config["rope_scaling"] else 1 + + + return Params( + n_vocab=n_vocab, + n_embd=n_embd, + n_layer=n_layer, + n_mult=256, + n_head=n_head, + n_head_kv=n_head_kv, + ffn_hidden_size=ffn_hidden_size, + rms_norm_eps=rms_norm_eps, + rope_theta=rope_theta, + rope_scale=rope_scale, + ) + + # LLaMA v2 70B params.json + # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, + # "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} + @staticmethod + def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params': + config = json.load(open(config_path)) + + n_vocab = config["vocab_size"] if "vocab_size" in config else -1 + n_embd = config["dim"] + n_layer = config["n_layers"] + n_head = config["n_heads"] + n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head + ffn_hidden_size = config["intermediate_size"] + # hack to determine LLaMA v1 vs v2 vs CodeLlama + + if n_vocab == -1: + n_vocab = model["tok_embeddings.weight"].shape[0] + + return Params( + n_vocab=n_vocab, + n_embd=n_embd, + n_mult=256, + n_layer=n_layer, + n_head=n_head, + n_head_kv=n_head_kv, 
+ ffn_hidden_size=ffn_hidden_size, + ) + + @staticmethod + def load(model: 'ModelPlus') -> 'Params': + hf_config_path = model.paths[0].parent / "config.json" + orig_config_path = model.paths[0].parent / "params.json" + + if hf_config_path.exists(): + params = Params.loadHFTransformerJson(model.model, hf_config_path) + elif orig_config_path.exists(): + params = Params.loadOriginalParamsJson(model.model, orig_config_path) + elif model.format != 'none': + params = Params.guessed(model.model) + else: + raise ValueError('Cannot guess params when model format is none') + + params.path_model = model.paths[0].parent + + return params + + +class SentencePieceVocab: + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None: + self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) + added_tokens: Dict[str, int] + if fname_added_tokens is not None: + added_tokens = json.load(open(fname_added_tokens, encoding='utf-8')) + else: + added_tokens = {} + vocab_size: int = self.sentencepiece_tokenizer.vocab_size() + expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) + actual_ids = sorted(added_tokens.values()) + if expected_ids != actual_ids: + print(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}") + added_tokens = {} + items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) + self.added_tokens_list = [text for (text, idx) in items] + self.vocab_size_base: int = vocab_size + self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer + self.fname_added_tokens = fname_added_tokens + + def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]: + tokenizer = self.sentencepiece_tokenizer + for i in range(tokenizer.vocab_size()): + text: bytes + if tokenizer.is_unknown(i): + text = " \u2047 ".encode("utf-8") + elif tokenizer.is_control(i): + text = b"" + elif tokenizer.is_byte(i): + piece = tokenizer.id_to_piece(i) + if len(piece) != 6: + raise Exception(f"Invalid token: {piece}") + byte_value = int(piece[3:-1], 16) + text = struct.pack("B", byte_value) + else: + text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") + score: float = tokenizer.get_score(i) + yield text, score + + def added_tokens(self) -> Iterable[Tuple[bytes, float]]: + for text in self.added_tokens_list: + score = -1000.0 + yield text.encode("utf-8"), score + + def all_tokens(self) -> Iterable[Tuple[bytes, float]]: + yield from self.sentencepiece_tokens() + yield from self.added_tokens() + + def __repr__(self) -> str: + return f"" + + +class NEVocab: + def __init__(self, tokens: List[Tuple[bytes, float]]): + self.tokens = tokens + self.vocab_size = len(tokens) + + def all_tokens(self) -> Iterable[Tuple[bytes, float]]: + return self.tokens + + def __repr__(self) -> str: + return f"" + + +Vocab = Union[SentencePieceVocab, NEVocab] + + +def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: + if n_head_kv is not None and n_head != n_head_kv: + n_head //= n_head_kv + return (weights.reshape(n_head_kv, 2, weights.shape[0] // n_head_kv // 2, + *weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape)) + + +def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], + g_idx: Optional[NDArray]) -> NDArray: + # First reinterpret each row from a list of int32s containing 8 values each + # to a list of uint8s containing 2 values each. 
+ qvalues_pack8 = qvalues_pack32.view(np.uint8) + + # Then split out the two values per int8 (which requires an actual + # conversion because numpy doesn't natively support int4s). + qvalues = np.zeros([qvalues_pack8.shape[0], qvalues_pack8.shape[1] * 2], dtype=np.uint8) + qvalues[:, 0::2] = qvalues_pack8 & 0xf + qvalues[:, 1::2] = qvalues_pack8 >> 4 + + assert addends is None or addends.shape == scales.shape + assert qvalues.shape[0] == scales.shape[0] + assert qvalues.shape[1] % scales.shape[1] == 0 + if g_idx is None: + repeat_count = qvalues.shape[1] // scales.shape[1] + scales = scales[:, :, np.newaxis] + if addends is not None: + addends = addends[:, :, np.newaxis] + # Reshape so that the below computation broadcasts over scales and addends: + qvalues.shape = (qvalues.shape[0], scales.shape[1], int(repeat_count)) + else: + # In this case the scale and addend is selected for each column by g_idx: + assert addends is not None + scales = scales[:, g_idx] + addends = addends[:, g_idx] + if addends is None: + # Q4_0 + qvalues = qvalues.view(np.int8) + qvalues -= 8 + # And do the actual 'value = scale * qvalue + addend' computation. + values = scales * qvalues + if addends is not None: + values += addends + if g_idx is None: + values.shape = (values.shape[0], values.shape[1] * values.shape[2]) + return values + + +class Tensor(metaclass=ABCMeta): + data_type: DataType + + @abstractmethod + def astype(self, data_type: DataType) -> 'Tensor': + ... + + @abstractmethod + def permute(self, n_head: int, kv_head: int) -> 'Tensor': + ... + + @abstractmethod + def to_ne(self) -> 'NECompatibleTensor': + ... + + +def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray: + assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}" + fp32_arr = bf16_arr.astype(np.uint32) << 16 + return fp32_arr.view(np.float32) + + +class UnquantizedTensor(Tensor): + def __init__(self, ndarray: NDArray) -> None: + assert isinstance(ndarray, np.ndarray) + self.ndarray = ndarray + self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype] + + def astype(self, data_type: DataType) -> Tensor: + dtype = DATA_TYPE_TO_NUMPY[data_type] + if self.data_type == DT_BF16: + self.ndarray = bf16_to_fp32(self.ndarray) + return UnquantizedTensor(self.ndarray.astype(dtype)) + + def to_ne(self) -> 'UnquantizedTensor': + return self + + def permute(self, n_head: int, kv_head: int) -> 'UnquantizedTensor': + return UnquantizedTensor(permute(self.ndarray, n_head, kv_head)) + + +def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray: + tensor = lazy_tensor.load() + assert isinstance(tensor, UnquantizedTensor) + + # double-check: + actual_shape = list(tensor.ndarray.shape) + assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape) + if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype: + if convert: + tensor.ndarray = tensor.ndarray.astype(expected_dtype) + else: + raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}') + + return tensor.ndarray + + +class NEQuantizedTensor(Tensor): + data_type: QuantizedDataType + + def __init__(self, ndarray: NDArray, shape: List[int], data_type: DataType) -> None: + rows, columns = shape + assert data_type in (DT_Q4_1, DT_Q4_0) # for now + assert isinstance(data_type, QuantizedDataType) # redundant, but mypy complains without this + assert columns % data_type.groupsize == 0 + words_in_block = 6 if data_type == DT_Q4_1 else 5 + 
self.ndarray = ndarray.view(dtype=np.uint32).reshape((rows, columns // data_type.groupsize, words_in_block)) + self.shape = shape[:] + self.data_type = data_type + + def astype(self, data_type: DataType) -> Tensor: + if data_type == self.data_type: + return self + scales = self.ndarray[:, :, 0].view(np.float32) + if self.data_type.have_addends: + addends = self.ndarray[:, :, 1].view(np.float32) + else: + addends = None + qweights = self.ndarray[:, :, -4:].reshape([self.shape[0], self.shape[1] // 8]) + + dq = dequantize_q4(qweights, scales, addends, g_idx=None) + return UnquantizedTensor(dq).astype(data_type) + + def to_ne(self) -> 'NEQuantizedTensor': + return self + + def permute(self, n_head: int, kv_head: int) -> 'NEQuantizedTensor': + return NEQuantizedTensor(permute(self.ndarray, n_head, kv_head), self.shape, self.data_type) + + +NECompatibleTensor = Union[UnquantizedTensor, NEQuantizedTensor] + + +class DeferredPermutedTensor(Tensor): + def __init__(self, base: Tensor, n_head: int, kv_head: int) -> None: + self.base = base + self.n_head = n_head + self.kv_head = kv_head + self.data_type = self.base.data_type + + def astype(self, data_type: DataType) -> Tensor: + return self.base.astype(data_type).permute(self.n_head, self.kv_head) + + def to_ne(self) -> NECompatibleTensor: + return self.base.to_ne().permute(self.n_head, self.kv_head) + + def permute(self, n_head: int, kv_head: int) -> Tensor: + raise Exception("shouldn't permute twice") + + +class GPTQForLLaMaQuantizedTensor(Tensor): + def __init__(self, model: 'LazyModel', namebase: str) -> None: + qweight = load_unquantized(model[f"{namebase}.qweight"], np.int32) + scales = load_unquantized(model[f"{namebase}.scales"], np.float32, convert=True) + + bias = model.get(f"{namebase}.bias") + if bias is not None: + # Q4_1 does not support bias; good thing the bias is always all zeros. + assert not np.any(load_unquantized(bias)) + + if f"{namebase}.zeros" in model: + zeros = load_unquantized(model[f"{namebase}.zeros"], np.float32) + else: + qzeros = load_unquantized(model[f"{namebase}.qzeros"], np.int32) + assert qzeros.dtype == np.int32 + zeros = dequantize_q4(qzeros, scales, scales, g_idx=None) + assert zeros.dtype == np.float32 + + assert zeros.shape == scales.shape + + # Output is transposed compared to the input, and addends have their sign flipped. + # Scales and zeros similarly must be transposed but only for newer + # versions of GPTQ-for-LLaMa; the older versions can be identified by + # having shape (n_embd, 1). + qweight = qweight.T + if scales.shape[1] != 1: + scales = scales.T + zeros = zeros.T + + # Output also has signs flipped for the addends. 
+ self.qweight = qweight + self.scales = scales + self.addends = -zeros + + self.g_idx: Optional[NDArray] + if f"{namebase}.g_idx" in model: + self.g_idx = load_unquantized(model[f"{namebase}.g_idx"], np.int32) + assert self.g_idx.shape == (qweight.shape[1] * 8, ) + else: + self.g_idx = None + + self.shape = [self.qweight.shape[0], self.qweight.shape[1] * 8] + self.data_type = QuantizedDataType(groupsize=self.groupsize(), + have_addends=True, + have_g_idx=(self.g_idx is not None)) + + def inspect(self, row: int, col: int) -> None: + '''For debugging.''' + qweight = (self.qweight[row, col // 8] >> (4 * (col & 7))) & 0xf + if self.g_idx is not None: + group = self.g_idx[col] + else: + group = int(col // self.groupsize()) + scale = self.scales[row, group] + addend = self.addends[row, group] + with np.printoptions(precision=None, suppress=True): + print(f'scale:{scale} addend:{addend} qweight:{qweight}') + print('possible values:', np.arange(16) * scale + addend) + print('actual value:', qweight * scale + addend) + + def astype(self, data_type: DataType) -> Tensor: + if isinstance(data_type, QuantizedDataType): + assert self.g_idx is None and data_type.have_addends is True and data_type.have_g_idx is False + return self.regroup(data_type.groupsize) + + dequantized = dequantize_q4(np.ascontiguousarray(self.qweight), self.scales, self.addends, self.g_idx) + return UnquantizedTensor(dequantized).astype(data_type) + + def groupsize(self) -> int: + assert self.addends.shape == self.scales.shape + assert self.shape[1] % self.scales.shape[1] == 0 + return self.shape[1] // self.scales.shape[1] + + def regroup(self, new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor': + # Old versions of GPTQ-for-LLaMa shared scales and addends between all the + # columns in a row. Newer versions share them between every set of N + # columns in a row, where N is the `groupsize` parameter, usually 128. The + # output format shares them between every set of 32 columns. To handle + # this, duplicate scales and addends for every smaller group. + # (In the above, 'row' and 'column' are in the sense of the output.) + assert self.g_idx is None + old_groupsize = self.groupsize() + assert old_groupsize >= new_groupsize and old_groupsize % new_groupsize == 0, old_groupsize + ret = copy.copy(self) + ret.addends = self.addends.repeat(old_groupsize // new_groupsize, axis=1) + ret.scales = self.scales.repeat(old_groupsize // new_groupsize, axis=1) + ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False) + return ret + + def permute(self, n_head: int, kv_head: int) -> Tensor: + return DeferredPermutedTensor(self, n_head, kv_head) + + def to_ne(self) -> NEQuantizedTensor: + # The output format looks like this: + # For each row: + # For each group of 32 columns: + # - addend (float32, 4 bytes) + # - scale (float32, 4 bytes) + # - weights (int4 * 32, 16 bytes) + + if self.groupsize() != 32: + raise Exception("should have been regrouped before converting to ne") + + # Since the output format is mixed between integers and floats, we have + # to hackily view the floats as int32s just so numpy will let us + # concatenate them. + addends_view = self.addends.view(dtype=np.int32)[:, :, np.newaxis] + scales_view = self.scales.view(dtype=np.int32)[:, :, np.newaxis] + + # Split into groups of 4 columns (i.e. 
32 columns of quantized data): + grouped = self.qweight.reshape([self.qweight.shape[0], self.qweight.shape[1] // 4, 4]) + + # And concatenate: + grouped = np.concatenate([scales_view, addends_view, grouped], axis=2, casting='no') + + return NEQuantizedTensor(grouped, self.shape, DT_Q4_1) + + +@dataclass +class LazyTensor: + _load: Callable[[], Tensor] + shape: List[int] + data_type: DataType + description: str + + def load(self) -> Tensor: + ret = self._load() + assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description) + return ret + + def astype(self, data_type: DataType) -> 'LazyTensor': + self.validate_conversion_to(data_type) + + def load() -> Tensor: + return self.load().astype(data_type) + + return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}') + + def validate_conversion_to(self, data_type: DataType) -> None: + if data_type == self.data_type: + return + if isinstance(data_type, QuantizedDataType): + if not isinstance(self.data_type, QuantizedDataType): + raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})") + if self.data_type.have_g_idx: + sys.stderr.write( + "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), which is not yet natively \ + supported by NE. For now you can still convert this model by passing `--outtype f16` to \ + dequantize, but that will result in a much larger output file for no quality benefit.\n" + ) + sys.exit(1) + assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends + + +LazyModel = Dict[str, LazyTensor] + + +@dataclass +class ModelPlus: + model: LazyModel + paths: List[Path] # Where this was read from. + format: Literal['ne', 'torch', 'safetensors'] + vocab: Optional[Vocab] # For NE models (which have vocab built in), the vocab. + + +def merge_sharded(models: List[LazyModel]) -> LazyModel: + # Original LLaMA models have each file contain one part of each tensor. + # Use a dict instead of a set to preserve order. + names = {name: None for model in models for name in model} + + def convert(name: str) -> LazyTensor: + lazy_tensors: List[LazyTensor] = [model[name] for model in models] + if len(lazy_tensors) == 1: + # only one file; don't go through this procedure since there might + # be quantized tensors + return lazy_tensors[0] + if len(lazy_tensors[0].shape) == 1: + # the tensor is just duplicated in every file + return lazy_tensors[0] + if name.startswith('tok_embeddings.') or \ + name.endswith('.attention.wo.weight') or \ + name.endswith('.feed_forward.w2.weight'): + # split by columns + axis = 1 + else: + # split by rows + axis = 0 + concatenated_shape = list(lazy_tensors[0].shape) + concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors) + + def load() -> UnquantizedTensor: + ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors] + concatenated: NDArray = np.concatenate(ndarrays, axis=axis) + return UnquantizedTensor(concatenated) + + description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]' + return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description) + + return {name: convert(name) for name in names} + + +def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus: + formats = set(mp.format for mp in models_plus) + assert len(formats) == 1, "different formats?" + format = formats.pop() + paths = [path for mp in models_plus for path in mp.paths] + # Use the first non-None vocab, if any. 
+ try: + vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None) + except StopIteration: + vocab = None + + if any("model.embed_tokens.weight" in mp.model for mp in models_plus): + # Transformers models put different tensors in different files, but + # don't split individual tensors between files. + model: LazyModel = {} + for mp in models_plus: + model.update(mp.model) + else: + model = merge_sharded([mp.model for mp in models_plus]) + + return ModelPlus(model, paths, format, vocab) + + +def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor: + def load() -> Tensor: + return lazy_tensor.load().permute(n_head, n_head_kv) + + return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, + f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) + + +def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel: + out: LazyModel = {} + out["tok_embeddings.weight"] = model["model.embed_tokens.weight"] + out["norm.weight"] = model["model.norm.weight"] + out["output.weight"] = model["lm_head.weight"] + for i in itertools.count(): + if f"model.layers.{i}.self_attn.q_proj.weight" not in model: + break + out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], + params.n_head, params.n_head) + out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], + params.n_head, params.n_head_kv) + out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] + out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"] + for j in range(8): + out[f"layers.{i}.ffn_gate.{j}.weight"] = model[f"model.layers.{i}.block_sparse_moe.experts.{j}.w1.weight"] + out[f"layers.{i}.ffn_down.{j}.weight"] = model[f"model.layers.{i}.block_sparse_moe.experts.{j}.w2.weight"] + out[f"layers.{i}.ffn_up.{j}.weight"] = model[f"model.layers.{i}.block_sparse_moe.experts.{j}.w3.weight"] + out[f"layers.{i}.ffn_gate_inp.weight"] = model[f"model.layers.{i}.block_sparse_moe.gate.weight"] + out[f"layers.{i}.attention_norm.weight"] = model[f"model.layers.{i}.input_layernorm.weight"] + out[f"layers.{i}.ffn_norm.weight"] = model[f"model.layers.{i}.post_attention_layernorm.weight"] + return out + + +def handle_quantization(model: LazyModel) -> LazyModel: + '''Convert a model with entries for 'foo.qweight', 'foo.scales', etc. + (which resolve to UnquantizedTensors with the raw data) to one with entries + for 'foo.weight' (which resolve to QuantizedTensors). + ''' + def convert(name: str) -> Tuple[str, LazyTensor]: + if name.endswith(".qweight"): + namebase = name.rsplit('.', 1)[0] + orig_name = namebase + ".weight" + + lazy_tensor = model[name] + assert len(lazy_tensor.shape) == 2 + real_shape = [lazy_tensor.shape[1], lazy_tensor.shape[0] * 8] + + # Calculate type. This replicates the logic in + # GPTQForLLaMaQuantizedTensor (which is executed when the modelis + # actually loaded). 
+ lazy_scales = model[f"{namebase}.scales"] + scales_width = 1 if lazy_scales.shape[1] == 1 else lazy_scales.shape[0] + assert real_shape[1] % scales_width == 0 + groupsize = real_shape[1] // scales_width + have_g_idx = f"{namebase}.g_idx" in model + data_type = QuantizedDataType(groupsize=groupsize, have_addends=True, have_g_idx=have_g_idx) + + def load() -> Tensor: + return GPTQForLLaMaQuantizedTensor(model, namebase) + + return (orig_name, LazyTensor(load, real_shape, data_type, '[quantized]')) + else: + return (name, model[name]) + + return dict(convert(name) for name in model) + + +# Functionality that simulates `torch.load` but where individual tensors are +# only loaded into memory on demand, not all at once. +# PyTorch can't do this natively as of time of writing: +# - https://github.com/pytorch/pytorch/issues/64327 +# This allows us to de-shard without multiplying RAM usage, and also +# conveniently drops the PyTorch dependency (though we still need numpy). + + +@dataclass +class LazyStorageKind: + data_type: DataType + + +@dataclass +class LazyStorage: + load: Callable[[int, int], NDArray] + kind: LazyStorageKind + description: str + + +class LazyUnpickler(pickle.Unpickler): + def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile): + super().__init__(fp) + self.data_base_path = data_base_path + self.zip_file = zip_file + + def persistent_load(self, pid: Any) -> Any: + assert pid[0] == 'storage' + assert isinstance(pid[1], LazyStorageKind) + data_type = pid[1].data_type + filename_stem = pid[2] + filename = self.data_base_path + '/' + filename_stem + info = self.zip_file.getinfo(filename) + + def load(offset: int, elm_count: int) -> NDArray: + dtype = DATA_TYPE_TO_NUMPY.get(data_type) + if dtype is None: + raise Exception("tensor stored in unsupported format") + fp = self.zip_file.open(info) + fp.seek(offset * dtype.itemsize) + size = elm_count * dtype.itemsize + data = fp.read(size) + assert len(data) == size + return np.frombuffer(data, dtype) + + description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}' + return LazyStorage(load=load, kind=pid[1], description=description) + +# @staticmethod + + def lazy_rebuild_tensor_v2( + storage: Any, + storage_offset: Any, + size: Any, + stride: Any, # pyright: ignore[reportSelfClsParameterName] + requires_grad: Any, + backward_hooks: Any, + metadata: Any = None) -> LazyTensor: + assert isinstance(storage, LazyStorage) + + def load() -> UnquantizedTensor: + elm_count = stride[0] * size[0] + return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size)) + + description = f'pickled storage_offset={storage_offset} in {storage.description}' + return LazyTensor(load, list(size), storage.kind.data_type, description) + + # @staticmethod + def rebuild_from_type_v2(func, new_type, args, state): + return func(*args) + + CLASSES: Dict[Any, Any] = { + ('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2, + ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2, + ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16), + ('torch', 'HalfStorage'): LazyStorageKind(DT_F16), + ('torch', 'FloatStorage'): LazyStorageKind(DT_F32), + ('torch', 'IntStorage'): LazyStorageKind(DT_I32), + ('torch', 'BoolStorage'): LazyStorageKind(DT_BOOL), + ('torch', 'Tensor'): LazyTensor, + } + + def find_class(self, module: str, name: str) -> Any: + if not module.startswith('torch'): + return super().find_class(module, name) + return self.CLASSES[(module, name)] + + 
+def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus: + zf = zipfile.ZipFile(outer_fp) + pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')] + assert len(pickle_paths) == 1, pickle_paths + pickle_fp = zf.open(pickle_paths[0], 'r') + unpickler = LazyUnpickler(pickle_fp, data_base_path=pickle_paths[0][:-4], zip_file=zf) + model = unpickler.load() + as_dict = dict(model.items()) + return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None) + + +SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {'F16': DT_F16, 'F32': DT_F32, 'I32': DT_I32, 'BOOL': DT_BOOL, + 'BF16': DT_BF16} + +def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus: + header_size, = struct.unpack(' LazyTensor: + data_type = SAFETENSORS_DATA_TYPES[info['dtype']] + numpy_dtype = DATA_TYPE_TO_NUMPY[data_type] + shape: List[int] = info['shape'] + begin, end = info['data_offsets'] + assert 0 <= begin <= end <= len(byte_buf) + assert end - begin == math.prod(shape) * numpy_dtype.itemsize + buf = byte_buf[begin:end] + + def load() -> UnquantizedTensor: + return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) + + description = f'safetensors begin={begin} end={end} type={data_type} path={path}' + return LazyTensor(load, shape, data_type, description) + + model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'} + return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None) + + +def must_read(fp: IO[bytes], length: int) -> bytes: + ret = fp.read(length) + if len(ret) < length: + raise Exception("unexpectedly reached end of file") + return ret + + +def lazy_load_ne_file(fp: io.BufferedReader, path: Path) -> ModelPlus: + magic = must_read(fp, 4)[::-1] + if magic in (b'ggmf', b'ggjt'): + version, = struct.unpack("i", must_read(fp, 4)) + assert version == 1 + else: + assert magic == b'ne' + version = None + n_vocab, n_embd, n_mult, n_head, n_layer, rot, file_type = struct.unpack('<7i', must_read(fp, 28)) + + tokens: List[Tuple[bytes, float]] = [] + for i in range(n_vocab): + if i == 32000: + # HACK: GPT4All messed with the format without changing the magic + # number. Specifically, they changed the vocab section to contain + # `n_vocab - 1` tokens instead of `n_vocab` (i.e. omitting the + # extra pad token). Try to detect if we're reading a file like + # this. + orig_pos = fp.tell() + fp.seek(20, io.SEEK_CUR) + is_gpt4all = fp.read(21) == b'tok_embeddings.weight' + fp.seek(orig_pos) + if is_gpt4all: + break + + length, = struct.unpack("i", must_read(fp, 4)) + text = must_read(fp, length) + if magic != b'ne': + score, = struct.unpack("f", must_read(fp, 4)) + tokens.append((text, score)) + vocab = NEVocab(tokens) if magic != b'ne' else None + + model: LazyModel = {} + # Use mmap for the actual data to avoid race conditions with the file offset. 
+ off = fp.raw.tell() + mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)) + fp.raw.seek(off) # needed on Windows + + def read_tensor() -> None: # this is a function so that variables captured in `load` don't change + shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12)) + assert 0 <= shape_len <= 3 + shape: List[int] = list(struct.unpack(f"{shape_len}i", must_read(fp, 4 * shape_len))) + shape = shape[::-1] + name = must_read(fp, name_len).decode('utf-8') + data_type = FTYPE_TO_DATA_TYPE[ftype] + + if magic == b'ggjt': + fp.seek((fp.tell() + 31) & -32) + + if data_type == DT_Q4_1: + # See GPTQForLLaMaQuantizedTensor.ne_ndarray() + size = 24 * (shape[1] // 32) * shape[0] + elif data_type == DT_Q4_0: + size = 20 * (shape[1] // 32) * shape[0] + else: + numpy_dtype = DATA_TYPE_TO_NUMPY[data_type] + elm_count = math.prod(shape) + size = elm_count * numpy_dtype.itemsize + offset = fp.tell() + buf = mapped[offset:offset + size] + fp.seek(size, io.SEEK_CUR) + + def load() -> Tensor: + if isinstance(data_type, QuantizedDataType): + ndarray = np.frombuffer(buf, dtype=np.uint32) + return NEQuantizedTensor(ndarray, shape, data_type) + else: + return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) + + description = f'ne offset={offset} type={data_type} path={path}' + model[name] = LazyTensor(load, shape, data_type, description) + + while fp.read(1) != b'': + fp.seek(-1, io.SEEK_CUR) + read_tensor() + + return ModelPlus(model=model, paths=[path], format='ne', vocab=vocab) + + +@functools.lru_cache(maxsize=None) +def lazy_load_file(path: Path) -> ModelPlus: + fp = open(path, 'rb') + first8 = fp.read(8) + fp.seek(0) + if first8[:2] == b'PK': + # A zip file, i.e. PyTorch format + return lazy_load_torch_file(fp, path) + elif first8[2:4] == b'gg': + # NE format + return lazy_load_ne_file(fp, path) + elif struct.unpack(' Iterable[Out]: + '''Parallel map, but with backpressure. If the caller doesn't call `next` + fast enough, this will stop calling `func` at some point rather than + letting results pile up in memory. Specifically, there is a max of one + output value buffered per thread.''' + with concurrent.futures.ThreadPoolExecutor() as executor: + futures: List[concurrent.futures.Future[Out]] = [] + items_rev = list(iterable)[::-1] + for i in range(min(concurrency, len(items_rev))): + futures.append(executor.submit(func, items_rev.pop())) + while futures: + result = futures.pop(0).result() + if items_rev: + futures.append(executor.submit(func, items_rev.pop())) + yield result + + +def check_vocab_size(params: Params, vocab: Vocab) -> None: + if params.n_vocab != vocab.vocab_size: + # NEVocab comes from the same file as the model so shouldn't mismatch: + assert isinstance(vocab, SentencePieceVocab) + if params.n_vocab == vocab.vocab_size_base: + print("Ignoring added_tokens.json since model matches vocab size without it.") + vocab.added_tokens_list = [] + vocab.vocab_size = vocab.vocab_size_base + return + msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}" + if vocab.fname_added_tokens is not None: + msg += f" combined with {vocab.fname_added_tokens}" + msg += f" has {vocab.vocab_size})." + if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None: + msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})." 
+ raise Exception(msg) + + +class OutputFile: + def __init__(self, fname_out: Path) -> None: + self.fout = open(fname_out, "wb") + + def write_file_header(self, params: Params, file_type: NEFileType) -> None: + self.fout.write(b"ggjt"[::-1]) # magic + values = [ + 1, # file version + params.n_vocab, + params.n_embd, + params.n_mult, + params.n_head, + params.n_head_kv, # n_head_kv (multi_query attention) + params.n_layer, + params.n_embd // params.n_head, # rot (obsolete) + file_type.value, + ] + self.fout.write(struct.pack("i" * len(values), *values)) + self.fout.write(struct.pack("i", 0)) + self.fout.write(struct.pack("f", 0)) + self.fout.write(struct.pack("f", 0)) + self.fout.write(struct.pack("i", 0)) + self.fout.write(struct.pack("i", 0)) # word_embed_proj_dim (for opt) + self.fout.write(struct.pack("i", 0)) # do_layer_norm_before (for opt) + + self.fout.write(struct.pack("i", 0)) + self.fout.write(struct.pack("i", params.ffn_hidden_size)) + self.fout.write(struct.pack("i", 0)) + self.fout.write(struct.pack("i", 8)) + self.fout.write(struct.pack("i", 2)) + self.fout.write(struct.pack("f", params.rms_norm_eps)) + self.fout.write(struct.pack("f", params.rope_theta)) + self.fout.write(struct.pack("f", params.rope_scale)) + + self.fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled + self.fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings + self.fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) + + self.fout.write( + struct.pack("i", 1) + ) + # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json + # but bos_token_id = 1 in llama.cpp + self.fout.write(struct.pack("i", 2)) + + self.fout.write(struct.pack("i", 0)) + self.fout.write(struct.pack("i", 0)) + + def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None: + sname = name.encode('utf-8') + self.fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[data_type])) + self.fout.write(struct.pack("i" * len(shape), *shape[::-1])) + self.fout.write(sname) + self.fout.seek((self.fout.tell() + 31) & -32) + + def write_vocab(self, vocab: Vocab) -> None: + for text, score in vocab.all_tokens(): + self.fout.write(struct.pack("i", len(text))) + self.fout.write(text) + self.fout.write(struct.pack("f", score)) + + @staticmethod + def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: + of = OutputFile(fname_out) + params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0, file_type=NEFileType.AllF32) + of = OutputFile(fname_out) + of.write_file_header(params) + of.write_vocab(vocab) + of.fout.close() + + @staticmethod + def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab, file_type: NEFileType) -> None: + check_vocab_size(params, vocab) + of = OutputFile(fname_out) + of.write_file_header(params, file_type) + print("Writing vocab...") + of.write_vocab(vocab) + + def do_item(item: Tuple[str, LazyTensor]) -> NDArray: + name, lazy_tensor = item + return lazy_tensor.load().to_ne().ndarray + + ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8) + for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): + size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) + padi = len(str(len(model))) + print( + f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} |\ + type {lazy_tensor.data_type}" + ) + of.write_tensor_header(name, lazy_tensor.shape, 
lazy_tensor.data_type) + ndarray.tofile(of.fout) + of.fout.close() + + +def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> NEFileType: + wq_type = model["layers.0.attention.wq.weight"].data_type + if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)): + return NEFileType.AllF32 + if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16): + return NEFileType.MostlyF16 + if output_type_str == "q4_1" or (output_type_str is None and isinstance(wq_type, QuantizedDataType) + and wq_type.have_addends): + if isinstance(model["output.weight"].data_type, QuantizedDataType): + return NEFileType.MostlyQ4_1 + else: + return NEFileType.PerLayerIsQ4_1 + if output_type_str == "q4_0" or (output_type_str is None and isinstance(wq_type, QuantizedDataType)): + return NEFileType.MostlyQ4_0 + name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()} + raise Exception(f"Unexpected combination of types: {name_to_type}") + + +def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel: + model = handle_quantization(model) + + if "lm_head.weight" in model: + model = convert_transformers_to_orig(model, params) + model = filter_and_sort_tensors(model) + + return model + + +def convert_to_output_type(model: LazyModel, output_type: NEFileType) -> LazyModel: + return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) for (name, tensor) in model.items()} + + +def nth_multifile_path(path: Path, n: int) -> Optional[Path]: + '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return + the nth path in the model. + ''' + # Support the following patterns: + patterns: List[Tuple[str, str]] = [ + # - x.00.pth, x.01.pth, etc. + (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'), + # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc. + (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'), + # x.bin, x.bin.1, etc. + (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}') + ] + for regex, replacement in patterns: + if re.search(regex, path.name): + new_path = path.with_name(re.sub(regex, replacement, path.name)) + if new_path.exists(): + return new_path + return None + + +def find_multifile_paths(path: Path) -> List[Path]: + '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return + the whole list of paths in the model. + ''' + ret: List[Path] = [] + for i in itertools.count(): + nth_path = nth_multifile_path(path, i) + if nth_path is None: + break + ret.append(nth_path) + if not ret: + # No matches. This should only happen if the file was named, e.g., + # foo.0, and there was no file named foo. Oh well, try to process it + # as a single file. + return [path] + return ret + + +def load_some_model(path: Path) -> ModelPlus: + '''Load a model of any supported format.''' + # Be extra-friendly and accept either a file or a directory: + if path.is_dir(): + # Check if it's a set of safetensors files first + files = list(path.glob("model-00001-of-*.safetensors")) + if not files: + # Try the PyTorch patterns too, with lower priority + globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"] + files = [file for glob in globs for file in path.glob(glob)] + if not files: + # Try NE too, but with lower priority, since if both a non-NE + # model and a NE model exist in the same directory, we assume the + # latter was converted from the former. 
+ files = list(path.glob("ne-model*.bin*")) + if not files: + raise Exception(f"Can't find model in directory {path}") + if len(files) > 1: + raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}") + path = files[0] + + paths = find_multifile_paths(path) + models_plus: List[ModelPlus] = [] + for path in paths: + print(f"Loading model file {path}") + models_plus.append(lazy_load_file(path)) + + model_plus = merge_multifile_models(models_plus) + return model_plus + + +def filter_and_sort_tensors(model: LazyModel) -> LazyModel: + return {name: model[name] for name in TENSORS_LIST if name in model} + + +def load_vocab(path: Path) -> SentencePieceVocab: + # Be extra-friendly and accept either a file or a directory. Also, if it's + # a directory, it might be the model directory, and tokenizer.model might + # be in the parent of that. + if path.is_dir(): + path2 = path / "tokenizer.model" + # Use `.parent` instead of /.. to handle the symlink case better. + path3 = path.parent / "tokenizer.model" + if path2.exists(): + path = path2 + elif path3.exists(): + path = path3 + else: + raise FileNotFoundError( + f"Could not find tokenizer.model in {path} or its parent; if it's in another directory,\ + pass the directory as --vocab-dir" + ) + added_tokens_path = path.parent / "added_tokens.json" + print(f"Loading vocab file {path}") + return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) + + +def default_outfile(model_paths: List[Path], params: Params) -> Path: + namestr = { + NEFileType.AllF32: "f32", + NEFileType.MostlyF16: "f16", + NEFileType.MostlyQ4_0: "q4_0", + NEFileType.MostlyQ4_1: "q4_1", + NEFileType.PerLayerIsQ4_1: "q4_1", + }[params.file_type] + ret = model_paths[0].parent / f"ne-model-{namestr}.bin" + if ret in model_paths: + sys.stderr.write( + f"Error: Default output path ({ret}) would overwrite the input. 
Please explicitly specify \ + a path using --outfile.\n" + ) + sys.exit(1) + return ret + + +def do_dump_model(model_plus: ModelPlus) -> None: + print(f"model_plus.paths = {model_plus.paths!r}") + print(f"model_plus.format = {model_plus.format!r}") + print(f"model_plus.vocab = {model_plus.vocab!r}") + for name, lazy_tensor in model_plus.model.items(): + print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") + + +def main(args_in: Optional[List[str]] = None) -> None: + parser = argparse.ArgumentParser(description="Convert a LLaMa model to a NE compatible file") + parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") + parser.add_argument("--dump-single", + action="store_true", + help="don't convert, just show what's in a single model file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outtype", + choices=["f32", "f16", "q4_1", "q4_0"], + help="output format (default: based on input)") + parser.add_argument("--vocab-dir", + type=Path, + help="directory containing tokenizer.model, if separate from model file") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", + type=Path, + help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") + args = parser.parse_args(args_in) + + vocab: Vocab + if args.dump_single: + model_plus = lazy_load_file(args.model) + do_dump_model(model_plus) + elif args.vocab_only: + vocab = load_vocab(args.vocab_dir or args.model) + assert args.outfile, "need --outfile if using --vocab-only" + outfile = args.outfile + OutputFile.write_vocab_only(outfile, vocab) + print(f"Wrote {outfile}") + else: + if Path(args.model).is_dir(): + print("Loadding the model from the local path.") + else: + print("Loadding the model from HF.") + model = AutoModel.from_pretrained(args.model, low_cpu_mem_usage=True, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + cache_path = Path(tokenizer.vocab_file).parent + args.model = cache_path + + model_plus = load_some_model(args.model) + if args.dump: + do_dump_model(model_plus) + return + if model_plus.vocab is not None and args.vocab_dir is None: + vocab = model_plus.vocab + else: + vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent + vocab = load_vocab(vocab_dir) + model = model_plus.model + params = Params.load(model_plus) + model = do_necessary_conversions(model, params) + output_type = pick_output_type(model, args.outtype) + model = convert_to_output_type(model, output_type) + outfile = args.outfile or default_outfile(model_plus.paths, params) + OutputFile.write_all(outfile, params, model, vocab, output_type) + print(f"Wrote {outfile}") + + +if __name__ == '__main__': + main() diff --git a/neural_speed/convert/convert_mpt.py b/neural_speed/convert/convert_mpt.py index fcad21bda..cd56af41d 100644 --- a/neural_speed/convert/convert_mpt.py +++ b/neural_speed/convert/convert_mpt.py @@ -95,6 +95,8 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # 
rope_factor diff --git a/neural_speed/convert/convert_opt.py b/neural_speed/convert/convert_opt.py index 0068311e9..07b7a632a 100644 --- a/neural_speed/convert/convert_opt.py +++ b/neural_speed/convert/convert_opt.py @@ -106,6 +106,8 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor diff --git a/neural_speed/convert/convert_phi.py b/neural_speed/convert/convert_phi.py index f74fdf5d1..6e02b0b55 100644 --- a/neural_speed/convert/convert_phi.py +++ b/neural_speed/convert/convert_phi.py @@ -197,9 +197,14 @@ def phi_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor + fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled + fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings + fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else -1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else -1)) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) diff --git a/neural_speed/convert/convert_quantized_bloom.py b/neural_speed/convert/convert_quantized_bloom.py index b68d4bff9..21ce87ab6 100644 --- a/neural_speed/convert/convert_quantized_bloom.py +++ b/neural_speed/convert/convert_quantized_bloom.py @@ -170,7 +170,12 @@ def bytes_to_unicode(): f.write(struct.pack("i", 0)) f.write(struct.pack("i", 0)) f.write(struct.pack("i", 0)) +f.write(struct.pack("i", 0)) # n_experts +f.write(struct.pack("i", 0)) # n_expert_used f.write(struct.pack("f", 1e-6)) # rms norm eps +f.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled +f.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings +f.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) f.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) f.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) diff --git a/neural_speed/convert/convert_quantized_gptj.py b/neural_speed/convert/convert_quantized_gptj.py index 44a3a5d59..5ce8f863e 100644 --- a/neural_speed/convert/convert_quantized_gptj.py +++ b/neural_speed/convert/convert_quantized_gptj.py @@ -142,11 +142,12 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used fout.write(struct.pack("f", hparams.get( "rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor - fout.write(struct.pack("f", 
0.0)) # config.json "rope_scaling.factor", not enabled fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) diff --git a/neural_speed/convert/convert_quantized_llama.py b/neural_speed/convert/convert_quantized_llama.py index dc51b37b4..b3bbfd9c5 100644 --- a/neural_speed/convert/convert_quantized_llama.py +++ b/neural_speed/convert/convert_quantized_llama.py @@ -146,6 +146,8 @@ def main(args_in: Optional[List[str]] = None) -> None: f.write(struct.pack("i", 0)) f.write(struct.pack("i", ffn_hidden_size)) f.write(struct.pack("i", 0)) + f.write(struct.pack("i", 0)) # n_experts + f.write(struct.pack("i", 0)) # n_expert_used f.write(struct.pack("f", config["rms_norm_eps"])) f.write(struct.pack("f", config["rope_theta"] if "rope_theta" in config else 10000)) diff --git a/neural_speed/convert/convert_quantized_mistral.py b/neural_speed/convert/convert_quantized_mistral.py index 82403dc94..8e154a8ca 100644 --- a/neural_speed/convert/convert_quantized_mistral.py +++ b/neural_speed/convert/convert_quantized_mistral.py @@ -82,10 +82,15 @@ def main(args_in: Optional[List[str]] = None) -> None: f.write(struct.pack("i", 0)) f.write(struct.pack("i", ffn_hidden_size)) f.write(struct.pack("i", 0)) + f.write(struct.pack("i", 0)) # n_experts + f.write(struct.pack("i", 0)) # n_expert_used f.write(struct.pack("f", config["rms_norm_eps"])) f.write(struct.pack("f", config["rope_theta"] if "rope_theta" in config else 10000)) f.write(struct.pack("f", rope_scale)) + f.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled + f.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings + f.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json # but bos_token_id = 1 in llama.cpp diff --git a/neural_speed/convert/convert_qwen.py b/neural_speed/convert/convert_qwen.py index 900d8cfe8..704aa9ee6 100644 --- a/neural_speed/convert/convert_qwen.py +++ b/neural_speed/convert/convert_qwen.py @@ -112,6 +112,8 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", hparams["intermediate_size"])) fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor diff --git a/neural_speed/convert/convert_starcoder.py b/neural_speed/convert/convert_starcoder.py index 11dec1f70..f176ef8d9 100644 --- a/neural_speed/convert/convert_starcoder.py +++ b/neural_speed/convert/convert_starcoder.py @@ -110,6 +110,8 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", 0)) + fout.write(struct.pack("i", 0)) # n_experts + fout.write(struct.pack("i", 0)) # n_expert_used fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor diff --git a/neural_speed/core/layers/Ops.h b/neural_speed/core/layers/Ops.h index dc3e28a8d..b0f441b42 100644 --- a/neural_speed/core/layers/Ops.h +++ b/neural_speed/core/layers/Ops.h @@ -48,6 +48,8 @@ enum ne_op { NE_OP_MUL_MAT, 
NE_OP_MUL_MAT_BIAS, + NE_OP_MUL_MAT_ID, + NE_OP_MUL_ID_FFN_SILU, NE_OP_SCALE, NE_OP_SET, NE_OP_CPY, @@ -88,6 +90,7 @@ enum ne_op { NE_OP_DUMP_TENSOR, NE_OP_DEBUG, NE_OP_CONV_1D, + NE_OP_ARGSORT, NE_OP_COUNT, }; diff --git a/neural_speed/core/layers/argsort.cpp b/neural_speed/core/layers/argsort.cpp new file mode 100644 index 000000000..958b4a8d1 --- /dev/null +++ b/neural_speed/core/layers/argsort.cpp @@ -0,0 +1,69 @@ +// Copyright (c) 2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "argsort.h" +#include +#include + +static void ne_compute_forward_argsort_f32(const struct ne_compute_params* params, const struct ne_tensor* src0, + struct ne_tensor* dst) { + if (params->type == NE_TASK_INIT || params->type == NE_TASK_FINALIZE) { + return; + } + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const size_t nb00 = src0->nb[0]; + + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + const int ith = params->ith; + const int nth = params->nth; + + const int64_t nr = src0->ne[1] * src0->ne[2] * src0->ne[3]; + + for (int64_t i = ith; i < nr; i += nth) { + int32_t* dst_data = (int32_t*)((char*)dst->data + i * nb1); + const float* src_data = (float*)((char*)src0->data + i * nb01); + + for (int64_t j = 0; j < ne0; j++) { + dst_data[j] = j; + } + std::sort(dst_data, dst_data + ne0, [src_data](int pos1, int pos2) { return (src_data[pos1] > src_data[pos2]); }); + } +} +void ne_compute_forward_argsort(const struct ne_compute_params* params, const struct ne_tensor* src0, + struct ne_tensor* dst) { + switch (src0->type) { + case NE_TYPE_F32: { + ne_compute_forward_argsort_f32(params, src0, dst); + } break; + default: { + NE_ASSERT(false); + } break; + } +} diff --git a/neural_speed/core/layers/argsort.h b/neural_speed/core/layers/argsort.h new file mode 100644 index 000000000..a9c7c2058 --- /dev/null +++ b/neural_speed/core/layers/argsort.h @@ -0,0 +1,28 @@ +// Copyright (c) 2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
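Editorial note on the new argsort kernel above: it fills dst with the row's indices and sorts them by descending source value, and ne_top_k (added later in ne_layers.c) then views only the first k sorted indices. A minimal Python sketch of the same routing primitive, with illustrative names that are not part of the library:

# Sketch of the descending argsort + top-k primitive used for expert routing.
# Mirrors the std::sort comparator in argsort.cpp (higher score sorts first).
def argsort_desc(row):
    return sorted(range(len(row)), key=lambda j: row[j], reverse=True)

def top_k(row, k):
    assert len(row) >= k
    return argsort_desc(row)[:k]

# Example: one token's router scores over 4 hypothetical experts.
print(top_k([0.1, 0.7, 0.05, 0.15], 2))  # -> [1, 3]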
+ +#pragma once +#include "core/ne.h" +#include "core/data_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void ne_compute_forward_argsort(const struct ne_compute_params* params, const struct ne_tensor* src0, + struct ne_tensor* dst); + +#ifdef __cplusplus +} +#endif diff --git a/neural_speed/core/layers/layers.h b/neural_speed/core/layers/layers.h index ba7ba0816..34f7620b8 100644 --- a/neural_speed/core/layers/layers.h +++ b/neural_speed/core/layers/layers.h @@ -16,3 +16,4 @@ #include "conv.h" #include "memory.h" +#include "argsort.h" diff --git a/neural_speed/core/layers/mha_dense.cpp b/neural_speed/core/layers/mha_dense.cpp index 0fcd39031..af2953514 100644 --- a/neural_speed/core/layers/mha_dense.cpp +++ b/neural_speed/core/layers/mha_dense.cpp @@ -72,7 +72,7 @@ bool bestla_reordered_attn_fp32_support(const attn_shape_t* params) { // TODO(Yi): check K V's layout if (_cd->AMX_BF16()) return true; #endif - return _cd->AVX512F() || _cd->AVX2(); // use avx2 and f16c on avx2 platforms + return !_cd->AVX512F() || _cd->AVX2(); // use avx2 and f16c on avx2 platforms } // kv cache sizes in bytes per layer per batch per beam for; void bestla_reordered_attn_fp32_batch_kv_info(const kv_shape_t* params, kv_cache_info_t* out) { diff --git a/neural_speed/core/ne.h b/neural_speed/core/ne.h index 4790a297e..33bf4f0b6 100644 --- a/neural_speed/core/ne.h +++ b/neural_speed/core/ne.h @@ -43,7 +43,7 @@ #define NE_MAX_NODES 16384 #define NE_MAX_PARAMS 256 #define NE_MAX_CONTEXTS 64 -#define NE_MAX_OPT 4 +#define NE_MAX_OPT 36 #define NE_DEFAULT_N_THREADS 4 #define NE_MAX_OP_PARAMS 32 diff --git a/neural_speed/core/ne_layers.c b/neural_speed/core/ne_layers.c index 38c76a252..791c94076 100644 --- a/neural_speed/core/ne_layers.c +++ b/neural_speed/core/ne_layers.c @@ -402,9 +402,10 @@ static const char* NE_OP_LABEL[NE_OP_COUNT] = { "NORM", "RMS_NORM", "RMS_NORM_BACK", - + "ARGSORT", "MUL_MAT", "MUL_MAT_WITH_BIAS", + "MUL_MAT_ID", "SCALE", "SET", "CPY", @@ -431,10 +432,10 @@ static const char* NE_OP_LABEL[NE_OP_COUNT] = { "FFN_SILU", "FFN_GeLU", "FFN_ADD_GeLU", + "FFN_ID_SILU", "FLASH_ATTN", "FLASH_ATTN_KV_UPDATE", "FLASH_FF", - "MAP_UNARY", "MAP_BINARY", "SPLIT", @@ -445,7 +446,7 @@ static const char* NE_OP_LABEL[NE_OP_COUNT] = { "DEBUG", }; -static_assert(NE_OP_COUNT == 64, "NE_OP_COUNT != 64"); +static_assert(NE_OP_COUNT == 67, "NE_OP_COUNT != 67"); static const char* NE_OP_SYMBOL[NE_OP_COUNT] = { "none", @@ -479,6 +480,7 @@ static const char* NE_OP_SYMBOL[NE_OP_COUNT] = { "X*Y", "X*Y+Z", "x*v", + "matmul_id", "y-\\>view(x)", "x-\\>y", "cont(x)", @@ -502,16 +504,17 @@ static const char* NE_OP_SYMBOL[NE_OP_COUNT] = { "QKV(x)", "ffn_silu(x)", + "ffn_id_silu(x)", "ffn_gelu(x)", "ffn_gelu_with_bias(x)", "flash_attn(x)", "flash_attn_kv_update(x)", "flash_ff(x)", - "f(x)", "f(x,y)", "conv_1d(x)", "debug(x)", + "argsort(x)", }; static_assert(sizeof(struct ne_object) % NE_MEM_ALIGN == 0, "ne_object size must be a multiple of NE_MEM_ALIGN"); @@ -2178,6 +2181,109 @@ struct ne_tensor* ne_mul_mat_with_bias(struct ne_context* ctx, struct ne_tensor* return result; } +struct ne_tensor* ne_mul_mat_id(struct ne_context* ctx, struct ne_tensor* const as[], int n_as, struct ne_tensor* ids, + int id, struct ne_tensor* b) { + NE_ASSERT(ids->type == NE_TYPE_I32); + NE_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); + NE_ASSERT(ids->ne[1] == b->ne[1]); + NE_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]); + NE_ASSERT(n_as > 0 && n_as <= 8); + NE_ASSERT(id >= 0 && id < ids->ne[0]); + + bool is_node = false; + + if 
(as[0]->grad || b->grad) { + is_node = true; + } + + const int64_t ne[4] = {as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3]}; + struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne, NE_SIZE_CALC); + int params[] = {id, n_as}; + ne_set_op_params(result, ¶ms, sizeof(params)); + result->op = NE_OP_MUL_MAT_ID; + result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL; + result->src0 = ids; + result->src1 = b; + + for (int i = 0; i < n_as; i++) { + struct ne_tensor* a = as[i]; + NE_ASSERT(ne_are_same_shape(as[0], a)); + NE_ASSERT(ne_can_mul_mat(a, b)); + NE_ASSERT(!ne_is_transposed(a)); + result->opt[i] = a; + } + + return result; +} + +struct ne_tensor* ne_mul_id_ffn_silu(struct ne_context* ctx, struct ne_tensor* const down[], + struct ne_tensor* const gate[], struct ne_tensor* const up[], int n_as, + struct ne_tensor* ids, int id, struct ne_tensor* src) { + struct ne_tensor* w1 = gate[0]; + struct ne_tensor* w2 = down[0]; + struct ne_tensor* w3 = up[0]; + NE_ASSERT(ids->type == NE_TYPE_I32); + NE_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); + NE_ASSERT(ids->ne[1] == src->ne[1]); + NE_ASSERT(ids->ne[2] == src->ne[2] && ids->ne[3] == src->ne[3]); + NE_ASSERT(n_as > 0 && n_as <= 8); + NE_ASSERT(id >= 0 && id < ids->ne[0]); + NE_ASSERT(ne_are_same_shape(w1, w3)); + NE_ASSERT(w2->ne[0] == w1->ne[1]); + + bool is_node = false; + + if (down[0]->grad || src->grad) { + is_node = true; + } + const int64_t ne[4] = {w2->ne[1], src->ne[1], src->ne[2], src->ne[3]}; + struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, ne, NE_SIZE_CALC); + const int64_t tne[4] = {w1->ne[1], src->ne[1], src->ne[2], src->ne[3]}; + struct ne_tensor* tmp = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC); + struct ne_tensor* tmp1 = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC); + int params[] = {id, n_as}; + ne_set_op_params(result, ¶ms, sizeof(params)); + result->op = NE_OP_MUL_ID_FFN_SILU; + result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL; + result->src0 = src; + result->src1 = ids; + for (int i = 0; i < n_as; i++) { + struct ne_tensor* a = gate[i]; + struct ne_tensor* b = down[i]; + struct ne_tensor* c = up[i]; + result->opt[i] = a; + result->opt[i + 8] = b; + result->opt[i + 16] = c; + } + result->opt[24] = tmp; + result->opt[25] = tmp1; + // struct ne_tensor *result = ne_ffn_silu(ctx,gate[row_id], down[row_id],up[row_id], b); + return result; +} +struct ne_tensor* ne_argsort(struct ne_context* ctx, struct ne_tensor* a) { + bool is_node = false; + + struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_I32, NE_MAX_DIMS, a->ne, NE_SIZE_CALC); + + result->op = NE_OP_ARGSORT; + result->grad = is_node ? 
ne_dup_tensor(ctx, result) : NULL; + result->src0 = a; + + return result; +} + +// ne_top_k + +struct ne_tensor* ne_top_k(struct ne_context* ctx, struct ne_tensor* a, int k) { + NE_ASSERT(a->ne[0] >= k); + + struct ne_tensor* result = ne_argsort(ctx, a); + + result = ne_view_4d(ctx, result, k, result->ne[1], result->ne[2], result->ne[3], result->nb[1], result->nb[2], + result->nb[3], 0); + + return result; +} // ne_mul_qkv struct ne_tensor* ne_mul_qkv(struct ne_context* ctx, struct ne_tensor* qw, struct ne_tensor* kw, struct ne_tensor* vw, @@ -2754,17 +2860,22 @@ struct ne_tensor* ne_transpose(struct ne_context* ctx, struct ne_tensor* a) { // ne_get_rows struct ne_tensor* ne_get_rows(struct ne_context* ctx, struct ne_tensor* a, struct ne_tensor* b) { - NE_ASSERT(ne_is_matrix(a) && ne_is_vector(b) && b->type == NE_TYPE_I32); + NE_ASSERT(a->ne[2] == b->ne[1]); + NE_ASSERT(b->ne[3] == 1); + NE_ASSERT(b->type == NE_TYPE_I32); bool is_node = false; if (a->grad || b->grad) { is_node = true; } - + enum ne_type type = NE_TYPE_F32; + if (a->type == NE_TYPE_I32) { + type = a->type; + } // TODO: implement non F32 return // struct ne_tensor * result = ne_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); - struct ne_tensor* result = ne_new_tensor_2d(ctx, NE_TYPE_F32, a->ne[0], b->ne[0], NE_SIZE_CALC); + struct ne_tensor* result = ne_new_tensor_4d(ctx, NE_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2], NE_SIZE_CALC); result->op = NE_OP_GET_ROWS; result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL; @@ -5265,7 +5376,7 @@ static void ne_compute_forward_mul(const struct ne_compute_params* params, const static void ne_compute_forward_div_f32(const struct ne_compute_params* params, const struct ne_tensor* src0, const struct ne_tensor* src1, struct ne_tensor* dst) { - assert(params->ith == 0); + // assert(params->ith == 0); assert(ne_are_same_shape(src0, src1) && ne_are_same_shape(src0, dst)); if (params->type == NE_TASK_INIT || params->type == NE_TASK_FINALIZE) { @@ -5494,7 +5605,7 @@ static void ne_compute_forward_sum(const struct ne_compute_params* params, const static void ne_compute_forward_sum_rows_f32(const struct ne_compute_params* params, const struct ne_tensor* src0, struct ne_tensor* dst) { - NE_ASSERT(params->ith == 0); + // NE_ASSERT(params->ith == 0); if (params->type == NE_TASK_INIT || params->type == NE_TASK_FINALIZE) { return; @@ -6435,7 +6546,7 @@ static void ne_compute_forward_mul_mat_f32(const struct ne_compute_params* param const int64_t ir0 = (ir1 / ne11) % (ne02 * ne03); const int64_t i03 = (ir0 / (ne02)); - // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2. + // Hack for "Falcon multi-query-attention key stutter" / alternative to ne_repeat2. // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470: const int64_t i02 = (i12 / (ne12 / ne02)); // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon) @@ -6558,7 +6669,7 @@ static void ne_compute_forward_mul_mat_f16_f32(const struct ne_compute_params* p const int64_t ir0 = (ir1 / ne11) % (ne02 * ne03); const int64_t i03 = (ir0 / (ne02)); - // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2. + // Hack for "Falcon multi-query-attention key stutter" / alternative to ne_repeat2. 
// See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470: const int64_t i02 = (i12 / (ne12 / ne02)); // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon) @@ -6692,7 +6803,7 @@ static void ne_compute_forward_mul_mat_q_f32(const struct ne_compute_params* par const int64_t ir0 = (ir1 / ne11) % (ne02 * ne03); const int64_t i03 = (ir0 / (ne02)); - // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2. + // Hack for "Falcon multi-query-attention key stutter" / alternative to ne_repeat2. // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470: const int64_t i02 = (i12 / (ne12 / ne02)); // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon) @@ -6831,6 +6942,606 @@ static void ne_compute_forward_mul_mat(const struct ne_compute_params* params, c } } +static void ne_compute_forward_mul_mat_id_q_f32(const struct ne_compute_params* params, const struct ne_tensor* ids, + const struct ne_tensor* src1, struct ne_tensor* dst) { + int64_t t0 = ne_perf_time_us(); + UNUSED(t0); + const struct ne_tensor* src0 = dst->opt[0]; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const size_t nb00 = src0->nb[0]; + + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const size_t nb10 = src1->nb[0]; + + const size_t nb11 = src1->nb[1]; + UNUSED(nb11); + const size_t nb12 = src1->nb[2]; + UNUSED(nb12); + const size_t nb13 = src1->nb[3]; + UNUSED(nb13); + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ne_type type = src0->type; + quantize_row_q_t const quantize_row_q_dot = quantize_fns[type].quantize_row_q_dot; + vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q; + enum ne_type const vec_dot_type = quantize_fns[type].vec_dot_type; + + NE_ASSERT(ne0 == ne01); + NE_ASSERT(ne1 == ne11); + NE_ASSERT(ne2 == ne12); + NE_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + NE_ASSERT(nb00 == (int)NE_TYPE_SIZE[type]); + NE_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + NE_ASSERT(nb0 == sizeof(float)); + NE_ASSERT(nb0 <= nb1); + NE_ASSERT(nb1 <= nb2); + NE_ASSERT(nb2 <= nb3); + const int id = dst->op_params[0]; + const int n_as = dst->op_params[1]; + // char * wdata_src1_end = (char *)params->wdata; + // int64_t wdata_src1_end = 0; + +#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + if (params->type == NE_TASK_INIT) { + if (ith != 0) { + return; + } + char* wdata = params->wdata; + const size_t row_size = ne10 * NE_TYPE_SIZE[vec_dot_type] / NE_BLCK_SIZE[vec_dot_type]; + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + quantize_row_q_dot((float*)((char*)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11), (void*)wdata, ne10); + wdata += 
row_size; + } + } + } + + return; + } + + if (params->type == NE_TASK_FINALIZE) { + return; + } + int64_t matrix_row_counts[100]; // [n_as] + int64_t matrix_rows[30000]; // [n_as][ne11] + memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); + memset(matrix_rows, -1, 30000 * sizeof(int64_t)); + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id = *(const int32_t*)((const char*)ids->data + i01 * ids->nb[1] + id * ids->nb[0]); + NE_ASSERT(row_id >= 0 && row_id < n_as); + mmid_matrix_row(row_id, matrix_row_counts[row_id]) = i01; + matrix_row_counts[row_id] += 1; + } + for (int cur_a = 0; cur_a < n_as; ++cur_a) { + const int64_t cne1 = matrix_row_counts[cur_a]; + if (cne1 == 0) { + continue; + } + const struct ne_tensor* src0_cur = dst->opt[cur_a]; + // parallelize by src0 rows + const int64_t dr = (ne01 + nth - 1) / nth; + + const int64_t ir10 = dr * ith; + const int64_t ir11 = MIN(ir10 + dr, ne01); + + // src1 rows + const int64_t nr1 = cne1 * ne12 * ne13; + + void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ne10 * NE_TYPE_SIZE[vec_dot_type] / NE_BLCK_SIZE[vec_dot_type]; + + for (int64_t ir1 = 0; ir1 < nr1; ++ir1) { + const int64_t i13 = (ir1 / (ne12 * cne1)); + const int64_t i12 = (ir1 - i13 * ne12 * cne1) / cne1; + const int64_t _i11 = (ir1 - i13 * ne12 * cne1 - i12 * cne1); + const int64_t i11 = mmid_matrix_row(cur_a, _i11); + if (i11 == -1) { + continue; + } + + const int64_t ir0 = (ir1 / ne11) % (ne02 * ne03); + const int64_t i03 = (ir0 / (ne02)); + // Hack for "Falcon multi-query-attention key stutter" / alternative to ne_repeat2. + // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470: + const int64_t i02 = (i12 / (ne12 / ne02)); + // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon) + // const int64_t i02 = (ir0 - i03*ne02); + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + char* src0_row = (char*)src0_cur->data + (0 + i02 * nb02 + i03 * nb03); + char* src1_col = (char*)wdata + (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size; + + float* dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); + + for (int64_t ir = ir10; ir < ir11; ++ir) { + vec_dot_q(ne00, &dst_col[ir], src0_row + ir * nb01, src1_col); + } + } + } +} + +static void ne_compute_forward_mul_mat_id_f32(const struct ne_compute_params* params, const struct ne_tensor* ids, + const struct ne_tensor* src1, struct ne_tensor* dst) { + int64_t t0 = ne_perf_time_us(); + UNUSED(t0); + const struct ne_tensor* src0 = dst->opt[0]; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne11 = src1->ne[1]; + + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const size_t nb00 = src0->nb[0]; + + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const size_t nb10 = src1->nb[0]; + + const size_t nb11 = src1->nb[1]; + UNUSED(nb11); + const size_t nb12 = src1->nb[2]; + UNUSED(nb12); + const size_t nb13 = src1->nb[3]; + UNUSED(nb13); + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; 
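Editorial note: the mul_mat_id kernels in this hunk all follow one routing pattern. For the routing slot id stored in op_params, each token row is bucketed by the expert selected for that slot (matrix_rows / matrix_row_counts), and each expert's weight matrix is then applied only to the rows routed to it. A rough Python sketch of the bucketing step, using a hypothetical helper rather than the C implementation:

# Group token rows by the expert chosen for a given routing slot, so each
# expert matrix multiplies only its own subset of rows.
def bucket_rows_by_expert(selected_experts, slot, n_expert):
    buckets = {e: [] for e in range(n_expert)}
    for token_idx, experts in enumerate(selected_experts):
        buckets[experts[slot]].append(token_idx)
    return buckets

# Example: three tokens, two routing slots each, four hypothetical experts.
print(bucket_rows_by_expert([[1, 3], [1, 0], [2, 3]], slot=0, n_expert=4))
# -> {0: [], 1: [0, 1], 2: [2], 3: []}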
+ + NE_ASSERT(ne0 == ne01); + NE_ASSERT(ne1 == ne11); + NE_ASSERT(ne2 == ne12); + NE_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + NE_ASSERT(nb00 == sizeof(float)); + NE_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + NE_ASSERT(nb0 == sizeof(float)); + NE_ASSERT(nb0 <= nb1); + NE_ASSERT(nb1 <= nb2); + NE_ASSERT(nb2 <= nb3); + const int id = dst->op_params[0]; + const int n_as = dst->op_params[1]; + // char * wdata_src1_end = (char *)params->wdata; + // int64_t wdata_src1_end = 0; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + if (params->type == NE_TASK_INIT) { + return; + } + + if (params->type == NE_TASK_FINALIZE) { + return; + } + int64_t matrix_row_counts[100]; // [n_as] + int64_t matrix_rows[30000]; // [n_as][ne11] +#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] + memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); + memset(matrix_rows, -1, 30000 * sizeof(int64_t)); + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id = *(const int32_t*)((const char*)ids->data + i01 * ids->nb[1] + id * ids->nb[0]); + NE_ASSERT(row_id >= 0 && row_id < n_as); + mmid_matrix_row(row_id, matrix_row_counts[row_id]) = i01; + matrix_row_counts[row_id] += 1; + } + for (int cur_a = 0; cur_a < n_as; ++cur_a) { + const int64_t cne1 = matrix_row_counts[cur_a]; + if (cne1 == 0) { + continue; + } + const struct ne_tensor* src0_cur = dst->opt[cur_a]; + // parallelize by src0 rows + const int64_t dr = (ne01 + nth - 1) / nth; + + const int64_t ir10 = dr * ith; + const int64_t ir11 = MIN(ir10 + dr, ne01); + + // src1 rows + const int64_t nr1 = cne1 * ne12 * ne13; + + for (int64_t ir1 = 0; ir1 < nr1; ++ir1) { + const int64_t i13 = (ir1 / (ne12 * cne1)); + const int64_t i12 = (ir1 - i13 * ne12 * cne1) / cne1; + const int64_t _i11 = (ir1 - i13 * ne12 * cne1 - i12 * cne1); + const int64_t i11 = mmid_matrix_row(cur_a, _i11); + if (i11 == -1) { + continue; + } + + const int64_t ir0 = (ir1 / ne11) % (ne02 * ne03); + const int64_t i03 = (ir0 / (ne02)); + // Hack for "Falcon multi-query-attention key stutter" / alternative to ne_repeat2. 
+ // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470: + const int64_t i02 = (i12 / (ne12 / ne02)); + // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon) + // const int64_t i02 = (ir0 - i03*ne02); + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + char* src0_row = (char*)src0_cur->data + (0 + i02 * nb02 + i03 * nb03); + char* src1_col = (char*)src1->data + (i11 * nb11 + i12 * nb12 + i13 * nb13); + + float* dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); + + for (int64_t ir = ir10; ir < ir11; ++ir) { + ne_vec_dot_f32(ne00, &dst_col[ir], (float*)(src0_row + ir * nb01), (float*)src1_col); + } + } + } +} + +static void ne_compute_forward_mul_mat_id_f16_f32(const struct ne_compute_params* params, const struct ne_tensor* ids, + const struct ne_tensor* src1, struct ne_tensor* dst) { + int64_t t0 = ne_perf_time_us(); + UNUSED(t0); + const struct ne_tensor* src0 = dst->opt[0]; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const size_t nb00 = src0->nb[0]; + + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const size_t nb10 = src1->nb[0]; + + const size_t nb11 = src1->nb[1]; + UNUSED(nb11); + const size_t nb12 = src1->nb[2]; + UNUSED(nb12); + const size_t nb13 = src1->nb[3]; + UNUSED(nb13); + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + NE_ASSERT(ne0 == ne01); + NE_ASSERT(ne1 == ne11); + NE_ASSERT(ne2 == ne12); + NE_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + NE_ASSERT(nb00 == sizeof(ne_fp16_t)); + + // dst cannot be transposed or permuted + NE_ASSERT(nb0 == sizeof(float)); + NE_ASSERT(nb0 <= nb1); + NE_ASSERT(nb1 <= nb2); + NE_ASSERT(nb2 <= nb3); + const int id = dst->op_params[0]; + const int n_as = dst->op_params[1]; + // char * wdata_src1_end = (char *)params->wdata; + // int64_t wdata_src1_end = 0; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + if (params->type == NE_TASK_INIT) { + ne_fp16_t* const wdata = params->wdata; + + size_t id = 0; + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + for (int64_t i10 = 0; i10 < ne10; ++i10) { + wdata[id++] = + NE_FP32_TO_FP16(*(float*)((char*)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11 + i10 * nb10)); + } + } + } + } + + NE_ASSERT(id * sizeof(ne_fp16_t) <= params->wsize); + + return; + } + + if (params->type == NE_TASK_FINALIZE) { + return; + } + int64_t matrix_row_counts[100]; // [n_as] + int64_t matrix_rows[30000]; // [n_as][ne11] +#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] + memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); + memset(matrix_rows, -1, 30000 * sizeof(int64_t)); + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id = *(const int32_t*)((const char*)ids->data + i01 * ids->nb[1] + id * ids->nb[0]); + NE_ASSERT(row_id >= 0 && 
row_id < n_as); + mmid_matrix_row(row_id, matrix_row_counts[row_id]) = i01; + matrix_row_counts[row_id] += 1; + } + for (int cur_a = 0; cur_a < n_as; ++cur_a) { + const int64_t cne1 = matrix_row_counts[cur_a]; + if (cne1 == 0) { + continue; + } + assert(nb10 / 2 == sizeof(ne_fp16_t)); + const struct ne_tensor* src0_cur = dst->opt[cur_a]; + // parallelize by src0 rows + const int64_t dr = (ne01 + nth - 1) / nth; + + const int64_t ir10 = dr * ith; + const int64_t ir11 = MIN(ir10 + dr, ne01); + + // src1 rows + const int64_t nr1 = cne1 * ne12 * ne13; + void* wdata = params->wdata; + const size_t row_size = ne10 * NE_TYPE_SIZE[NE_TYPE_F16]; + + for (int64_t ir1 = 0; ir1 < nr1; ++ir1) { + const int64_t i13 = (ir1 / (ne12 * cne1)); + const int64_t i12 = (ir1 - i13 * ne12 * cne1) / cne1; + const int64_t _i11 = (ir1 - i13 * ne12 * cne1 - i12 * cne1); + const int64_t i11 = mmid_matrix_row(cur_a, _i11); + if (i11 == -1) { + continue; + } + + const int64_t ir0 = (ir1 / ne11) % (ne02 * ne03); + const int64_t i03 = (ir0 / (ne02)); + // Hack for "Falcon multi-query-attention key stutter" / alternative to ne_repeat2. + // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470: + const int64_t i02 = (i12 / (ne12 / ne02)); + // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon) + // const int64_t i02 = (ir0 - i03*ne02); + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + char* src0_row = (char*)src0_cur->data + (0 + i02 * nb02 + i03 * nb03); + char* src1_col = (char*)wdata + (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size; + + float* dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); + + for (int64_t ir = ir10; ir < ir11; ++ir) { + ne_vec_dot_f16(ne00, &dst_col[ir], (ne_fp16_t*)(src0_row + ir * nb01), (ne_fp16_t*)src1_col); + } + } + } +} + +static void ne_compute_forward_mul_mat_id_q_f32_bestla(const struct ne_compute_params* params, + const struct ne_tensor* ids, const struct ne_tensor* src1, + struct ne_tensor* dst) { + int64_t t0 = ne_perf_time_us(); + UNUSED(t0); + const struct ne_tensor* src0 = dst->opt[0]; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const size_t nb00 = src0->nb[0]; + + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const size_t nb10 = src1->nb[0]; + + const size_t nb11 = src1->nb[1]; + UNUSED(nb11); + const size_t nb12 = src1->nb[2]; + UNUSED(nb12); + const size_t nb13 = src1->nb[3]; + UNUSED(nb13); + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + NE_ASSERT(ne0 == ne01); + NE_ASSERT(ne1 == ne11); + NE_ASSERT(ne2 == ne12); + NE_ASSERT(ne3 == ne13); + + const enum ne_type type = src0->type; + quantize_row_q_t const quantize_row_q_dot = quantize_fns[type].quantize_row_q_dot; + vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q; + enum ne_type const vec_dot_type = quantize_fns[type].vec_dot_type; + // we don't support permuted src0 or src1 + NE_ASSERT(nb00 == 
(int)NE_TYPE_SIZE[type]); + NE_ASSERT(nb10 == sizeof(float)); + // dst cannot be transposed or permuted + NE_ASSERT(nb0 == sizeof(float)); + NE_ASSERT(nb0 <= nb1); + NE_ASSERT(nb1 <= nb2); + NE_ASSERT(nb2 <= nb3); + const int id = dst->op_params[0]; + const int n_as = dst->op_params[1]; + // char * wdata_src1_end = (char *)params->wdata; + // int64_t wdata_src1_end = 0; + int64_t matrix_row_counts[100]; // [n_as] + int64_t matrix_rows[30000]; // [n_as][ne11] +#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + if (params->type == NE_TASK_INIT) { + memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); + memset(matrix_rows, -1, 30000 * sizeof(int64_t)); + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id = *(const int32_t*)((const char*)ids->data + i01 * ids->nb[1] + id * ids->nb[0]); + NE_ASSERT(row_id >= 0 && row_id < n_as); + mmid_matrix_row(row_id, matrix_row_counts[row_id]) = i01; + matrix_row_counts[row_id] += 1; + } + + return; + } + + if (params->type == NE_TASK_FINALIZE) { + return; + } + for (int cur_a = 0; cur_a < n_as; ++cur_a) { + const int64_t cne1 = matrix_row_counts[cur_a]; + if (cne1 == 0) { + continue; + } + // assert(nb10 / 2 == sizeof(ne_fp16_t)); + const struct ne_tensor* src0_cur = dst->opt[cur_a]; + // parallelize by src0 rows + + // src1 rows + const int64_t nr1 = cne1 * ne12 * ne13; + const size_t row_size = ne10 * NE_TYPE_SIZE[src1->type]; + for (int64_t ir1 = 0; ir1 < nr1; ++ir1) { + const int64_t i13 = (ir1 / (ne12 * cne1)); + const int64_t i12 = (ir1 - i13 * ne12 * cne1) / cne1; + const int64_t _i11 = (ir1 - i13 * ne12 * cne1 - i12 * cne1); + const int64_t i11 = mmid_matrix_row(cur_a, _i11); + if (i11 == -1) { + continue; + } + + const int64_t ir0 = (ir1 / ne11) % (ne02 * ne03); + const int64_t i03 = (ir0 / (ne02)); + // Hack for "Falcon multi-query-attention key stutter" / alternative to ne_repeat2. 
+ // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470: + const int64_t i02 = (i12 / (ne12 / ne02)); + // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon) + // const int64_t i02 = (ir0 - i03*ne02); + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + char* src0_row = (char*)src0_cur->data; + char* src1_col = (char*)src1->data + (i11 * nb11 + i12 * nb12 + i13 * nb13); + + float* dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); + + // parallelize by src0 rows + + bestla_f32f32_forward((float*)src1_col, (float*)src0_row, dst_col, 1, ne0, ne10, nb11 / ne_element_size(src1), + nb1 / ne_element_size(dst), params->wdata); + } + } +} +static void ne_compute_forward_mul_mat_id(const struct ne_compute_params* params, const struct ne_tensor* ids, + const struct ne_tensor* src1, struct ne_tensor* dst) { + switch (dst->opt[0]->type) { + case NE_TYPE_Q4_0: + case NE_TYPE_Q4_1: + case NE_TYPE_Q5_0: + case NE_TYPE_Q5_1: + case NE_TYPE_Q8_0: + case NE_TYPE_Q6_K: + case NE_TYPE_Q8_1: { + ne_compute_forward_mul_mat_id_q_f32(params, ids, src1, dst); + } break; + case NE_TYPE_BTLA: { + ne_compute_forward_mul_mat_id_q_f32_bestla(params, ids, src1, dst); + } break; + case NE_TYPE_F16: { + ne_compute_forward_mul_mat_id_f16_f32(params, ids, src1, dst); + } break; + case NE_TYPE_F32: { + ne_compute_forward_mul_mat_id_f32(params, ids, src1, dst); + } break; + default: { + NE_ASSERT(false); + } break; + } +} + static void ne_compute_forward_mul_mat_bias_q_f32_bestla(const struct ne_compute_params* params, const struct ne_tensor* src0, const struct ne_tensor* src1, const struct ne_tensor* bias, struct ne_tensor* dst) { @@ -6939,7 +7650,29 @@ static void ne_compute_forward_mul_qkv(const struct ne_compute_params* params, c bestla_fusion_QKV_f32f32_forward((float*)src->data, qw->data, kw->data, vw->data, (float*)dst->data, m, n, k, k, n, params->wdata); } +static void ne_compute_forward_ffn_id_silu(const struct ne_compute_params* params, const struct ne_tensor* src, + const struct ne_tensor* ids, const struct ne_tensor* tmp, + struct ne_tensor* tmp1, struct ne_tensor* dst) { + const int id = dst->op_params[0]; + if (params->type == NE_TASK_INIT) { + return; + } + + if (params->type == NE_TASK_FINALIZE) { + return; + } + const int32_t row_id = *(const int32_t*)((const char*)ids->data + id * ids->nb[0]); + const struct ne_tensor* w1 = dst->opt[row_id]; + const struct ne_tensor* w2 = dst->opt[row_id + 8]; + const struct ne_tensor* w3 = dst->opt[row_id + 16]; + const int fin = src->ne[0]; + const int fout = dst->ne[0]; + const int fmid = w1->ne[1]; + const int seq = dst->ne[1]; + bestla_fusion_FFN_SiLu_f32f32_forward((float*)src->data, w1->data, w2->data, w3->data, (float*)tmp->data, + (float*)tmp1->data, (float*)dst->data, seq, fin, fmid, fout, params->wdata); +} static void ne_compute_forward_ffn_silu(const struct ne_compute_params* params, const struct ne_tensor* src, const struct ne_tensor* w1, const struct ne_tensor* w2, struct ne_tensor* w3, const struct ne_tensor* tmp, struct ne_tensor* tmp1, struct ne_tensor* dst) { @@ -7212,12 +7945,37 @@ static void ne_compute_forward_get_rows_q(const struct ne_compute_params* params assert(dst->ne[0] == nc); assert(dst->ne[1] == nr); assert(src0->nb[0] == NE_TYPE_SIZE[type]); + assert(src0->ne[2] == src1->ne[1]); + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + const int 
ne10 = src1->ne[0]; + const int ne11 = src1->ne[1]; + const int ne12 = src1->ne[2]; + const int ne13 = src1->ne[3]; + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; - for (int i = 0; i < nr; ++i) { - const int r = ((int32_t*)src1->data)[i]; + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + for (int64_t i10 = 0; i10 < ne10; ++i10) { + const int64_t i01 = *(int32_t*)((char*)src1->data + i10 * nb10 + i11 * nb11 + i12 * nb12); - dequantize_row_q((const void*)((char*)src0->data + r * src0->nb[1]), (float*)((char*)dst->data + i * dst->nb[1]), - nc); + dequantize_row_q((const void*)((char*)src0->data + i01 * nb01 + i11 * nb02 + i12 * nb03), + (float*)((char*)dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3), nc); + } + } } } @@ -7235,13 +7993,35 @@ static void ne_compute_forward_get_rows_f16(const struct ne_compute_params* para assert(dst->ne[0] == nc); assert(dst->ne[1] == nr); assert(src0->nb[0] == sizeof(ne_fp16_t)); - - for (int i = 0; i < nr; ++i) { - const int r = ((int32_t*)src1->data)[i]; - - for (int j = 0; j < nc; ++j) { - ne_fp16_t v = ((ne_fp16_t*)((char*)src0->data + r * src0->nb[1]))[j]; - ((float*)((char*)dst->data + i * dst->nb[1]))[j] = NE_FP16_TO_FP32(v); + assert(src0->ne[2] == src1->ne[1]); + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + const int ne10 = src1->ne[0]; + const int ne11 = src1->ne[1]; + const int ne12 = src1->ne[2]; + const int ne13 = src1->ne[3]; + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + for (int64_t i10 = 0; i10 < ne10; ++i10) { + const int64_t i01 = *(int32_t*)((char*)src1->data + i10 * nb10 + i11 * nb11 + i12 * nb12); + + ne_fp16_to_fp32_row((const void*)((char*)src0->data + i01 * nb01 + i11 * nb02 + i12 * nb03), + (float*)((char*)dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3), nc); + } } } } @@ -7258,13 +8038,38 @@ static void ne_compute_forward_get_rows_f32(const struct ne_compute_params* para const int nr = ne_nelements(src1); assert(dst->ne[0] == nc); - assert(dst->ne[1] == nr); + assert(ne_nrows(dst) == nr); + assert(src0->ne[2] == src1->ne[1]); assert(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < nr; ++i) { - const int r = ((int32_t*)src1->data)[i]; - - ne_vec_cpy_f32(nc, (float*)((char*)dst->data + i * dst->nb[1]), (float*)((char*)src0->data + r * src0->nb[1])); + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + const int ne10 = src1->ne[0]; + const int ne11 = src1->ne[1]; + const int ne12 = src1->ne[2]; + const int ne13 = src1->ne[3]; + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + const int nb10 = src1->nb[0]; + const int nb11 
= src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + for (int64_t i10 = 0; i10 < ne10; ++i10) { + const int64_t i01 = *(int32_t*)((char*)src1->data + i10 * nb10 + i11 * nb11 + i12 * nb12); + ne_vec_cpy_f32(nc, (float*)((char*)dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3), + (float*)((char*)src0->data + i01 * nb01 + i11 * nb02 + i12 * nb03)); + } + } } } @@ -9487,9 +10292,18 @@ static void ne_compute_forward(struct ne_compute_params* params, struct ne_tenso case NE_OP_MUL_MAT: { ne_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor); } break; + case NE_OP_MUL_MAT_ID: { + ne_compute_forward_mul_mat_id(params, tensor->src0, tensor->src1, tensor); + } break; + case NE_OP_ARGSORT: { + ne_compute_forward_argsort(params, tensor->src0, tensor); + } break; case NE_OP_MUL_QKV: { ne_compute_forward_mul_qkv(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor); } break; + case NE_OP_MUL_ID_FFN_SILU: { + ne_compute_forward_ffn_id_silu(params, tensor->src0, tensor->src1, tensor->opt[24], tensor->opt[25], tensor); + } break; case NE_OP_MUL_FFN_SILU: { ne_compute_forward_ffn_silu(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor->opt[3], tensor); @@ -10466,14 +11280,18 @@ void ne_graph_compute(struct ne_context* ctx, struct ne_cgraph* cgraph) { work_size = MAX(work_size, cur); } break; case NE_OP_SUB: + case NE_OP_SUM: case NE_OP_DIV: + case NE_OP_SUM_ROWS: + // { + // node->n_tasks = 1; + // } break; case NE_OP_SQR: case NE_OP_SQRT: case NE_OP_LOG: - case NE_OP_SUM: - case NE_OP_SUM_ROWS: case NE_OP_MEAN: case NE_OP_ABS: + case NE_OP_ARGSORT: case NE_OP_SGN: case NE_OP_NEG: case NE_OP_STEP: @@ -10504,6 +11322,7 @@ void ne_graph_compute(struct ne_context* ctx, struct ne_cgraph* cgraph) { node->n_tasks = n_threads; } break; case NE_OP_MUL_MAT_BIAS: + case NE_OP_MUL_MAT_ID: case NE_OP_CONV_1D: case NE_OP_MUL_MAT: { node->n_tasks = n_threads; @@ -10516,17 +11335,20 @@ void ne_graph_compute(struct ne_context* ctx, struct ne_cgraph* cgraph) { // printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks); size_t cur = 0; - if (node->src0->type == NE_TYPE_BTLA) { - cur = bestla_f32f32_get_workspace_size(node->src1->ne[1], node->src0->ne[1], node->src1->ne[0], - node->src0->data); + struct ne_tensor* wei = node->src0; + if (node->op == NE_OP_MUL_MAT_ID) { + wei = node->opt[0]; + } + if (wei->type == NE_TYPE_BTLA) { + cur = bestla_f32f32_get_workspace_size(node->src1->ne[1], wei->ne[1], node->src1->ne[0], wei->data); node->n_tasks = 1; - } else if (node->src0->type == NE_TYPE_F16 && node->src1->type == NE_TYPE_F32) { + } else if (wei->type == NE_TYPE_F16 && node->src1->type == NE_TYPE_F32) { cur = NE_TYPE_SIZE[NE_TYPE_F16] * ne_nelements(node->src1); - } else if (node->src0->type == NE_TYPE_F32 && node->src1->type == NE_TYPE_F32) { + } else if (wei->type == NE_TYPE_F32 && node->src1->type == NE_TYPE_F32) { cur = 0; - } else if (ne_is_quantized(node->src0->type) && node->src1->type == NE_TYPE_F32) { + } else if (ne_is_quantized(wei->type) && node->src1->type == NE_TYPE_F32) { { - const enum ne_type type_q = quantize_fns[node->src0->type].vec_dot_type; + const enum ne_type type_q = quantize_fns[wei->type].vec_dot_type; cur = NE_TYPE_SIZE[type_q] * 
ne_nelements(node->src1) / NE_BLCK_SIZE[type_q]; } } else { @@ -10544,6 +11366,14 @@ void ne_graph_compute(struct ne_context* ctx, struct ne_cgraph* cgraph) { work_size = MAX(work_size, cur); node->n_tasks = 1; } break; + case NE_OP_MUL_ID_FFN_SILU: { + size_t cur = 0; + cur = + bestla_fusion_FFN_f32f32_get_workspace_size(node->src0->ne[1], node->src0->ne[0], node->opt[0]->ne[1], + node->opt[9]->ne[1], node->opt[0]->data, node->opt[9]->data); + work_size = MAX(work_size, cur); + node->n_tasks = 1; + } break; case NE_OP_MUL_QKV: { size_t cur = 0; cur = bestla_fusion_QKV_f32f32_get_workspace_size(node->src0->ne[1], node->src1->ne[1], node->src1->ne[0], @@ -10909,12 +11739,16 @@ void ne_graph_profiling(const struct ne_cgraph* cgraph) { NE_PRINT("=== GRAPH Profiling ===\n"); int64_t ip_duration = 0; + int64_t mul_mat_id_duration = 0; for (int i = 0; i < cgraph->n_nodes; i++) { struct ne_tensor* node = cgraph->nodes[i]; if (node->op == NE_OP_MUL_MAT && node->ne[1] == node->ne[2]) { ip_duration += node->perf_time_us; } else { perf_total_per_op_us[node->op] += node->perf_time_us; + if (node->op == NE_OP_MUL_MAT_ID) { + mul_mat_id_duration += node->perf_time_us; + } } } @@ -10925,6 +11759,7 @@ void ne_graph_profiling(const struct ne_cgraph* cgraph) { NE_PRINT("perf_total_per_op_us[%24s] = %7.3f ms\n", NE_OP_LABEL[i], (double)perf_total_per_op_us[i] / 1000.0); } NE_PRINT("perf_total_per_op_us[%24s] = %7.3f ms\n", "INNER PRODUCT", (double)ip_duration / 1000.0); + NE_PRINT("perf_total_per_op_us[%24s] = %7.3f ms\n", "MUL_MAT_ID", (double)mul_mat_id_duration / 1000.0); NE_PRINT("========================================\n"); #else diff --git a/neural_speed/core/ne_layers.h b/neural_speed/core/ne_layers.h index c8332a6e8..21cd48d44 100644 --- a/neural_speed/core/ne_layers.h +++ b/neural_speed/core/ne_layers.h @@ -254,9 +254,17 @@ NE_API struct ne_tensor* ne_rms_norm_back(struct ne_context* ctx, struct ne_tens // result is m columns, p rows NE_API struct ne_tensor* ne_mul_mat(struct ne_context* ctx, struct ne_tensor* a, struct ne_tensor* b); +NE_API struct ne_tensor* ne_mul_mat_id(struct ne_context* ctx, struct ne_tensor* const as[], int n_as, + struct ne_tensor* ids, int id, struct ne_tensor* b); +NE_API struct ne_tensor* ne_mul_id_ffn_silu(struct ne_context* ctx, struct ne_tensor* const down[], + struct ne_tensor* const gate[], struct ne_tensor* const up[], int n_as, + struct ne_tensor* ids, int id, struct ne_tensor* b); + NE_API struct ne_tensor* ne_mul_mat_with_bias(struct ne_context* ctx, struct ne_tensor* w, struct ne_tensor* b, struct ne_tensor* a); +NE_API struct ne_tensor* ne_argsort(struct ne_context* ctx, struct ne_tensor* a); +NE_API struct ne_tensor* ne_top_k(struct ne_context* ctx, struct ne_tensor* a, int k); // merged Q K V ne_mul_mat NE_API struct ne_tensor* ne_mul_qkv(struct ne_context* ctx, struct ne_tensor* qw, struct ne_tensor* kw, struct ne_tensor* vw, struct ne_tensor* src); diff --git a/neural_speed/models/llama/llama.cpp b/neural_speed/models/llama/llama.cpp index fad3d6e2d..41aedf08d 100644 --- a/neural_speed/models/llama/llama.cpp +++ b/neural_speed/models/llama/llama.cpp @@ -88,6 +88,8 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp int n_head = hparams.n_head; int head_size = n_embd / n_head; int n_head_kv = hparams.n_head_kv; + int n_expert = hparams.n_experts; + int n_expert_used = hparams.n_experts_used; bool enable_tp = false; #ifdef NS_TP_MODEL @@ -147,6 +149,7 @@ static bool llama_model_eval_internal(model_context* ctx, const 
model_input* inp struct ne_tensor* embd = ne_new_tensor_1d(ctx0, NE_TYPE_I32, N, NE_SIZE_CALC); ne_set_name(embd, "embd"); + for (int i = 0; i < batch_size; ++i) { memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); } @@ -351,17 +354,70 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp // cur = cur*ffn_norm(broadcasted) cur = ne_mul(ctx0, cur, model.layers[il].norm[1]); } - - if (bestla_fusion_FFN_SiLu_f32f32_support(model.layers[il].ffn[0]->data, model.layers[il].ffn[1]->data, - model.layers[il].ffn[2]->data, N, cur->ne[0], - model.layers[il].ffn[0]->ne[1], model.layers[il].ffn[1]->ne[1])) { - cur = ne_ffn_silu(ctx0, model.layers[il].ffn[0], model.layers[il].ffn[1], model.layers[il].ffn[2], cur); + if (n_expert == 0) { + if (bestla_fusion_FFN_SiLu_f32f32_support(model.layers[il].ffn[0]->data, model.layers[il].ffn[1]->data, + model.layers[il].ffn[2]->data, N, cur->ne[0], + model.layers[il].ffn[0]->ne[1], model.layers[il].ffn[1]->ne[1])) { + cur = ne_ffn_silu(ctx0, model.layers[il].ffn[0], model.layers[il].ffn[1], model.layers[il].ffn[2], cur); + } else { + struct ne_tensor* tmp = ne_mul_mat(ctx0, model.layers[il].ffn[2], cur); + cur = ne_mul_mat(ctx0, model.layers[il].ffn[0], cur); + cur = ne_silu(ctx0, cur); + cur = ne_mul(ctx0, cur, tmp); + cur = ne_mul_mat(ctx0, model.layers[il].ffn[1], cur); + } } else { - struct ne_tensor* tmp = ne_mul_mat(ctx0, model.layers[il].ffn[2], cur); - cur = ne_mul_mat(ctx0, model.layers[il].ffn[0], cur); - cur = ne_silu(ctx0, cur); - cur = ne_mul(ctx0, cur, tmp); - cur = ne_mul_mat(ctx0, model.layers[il].ffn[1], cur); + ne_tensor* logits = ne_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] + ne_tensor* probs = ne_soft_max_inplace(ctx0, logits); + ne_tensor* selected_experts = ne_top_k(ctx0, probs, n_expert_used); + ne_tensor* weights = ne_get_rows(ctx0, ne_reshape_3d(ctx0, probs, 1, n_expert, N), selected_experts); + weights = ne_reshape_2d(ctx0, weights, n_expert_used, N); + ne_tensor* weights_sum = ne_sum_rows(ctx0, weights); + weights_sum = ne_repeat(ctx0, weights_sum, weights); + weights = ne_div(ctx0, weights, weights_sum); + ne_tensor* moe_out = nullptr; + + for (int i = 0; i < n_expert_used; ++i) { + ne_tensor* cur_expert; + if (N == 1 && bestla_fusion_FFN_SiLu_f32f32_support( + model.layers[il].ffn_gate_exp[0]->data, model.layers[il].ffn_down_exp[0]->data, + model.layers[il].ffn_up_exp[0]->data, N, cur->ne[0], + model.layers[il].ffn_gate_exp[0]->ne[1], model.layers[il].ffn_down_exp[0]->ne[1])) { + cur_expert = ne_mul_id_ffn_silu(ctx0, model.layers[il].ffn_down_exp, model.layers[il].ffn_gate_exp, + model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur); + } else { + ne_tensor* cur_up = ne_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur); + ne_set_name(cur_up, "ffn_moe_up"); + + ne_tensor* cur_gate = + ne_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur); + ne_set_name(cur_gate, "ffn_moe_gate"); + + cur_gate = ne_silu(ctx0, cur_gate); + ne_set_name(cur_gate, "ffn_moe_silu"); + + cur_expert = ne_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd] + ne_set_name(cur_expert, "ffn_moe_gate_par"); + + cur_expert = ne_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, + cur_expert); // [n_tokens, n_embd] + ne_set_name(cur_expert, "ffn_moe_down"); + } + + cur_expert = + ne_mul(ctx0, cur_expert, + ne_repeat(ctx0, ne_view_2d(ctx0, weights, 1, N, 
weights->nb[1], i * weights->nb[0]), cur_expert)); + ne_set_name(cur_expert, "ffn_moe_weighted"); + + if (i == 0) { + moe_out = cur_expert; + } else { + moe_out = ne_add(ctx0, moe_out, cur_expert); + ne_set_name(moe_out, "ffn_moe_out"); + } + } + + cur = moe_out; } #ifdef NS_TP_MODEL // ffn2 and ffn0 use split row, ffn1 use split column @@ -424,7 +480,6 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp sizeof(float) * n_vocab); } } - // extract embeddings if (!lctx.embedding.empty()) { auto& embedding_out = lctx.embedding; diff --git a/neural_speed/models/llama/llama.h b/neural_speed/models/llama/llama.h index e498de254..2cf7bdd08 100644 --- a/neural_speed/models/llama/llama.h +++ b/neural_speed/models/llama/llama.h @@ -47,7 +47,7 @@ class Llama : public IModel { private: model_archs arch = MODEL_LLAMA; std::unique_ptr ml; - uint32_t n_layer, n_embd, n_ff, n_vocab, n_head, n_head_kv; + uint32_t n_layer, n_embd, n_ff, n_vocab, n_head, n_head_kv, n_expert, n_expert_used; int n_gpu_layer; bool use_mmap, use_mlock, vocab_only; model_scratch scratch; diff --git a/neural_speed/models/llama/llama_utils.cpp b/neural_speed/models/llama/llama_utils.cpp index fd8fe065b..128f249a9 100644 --- a/neural_speed/models/llama/llama_utils.cpp +++ b/neural_speed/models/llama/llama_utils.cpp @@ -79,6 +79,8 @@ void Llama::init(const char* path_model, model_context* ctx, int n_gpu_layer_, b n_layer = hparams.n_layer; n_head_kv = hparams.n_head_kv; n_head = hparams.n_head; + n_expert = hparams.n_experts; + n_expert_used = hparams.n_experts_used; scratch = llama_mem_req(n_layer); model.scratchs = scratch; } @@ -140,9 +142,25 @@ void Llama::load(model_context* ctx, model_progress_callback progress_callback, layer.norm[1] = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend); // ffn GEMM - layer.ffn[0] = ml->get_tensor(layers_i + ".ffn_gate.weight", {n_embd, n_ff}, backend); - layer.ffn[1] = ml->get_tensor(layers_i + ".ffn_down.weight", {n_ff, n_embd}, backend); - layer.ffn[2] = ml->get_tensor(layers_i + ".ffn_up.weight", {n_embd, n_ff}, backend); + if (ml->verify_tensor(layers_i + ".ffn_gate.weight")) { + NE_ASSERT(n_expert == 0); + NE_ASSERT(n_expert_used == 0); + layer.ffn[0] = ml->get_tensor(layers_i + ".ffn_gate.weight", {n_embd, n_ff}, backend); + layer.ffn[1] = ml->get_tensor(layers_i + ".ffn_down.weight", {n_ff, n_embd}, backend); + layer.ffn[2] = ml->get_tensor(layers_i + ".ffn_up.weight", {n_embd, n_ff}, backend); + } else { + NE_ASSERT(n_expert > 0); + NE_ASSERT(n_expert_used > 0); + layer.ffn_gate_inp = ml->get_tensor(layers_i + ".ffn_gate_inp.weight", {n_embd, n_expert}, backend); + for (uint32_t x = 0; x < n_expert; ++x) { + layer.ffn_gate_exp[x] = + ml->get_tensor(layers_i + ".ffn_gate." + std::to_string(x) + ".weight", {n_embd, n_ff}, backend); + layer.ffn_down_exp[x] = + ml->get_tensor(layers_i + ".ffn_down." + std::to_string(x) + ".weight", {n_ff, n_embd}, backend); + layer.ffn_up_exp[x] = + ml->get_tensor(layers_i + ".ffn_up." 
+            ml->get_tensor(layers_i + ".ffn_up." + std::to_string(x) + ".weight", {n_embd, n_ff}, backend);
+      }
+    }

     if (backend != NE_BACKEND_CPU) {
       vram_total += ne_nbytes(layer.norm[0]) + ne_nbytes(layer.attn[0]) + ne_nbytes(layer.attn[1]) +
@@ -176,10 +194,26 @@ void Llama::load(model_context* ctx, model_progress_callback progress_callback,
     layer.norm[1] = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

     // ffn GEMM
-    layer.ffn[0] = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
-    layer.ffn[1] = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {n_ff, n_embd}, backend);
-    layer.ffn[2] = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+    if (ml->verify_tensor(layers_i + ".feed_forward.w1.weight")) {
+      NE_ASSERT(n_expert == 0);
+      NE_ASSERT(n_expert_used == 0);
+      layer.ffn[0] = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+      layer.ffn[1] = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {n_ff, n_embd}, backend);
+      layer.ffn[2] = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+    } else {
+      NE_ASSERT(n_expert > 0);
+      NE_ASSERT(n_expert_used > 0);
+      layer.ffn_gate_inp = ml->get_tensor(layers_i + ".ffn_gate_inp.weight", {n_embd, n_expert}, backend);
+      for (uint32_t x = 0; x < n_expert; ++x) {
+        layer.ffn_gate_exp[x] =
+            ml->get_tensor(layers_i + ".ffn_gate." + std::to_string(x) + ".weight", {n_embd, n_ff}, backend);
+        layer.ffn_down_exp[x] =
+            ml->get_tensor(layers_i + ".ffn_down." + std::to_string(x) + ".weight", {n_ff, n_embd}, backend);
+        layer.ffn_up_exp[x] =
+            ml->get_tensor(layers_i + ".ffn_up." + std::to_string(x) + ".weight", {n_embd, n_ff}, backend);
+      }
+    }

     if (backend != NE_BACKEND_CPU) {
       vram_total += ne_nbytes(layer.norm[0]) + ne_nbytes(layer.attn[0]) + ne_nbytes(layer.attn[1]) +
                     ne_nbytes(layer.attn[2]) + ne_nbytes(layer.attn[3]) + ne_nbytes(layer.norm[1]) +
diff --git a/neural_speed/models/model_utils/gguf.h b/neural_speed/models/model_utils/gguf.h
index 0018ec7d3..251280fa3 100644
--- a/neural_speed/models/model_utils/gguf.h
+++ b/neural_speed/models/model_utils/gguf.h
@@ -423,6 +425,8 @@ enum llm_kv {
   LLM_KV_ATTENTION_CLAMP_KQV,
   LLM_KV_ATTENTION_LAYERNORM_EPS,
   LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+  LLM_KV_NUM_EXPERTS,
+  LLM_KV_NUM_EXPERTS_USED,

   LLM_KV_ROPE_DIMENSION_COUNT,
   LLM_KV_ROPE_FREQ_BASE,
@@ -466,6 +468,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     {LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length"},
     {LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual"},
     {LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout"},
+    {LLM_KV_NUM_EXPERTS, "%s.expert_count"},
+    {LLM_KV_NUM_EXPERTS_USED, "%s.expert_used_count"},

     {LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count"},
     {LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv"},
diff --git a/neural_speed/models/model_utils/model_files.h b/neural_speed/models/model_utils/model_files.h
index 7c6ed97ec..3813a0a09 100644
--- a/neural_speed/models/model_utils/model_files.h
+++ b/neural_speed/models/model_utils/model_files.h
@@ -908,6 +908,9 @@ struct gguf_loader {
   GGUF_GET_KEY(ctx_gguf, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT));
   GGUF_GET_KEY(ctx_gguf, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false,
                kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
+  GGUF_GET_KEY(ctx_gguf, hparams.n_experts, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_NUM_EXPERTS));
+  GGUF_GET_KEY(ctx_gguf, hparams.n_experts_used, gguf_get_val_u32, GGUF_TYPE_UINT32, false,
+               kv(LLM_KV_NUM_EXPERTS_USED));
   GGUF_GET_KEY(ctx_gguf, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_BLOCK_COUNT));

   GGUF_GET_KEY(ctx_gguf, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
@@ -1095,6 +1098,8 @@ struct model_file_loader {

     // For ChatGLM-2
     hparams.inner_hidden_size = file.read_u32();
+    hparams.n_experts = file.read_u32();
+    hparams.n_experts_used = file.read_u32();

     file.read_raw(&hparams.rms_norm_eps, sizeof(float));
     file.read_raw(&hparams.freq_base, sizeof(float));
@@ -1219,6 +1224,8 @@ struct model_file_saver {
     file.write_u32(hparams.multi_query_group_num);
     file.write_u32(hparams.ffn_hidden_size);
     file.write_u32(hparams.inner_hidden_size);
+    file.write_u32(hparams.n_experts);
+    file.write_u32(hparams.n_experts_used);

     file.write_raw(&hparams.rms_norm_eps, sizeof(float));
     file.write_raw(&hparams.freq_base, sizeof(float));
diff --git a/neural_speed/models/model_utils/model_types.h b/neural_speed/models/model_utils/model_types.h
index 33e7df888..8f5bc43f1 100644
--- a/neural_speed/models/model_utils/model_types.h
+++ b/neural_speed/models/model_utils/model_types.h
@@ -45,6 +45,7 @@
 #define MODEL_MAX_ATTN 8
 #define MODEL_MAX_FFN 6
 #define MODEL_MAX_OTHERS 7
+#define MODEL_MAX_EXPERTS 8

 #define MODEL_USE_SCRATCH
 #define MODEL_MAX_SCRATCH_BUFFERS 16
@@ -139,6 +140,9 @@ struct model_hparams {

   // ChatGLM-1
   int32_t inner_hidden_size = 0;
+  uint32_t n_experts = 0;
+  uint32_t n_experts_used = 0;
+
   float rope_scaling_factor = 0.0f;
   int32_t original_max_position_embeddings = 0;
   int32_t use_yarn = 0;
@@ -158,6 +162,14 @@ struct model_layer {
   // ff
   struct ne_tensor* ffn[MODEL_MAX_FFN];
+  struct ne_tensor* ffn_gate_inp;
+
+  struct ne_tensor* ffn_gate_exp[MODEL_MAX_EXPERTS];
+
+  struct ne_tensor* ffn_down_exp[MODEL_MAX_EXPERTS];
+
+  struct ne_tensor* ffn_up_exp[MODEL_MAX_EXPERTS];
+
   struct ne_tensor* k_cache;
   struct ne_tensor* v_cache;

@@ -471,7 +483,8 @@ class model_name_to_arch {
       {"dolly", MODEL_GPTNEOX},   {"polyglot", MODEL_GPTNEOX},  {"starcoder", MODEL_STARCODER},
       {"falcon", MODEL_FALCON},   {"bloom", MODEL_BLOOM},       {"chatglm2", MODEL_CHATGLM2},
       {"chatglm", MODEL_CHATGLM}, {"baichuan", MODEL_BAICHUAN}, {"mistral", MODEL_LLAMA},
-      {"qwen", MODEL_QWEN},       {"phi", MODEL_PHI},           {"whisper", MODEL_WHISPER}};
+      {"qwen", MODEL_QWEN},       {"phi", MODEL_PHI},           {"whisper", MODEL_WHISPER},
+      {"mixtral", MODEL_LLAMA}};
 };

 #ifdef __cplusplus
diff --git a/tests/model-test/cpp_graph_inference.sh b/tests/model-test/cpp_graph_inference.sh
index 973c09892..500fb37fd 100644
--- a/tests/model-test/cpp_graph_inference.sh
+++ b/tests/model-test/cpp_graph_inference.sh
@@ -155,6 +155,7 @@ model_name_map["qwen-7b"]="Qwen/Qwen-7B-Chat"
 model_name_map["magicoder"]="ise-uiuc/Magicoder-S-DS-6.7B"
 model_name_map["whisper"]="openai/whisper-tiny"
 model_name_map["phi2"]="microsoft/phi-2"
+model_name_map["mixtral"]="mistralai/Mixtral-8x7B-Instruct-v0.1"

 function main() {
     conda_env="$1"
@@ -263,6 +264,10 @@ function main() {
         quant_script="./build/bin/quant_phi"
         convert_script="${convert_script}/convert_phi.py"
         infer_cmd="./build/bin/run_phi"
+    elif [[ "${model}" == "mixtral" ]]; then
+        quant_script="./build/bin/quant_mixtral"
+        convert_script="${convert_script}/convert_mixtral.py"
+        infer_cmd="./build/bin/run_mixtral"
     else
         echo "Error: Unexpedted model: $model" 1>&2
         exit 1
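Note on the MoE block added to llama.cpp above: the graph implements standard Mixtral top-k routing. The router projection (ffn_gate_inp) produces one logit per expert, softmax plus ne_top_k selects n_expert_used experts, the selected probabilities are renormalized to sum to 1, and each selected expert's SiLU-gated FFN output is accumulated with its weight. The stand-alone C++ sketch below mirrors that math for a single token; it is illustrative only and uses hypothetical helper names and scalar stand-ins for the expert GEMMs, not the actual ne_* API.

// Illustrative sketch (assumed names, not neural_speed code): top-k MoE routing for one token.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <numeric>
#include <vector>

// Softmax over the router logits, as done by ne_soft_max_inplace in the graph above.
static std::vector<float> softmax(const std::vector<float>& x) {
  float m = *std::max_element(x.begin(), x.end());
  std::vector<float> p(x.size());
  float sum = 0.f;
  for (std::size_t i = 0; i < x.size(); ++i) sum += (p[i] = std::exp(x[i] - m));
  for (float& v : p) v /= sum;
  return p;
}

// Stand-in for one expert's gate/up/down GEMMs (ffn_gate_exp / ffn_up_exp / ffn_down_exp);
// here just an arbitrary per-expert elementwise function with a SiLU gate.
static std::vector<float> expert_ffn(int expert_id, const std::vector<float>& x) {
  std::vector<float> y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    float g = x[i] * (1.f + 0.01f * static_cast<float>(expert_id));  // "gate" projection stand-in
    float silu = g / (1.f + std::exp(-g));                           // SiLU, as in ne_silu
    y[i] = silu * x[i];                                              // "up" projection * gate, then "down"
  }
  return y;
}

// Route one token through n_expert_used of the experts and blend the outputs.
static std::vector<float> moe_forward(const std::vector<float>& router_logits,
                                      const std::vector<float>& token, int n_expert_used) {
  std::vector<float> probs = softmax(router_logits);
  std::vector<int> order(probs.size());
  std::iota(order.begin(), order.end(), 0);
  // Top-k expert selection (ne_top_k): indices of the k largest routing probabilities.
  std::partial_sort(order.begin(), order.begin() + n_expert_used, order.end(),
                    [&](int a, int b) { return probs[a] > probs[b]; });
  // Renormalize the selected probabilities (ne_sum_rows followed by ne_div).
  float wsum = 0.f;
  for (int i = 0; i < n_expert_used; ++i) wsum += probs[order[i]];
  std::vector<float> out(token.size(), 0.f);
  for (int i = 0; i < n_expert_used; ++i) {
    float w = probs[order[i]] / wsum;
    std::vector<float> e = expert_ffn(order[i], token);
    for (std::size_t j = 0; j < out.size(); ++j) out[j] += w * e[j];  // weighted accumulation (ne_add)
  }
  return out;
}

int main() {
  std::vector<float> logits = {0.1f, 1.2f, -0.3f, 0.7f, 0.0f, -1.0f, 0.4f, 0.9f};  // 8 experts
  std::vector<float> token(16, 0.5f);                                              // toy embedding
  std::vector<float> out = moe_forward(logits, token, /*n_expert_used=*/2);        // Mixtral uses top-2
  return out.empty() ? 1 : 0;
}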