
Commit

Gemma-7b&&Gemma-2b (#171)
intellinjun authored Mar 22, 2024
1 parent 94e74d7 commit e4c5f71
Showing 44 changed files with 1,069 additions and 13 deletions.
16 changes: 15 additions & 1 deletion docs/supported_models.md
@@ -276,7 +276,21 @@ Neural Speed supports the following models:
<td>Latest</td>
<td>2048</td>
</tr>
<tr>
<tr>
<td><a href="https://huggingface.co/google/gemma-2b-it" target="_blank" rel="noopener noreferrer">gemma-2b-it </a>,
<a href="https://huggingface.co/google/gemma-7b" target="_blank" rel="noopener noreferrer">gemma-7b</a></td>
<td>✅</td>
<td> </td>
<td> </td>
<td> </td>
<td>✅</td>
<td> </td>
<td> </td>
<td> </td>
<td>Latest</td>
<td>8192</td>
</tr>
<tr>
<td><a href="https://huggingface.co/openai/whisper-tiny" target="_blank" rel="noopener noreferrer">Whisper-tiny</a>,
<a href="https://huggingface.co/openai/whisper-base" target="_blank" rel="noopener noreferrer">Whisper-base</a>
<a href="https://huggingface.co/openai/whisper-small" target="_blank" rel="noopener noreferrer">Whisper-small</a>
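The Gemma rows above plug into the same Python front end as the other decoder-only models. A minimal usage sketch for one of the newly listed checkpoints, following the Model.init()/generate() pattern from the project README (the weight_dtype/compute_dtype values are illustrative assumptions, not part of this change):

from transformers import AutoTokenizer, TextStreamer
from neural_speed import Model

model_name = "google/gemma-2b-it"
prompt = "Why is the sky blue?"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)

model = Model()
model.init(model_name, weight_dtype="int4", compute_dtype="int8")  # quantize while converting
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=128)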
2 changes: 2 additions & 0 deletions neural_speed/__init__.py
@@ -70,6 +70,8 @@ def __import_package(self, model_type):
import neural_speed.qwen_cpp as cpp_model
elif model_type == "phi":
import neural_speed.phi_cpp as cpp_model
elif model_type == "gemma":
import neural_speed.gemma_cpp as cpp_model
elif model_type == "stablelm":
import neural_speed.stablelm_cpp as cpp_model
elif model_type == "whisper":
5 changes: 5 additions & 0 deletions neural_speed/application/CMakeLists.txt
@@ -71,6 +71,7 @@ compile_quant(quant_mistral quant_model.cpp mistral llama)
compile_quant(quant_mixtral quant_model.cpp mixtral llama)
compile_quant(quant_qwen quant_model.cpp qwen qwen)
compile_quant(quant_phi quant_model.cpp phi phi)
compile_quant(quant_gemma quant_model.cpp gemma gemma)
compile_quant(quant_stablelm quant_model.cpp stablelm stablelm)
compile_quant(quant_whisper quant_whisper.cpp whisper whisper)

@@ -99,6 +100,8 @@ set(mymap_stablelm 17)
set(mymap_whisper 18)
set(mymap_mixtral 19)
set(mymap_chatglm3 20)
set(mymap_gemma 21)



function(compile_run TARGET MAIN_CPP MAIN_PY MODEL_NAME MODEL_LIB)
@@ -135,8 +138,10 @@ compile_run(run_baichuan main_run.cpp main_pybind.cpp baichuan baichuan)
compile_run(run_mistral main_run.cpp main_pybind.cpp mistral llama)
compile_run(run_qwen main_run.cpp main_pybind.cpp qwen qwen)
compile_run(run_phi main_run.cpp main_pybind.cpp phi phi)
compile_run(run_gemma main_run.cpp main_pybind.cpp gemma gemma)
compile_run(run_stablelm main_run.cpp main_pybind.cpp stablelm stablelm)
compile_run(run_mixtral main_run.cpp main_pybind.cpp mixtral llama)


# speech recognition
compile_run(run_whisper audio_run.cpp whisper_pybind.cpp whisper whisper)
3 changes: 3 additions & 0 deletions neural_speed/application/main_pybind.cpp
@@ -924,6 +924,9 @@ PYBIND11_MODULE(mixtral_cpp, m)
#elif MODEL_NAME_ID == 20

PYBIND11_MODULE(chatglm3_cpp, m)
#elif MODEL_NAME_ID == 21

PYBIND11_MODULE(gemma_cpp, m)

#endif
{
2 changes: 1 addition & 1 deletion neural_speed/application/main_run.cpp
@@ -229,7 +229,7 @@ int main(int argc, char** argv) { // NOLINT

// tokenize the prompt
bool add_bos = false;
if (params.model_arch == MODEL_LLAMA) {
if (params.model_arch == MODEL_LLAMA || params.model_arch == MODEL_GEMMA) {
add_bos = true;
}
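Gemma, like LLaMA, expects a leading BOS token, so the prompt-tokenization path now opts in for MODEL_GEMMA as well. A quick sanity check with the Hugging Face tokenizer (illustrative only, not part of this commit; requires access to the gated checkpoint):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("google/gemma-2b-it")
ids = tok("Why is the sky blue?").input_ids
# Gemma's tokenizer prepends <bos> by default, matching add_bos = true above.
assert ids[0] == tok.bos_token_id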

1 change: 1 addition & 0 deletions neural_speed/convert/convert_baichuan.py
@@ -157,6 +157,7 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
fout.write(struct.pack("i", hparams["intermediate_size"]))
fout.write(struct.pack("i", 0)) # n_experts
fout.write(struct.pack("i", 0)) # n_expert_used
fout.write(struct.pack("i", 0)) # n_embd_head_k for gemma
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms_norm_eps or layer_norm_eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
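Each converter writes the NE model header as one fixed sequence of packed little-endian int32/float32 fields, so the new n_embd_head_k slot has to be emitted by every converter, even as a 0 placeholder, or every later field would shift by four bytes. A small sketch of the affected stretch of the header (field names follow the comments in these converters; the values are illustrative):

import struct

# ... n_experts, n_expert_used, n_embd_head_k, then the norm epsilon ...
tail = struct.pack("iii", 0, 0, 0) + struct.pack("f", 1e-6)
n_experts, n_expert_used, n_embd_head_k = struct.unpack_from("iii", tail, 0)
(norm_eps,) = struct.unpack_from("f", tail, 12)
print(n_experts, n_expert_used, n_embd_head_k, norm_eps)  # 0 0 0 ~1e-06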
1 change: 1 addition & 0 deletions neural_speed/convert/convert_bloom.py
@@ -106,6 +106,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", 0)) # n_experts
fout.write(struct.pack("i", 0)) # n_expert_used
fout.write(struct.pack("i", 0)) # n_embd_head_k for gemma
fout.write(struct.pack("f", hparams.get("layer_norm_epsilon", 1e-5))) # rms_norm_eps or layer_norm_eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
5 changes: 3 additions & 2 deletions neural_speed/convert/convert_chatglm.py
@@ -156,8 +156,6 @@ def chatglm3_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams
gguf_file = fname_out
gguf_writer = gguf.GGUFWriter(gguf_file, "chatglm3")
gguf_writer.add_uint32('magic', 0x67676d66)
import pdb
pdb.set_trace()
gguf_writer.add_uint32('version', 1)
gguf_writer.add_uint32('n_vocab', hparams["padded_vocab_size"])
gguf_writer.add_embedding_length(hparams["hidden_size"])
@@ -561,6 +559,7 @@ def chatglm3_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", 0)) # n_experts
fout.write(struct.pack("i", 0)) # n_expert_used
fout.write(struct.pack("i", 0)) # n_embd_head_k for gemma
fout.write(struct.pack("f", hparams.get("layernorm_epsilon", 1e-5))) # rms_norm_eps or layer_norm_eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
@@ -711,6 +710,7 @@ def chatglm2_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", 0)) # n_experts
fout.write(struct.pack("i", 0)) # n_expert_used
fout.write(struct.pack("i", 0)) # n_embd_head_k for gemma
fout.write(struct.pack("f", hparams.get("layernorm_epsilon", 1e-5))) # rms_norm_eps or layer_norm_eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
@@ -862,6 +862,7 @@ def chatglm1_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", 0)) # n_experts
fout.write(struct.pack("i", 0)) # n_expert_used
fout.write(struct.pack("i", 0)) # n_embd_head_k for gemma
fout.write(struct.pack("f", hparams.get("layernorm_epsilon", 1e-5))) # rms_norm_eps or layer_norm_eps

fout.write(struct.pack("f", 10000.0)) # freq_base
1 change: 1 addition & 0 deletions neural_speed/convert/convert_dolly.py
@@ -120,6 +120,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", 0)) # n_experts
fout.write(struct.pack("i", 0)) # n_expert_used
fout.write(struct.pack("i", 0)) # n_embd_head_k for gemma
fout.write(struct.pack("f", hparams.get("layer_norm_eps", 1e-5))) # rms_norm_eps or layer_norm_eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
1 change: 1 addition & 0 deletions neural_speed/convert/convert_falcon.py
@@ -113,6 +113,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", 0)) # n_experts
fout.write(struct.pack("i", 0)) # n_expert_used
fout.write(struct.pack("i", 0)) # n_embd_head_k for gemma
fout.write(struct.pack("f", hparams.get("layer_norm_epsilon", 1e-5))) # rms_norm_eps or layer_norm_eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
195 changes: 195 additions & 0 deletions neural_speed/convert/convert_gemma.py
@@ -0,0 +1,195 @@
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Convert Hugging Face Gemma models to NE format
#
# Usage:
#
#   python3 neural_speed/convert/convert_gemma.py <model_dir_or_repo_id> --outfile <out_file> [--outtype f32|f16]
#
# This script is similar to "convert-pt-to-ne.py"
#

import io
import os
import sys
import struct
import json
import code
import torch
import numpy as np
from pathlib import Path
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
                    Union)



# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))


def main(args_in: Optional[List[str]] = None) -> None:
    parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
    parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("--model_hub", choices=["huggingface", "modelscope"],
                        default="huggingface", help="hub to load model")
    parser.add_argument("model", type=Path, help="directory containing model file")
    args = parser.parse_args(args_in)

    dir_model = args.model.as_posix()
    fname_out = args.outfile.as_posix()

    # possible data types
    #   ftype == 0 -> float32
    #   ftype == 1 -> float16
    ftype = 0
    if args.outtype == "f16":
        ftype = 1
    if args.model_hub == "modelscope":
        from modelscope import AutoModelForCausalLM, AutoTokenizer
    else:
        from transformers import AutoModelForCausalLM, AutoTokenizer
    print("Loading model: ", dir_model)
    model = AutoModelForCausalLM.from_pretrained(dir_model)
    tokenizer = AutoTokenizer.from_pretrained(dir_model)
    model.eval()
    for p in model.parameters():
        p.requires_grad = False
    hparams = model.config.to_dict()
    print("Model loaded: ", dir_model)

    fout = open(fname_out, "wb")

    # 0x67676d6c is unversioned ne
    # 0x67676d66 is versioned ggmf (requires token scores)
    ne_file_magic = 0x67676d66
    # ne_file_version = 0x00000001  # v1

    fout.write(struct.pack("i", ne_file_magic))  # magic: ne in hex
    fout.write(struct.pack("i", 1))
    fout.write(struct.pack("i", hparams["vocab_size"]))
    fout.write(struct.pack("i", hparams["hidden_size"]))
    fout.write(struct.pack("i", hparams["intermediate_size"]))  # dummy data
    fout.write(struct.pack("i", hparams["num_attention_heads"]))
    fout.write(struct.pack("i", hparams["num_key_value_heads"]))  # multi-query attention
    fout.write(struct.pack("i", hparams["num_hidden_layers"]))
    fout.write(struct.pack("i", hparams["head_dim"]))
    fout.write(struct.pack("i", ftype))
    fout.write(
        struct.pack("i", hparams["seq_length"] if "seq_length" in hparams else hparams["max_position_embeddings"]))
    fout.write(struct.pack("f", 0.0))
    fout.write(struct.pack("f", 0.0))
    fout.write(struct.pack("i", 0))
    fout.write(struct.pack("i", 0))  # word_embed_proj_dim (for opt)
    fout.write(struct.pack("i", 0))  # do_layer_norm_before (for opt)

    fout.write(struct.pack("i", 0))
    fout.write(struct.pack("i", hparams["intermediate_size"]))
    fout.write(struct.pack("i", 0))
    fout.write(struct.pack("i", 0))  # n_experts
    fout.write(struct.pack("i", 0))  # n_expert_used
    fout.write(struct.pack("i", hparams["head_dim"]))  # n_embd_head_k
    fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms norm eps
    fout.write(struct.pack("f", 10000.0))  # freq_base
    fout.write(struct.pack("f", 1.0))  # rope_factor

    fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
    fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
    fout.write(struct.pack("i", 0))  # params["rope_scaling"]["type"] == "yarn" else 0
    fout.write(struct.pack("i", hparams["bos_token_id"]))
    fout.write(struct.pack("i", hparams["eos_token_id"]))
    fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
    fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1))

    for i in range(hparams["vocab_size"]):
        if i < tokenizer.vocab_size:
            text = tokenizer.decode([i]).encode('utf-8')
            fout.write(struct.pack("i", len(text)))
            fout.write(text)
            fout.write(struct.pack("f", 0.0 - i))
        else:
            text = tokenizer.decode([tokenizer.vocab_size - 1]).encode('utf-8')
            fout.write(struct.pack("i", len(text)))
            fout.write(text)
            fout.write(struct.pack("f", -10000))

    list_vars = model.state_dict()

    print(hparams)

    for name in list_vars.keys():
        # No gradients for these
        list_vars[name].requires_grad = False
        src = name
        nn = name

        print(src, ' -> ', name)
        data = list_vars[src].squeeze().numpy()
        data = data.astype(np.float32)

        n_dims = len(data.shape)
        print(name, n_dims, data.shape)

        # default type is fp32
        ftype_cur = 0
        if ftype == 1 and n_dims > 1:
            print("  Converting to float16", data.shape, data[:3, :3].tolist())
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32", data.shape, data[:3, :3].tolist() if n_dims > 1 else data[:3].tolist())
            data = data.astype(np.float32)
        # gemma_rms:
        #   output = self._norm(x.float()).type_as(x)
        #   return output * (1 + self.weight)
        if "norm" in name:
            data = data + 1
        str = name.encode('utf-8')
        fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
        for i in range(n_dims):
            fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
        print(str)
        fout.write(str)

        # data
        data.tofile(fout)

    fout.close()

    print("Done. Output file: " + fname_out)
    print("")


if __name__ == '__main__':
    main()
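Because main() accepts an explicit argument list, the new converter can be driven from the command line or programmatically. A hedged sketch of converting gemma-2b-it to an FP16 NE file (the output path is an arbitrary example; assumes neural_speed is importable from the source tree):

from neural_speed.convert.convert_gemma import main as convert_gemma

convert_gemma([
    "--outtype", "f16",
    "--outfile", "ne-gemma-2b-f16.bin",  # example output path
    "google/gemma-2b-it",                # local directory or Hugging Face repo id
])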
1 change: 1 addition & 0 deletions neural_speed/convert/convert_gptj.py
@@ -105,6 +105,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", 0)) # n_experts
fout.write(struct.pack("i", 0)) # n_expert_used
fout.write(struct.pack("i", 0)) # n_embd_head_k for gemma
fout.write(struct.pack("f", hparams.get("layer_norm_epsilon", 1e-5))) # rms_norm_eps or layer_norm_eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
1 change: 1 addition & 0 deletions neural_speed/convert/convert_gptneox.py
@@ -121,6 +121,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", 0)) # n_experts
fout.write(struct.pack("i", 0)) # n_expert_used
fout.write(struct.pack("i", 0)) # n_embd_head_k for gemma
fout.write(struct.pack("f", hparams.get("layer_norm_eps", 1e-5))) # rms_norm_eps or layer_norm_eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
1 change: 1 addition & 0 deletions neural_speed/convert/convert_llama.py
@@ -1090,6 +1090,7 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:

self.fout.write(struct.pack("i", 0)) # n_experts
self.fout.write(struct.pack("i", 0)) # n_expert_used
self.fout.write(struct.pack("i", 0)) # n_embd_head_k for gemma
self.fout.write(struct.pack("f", params.rms_norm_eps))
self.fout.write(struct.pack("f", params.rope_theta))
self.fout.write(struct.pack("f", params.rope_scale))
1 change: 1 addition & 0 deletions neural_speed/convert/convert_mistral.py
@@ -1064,6 +1064,7 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:

self.fout.write(struct.pack("i", 0)) # n_experts
self.fout.write(struct.pack("i", 0)) # n_expert_used
self.fout.write(struct.pack("i", 0)) # n_embd_head_k for gemma
self.fout.write(struct.pack("f", params.rms_norm_eps))
self.fout.write(struct.pack("f", params.rope_theta))
self.fout.write(struct.pack("f", params.rope_scale))
1 change: 1 addition & 0 deletions neural_speed/convert/convert_mixtral.py
@@ -1066,6 +1066,7 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
self.fout.write(struct.pack("i", 0))
self.fout.write(struct.pack("i", 8))
self.fout.write(struct.pack("i", 2))
self.fout.write(struct.pack("i", 0)) # n_embd_head_k for gemma
self.fout.write(struct.pack("f", params.rms_norm_eps))
self.fout.write(struct.pack("f", params.rope_theta))
self.fout.write(struct.pack("f", params.rope_scale))
1 change: 1 addition & 0 deletions neural_speed/convert/convert_mpt.py
@@ -102,6 +102,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", 0)) # n_experts
fout.write(struct.pack("i", 0)) # n_expert_used
fout.write(struct.pack("i", 0)) # n_embd_head_k for gemma
fout.write(struct.pack("f", hparams.get("layer_norm_eps", 1e-5))) # rms_norm_eps or layer_norm_eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
1 change: 1 addition & 0 deletions neural_speed/convert/convert_opt.py
@@ -114,6 +114,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", 0)) # n_experts
fout.write(struct.pack("i", 0)) # n_expert_used
fout.write(struct.pack("i", 0)) # n_embd_head_k for gemma
fout.write(struct.pack("f", hparams.get("layer_norm_eps", 1e-5))) # rms_norm_eps or layer_norm_eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

