This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

improve SQ to adapt optimum-intel 1.16.1 #1591

Merged 3 commits on Jun 6, 2024

@@ -7,8 +7,7 @@ sentencepiece != 0.1.92
 torch==2.3.0+cpu
 transformers
 intel_extension_for_pytorch==2.3.0
-git+https://github.com/huggingface/optimum.git@e38d40ad220a180213f99b1d93d0407a826c326d
-optimum-intel
+optimum-intel==1.16.1
 bitsandbytes #baichuan
 transformers_stream_generator
 tiktoken #qwen
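
The requirements change drops the optimum git pin and the floating optimum-intel dependency in favor of a fixed optimum-intel==1.16.1 release, the version the reworked smooth-quant (SQ) path targets. A minimal sketch of a runtime guard for that pin (this check is illustrative and not part of the PR; only the 1.16.1 version string comes from the diff):

    # Hypothetical sanity check: warn when the installed optimum-intel does not
    # match the 1.16.1 pin introduced by this requirements change.
    from importlib.metadata import PackageNotFoundError, version

    try:
        installed = version("optimum-intel")
    except PackageNotFoundError:
        installed = None

    if installed != "1.16.1":
        print(f"optimum-intel {installed!r} found; this path was validated against 1.16.1")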

@@ -221,6 +221,7 @@
 from intel_extension_for_transformers.transformers.llm.evaluation.models import (
     TSModelCausalLMForITREX,
 )
+
 if args.restore:
     from intel_extension_for_transformers.transformers.utils.utility import (
         recover_model_from_json,
@@ -231,11 +232,13 @@
         args.trust_remote_code,
     )
 else:
-    user_model = TSModelCausalLMForITREX.from_pretrained(
-        args.model,
-        file_name="best_model.pt",
-        trust_remote_code=args.trust_remote_code,
-    )
+    user_model = torch.jit.load(os.path.join(args.model, "best_model.pt"))
+    config = AutoConfig.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
+    origin_model_type = config.model_type
+    if origin_model_type in ["chatglm", "qwen", "baichuan"]:
+        config.model_type = "qwen2"
+    user_model = TSModelCausalLMForITREX(user_model, config=config)
+    user_model.config.model_type = origin_model_type
 elif not (args.sq or args.mixed_precision):
     user_model = AutoModelForCausalLM.from_pretrained(
         args.model,
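
The new else branch loads the TorchScript artifact directly and wraps it in TSModelCausalLMForITREX, temporarily relabeling chatglm/qwen/baichuan configs as "qwen2" (apparently to satisfy optimum-intel 1.16.1's config handling) and then restoring the original model_type. A standalone sketch of that load-and-wrap pattern; the model directory and trust_remote_code value are illustrative, while the import path and constructor call mirror the diff:

    import os

    import torch
    from transformers import AutoConfig

    from intel_extension_for_transformers.transformers.llm.evaluation.models import (
        TSModelCausalLMForITREX,
    )

    model_dir = "./saved_results"  # hypothetical path to the quantized model
    ts_model = torch.jit.load(os.path.join(model_dir, "best_model.pt"))
    config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)

    origin_model_type = config.model_type
    if origin_model_type in ["chatglm", "qwen", "baichuan"]:
        config.model_type = "qwen2"  # borrow a model type the pinned optimum-intel accepts
    user_model = TSModelCausalLMForITREX(ts_model, config=config)
    user_model.config.model_type = origin_model_type  # restore for downstream logic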

intel_extension_for_transformers/transformers/utils/utility.py (259 changes: 156 additions & 103 deletions)

@@ -98,52 +98,64 @@ def __init__(self) -> None:
 def generate_dummy_past_key_values(config, input_bs):
     """Generate the dummy past_key_values."""
     from optimum.utils import NormalizedConfigManager
-
-    normalized_config = NormalizedConfigManager.get_normalized_config_class(
-        config.model_type
-    )(config)
-    nb_pkv = 2
-    num_layers = normalized_config.num_layers
-    num_attention_heads = normalized_config.num_attention_heads
-    hidden_size = normalized_config.hidden_size
-    d_k = hidden_size // num_attention_heads
-    num_key_value_heads = num_attention_heads
-    if hasattr(normalized_config, "num_key_value_heads"):
-        num_key_value_heads = normalized_config.num_key_value_heads
-    if hasattr(normalized_config, "multi_query_group_num"):
-        num_key_value_heads = normalized_config.multi_query_group_num
-
-    if config.model_type == "bloom":
-        shape_key = (input_bs * num_attention_heads, d_k, 1)
-        shape_value = (input_bs * num_attention_heads, 1, d_k)
-        key = torch.ones(size=shape_key)
-        value = torch.ones(size=shape_value)
-        past_key_values = tuple(
-            tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv))
-            for _ in range(num_layers)
-        )
-        return past_key_values
-    elif config.model_type == "gpt_bigcode":
-        new_shape = [input_bs, 0, d_k * 2]
-        dummy_tensor = torch.zeros(size=new_shape)
-        past_key_values = tuple([dummy_tensor] * num_layers)
-        return past_key_values
-    elif config.model_type == "qwen":
-        new_shape = [input_bs, 1, num_key_value_heads, d_k]
-        past_key_values = [
-            (
-                torch.ones(size=new_shape).contiguous(),
-                torch.ones(size=new_shape).contiguous(),
-            )
-            for _ in range(num_layers)
-        ]
-        return tuple(past_key_values)
+    if config.model_type == "qwen":
+        new_shape = [
+            input_bs,
+            0,
+            config.num_attention_heads,
+            config.hidden_size // config.num_attention_heads,
+        ]
+        num_layers = config.num_hidden_layers
+    elif config.model_type == "baichuan":
+        new_shape = [
+            input_bs,
+            config.num_attention_heads,
+            0,
+            config.hidden_size // config.num_attention_heads,
+        ]
+        num_layers = config.num_hidden_layers
     elif config.model_type == "chatglm":
-        new_shape = [0, input_bs, num_key_value_heads, d_k]
-    elif config.model_type == "falcon":
-        new_shape = [input_bs, 1, 0, d_k]
+        new_shape = [
+            0,
+            input_bs,
+            config.num_attention_heads,
+            config.hidden_size // config.num_attention_heads,
+        ]
+        num_layers = config.num_layers
     else:
-        new_shape = [input_bs, num_key_value_heads, 0, d_k]
+        normalized_config = NormalizedConfigManager.get_normalized_config_class(
+            config.model_type
+        )(config)
+        nb_pkv = 2
+        num_layers = normalized_config.num_layers
+        num_attention_heads = normalized_config.num_attention_heads
+        hidden_size = normalized_config.hidden_size
+        d_k = hidden_size // num_attention_heads
+        num_key_value_heads = num_attention_heads
+        if hasattr(normalized_config, "num_key_value_heads"):
+            num_key_value_heads = normalized_config.num_key_value_heads
+        if hasattr(normalized_config, "multi_query_group_num"):
+            num_key_value_heads = normalized_config.multi_query_group_num
+
+        if config.model_type == "bloom":
+            shape_key = (input_bs * num_attention_heads, d_k, 1)
+            shape_value = (input_bs * num_attention_heads, 1, d_k)
+            key = torch.ones(size=shape_key)
+            value = torch.ones(size=shape_value)
+            past_key_values = tuple(
+                tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv))
+                for _ in range(num_layers)
+            )
+            return past_key_values
+        elif config.model_type == "gpt_bigcode":
+            new_shape = [input_bs, 0, d_k * 2]
+            dummy_tensor = torch.zeros(size=new_shape)
+            past_key_values = tuple([dummy_tensor] * num_layers)
+            return past_key_values
+        elif config.model_type == "falcon":
+            new_shape = [input_bs, 1, 0, d_k]
+        else:
+            new_shape = [input_bs, num_key_value_heads, 0, d_k]
     past_key_values = [
         (
             torch.zeros(size=new_shape).contiguous(),
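
The rewritten function now handles qwen, baichuan, and chatglm before ever touching optimum's NormalizedConfigManager, reading head counts and hidden size straight from the raw Transformers config; every other model type keeps the previous normalized-config path inside the new else branch. A small sketch of the dummy-cache shapes those new branches produce, using made-up dimensions for illustration:

    import torch

    # Hypothetical config values: 32 attention heads, hidden size 4096, batch 1.
    input_bs, num_attention_heads, hidden_size = 1, 32, 4096
    head_dim = hidden_size // num_attention_heads

    # qwen / baichuan: batch-first layouts with an empty sequence axis.
    qwen_shape = [input_bs, 0, num_attention_heads, head_dim]      # [bs, seq, heads, d_k]
    baichuan_shape = [input_bs, num_attention_heads, 0, head_dim]  # [bs, heads, seq, d_k]
    # chatglm: sequence-first layout.
    chatglm_shape = [0, input_bs, num_attention_heads, head_dim]   # [seq, bs, heads, d_k]

    dummy_layer = (
        torch.zeros(size=qwen_shape).contiguous(),
        torch.zeros(size=qwen_shape).contiguous(),
    )
    print(dummy_layer[0].shape)  # torch.Size([1, 0, 32, 128])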

@@ -156,44 +168,64 @@ def generate_dummy_past_key_values(config, input_bs):
 def generate_dummy_past_key_values_for_inference(config, input_bs):
     """Generate the dummy past_key_values."""
     from optimum.utils import NormalizedConfigManager
-
-    normalized_config = NormalizedConfigManager.get_normalized_config_class(
-        config.model_type
-    )(config)
-    nb_pkv = 2
-    num_layers = normalized_config.num_layers
-    num_attention_heads = normalized_config.num_attention_heads
-    hidden_size = normalized_config.hidden_size
-    d_k = hidden_size // num_attention_heads
-    num_key_value_heads = num_attention_heads
-    if hasattr(normalized_config, "num_key_value_heads"):
-        num_key_value_heads = normalized_config.num_key_value_heads
-    if hasattr(normalized_config, "multi_query_group_num"):
-        num_key_value_heads = normalized_config.multi_query_group_num
-
-    if config.model_type == "bloom":
-        shape_key = (input_bs * num_attention_heads, d_k, 0)
-        shape_value = (input_bs * num_attention_heads, 0, d_k)
-        key = torch.empty(size=shape_key)
-        value = torch.empty(size=shape_value)
-        past_key_values = tuple(
-            tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv))
-            for _ in range(num_layers)
-        )
-        return past_key_values
-    elif config.model_type == "gpt_bigcode":
-        new_shape = [input_bs, 0, d_k * 2]
-        dummy_tensor = torch.zeros(size=new_shape)
-        past_key_values = tuple([dummy_tensor] * num_layers)
-        return past_key_values
-    elif config.model_type == "qwen":
-        new_shape = [input_bs, 0, num_key_value_heads, d_k]
+    if config.model_type == "qwen":
+        new_shape = [
+            input_bs,
+            0,
+            config.num_attention_heads,
+            config.hidden_size // config.num_attention_heads,
+        ]
+        num_layers = config.num_hidden_layers
+    elif config.model_type == "baichuan":
+        new_shape = [
+            input_bs,
+            config.num_attention_heads,
+            0,
+            config.hidden_size // config.num_attention_heads,
+        ]
+        num_layers = config.num_hidden_layers
     elif config.model_type == "chatglm":
-        new_shape = [0, input_bs, num_key_value_heads, d_k]
-    elif config.model_type == "falcon":
-        new_shape = [input_bs, 1, 0, d_k]
+        new_shape = [
+            0,
+            input_bs,
+            config.num_attention_heads,
+            config.hidden_size // config.num_attention_heads,
+        ]
+        num_layers = config.num_layers
     else:
-        new_shape = [input_bs, num_key_value_heads, 0, d_k]
+        normalized_config = NormalizedConfigManager.get_normalized_config_class(
+            config.model_type
+        )(config)
+        nb_pkv = 2
+        num_layers = normalized_config.num_layers
+        num_attention_heads = normalized_config.num_attention_heads
+        hidden_size = normalized_config.hidden_size
+        d_k = hidden_size // num_attention_heads
+        num_key_value_heads = num_attention_heads
+        if hasattr(normalized_config, "num_key_value_heads"):
+            num_key_value_heads = normalized_config.num_key_value_heads
+        if hasattr(normalized_config, "multi_query_group_num"):
+            num_key_value_heads = normalized_config.multi_query_group_num
+
+        if config.model_type == "bloom":
+            shape_key = (input_bs * num_attention_heads, d_k, 0)
+            shape_value = (input_bs * num_attention_heads, 0, d_k)
+            key = torch.empty(size=shape_key)
+            value = torch.empty(size=shape_value)
+            past_key_values = tuple(
+                tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv))
+                for _ in range(num_layers)
+            )
+            return past_key_values
+        elif config.model_type == "gpt_bigcode":
+            new_shape = [input_bs, 0, d_k * 2]
+            dummy_tensor = torch.zeros(size=new_shape)
+            past_key_values = tuple([dummy_tensor] * num_layers)
+            return past_key_values
+        elif config.model_type == "falcon":
+            new_shape = [input_bs, 1, 0, d_k]
+        else:
+            new_shape = [input_bs, num_key_value_heads, 0, d_k]
     past_key_values = [
         (
             torch.zeros(size=new_shape).contiguous(),
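
generate_dummy_past_key_values_for_inference gets the same restructuring; outside the special-cased model types it still builds per-layer (key, value) pairs with a zero-length sequence axis, the conventional way to represent an empty cache for the first forward pass. A minimal sketch of how such a cache behaves, with hypothetical dimensions:

    import torch

    # Hypothetical dimensions: 8 KV heads, head_dim 128, 2 layers, batch 1.
    input_bs, num_key_value_heads, d_k, num_layers = 1, 8, 128, 2
    new_shape = [input_bs, num_key_value_heads, 0, d_k]  # default [bs, kv_heads, seq, d_k] layout
    past_key_values = tuple(
        (
            torch.zeros(size=new_shape).contiguous(),
            torch.zeros(size=new_shape).contiguous(),
        )
        for _ in range(num_layers)
    )

    # Appending the first real key states along the sequence axis grows the
    # cache from length 0 to the prompt length, e.g. 16 tokens:
    new_key = torch.randn(input_bs, num_key_value_heads, 16, d_k)
    grown = torch.cat([past_key_values[0][0], new_key], dim=2)
    print(grown.shape)  # torch.Size([1, 8, 16, 128])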

@@ -206,32 +238,53 @@ def generate_dummy_past_key_values_for_inference(config, input_bs):
 def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1):
     """Generate the dummy past_key_values."""
     from optimum.utils import NormalizedConfigManager
-
-    normalized_config = NormalizedConfigManager.get_normalized_config_class(
-        config.model_type
-    )(config)
-    num_layers = normalized_config.num_layers
-    num_attention_heads = normalized_config.num_attention_heads
-    hidden_size = normalized_config.hidden_size
-    d_k = hidden_size // num_attention_heads
-    num_key_value_heads = num_attention_heads
-    nb_pkv = 2
-    if hasattr(normalized_config, "num_key_value_heads"):
-        num_key_value_heads = normalized_config.num_key_value_heads
-    if hasattr(normalized_config, "multi_query_group_num"):
-        num_key_value_heads = normalized_config.multi_query_group_num
-    if config.model_type == "bloom":
-        for nb_pkv in range(nb_pkv):
-            if nb_pkv % 2 == 0:
-                new_shape = [input_bs * num_key_value_heads, d_k, 1]
-            else:
-                new_shape = [input_bs * num_key_value_heads, 1, d_k]
-    elif config.model_type == "qwen":
-        new_shape = [input_bs, 1, num_key_value_heads, d_k]
+    if config.model_type == "qwen":
+        new_shape = [
+            input_bs,
+            1,
+            config.num_attention_heads,
+            config.hidden_size // config.num_attention_heads,
+        ]
+        num_layers = config.num_hidden_layers
+    elif config.model_type == "baichuan":
+        new_shape = [
+            input_bs,
+            config.num_attention_heads,
+            1,
+            config.hidden_size // config.num_attention_heads,
+        ]
+        num_layers = config.num_hidden_layers
     elif config.model_type == "chatglm":
-        new_shape = [1, input_bs, num_key_value_heads, d_k]
+        new_shape = [
+            1,
+            input_bs,
+            config.num_attention_heads,
+            config.hidden_size // config.num_attention_heads,
+        ]
+        num_layers = config.num_layers
     else:
-        new_shape = [input_bs, num_key_value_heads, 1, d_k]
+        normalized_config = NormalizedConfigManager.get_normalized_config_class(
+            config.model_type
+        )(config)
+        num_layers = normalized_config.num_layers
+        num_attention_heads = normalized_config.num_attention_heads
+        hidden_size = normalized_config.hidden_size
+        d_k = hidden_size // num_attention_heads
+        num_key_value_heads = num_attention_heads
+        nb_pkv = 2
+        if hasattr(normalized_config, "num_key_value_heads"):
+            num_key_value_heads = normalized_config.num_key_value_heads
+        if hasattr(normalized_config, "multi_query_group_num"):
+            num_key_value_heads = normalized_config.multi_query_group_num
+        if config.model_type == "bloom":
+            for nb_pkv in range(nb_pkv):
+                if nb_pkv % 2 == 0:
+                    new_shape = [input_bs * num_key_value_heads, d_k, 1]
+                else:
+                    new_shape = [input_bs * num_key_value_heads, 1, d_k]
+
+        else:
+            new_shape = [input_bs, num_key_value_heads, 1, d_k]

     beam_idx_tmp = torch.zeros(
         (2048, int(input_bs * num_beams)), dtype=torch.long
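
Unlike the two functions above, the _for_opt_llm variant gives the dummy key/value tensors a sequence length of 1 and additionally allocates a beam-index scratch tensor sized (2048, input_bs * num_beams); how that tensor is packed into the per-layer cache is below the truncated part of this hunk. A sketch of just the pieces visible here, with hypothetical head/beam counts (only the (2048, input_bs * num_beams) shape and the length-1 layout come from the diff):

    import torch

    # Hypothetical dimensions: 8 KV heads, head_dim 128, batch 1, 4 beams.
    input_bs, num_beams = 1, 4
    num_key_value_heads, d_k = 8, 128

    new_shape = [input_bs, num_key_value_heads, 1, d_k]  # one-token dummy key/value
    dummy_key = torch.zeros(size=new_shape).contiguous()
    dummy_value = torch.zeros(size=new_shape).contiguous()
    beam_idx_tmp = torch.zeros(
        (2048, int(input_bs * num_beams)), dtype=torch.long
    ).contiguous()

    print(dummy_key.shape, beam_idx_tmp.shape)
    # torch.Size([1, 8, 1, 128]) torch.Size([2048, 4])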