This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Fix SQ baichuan without position_ids for torch and ipex 2.3.0 #1597

Merged · 8 commits · Jun 11, 2024
26 changes: 0 additions & 26 deletions examples/.config/pytorch_optimize.json
@@ -2148,32 +2148,6 @@
}
}
},
"baichuan_7b_gen_ipex_static": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "baichuan_7b",
"task": "generation",
"approach": "static",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "baichuan_7b",
"task": "generation",
"approach": "static",
"backend": "ipex",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"baichuan2_7b_gen_ipex_static": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune": {
@@ -5,7 +5,7 @@ protobuf
sentencepiece != 0.1.92
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.3.0+cpu
transformers
transformers==4.38.1
intel_extension_for_pytorch==2.3.0
optimum-intel==1.16.1
bitsandbytes #baichuan
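The pins above (torch 2.3.0+cpu, intel_extension_for_pytorch 2.3.0, transformers 4.38.1) are the stack the rest of this PR assumes. A minimal, optional sanity check (not part of the PR) to confirm an environment matches before running the SmoothQuant scripts:

import torch
import transformers
import intel_extension_for_pytorch as ipex

# Versions expected after this requirements change; adjust if you pin differently.
assert torch.__version__.startswith("2.3.0"), torch.__version__
assert ipex.__version__.startswith("2.3.0"), ipex.__version__
assert transformers.__version__.startswith("4.38"), transformers.__version__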
@@ -119,14 +119,12 @@ function run_benchmark {
elif [ "${topology}" = "llama_7b" ]; then
model_name_or_path="meta-llama/Llama-2-7b-chat-hf"
script="run_generation_sq.py"
pip install transformers==4.35.2
elif [ "${topology}" = "llama2_7b_gptq" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
script="run_generation_cpu_woq.py"
elif [ "${topology}" = "llama_13b" ]; then
model_name_or_path="meta-llama/Llama-2-13b-chat-hf"
script="run_generation_sq.py"
pip install transformers==4.35.2
elif [ "${topology}" = "dolly_v2_3b" ]; then
model_name_or_path="/tf_dataset2/models/pytorch/dolly_v2_3b"
script="run_generation_sq.py"
@@ -137,47 +135,32 @@ function run_benchmark {
model_name_or_path="THUDM/chatglm3-6b"
script="run_generation_sq.py"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.35.2
elif [ "${topology}" = "chatglm2_6b" ]; then
model_name_or_path="THUDM/chatglm2-6b"
script="run_generation_sq.py"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.35.2
elif [ "${topology}" = "chatglm_6b" ]; then
model_name_or_path="THUDM/chatglm-6b"
script="run_generation_sq.py"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.33
elif [ "${topology}" = "falcon_7b" ]; then
model_name_or_path="tiiuae/falcon-7b-instruct"
script="run_generation_sq.py"
pip install transformers==4.33
elif [ "${topology}" = "baichuan_7b" ]; then
model_name_or_path="baichuan-inc/Baichuan-7B"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.33
script="run_generation_sq.py"
elif [ "${topology}" = "baichuan_13b" ]; then
model_name_or_path="baichuan-inc/Baichuan-13B-Base"
model_name_or_path="baichuan-inc/Baichuan-13B-Chat"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --_commit_hash 14d5b0e204542744900f6fb52422c6d633bdcb00"
pip install transformers==4.33
script="run_generation_sq.py"
elif [ "${topology}" = "baichuan2_7b" ]; then
model_name_or_path="baichuan-inc/Baichuan2-7B-Base"
model_name_or_path="baichuan-inc/Baichuan2-7B-Chat"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.33
script="run_generation_sq.py"
elif [ "${topology}" = "baichuan2_13b" ]; then
model_name_or_path="baichuan-inc/Baichuan2-13B-Base"
model_name_or_path="baichuan-inc/Baichuan2-13B-Chat"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.35.2
script="run_generation_sq.py"
elif [ "${topology}" = "qwen_7b" ]; then
model_name_or_path="Qwen/Qwen-7B"
model_name_or_path="Qwen/Qwen-7B-Chat"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --_commit_hash f7bc352f27bb1c02ee371a4576942a7d96c8bb97"
pip install transformers==4.35.2
script="run_generation_sq.py"
elif [ "${topology}" = "mistral_7b" ]; then
model_name_or_path="Intel/neural-chat-7b-v3"
@@ -133,15 +133,13 @@ function run_tuning {
model_name_or_path="/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
pip install transformers==4.35.2
script="run_generation_sq.py"
elif [ "${topology}" = "llama_13b" ]; then
alpha=0.8
model_name_or_path="meta-llama/Llama-2-13b-chat-hf"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
script="run_generation_sq.py"
pip install transformers==4.35.2
elif [ "${topology}" = "dolly_v2_3b" ]; then
alpha=0.6
model_name_or_path="/tf_dataset2/models/pytorch/dolly_v2_3b"
@@ -161,72 +159,54 @@ function run_tuning {
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
script="run_generation_sq.py"
pip install transformers==4.35.2
elif [ "${topology}" = "chatglm2_6b" ]; then
alpha=0.75
model_name_or_path="THUDM/chatglm2-6b"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
script="run_generation_sq.py"
pip install transformers==4.35.2
elif [ "${topology}" = "chatglm_6b" ]; then
alpha=0.75
model_name_or_path="THUDM/chatglm-6b"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.33
script="run_generation_sq.py"
elif [ "${topology}" = "falcon_7b" ]; then
alpha=0.7
model_name_or_path="tiiuae/falcon-7b-instruct"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
pip install transformers==4.33.3
script="run_generation_sq.py"
elif [ "${topology}" = "baichuan_7b" ]; then
alpha=0.85
model_name_or_path="baichuan-inc/Baichuan-7B"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
script="run_generation_sq.py"
pip install transformers==4.33
elif [ "${topology}" = "baichuan_13b" ]; then
alpha=0.85
model_name_or_path="baichuan-inc/Baichuan-13B-Base"
model_name_or_path="baichuan-inc/Baichuan-13B-Chat"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --_commit_hash 14d5b0e204542744900f6fb52422c6d633bdcb00"
pip install transformers==4.33
script="run_generation_sq.py"
elif [ "${topology}" = "baichuan2_7b" ]; then
alpha=0.85
model_name_or_path="baichuan-inc/Baichuan2-7B-Base"
model_name_or_path="baichuan-inc/Baichuan2-7B-Chat"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.33
script="run_generation_sq.py"
elif [ "${topology}" = "baichuan2_13b" ]; then
alpha=0.55
model_name_or_path="baichuan-inc/Baichuan2-13B-Base"
model_name_or_path="baichuan-inc/Baichuan2-13B-Chat"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.35.2
script="run_generation_sq.py"
elif [ "${topology}" = "qwen_7b" ]; then
alpha=0.9
model_name_or_path="Qwen/Qwen-7B"
model_name_or_path="Qwen/Qwen-7B-Chat"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --_commit_hash f7bc352f27bb1c02ee371a4576942a7d96c8bb97"
pip install transformers==4.35.2
script="run_generation_sq.py"
script="run_generation_sq.py"
elif [ "${topology}" = "mistral_7b" ]; then
alpha=0.8
model_name_or_path="Intel/neural-chat-7b-v3"
@@ -240,15 +220,13 @@ function run_tuning {
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.36.1
script="run_generation_sq.py"
elif [ "${topology}" = "phi_1_5b" ]; then
alpha=0.5
model_name_or_path="susnato/phi-1_5_dev"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.36.1
script="run_generation_sq.py"
elif [ "${topology}" = "llama2_7b_gptq" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
@@ -166,14 +166,9 @@ def forward(
input_bs, input_len = input_ids.shape
if self.use_cache and past_key_values is None:
if model_type in IPEX_OPT_LLM_SUPPORTED:
if model_type == "llama" and transformers.__version__ >= "4.36":
past_key_values = generate_dummy_past_key_values_for_inference(
config=self.config, input_bs=input_bs
)
else:
past_key_values = generate_dummy_past_key_values_for_opt_llm(
config=self.config, input_bs=input_bs, num_beams=1
)
past_key_values = generate_dummy_past_key_values_for_opt_llm(
config=self.config, input_bs=input_bs, num_beams=1
)
else:
past_key_values = generate_dummy_past_key_values_for_inference(
config=self.config, input_bs=input_bs
@@ -845,8 +845,7 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]:
model = model.float()
model.eval()
model_type = model.config.model_type.replace("_", "-")
if "llama" in model_type and transformers.__version__ >= "4.36.0":
quantization_config.ipex_opt_llm = False

logger.info("Applying SmoothQuant.")
# ipex.optimize_transformers
if quantization_config.ipex_opt_llm is None:
Expand All @@ -855,7 +854,7 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]:
logger.info(
"quantization_config.ipex_opt_llm set to True and ipex.optimize_transformers is used."
)
logger.warning("The suggested transformers version is 4.35.2.")
logger.warning("The suggested transformers version is 4.38.1.")
else:
quantization_config.ipex_opt_llm = False
if quantization_config.ipex_opt_llm:
@@ -950,7 +949,7 @@ def collate_batch(batch):
)

last_ind.append(input_ids.shape[0] - 1)
if model_type in ["bloom", "qwen"]:
if model_type in ["bloom"]:
attention_mask = torch.ones(len(input_ids) + 1)
attention_mask[0] = 0
else:
51 changes: 47 additions & 4 deletions intel_extension_for_transformers/transformers/utils/utility.py
@@ -21,6 +21,7 @@
from typing import Optional, Tuple
from neural_compressor.utils import logger
from neural_compressor.utils.utility import LazyImport, CpuInfo
from intel_extension_for_transformers.tools.utils import is_ipex_available


CONFIG_NAME = "best_configure.yaml"
Expand All @@ -36,6 +37,8 @@
SAFE_WEIGHTS_NAME = "model.safetensors"
SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json"

if is_ipex_available():
import intel_extension_for_pytorch as ipex
torch = LazyImport("torch")

def str2bool(v):
Expand Down Expand Up @@ -300,8 +303,24 @@ def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1):
]
return tuple(past_key_values)


IPEX_OPT_LLM_SUPPORTED = {"gptj", "opt", "llama", "falcon", "chatglm", "baichuan"}
IPEX_OPT_LLM_SUPPORTED_DICT = {
"2.2": ["gptj", "opt", "llama", "falcon", "chatglm", "baichuan", "gpt-neox"],
"2.3": [
"gptj",
"opt",
"llama",
"falcon",
"chatglm",
"baichuan",
"qwen",
"bloom",
"codegen",
"gptbigcode",
"t5",
"mixtral",
"mpt",
],
}

MODEL_TYPES_REQUIRING_POSITION_IDS = {
"codegen",
@@ -314,9 +333,32 @@ def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1):
"llama",
"mistral",
"chatglm",
"baichuan"
}

if is_ipex_available() and ipex.__version__ == "2.2.0+cpu":
logger.info(
"ipex.llm.optimize by 2.2.0 version supported model family: {}".format(
",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.2"])
)
)
logger.info(
"The recommended transformers version is 4.35.2 if you used IPEX 2.2.0 version."
)
IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.2"]
elif is_ipex_available() and ipex.__version__ == "2.3.0+cpu":
logger.info(
"ipex.llm.optimize by 2.3.0 version supported model family: {}".format(
", ".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.3"])
)
)
logger.info(
"The recommended transformers version is 4.38.1 if you used IPEX 2.3.0 version."
)
IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"]
else:
logger.warning("Please check the intel_extension_for_pytorch version is 2.3.0+cpu.")
IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"]

def get_example_inputs(model_config, batch_size=1, tokenizer=None, num_beams=4):
"""Generate the dummy example inputs."""
prompt = "Welcome to use Intel Extension for Transformers."
@@ -420,7 +462,8 @@ def recover_model_from_json(fp32_model_name_or_path, json_file_path, trust_remot
(object): quantized model
"""
from transformers import AutoModelForCausalLM
user_model = AutoModelForCausalLM.from_pretrained(fp32_model_name_or_path, trust_remote_code=trust_remote_code)
user_model = AutoModelForCausalLM.from_pretrained(fp32_model_name_or_path,
trust_remote_code=trust_remote_code).float()
if user_model.config.model_type in IPEX_OPT_LLM_SUPPORTED:
import intel_extension_for_pytorch as ipex
qconfig = ipex.quantization.default_static_qconfig_mapping
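For context, here is a short illustrative sketch (not code from this PR; the helper name build_trace_inputs is made up) of how membership in MODEL_TYPES_REQUIRING_POSITION_IDS typically decides whether position_ids are included in the dummy example inputs used for tracing. Per this PR's title, Baichuan with torch and IPEX 2.3.0 is handled through the branch without position_ids:

import torch
from intel_extension_for_transformers.transformers.utils.utility import (
    MODEL_TYPES_REQUIRING_POSITION_IDS,  # the set shown in the diff above
)

def build_trace_inputs(model_type, input_ids, attention_mask, past_key_values):
    """Illustrative only: assemble example inputs for tracing a causal LM."""
    if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS:
        # Models in this set are traced with an explicit position_ids tensor.
        position_ids = torch.arange(input_ids.shape[-1]).unsqueeze(0)
        return (input_ids, attention_mask, position_ids, past_key_values)
    # Baichuan (on torch/ipex 2.3.0) is expected to take this branch, so its
    # traced forward() receives no position_ids.
    return (input_ids, attention_mask, past_key_values)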