diff --git a/intel_extension_for_transformers/neural_chat/models/model_utils.py b/intel_extension_for_transformers/neural_chat/models/model_utils.py index dd0c2c99102..9c3e837c7d9 100644 --- a/intel_extension_for_transformers/neural_chat/models/model_utils.py +++ b/intel_extension_for_transformers/neural_chat/models/model_utils.py @@ -699,7 +699,7 @@ def load_model( assert ipex.__version__ >= "2.1.0+cpu", "Please use Intel Extension for PyTorch >=2.1.0+cpu." if re.search("falcon", model_name, re.IGNORECASE): assert transformers.__version__ <= "4.33.3", "Please pip install transformers==4.33.3" - from intel_extension_for_transformers.transformers.llm.evaluation.models import TSModelCausalLMForITREX + from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import TSModelCausalLMForITREX model = TSModelCausalLMForITREX.from_pretrained( model_name, file_name="best_model.pt" diff --git a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py index 1ffb7b47001..634ea7499c6 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py @@ -14,12 +14,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import re + from typing import Optional, Tuple import transformers from datasets import load_dataset -from optimum.intel.generation.modeling import TSModelForCausalLM from torch.nn.functional import pad from torch.utils.data import DataLoader from transformers.modeling_outputs import CausalLMOutputWithPast @@ -315,7 +314,7 @@ def collate_batch(batch): ) return calib_dataloader - +from optimum.intel.generation.modeling import TSModelForCausalLM class TSModelCausalLMForITREX(TSModelForCausalLM): def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index afad1d516c2..4a24dc7121d 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -57,9 +57,7 @@ ) if is_autoround_available(): - from auto_round.export.export_to_itrex.model_wrapper import ( - WeightOnlyLinear as auto_round_woqlinear, - ) # pylint: disable=E0401 + from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear as auto_round_woqlinear # pylint: disable=E0401 from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader as get_autoround_dataloader torch = LazyImport("torch") @@ -299,10 +297,8 @@ def _replace_linear( use_optimum_format=use_optimum_format, ) elif device == "xpu" or device == torch.device("xpu"): - from intel_extension_for_pytorch.nn.utils._quantize_convert import ( - WeightOnlyQuantizedLinear as ipex_linear, - ) # pylint: disable=E0401 - + from intel_extension_for_pytorch.nn.utils._quantize_convert import \ + WeightOnlyQuantizedLinear as ipex_linear # pylint: disable=E0401 model._modules[name] = ipex_linear( in_features, out_features, @@ -569,6 +565,8 @@ def convert_to_quantized_model(model, config, device="cpu"): ) model = prepare(model, quant_config) model = convert(model) + # qits module doesn't match with HQQ algorithm. 
+ return model elif config.quant_method.value == "awq": quant_config = AWQConfig( dtype=dtype, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 89ab4f758ea..28dc9715782 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -164,7 +164,11 @@ def build_woq_model(model, quantization_config): if "lm_head" in n or "output_layer" in n or "embed_out" in n: continue if isinstance(m, torch.nn.Linear): - zp = getattr(quantization_config, "zero_point", not getattr(quantization_config, "sym", False)) + zp = getattr( + quantization_config, + "zero_point", + not getattr(quantization_config, "sym", False), + ) with init_empty_weights(): new_module = WeightOnlyLinear( m.in_features, @@ -201,6 +205,7 @@ def convert_model_to_public(model): ]: model = recover_export_model(model) + def make_contiguous(model): for param in model.parameters(): if param.data.ndimension() > 1: @@ -225,7 +230,8 @@ def save_low_bit( self.model.config.quantization_config = self.quantization_config self.model.config.save_pretrained(save_directory) weights_file = os.path.join( - os.path.abspath(os.path.expanduser(save_directory)), WEIGHTS_NAME) + os.path.abspath(os.path.expanduser(save_directory)), WEIGHTS_NAME + ) torch.save(self.quantized_state_dict(), weights_file) return @@ -239,25 +245,42 @@ def save_low_bit( ) if self.quantization_config.use_ipex: + def save_linear_parameters(model, save_directory): # only can save to pytorch model.bin due to ipex. weights_file = os.path.join( - os.path.abspath(os.path.expanduser(save_directory)), SAFE_WEIGHTS_NAME) + os.path.abspath(os.path.expanduser(save_directory)), SAFE_WEIGHTS_NAME + ) os.remove(weights_file) weights_file = os.path.join( - os.path.abspath(os.path.expanduser(save_directory)), WEIGHTS_NAME) + os.path.abspath(os.path.expanduser(save_directory)), WEIGHTS_NAME + ) linear_parameters = {} - from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_cpu_linear + from intel_extension_for_pytorch.nn.modules import ( + WeightOnlyQuantizedLinear as ipex_cpu_linear, + ) + for name, module in model.named_modules(): if isinstance(module, ipex_cpu_linear): - linear_parameters[name + ".ipex_scales"] = module._op_context.get_scales().contiguous() - linear_parameters[name + ".ipex_weight"] = \ - module._op_context.to_public(module._op_context.get_weight()).contiguous() - linear_parameters[name + ".ipex_zeros"] = module._op_context.get_zero_points().contiguous() + linear_parameters[name + ".ipex_scales"] = ( + module._op_context.get_scales().contiguous() + ) + linear_parameters[name + ".ipex_weight"] = ( + module._op_context.to_public( + module._op_context.get_weight() + ).contiguous() + ) + linear_parameters[name + ".ipex_zeros"] = ( + module._op_context.get_zero_points().contiguous() + ) if module._op_context.get_bias() is not None: - linear_parameters[name + ".ipex_bias"] = module._op_context.get_bias().contiguous() + linear_parameters[name + ".ipex_bias"] = ( + module._op_context.get_bias().contiguous() + ) if module._op_context.get_g_idx() is not None: - linear_parameters[name + ".ipex_g_idx"] = module._op_context.get_g_idx().contiguous() + linear_parameters[name + ".ipex_g_idx"] = ( + module._op_context.get_g_idx().contiguous() + ) others_parameters = model.state_dict() linear_parameters.update(others_parameters) @@ -346,17 +369,27 @@ 
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): use_vllm = kwargs.pop("use_vllm", None) if use_vllm is not None: logger.info("The backend is vLLM.") - from vllm import LLM # pylint: disable=E1101 - from vllm.model_executor.model_loader import get_model_loader # pylint: disable=E0611 - from vllm.model_executor.model_loader.weight_utils import default_weight_loader # pylint: disable=E0401 disable=E0611 - from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, - QKVParallelLinear, - ColumnParallelLinear, - RowParallelLinear) # pylint: disable=E1101 + from vllm import LLM # pylint: disable=E1101 + from vllm.model_executor.model_loader import ( + get_model_loader, + ) # pylint: disable=E0611 + from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + ) # pylint: disable=E0401 disable=E0611 + from vllm.model_executor.layers.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, + ColumnParallelLinear, + RowParallelLinear, + ) # pylint: disable=E1101 os.environ["backend"] = "use_vllm" - llm = LLM(model=pretrained_model_name_or_path, trust_remote_code=True) # Create an vllm instance. - model = llm.llm_engine.model_executor.driver_worker.model_runner.model # pylint: disable=E1101 + llm = LLM( + model=pretrained_model_name_or_path, trust_remote_code=True + ) # Create an vllm instance. + model = ( + llm.llm_engine.model_executor.driver_worker.model_runner.model + ) # pylint: disable=E1101 print("Original model =", model) original_parameter_memo = dict() @@ -366,12 +399,22 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if "qkv_proj" in name or "gate_up_proj" in name: input_dim = getattr(params, "input_dim", None) output_dim = getattr(params, "output_dim", None) - original_parameter_memo[name] = (input_dim, output_dim, params.weight_loader) + original_parameter_memo[name] = ( + input_dim, + output_dim, + params.weight_loader, + ) class linear_adaptor(torch.nn.Linear): - def __init__(self, in_features: int, out_features: int, bias: bool = True, \ - device=None, dtype=None) -> None: + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + ) -> None: super().__init__(in_features, out_features, bias, device, dtype) def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: @@ -379,34 +422,49 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: for name, module in model.named_modules(): bias_flag = False - if isinstance(module, QKVParallelLinear) or isinstance(module, MergedColumnParallelLinear) or \ - isinstance(module, RowParallelLinear) or isinstance(module, ColumnParallelLinear): + if ( + isinstance(module, QKVParallelLinear) + or isinstance(module, MergedColumnParallelLinear) + or isinstance(module, RowParallelLinear) + or isinstance(module, ColumnParallelLinear) + ): out_feature = module.weight.shape[0] in_feature = module.weight.shape[1] if getattr(module, "bias", False) != None: bias_flag = True weight_dtype = module.weight.dtype - torch_linear = linear_adaptor(in_features=in_feature, - out_features=out_feature, - bias=bias_flag, - dtype=weight_dtype) + torch_linear = linear_adaptor( + in_features=in_feature, + out_features=out_feature, + bias=bias_flag, + dtype=weight_dtype, + ) module_traversal = model - all_module_names = name.split('.') + all_module_names = name.split(".") all_module_names_except_last = all_module_names[:-1] for sub_module_name in all_module_names_except_last: 
module_traversal = module_traversal._modules[sub_module_name] - module_traversal._modules[all_module_names[-1]] = copy.deepcopy(torch_linear) + module_traversal._modules[all_module_names[-1]] = copy.deepcopy( + torch_linear + ) print("Optimized model =", model) - loader = get_model_loader(llm.llm_engine.load_config) # pylint: disable=E1101 + loader = get_model_loader( + llm.llm_engine.load_config + ) # pylint: disable=E1101 + + weights_iterator = loader._get_weights_iterator( + llm.llm_engine.model_config.model, + llm.llm_engine.model_config.revision, + fall_back_to_pt=True, + ) - weights_iterator = loader._get_weights_iterator(llm.llm_engine.model_config.model, - llm.llm_engine.model_config.revision, - fall_back_to_pt=True) + from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + ) # pylint: disable=E0401 disable=E0611 - from vllm.model_executor.model_loader.weight_utils import default_weight_loader # pylint: disable=E0401 disable=E0611 params_dict = dict(model.named_parameters(remove_duplicate=False)) for name in params_dict.keys(): params = params_dict[name] @@ -424,11 +482,13 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: print("INC quantizing...") config = kwargs.pop("config", None) if config is None: - config = RtnConfig(compute_dtype="int8", - group_size=128, - scale_dtype="bf16", - weight_dtype="int4_clip", - bits=4) + config = RtnConfig( + compute_dtype="int8", + group_size=128, + scale_dtype="bf16", + weight_dtype="int4_clip", + bits=4, + ) print("using default RTNConfig = ", config) print("Using customized config = ", config) model = convert_to_quantized_model(model, config) @@ -489,8 +549,12 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: return model device_map = kwargs.get("device_map", "cpu") - use_cpu = True if device_map == torch.device("cpu") or device_map == "cpu" else False - use_xpu = True if device_map == torch.device("xpu") or device_map == "xpu" else False + use_cpu = ( + True if device_map == torch.device("cpu") or device_map == "cpu" else False + ) + use_xpu = ( + True if device_map == torch.device("xpu") or device_map == "xpu" else False + ) config = kwargs.pop("config", None) model_hub = kwargs.pop("model_hub", "huggingface") @@ -498,20 +562,28 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: quantization_config = kwargs.pop("quantization_config", None) if not isinstance(config, PretrainedConfig): if model_hub == "modelscope": - import modelscope # pylint: disable=E0401 - config = modelscope.AutoConfig.from_pretrained(pretrained_model_name_or_path, - trust_remote_code=True) + import modelscope # pylint: disable=E0401 + + config = modelscope.AutoConfig.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=True + ) else: config, _ = AutoConfig.from_pretrained( pretrained_model_name_or_path, return_unused_kwargs=True, **kwargs, - ) - if quantization_config is not None and quantization_config.quant_method in ["sq"]: + if quantization_config is not None and quantization_config.quant_method in [ + "sq" + ]: use_neural_speed = False - elif hasattr(config, "quantization_config") and isinstance(config.quantization_config, dict) and "quant_method" in config.quantization_config and config.quantization_config["quant_method"] in ["sq"]: + elif ( + hasattr(config, "quantization_config") + and isinstance(config.quantization_config, dict) + and "quant_method" in config.quantization_config + and config.quantization_config["quant_method"] in ["sq"] + ): 
use_neural_speed = False elif kwargs.get("use_llm_runtime", None) is not None: use_neural_speed = kwargs.pop("use_llm_runtime", True) and not use_xpu @@ -544,30 +616,38 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: "Quantization_config loading failed. If you want to load saved " "low bit model, please check your quantizate_config.json." ) - elif use_neural_speed and not config.quantization_config["quant_method"] in ["dynamic", "static", "qat"]: + elif use_neural_speed and not config.quantization_config[ + "quant_method" + ] in ["dynamic", "static", "qat"]: if not os.path.exists(pretrained_model_name_or_path): from huggingface_hub import snapshot_download - pretrained_model_name_or_path = snapshot_download(repo_id=pretrained_model_name_or_path, - allow_patterns=["*.pt", "*.safetensors", "*.json", ".model"], - ) + + pretrained_model_name_or_path = snapshot_download( + repo_id=pretrained_model_name_or_path, + allow_patterns=["*.pt", "*.safetensors", "*.json", ".model"], + ) if quantization_config is None: - ConfigInit = {"rtn": RtnConfig, - "awq": AwqConfig, - "teq": TeqConfig, - "gptq": GPTQConfig, - "autoround": AutoRoundConfig, - } + ConfigInit = { + "rtn": RtnConfig, + "awq": AwqConfig, + "teq": TeqConfig, + "gptq": GPTQConfig, + "autoround": AutoRoundConfig, + } quantization_config = config.quantization_config - assert quantization_config.get("quant_method", None) in ConfigInit, \ - "Detect this model is not a low-bit model." - quantization_config = ConfigInit[quantization_config["quant_method"]].from_dict(quantization_config) + assert ( + quantization_config.get("quant_method", None) in ConfigInit + ), "Detect this model is not a low-bit model." + quantization_config = ConfigInit[ + quantization_config["quant_method"] + ].from_dict(quantization_config) logger.info("Loading Low Bits model by Neural Speed.") quantization_config.post_init_runtime() from neural_speed import Model model = Model() - model.init( # pylint: disable=E1123 + model.init( # pylint: disable=E1123 pretrained_model_name_or_path, weight_dtype=quantization_config.weight_dtype, alg=quantization_config.scheme, @@ -658,9 +738,15 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: else: quantization_config = RtnConfig( bits=4, - compute_dtype=torch.float32 if - (use_cpu and not CpuInfo().bf16 - and torch_dtype == torch.bfloat16) else convert_dtype_torch2str(torch_dtype), + compute_dtype=( + torch.float32 + if ( + use_cpu + and not CpuInfo().bf16 + and torch_dtype == torch.bfloat16 + ) + else convert_dtype_torch2str(torch_dtype) + ), weight_dtype="nf4" if use_cpu else "int4_fullrange", ) else: @@ -674,14 +760,21 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: if quantization_config is None: if use_neural_speed: quantization_config = RtnConfig( - compute_dtype="bf16" if CpuInfo().bf16 else "fp32", weight_dtype="int8" + compute_dtype="bf16" if CpuInfo().bf16 else "fp32", + weight_dtype="int8", ) else: quantization_config = RtnConfig( bits=8, - compute_dtype=torch.float32 if - (use_cpu and not CpuInfo().bf16 - and torch_dtype == torch.bfloat16) else convert_dtype_torch2str(torch_dtype), + compute_dtype=( + torch.float32 + if ( + use_cpu + and not CpuInfo().bf16 + and torch_dtype == torch.bfloat16 + ) + else convert_dtype_torch2str(torch_dtype) + ), weight_dtype="int8", ) else: @@ -731,7 +824,7 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: from neural_speed import Model model = Model() - model.init( # pylint: disable=E1123 + 
model.init( # pylint: disable=E1123 pretrained_model_name_or_path, weight_dtype=quantization_config.weight_dtype, alg=quantization_config.scheme, @@ -990,7 +1083,6 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: # torch.tensor(last_ind), # ) - # tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) # tokenized_dataset.set_format(type="torch", columns=["input_ids"]) # calib_dataloader = DataLoader( @@ -1014,7 +1106,6 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: # ) # calib_func = calib_func - # # call inc static quant # from neural_compressor.torch.quantization import StaticQuantConfig, convert, prepare # quant_config = StaticQuantConfig( @@ -1130,7 +1221,6 @@ def collate_batch(batch): torch.tensor(last_ind), ) - tokenized_dataset = train_dataset.map(tokenize_function, batched=True) tokenized_dataset.set_format(type="torch", columns=["input_ids"]) train_dataloader = DataLoader( @@ -1157,7 +1247,7 @@ def train_func(model): optimizer.zero_grad() loss.backward() optimizer.step() - print('Iteration [{}], Loss: {:.4f}'.format(i+1, loss)) + print("Iteration [{}], Loss: {:.4f}".format(i + 1, loss)) return model logger.info( @@ -1170,6 +1260,7 @@ def train_func(model): # call inc static quant from neural_compressor import QuantizationAwareTrainingConfig, quantization from neural_compressor.training import prepare_compression + conf = QuantizationAwareTrainingConfig( backend=quantization_config.backend, excluded_precisions=quantization_config.excluded_precisions, @@ -1181,7 +1272,9 @@ def train_func(model): model = compression_manager.model train_func(model) compression_manager.callbacks.on_train_end() - compression_manager.model.save_pretrained = types.MethodType(save_low_bit, model) + compression_manager.model.save_pretrained = types.MethodType( + save_low_bit, model + ) quantization_config.remove_redundant_parameters() compression_manager.model.quantization_config = quantization_config logger.info("Quant Aware Training done.") @@ -1192,7 +1285,7 @@ def train_func(model): from neural_speed import Model model = Model() - model.init( # pylint: disable=E1123 + model.init( # pylint: disable=E1123 pretrained_model_name_or_path, weight_dtype="fp32", use_quant=False, @@ -1273,7 +1366,11 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): kwarg_attn_imp = kwargs.pop("attn_implementation", None) # lm-eval device map is dictionary - device_map = device_map[""] if isinstance(device_map, dict) and "" in device_map else device_map + device_map = ( + device_map[""] + if isinstance(device_map, dict) and "" in device_map + else device_map + ) if use_safetensors is None and not is_safetensors_available(): use_safetensors = False @@ -1289,8 +1386,12 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): ) token = use_auth_token - use_cpu = True if device_map == torch.device("cpu") or device_map == "cpu" else False - use_xpu = True if device_map == torch.device("xpu") or device_map == "xpu" else False + use_cpu = ( + True if device_map == torch.device("cpu") or device_map == "cpu" else False + ) + use_xpu = ( + True if device_map == torch.device("xpu") or device_map == "xpu" else False + ) user_agent = { "file_type": "model", @@ -1321,7 +1422,9 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): elif quantization_config["quant_method"] == "dynamic": quantization_config = DynamicQuantConfig.from_dict(quantization_config) elif quantization_config["quant_method"] == "qat": 
- quantization_config = QuantAwareTrainingConfig.from_dict(quantization_config) + quantization_config = QuantAwareTrainingConfig.from_dict( + quantization_config + ) elif quantization_config["quant_method"] == "sq": quantization_config = SmoothQuantConfig.from_dict(quantization_config) assert ( @@ -1462,11 +1565,15 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): "_raise_exceptions_for_missing_entries": False, "_commit_hash": commit_hash, } - resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs) + resolved_archive_file = cached_file( + pretrained_model_name_or_path, filename, **cached_file_kwargs + ) # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None # result when internet is up, the repo and revision exist, but the file does not. - if resolved_archive_file is None and filename == _add_variant(SAFE_WEIGHTS_NAME, variant): + if resolved_archive_file is None and filename == _add_variant( + SAFE_WEIGHTS_NAME, variant + ): # Maybe the checkpoint is sharded, we try to grab the index name in this case. resolved_archive_file = cached_file( pretrained_model_name_or_path, @@ -1487,9 +1594,13 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): # This repo has no safetensors file of any kind, we switch to PyTorch. filename = _add_variant(WEIGHTS_NAME, variant) resolved_archive_file = cached_file( - pretrained_model_name_or_path, filename, **cached_file_kwargs + pretrained_model_name_or_path, + filename, + **cached_file_kwargs, ) - if resolved_archive_file is None and filename == _add_variant(WEIGHTS_NAME, variant): + if resolved_archive_file is None and filename == _add_variant( + WEIGHTS_NAME, variant + ): # Maybe the checkpoint is sharded, we try to grab the index name in this case. 
resolved_archive_file = cached_file( pretrained_model_name_or_path, @@ -1508,7 +1619,9 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): "token": token, } if variant is not None and has_file( - pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs + pretrained_model_name_or_path, + WEIGHTS_NAME, + **has_file_kwargs, ): raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named" @@ -1571,8 +1684,11 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): if quantization_config.quant_method in ["static", "dynamic", "qat"]: model = model_class(config, *model_args, **kwargs) from neural_compressor.utils.pytorch import load + weights_file = os.path.join( - os.path.abspath(os.path.expanduser(pretrained_model_name_or_path)), WEIGHTS_NAME) + os.path.abspath(os.path.expanduser(pretrained_model_name_or_path)), + WEIGHTS_NAME, + ) q_model = load(weights_file, model, dataloader=None) del model return q_model @@ -1581,7 +1697,10 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import ( TSModelCausalLMForITREX, ) - q_model = torch.jit.load(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)) + + q_model = torch.jit.load( + os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + ) origin_model_type = config.model_type if origin_model_type in ["chatglm", "qwen", "baichuan"]: config.model_type = "qwen2" @@ -1611,19 +1730,25 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): dtype_orig = model_class._set_default_torch_dtype(torch_dtype) if quantization_config.compute_dtype is None: if use_xpu: - quantization_config.compute_dtype = \ - "fp16" if (torch_dtype is None or - torch_dtype == torch.bfloat16) \ + quantization_config.compute_dtype = ( + "fp16" + if (torch_dtype is None or torch_dtype == torch.bfloat16) else convert_dtype_torch2str(torch_dtype) + ) else: - quantization_config.compute_dtype = \ - "fp32" if (torch_dtype is None or - (not CpuInfo().bf16 and torch_dtype == torch.bfloat16) or - (torch_dtype == torch.float16)) \ + quantization_config.compute_dtype = ( + "fp32" + if ( + torch_dtype is None + or (not CpuInfo().bf16 and torch_dtype == torch.bfloat16) + or (torch_dtype == torch.float16) + ) else convert_dtype_torch2str(torch_dtype) + ) else: - if ((not CpuInfo().bf16 and quantization_config.compute_dtype == "bf16") - or (use_cpu and quantization_config.compute_dtype == "fp16")): + if (not CpuInfo().bf16 and quantization_config.compute_dtype == "bf16") or ( + use_cpu and quantization_config.compute_dtype == "fp16" + ): quantization_config.compute_dtype = "fp32" if quantization_config.scale_dtype is None: @@ -1631,7 +1756,9 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): if quantization_config.scale_dtype not in ["fp32", "fp16", "bf16"]: logger.warning("scale_dtype only supports fp32, bf16, fp16.") quantization_config.scale_dtype = "fp32" - logger.warning("fp32 scale_dtype is used, please change the config.json if you don't want to use it.") + logger.warning( + "fp32 scale_dtype is used, please change the config.json if you don't want to use it." + ) # weight dtype is higher priority than bits in config.json when both existed. 
if quantization_config.weight_dtype is None: @@ -1639,36 +1766,47 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): quantization_config.weight_dtype = "int4_clip" logger.info( "{} quantization weight_dtype is used due to bits is 4 in config.json.".format( - quantization_config.weight_dtype) + quantization_config.weight_dtype ) + ) elif quantization_config.bits == 8: quantization_config.weight_dtype = "int8" logger.info( "{} quantization weight_dtype is used due to bits is 8 in config.json.".format( - quantization_config.weight_dtype) + quantization_config.weight_dtype ) + ) else: logger.warning("bits number only supports 4, 8.") quantization_config.weight_dtype = "int4_clip" logger.warning( - "int4_clip weight_dtype is used, please change the config.json if you don't want to use it.") + "int4_clip weight_dtype is used, please change the config.json if you don't want to use it." + ) else: - if quantization_config.weight_dtype not in ["int4_fullrange", - "int4_clip", - "int8", - "fp8_e5m2", - "fp8_e4m3", - "nf4", - "fp4_e2m1_bnb", - "fp4_e2m1"]: - logger.warning("Please provide the correct bits number or weight_dtype in config.json.") + if quantization_config.weight_dtype not in [ + "int4_fullrange", + "int4_clip", + "int8", + "fp8_e5m2", + "fp8_e4m3", + "nf4", + "fp4_e2m1_bnb", + "fp4_e2m1", + ]: + logger.warning( + "Please provide the correct bits number or weight_dtype in config.json." + ) raise ValueError( f"weight_dtype must be a string in " f"'int8', 'int4', 'int4_fullrange', 'int4_clip', 'nf4', " f"'fp4', 'fp4_e2m1_bnb', 'fp4_e2m1', 'fp8', 'fp8_e5m2, fp8_e4m3'" ) else: - logger.info("{} quantization weight_dtype is used.".format(quantization_config.weight_dtype)) + logger.info( + "{} quantization weight_dtype is used.".format( + quantization_config.weight_dtype + ) + ) init_contexts = [no_init_weights(_enable=_fast_init)] init_contexts.append(init_empty_weights()) @@ -1706,7 +1844,10 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): if is_ipex_available() and quantization_config.use_ipex: import intel_extension_for_pytorch as ipex - from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_linear + from intel_extension_for_pytorch.nn.modules import ( + WeightOnlyQuantizedLinear as ipex_linear, + ) + def replace_ipex_cpu_woq_linear(model, current_name=[]): for name, module in model.named_children(): current_name.append(name) @@ -1716,37 +1857,46 @@ def replace_ipex_cpu_woq_linear(model, current_name=[]): 8: ipex.quantization.WoqWeightDtype.INT8, } compute_dtype = { - "fp32": ipex.quantization.WoqLowpMode.NONE, # follow the activation datatype. + "fp32": ipex.quantization.WoqLowpMode.NONE, # follow the activation datatype. 
"bf16": ipex.quantization.WoqLowpMode.BF16, "fp16": ipex.quantization.WoqLowpMode.FP16, "int8": ipex.quantization.WoqLowpMode.INT8, - } - ipex_qconfig_mapping = ( - ipex.quantization.get_weight_only_quant_qconfig_mapping( - weight_dtype=weight_dtype[quantization_config.bits], - lowp_mode=compute_dtype[quantization_config.compute_dtype], - act_quant_mode=ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, - group_size=quantization_config.group_size, - ) + ipex_qconfig_mapping = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=weight_dtype[quantization_config.bits], + lowp_mode=compute_dtype[quantization_config.compute_dtype], + act_quant_mode=ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, + group_size=quantization_config.group_size, ) tmp_linear = torch.nn.Linear( module.in_features, module.out_features, - True if hasattr(module, "bias") else False - ) + True if hasattr(module, "bias") else False, + ) tmp_linear.qconfig = ipex_qconfig_mapping.global_qconfig target_linear = ipex_linear.from_float_and_int4_weight( - mod = tmp_linear, - qweight = state_dict.pop('.'.join(current_name) + ".ipex_weight"), - scales = state_dict.pop('.'.join(current_name) + ".ipex_scales"), - zero_points = state_dict.pop('.'.join(current_name) + ".ipex_zeros"), - bias = state_dict.pop('.'.join(current_name) + ".ipex_bias") \ - if '.'.join(current_name) + ".ipex_bias" in state_dict else None, - group_size = quantization_config.group_size, - g_idx = state_dict.pop('.'.join(current_name) + ".ipex_g_idx") \ - if '.'.join(current_name) + ".ipex_g_idx" in state_dict else None, + mod=tmp_linear, + qweight=state_dict.pop( + ".".join(current_name) + ".ipex_weight" + ), + scales=state_dict.pop( + ".".join(current_name) + ".ipex_scales" + ), + zero_points=state_dict.pop( + ".".join(current_name) + ".ipex_zeros" + ), + bias=( + state_dict.pop(".".join(current_name) + ".ipex_bias") + if ".".join(current_name) + ".ipex_bias" in state_dict + else None + ), + group_size=quantization_config.group_size, + g_idx=( + state_dict.pop(".".join(current_name) + ".ipex_g_idx") + if ".".join(current_name) + ".ipex_g_idx" in state_dict + else None + ), ) setattr(model, name, target_linear) else: @@ -1783,14 +1933,18 @@ def replace_ipex_cpu_woq_linear(model, current_name=[]): # Set model in evaluation mode to deactivate DropOut modules by default model.eval() - if quantization_config.weight_dtype not in [ - "fp8_e5m2", - "fp8_e4m3", - "nf4", - "fp4_e2m1", - "fp4_e2m1_bnb", - "int4_fullrange", - ] and not quantization_config.use_ipex: + if ( + quantization_config.weight_dtype + not in [ + "fp8_e5m2", + "fp8_e4m3", + "nf4", + "fp4_e2m1", + "fp4_e2m1_bnb", + "int4_fullrange", + ] + and not quantization_config.use_ipex + ): model = replace_linear( model, quantization_config=quantization_config, @@ -1798,8 +1952,9 @@ def replace_ipex_cpu_woq_linear(model, current_name=[]): empty_weights=True, ) - if (not use_xpu and torch_dtype == torch.float16) or (not use_xpu and not CpuInfo().bf16 - and torch_dtype == torch.bfloat16): + if (not use_xpu and torch_dtype == torch.float16) or ( + not use_xpu and not CpuInfo().bf16 and torch_dtype == torch.bfloat16 + ): model.to(dtype=torch.float32) # If it is a model with generation capabilities, attempt to load the generation config diff --git a/intel_extension_for_transformers/transformers/utils/utility.py b/intel_extension_for_transformers/transformers/utils/utility.py index 2467531fab2..092a3a33a58 100644 --- a/intel_extension_for_transformers/transformers/utils/utility.py +++ 
b/intel_extension_for_transformers/transformers/utils/utility.py @@ -18,9 +18,7 @@ import argparse import os -from typing import Optional, Tuple -from neural_compressor.utils import logger -from neural_compressor.utils.utility import LazyImport, CpuInfo +from neural_compressor.utils.utility import LazyImport from intel_extension_for_transformers.tools.utils import is_ipex_available @@ -96,411 +94,3 @@ def __init__(self) -> None: self.dataset = dataloader.dataset return INCDataLoader() - - -def generate_dummy_past_key_values(config, input_bs): - """Generate the dummy past_key_values.""" - from optimum.utils import NormalizedConfigManager - if config.model_type == "qwen": - new_shape = [ - input_bs, - 0, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "baichuan": - new_shape = [ - input_bs, - config.num_attention_heads, - 0, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "chatglm": - new_shape = [ - 0, - input_bs, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_layers - else: - normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.model_type - )(config) - nb_pkv = 2 - num_layers = normalized_config.num_layers - num_attention_heads = normalized_config.num_attention_heads - hidden_size = normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - num_key_value_heads = num_attention_heads - if hasattr(normalized_config, "num_key_value_heads"): - num_key_value_heads = normalized_config.num_key_value_heads - if hasattr(normalized_config, "multi_query_group_num"): - num_key_value_heads = normalized_config.multi_query_group_num - - if config.model_type == "bloom": - shape_key = (input_bs * num_attention_heads, d_k, 1) - shape_value = (input_bs * num_attention_heads, 1, d_k) - key = torch.ones(size=shape_key) - value = torch.ones(size=shape_value) - past_key_values = tuple( - tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) - for _ in range(num_layers) - ) - return past_key_values - elif config.model_type == "gpt_bigcode": - new_shape = [input_bs, 0, d_k * 2] - dummy_tensor = torch.zeros(size=new_shape) - past_key_values = tuple([dummy_tensor] * num_layers) - return past_key_values - elif config.model_type == "falcon": - new_shape = [input_bs, 1, 0, d_k] - else: - new_shape = [input_bs, num_key_value_heads, 0, d_k] - past_key_values = [ - ( - torch.zeros(size=new_shape).contiguous(), - torch.zeros(size=new_shape).contiguous(), - ) - for _ in range(num_layers) - ] - return tuple(past_key_values) - -def generate_dummy_past_key_values_for_inference(config, input_bs): - """Generate the dummy past_key_values.""" - from optimum.utils import NormalizedConfigManager - if config.model_type == "qwen": - new_shape = [ - input_bs, - 0, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "baichuan": - new_shape = [ - input_bs, - config.num_attention_heads, - 0, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "chatglm": - new_shape = [ - 0, - input_bs, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_layers - else: - normalized_config = 
NormalizedConfigManager.get_normalized_config_class( - config.model_type - )(config) - nb_pkv = 2 - num_layers = normalized_config.num_layers - num_attention_heads = normalized_config.num_attention_heads - hidden_size = normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - num_key_value_heads = num_attention_heads - if hasattr(normalized_config, "num_key_value_heads"): - num_key_value_heads = normalized_config.num_key_value_heads - if hasattr(normalized_config, "multi_query_group_num"): - num_key_value_heads = normalized_config.multi_query_group_num - - if config.model_type == "bloom": - shape_key = (input_bs * num_attention_heads, d_k, 0) - shape_value = (input_bs * num_attention_heads, 0, d_k) - key = torch.empty(size=shape_key) - value = torch.empty(size=shape_value) - past_key_values = tuple( - tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) - for _ in range(num_layers) - ) - return past_key_values - elif config.model_type == "gpt_bigcode": - new_shape = [input_bs, 0, d_k * 2] - dummy_tensor = torch.zeros(size=new_shape) - past_key_values = tuple([dummy_tensor] * num_layers) - return past_key_values - elif config.model_type == "falcon": - new_shape = [input_bs, 1, 0, d_k] - else: - new_shape = [input_bs, num_key_value_heads, 0, d_k] - past_key_values = [ - ( - torch.zeros(size=new_shape).contiguous(), - torch.zeros(size=new_shape).contiguous(), - ) - for _ in range(num_layers) - ] - return tuple(past_key_values) - -def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): - """Generate the dummy past_key_values.""" - from optimum.utils import NormalizedConfigManager - if config.model_type == "qwen": - new_shape = [ - input_bs, - 1, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "baichuan": - new_shape = [ - input_bs, - config.num_attention_heads, - 1, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "chatglm": - new_shape = [ - 1, - input_bs, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_layers - else: - normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.model_type - )(config) - num_layers = normalized_config.num_layers - num_attention_heads = normalized_config.num_attention_heads - hidden_size = normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - num_key_value_heads = num_attention_heads - nb_pkv = 2 - if hasattr(normalized_config, "num_key_value_heads"): - num_key_value_heads = normalized_config.num_key_value_heads - if hasattr(normalized_config, "multi_query_group_num"): - num_key_value_heads = normalized_config.multi_query_group_num - if config.model_type == "bloom": - for nb_pkv in range(nb_pkv): - if nb_pkv % 2 == 0: - new_shape = [input_bs * num_key_value_heads, d_k, 1] - else: - new_shape = [input_bs * num_key_value_heads, 1, d_k] - - else: - new_shape = [input_bs, num_key_value_heads, 1, d_k] - - beam_idx_tmp = torch.zeros( - (2048, int(input_bs * num_beams)), dtype=torch.long - ).contiguous() - past_key_values = [ - ( - torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), - torch.zeros(size=new_shape).contiguous(), - torch.zeros(size=new_shape).contiguous(), - beam_idx_tmp, - ) - for _ in range(num_layers) - ] - return tuple(past_key_values) - -IPEX_OPT_LLM_SUPPORTED_DICT = { - "2.2": ["gptj", "opt", "llama", "falcon", 
"chatglm", "baichuan", "gpt-neox"], - "2.3": [ - "gptj", - "opt", - "llama", - "falcon", - "chatglm", - "baichuan", - "qwen", - "bloom", - "codegen", - "gptbigcode", - "t5", - "mixtral", - "mpt", - ], -} - -MODEL_TYPES_REQUIRING_POSITION_IDS = { - "codegen", - "gpt2", - "gpt-bigcode", - "gpt-neo", - "gpt-neox", - "gptj", - "imagegpt", - "llama", - "mistral", - "chatglm", -} - -if is_ipex_available() and ipex.__version__ == "2.2.0+cpu": - logger.info( - "ipex.llm.optimize by 2.2.0 version supported model family: {}".format( - ",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.2"]) - ) - ) - logger.info( - "The recommended transformers version is 4.35.2 if you used IPEX 2.2.0 version." - ) - IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.2"] -elif is_ipex_available() and ipex.__version__ == "2.3.0+cpu": - logger.info( - "ipex.llm.optimize by 2.3.0 version supported model family: {}".format( - ", ".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.3"]) - ) - ) - logger.info( - "The recommended transformers version is 4.38.1 if you used IPEX 2.3.0 version." - ) - IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] -else: - logger.warning("Please check the intel_extension_for_pytorch version is 2.3.0+cpu.") - IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] - -def get_example_inputs(model_config, batch_size=1, tokenizer=None, num_beams=4): - """Generate the dummy example inputs.""" - prompt = "Welcome to use Intel Extension for Transformers." - prompt = [prompt] * batch_size - input_ids = tokenizer(prompt, return_tensors="pt").input_ids - model_type = model_config.model_type.replace("_", "-") - if model_type in IPEX_OPT_LLM_SUPPORTED: - past_key_values = generate_dummy_past_key_values_for_opt_llm( - config=model_config, - input_bs=batch_size, - num_beams=num_beams - ) - else: - past_key_values = generate_dummy_past_key_values(config=model_config, input_bs=batch_size) - - input_ids = input_ids[:, :512] - if model_type in ["bloom", "qwen"]: - attention_mask = torch.ones(input_ids.shape[0], input_ids.shape[1] + 1) - attention_mask[:,0] = 0 - else: - attention_mask = torch.ones(input_ids.shape) - position_ids = torch.arange(input_ids.shape[1]).repeat(batch_size, 1) - - if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: - example_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "position_ids": position_ids, - "past_key_values": past_key_values - } - else: - example_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "past_key_values": past_key_values - } - return example_inputs - - -def make_torchscript_model(model, json_file_path, example_inputs): - """Recover ipex model from JSON file. - - Args: - model (object): fp32 model need to do quantization. - json_file_path (json): configuration JSON file for ipex. - example_inputs (tuple or torch.Tensor or dict): example inputs that will be passed to the ipex function. 
- - Returns: - (object): quantized model - """ - - ipex = LazyImport("intel_extension_for_pytorch") - from torch.ao.quantization.observer import MinMaxObserver - - if ipex.__version__ >= "2.1.100": - qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5, act_observer=MinMaxObserver) - else: - qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5, act_observer=MinMaxObserver()) - if isinstance(example_inputs, dict): - model = ipex.quantization.prepare(model, qconfig, example_kwarg_inputs=example_inputs, inplace=True) - else: - model = ipex.quantization.prepare(model, qconfig, example_inputs=example_inputs, inplace=True) - model.load_qconf_summary(qconf_summary=json_file_path) - model = ipex.quantization.convert(model, inplace=True) - model.eval() - with torch.no_grad(): - try: - if isinstance(example_inputs, dict): - # pylint: disable=E1120,E1123 - model = torch.jit.trace(model, example_kwarg_inputs=example_inputs) - else: - model = torch.jit.trace(model, example_inputs) - model = torch.jit.freeze(model.eval()) - except: - if isinstance(example_inputs, dict): - # pylint: disable=E1120,E1123 - model = torch.jit.trace(model, example_kwarg_inputs=example_inputs, strict=False, check_trace=False) - else: - model = torch.jit.trace(model, example_inputs, strict=False) - model = torch.jit.freeze(model.eval()) - if isinstance(example_inputs, dict): - model(**example_inputs) - model(**example_inputs) - elif isinstance(example_inputs, tuple) or isinstance(example_inputs, list): - model(*example_inputs) - model(*example_inputs) - else: - model(example_inputs) - model(example_inputs) - return model - -def recover_model_from_json(fp32_model_name_or_path, json_file_path, trust_remote_code=False): - """Recover ipex model from JSON file. - - Args: - model (object): fp32 model need to do quantization. - json_file_path (json): configuration JSON file for ipex. - trust_remote_code (bool): trust remote code. - - Returns: - (object): quantized model - """ - from transformers import AutoModelForCausalLM - - # ipex recovered int8 model from configure.json requests float32 model input and on cpu device. 
- user_model = AutoModelForCausalLM.from_pretrained(fp32_model_name_or_path, - trust_remote_code=trust_remote_code).float() - if user_model.config.model_type in IPEX_OPT_LLM_SUPPORTED: - import intel_extension_for_pytorch as ipex - qconfig = ipex.quantization.default_static_qconfig_mapping - user_model = ipex.optimize_transformers( - user_model.eval(), - dtype=torch.float, - inplace=True, - quantization_config=qconfig, - deployment_mode=False, - ) - - # tokenizer - if user_model.config.model_type == "llama": - from transformers import LlamaTokenizer - tokenizer = LlamaTokenizer.from_pretrained(user_model.config.name_or_path) - else: - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained( - user_model.config.name_or_path, trust_remote_code=trust_remote_code - ) - - # example_inputs - example_inputs = get_example_inputs(user_model.config, tokenizer=tokenizer) - - # pylint: disable=E0611 - user_model.config.torchscript = True - config = user_model.config - user_model = make_torchscript_model(user_model, json_file_path, example_inputs) - import intel_extension_for_pytorch as ipex - from intel_extension_for_transformers.transformers.llm.evaluation.models import ( - TSModelCausalLMForITREX, - ) - origin_model_type = config.model_type - if origin_model_type in ["chatglm", "qwen", "baichuan"]: - config.model_type = "qwen2" - user_model = TSModelCausalLMForITREX(user_model, config=config) - user_model.config.model_type = origin_model_type - return user_model
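Note (not part of the patch): a minimal usage sketch of TSModelCausalLMForITREX at its new home in llm/quantization/sq_utils.py, mirroring the updated import and the from_pretrained(file_name="best_model.pt") call in neural_chat/models/model_utils.py above. The checkpoint directory below is hypothetical and is assumed to also contain the tokenizer files.

    # Sketch only: load a SmoothQuant TorchScript checkpoint through the relocated wrapper.
    from transformers import AutoTokenizer
    from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import (
        TSModelCausalLMForITREX,
    )

    model_dir = "path/to/saved_sq_model"  # hypothetical directory holding best_model.pt and config.json
    model = TSModelCausalLMForITREX.from_pretrained(model_dir, file_name="best_model.pt")
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    inputs = tokenizer("Welcome to use Intel Extension for Transformers.", return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=16)  # wrapper exposes the usual generate() API
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))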
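Note (not part of the patch): a rough sketch of the weight-only quantize/save/reload round trip that save_low_bit and load_low_bit in modeling_auto.py serve, built from the RtnConfig fields visible in this diff. The model id and output directory are placeholders, and the exact reload entry point can vary between ITREX releases.

    # Sketch only: RTN 4-bit weight-only quantization, then save/reload of the low-bit checkpoint.
    from intel_extension_for_transformers.transformers import AutoModelForCausalLM, RtnConfig

    quant_config = RtnConfig(
        bits=4,
        compute_dtype="int8",
        scale_dtype="bf16",
        weight_dtype="int4_clip",
        group_size=128,
    )  # same fields as the default RtnConfig constructed in from_pretrained above

    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-125m",  # placeholder model id for illustration
        quantization_config=quant_config,
    )
    model.save_pretrained("opt-125m-int4")  # quantized models save their low-bit state via save_low_bit
    reloaded = AutoModelForCausalLM.from_pretrained("opt-125m-int4")  # saved low-bit checkpoints are routed to load_low_bit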