diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json
index e38749e2ef6..9571ad5f758 100644
--- a/examples/.config/model_params_pytorch_3x.json
+++ b/examples/.config/model_params_pytorch_3x.json
@@ -85,7 +85,7 @@
       "batch_size": 8
     },
     "gpt_j_ipex":{
-      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
       "dataset_location": "",
       "input_model": "",
       "main_script": "run_clm_no_trainer.py",
@@ -99,7 +99,7 @@
       "batch_size": 1
     },
     "llama2_7b_ipex":{
-      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
       "dataset_location": "",
       "input_model": "",
       "main_script": "run_clm_no_trainer.py",
@@ -113,7 +113,7 @@
       "batch_size": 1
     },
     "opt_125m_ipex":{
-      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
       "dataset_location": "",
       "input_model": "",
       "main_script": "run_clm_no_trainer.py",
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt
index f0b56e558d3..d4155dfbf75 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt
@@ -11,3 +11,4 @@ neural-compressor
 intel-extension-for-transformers
 lm_eval==0.4.2
 peft
+optimum-intel
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py
index 94acc14344f..694c0505ea4 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py
@@ -162,15 +162,6 @@ def get_user_model():
         collate_fn=calib_evaluator.collate_batch,
     )
 
-    from neural_compressor.torch.quantization import SmoothQuantConfig
-
-    args.alpha = eval(args.alpha)
-    excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"]
-    quant_config = SmoothQuantConfig(alpha=args.alpha, folding=False, excluded_precisions=excluded_precisions)
-
-    if re.search("gpt", user_model.config.model_type):
-        quant_config.set_local(torch.add, SmoothQuantConfig(w_dtype="fp32", act_dtype="fp32"))
-
     from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device
     from tqdm import tqdm
 
@@ -189,16 +180,39 @@ def run_fn(model):
             if calib_iter >= args.calib_iters:
                 break
         return
-
+
+    def eval_func(model):
+        config = AutoConfig.from_pretrained(args.model)
+        setattr(model, "config", config)
+
+        from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+        eval_args = LMEvalParser(
+            model="hf",
+            user_model=model,
+            tokenizer=tokenizer,
+            batch_size=args.batch_size,
+            tasks=args.tasks,
+            device="cpu",
+        )
+        results = evaluate(eval_args)
+        if args.tasks == "wikitext":
+            return results["results"][args.tasks]["word_perplexity,none"]
+        else:
+            return results["results"][args.tasks]["acc,none"]
+
     from utils import get_example_inputs
     example_inputs = get_example_inputs(user_model, calib_dataloader)
 
-    from neural_compressor.torch.quantization import prepare, convert
-
-    user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
-    run_fn(user_model)
-    user_model = convert(user_model)
+    from neural_compressor.torch.quantization import SmoothQuantConfig, autotune, TuningConfig
+    tune_config = TuningConfig(config_set=SmoothQuantConfig.get_config_set_for_tuning())
+    user_model = autotune(
+        user_model,
+        tune_config=tune_config,
+        eval_fn=eval_func,
+        run_fn=run_fn,
+        example_inputs=example_inputs,
+    )
     user_model.save(args.output_dir)
 
@@ -231,11 +245,10 @@ def run_fn(model):
     results = evaluate(eval_args)
     for task_name in args.tasks.split(","):
         if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity,none"]
+            print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity,none"]))
         else:
-            acc = results["results"][task_name]["acc,none"]
-    print("Accuracy: %.5f" % acc)
-    print("Batch size = %d" % args.batch_size)
+            print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc,none"]))
+
 
 if args.performance:
     user_model.eval()
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py
index 0ccb2093537..b56c01f20f5 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py
@@ -164,9 +164,9 @@ def get_user_model():
     )
 
-    from neural_compressor.torch.quantization import get_default_static_config, StaticQuantConfig
-    quant_config = get_default_static_config()
-    quant_config.excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"]
+    from neural_compressor.torch.quantization import StaticQuantConfig
+    excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"]
+    quant_config = StaticQuantConfig(excluded_precisions=excluded_precisions)
 
     if re.search("gpt", user_model.config.model_type):
         quant_config.set_local("add", StaticQuantConfig(w_dtype="fp32", act_dtype="fp32"))
diff --git a/neural_compressor/torch/quantization/autotune.py b/neural_compressor/torch/quantization/autotune.py
index 7a53b54b0d5..2c6dcaa768f 100644
--- a/neural_compressor/torch/quantization/autotune.py
+++ b/neural_compressor/torch/quantization/autotune.py
@@ -81,7 +81,7 @@ def autotune(
     best_quant_model = None
     eval_func_wrapper = EvaluationFuncWrapper(eval_fn, eval_args)
     config_loader, tuning_logger, tuning_monitor = init_tuning(tuning_config=tune_config)
-    baseline: float = eval_func_wrapper.evaluate(model)
+    baseline: float = eval_func_wrapper.evaluate(deepcopy(model))
     tuning_monitor.set_baseline(baseline)
     tuning_logger.tuning_start()
     for trial_index, quant_config in enumerate(config_loader, 1):
diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py
index 99c9f3f202a..4f34314ab21 100644
--- a/neural_compressor/torch/quantization/config.py
+++ b/neural_compressor/torch/quantization/config.py
@@ -1582,8 +1582,14 @@ def get_model_info(self, model: torch.nn.Module, example_inputs) -> List[Tuple[s
 
     @classmethod
     def get_config_set_for_tuning(cls) -> Union[None, "SmoothQuantConfig", List["SmoothQuantConfig"]]:
-        """Get the default configuration set for tuning."""
-        return SmoothQuantConfig(alpha=[0.1, 0.5], folding=[True, False], scale_sharing=[True, False])
+        import numpy as np
+
+        return SmoothQuantConfig(
+            alpha=np.arange(0.1, 1.0, 0.1).tolist(),
+            folding=[True, False],
+            scale_sharing=[True, False],
+            excluded_precisions=[["bf16"]],
+        )
 
 
 def get_default_sq_config() -> SmoothQuantConfig:
diff --git a/test/3x/torch/quantization/test_static_quant.py b/test/3x/torch/quantization/test_static_quant.py
index 4aecd29eecf..5bc37180045 100644
--- a/test/3x/torch/quantization/test_static_quant.py
+++ b/test/3x/torch/quantization/test_static_quant.py
@@ -216,7 +216,7 @@ def test_static_quant_with_quantize_API(self):
     def test_static_quant_mixed_precision(self):
         fp32_model = copy.deepcopy(self.fp32_model)
         example_inputs = self.input
-        quant_config = get_default_static_config()
+        quant_config = StaticQuantConfig(excluded_precisions=["bf16"])
         prepared_model = prepare(fp32_model, quant_config=quant_config, example_inputs=example_inputs)
         run_fn(prepared_model)
         q_model = convert(prepared_model)
@@ -229,7 +229,6 @@ def test_static_quant_mixed_precision(self):
         q_model = convert(prepared_model)
         assert q_model is not None, "Quantization failed!"
 
-        quant_config.excluded_precisions = ["bf16"]
         prepared_model = prepare(fp32_model, quant_config=quant_config, example_inputs=example_inputs)
         run_fn(prepared_model)
         q_model = convert(prepared_model)
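
Note on the smooth_quant example change: the fixed `SmoothQuantConfig` + `prepare`/`convert` flow is replaced by accuracy-driven tuning over `SmoothQuantConfig.get_config_set_for_tuning()`. Below is a minimal sketch of that autotune loop on a toy module; the toy model, calibration loop, and scalar eval metric are placeholders for the script's `user_model`, `run_fn`, and lm-eval-based `eval_func`, and the SmoothQuant path is assumed to have intel-extension-for-pytorch installed.

```python
import torch

from neural_compressor.torch.quantization import SmoothQuantConfig, TuningConfig, autotune

# Toy stand-ins for user_model / calib_dataloader in the example script.
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2))
example_inputs = torch.randn(4, 8)


def run_fn(model):
    # Calibration: run a few representative batches through the prepared model.
    for _ in range(3):
        model(torch.randn(4, 8))


def eval_fn(model):
    # Dummy scalar score; the real script returns lm-eval accuracy (or word perplexity).
    with torch.no_grad():
        return float(model(example_inputs).sum())


# Sweep the default SmoothQuant config set and keep the best-scoring model.
tune_config = TuningConfig(config_set=SmoothQuantConfig.get_config_set_for_tuning())
best_model = autotune(
    model,
    tune_config=tune_config,
    eval_fn=eval_fn,
    run_fn=run_fn,
    example_inputs=example_inputs,
)
```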
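Similarly, the static_quant/ipex example and the updated test construct the config directly instead of mutating the object returned by `get_default_static_config()`. A rough sketch of that pattern with a toy model, again assuming the IPEX backend (intel-extension-for-pytorch) is available:

```python
import torch

from neural_compressor.torch.quantization import StaticQuantConfig, convert, prepare

# excluded_precisions is now passed at construction time rather than set afterwards.
quant_config = StaticQuantConfig(excluded_precisions=["bf16"])  # keep bf16 out of the mix

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2))
example_inputs = torch.randn(4, 8)

prepared_model = prepare(model, quant_config=quant_config, example_inputs=example_inputs)
for _ in range(3):  # short calibration pass
    prepared_model(torch.randn(4, 8))
q_model = convert(prepared_model)
```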
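The one-line autotune.py change computes the FP32 baseline on `deepcopy(model)` so that an `eval_fn` with side effects (the `eval_func` above attaches a fresh config to the model, and harness-based evaluation may modify the module in place) cannot affect the model that is subsequently quantized in the tuning trials. A small illustration of the behavior this protects against; the eval function here is hypothetical:

```python
from copy import deepcopy

import torch


def eval_fn_with_side_effect(model):
    # Harness-style evaluation may attach attributes or otherwise mutate the module.
    model.was_evaluated = True
    return 0.42


model = torch.nn.Linear(4, 4)
baseline = eval_fn_with_side_effect(deepcopy(model))  # what autotune now does
assert not hasattr(model, "was_evaluated")  # the original model is left untouched
```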
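Finally, the config.py change widens the default alpha sweep for SmoothQuant tuning from two candidates to a 0.1-0.9 grid and pins `excluded_precisions` to `["bf16"]`. For reference, `np.arange(0.1, 1.0, 0.1).tolist()` expands to nine values and excludes the endpoint:

```python
import numpy as np

alphas = np.arange(0.1, 1.0, 0.1).tolist()
print(len(alphas))                    # 9
print([round(a, 1) for a in alphas])  # [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
```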