example update for 3.x ipex sq #1902

Merged · 21 commits · Aug 2, 2024
6 changes: 3 additions & 3 deletions examples/.config/model_params_pytorch_3x.json
@@ -85,7 +85,7 @@
"batch_size": 8
},
"gpt_j_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
@@ -99,7 +99,7 @@
"batch_size": 1
},
"llama2_7b_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
@@ -113,7 +113,7 @@
"batch_size": 1
},
"opt_125m_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
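Each *_ipex entry keeps the same shape; only model_src_dir now points at the static_quant/ipex example directory. A minimal sketch of one updated entry, written as a Python dict and limited to the fields visible in this hunk (the real JSON entries may carry additional fields):

```python
opt_125m_ipex = {
    "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
    "dataset_location": "",
    "input_model": "",
    "main_script": "run_clm_no_trainer.py",
    "batch_size": 1,
}
```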
[requirements file; path not shown in this view]
@@ -11,3 +11,4 @@ neural-compressor
intel-extension-for-transformers
lm_eval==0.4.2
peft
+optimum-intel
[file header not shown in this view]
@@ -162,15 +162,6 @@ def get_user_model():
collate_fn=calib_evaluator.collate_batch,
)

-from neural_compressor.torch.quantization import SmoothQuantConfig
-
-args.alpha = eval(args.alpha)
-excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"]
-quant_config = SmoothQuantConfig(alpha=args.alpha, folding=False, excluded_precisions=excluded_precisions)
-
-if re.search("gpt", user_model.config.model_type):
-    quant_config.set_local(torch.add, SmoothQuantConfig(w_dtype="fp32", act_dtype="fp32"))
-
from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device
from tqdm import tqdm

@@ -189,16 +180,39 @@ def run_fn(model):
if calib_iter >= args.calib_iters:
break
return


+def eval_func(model):
+    config = AutoConfig.from_pretrained(args.model)
+    setattr(model, "config", config)
+
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    eval_args = LMEvalParser(
+        model="hf",
+        user_model=model,
+        tokenizer=tokenizer,
+        batch_size=args.batch_size,
+        tasks=args.tasks,
+        device="cpu",
+    )
+    results = evaluate(eval_args)
+    if args.tasks == "wikitext":
+        return results["results"][args.tasks]["word_perplexity,none"]
+    else:
+        return results["results"][args.tasks]["acc,none"]

from utils import get_example_inputs

example_inputs = get_example_inputs(user_model, calib_dataloader)

-from neural_compressor.torch.quantization import prepare, convert
-user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
-run_fn(user_model)
-user_model = convert(user_model)
+from neural_compressor.torch.quantization import SmoothQuantConfig, autotune, TuningConfig
+tune_config = TuningConfig(config_set=SmoothQuantConfig.get_config_set_for_tuning())
+user_model = autotune(
+    user_model,
+    tune_config=tune_config,
+    eval_fn=eval_func,
+    run_fn=run_fn,
+    example_inputs=example_inputs,
+)
user_model.save(args.output_dir)
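The hunk above swaps the fixed SmoothQuantConfig + prepare/run_fn/convert flow for accuracy-driven tuning. A condensed sketch of the new flow, assuming user_model, run_fn, eval_func, example_inputs, and args are defined as in this script:

```python
from neural_compressor.torch.quantization import SmoothQuantConfig, TuningConfig, autotune

# Tuning space: the default SmoothQuant config set (alpha sweep, folding and
# scale_sharing variants; see the config.py change further down).
tune_config = TuningConfig(config_set=SmoothQuantConfig.get_config_set_for_tuning())

# autotune measures a baseline with eval_fn, then tries candidate configs
# (roughly prepare -> run_fn calibration -> convert -> eval_fn per trial)
# and returns the best-scoring quantized model.
user_model = autotune(
    user_model,
    tune_config=tune_config,
    eval_fn=eval_func,
    run_fn=run_fn,
    example_inputs=example_inputs,
)
user_model.save(args.output_dir)
```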


@@ -231,11 +245,10 @@ def run_fn(model):
results = evaluate(eval_args)
for task_name in args.tasks.split(","):
if task_name == "wikitext":
-acc = results["results"][task_name]["word_perplexity,none"]
+print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity,none"]))
else:
-acc = results["results"][task_name]["acc,none"]
-print("Accuracy: %.5f" % acc)
-print("Batch size = %d" % args.batch_size)
+print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc,none"]))


if args.performance:
user_model.eval()
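Both the new eval_func and the accuracy report above read the same lm_eval results dict: wikitext is scored by word perplexity, other tasks by plain accuracy. A small helper sketch of that lookup, assuming results is the dict returned by evaluate(); the helper name is illustrative, not part of the PR:

```python
def metric_for(results: dict, task_name: str) -> float:
    """Pick the reported metric for a task from an lm_eval results dict."""
    if task_name == "wikitext":
        # perplexity: lower is better
        return results["results"][task_name]["word_perplexity,none"]
    # other tasks report accuracy
    return results["results"][task_name]["acc,none"]
```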
[file header not shown in this view]
@@ -164,9 +164,9 @@ def get_user_model():
)


-from neural_compressor.torch.quantization import get_default_static_config, StaticQuantConfig
-quant_config = get_default_static_config()
-quant_config.excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"]
+from neural_compressor.torch.quantization import StaticQuantConfig
+excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"]
+quant_config = StaticQuantConfig(excluded_precisions=excluded_precisions)
if re.search("gpt", user_model.config.model_type):
quant_config.set_local("add", StaticQuantConfig(w_dtype="fp32", act_dtype="fp32"))

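The static-quant example now constructs its config directly instead of mutating the default one. A minimal sketch of the resulting flow, assuming args, user_model, run_fn, and example_inputs as in this script; the prepare/convert calls follow the same pattern used elsewhere in this PR:

```python
import re
from neural_compressor.torch.quantization import StaticQuantConfig, prepare, convert

excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"]
quant_config = StaticQuantConfig(excluded_precisions=excluded_precisions)

# Keep the add op in FP32 for GPT-style models, as in the hunk above.
if re.search("gpt", user_model.config.model_type):
    quant_config.set_local("add", StaticQuantConfig(w_dtype="fp32", act_dtype="fp32"))

user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(user_model)  # calibration
user_model = convert(user_model)
```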
2 changes: 1 addition & 1 deletion neural_compressor/torch/quantization/autotune.py
@@ -81,7 +81,7 @@ def autotune(
best_quant_model = None
eval_func_wrapper = EvaluationFuncWrapper(eval_fn, eval_args)
config_loader, tuning_logger, tuning_monitor = init_tuning(tuning_config=tune_config)
-baseline: float = eval_func_wrapper.evaluate(model)
+baseline: float = eval_func_wrapper.evaluate(deepcopy(model))
tuning_monitor.set_baseline(baseline)
tuning_logger.tuning_start()
for trial_index, quant_config in enumerate(config_loader, 1):
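The one-line change above evaluates the baseline on a copy of the model, presumably so that anything the evaluation does to the model cannot carry over into the tuning trials. A hedged sketch of the idea, with a hypothetical helper name:

```python
from copy import deepcopy

def safe_baseline(eval_fn, model):
    # Evaluate on a throwaway copy so any in-place changes made while
    # benchmarking (wrapping, dtype casts, device moves) do not leak into
    # the model that the tuner will quantize next.
    return eval_fn(deepcopy(model))
```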
10 changes: 8 additions & 2 deletions neural_compressor/torch/quantization/config.py
@@ -1582,8 +1582,14 @@ def get_model_info(self, model: torch.nn.Module, example_inputs) -> List[Tuple[s

@classmethod
def get_config_set_for_tuning(cls) -> Union[None, "SmoothQuantConfig", List["SmoothQuantConfig"]]:
"""Get the default configuration set for tuning."""
-return SmoothQuantConfig(alpha=[0.1, 0.5], folding=[True, False], scale_sharing=[True, False])
+import numpy as np
+
+return SmoothQuantConfig(
+    alpha=np.arange(0.1, 1.0, 0.1).tolist(),
+    folding=[True, False],
+    scale_sharing=[True, False],
+    excluded_precisions=[["bf16"]],
+)


def get_default_sq_config() -> SmoothQuantConfig:
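The default SmoothQuant tuning set now sweeps alpha in 0.1 steps rather than trying only 0.1 and 0.5. A quick sketch of what the new alpha candidates expand to:

```python
import numpy as np

# Same expression as in get_config_set_for_tuning()
alphas = np.arange(0.1, 1.0, 0.1).tolist()
print(len(alphas))                    # 9
print([round(a, 1) for a in alphas])  # [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
```

Combined with the folding and scale_sharing lists, this gives the tuner a noticeably larger space to search.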
3 changes: 1 addition & 2 deletions test/3x/torch/quantization/test_static_quant.py
@@ -216,7 +216,7 @@ def test_static_quant_with_quantize_API(self):
def test_static_quant_mixed_precision(self):
fp32_model = copy.deepcopy(self.fp32_model)
example_inputs = self.input
-quant_config = get_default_static_config()
+quant_config = StaticQuantConfig(excluded_precisions=["bf16"])
prepared_model = prepare(fp32_model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(prepared_model)
q_model = convert(prepared_model)
@@ -229,7 +229,6 @@ def test_static_quant_mixed_precision(self):
q_model = convert(prepared_model)
assert q_model is not None, "Quantization failed!"

-quant_config.excluded_precisions = ["bf16"]
prepared_model = prepare(fp32_model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(prepared_model)
q_model = convert(prepared_model)
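In the test, the int8-only case now builds its config up front instead of patching excluded_precisions on the default config afterwards. A minimal sketch of the updated pattern, assuming fp32_model, example_inputs, and run_fn from the test module:

```python
import copy
from neural_compressor.torch.quantization import StaticQuantConfig, prepare, convert

# Exclude bf16 at construction time (previously: get_default_static_config()
# followed by quant_config.excluded_precisions = ["bf16"]).
quant_config = StaticQuantConfig(excluded_precisions=["bf16"])

model = copy.deepcopy(fp32_model)  # keep the original FP32 model untouched
prepared_model = prepare(model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(prepared_model)  # calibration pass
q_model = convert(prepared_model)
assert q_model is not None, "Quantization failed!"
```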