example update for 3.x ipex sq #1902

Merged (21 commits) on Aug 2, 2024
Changes from 3 commits
@@ -11,3 +11,4 @@
neural-compressor
intel-extension-for-transformers
lm_eval==0.4.2
peft
optimum-intel
@@ -52,6 +52,7 @@
help="calibration iters.")
parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext",
type=str, help="tasks for accuracy validation")
parser.add_argument("--max_new_tokens", default=32, type=int, help="output max new tokens")
parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
# ============SmoothQuant configs==============
parser.add_argument("--sq", action="store_true")
@@ -191,32 +192,56 @@ def run_fn(model):
            if calib_iter >= args.calib_iters:
                break
        return


    def eval_func(model):
        from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
        eval_args = LMEvalParser(
            model="hf",
            user_model=model,
            tokenizer=tokenizer,
            batch_size=args.batch_size,
            tasks=args.tasks,
            device="cpu",
        )
        results = evaluate(eval_args)
        if args.tasks == "wikitext":
            return results["results"][args.tasks]["word_perplexity,none"]
        else:
            return results["results"][args.tasks]["acc,none"]

    from utils import get_example_inputs
    example_inputs = get_example_inputs(user_model, calib_dataloader)

    from neural_compressor.torch.quantization import prepare, convert
    user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
    run_fn(user_model)
    user_model = convert(user_model)
    from neural_compressor.torch.quantization import SmoothQuantConfig, autotune, TuningConfig
    tune_config = TuningConfig(config_set=SmoothQuantConfig.get_config_set_for_tuning())
    user_model = autotune(
        user_model,
        tune_config=tune_config,
        eval_fn=eval_func,
        run_fn=run_fn,
        example_inputs=example_inputs,
    )
    user_model.save(args.output_dir)
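
The block above replaces the fixed prepare/run_fn/convert pass with accuracy-driven tuning via autotune. Below is a condensed, self-contained sketch of that 3.x flow (not part of the PR); it uses only the APIs that appear in this diff, while the checkpoint name, calibration loop, and evaluation metric are illustrative placeholders.

# Not part of the PR diff: minimal sketch of the 3.x SmoothQuant autotune flow,
# with placeholder model, calibration, and metric.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from neural_compressor.torch.quantization import SmoothQuantConfig, TuningConfig, autotune

model_name = "facebook/opt-125m"  # hypothetical checkpoint
user_model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
example_inputs = tokenizer("Hello, my dog is cute", return_tensors="pt").input_ids

def run_fn(model):
    # Calibration: a few forward passes so activation ranges can be observed.
    with torch.no_grad():
        for _ in range(3):
            model(example_inputs)

def eval_fn(model):
    # Accuracy proxy that drives tuning (higher is better); the real example runs lm_eval tasks here.
    with torch.no_grad():
        loss = model(example_inputs, labels=example_inputs).loss
    return -float(loss)

tune_config = TuningConfig(config_set=SmoothQuantConfig.get_config_set_for_tuning())
q_model = autotune(
    user_model,
    tune_config=tune_config,
    eval_fn=eval_fn,
    run_fn=run_fn,
    example_inputs=example_inputs,
)
q_model.save("./saved_results")  # hypothetical output directory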


if args.load:
    # TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result
    if args.int8 or args.int8_bf16_mixed:
        print("load int8 model")
        print("Loading SmoothQuant int8 model.")
        from neural_compressor.torch.quantization import load
        from intel_extension_for_transformers.transformers.llm.evaluation.models import (
            TSModelCausalLMForITREX,
        )
        tokenizer = AutoTokenizer.from_pretrained(args.model)
        config = AutoConfig.from_pretrained(args.model)
        origin_model_type = config.model_type
        user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
        setattr(user_model, "config", config)
        user_model = TSModelCausalLMForITREX(user_model, config=config)
        user_model.config.model_type = origin_model_type
    else:
        user_model, tokenizer = get_user_model()
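
For readers following the load branch above: it restores the TorchScript int8 artifact written by save() and wraps it so it exposes a transformers-style causal-LM interface for evaluation. A condensed sketch (not part of the PR), using only the calls shown in the diff; the checkpoint name and output directory are placeholders.

# Not part of the PR diff: condensed load-for-inference sketch with placeholder paths.
import os
from transformers import AutoConfig, AutoTokenizer
from neural_compressor.torch.quantization import load
from intel_extension_for_transformers.transformers.llm.evaluation.models import TSModelCausalLMForITREX

model_name = "facebook/opt-125m"  # hypothetical checkpoint used for quantization
output_dir = "./saved_results"    # hypothetical directory written by user_model.save(...)

tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
origin_model_type = config.model_type
ts_model = load(os.path.abspath(os.path.expanduser(output_dir)))  # TorchScript int8 model
setattr(ts_model, "config", config)
user_model = TSModelCausalLMForITREX(ts_model, config=config)     # transformers-style wrapper
user_model.config.model_type = origin_model_type                  # keep the original model_type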


if args.accuracy:
    user_model.eval()
    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
    eval_args = LMEvalParser(
        model="hf",
@@ -229,36 +254,33 @@ def run_fn(model):
    results = evaluate(eval_args)
    for task_name in args.tasks.split(","):
        if task_name == "wikitext":
            acc = results["results"][task_name]["word_perplexity,none"]
            print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity,none"]))
        else:
            acc = results["results"][task_name]["acc,none"]
            print("Accuracy: %.5f" % acc)
            print('Batch size = %d' % args.batch_size)
            print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc,none"]))


if args.performance:
    user_model.eval()
    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
    batch_size, input_leng = args.batch_size, 512
    example_inputs = torch.ones((batch_size, input_leng), dtype=torch.long)
    print("Batch size = {:d}".format(batch_size))
    print("The length of input tokens = {:d}".format(input_leng))
    import time

    samples = args.iters * args.batch_size
    eval_args = LMEvalParser(
        model="hf",
        user_model=user_model,
        tokenizer=tokenizer,
        batch_size=args.batch_size,
        tasks=args.tasks,
        limit=samples,
        device="cpu",
    )
    start = time.time()
    results = evaluate(eval_args)
    end = time.time()
    for task_name in args.tasks.split(","):
        if task_name == "wikitext":
            acc = results["results"][task_name]["word_perplexity,none"]
        else:
            acc = results["results"][task_name]["acc,none"]
    print("Accuracy: %.5f" % acc)
    print('Throughput: %.3f samples/sec' % (samples / (end - start)))
    print('Latency: %.3f ms' % ((end - start) * 1000 / samples))
    print('Batch size = %d' % args.batch_size)
    total_iters = args.iters
    warmup_iters = 5
    with torch.no_grad():
        for i in range(total_iters):
            if i == warmup_iters:
                start = time.time()
            user_model.generate(
                example_inputs,
                max_new_tokens=args.max_new_tokens,
                do_sample=False,
                temperature=0.9,
                num_beams=4,
            )
    end = time.time()
    latency = (end - start) / ((total_iters - warmup_iters) * args.batch_size)
    throughput = ((total_iters - warmup_iters) * args.batch_size) / (end - start)
    print("Latency: {:.3f} ms".format(latency * 10**3))
    print("Throughput: {:.3f} samples/sec".format(throughput))
6 changes: 5 additions & 1 deletion neural_compressor/torch/quantization/config.py
@@ -1191,7 +1191,11 @@ def get_model_info(self, model: torch.nn.Module, example_inputs) -> List[Tuple[s

    @classmethod
    def get_config_set_for_tuning(cls) -> Union[None, "SmoothQuantConfig", List["SmoothQuantConfig"]]:
        return SmoothQuantConfig(alpha=[0.1, 0.5], folding=[True, False], scale_sharing=[True, False])
        import numpy as np

        return SmoothQuantConfig(
            alpha=np.arange(0.1, 1.0, 0.1).tolist(), folding=[True, False], scale_sharing=[True, False]
        )


def get_default_sq_config() -> SmoothQuantConfig:
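
For reference, the change above widens the SmoothQuant tuning space from two alpha candidates to nine: np.arange(0.1, 1.0, 0.1).tolist() yields roughly 0.1 through 0.9 (with the usual floating-point rounding). Assuming the tuner expands list-valued fields as a full cross-product, that is up to 9 x 2 x 2 = 36 candidate configs instead of 2 x 2 x 2 = 8. A quick check (not part of the PR):

# Not part of the PR diff: what the new alpha sweep expands to.
import numpy as np

alphas = np.arange(0.1, 1.0, 0.1).tolist()
print(len(alphas))                     # 9
print([round(a, 1) for a in alphas])   # [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]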