From cdd658b6c7420355e2bdd0ac20f80bca333cf1fe Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Wed, 18 Sep 2024 12:37:23 +0800 Subject: [PATCH 1/5] refine autoawq exporting code --- .../export/export_to_autogptq/export.py | 2 +- auto_round/export/export_to_awq/export.py | 83 +++++++++++++------ 2 files changed, 57 insertions(+), 28 deletions(-) diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py index f051304a..7f8a6800 100644 --- a/auto_round/export/export_to_autogptq/export.py +++ b/auto_round/export/export_to_autogptq/export.py @@ -37,7 +37,7 @@ import torch from auto_round.utils import check_to_quantized, get_block_names, \ - get_module, logger, get_layer_names_in_block, set_module + get_module, logger, set_module import copy import json import os diff --git a/auto_round/export/export_to_awq/export.py b/auto_round/export/export_to_awq/export.py index 00de4a5c..83f6824e 100644 --- a/auto_round/export/export_to_awq/export.py +++ b/auto_round/export/export_to_awq/export.py @@ -35,10 +35,46 @@ import torch import torch.nn as nn from auto_round.export.register import register_format -from auto_round.utils import convert_dtype_torch2str_hf, logger +from auto_round.utils import convert_dtype_torch2str_hf, logger, get_module, set_module import copy import json from typing import Dict, List, Optional, Union +from .utils import WQLinear_GEMM, clear_memory, get_self_modules +from concurrent.futures import ThreadPoolExecutor +import threadpoolctl as tctl +from tqdm import tqdm + + +def pack_layer(name, model, layer_config, backend, pbar): + with tctl.threadpool_limits(limits=1): + pbar.set_description(f"packing {name}") + if name == "lm_head": ##dese not support lm-head + pbar.update(1) + return + config = layer_config[name] + if config["bits"] > 8: + pbar.update(1) + return + scale, zp = config["scale"], config["zp"] + scale = scale.t().contiguous() + zp = zp.t().contiguous() + config["zp"] = config["zp"].to(torch.float32) + bits = config["bits"] + group_size = config["group_size"] + linear_layer = get_module(model, name) + q_linear = WQLinear_GEMM.from_linear( + linear=linear_layer, + w_bit=bits, + group_size=group_size, + init_only=False, + scales=scale, + zeros=zp, + ) + linear_layer.cpu() + q_linear.to("cpu") + set_module(model, name, q_linear) + clear_memory() + pbar.update(1) @register_format("auto_awq") @@ -67,36 +103,30 @@ def save_quantized_as_autoawq(output_dir, inplace=True, **kwargs): else: compressed_model = copy.deepcopy(model.to("cpu")) - from .utils import WQLinear_GEMM, clear_memory, get_self_modules + names = list(layer_config.keys()) - q_linear_module = WQLinear_GEMM self_modules = get_self_modules(compressed_model) + layers = [] for i in range(len(self_modules)): module = self_modules[i] named_linears = get_named_linears(module) for name, linear_layer in named_linears.items(): key = get_module_name(compressed_model, linear_layer) - logger.info(f"packing {name}") + layers.append(key) config = layer_config[key] if config["bits"] > 8: modules_to_not_convert.append(name) - continue - config["zp"] = config["zp"].to(torch.float32) - scale, zp = config["scale"], config["zp"] - scale = scale.t().contiguous() - zp = zp.t().contiguous() - q_linear = q_linear_module.from_linear( - linear=linear_layer, - w_bit=bits, - group_size=group_size, - init_only=False, - scales=scale, - zeros=zp, - ) - linear_layer.cpu() - q_linear.to(next(module.parameters()).device) - set_op_by_name(module, name, q_linear) - clear_memory() + + backend = None 
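+            # Note: `backend` is passed through to `pack_layer` but is not used there yet.
+            # Packing is CPU-bound: `threadpool_limits(limits=1)` inside `pack_layer` caps the
+            # BLAS/OpenMP threads of each worker at one, while the two-worker executor below
+            # overlaps the packing of different layers and drives a shared tqdm progress bar.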
+ with ThreadPoolExecutor(max_workers=2) as executor: + with tqdm(total=len(names), leave=True) as pbar: + def wrapper(name): + pack_layer(name, model, layer_config, backend, pbar) + + for _ in executor.map(wrapper, names): + pass + if output_dir is None: + return model quant_config = {} quant_config["quant_method"] = "awq" @@ -123,11 +153,11 @@ def save_quantized_as_autoawq(output_dir, inplace=True, **kwargs): def save_quantized( - model, - save_dir, - quant_config, - safetensors=True, - shard_size="5GB", + model, + save_dir, + quant_config, + safetensors=True, + shard_size="5GB", ): save_dir = save_dir[:-1] if save_dir[-1] == "/" else save_dir @@ -220,4 +250,3 @@ def get_module_name(model, module_to_find): if module is module_to_find: return name return None - From 3330c15b50628ef4fe15fd7a0a443a48ae82b043 Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Fri, 20 Sep 2024 10:52:48 +0800 Subject: [PATCH 2/5] fix 3bit and remove itrex --- README.md | 10 +-- auto_round/__main__.py | 21 +++--- auto_round/eval/evaluation.py | 10 +-- auto_round/utils.py | 5 +- examples/language-modeling/main.py | 100 ++++++++++++++--------------- requirements.txt | 1 - 6 files changed, 75 insertions(+), 72 deletions(-) diff --git a/README.md b/README.md index 1f24f900..8d29b606 100644 --- a/README.md +++ b/README.md @@ -11,13 +11,13 @@ AutoRound
AutoRound is an advanced quantization algorithm for low-bits LLM inference. It's tailored for a wide range -of models. Our method adopts sign gradient descent to fine-tune rounding values and minmax values of weights in just 200 +of models. AutoRound adopts sign gradient descent to fine-tune rounding values and minmax values of weights in just 200 steps, which competes impressively against recent methods without introducing any additional inference overhead and keeping low tuning cost. The below image presents an overview of AutoRound. Check out our paper on [arxiv](https://arxiv.org/pdf/2309.05516v4) for more details and visit [low_bit_open_llm_leaderboard](https://huggingface.co/spaces/Intel/low_bit_open_llm_leaderboard) for -more accuracy data across various models. +more accuracy data and recipes across various models.
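
As a rough illustration of the idea above, the sketch below tunes a learnable rounding offset with signed gradient steps on a toy per-tensor weight-reconstruction objective. It is not AutoRound's implementation: the real method optimizes block output error and also tunes the min/max (scale) values, and the straight-through estimator and learning rate here are assumptions made only to keep the example runnable.

```python
import torch

def round_ste(x):
    # straight-through estimator: round on the forward pass, identity gradient backward
    return x + (x.round() - x).detach()

def fake_quant(w, scale, zp, v, bits=4):
    # quantize-dequantize with a learnable rounding offset v constrained to [-0.5, 0.5]
    q = torch.clamp(round_ste(w / scale + v) + zp, 0, 2 ** bits - 1)
    return (q - zp) * scale

w = torch.randn(256, 256)
scale = (w.max() - w.min()) / (2 ** 4 - 1)      # toy per-tensor asymmetric quantization
zp = torch.round(-w.min() / scale)
v = torch.zeros_like(w, requires_grad=True)

for _ in range(200):                            # "200 steps" as described above
    loss = torch.nn.functional.mse_loss(fake_quant(w, scale, zp, v), w)
    loss.backward()
    with torch.no_grad():
        v -= 5e-3 * v.grad.sign()               # signed gradient descent on the rounding offset
        v.clamp_(-0.5, 0.5)
        v.grad.zero_()
```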
@@ -177,8 +177,8 @@ and mixed precision. However, it has not yet gained widespread community adoptio install from the source. **AutoGPTQ Format**: This format is well-suited for symmetric quantization on CUDA devices and is widely adopted by the -community. It also benefits from the Marlin kernel, which can boost inference performance notably. However, the -asymmetric kernel has issues that can cause considerable accuracy drops, particularly at 2-bit quantization and small models. +community. It also benefits from the Marlin kernel, which can boost inference performance notably. However, **the +asymmetric kernel has issues** that can cause considerable accuracy drops, particularly at 2-bit quantization and small models. Additionally, symmetric quantization tends to perform poorly at 2-bit precision. **AutoAWQ format**: This format is well-suited for asymmetric 4-bit quantization on CUDA devices and is widely adopted @@ -206,7 +206,7 @@ print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) ### AutoRound format -**CPU**: no extra operations +**CPU**: pip install intel-extension-for-transformers **HPU**: docker image with Gaudi Software Stack is recommended. More details can be found in [Gaudi Guide](https://docs.habana.ai/en/latest/). diff --git a/auto_round/__main__.py b/auto_round/__main__.py index e065cead..b14c570a 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -27,7 +27,7 @@ from auto_round import AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate from auto_round.utils import detect_device, get_library_version, detect_device_count - +from auto_round.utils import logger def setup_parser(): parser = argparse.ArgumentParser() @@ -48,7 +48,7 @@ def setup_parser(): parser.add_argument("--batch_size", default=8, type=int, help="train batch size") - parser.add_argument("--eval_bs", default=1, type=int, + parser.add_argument("--eval_bs", default=None, type=int, help="eval batch size") parser.add_argument("--device", default="auto", type=str, @@ -164,7 +164,7 @@ def tune(args): model_name = args.model if model_name[-1] == "/": model_name = model_name[:-1] - print(model_name, flush=True) + logger.info(f"start to quantize {model_name}") device_str = detect_device(args.device) torch_dtype = "auto" @@ -231,8 +231,7 @@ def tune(args): if hasattr(tokenizer, "model_max_length"): if tokenizer.model_max_length < seqlen: - print(f"change sequence length to {tokenizer.model_max_length} due to the limitation of model_max_length", - flush=True) + logger.info(f"change sequence length to {tokenizer.model_max_length} due to the limitation of model_max_length") seqlen = min(seqlen, tokenizer.model_max_length) args.seqlen = seqlen @@ -248,7 +247,7 @@ def tune(args): if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D): if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: layer_config[n] = {"bits": 32} - print( + logger.info( f"{n} will not be quantized due to its shape not being divisible by 32," " resulting in an exporting issue to autogptq") fp_layers_list = args.fp_layers_list.split(",") @@ -258,7 +257,7 @@ def tune(args): name = n.split('.')[-1] if n in fp_layers_list or name in fp_layers_list: layer_config[n] = {"bits": 32} - print( + logger.info( f"{n} will not be quantized.") lm_head_layer_name = "lm_head" for n, _ in model.named_modules(): @@ -271,8 +270,8 @@ def tune(args): for item in tied_keys: if lm_head_layer_name in item: ##TODO extend to encoder-decoder layer, seq classification model 
args.quant_lm_head = False - print( - f"warning, disable quant_lm_head as quantizing lm_head with tied weights has not been " + logger.warning( + f"reset `quant_lm_head` to `False` as quantizing lm_head with tied weights has not been " f"supported currently") break if args.quant_lm_head: @@ -316,7 +315,7 @@ def tune(args): tasks = tasks.split(',') if not args.disable_eval: - print(f"Using the latest {lm_eval_version}") + logger.info(f"Using lm-eval version {lm_eval_version}") model_args = f"pretrained={eval_folder}" model_args = model_args + f",trust_remote_code={not args.disable_trust_remote_code}" user_model = None @@ -350,6 +349,8 @@ def eval(args): def run(): args = setup_parser() + if args.eval_bs is None: + args.eval_bs = "auto" if args.eval: eval(args) else: diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py index 8e50d889..9d1dcdb2 100644 --- a/auto_round/eval/evaluation.py +++ b/auto_round/eval/evaluation.py @@ -19,17 +19,19 @@ import lm_eval from lm_eval import simple_evaluate as lm_simple_evaluate +import os + +os.environ["TOKENIZERS_PARALLELISM"] = "false" def simple_evaluate( model, model_args: Optional[Union[str, dict]] = None, - user_model = None, + user_model=None, batch_size: Optional[int] = None, max_batch_size: Optional[int] = None, device: Optional[str] = None, **kwargs): - try: from auto_round import AutoRoundConfig except: @@ -37,7 +39,7 @@ def simple_evaluate( if model_args is None: model_args = "" - + if isinstance(model_args, dict): lm = lm_eval.api.registry.get_model(model).create_from_arg_obj( model_args, @@ -66,5 +68,3 @@ def simple_evaluate( max_batch_size=max_batch_size, device=device, **kwargs) - - diff --git a/auto_round/utils.py b/auto_round/utils.py index d180980c..c0354318 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -476,6 +476,7 @@ def detect_device(device=None): Returns: str: The device to use for computations, formatted as a string. """ + def is_valid_digit(s): try: num = int(s) @@ -912,6 +913,8 @@ def get_autogptq_packing_qlinear(backend, bits=4, group_size=128, sym=False): class: The dynamically imported QuantLinear class configured according to the specified parameters. 
""" use_triton = True + if bits not in [2, 4, 8]: + use_triton = False disable_exllamav2 = True disable_exllamav1 = False disable_marlin = True @@ -966,4 +969,4 @@ def get_autogptq_packing_qlinear(backend, bits=4, group_size=128, sym=False): use_qigen=use_qigen, use_marlin=not disable_marlin, ) - return QuantLinear \ No newline at end of file + return QuantLinear diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py index 1b78d3d6..e40ed069 100644 --- a/examples/language-modeling/main.py +++ b/examples/language-modeling/main.py @@ -43,7 +43,7 @@ parser.add_argument("--train_bs", default=8, type=int, help="train batch size") - parser.add_argument("--eval_bs", default=4, type=int, + parser.add_argument("--eval_bs", default=None, type=int, help="eval batch size") parser.add_argument("--device", default="auto", type=str, @@ -390,6 +390,9 @@ print('does not support cpu, xpu model evaluation.') exit() ## does not support cpu,xpu model eval + if args.disable_eval: + exit() + from packaging.version import Version from auto_round.utils import get_library_version @@ -402,55 +405,52 @@ use_eval_legacy = False from eval_legacy import eval_model - use_qdq = False - if args.deployment_device and 'fake' in args.deployment_device: - use_qdq = True - if args.format and ('fake' in args.format or 'qdq' in args.format): - use_qdq = True - # evaluation - if not args.disable_eval: + if use_eval_legacy: + print("Using the legacy lm_eval(0.3.0)") + else: + print(f"Using the lm_eval version {lm_eval_version}") + + if isinstance(tasks, str): + tasks = tasks.split(',') + + if lm_eval_version < Version("0.4.2"): + if args.eval_bs is None: + args.eval_bs = 1 if use_eval_legacy: - print("Using the legacy lm_eval(0.3.0)") - else: - print(f"Using the latest {lm_eval_version}") - - if isinstance(tasks, str): - tasks = tasks.split(',') - - if use_qdq and lm_eval_version < Version("0.4.2"): - if use_eval_legacy: - if "mmlu" in tasks: - tmp_tasks = tasks - tasks = ["hendrycksTest-*" if x == "mmlu" else x for x in tmp_tasks] - if "truthfulqa_mc1" in tasks or "truthfulqa_mc2" in tasks: - tmp_tasks = tasks - tasks = ["truthfulqa_mc" if "truthfulqa_mc" in x else x for x in tmp_tasks] - seen = set() + if "mmlu" in tasks: + tmp_tasks = tasks + tasks = ["hendrycksTest-*" if x == "mmlu" else x for x in tmp_tasks] + if "truthfulqa_mc1" in tasks or "truthfulqa_mc2" in tasks: tmp_tasks = tasks - tasks = [x for x in tmp_tasks if not (x in seen or seen.add(x))] - - excel_name = f"{output_dir}_result.xlsx" - output_dir += "/" - print(excel_name, flush=True) - eval_model( - model_path=output_dir, tasks=tasks, dtype=dtype, limit=None, - eval_bs=args.eval_bs, use_accelerate=args.low_gpu_mem_usage, - device=torch_device, excel_file=excel_name, - trust_remote_code=not args.disable_trust_remote_code) - - if lm_eval_version >= Version("0.4.2"): - from eval.evaluation import simple_evaluate - - model_args = f"pretrained={eval_folder}" - model_args = model_args + f",trust_remote_code={not args.disable_trust_remote_code}" - user_model = None - if args.act_bits <= 8: - user_model = model.to(device_str) - - res = simple_evaluate(model="hf", model_args=model_args, - tasks=tasks, - batch_size=args.eval_bs, user_model=user_model) - from lm_eval.utils import make_table - - print(make_table(res)) + tasks = ["truthfulqa_mc" if "truthfulqa_mc" in x else x for x in tmp_tasks] + seen = set() + tmp_tasks = tasks + tasks = [x for x in tmp_tasks if not (x in seen or seen.add(x))] + + excel_name = f"{output_dir}_result.xlsx" + 
output_dir += "/" + print(excel_name, flush=True) + eval_model( + model_path=output_dir, tasks=tasks, dtype=dtype, limit=None, + eval_bs=args.eval_bs, use_accelerate=args.low_gpu_mem_usage, + device=torch_device, excel_file=excel_name, + trust_remote_code=not args.disable_trust_remote_code) + + if lm_eval_version >= Version("0.4.2"): + if args.eval_bs is None: + args.eval_bs = "auto" + from eval.evaluation import simple_evaluate + + model_args = f"pretrained={eval_folder}" + model_args = model_args + f",trust_remote_code={not args.disable_trust_remote_code}" + user_model = None + if args.act_bits <= 8: + user_model = model.to(device_str) + + res = simple_evaluate(model="hf", model_args=model_args, + tasks=tasks, + batch_size=args.eval_bs, user_model=user_model) + from lm_eval.utils import make_table + + print(make_table(res)) diff --git a/requirements.txt b/requirements.txt index 987d26b6..0cc1327b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,5 @@ triton numpy < 2.0 threadpoolctl lm-eval==0.4.4 -intel-extension-for-transformers tqdm packaging \ No newline at end of file From 87f33c88d8515f4c29712598d9cf25369e99565c Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Fri, 20 Sep 2024 11:05:55 +0800 Subject: [PATCH 3/5] update readme and fix preci issue --- README.md | 33 +++++++++++++++++++-------------- auto_round/__main__.py | 6 ++++-- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 8d29b606..7c53e926 100644 --- a/README.md +++ b/README.md @@ -48,9 +48,8 @@ pip install -vvv --no-build-isolation -e . pip install auto-round ``` - - ## Model Quantization + ### API Usage (Gaudi2/CPU/GPU) ```python @@ -130,8 +129,9 @@ autoround.save_quantized(output_dir, format='auto_round', inplace=True) ### Basic Usage (version > 0.3.0) -A user guide detailing the full list of supported arguments is provided by calling ```auto_round -h``` on the terminal. Alternatively, you can use ```auto-round``` instead of ```auto_round```. +A user guide detailing the full list of supported arguments is provided by calling ```auto_round -h``` on the terminal. +Alternatively, you can use ```auto-round``` instead of ```auto_round```. ```bash auto_round --model facebook/opt-125m \ @@ -141,6 +141,7 @@ auto_round --model facebook/opt-125m \ --disable_eval \ --output_dir ./tmp_autoround ``` + We provide two recipes for best accuracy and fast running speed with low memory. Details as below.
Other Recipes @@ -167,30 +168,35 @@ We provide two recipes for best accuracy and fast running speed with low memory. --batch_size 4 \ --disable_eval ``` +
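
For API users, a Python equivalent of the fast, low-memory recipe above looks roughly like the sketch below. The keyword names are assumed to mirror the CLI flags (`nsamples`, `iters`, `seqlen`, `batch_size`); check them against the installed auto-round version.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# keyword arguments assumed to mirror the CLI flags shown above
autoround = AutoRound(model, tokenizer, bits=4, group_size=128,
                      nsamples=128, iters=200, seqlen=512, batch_size=4)
autoround.quantize()
autoround.save_quantized("./tmp_autoround", format="auto_round", inplace=True)
```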
#### Formats -**AutoRound format**:This format is well-suited for CPU and HPU devices, as well as mixed-precision inference. It +**AutoRound format**:This format is well-suited for CPU, HPU devices, 2 bits, as well as mixed-precision inference.[2,4] +bits are supported. It resolves the asymmetric quantization kernel issues found in the AutoGPTQ format and supports both LM-head quantization and mixed precision. However, it has not yet gained widespread community adoption. For CUDA support, you will need to -install from the source. +install from the source. [2,4] bits are supported **AutoGPTQ Format**: This format is well-suited for symmetric quantization on CUDA devices and is widely adopted by the -community. It also benefits from the Marlin kernel, which can boost inference performance notably. However, **the -asymmetric kernel has issues** that can cause considerable accuracy drops, particularly at 2-bit quantization and small models. +community, [2,3,4,8] bits are supported, for 3 bits, pip install auto-gptq first before quantization. It also benefits +from the Marlin kernel, which can boost inference performance notably. However, **the +asymmetric kernel has issues** that can cause considerable accuracy drops, particularly at 2-bit quantization and small +models. Additionally, symmetric quantization tends to perform poorly at 2-bit precision. **AutoAWQ format**: This format is well-suited for asymmetric 4-bit quantization on CUDA devices and is widely adopted -within the community. Asymmetric quantization typically improves accuracy but may reduce inference speed. It features -specialized layer fusion tailored for Llama models. However, it supports only 4-bit asymmetric quantization. +within the community, only 4-bits asymmetric quantization is supported. Asymmetric quantization typically improves +accuracy but may reduce inference speed. It features +specialized layer fusion tailored for Llama models. ## Model Inference - Please run the quantization code first ### AutoGPTQ/AutoAWQ format + ```python from transformers import AutoModelForCausalLM, AutoTokenizer @@ -203,7 +209,6 @@ inputs = tokenizer(text, return_tensors="pt").to(model.device) print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) ``` - ### AutoRound format **CPU**: pip install intel-extension-for-transformers @@ -214,7 +219,6 @@ in [Gaudi Guide](https://docs.habana.ai/en/latest/). **CUDA**: git clone https://github.com/intel/auto-round.git && cd auto-round && pip install -vvv --no-build-isolation -e . - #### CPU/HPU/CUDA on 0.3.0+ ```python @@ -223,7 +227,7 @@ from auto_round import AutoRoundConfig device = "auto" ##cpu, hpu, cuda quantization_config = AutoRoundConfig( - backend=device + backend=device ) quantized_model_path = "./tmp_autoround" model = AutoModelForCausalLM.from_pretrained(quantized_model_path, @@ -248,6 +252,7 @@ text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) ``` +
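
To target the different formats compared above, the export format is selected when saving the tuned model. A sketch, assuming `autoround` is an already-tuned `AutoRound` object: the `auto_round` and `auto_awq` format strings appear in this patch series, while `auto_gptq` is an assumption to verify against the registered exporters.

```python
# assumes `autoround` is an already-tuned AutoRound object
autoround.save_quantized("./tmp_autoround", format="auto_round", inplace=False)  # CPU/HPU, mixed bits
autoround.save_quantized("./tmp_autoawq", format="auto_awq", inplace=False)      # CUDA, 4-bit asymmetric
autoround.save_quantized("./tmp_autogptq", format="auto_gptq", inplace=False)    # CUDA, symmetric kernels
```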
Evaluation @@ -259,8 +264,8 @@ auto_round --model saved_quantized_model \ --task lambada_openai \ --eval_bs 1 ``` -
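
The same evaluation can be driven from Python through the `simple_evaluate` wrapper touched in these patches; a minimal sketch (the model path and task list are placeholders):

```python
from auto_round.eval.evaluation import simple_evaluate
from lm_eval.utils import make_table

model_args = "pretrained=./saved_quantized_model,trust_remote_code=True"
res = simple_evaluate(model="hf",
                      model_args=model_args,
                      tasks=["lambada_openai"],
                      batch_size="auto")  # the CLI now falls back to "auto" when eval_bs is unset
print(make_table(res))
```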
+ ## Support List diff --git a/auto_round/__main__.py b/auto_round/__main__.py index b14c570a..3de56571 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -27,7 +27,8 @@ from auto_round import AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate from auto_round.utils import detect_device, get_library_version, detect_device_count -from auto_round.utils import logger +from auto_round.utils import logger + def setup_parser(): parser = argparse.ArgumentParser() @@ -231,7 +232,8 @@ def tune(args): if hasattr(tokenizer, "model_max_length"): if tokenizer.model_max_length < seqlen: - logger.info(f"change sequence length to {tokenizer.model_max_length} due to the limitation of model_max_length") + logger.info( + f"change sequence length to {tokenizer.model_max_length} due to the limitation of model_max_length") seqlen = min(seqlen, tokenizer.model_max_length) args.seqlen = seqlen From 180aff90deb258f161287c4343a754d0e372aa3b Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Fri, 20 Sep 2024 11:07:01 +0800 Subject: [PATCH 4/5] fix typo --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7c53e926..c9637df2 100644 --- a/README.md +++ b/README.md @@ -173,11 +173,11 @@ We provide two recipes for best accuracy and fast running speed with low memory. #### Formats -**AutoRound format**:This format is well-suited for CPU, HPU devices, 2 bits, as well as mixed-precision inference.[2,4] +**AutoRound format**:This format is well-suited for CPU, HPU devices, 2 bits, as well as mixed-precision inference. [2,4] bits are supported. It resolves the asymmetric quantization kernel issues found in the AutoGPTQ format and supports both LM-head quantization and mixed precision. However, it has not yet gained widespread community adoption. For CUDA support, you will need to -install from the source. [2,4] bits are supported +install from the source. **AutoGPTQ Format**: This format is well-suited for symmetric quantization on CUDA devices and is widely adopted by the community, [2,3,4,8] bits are supported, for 3 bits, pip install auto-gptq first before quantization. It also benefits From a24e84353110b08b568c0c7076e804e696126303 Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Fri, 20 Sep 2024 11:46:39 +0800 Subject: [PATCH 5/5] fix unit test --- test/test_export.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_export.py b/test/test_export.py index aa23ff58..c7d576f5 100644 --- a/test/test_export.py +++ b/test/test_export.py @@ -111,6 +111,10 @@ def test_autoround_format(self): quantized_model_path = "./saved" autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") + try: + import intel_extension_for_transformers + except: + return from auto_round.auto_quantizer import AutoHfQuantizer model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto")
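
For reference, the per-layer packing pattern introduced in the first patch (a small thread pool fanning out `pack_layer` calls while `threadpoolctl` caps BLAS threads per worker) can be reproduced in isolation roughly as below; `pack_one` is a hypothetical stand-in for the real `WQLinear_GEMM` packing.

```python
from concurrent.futures import ThreadPoolExecutor

import threadpoolctl as tctl
from tqdm import tqdm

def pack_one(name):
    # hypothetical stand-in for WQLinear_GEMM.from_linear(...); keep each worker single-threaded
    with tctl.threadpool_limits(limits=1):
        pass  # pack the layer called `name` here

def pack_all(names, max_workers=2):
    with tqdm(total=len(names), leave=True) as pbar:
        def wrapper(name):
            pbar.set_description(f"packing {name}")
            pack_one(name)
            pbar.update(1)

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # consume the map() generator so worker exceptions are re-raised here
            for _ in executor.map(wrapper, names):
                pass

pack_all([f"model.decoder.layers.{i}.self_attn.q_proj" for i in range(4)])
```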