From 52ec5615ec37c927d4e170980c0b0e8d9662a353 Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Fri, 15 Nov 2024 10:34:11 +0800 Subject: [PATCH 1/7] fix merge error --- auto_round/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/backend.py b/auto_round/backend.py index 6afbf1ef..66c5e667 100644 --- a/auto_round/backend.py +++ b/auto_round/backend.py @@ -404,7 +404,7 @@ def get_autogptq_infer_linear(backend, bits=4, group_size=128, sym=False): from packaging.version import Version # Import the appropriate QuantLinear based on the version of auto_gptq - if Version(version) <= Version("0.7.1"): + if Version(version) < Version("0.7.2"): QuantLinear = dynamically_import_QuantLinear( use_triton=use_triton, desc_act=False, From 7af3e8ad3bb03f33920801e22e8f72ef20872e1b Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Fri, 15 Nov 2024 11:27:17 +0800 Subject: [PATCH 2/7] fix fp_layers issues --- auto_round/auto_quantizer.py | 7 +++++-- auto_round/script/llm.py | 26 +++++++++++++++++--------- auto_round/script/mllm.py | 17 +++++++++++++++++ examples/language-modeling/main.py | 26 +++++++++++++++++--------- 4 files changed, 56 insertions(+), 20 deletions(-) diff --git a/auto_round/auto_quantizer.py b/auto_round/auto_quantizer.py index 7cbcd2ea..e859f2ee 100644 --- a/auto_round/auto_quantizer.py +++ b/auto_round/auto_quantizer.py @@ -397,6 +397,10 @@ def convert_model(self, model: nn.Module): if ("hpu" == target_device or "cpu" == target_device) and model.dtype != torch.bfloat16: logger.info(f"Change the dtype to `bfloat16` as {target_device.upper()} does not support float16") model = model.to(torch.bfloat16) + else: + if model.dtype != torch.float16: + logger.info(f"Change the dtype to `float16` for better performance") + model = model.to(torch.float16) bits = quantization_config.bits group_size = quantization_config.group_size @@ -404,7 +408,7 @@ def convert_model(self, model: nn.Module): "data_type") else "int" # pragma: no cover sym = quantization_config.sym to_quant_block_names = quantization_config.to_quant_block_names if hasattr(quantization_config, - "to_quant_block_names") else None + "to_quant_block_names") else None layer_names = get_layer_names_in_block(model, to_quant_block_names=to_quant_block_names) extra_config = {} @@ -741,4 +745,3 @@ def is_serializable(self): transformers.quantizers.auto.AutoHfQuantizer = AutoHfQuantizer transformers.modeling_utils.AutoHfQuantizer = AutoHfQuantizer - diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py index 03700d3e..536bac44 100644 --- a/auto_round/script/llm.py +++ b/auto_round/script/llm.py @@ -142,12 +142,13 @@ def __init__(self, *args, **kwargs): self.add_argument("--not_use_best_mse", action='store_true', help="whether to use the iter of best mes loss in the tuning phase") - + self.add_argument("--to_quant_block_names", default=None, type=str, help="Names of quantitative blocks, please use commas to separate them.") self.add_argument("--enable_torch_compile", default=None, type=bool, - help="whether to enable torch compile") + help="whether to enable torch compile") + def setup_parser(): parser = BasicArgumentParser() @@ -213,7 +214,6 @@ def setup_fast_parser(): parser.add_argument("--nsamples", default=128, type=int, help="number of samples") - args = parser.parse_args() return args @@ -366,15 +366,24 @@ def tune(args): logger.info( f"{n} will not be quantized due to its shape not being divisible by 32," " resulting in an exporting issue to autogptq") - fp_layers = args.fp_layers.split(",") - if 
bool(fp_layers): + + layer_config = {} + if args.fp_layers != "": + fp_layers = args.fp_layers.replace(" ", "").split(",") for n, m in model.named_modules(): - if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D): - name = n.split('.')[-1] - if n in fp_layers or name in fp_layers: + if not isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)): + continue + for fp_layer in fp_layers: + if fp_layer in n: layer_config[n] = {"bits": 16} logger.info( f"{n} will not be quantized.") + if len(layer_config) > 0: + for format in formats: + if "auto_round" not in format: + ##TODO gptq, awq could support some mixed precision config + logger.warning(f"mixed precision exporting does not support {format} currently") + lm_head_layer_name = "lm_head" for n, _ in model.named_modules(): lm_head_layer_name = n @@ -507,4 +516,3 @@ def eval(args): from lm_eval.utils import make_table # pylint: disable=E0401 print(make_table(res)) - diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py index 9afcc1d4..95807c8d 100644 --- a/auto_round/script/mllm.py +++ b/auto_round/script/mllm.py @@ -300,7 +300,24 @@ def tune(args): model = model.to(torch.bfloat16) round = AutoRoundMLLM + layer_config = {} + if args.fp_layers != "": + fp_layers = args.fp_layers.replace(" ", "").split(",") + for n, m in model.named_modules(): + if not isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)): + continue + for fp_layer in fp_layers: + if fp_layer in n: + layer_config[n] = {"bits": 16} + logger.info( + f"{n} will not be quantized.") + if len(layer_config) > 0: + for format in formats: + if "auto_round" not in format: + ##TODO gptq, awq could support some mixed precision config + logger.warning(f"mixed precision exporting does not support {format} currently") + for n, m in model.named_modules(): if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D): if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py index f0594f5d..25c12e00 100644 --- a/examples/language-modeling/main.py +++ b/examples/language-modeling/main.py @@ -300,21 +300,29 @@ round = AutoRoundAdam layer_config = {} + if args.fp_layers != "": + fp_layers = args.fp_layers.replace(" ", "").split(",") + for n, m in model.named_modules(): + if not isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)): + continue + for fp_layer in fp_layers: + if fp_layer in n: + layer_config[n] = {"bits": 16} + print( + f"{n} will not be quantized.") + if len(layer_config) > 0: + for format in formats: + if "auto_round" not in format: + ##TODO gptq, awq could support some mixed precision config + print(f"mixed precision exporting does not support {format} currently") + for n, m in model.named_modules(): if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D): if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: layer_config[n] = {"bits": 16} print( f"{n} will not be quantized due to its shape not being divisible by 32, resulting in an exporting issue to autogptq") - fp_layers = args.fp_layers.split(",") - if bool(fp_layers): - for n, m in model.named_modules(): - if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D): - name = n.split('.')[-1] - if n in fp_layers or name in fp_layers: - layer_config[n] = {"bits": 16} - print( - f"{n} will not be quantized.") + lm_head_layer_name = "lm_head" for n, _ in 
model.named_modules(): lm_head_layer_name = n From d44266a7cfcae1cccdacd7a1133426ee9b4d447e Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Fri, 15 Nov 2024 11:44:58 +0800 Subject: [PATCH 3/7] Loosen the restrictions of lm-eval --- requirements-cpu.txt | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-cpu.txt b/requirements-cpu.txt index c18eda82..a59c07cf 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -7,7 +7,7 @@ transformers>=4.38 triton numpy < 2.0 threadpoolctl -lm-eval>=0.4.2,<=0.4.5 +lm-eval>=0.4.2,<0.5 tqdm packaging auto-gptq>=0.7.1 diff --git a/requirements.txt b/requirements.txt index 698d8c34..31fc02dd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ transformers>=4.38 triton numpy < 2.0 threadpoolctl -lm-eval>=0.4.2,<=0.4.5 +lm-eval>=0.4.2,<0.5 tqdm packaging auto-gptq>=0.7.1 From b8331ec6981c4e3d90bc85c5d1ba2cf229da7e48 Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Fri, 15 Nov 2024 12:45:57 +0800 Subject: [PATCH 4/7] fix and add ut --- README.md | 32 +++++++++------- auto_round/script/llm.py | 2 +- auto_round/script/mllm.py | 2 +- examples/language-modeling/main.py | 2 +- test/test_autoround.py | 61 ++++++++++++++++++++++++++++++ 5 files changed, 82 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 9790fc49..6fff23fe 100644 --- a/README.md +++ b/README.md @@ -41,20 +41,6 @@ more accuracy data and recipes across various models. ## Installation -### Build from Source - -```bash -pip install -r requirements.txt - -# GPU -pip install -vvv --no-build-isolation -e . - -# CPU -pip install -vvv --no-build-isolation -e .[cpu] - -# HPU -pip install -vvv --no-build-isolation -e .[hpu] -``` ### Install from pypi @@ -69,6 +55,24 @@ pip install auto-round[cpu] pip install auto-round[hpu] ``` + +
+### Build from Source
+
+```bash
+pip install -r requirements.txt
+
+# GPU
+pip install -vvv --no-build-isolation -e .
+
+# CPU
+pip install -vvv --no-build-isolation -e .[cpu]
+
+# HPU
+pip install -vvv --no-build-isolation -e .[hpu]
+```
+
+ ## Model Quantization ### Basic Usage (Gaudi2/CPU/GPU) diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py index 536bac44..ee87c33e 100644 --- a/auto_round/script/llm.py +++ b/auto_round/script/llm.py @@ -380,7 +380,7 @@ def tune(args): f"{n} will not be quantized.") if len(layer_config) > 0: for format in formats: - if "auto_round" not in format: + if "auto_round" not in format and "fake" not in format: ##TODO gptq, awq could support some mixed precision config logger.warning(f"mixed precision exporting does not support {format} currently") diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py index 70f87012..17fe3caa 100644 --- a/auto_round/script/mllm.py +++ b/auto_round/script/mllm.py @@ -315,7 +315,7 @@ def tune(args): f"{n} will not be quantized.") if len(layer_config) > 0: for format in formats: - if "auto_round" not in format: + if "auto_round" not in format and "fake" not in format: ##TODO gptq, awq could support some mixed precision config logger.warning(f"mixed precision exporting does not support {format} currently") diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py index 25c12e00..d3928378 100644 --- a/examples/language-modeling/main.py +++ b/examples/language-modeling/main.py @@ -312,7 +312,7 @@ f"{n} will not be quantized.") if len(layer_config) > 0: for format in formats: - if "auto_round" not in format: + if "auto_round" not in format and "fake" not in format: ##TODO gptq, awq could support some mixed precision config print(f"mixed precision exporting does not support {format} currently") diff --git a/test/test_autoround.py b/test/test_autoround.py index 20e5ea44..9742bfad 100644 --- a/test/test_autoround.py +++ b/test/test_autoround.py @@ -306,6 +306,67 @@ def test_fp32(self): ) autoround.quantize() + def test_fallback_layers(self): + bits, group_size, sym = 4, 128, True + model_name = "facebook/opt-125m" + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True, + device_map='auto') + layer_config = {"model.layers.0": {"bits": "16"}, + "model.layers.0": {"bits": "16"}} + autoround = AutoRound( + model, + self.tokenizer, + bits=bits, + group_size=group_size, + sym=sym, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + layer_config=layer_config + ) + autoround.quantize() + quantized_model_path = "./saved" + + autoround.save_quantized(output_dir=quantized_model_path, format="auto_round", inplace=True) + + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, + device_map="cpu") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + text = "There is a girl who likes adventure," + inputs = tokenizer(text, return_tensors="pt").to(model.device) + res = tokenizer.decode(model.generate(**inputs, max_new_tokens=1)[0]) + + +def test_fallback_blocks(self): + bits, group_size, sym = 4, 128, True + model_name = "facebook/opt-125m" + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True, + device_map='auto') + layer_config = {"model.layers.0.self_attn.q_proj": {"bits": "16"}, + "model.layers.0.self_attn.k_proj": {"bits": "16"}} + autoround = AutoRound( + model, + self.tokenizer, + bits=bits, + group_size=group_size, + sym=sym, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + layer_config=layer_config + ) + autoround.quantize() + quantized_model_path = "./saved" + + autoround.save_quantized(output_dir=quantized_model_path, format="auto_round", inplace=True) + + model = 
AutoModelForCausalLM.from_pretrained(quantized_model_path, + device_map="cpu") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + text = "There is a girl who likes adventure," + inputs = tokenizer(text, return_tensors="pt").to(model.device) + res = tokenizer.decode(model.generate(**inputs, max_new_tokens=1)[0]) + if __name__ == "__main__": From c13fc3f0addcceb49490ab60a0c4b9ee943104f9 Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Fri, 15 Nov 2024 12:49:33 +0800 Subject: [PATCH 5/7] fix --- test/test_autoround.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_autoround.py b/test/test_autoround.py index 9742bfad..cd636c61 100644 --- a/test/test_autoround.py +++ b/test/test_autoround.py @@ -306,13 +306,13 @@ def test_fp32(self): ) autoround.quantize() - def test_fallback_layers(self): + def test_fallback_blocks(self): bits, group_size, sym = 4, 128, True model_name = "facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map='auto') layer_config = {"model.layers.0": {"bits": "16"}, - "model.layers.0": {"bits": "16"}} + "model.layers.1": {"bits": "16"}} autoround = AutoRound( model, self.tokenizer, @@ -337,13 +337,13 @@ def test_fallback_layers(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=1)[0]) -def test_fallback_blocks(self): +def test_fallback_layers(self): bits, group_size, sym = 4, 128, True model_name = "facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map='auto') layer_config = {"model.layers.0.self_attn.q_proj": {"bits": "16"}, - "model.layers.0.self_attn.k_proj": {"bits": "16"}} + "model.layers.1.self_attn.k_proj": {"bits": "16"}} autoround = AutoRound( model, self.tokenizer, From 1211ab2db79ac86d0aecf403194923196c5b886a Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Fri, 15 Nov 2024 16:38:03 +0800 Subject: [PATCH 6/7] API usage does not support fuzzy match --- test/test_autoround.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/test/test_autoround.py b/test/test_autoround.py index cd636c61..0541259b 100644 --- a/test/test_autoround.py +++ b/test/test_autoround.py @@ -306,36 +306,6 @@ def test_fp32(self): ) autoround.quantize() - def test_fallback_blocks(self): - bits, group_size, sym = 4, 128, True - model_name = "facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True, - device_map='auto') - layer_config = {"model.layers.0": {"bits": "16"}, - "model.layers.1": {"bits": "16"}} - autoround = AutoRound( - model, - self.tokenizer, - bits=bits, - group_size=group_size, - sym=sym, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - layer_config=layer_config - ) - autoround.quantize() - quantized_model_path = "./saved" - - autoround.save_quantized(output_dir=quantized_model_path, format="auto_round", inplace=True) - - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, - device_map="cpu") - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) - text = "There is a girl who likes adventure," - inputs = tokenizer(text, return_tensors="pt").to(model.device) - res = tokenizer.decode(model.generate(**inputs, max_new_tokens=1)[0]) - def test_fallback_layers(self): bits, group_size, sym = 4, 128, True From 523a316e495dc276270b9391b98cea4a101a6e3e Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Tue, 19 Nov 2024 
10:08:50 +0800 Subject: [PATCH 7/7] bugfix of UT Signed-off-by: Zhang, Weiwei1 --- auto_round/auto_quantizer.py | 1 + auto_round/autoround.py | 3 ++- auto_round/utils.py | 6 ++++-- test/test_autoround.py | 9 +++++---- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/auto_round/auto_quantizer.py b/auto_round/auto_quantizer.py index bb554b56..b58c84e7 100644 --- a/auto_round/auto_quantizer.py +++ b/auto_round/auto_quantizer.py @@ -745,3 +745,4 @@ def is_serializable(self): transformers.quantizers.auto.AutoHfQuantizer = AutoHfQuantizer transformers.modeling_utils.AutoHfQuantizer = AutoHfQuantizer + diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 2ce7abdf..fe904016 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -331,7 +331,7 @@ def quantize(self): unquantized_layers = [] for n, m in self.model.named_modules(): if isinstance(m, tuple(self.supported_types)): - if m.bits > 8: + if int(m.bits) > 8: unquantized_layers.append(n) else: quantized_layers.append(n) @@ -1681,3 +1681,4 @@ def __init__( **kwargs, ) + diff --git a/auto_round/utils.py b/auto_round/utils.py index 11d25c7a..140cf007 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -491,11 +491,12 @@ def check_to_quantized(config): False otherwise. """ if isinstance(config, dict): - if config["bits"] > 8: + + if int(config["bits"]) > 8: return False return True else: - if config.bits > 8: + if int(config.bits) > 8: return False return True @@ -978,3 +979,4 @@ def compile_func(fun, device, enable_torch_compile): else: return compile_func_on_cuda_or_cpu(fun, enable_torch_compile) + diff --git a/test/test_autoround.py b/test/test_autoround.py index 0541259b..da857531 100644 --- a/test/test_autoround.py +++ b/test/test_autoround.py @@ -307,13 +307,13 @@ def test_fp32(self): autoround.quantize() -def test_fallback_layers(self): + def test_fallback_layers(self): bits, group_size, sym = 4, 128, True model_name = "facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map='auto') - layer_config = {"model.layers.0.self_attn.q_proj": {"bits": "16"}, - "model.layers.1.self_attn.k_proj": {"bits": "16"}} + layer_config = {"model.decoder.layers.0.self_attn.q_proj": {"bits": "16"}, + "model.decoder.layers.1.self_attn.k_proj": {"bits": "16"}} autoround = AutoRound( model, self.tokenizer, @@ -331,7 +331,7 @@ def test_fallback_layers(self): autoround.save_quantized(output_dir=quantized_model_path, format="auto_round", inplace=True) model = AutoModelForCausalLM.from_pretrained(quantized_model_path, - device_map="cpu") + device_map='auto') tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) @@ -341,3 +341,4 @@ def test_fallback_layers(self): if __name__ == "__main__": unittest.main() +
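
Taken together, patches 2, 4, 6, and 7 pin down how per-layer fallback is expressed: the CLI matches `--fp_layers` entries as substrings of module names, while the Python API expects exact module names mapped to `{"bits": 16}` in `layer_config` (no fuzzy matching, per patch 6/7). Below is a minimal sketch of the API path, mirroring `test_fallback_layers` above; the model name, layer paths, and `save_quantized` call come from that test, while the tokenizer setup and reliance on the default calibration dataset are illustrative assumptions rather than part of the patches.

```python
# Sketch: 4-bit quantization of facebook/opt-125m while keeping two attention
# projections at 16 bits via layer_config (exact module names, as in
# test_fallback_layers). Tokenizer loading and the default calibration dataset
# are assumptions for illustration.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRound

model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Modules listed here fall back to 16 bits and are left unquantized.
layer_config = {
    "model.decoder.layers.0.self_attn.q_proj": {"bits": 16},
    "model.decoder.layers.1.self_attn.k_proj": {"bits": 16},
}

autoround = AutoRound(
    model,
    tokenizer,
    bits=4,
    group_size=128,
    sym=True,
    layer_config=layer_config,
)
autoround.quantize()

# Per patches 2 and 4, mixed precision currently only round-trips through the
# auto_round (or fake) export format; other formats trigger a warning.
autoround.save_quantized(output_dir="./saved", format="auto_round", inplace=True)
```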