From e87c95f25d3fe0e286e832857974ce36d43b2f96 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Tue, 23 Apr 2024 10:58:31 +0800
Subject: [PATCH] Fix `weight_only` algorithms import (#1742)

Signed-off-by: Kaihui-intel
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../quantization/llm/requirements.txt         |  2 +-
 .../quantization/llm/run_clm_no_trainer.py    | 36 +++++++++++--------
 .../torch/algorithms/weight_only/__init__.py  |  9 -----
 .../torch/algorithms/weight_only/modules.py   |  2 +-
 .../torch/quantization/algorithm_entry.py     | 12 +++----
 .../quantization/weight_only/test_gptq.py     |  2 +-
 .../quantization/weight_only/test_rtn.py      |  2 +-
 7 files changed, 31 insertions(+), 34 deletions(-)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
index 0fac3f8438f..ebea194b93b 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
@@ -9,5 +9,5 @@ wandb
 einops
 neural-compressor
 intel-extension-for-transformers
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
+lm-eval
 peft
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
index 613c0277579..bc973d28491 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
@@ -50,8 +50,7 @@
                     help="Pad input ids to max length.")
 parser.add_argument("--calib_iters", default=512, type=int,
                     help="calibration iters.")
-parser.add_argument("--tasks", nargs='+', default=["lambada_openai",
-                     "hellaswag", "winogrande", "piqa", "wikitext"],
+parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext",
                     type=str, help="tasks list for accuracy validation")
 parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
 # ============SmoothQuant configs==============
@@ -390,24 +389,27 @@ def run_fn(model):

 if args.accuracy:
     user_model.eval()
-    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate
-
-    results = evaluate(
-        model="hf-causal",
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    eval_args = LMEvalParser(
+        model="hf",
         model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
         user_model=user_model,
+        tokenizer=tokenizer,
         batch_size=args.batch_size,
         tasks=args.tasks,
+        device="cpu",
     )
+    results = evaluate(eval_args)
+
     dumped = json.dumps(results, indent=2)
     if args.save_accuracy_path:
         with open(args.save_accuracy_path, "w") as f:
             f.write(dumped)
-    for task_name in args.tasks:
+    for task_name in args.tasks.split(","):
         if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity"]
+            acc = results["results"][task_name]["word_perplexity,none"]
         else:
-            acc = results["results"][task_name]["acc"]
+            acc = results["results"][task_name]["acc,none"]
         print("Accuracy: %.5f" % acc)
         print('Batch size = %d' % args.batch_size)
@@ -417,21 +419,25 @@ def run_fn(model):
     import time

     samples = args.iters * args.batch_size
-    start = time.time()
-    results = evaluate(
-        model="hf-causal",
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    eval_args = LMEvalParser(
+        model="hf",
         model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
         user_model=user_model,
+        tokenizer=tokenizer,
         batch_size=args.batch_size,
         tasks=args.tasks,
         limit=samples,
+        device="cpu",
     )
+    start = time.time()
+    results = evaluate(eval_args)
     end = time.time()
-    for task_name in args.tasks:
+    for task_name in args.tasks.split(","):
         if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity"]
+            acc = results["results"][task_name]["word_perplexity,none"]
         else:
-            acc = results["results"][task_name]["acc"]
+            acc = results["results"][task_name]["acc,none"]
         print("Accuracy: %.5f" % acc)
         print('Throughput: %.3f samples/sec' % (samples / (end - start)))
         print('Latency: %.3f ms' % ((end - start) * 1000 / samples))
diff --git a/neural_compressor/torch/algorithms/weight_only/__init__.py b/neural_compressor/torch/algorithms/weight_only/__init__.py
index 1a1789697e4..28f108cb636 100644
--- a/neural_compressor/torch/algorithms/weight_only/__init__.py
+++ b/neural_compressor/torch/algorithms/weight_only/__init__.py
@@ -11,12 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from .rtn import rtn_quantize
-from .gptq import gptq_quantize
-from .awq import awq_quantize
-from .teq import teq_quantize
-from .autoround import autoround_quantize
-from .hqq import hqq_quantize
-from .modules import WeightOnlyLinear
-from .utility import *
diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py
index 2fb061821c8..2842429e06e 100644
--- a/neural_compressor/torch/algorithms/weight_only/modules.py
+++ b/neural_compressor/torch/algorithms/weight_only/modules.py
@@ -69,7 +69,7 @@ def __init__(
             bits = self.dtype.lstrip("int")
             self.dtype = "int"
         if "int" not in self.dtype:  # for nf4, fp4
-            from neural_compressor.torch.algorithms.weight_only import FLOAT_MAPPING, INT_MAPPING
+            from neural_compressor.torch.algorithms.weight_only.utility import FLOAT_MAPPING, INT_MAPPING

             self.use_optimum_format = False  # optimum_format doesn't suit for symmetric nf4 fp4.
             float_list = FLOAT_MAPPING[self.dtype]
diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py
index f50abb698db..df523293be8 100644
--- a/neural_compressor/torch/quantization/algorithm_entry.py
+++ b/neural_compressor/torch/quantization/algorithm_entry.py
@@ -40,7 +40,7 @@ def rtn_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], RTNConfig], *args, **kwargs
 ) -> torch.nn.Module:
     """The main entry to apply rtn quantization."""
-    from neural_compressor.torch.algorithms.weight_only import rtn_quantize
+    from neural_compressor.torch.algorithms.weight_only.rtn import rtn_quantize

     # rebuild weight_config for rtn_quantize function
     weight_config = {}
@@ -75,7 +75,7 @@ def gptq_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], GPTQConfig], *args, **kwargs
 ) -> torch.nn.Module:
     logger.info("Quantize model with the GPTQ algorithm.")
-    from neural_compressor.torch.algorithms.weight_only import gptq_quantize
+    from neural_compressor.torch.algorithms.weight_only.gptq import gptq_quantize

     # rebuild weight_config for gptq_quantize function
     weight_config = {}
@@ -228,7 +228,7 @@ def awq_quantize_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], AWQConfig], *args, **kwargs
 ) -> torch.nn.Module:
     logger.info("Quantize model with the AWQ algorithm.")
-    from neural_compressor.torch.algorithms.weight_only import awq_quantize
+    from neural_compressor.torch.algorithms.weight_only.awq import awq_quantize

     weight_config = {}
     for (op_name, op_type), op_config in configs_mapping.items():
@@ -288,7 +288,7 @@ def awq_quantize_entry(
 def teq_quantize_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], TEQConfig], *args, **kwargs
 ) -> torch.nn.Module:
-    from neural_compressor.torch.algorithms.weight_only import teq_quantize
+    from neural_compressor.torch.algorithms.weight_only.teq import teq_quantize

     logger.info("Quantize model with the TEQ algorithm.")
     weight_config = {}
@@ -338,7 +338,7 @@ def teq_quantize_entry(
 def autoround_quantize_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], AutoRoundConfig], *args, **kwargs
 ) -> torch.nn.Module:
-    from neural_compressor.torch.algorithms.weight_only import autoround_quantize
+    from neural_compressor.torch.algorithms.weight_only.autoround import autoround_quantize

     logger.info("Quantize model with the AutoRound algorithm.")
     calib_func = kwargs.get("run_fn", None)
@@ -407,7 +407,7 @@ def autoround_quantize_entry(
 def hqq_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, Callable], HQQConfig], *args, **kwargs
 ) -> torch.nn.Module:
-    from neural_compressor.torch.algorithms.weight_only import hqq_quantize
+    from neural_compressor.torch.algorithms.weight_only.hqq import hqq_quantize

     logger.info("Quantize model with the HQQ algorithm.")
     q_model = hqq_quantize(model, configs_mapping)
diff --git a/test/3x/torch/quantization/weight_only/test_gptq.py b/test/3x/torch/quantization/weight_only/test_gptq.py
index 064b75e7203..6325269c2c2 100644
--- a/test/3x/torch/quantization/weight_only/test_gptq.py
+++ b/test/3x/torch/quantization/weight_only/test_gptq.py
@@ -4,7 +4,7 @@
 import torch
 import transformers

-from neural_compressor.torch.algorithms.weight_only import WeightOnlyLinear
+from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear
 from neural_compressor.torch.quantization import GPTQConfig, get_default_gptq_config, get_default_rtn_config, quantize

diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py
index c44fe16669e..f52a8d64e69 100644
--- a/test/3x/torch/quantization/weight_only/test_rtn.py
+++ b/test/3x/torch/quantization/weight_only/test_rtn.py
@@ -4,7 +4,7 @@
 import torch
 import transformers

-from neural_compressor.torch.algorithms.weight_only import WeightOnlyLinear
+from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear
 from neural_compressor.torch.quantization import (
     RTNConfig,
     get_default_double_quant_config,
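
After this patch, the weight-only helpers are imported from their concrete submodules rather than from the package __init__, and the example script drives lm-eval (>= 0.4) through LMEvalParser. A minimal usage sketch combining both changes follows; the quantized model, tokenizer, task name, and batch size below are illustrative placeholders, not values taken from the patch:

    # Submodule-level imports introduced by this patch
    from neural_compressor.torch.algorithms.weight_only.rtn import rtn_quantize
    from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear

    # lm-eval >= 0.4 style evaluation, as used in run_clm_no_trainer.py above
    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser

    eval_args = LMEvalParser(
        model="hf",
        user_model=user_model,   # placeholder: an already-quantized torch.nn.Module
        tokenizer=tokenizer,     # placeholder: the matching Hugging Face tokenizer
        tasks="lambada_openai",  # comma-separated string now, not a Python list
        batch_size=8,
        device="cpu",
    )
    results = evaluate(eval_args)
    print(results["results"]["lambada_openai"]["acc,none"])  # metric keys now carry the ",none" suffix

With the package __init__ emptied, each algorithm entry point pulls in only the submodule it actually needs.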