From e87c95f25d3fe0e286e832857974ce36d43b2f96 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Tue, 23 Apr 2024 10:58:31 +0800
Subject: [PATCH] Fix `weight_only` algorithms import (#1742)

Signed-off-by: Kaihui-intel
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../quantization/llm/requirements.txt         |  2 +-
 .../quantization/llm/run_clm_no_trainer.py    | 36 +++++++++++--------
 .../torch/algorithms/weight_only/__init__.py  |  9 -----
 .../torch/algorithms/weight_only/modules.py   |  2 +-
 .../torch/quantization/algorithm_entry.py     | 12 +++----
 .../quantization/weight_only/test_gptq.py     |  2 +-
 .../quantization/weight_only/test_rtn.py      |  2 +-
 7 files changed, 31 insertions(+), 34 deletions(-)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
index 0fac3f8438f..ebea194b93b 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
@@ -9,5 +9,5 @@ wandb
 einops
 neural-compressor
 intel-extension-for-transformers
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
+lm-eval
 peft
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
index 613c0277579..bc973d28491 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
@@ -50,8 +50,7 @@
                     help="Pad input ids to max length.")
 parser.add_argument("--calib_iters", default=512, type=int,
                     help="calibration iters.")
-parser.add_argument("--tasks", nargs='+', default=["lambada_openai",
-                     "hellaswag", "winogrande", "piqa", "wikitext"],
+parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext",
                     type=str, help="tasks list for accuracy validation")
 parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
 # ============SmoothQuant configs==============
@@ -390,24 +389,27 @@ def run_fn(model):

 if args.accuracy:
     user_model.eval()
-    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate
-
-    results = evaluate(
-        model="hf-causal",
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    eval_args = LMEvalParser(
+        model="hf",
         model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
         user_model=user_model,
+        tokenizer=tokenizer,
         batch_size=args.batch_size,
         tasks=args.tasks,
+        device="cpu",
     )
+    results = evaluate(eval_args)
+
     dumped = json.dumps(results, indent=2)
     if args.save_accuracy_path:
         with open(args.save_accuracy_path, "w") as f:
             f.write(dumped)
-    for task_name in args.tasks:
+    for task_name in args.tasks.split(","):
         if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity"]
+            acc = results["results"][task_name]["word_perplexity,none"]
         else:
-            acc = results["results"][task_name]["acc"]
+            acc = results["results"][task_name]["acc,none"]
         print("Accuracy: %.5f" % acc)
         print('Batch size = %d' % args.batch_size)
@@ -417,21 +419,25 @@ def run_fn(model):
     import time

     samples = args.iters * args.batch_size
-    start = time.time()
-    results = evaluate(
-        model="hf-causal",
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    eval_args = LMEvalParser(
+        model="hf",
         model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
         user_model=user_model,
+        tokenizer=tokenizer,
         batch_size=args.batch_size,
         tasks=args.tasks,
         limit=samples,
+        device="cpu",
     )
+    start = time.time()
+    results = evaluate(eval_args)
     end = time.time()
-    for task_name in args.tasks:
+    for task_name in args.tasks.split(","):
         if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity"]
+            acc = results["results"][task_name]["word_perplexity,none"]
         else:
-            acc = results["results"][task_name]["acc"]
+            acc = results["results"][task_name]["acc,none"]
         print("Accuracy: %.5f" % acc)
         print('Throughput: %.3f samples/sec' % (samples / (end - start)))
         print('Latency: %.3f ms' % ((end - start) * 1000 / samples))
diff --git a/neural_compressor/torch/algorithms/weight_only/__init__.py b/neural_compressor/torch/algorithms/weight_only/__init__.py
index 1a1789697e4..28f108cb636 100644
--- a/neural_compressor/torch/algorithms/weight_only/__init__.py
+++ b/neural_compressor/torch/algorithms/weight_only/__init__.py
@@ -11,12 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from .rtn import rtn_quantize
-from .gptq import gptq_quantize
-from .awq import awq_quantize
-from .teq import teq_quantize
-from .autoround import autoround_quantize
-from .hqq import hqq_quantize
-from .modules import WeightOnlyLinear
-from .utility import *
diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py
index 2fb061821c8..2842429e06e 100644
--- a/neural_compressor/torch/algorithms/weight_only/modules.py
+++ b/neural_compressor/torch/algorithms/weight_only/modules.py
@@ -69,7 +69,7 @@ def __init__(
             bits = self.dtype.lstrip("int")
             self.dtype = "int"
         if "int" not in self.dtype:  # for nf4, fp4
-            from neural_compressor.torch.algorithms.weight_only import FLOAT_MAPPING, INT_MAPPING
+            from neural_compressor.torch.algorithms.weight_only.utility import FLOAT_MAPPING, INT_MAPPING

             self.use_optimum_format = False  # optimum_format doesn't suit for symmetric nf4 fp4.
             float_list = FLOAT_MAPPING[self.dtype]
diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py
index f50abb698db..df523293be8 100644
--- a/neural_compressor/torch/quantization/algorithm_entry.py
+++ b/neural_compressor/torch/quantization/algorithm_entry.py
@@ -40,7 +40,7 @@ def rtn_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], RTNConfig], *args, **kwargs
 ) -> torch.nn.Module:
     """The main entry to apply rtn quantization."""
-    from neural_compressor.torch.algorithms.weight_only import rtn_quantize
+    from neural_compressor.torch.algorithms.weight_only.rtn import rtn_quantize

     # rebuild weight_config for rtn_quantize function
     weight_config = {}
@@ -75,7 +75,7 @@ def gptq_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], GPTQConfig], *args, **kwargs
 ) -> torch.nn.Module:
     logger.info("Quantize model with the GPTQ algorithm.")
-    from neural_compressor.torch.algorithms.weight_only import gptq_quantize
+    from neural_compressor.torch.algorithms.weight_only.gptq import gptq_quantize

     # rebuild weight_config for gptq_quantize function
     weight_config = {}
@@ -228,7 +228,7 @@ def awq_quantize_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], AWQConfig], *args, **kwargs
 ) -> torch.nn.Module:
     logger.info("Quantize model with the AWQ algorithm.")
-    from neural_compressor.torch.algorithms.weight_only import awq_quantize
+    from neural_compressor.torch.algorithms.weight_only.awq import awq_quantize

     weight_config = {}
     for (op_name, op_type), op_config in configs_mapping.items():
@@ -288,7 +288,7 @@ def awq_quantize_entry(
 def teq_quantize_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], TEQConfig], *args, **kwargs
 ) -> torch.nn.Module:
-    from neural_compressor.torch.algorithms.weight_only import teq_quantize
+    from neural_compressor.torch.algorithms.weight_only.teq import teq_quantize

     logger.info("Quantize model with the TEQ algorithm.")
     weight_config = {}
@@ -338,7 +338,7 @@ def teq_quantize_entry(
 def autoround_quantize_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], AutoRoundConfig], *args, **kwargs
 ) -> torch.nn.Module:
-    from neural_compressor.torch.algorithms.weight_only import autoround_quantize
+    from neural_compressor.torch.algorithms.weight_only.autoround import autoround_quantize

     logger.info("Quantize model with the AutoRound algorithm.")
     calib_func = kwargs.get("run_fn", None)
@@ -407,7 +407,7 @@ def autoround_quantize_entry(
 def hqq_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, Callable], HQQConfig], *args, **kwargs
 ) -> torch.nn.Module:
-    from neural_compressor.torch.algorithms.weight_only import hqq_quantize
+    from neural_compressor.torch.algorithms.weight_only.hqq import hqq_quantize

     logger.info("Quantize model with the HQQ algorithm.")
     q_model = hqq_quantize(model, configs_mapping)
diff --git a/test/3x/torch/quantization/weight_only/test_gptq.py b/test/3x/torch/quantization/weight_only/test_gptq.py
index 064b75e7203..6325269c2c2 100644
--- a/test/3x/torch/quantization/weight_only/test_gptq.py
+++ b/test/3x/torch/quantization/weight_only/test_gptq.py
@@ -4,7 +4,7 @@
 import torch
 import transformers

-from neural_compressor.torch.algorithms.weight_only import WeightOnlyLinear
+from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear
 from neural_compressor.torch.quantization import GPTQConfig, get_default_gptq_config, get_default_rtn_config, quantize

diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py
index c44fe16669e..f52a8d64e69 100644
--- a/test/3x/torch/quantization/weight_only/test_rtn.py
+++ b/test/3x/torch/quantization/weight_only/test_rtn.py
@@ -4,7 +4,7 @@
 import torch
 import transformers

-from neural_compressor.torch.algorithms.weight_only import WeightOnlyLinear
+from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear
 from neural_compressor.torch.quantization import (
     RTNConfig,
     get_default_double_quant_config,
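
After this patch, the weight-only helpers are imported from their concrete submodules rather than from the package __init__, and the example script drives lm-eval (>= 0.4) through LMEvalParser. A minimal usage sketch combining both changes follows; the quantized model, tokenizer, task name, and batch size below are illustrative placeholders, not values taken from the patch:

    # Submodule-level imports introduced by this patch
    from neural_compressor.torch.algorithms.weight_only.rtn import rtn_quantize
    from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear

    # lm-eval >= 0.4 style evaluation, as used in run_clm_no_trainer.py above
    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser

    eval_args = LMEvalParser(
        model="hf",
        user_model=user_model,   # placeholder: an already-quantized torch.nn.Module
        tokenizer=tokenizer,     # placeholder: the matching Hugging Face tokenizer
        tasks="lambada_openai",  # comma-separated string now, not a Python list
        batch_size=8,
        device="cpu",
    )
    results = evaluate(eval_args)
    print(results["results"]["lambada_openai"]["acc,none"])  # metric keys now carry the ",none" suffix

With the package __init__ emptied, each algorithm entry point pulls in only the submodule it actually needs.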