From 52ec5615ec37c927d4e170980c0b0e8d9662a353 Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Fri, 15 Nov 2024 10:34:11 +0800 Subject: [PATCH 1/7] fix merge error --- auto_round/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/backend.py b/auto_round/backend.py index 6afbf1ef..66c5e667 100644 --- a/auto_round/backend.py +++ b/auto_round/backend.py @@ -404,7 +404,7 @@ def get_autogptq_infer_linear(backend, bits=4, group_size=128, sym=False): from packaging.version import Version # Import the appropriate QuantLinear based on the version of auto_gptq - if Version(version) <= Version("0.7.1"): + if Version(version) < Version("0.7.2"): QuantLinear = dynamically_import_QuantLinear( use_triton=use_triton, desc_act=False, From 7af3e8ad3bb03f33920801e22e8f72ef20872e1b Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Fri, 15 Nov 2024 11:27:17 +0800 Subject: [PATCH 2/7] fix fp_layers issues --- auto_round/auto_quantizer.py | 7 +++++-- auto_round/script/llm.py | 26 +++++++++++++++++--------- auto_round/script/mllm.py | 17 +++++++++++++++++ examples/language-modeling/main.py | 26 +++++++++++++++++--------- 4 files changed, 56 insertions(+), 20 deletions(-) diff --git a/auto_round/auto_quantizer.py b/auto_round/auto_quantizer.py index 7cbcd2ea..e859f2ee 100644 --- a/auto_round/auto_quantizer.py +++ b/auto_round/auto_quantizer.py @@ -397,6 +397,10 @@ def convert_model(self, model: nn.Module): if ("hpu" == target_device or "cpu" == target_device) and model.dtype != torch.bfloat16: logger.info(f"Change the dtype to `bfloat16` as {target_device.upper()} does not support float16") model = model.to(torch.bfloat16) + else: + if model.dtype != torch.float16: + logger.info(f"Change the dtype to `float16` for better performance") + model = model.to(torch.float16) bits = quantization_config.bits group_size = quantization_config.group_size @@ -404,7 +408,7 @@ def convert_model(self, model: nn.Module): "data_type") else "int" # pragma: no cover sym = quantization_config.sym to_quant_block_names = quantization_config.to_quant_block_names if hasattr(quantization_config, - "to_quant_block_names") else None + "to_quant_block_names") else None layer_names = get_layer_names_in_block(model, to_quant_block_names=to_quant_block_names) extra_config = {} @@ -741,4 +745,3 @@ def is_serializable(self): transformers.quantizers.auto.AutoHfQuantizer = AutoHfQuantizer transformers.modeling_utils.AutoHfQuantizer = AutoHfQuantizer - diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py index 03700d3e..536bac44 100644 --- a/auto_round/script/llm.py +++ b/auto_round/script/llm.py @@ -142,12 +142,13 @@ def __init__(self, *args, **kwargs): self.add_argument("--not_use_best_mse", action='store_true', help="whether to use the iter of best mes loss in the tuning phase") - + self.add_argument("--to_quant_block_names", default=None, type=str, help="Names of quantitative blocks, please use commas to separate them.") self.add_argument("--enable_torch_compile", default=None, type=bool, - help="whether to enable torch compile") + help="whether to enable torch compile") + def setup_parser(): parser = BasicArgumentParser() @@ -213,7 +214,6 @@ def setup_fast_parser(): parser.add_argument("--nsamples", default=128, type=int, help="number of samples") - args = parser.parse_args() return args @@ -366,15 +366,24 @@ def tune(args): logger.info( f"{n} will not be quantized due to its shape not being divisible by 32," " resulting in an exporting issue to autogptq") - fp_layers = args.fp_layers.split(",") - if 
bool(fp_layers): + + layer_config = {} + if args.fp_layers != "": + fp_layers = args.fp_layers.replace(" ", "").split(",") for n, m in model.named_modules(): - if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D): - name = n.split('.')[-1] - if n in fp_layers or name in fp_layers: + if not isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)): + continue + for fp_layer in fp_layers: + if fp_layer in n: layer_config[n] = {"bits": 16} logger.info( f"{n} will not be quantized.") + if len(layer_config) > 0: + for format in formats: + if "auto_round" not in format: + ##TODO gptq, awq could support some mixed precision config + logger.warning(f"mixed precision exporting does not support {format} currently") + lm_head_layer_name = "lm_head" for n, _ in model.named_modules(): lm_head_layer_name = n @@ -507,4 +516,3 @@ def eval(args): from lm_eval.utils import make_table # pylint: disable=E0401 print(make_table(res)) - diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py index 9afcc1d4..95807c8d 100644 --- a/auto_round/script/mllm.py +++ b/auto_round/script/mllm.py @@ -300,7 +300,24 @@ def tune(args): model = model.to(torch.bfloat16) round = AutoRoundMLLM + layer_config = {} + if args.fp_layers != "": + fp_layers = args.fp_layers.replace(" ", "").split(",") + for n, m in model.named_modules(): + if not isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)): + continue + for fp_layer in fp_layers: + if fp_layer in n: + layer_config[n] = {"bits": 16} + logger.info( + f"{n} will not be quantized.") + if len(layer_config) > 0: + for format in formats: + if "auto_round" not in format: + ##TODO gptq, awq could support some mixed precision config + logger.warning(f"mixed precision exporting does not support {format} currently") + for n, m in model.named_modules(): if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D): if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py index f0594f5d..25c12e00 100644 --- a/examples/language-modeling/main.py +++ b/examples/language-modeling/main.py @@ -300,21 +300,29 @@ round = AutoRoundAdam layer_config = {} + if args.fp_layers != "": + fp_layers = args.fp_layers.replace(" ", "").split(",") + for n, m in model.named_modules(): + if not isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)): + continue + for fp_layer in fp_layers: + if fp_layer in n: + layer_config[n] = {"bits": 16} + print( + f"{n} will not be quantized.") + if len(layer_config) > 0: + for format in formats: + if "auto_round" not in format: + ##TODO gptq, awq could support some mixed precision config + print(f"mixed precision exporting does not support {format} currently") + for n, m in model.named_modules(): if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D): if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: layer_config[n] = {"bits": 16} print( f"{n} will not be quantized due to its shape not being divisible by 32, resulting in an exporting issue to autogptq") - fp_layers = args.fp_layers.split(",") - if bool(fp_layers): - for n, m in model.named_modules(): - if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D): - name = n.split('.')[-1] - if n in fp_layers or name in fp_layers: - layer_config[n] = {"bits": 16} - print( - f"{n} will not be quantized.") + lm_head_layer_name = "lm_head" for n, _ in 
model.named_modules(): lm_head_layer_name = n From d44266a7cfcae1cccdacd7a1133426ee9b4d447e Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Fri, 15 Nov 2024 11:44:58 +0800 Subject: [PATCH 3/7] Loosen the restrictions of lm-eval --- requirements-cpu.txt | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-cpu.txt b/requirements-cpu.txt index c18eda82..a59c07cf 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -7,7 +7,7 @@ transformers>=4.38 triton numpy < 2.0 threadpoolctl -lm-eval>=0.4.2,<=0.4.5 +lm-eval>=0.4.2,<0.5 tqdm packaging auto-gptq>=0.7.1 diff --git a/requirements.txt b/requirements.txt index 698d8c34..31fc02dd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ transformers>=4.38 triton numpy < 2.0 threadpoolctl -lm-eval>=0.4.2,<=0.4.5 +lm-eval>=0.4.2,<0.5 tqdm packaging auto-gptq>=0.7.1 From b8331ec6981c4e3d90bc85c5d1ba2cf229da7e48 Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Fri, 15 Nov 2024 12:45:57 +0800 Subject: [PATCH 4/7] fix and add ut --- README.md | 32 +++++++++------- auto_round/script/llm.py | 2 +- auto_round/script/mllm.py | 2 +- examples/language-modeling/main.py | 2 +- test/test_autoround.py | 61 ++++++++++++++++++++++++++++++ 5 files changed, 82 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 9790fc49..6fff23fe 100644 --- a/README.md +++ b/README.md @@ -41,20 +41,6 @@ more accuracy data and recipes across various models. ## Installation -### Build from Source - -```bash -pip install -r requirements.txt - -# GPU -pip install -vvv --no-build-isolation -e . - -# CPU -pip install -vvv --no-build-isolation -e .[cpu] - -# HPU -pip install -vvv --no-build-isolation -e .[hpu] -``` ### Install from pypi @@ -69,6 +55,24 @@ pip install auto-round[cpu] pip install auto-round[hpu] ``` + +
+### Build from Source
+
+```bash
+pip install -r requirements.txt
+
+# GPU
+pip install -vvv --no-build-isolation -e .
+
+# CPU
+pip install -vvv --no-build-isolation -e .[cpu]
+
+# HPU
+pip install -vvv --no-build-isolation -e .[hpu]
+```
+
+ ## Model Quantization ### Basic Usage (Gaudi2/CPU/GPU) diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py index 536bac44..ee87c33e 100644 --- a/auto_round/script/llm.py +++ b/auto_round/script/llm.py @@ -380,7 +380,7 @@ def tune(args): f"{n} will not be quantized.") if len(layer_config) > 0: for format in formats: - if "auto_round" not in format: + if "auto_round" not in format and "fake" not in format: ##TODO gptq, awq could support some mixed precision config logger.warning(f"mixed precision exporting does not support {format} currently") diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py index 70f87012..17fe3caa 100644 --- a/auto_round/script/mllm.py +++ b/auto_round/script/mllm.py @@ -315,7 +315,7 @@ def tune(args): f"{n} will not be quantized.") if len(layer_config) > 0: for format in formats: - if "auto_round" not in format: + if "auto_round" not in format and "fake" not in format: ##TODO gptq, awq could support some mixed precision config logger.warning(f"mixed precision exporting does not support {format} currently") diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py index 25c12e00..d3928378 100644 --- a/examples/language-modeling/main.py +++ b/examples/language-modeling/main.py @@ -312,7 +312,7 @@ f"{n} will not be quantized.") if len(layer_config) > 0: for format in formats: - if "auto_round" not in format: + if "auto_round" not in format and "fake" not in format: ##TODO gptq, awq could support some mixed precision config print(f"mixed precision exporting does not support {format} currently") diff --git a/test/test_autoround.py b/test/test_autoround.py index 20e5ea44..9742bfad 100644 --- a/test/test_autoround.py +++ b/test/test_autoround.py @@ -306,6 +306,67 @@ def test_fp32(self): ) autoround.quantize() + def test_fallback_layers(self): + bits, group_size, sym = 4, 128, True + model_name = "facebook/opt-125m" + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True, + device_map='auto') + layer_config = {"model.layers.0": {"bits": "16"}, + "model.layers.0": {"bits": "16"}} + autoround = AutoRound( + model, + self.tokenizer, + bits=bits, + group_size=group_size, + sym=sym, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + layer_config=layer_config + ) + autoround.quantize() + quantized_model_path = "./saved" + + autoround.save_quantized(output_dir=quantized_model_path, format="auto_round", inplace=True) + + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, + device_map="cpu") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + text = "There is a girl who likes adventure," + inputs = tokenizer(text, return_tensors="pt").to(model.device) + res = tokenizer.decode(model.generate(**inputs, max_new_tokens=1)[0]) + + +def test_fallback_blocks(self): + bits, group_size, sym = 4, 128, True + model_name = "facebook/opt-125m" + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True, + device_map='auto') + layer_config = {"model.layers.0.self_attn.q_proj": {"bits": "16"}, + "model.layers.0.self_attn.k_proj": {"bits": "16"}} + autoround = AutoRound( + model, + self.tokenizer, + bits=bits, + group_size=group_size, + sym=sym, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + layer_config=layer_config + ) + autoround.quantize() + quantized_model_path = "./saved" + + autoround.save_quantized(output_dir=quantized_model_path, format="auto_round", inplace=True) + + model = 
AutoModelForCausalLM.from_pretrained(quantized_model_path, + device_map="cpu") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + text = "There is a girl who likes adventure," + inputs = tokenizer(text, return_tensors="pt").to(model.device) + res = tokenizer.decode(model.generate(**inputs, max_new_tokens=1)[0]) + if __name__ == "__main__": From c13fc3f0addcceb49490ab60a0c4b9ee943104f9 Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Fri, 15 Nov 2024 12:49:33 +0800 Subject: [PATCH 5/7] fix --- test/test_autoround.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_autoround.py b/test/test_autoround.py index 9742bfad..cd636c61 100644 --- a/test/test_autoround.py +++ b/test/test_autoround.py @@ -306,13 +306,13 @@ def test_fp32(self): ) autoround.quantize() - def test_fallback_layers(self): + def test_fallback_blocks(self): bits, group_size, sym = 4, 128, True model_name = "facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map='auto') layer_config = {"model.layers.0": {"bits": "16"}, - "model.layers.0": {"bits": "16"}} + "model.layers.1": {"bits": "16"}} autoround = AutoRound( model, self.tokenizer, @@ -337,13 +337,13 @@ def test_fallback_layers(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=1)[0]) -def test_fallback_blocks(self): +def test_fallback_layers(self): bits, group_size, sym = 4, 128, True model_name = "facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map='auto') layer_config = {"model.layers.0.self_attn.q_proj": {"bits": "16"}, - "model.layers.0.self_attn.k_proj": {"bits": "16"}} + "model.layers.1.self_attn.k_proj": {"bits": "16"}} autoround = AutoRound( model, self.tokenizer, From 1211ab2db79ac86d0aecf403194923196c5b886a Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Fri, 15 Nov 2024 16:38:03 +0800 Subject: [PATCH 6/7] API usage does not support fuzzy match --- test/test_autoround.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/test/test_autoround.py b/test/test_autoround.py index cd636c61..0541259b 100644 --- a/test/test_autoround.py +++ b/test/test_autoround.py @@ -306,36 +306,6 @@ def test_fp32(self): ) autoround.quantize() - def test_fallback_blocks(self): - bits, group_size, sym = 4, 128, True - model_name = "facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True, - device_map='auto') - layer_config = {"model.layers.0": {"bits": "16"}, - "model.layers.1": {"bits": "16"}} - autoround = AutoRound( - model, - self.tokenizer, - bits=bits, - group_size=group_size, - sym=sym, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - layer_config=layer_config - ) - autoround.quantize() - quantized_model_path = "./saved" - - autoround.save_quantized(output_dir=quantized_model_path, format="auto_round", inplace=True) - - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, - device_map="cpu") - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) - text = "There is a girl who likes adventure," - inputs = tokenizer(text, return_tensors="pt").to(model.device) - res = tokenizer.decode(model.generate(**inputs, max_new_tokens=1)[0]) - def test_fallback_layers(self): bits, group_size, sym = 4, 128, True From 523a316e495dc276270b9391b98cea4a101a6e3e Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Tue, 19 Nov 2024 
10:08:50 +0800 Subject: [PATCH 7/7] bugfix of UT Signed-off-by: Zhang, Weiwei1 --- auto_round/auto_quantizer.py | 1 + auto_round/autoround.py | 3 ++- auto_round/utils.py | 6 ++++-- test/test_autoround.py | 9 +++++---- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/auto_round/auto_quantizer.py b/auto_round/auto_quantizer.py index bb554b56..b58c84e7 100644 --- a/auto_round/auto_quantizer.py +++ b/auto_round/auto_quantizer.py @@ -745,3 +745,4 @@ def is_serializable(self): transformers.quantizers.auto.AutoHfQuantizer = AutoHfQuantizer transformers.modeling_utils.AutoHfQuantizer = AutoHfQuantizer + diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 2ce7abdf..fe904016 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -331,7 +331,7 @@ def quantize(self): unquantized_layers = [] for n, m in self.model.named_modules(): if isinstance(m, tuple(self.supported_types)): - if m.bits > 8: + if int(m.bits) > 8: unquantized_layers.append(n) else: quantized_layers.append(n) @@ -1681,3 +1681,4 @@ def __init__( **kwargs, ) + diff --git a/auto_round/utils.py b/auto_round/utils.py index 11d25c7a..140cf007 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -491,11 +491,12 @@ def check_to_quantized(config): False otherwise. """ if isinstance(config, dict): - if config["bits"] > 8: + + if int(config["bits"]) > 8: return False return True else: - if config.bits > 8: + if int(config.bits) > 8: return False return True @@ -978,3 +979,4 @@ def compile_func(fun, device, enable_torch_compile): else: return compile_func_on_cuda_or_cpu(fun, enable_torch_compile) + diff --git a/test/test_autoround.py b/test/test_autoround.py index 0541259b..da857531 100644 --- a/test/test_autoround.py +++ b/test/test_autoround.py @@ -307,13 +307,13 @@ def test_fp32(self): autoround.quantize() -def test_fallback_layers(self): + def test_fallback_layers(self): bits, group_size, sym = 4, 128, True model_name = "facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map='auto') - layer_config = {"model.layers.0.self_attn.q_proj": {"bits": "16"}, - "model.layers.1.self_attn.k_proj": {"bits": "16"}} + layer_config = {"model.decoder.layers.0.self_attn.q_proj": {"bits": "16"}, + "model.decoder.layers.1.self_attn.k_proj": {"bits": "16"}} autoround = AutoRound( model, self.tokenizer, @@ -331,7 +331,7 @@ def test_fallback_layers(self): autoround.save_quantized(output_dir=quantized_model_path, format="auto_round", inplace=True) model = AutoModelForCausalLM.from_pretrained(quantized_model_path, - device_map="cpu") + device_map='auto') tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) @@ -341,3 +341,4 @@ def test_fallback_layers(self): if __name__ == "__main__": unittest.main() +
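
Taken together, patches 2, 4, 6, and 7 pin down how per-layer fallback is expressed: the CLI matches `--fp_layers` entries as substrings of module names, while the Python API expects exact module names mapped to `{"bits": 16}` in `layer_config` (no fuzzy matching, per patch 6/7). Below is a minimal sketch of the API path, mirroring `test_fallback_layers` above; the model name, layer paths, and `save_quantized` call come from that test, while the tokenizer setup and reliance on the default calibration dataset are illustrative assumptions rather than part of the patches.

```python
# Sketch: 4-bit quantization of facebook/opt-125m while keeping two attention
# projections at 16 bits via layer_config (exact module names, as in
# test_fallback_layers). Tokenizer loading and the default calibration dataset
# are assumptions for illustration.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRound

model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Modules listed here fall back to 16 bits and are left unquantized.
layer_config = {
    "model.decoder.layers.0.self_attn.q_proj": {"bits": 16},
    "model.decoder.layers.1.self_attn.k_proj": {"bits": 16},
}

autoround = AutoRound(
    model,
    tokenizer,
    bits=4,
    group_size=128,
    sym=True,
    layer_config=layer_config,
)
autoround.quantize()

# Per patches 2 and 4, mixed precision currently only round-trips through the
# auto_round (or fake) export format; other formats trigger a warning.
autoround.save_quantized(output_dir="./saved", format="auto_round", inplace=True)
```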