intel · WeiweiZhang1 · Nov 19, 2024 · Nov 15, 2024 · Nov 15, 2024 · Nov 15, 2024
diff --git a/README.md b/README.md
@@ -41,20 +41,6 @@ more accuracy data and recipes across various models.
 
 ## Installation
 
-### Build from Source
-
-```bash
-pip install -r requirements.txt
-
-# GPU
-pip install -vvv --no-build-isolation -e .
-
-# CPU
-pip install -vvv --no-build-isolation -e .[cpu]
-
-# HPU
-pip install -vvv --no-build-isolation -e .[hpu]
-```
 
 ### Install from pypi
 
@@ -69,6 +55,24 @@ pip install auto-round[cpu]
 pip install auto-round[hpu]
 ```
 
+
+<details>
+  <summary>Build from Source</summary>
+
+  ```bash
+  pip install -r requirements.txt
+
+  # GPU
+  pip install -vvv --no-build-isolation -e .
+
+  # CPU
+  pip install -vvv --no-build-isolation -e .[cpu]
+
+  # HPU
+  pip install -vvv --no-build-isolation -e .[hpu]
+  ```
+</details>
+
 ## Model Quantization
 
 ### Basic Usage (Gaudi2/CPU/GPU)

diff --git a/auto_round/auto_quantizer.py b/auto_round/auto_quantizer.py
@@ -397,14 +397,18 @@ def convert_model(self, model: nn.Module):
             if ("hpu" == target_device or "cpu" == target_device) and model.dtype != torch.bfloat16:
                 logger.info(f"Change the dtype to `bfloat16` as {target_device.upper()} does not support float16")
                 model = model.to(torch.bfloat16)
+            else:
+                if model.dtype != torch.float16:
+                    logger.info(f"Change the dtype to `float16` for better performance")
+                    model = model.to(torch.float16)
 
         bits = quantization_config.bits
         group_size = quantization_config.group_size
         data_type = quantization_config.data_type if hasattr(quantization_config,
                                                              "data_type") else "int"  # pragma: no cover
         sym = quantization_config.sym
         to_quant_block_names = quantization_config.to_quant_block_names if hasattr(quantization_config,
-                                                                           "to_quant_block_names") else None
+                                                                                   "to_quant_block_names") else None
         layer_names = get_layer_names_in_block(model, to_quant_block_names=to_quant_block_names)
 
         extra_config = {}

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
@@ -331,7 +331,7 @@ def quantize(self):
         unquantized_layers = []
         for n, m in self.model.named_modules():
             if isinstance(m, tuple(self.supported_types)):
-                if m.bits > 8:
+                if int(m.bits) > 8:
                     unquantized_layers.append(n)
                 else:
                     quantized_layers.append(n)
@@ -1681,3 +1681,4 @@ def __init__(
             **kwargs,
         )
 
+
diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py
@@ -142,12 +142,13 @@ def __init__(self, *args, **kwargs):
 
         self.add_argument("--not_use_best_mse", action='store_true',
                           help="whether to use the iter of best mes loss in the tuning phase")
-        
+
         self.add_argument("--to_quant_block_names", default=None, type=str,
                           help="Names of quantitative blocks, please use commas to separate them.")
 
         self.add_argument("--enable_torch_compile", default=None, type=bool,
-                            help="whether to enable torch compile")
+                          help="whether to enable torch compile")
+
 
 def setup_parser():
     parser = BasicArgumentParser()
@@ -213,7 +214,6 @@ def setup_fast_parser():
     parser.add_argument("--nsamples", default=128, type=int,
                         help="number of samples")
 
-
     args = parser.parse_args()
 
     return args
@@ -366,15 +366,24 @@ def tune(args):
                 logger.info(
                     f"{n} will not be quantized due to its shape not being divisible by 32,"
                     " resulting in an exporting issue to autogptq")
-    fp_layers = args.fp_layers.split(",")
-    if bool(fp_layers):
+
+    layer_config = {}
+    if args.fp_layers != "":
+        fp_layers = args.fp_layers.replace(" ", "").split(",")
         for n, m in model.named_modules():
-            if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
-                name = n.split('.')[-1]
-                if n in fp_layers or name in fp_layers:
+            if not isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)):
+                continue
+            for fp_layer in fp_layers:
+                if fp_layer in n:
                     layer_config[n] = {"bits": 16}
                     logger.info(
                         f"{n} will not be quantized.")
+        if len(layer_config) > 0:
+            for format in formats:
+                if "auto_round" not in format and "fake" not in format:
+                    # TODO: gptq and awq could support some mixed-precision configs
+                    logger.warning(f"mixed precision exporting does not support {format} currently")
+
     lm_head_layer_name = "lm_head"
     for n, _ in model.named_modules():
         lm_head_layer_name = n
@@ -507,4 +516,3 @@ def eval(args):
 
     from lm_eval.utils import make_table  # pylint: disable=E0401
     print(make_table(res))
-
diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py
@@ -301,7 +301,24 @@ def tune(args):
             model = model.to(torch.bfloat16)
 
     round = AutoRoundMLLM
+
     layer_config = {}
+    if args.fp_layers != "":
+        fp_layers = args.fp_layers.replace(" ", "").split(",")
+        for n, m in model.named_modules():
+            if not isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)):
+                continue
+            for fp_layer in fp_layers:
+                if fp_layer in n:
+                    layer_config[n] = {"bits": 16}
+                    logger.info(
+                        f"{n} will not be quantized.")
+        if len(layer_config) > 0:
+            for format in formats:
+                if "auto_round" not in format and "fake" not in format:
+                    # TODO: gptq and awq could support some mixed-precision configs
+                    logger.warning(f"mixed precision exporting does not support {format} currently")
+
     for n, m in model.named_modules():
         if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
             if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:

diff --git a/auto_round/utils.py b/auto_round/utils.py
@@ -491,11 +491,12 @@ def check_to_quantized(config):
             False otherwise.
     """
     if isinstance(config, dict):
-        if config["bits"] > 8:
+
+        if int(config["bits"]) > 8:
             return False
         return True
     else:
-        if config.bits > 8:
+        if int(config.bits) > 8:
             return False
         return True
 
@@ -978,3 +979,4 @@ def compile_func(fun, device, enable_torch_compile):
     else:
         return compile_func_on_cuda_or_cpu(fun, enable_torch_compile)
 
+
diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py
@@ -300,21 +300,29 @@
         round = AutoRoundAdam
 
     layer_config = {}
+    if args.fp_layers != "":
+        fp_layers = args.fp_layers.replace(" ", "").split(",")
+        for n, m in model.named_modules():
+            if not isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)):
+                continue
+            for fp_layer in fp_layers:
+                if fp_layer in n:
+                    layer_config[n] = {"bits": 16}
+                    print(
+                        f"{n} will not be quantized.")
+        if len(layer_config) > 0:
+            for format in formats:
+                if "auto_round" not in format and "fake" not in format:
+                    # TODO: gptq and awq could support some mixed-precision configs
+                    print(f"mixed precision exporting does not support {format} currently")
+
     for n, m in model.named_modules():
         if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
             if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
                 layer_config[n] = {"bits": 16}
                 print(
                     f"{n} will not be quantized due to its shape not being divisible by 32, resulting in an exporting issue to autogptq")
-    fp_layers = args.fp_layers.split(",")
-    if bool(fp_layers):
-        for n, m in model.named_modules():
-            if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
-                name = n.split('.')[-1]
-                if n in fp_layers or name in fp_layers:
-                    layer_config[n] = {"bits": 16}
-                    print(
-                        f"{n} will not be quantized.")
+
     lm_head_layer_name = "lm_head"
     for n, _ in model.named_modules():
         lm_head_layer_name = n

diff --git a/requirements-cpu.txt b/requirements-cpu.txt
@@ -7,7 +7,7 @@ transformers>=4.38
 triton
 numpy < 2.0
 threadpoolctl
-lm-eval>=0.4.2,<=0.4.5
+lm-eval>=0.4.2,<0.5
 tqdm
 packaging
 auto-gptq>=0.7.1

diff --git a/requirements.txt b/requirements.txt
@@ -7,7 +7,7 @@ transformers>=4.38
 triton
 numpy < 2.0
 threadpoolctl
-lm-eval>=0.4.2,<=0.4.5
+lm-eval>=0.4.2,<0.5
 tqdm
 packaging
 auto-gptq>=0.7.1

diff --git a/test/test_autoround.py b/test/test_autoround.py
@@ -307,6 +307,38 @@ def test_fp32(self):
         autoround.quantize()
 
 
+    def test_fallback_layers(self):
+        bits, group_size, sym = 4, 128, True
+        model_name = "facebook/opt-125m"
+        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True,
+                                                     device_map='auto')
+        layer_config = {"model.decoder.layers.0.self_attn.q_proj": {"bits": "16"},
+                        "model.decoder.layers.1.self_attn.k_proj": {"bits": "16"}}
+        autoround = AutoRound(
+            model,
+            self.tokenizer,
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            iters=2,
+            seqlen=2,
+            dataset=self.llm_dataloader,
+            layer_config=layer_config
+        )
+        autoround.quantize()
+        quantized_model_path = "./saved"
+
+        autoround.save_quantized(output_dir=quantized_model_path, format="auto_round", inplace=True)
+
+        model = AutoModelForCausalLM.from_pretrained(quantized_model_path,
+                                                     device_map='auto')
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
+        text = "There is a girl who likes adventure,"
+        inputs = tokenizer(text, return_tensors="pt").to(model.device)
+        res = tokenizer.decode(model.generate(**inputs, max_new_tokens=1)[0])
+
+
 
 if __name__ == "__main__":
     unittest.main()
+