remove autoawq requirement at packing stage (#249)
* remove autoawq requirement

Signed-off-by: n1ck-guo <[email protected]>
n1ck-guo authored Sep 11, 2024
1 parent b698213 · commit 98c4656
Showing 7 changed files with 438 additions and 74 deletions.
README.md: 2 changes (1 addition, 1 deletion)

@@ -181,7 +181,7 @@ Additionally, symmetric quantization tends to perform poorly at 2-bit precision.
 
 **AutoAWQ format**: This format is well-suited for asymmetric 4-bit quantization on CUDA devices and is widely adopted
 within the community. Asymmetric quantization typically improves accuracy but may reduce inference speed. It features
-specialized layer fusion tailored for Llama models. However, it supports only 4-bit asymmetric quantization. Currently, please manually install autoawq via `pip install autoawq` before exporting.
+specialized layer fusion tailored for Llama models. However, it supports only 4-bit asymmetric quantization.
 
 ## Model Inference
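A minimal usage sketch of the export flow this README section describes, after the change above (the model name is illustrative; the calls follow the AutoRound quantization API used elsewhere in this repository):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "facebook/opt-125m"  # illustrative checkpoint
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 4-bit asymmetric quantization, then export in AutoAWQ format.
# With this commit, packing no longer requires `pip install autoawq`.
autoround = AutoRound(model, tokenizer, bits=4, group_size=128, sym=False)
autoround.quantize()
autoround.save_quantized("./opt-125m-autoawq", format="auto_awq")
```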
auto_round/__main__.py: 5 changes (1 addition, 4 deletions)

@@ -314,10 +314,7 @@ def tune(args):
     inplace = False if len(format_list) > 1 else True
     for format_ in format_list:
         eval_folder = f'{export_dir}-{format_}'
-        if 'auto_awq' in format_:
-            autoround.save_quantized(eval_folder, format=format_, inplace=inplace, model_path=model_name)
-        else:
-            autoround.save_quantized(eval_folder, format=format_, inplace=inplace)
+        autoround.save_quantized(eval_folder, format=format_, inplace=inplace)
 
 
 def get_library_version(library_name):
auto_round/autoround.py: 2 changes (1 addition, 1 deletion)

@@ -1094,7 +1094,7 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k
         Args:
             output_dir (str, optional): The directory to save the quantized model. Defaults to None.
-            format (str, optional): The format in which to save the model. Defaults to "auto_gptq".
+            format (str, optional): The format in which to save the model. Defaults to "auto_round".
             inplace (bool, optional): Whether to modify the model in place. Defaults to True.
             **kwargs: Additional keyword arguments specific to the export format.
auto_round/export/export_to_autoround/export.py: 10 changes (3 additions, 7 deletions)

@@ -125,12 +125,8 @@ def dynamic_import_quantLinear_for_packing(backend, bits, group_size, sym):
         from auto_round_extension.cuda.qlinear_triton import QuantLinear
         return QuantLinear
     elif "awq" in backend:
-        try:
-            from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
-            return WQLinear_GEMM
-        except:
-            logger.error("autoawq is required. Please install it by 'pip install autoawq' to support auto_awq format.")
-            return
+        from ..export_to_awq.utils import WQLinear_GEMM
+        return WQLinear_GEMM
     elif "gptq" in backend:
         return get_autogptq_packing_qlinear(backend, bits, group_size, sym)
     else:
@@ -186,7 +182,7 @@ def pack_layer(name, model, layer_config, backend, pbar):
         qlayer.pack(layer, scale, zero, None)
         qlayer.to(device)
     else:
-        from awq.utils.utils import clear_memory  # pylint: disable=E0401
+        from ..export_to_awq.utils import clear_memory
         scale, zp = layer_config[name]["scale"].to(torch.float32), layer_config[name]["zp"].to(torch.float32)
         scale = scale.t().contiguous()
         zp = zp.t().contiguous()
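The new relative imports resolve to helpers vendored into auto_round/export/export_to_awq/utils.py as part of this commit's added lines, which are not shown above. A rough sketch of a clear_memory helper compatible with the call site in pack_layer, stated as an assumption rather than the actual vendored code:

```python
import gc

import torch


def clear_memory(weight=None):
    """Drop a tensor reference (if given) and release cached accelerator memory."""
    if weight is not None:
        del weight
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
```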
auto_round/export/export_to_awq/export.py: 13 changes (3 additions, 10 deletions)

@@ -42,7 +42,7 @@
 
 
 @register_format("auto_awq")
-def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
+def save_quantized_as_autoawq(output_dir, inplace=True, **kwargs):
     """Export the model to autogptq format to easily leverage cuda kernel."""
     model = kwargs["model"]
     layer_config = kwargs["layer_config"]
@@ -67,17 +67,10 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
     else:
         compressed_model = copy.deepcopy(model.to("cpu"))
 
-    try:
-        from awq import AutoAWQForCausalLM  # pylint: disable=E0401
-        from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
-        from awq.utils.utils import clear_memory  # pylint: disable=E0401
-    except:
-        logger.error("autoawq is required. Please install it by 'pip install autoawq' to support auto_awq format.")
+    from .utils import WQLinear_GEMM, clear_memory, get_self_modules
 
     q_linear_module = WQLinear_GEMM
-    awq_model = AutoAWQForCausalLM.from_pretrained(model_path)
-    self_modules = awq_model.get_model_layers(compressed_model)
-    del awq_model  # release memory
+    self_modules = get_self_modules(compressed_model)
    for i in range(len(self_modules)):
         module = self_modules[i]
         named_linears = get_named_linears(module)
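Here, get_self_modules replaces the earlier detour through AutoAWQForCausalLM.from_pretrained, which reloaded the model from model_path just to locate its decoder blocks. The vendored implementation is not shown in this excerpt; a plausible sketch of such a helper, assuming common Hugging Face causal-LM layouts and offered only as an illustration:

```python
import torch.nn as nn


def get_self_modules(model: nn.Module) -> nn.ModuleList:
    """Return the list of decoder blocks for a few common causal-LM layouts."""
    # Llama/Mistral-style: model.model.layers; GPT-2-style: model.transformer.h;
    # OPT-style: model.model.decoder.layers. (Illustrative paths only.)
    for path in ("model.layers", "transformer.h", "model.decoder.layers"):
        module = model
        try:
            for attr in path.split("."):
                module = getattr(module, attr)
        except AttributeError:
            continue
        if isinstance(module, nn.ModuleList):
            return module
    raise ValueError("Could not locate the decoder layers on this model.")
```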