remove autoawq requirement at packing stage (#249)
* remove autoawq requirement

Signed-off-by: n1ck-guo <[email protected]>
n1ck-guo authored Sep 11, 2024
1 parent b698213 · commit 98c4656
Showing 7 changed files with 438 additions and 74 deletions.
README.md: 2 changes (1 addition, 1 deletion)

@@ -181,7 +181,7 @@ Additionally, symmetric quantization tends to perform poorly at 2-bit precision.
 
 **AutoAWQ format**: This format is well-suited for asymmetric 4-bit quantization on CUDA devices and is widely adopted
 within the community. Asymmetric quantization typically improves accuracy but may reduce inference speed. It features
-specialized layer fusion tailored for Llama models. However, it supports only 4-bit asymmetric quantization. Currently, please manually install autoawq via `pip install autoawq` before exporting.
+specialized layer fusion tailored for Llama models. However, it supports only 4-bit asymmetric quantization.
 
 ## Model Inference
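A minimal usage sketch of the export flow this README section describes, after the change above (the model name is illustrative; the calls follow the AutoRound quantization API used elsewhere in this repository):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "facebook/opt-125m"  # illustrative checkpoint
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 4-bit asymmetric quantization, then export in AutoAWQ format.
# With this commit, packing no longer requires `pip install autoawq`.
autoround = AutoRound(model, tokenizer, bits=4, group_size=128, sym=False)
autoround.quantize()
autoround.save_quantized("./opt-125m-autoawq", format="auto_awq")
```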
auto_round/__main__.py: 5 changes (1 addition, 4 deletions)

@@ -314,10 +314,7 @@ def tune(args):
     inplace = False if len(format_list) > 1 else True
     for format_ in format_list:
         eval_folder = f'{export_dir}-{format_}'
-        if 'auto_awq' in format_:
-            autoround.save_quantized(eval_folder, format=format_, inplace=inplace, model_path=model_name)
-        else:
-            autoround.save_quantized(eval_folder, format=format_, inplace=inplace)
+        autoround.save_quantized(eval_folder, format=format_, inplace=inplace)
 
 
 def get_library_version(library_name):
auto_round/autoround.py: 2 changes (1 addition, 1 deletion)

@@ -1094,7 +1094,7 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k
         Args:
             output_dir (str, optional): The directory to save the quantized model. Defaults to None.
-            format (str, optional): The format in which to save the model. Defaults to "auto_gptq".
+            format (str, optional): The format in which to save the model. Defaults to "auto_round".
             inplace (bool, optional): Whether to modify the model in place. Defaults to True.
             **kwargs: Additional keyword arguments specific to the export format.
auto_round/export/export_to_autoround/export.py: 10 changes (3 additions, 7 deletions)

@@ -125,12 +125,8 @@ def dynamic_import_quantLinear_for_packing(backend, bits, group_size, sym):
         from auto_round_extension.cuda.qlinear_triton import QuantLinear
         return QuantLinear
     elif "awq" in backend:
-        try:
-            from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
-            return WQLinear_GEMM
-        except:
-            logger.error("autoawq is required. Please install it by 'pip install autoawq' to support auto_awq format.")
-            return
+        from ..export_to_awq.utils import WQLinear_GEMM
+        return WQLinear_GEMM
     elif "gptq" in backend:
         return get_autogptq_packing_qlinear(backend, bits, group_size, sym)
     else:
@@ -186,7 +182,7 @@ def pack_layer(name, model, layer_config, backend, pbar):
         qlayer.pack(layer, scale, zero, None)
         qlayer.to(device)
     else:
-        from awq.utils.utils import clear_memory  # pylint: disable=E0401
+        from ..export_to_awq.utils import clear_memory
         scale, zp = layer_config[name]["scale"].to(torch.float32), layer_config[name]["zp"].to(torch.float32)
         scale = scale.t().contiguous()
         zp = zp.t().contiguous()
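The new relative imports resolve to helpers vendored into auto_round/export/export_to_awq/utils.py as part of this commit's added lines, which are not shown above. A rough sketch of a clear_memory helper compatible with the call site in pack_layer, stated as an assumption rather than the actual vendored code:

```python
import gc

import torch


def clear_memory(weight=None):
    """Drop a tensor reference (if given) and release cached accelerator memory."""
    if weight is not None:
        del weight
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
```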
auto_round/export/export_to_awq/export.py: 13 changes (3 additions, 10 deletions)

@@ -42,7 +42,7 @@
 
 
 @register_format("auto_awq")
-def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
+def save_quantized_as_autoawq(output_dir, inplace=True, **kwargs):
     """Export the model to autogptq format to easily leverage cuda kernel."""
     model = kwargs["model"]
     layer_config = kwargs["layer_config"]
@@ -67,17 +67,10 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
     else:
         compressed_model = copy.deepcopy(model.to("cpu"))
 
-    try:
-        from awq import AutoAWQForCausalLM  # pylint: disable=E0401
-        from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
-        from awq.utils.utils import clear_memory  # pylint: disable=E0401
-    except:
-        logger.error("autoawq is required. Please install it by 'pip install autoawq' to support auto_awq format.")
+    from .utils import WQLinear_GEMM, clear_memory, get_self_modules
 
     q_linear_module = WQLinear_GEMM
-    awq_model = AutoAWQForCausalLM.from_pretrained(model_path)
-    self_modules = awq_model.get_model_layers(compressed_model)
-    del awq_model  # release memory
+    self_modules = get_self_modules(compressed_model)
    for i in range(len(self_modules)):
         module = self_modules[i]
         named_linears = get_named_linears(module)
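Here, get_self_modules replaces the earlier detour through AutoAWQForCausalLM.from_pretrained, which reloaded the model from model_path just to locate its decoder blocks. The vendored implementation is not shown in this excerpt; a plausible sketch of such a helper, assuming common Hugging Face causal-LM layouts and offered only as an illustration:

```python
import torch.nn as nn


def get_self_modules(model: nn.Module) -> nn.ModuleList:
    """Return the list of decoder blocks for a few common causal-LM layouts."""
    # Llama/Mistral-style: model.model.layers; GPT-2-style: model.transformer.h;
    # OPT-style: model.model.decoder.layers. (Illustrative paths only.)
    for path in ("model.layers", "transformer.h", "model.decoder.layers"):
        module = model
        try:
            for attr in path.split("."):
                module = getattr(module, attr)
        except AttributeError:
            continue
        if isinstance(module, nn.ModuleList):
            return module
    raise ValueError("Could not locate the decoder layers on this model.")
```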