Update AutoRound commit version (#1941)
Signed-off-by: Kaihui-intel <[email protected]>
Kaihui-intel authored Jul 23, 2024
1 parent 9077b38 commit c80b68a
Showing 9 changed files with 42 additions and 20 deletions.
2 changes: 1 addition & 1 deletion .azure-pipelines/scripts/ut/env_setup.sh
@@ -92,7 +92,7 @@ elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then
fi

if [[ $(echo "${test_case}" | grep -c "api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "adaptor") != 0 ]]; then
- pip install git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c
+ pip install git+https://github.com/intel/auto-round.git@e24b9074af6cdb099e31c92eb81b7f5e9a4a244e
fi

# test deps
4 changes: 2 additions & 2 deletions neural_compressor/adaptor/pytorch.py
@@ -4926,7 +4926,7 @@ def autoround_quantize(self, model, tune_cfg, dataloader):
act_group_size = self.recipes["autoround_args"].get("act_group_size", None)
act_sym = self.recipes["autoround_args"].get("act_sym", None)
act_dynamic = self.recipes["autoround_args"].get("act_dynamic", True)
- multimodal = self.recipes["autoround_args"].get("multimodal", False)
+ quant_block_list = self.recipes["autoround_args"].get("quant_block_list", None)
use_layer_wise = self.recipes["autoround_args"].get("use_layer_wise", False)

if dataloader is not None:
@@ -4959,7 +4959,7 @@ def autoround_quantize(self, model, tune_cfg, dataloader):
dynamic_max_gap=dynamic_max_gap,
data_type=data_type,
scale_dtype=scale_dtype,
- multimodal=multimodal,
+ quant_block_list=quant_block_list,
act_bits=act_bits,
act_group_size=act_group_size,
act_sym=act_sym,
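For the 2.x adaptor path above, the renamed knob is read from the "autoround_args" recipe rather than passed as a direct argument. A minimal sketch of such a recipes dict, with the keys mirroring the .get(...) lookups in autoround_quantize and purely hypothetical block names:

# Sketch only: keys mirror the recipes["autoround_args"].get(...) lookups above;
# the block names are placeholders for whatever blocks the target model exposes.
recipes = {
    "autoround_args": {
        "quant_block_list": [["transformer.h.0", "transformer.h.1"]],  # read where `multimodal` used to be
        "act_bits": 32,
        "act_group_size": None,
        "act_sym": None,
        "act_dynamic": True,
        "use_layer_wise": False,
    }
}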
6 changes: 3 additions & 3 deletions neural_compressor/adaptor/torch_utils/weight_only.py
@@ -706,7 +706,7 @@ def autoround_quantize(
dynamic_max_gap: int = -1,
data_type: str = "int", ##only support int for now
scale_dtype: str = "fp16",
- multimodal: bool = False,
+ quant_block_list: list = None,
act_bits: int = 32,
act_group_size: int = None,
act_sym: bool = None,
@@ -761,7 +761,7 @@ def autoround_quantize(
data_type (str): The data type to be used (default is "int").
scale_dtype (str): The data type of quantization scale to be used (default is "float32"), different kernels
have different choices.
- multimodal(bool): Enable multimodal model quantization, (default is "False").
+ quant_block_list (list): A list whose elements are list of block's layer names to be quantized.
act_bits (int): Number of bits for activation quantization. Default is 32.
act_group_size (int): Group size for activation quantization. Default is None.
act_sym (bool): Whether to use symmetric activation quantization. Default is None.
@@ -800,7 +800,7 @@ def autoround_quantize(
dynamic_max_gap=dynamic_max_gap,
data_type=data_type, ## only support data_type
scale_dtype=scale_dtype,
- multimodal=multimodal,
+ quant_block_list=quant_block_list,
act_bits=act_bits,
act_group_size=act_group_size,
act_sym=act_sym,
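The updated docstring describes quant_block_list as a nested list: each element is itself a list of the layer names forming one block to quantize. A short illustration of that shape, with made-up names that are not part of this commit:

# Shape illustration only; the layer names are hypothetical.
quant_block_list = [
    ["model.layers.0", "model.layers.1", "model.layers.2"],  # one block group: a list of block layer names
]
# The default of None presumably leaves block selection to AutoRound's own detection.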
10 changes: 5 additions & 5 deletions neural_compressor/torch/algorithms/weight_only/autoround.py
@@ -55,7 +55,7 @@ def __init__(
dynamic_max_gap: int = -1,
data_type: str = "int",
scale_dtype: str = "fp16",
- multimodal: bool = False,
+ quant_block_list: list = None,
act_bits: int = 32,
act_group_size: int = None,
act_sym: bool = None,
@@ -113,8 +113,8 @@ def __init__(
dynamic_max_gap (int): The dynamic maximum gap (default is -1).
data_type (str): The data type to be used (default is "int").
scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels
- have different choices.
- multimodal(bool): Enable multimodal model quantization, (default is "False").
+ have different choices.
+ quant_block_list (list): A list whose elements are list of block's layer names to be quantized.
act_bits (int): Number of bits for activation quantization. Default is 32.
act_group_size (int): Group size for activation quantization. Default is None.
act_sym (bool): Whether to use symmetric activation quantization. Default is None.
@@ -146,7 +146,7 @@ def __init__(
self.dynamic_max_gap = dynamic_max_gap
self.data_type = data_type
self.scale_dtype = scale_dtype
- self.multimodal = multimodal
+ self.quant_block_list = quant_block_list
self.act_bits = act_bits
self.act_group_size = act_group_size
self.act_sym = act_sym
@@ -202,7 +202,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
dynamic_max_gap=self.dynamic_max_gap,
data_type=self.data_type,
scale_dtype=self.scale_dtype,
- multimodal=self.multimodal,
+ quant_block_list=self.quant_block_list,
act_bits=self.act_bits,
act_group_size=self.act_group_size,
act_sym=self.act_sym,
13 changes: 9 additions & 4 deletions neural_compressor/torch/quantization/algorithm_entry.py
@@ -567,9 +567,14 @@ def autoround_quantize_entry(
        if quant_config.name != AUTOROUND or quant_config.dtype == "fp32":
            continue
        else:
+            dtype = quant_config.dtype
+            bits = quant_config.bits
+            if dtype != "int" and "int" in dtype:
+                bits = int(dtype.lstrip("int"))
+                dtype = "int"
            weight_config[op_name] = {
-                "data_type": quant_config.dtype,
-                "bits": quant_config.bits,
+                "data_type": dtype,
+                "bits": bits,
                "sym": quant_config.use_sym,
                "group_size": quant_config.group_size,
                "act_bits": quant_config.act_bits,
@@ -595,7 +600,7 @@
not_use_best_mse = quant_config.not_use_best_mse
dynamic_max_gap = quant_config.dynamic_max_gap
scale_dtype = quant_config.scale_dtype
- multimodal = quant_config.multimodal
+ quant_block_list = quant_config.quant_block_list
low_cpu_mem_usage = quant_config.use_layer_wise

kwargs.pop("example_inputs")
@@ -622,7 +627,7 @@
not_use_best_mse=not_use_best_mse,
dynamic_max_gap=dynamic_max_gap,
scale_dtype=scale_dtype,
- multimodal=multimodal,
+ quant_block_list=quant_block_list,
low_cpu_mem_usage=low_cpu_mem_usage,
)
model = quantizer.execute(model=model, mode=mode, *args, **kwargs)
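The new branch in autoround_quantize_entry normalizes dtype strings such as "int4" into the plain "int" data type plus a bit width parsed from the suffix, which is what the test added below exercises. A standalone restatement of that normalization, for illustration only:

# Standalone restatement of the dtype handling added above (illustration only).
def normalize_autoround_dtype(dtype: str, bits: int) -> tuple:
    """Map strings like "int4" to ("int", 4); leave a plain "int" untouched."""
    if dtype != "int" and "int" in dtype:
        bits = int(dtype.lstrip("int"))  # "int4" -> 4
        dtype = "int"
    return dtype, bits

assert normalize_autoround_dtype("int4", 8) == ("int", 4)  # bit width parsed from the suffix
assert normalize_autoround_dtype("int", 4) == ("int", 4)   # already normalized, passes through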
6 changes: 3 additions & 3 deletions neural_compressor/torch/quantization/config.py
@@ -917,7 +917,7 @@ def __init__(
dynamic_max_gap: int = -1,
scale_dtype: str = "fp16",
use_layer_wise: bool = False,
- multimodal: bool = False,
+ quant_block_list: list = None,
white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST,
):
"""Init AUTOROUND weight-only quantization config.
@@ -951,7 +951,7 @@ def __init__(
scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels
have different choices.
use_layer_wise (bool): Enables quantize model per layer. Defaults to False.
- multimodal(bool): Enable multimodal model quantization, (default is "False").
+ quant_block_list (list): A list whose elements are list of block's layer names to be quantized.
white_list (Optional[List[OP_NAME_OR_MODULE_TYPE]]): White list of operator names or module types.
Default is DEFAULT_WHITE_LIST.
"""
@@ -983,7 +983,7 @@ def __init__(
self.dynamic_max_gap = dynamic_max_gap
self.scale_dtype = scale_dtype
self.use_layer_wise = use_layer_wise
- self.multimodal = multimodal
+ self.quant_block_list = quant_block_list
self._post_init()

@classmethod
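On the user-facing side the renamed option travels through AutoRoundConfig. A hedged usage sketch, assuming the usual neural_compressor.torch.quantization import path and placeholder block names:

# Sketch only: the import path and block names are assumptions, not part of this commit.
from neural_compressor.torch.quantization import AutoRoundConfig

quant_config = AutoRoundConfig(
    dtype="int4",        # normalized to data_type="int", bits=4 by autoround_quantize_entry
    scale_dtype="fp32",
    use_layer_wise=False,
    quant_block_list=[["transformer.h.0", "transformer.h.1"]],  # takes the place of the removed `multimodal` flag
)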
17 changes: 17 additions & 0 deletions test/3x/torch/quantization/weight_only/test_autoround.py
@@ -80,6 +80,23 @@ def test_autoround(self, quant_lm_head):
        if quant_lm_head is True:
            assert isinstance(q_model.lm_head, WeightOnlyLinear), "quantization for lm_head failed."

+    def test_int4_dtype(self):
+        fp32_model = copy.deepcopy(self.gptj)
+        quant_config = AutoRoundConfig(dtype="int4", nsamples=32, seqlen=10, iters=10, scale_dtype="fp32")
+        logger.info(f"Test AutoRound with config {quant_config}")
+
+        # prepare + convert API
+        model = prepare(model=fp32_model, quant_config=quant_config)
+
+        run_fn(model, self.dataloader)
+        q_model = convert(model)
+        out = q_model(self.inp)[0]
+        assert torch.allclose(out, self.label, atol=1e-1)
+        assert "transformer.h.0.attn.k_proj" in q_model.autoround_config.keys()
+        assert "scale" in q_model.autoround_config["transformer.h.0.attn.k_proj"].keys()
+        assert torch.float32 == q_model.autoround_config["transformer.h.0.attn.k_proj"]["scale_dtype"]
+        assert isinstance(q_model.transformer.h[0].attn.k_proj, WeightOnlyLinear), "packing model failed."
+
    def test_autoround_with_quantize_API(self):
        gpt_j_model = copy.deepcopy(self.gptj)

2 changes: 1 addition & 1 deletion test/3x/torch/requirements.txt
@@ -1,4 +1,4 @@
- auto_round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c
+ auto_round @ git+https://github.com/intel/auto-round.git@e24b9074af6cdb099e31c92eb81b7f5e9a4a244e
expecttest
intel_extension_for_pytorch
numpy
2 changes: 1 addition & 1 deletion test/requirements.txt
@@ -1,6 +1,6 @@
--find-links https://download.pytorch.org/whl/torch_stable.html
accelerate==0.21.0
- auto-round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c
+ auto-round @ git+https://github.com/intel/auto-round.git@e24b9074af6cdb099e31c92eb81b7f5e9a4a244e
dynast==1.6.0rc1
horovod
intel-extension-for-pytorch
