Enable the tuning of WOQ algorithm #1328

Merged 22 commits on Oct 20, 2023

Changes from all commits (22 commits):
cb696fb
support WOQ algo tuning
Kaihui-intel Oct 17, 2023
1bb5d85
add WoqTuningParams docstring
Kaihui-intel Oct 17, 2023
84f4bc4
Merge branch 'kaihui/wo_tuning' of https://github.com/intel/neural-co…
Kaihui-intel Oct 17, 2023
9afdc8c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2023
25df777
Merge branch 'kaihui/wo_tuning' of https://github.com/intel/neural-co…
Kaihui-intel Oct 17, 2023
e5119ba
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2023
d0b1d4a
fix docstring
Kaihui-intel Oct 17, 2023
5f8cda8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2023
073f2cb
support woq to auto & add ut
Kaihui-intel Oct 18, 2023
f9b7d43
Merge branch 'kaihui/wo_tuning' of https://github.com/intel/neural-co…
Kaihui-intel Oct 18, 2023
223f9eb
add docstring for util
Kaihui-intel Oct 18, 2023
1bc7c62
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 18, 2023
1b8ac4a
Merge branch 'kaihui/wo_tuning' of https://github.com/intel/neural-co…
Kaihui-intel Oct 18, 2023
0aa5321
support woq tuning for onnxrt
yuwenzho Oct 19, 2023
ce2dac0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 19, 2023
d87ada6
update UT of onnxrt woq tuning
yuwenzho Oct 19, 2023
b411551
Merge branch 'master' into kaihui/wo_tuning
yuwenzho Oct 19, 2023
9e5537f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 19, 2023
1369c7f
update dataloader check for onnxrt woq
yuwenzho Oct 19, 2023
e95fb8b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 19, 2023
8ba8272
Merge branch 'master' into kaihui/wo_tuning
yuwenzho Oct 20, 2023
b409d1c
remove recipes from woq docstring
Kaihui-intel Oct 20, 2023
57 changes: 47 additions & 10 deletions neural_compressor/adaptor/onnxrt.py
@@ -1632,25 +1632,36 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
Returns:
(dict): quantized model
"""
if self.performance_only:
tmp_model = model
else:
try:
tmp_model = copy.deepcopy(model)
except Exception as e: # pragma: no cover
logger.warning("Fail to deep copy the model due to {}, inplace is used now.".format(repr(e)))
tmp_model = model

assert q_func is None, "quantization aware training has not been supported on ONNXRUNTIME"
for precision in self.query_handler.get_precisions():
if precision == "weight_only_integer":
self.quantizable_op_types += self.query_handler.get_op_types_by_precision(precision=precision)
self.quantizable_ops = self._query_quantizable_ops(model.model)
self.quantizable_ops = self._query_quantizable_ops(tmp_model.model)

self._update_tune_cfg(tune_cfg, tmp_model.model)
quant_config = self._cfg_to_quantize_config(tune_cfg)
algos = set([item["algorithm"] for key, item in quant_config.items() if isinstance(item, dict)])
if "GPTQ" in algos:
from neural_compressor.adaptor.ox_utils.weight_only import gptq_quantize

assert data_loader is not None, "GPTQ WOQ algorithm needs to pass 'calib_dataloader' to quantization.fit()"
percdamp = self.recipes.get("gptq_args", {}).get("percdamp", 0.01)
blocksize = self.recipes.get("gptq_args", {}).get("blocksize", 128)
actorder = self.recipes.get("gptq_args", {}).get("actorder", False)
mse = self.recipes.get("gptq_args", {}).get("mse", False)
perchannel = self.recipes.get("gptq_args", {}).get("perchannel", True)
calib_sampling_size = tune_cfg.get("calib_sampling_size", 1)
model = gptq_quantize(
model,
tmp_model = gptq_quantize(
tmp_model,
data_loader,
quant_config,
n_samples=calib_sampling_size,
@@ -1663,11 +1674,12 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
if "AWQ" in algos:
from neural_compressor.adaptor.ox_utils.weight_only import awq_quantize

assert data_loader is not None, "AWQ WOQ algorithm needs to pass 'calib_dataloader' to quantization.fit()"
enable_auto_scale = self.recipes.get("awq_args", {}).get("enable_auto_scale", True)
enable_mse_search = self.recipes.get("awq_args", {}).get("enable_mse_search", True)
calib_sampling_size = tune_cfg.get("calib_sampling_size", 1)
model = awq_quantize(
model,
tmp_model = awq_quantize(
tmp_model,
data_loader,
quant_config,
n_samples=calib_sampling_size,
@@ -1677,11 +1689,11 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
elif "RTN" in algos:
from neural_compressor.adaptor.ox_utils.weight_only import rtn_quantize

model = rtn_quantize(model, quant_config)
model.q_config = copy.deepcopy(quant_config)
self._dump_model_op_stats(model, tune_cfg)
model.topological_sort()
return model
tmp_model = rtn_quantize(tmp_model, quant_config)
tmp_model.q_config = copy.deepcopy(quant_config)
self._dump_model_op_stats(tmp_model, tune_cfg)
tmp_model.topological_sort()
return tmp_model

def _dump_model_op_stats(self, model, tune_cfg):
import re
@@ -1747,6 +1759,31 @@ def _cfg_to_quantize_config(self, tune_cfg):

return quantize_config

def _update_tune_cfg(self, tune_cfg, model):
"""Update tune cfg according to woq_tuning_cfg."""
if tune_cfg.get("woq_tuning_cfg") is None:
return tune_cfg

from neural_compressor.strategy.utils.constant import WOQ_TUNING_ALGOS

woq_tuning_cfg = tune_cfg.get("woq_tuning_cfg")
new_woq_cfg = WOQ_TUNING_ALGOS.get(woq_tuning_cfg)

for node_cfg in tune_cfg["op"].values():
node_cfg["weight"].update(
{cfg_name: cfg_value for cfg_name, cfg_value in new_woq_cfg.items() if cfg_name in node_cfg["weight"]}
)

# find last matmul and set to fp32
if "DISABLE_LAST_MATMUL" in woq_tuning_cfg:
last_matmul = None
fp32_op_cfg = {"weight": {"dtype": "fp32"}, "activation": {"dtype": "fp32", "quant_mode": "fp32"}}
for node in model.graph.node:
if node.op_type in ["MatMul"]:
last_matmul = (node.name, node.op_type)
if last_matmul in tune_cfg["op"]:
tune_cfg["op"][last_matmul].update(fp32_op_cfg)

def query_fw_capability(self, model):
"""The function is used to query framework capability.
TODO: will be replaced by framework query API
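Note on the new _update_tune_cfg helper above: below is a minimal, self-contained sketch of its merge logic, using hypothetical values rather than the adaptor's real data structures. A preset from WOQ_TUNING_ALGOS only overwrites keys that already exist in a node's weight config, and a preset whose name contains DISABLE_LAST_MATMUL additionally makes the adaptor keep the final MatMul in fp32.

# Sketch only: mirrors the dict-merge in _update_tune_cfg with hypothetical values.
WOQ_TUNING_ALGOS = {
    "GPTQ_G128ASYM": {"algorithm": "GPTQ", "group_size": 128, "scheme": "asym"},
}

node_weight_cfg = {"dtype": "int", "bits": 4, "group_size": 32, "scheme": "sym", "algorithm": "RTN"}
preset = WOQ_TUNING_ALGOS["GPTQ_G128ASYM"]

# Only keys already present in the node config are overwritten.
node_weight_cfg.update({k: v for k, v in preset.items() if k in node_weight_cfg})
print(node_weight_cfg)
# -> {'dtype': 'int', 'bits': 4, 'group_size': 128, 'scheme': 'asym', 'algorithm': 'GPTQ'}

# Presets are distinguished by name: a name containing "DISABLE_LAST_MATMUL" tells the
# adaptor to keep the last MatMul node in fp32.
print("DISABLE_LAST_MATMUL" in "GPTQ_G32ASYM_DISABLE_LAST_MATMUL")  # True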
6 changes: 6 additions & 0 deletions neural_compressor/strategy/auto.py
@@ -120,6 +120,12 @@ def next_tune_cfg(self):
op_tuning_cfg["calib_sampling_size"] = calib_sampling_size_lst[0]
if not self.cur_best_tuning_cfg:
self.cur_best_tuning_cfg = deepcopy(op_tuning_cfg)

# try to tune a WeightOnlyQuant algorithm
if self._should_tuning_woq_algo():
for tune_cfg in self.tuning_woq_algo(tuning_space, deepcopy(self.cur_best_tuning_cfg)):
yield tune_cfg

# try to tune sq alpha
if self._should_tuning_sq_alpha(self.config.recipes):
for tune_cfg in self.tuning_sq_alpha(tuning_space, deepcopy(self.cur_best_tuning_cfg), self.config.recipes):
6 changes: 6 additions & 0 deletions neural_compressor/strategy/basic.py
@@ -312,6 +312,12 @@ def next_tune_cfg(self):
stage1_max = 1e9 # TODO set a more appropriate value
if not self.cur_best_tuning_cfg:
self.cur_best_tuning_cfg = deepcopy(initial_op_tuning_cfg)

# try to tune a WeightOnlyQuant algorithm
if self._should_tuning_woq_algo():
for tune_cfg in self.tuning_woq_algo(tuning_space, deepcopy(self.cur_best_tuning_cfg)):
yield tune_cfg

# try to tune sq alpha
if self._should_tuning_sq_alpha(self.config.recipes):
for tune_cfg in self.tuning_sq_alpha(
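Both strategies wire the new stage in the same way: before the smooth-quant alpha stage, the generator yields one config per WOQ preset, so at most five extra trials are added. A rough, simplified sketch of that control flow follows (not the actual strategy class; the names are illustrative only):

def next_tune_cfg_sketch(cur_best_cfg, should_tune_woq, woq_presets):
    """Simplified stand-in for next_tune_cfg(): one extra trial per WOQ preset."""
    if should_tune_woq:
        for algo in woq_presets:  # e.g. "RTN_G32ASYM", "GPTQ_G32ASYM", ...
            cfg = dict(cur_best_cfg)
            cfg["woq_tuning_cfg"] = algo
            yield cfg
    # ... smooth-quant alpha tuning and the remaining stages would follow here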
36 changes: 36 additions & 0 deletions neural_compressor/strategy/strategy.py
@@ -46,6 +46,7 @@
DotDict,
LazyImport,
Statistics,
check_key_exist,
dump_table,
equal_dicts,
fault_tolerant_file,
@@ -1153,6 +1154,40 @@ def tuning_sq_alpha(self, tuning_space, tuning_cfg, recipes):
for tune_cfg in sq_sampler:
yield tune_cfg

def _should_tuning_woq_algo(self):
"""Currently, it's only available for the ORT backend with approach is weight_only.

It will be triggered when
a) quant_level is auto or quant_level is 1 && strategy is basic
b) and the "algorithm" is not set in op_type_dict
c) and woq will only trigger once
"""
return (
"onnx" in self.framework.lower()
and "weight_only" in self.config.approach
and not check_key_exist(self.config.op_type_dict, "algorithm")
and not check_key_exist(self.tuning_history, "woq_tuning_cfg")
)

def tuning_woq_algo(self, tuning_space, tuning_cfg):
"""Tuning weight only algorithm.

Args:
tuning_space: tuning space
tuning_cfg: the initial tuning config

Yields:
tuning config
"""
logger.info("[STRATEGY] Start tuning Weight Only Quant' algo.")
woq_sampler = tuning_sampler_dict.get_class("woq_algorithm")(tuning_space, [], tuning_cfg)
for tune_cfg in woq_sampler:
yield tune_cfg

logger.info(
"[Strategy] The best tuning config with WeightOnlyQuant is" f"{self.cur_best_tuning_cfg['woq_tuning_cfg']}."
)

def initial_dynamic_cfg_based_on_static_cfg(self, op_static_cfg: OpTuningConfig):
"""Init the dynamic tuning config according to the static config.

@@ -1322,6 +1357,7 @@ def _tune_cfg_converter(self, op_tuning_cfg):
# For not tuning recipe, tune cfg use it directly
tune_cfg["recipe_cfgs"].update(self._not_tuning_recipes_values)
tune_cfg["trial_number"] = deepcopy(self.trials_count)
tune_cfg.setdefault("woq_tuning_cfg", op_tuning_cfg.get("woq_tuning_cfg"))
# The sq-related args comes from user config, current best tuning config
# TODO simplify the logic for transforming the arguments
# update the sq-related args from self.cur_best_tuning_cfg
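To illustrate the trigger condition in _should_tuning_woq_algo, here is a small sketch assuming check_key_exist is importable from its new location in neural_compressor/utils/utility.py (added later in this diff); the configs are hypothetical. The WOQ stage is skipped as soon as the user pins an algorithm anywhere in op_type_dict:

from neural_compressor.utils.utility import check_key_exist

# Hypothetical user configs, for illustration only.
op_type_dict_without_algo = {".*": {"weight": {"bits": 8}}}
op_type_dict_with_algo = {".*": {"weight": {"algorithm": "RTN"}}}

print(check_key_exist(op_type_dict_without_algo, "algorithm"))  # False -> WOQ stage runs
print(check_key_exist(op_type_dict_with_algo, "algorithm"))     # True  -> WOQ stage is skipped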
10 changes: 10 additions & 0 deletions neural_compressor/strategy/utils/constant.py
@@ -16,6 +16,7 @@
# limitations under the License.
"""Strategy constant."""


PRECISION_LIST = ["bf16", "fp16", "fp32"]
QUANT_MODE_SET = {"static", "dynamic"}
LOWER_BIT_LIST = ["int4"]
@@ -56,3 +57,12 @@
"last_conv_or_matmul_quantization",
"pre_post_process_quantization",
}


WOQ_TUNING_ALGOS = {
"RTN_G32ASYM": {"algorithm": "RTN", "group_size": 32, "scheme": "asym"},
"GPTQ_G32ASYM": {"algorithm": "GPTQ", "group_size": 32, "scheme": "asym"},
"GPTQ_G32ASYM_DISABLE_LAST_MATMUL": {"algorithm": "GPTQ", "group_size": 32, "scheme": "asym"},
"GPTQ_G128ASYM": {"algorithm": "GPTQ", "group_size": 128, "scheme": "asym"},
"AWQ_G32ASYM": {"algorithm": "AWQ", "group_size": 32, "scheme": "asym"},
}
32 changes: 32 additions & 0 deletions neural_compressor/strategy/utils/tuning_sampler.py
@@ -23,6 +23,7 @@
from typing import Any, Dict, List, Tuple, Union

from ...utils import logger
from ..utils.constant import WOQ_TUNING_ALGOS
from .tuning_space import TuningSpace, pattern_to_internal, quant_mode_from_pattern
from .tuning_structs import OpTuningConfig
from .utility import ClassRegister
@@ -609,3 +610,34 @@ def __iter__(self):
recipe_cfgs["smooth_quant_args"] = {"alpha": alpha}
logger.debug(f"[STRATEGY] set smooth quant alpha with: {alpha:.4f}")
yield new_tune_cfg


@tuning_sampler_dict("woq_algorithm")
class WeightOnlyQuantSampler(TuningSampler):
"""Not displayed in API Docs."""

def __init__(
self,
tuning_space: TuningSpace,
tuning_order_lst: List[TuningOrder],
initial_op_tuning_cfg: Dict,
):
"""Init tuning sampler.

Args:
tuning_space: The tuning space.
tuning_order_lst: The traverse orders.
initial_op_tuning_cfg: The initialized tuning config.
"""
super().__init__(tuning_space, tuning_order_lst, initial_op_tuning_cfg)

def __iter__(self):
"""Yield the next tuning config.

Yields:
The next tuning config.
"""
new_tune_cfg = copy.deepcopy(self.initial_op_tuning_cfg)
for algo in WOQ_TUNING_ALGOS.keys():
new_tune_cfg["woq_tuning_cfg"] = algo
yield new_tune_cfg
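For a quick sanity check of the sampler's output, below is a standalone sketch of what __iter__ yields (the base config here is hypothetical; in practice it is the current best tuning config). Each yielded config is the base config stamped with one preset name, which _update_tune_cfg in the ONNX Runtime adaptor then expands into concrete per-op settings:

from copy import deepcopy
from neural_compressor.strategy.utils.constant import WOQ_TUNING_ALGOS

base_tune_cfg = {"calib_sampling_size": 100}  # hypothetical base config
for algo in WOQ_TUNING_ALGOS:
    cfg = deepcopy(base_tune_cfg)
    cfg["woq_tuning_cfg"] = algo  # the same stamping the sampler performs
    print(cfg["woq_tuning_cfg"])
# -> RTN_G32ASYM, GPTQ_G32ASYM, GPTQ_G32ASYM_DISABLE_LAST_MATMUL, GPTQ_G128ASYM, AWQ_G32ASYM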
31 changes: 31 additions & 0 deletions neural_compressor/utils/utility.py
@@ -1092,3 +1092,34 @@ def mse_metric_gap(fp32_tensor: Any, dequantize_tensor: Any) -> float:
diff_tensor = fp32_tensor_norm - dequantize_tensor_norm
euclidean_dist = np.sum(diff_tensor**2) # type: ignore
return euclidean_dist / fp32_tensor.size


def check_key_exist(data, key):
"""Recursively checks if a key exists in a dictionary or list.

Args:
data (dict or list): The dictionary or list to search.
key (any): The key to search for.

Returns:
bool: True if the key exists in the data structure, False otherwise.

Examples:
>>> check_key_exist({'a': 1, 'b': {'c': 2}}, 'c')
True
>>> check_key_exist([{'a': 1}, {'b': 2}], 'b')
True
>>> check_key_exist({'a': 1, 'b': [1, 2, 3]}, 'c')
False
"""
if isinstance(data, dict):
if key in data:
return True
for value in data.values():
if check_key_exist(value, key):
return True
elif isinstance(data, list):
for item in data:
if check_key_exist(item, key):
return True
return False
75 changes: 75 additions & 0 deletions test/adaptor/onnxrt_adaptor/test_weight_only_adaptor.py
@@ -1,3 +1,4 @@
import copy
import os
import shutil
import subprocess
@@ -9,6 +10,7 @@
from transformers import AutoTokenizer

from neural_compressor import PostTrainingQuantConfig, quantization
from neural_compressor.utils.constant import FP32


def Inference(model, data):
@@ -265,6 +267,79 @@ def test_GPTQ_quant(self):
]
self.assertTrue(len(rtn_op_names) + 1, len(gptq_op_names))

def _test_woq_tune_common(self, eval_func, quant_level=1, **kwargs):
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion

tuning_criterion = TuningCriterion(max_trials=5)

fp32_model = copy.deepcopy(self.model)
conf = PostTrainingQuantConfig(
approach="weight_only", quant_level=quant_level, tuning_criterion=tuning_criterion, **kwargs
)
q_model = quantization.fit(
fp32_model,
conf,
calib_dataloader=self.dataloader,
eval_func=eval_func,
)
self.assertIsNotNone(q_model)
return q_model

def _count_woq_matmul(self, q_model, bits=4, group_size=32):
op_names = [
i.name
for i in q_model.nodes()
if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(bits, group_size))
]
return len(op_names)

def test_woq_tune(self):
from functools import partial

def fake_eval(model, eval_result_lst):
acc = eval_result_lst.pop(0)
return acc

quant_levels = ["auto", 1]
for quant_level in quant_levels:
# Expect tuning ends with WOQ algorithm 'RTN_G32ASYM'
partial_fake_eval = partial(fake_eval, eval_result_lst=[1, 1.1])
woq_model_1 = self._test_woq_tune_common(partial_fake_eval, quant_level)
self.assertEqual(self._count_woq_matmul(woq_model_1), 31)

# Expect tuning ends with WOQ algorithm 'GPTQ_G32ASYM'
partial_fake_eval = partial(fake_eval, eval_result_lst=[1, 0.8, 1.1])
woq_model_2 = self._test_woq_tune_common(partial_fake_eval, quant_level)
self.assertEqual(self._count_woq_matmul(woq_model_2), 31)

# Expect tuning ends with WOQ algorithm 'GPTQ_G32ASYM_DISABLE_LAST_MATMUL'
partial_fake_eval = partial(fake_eval, eval_result_lst=[1, 0.8, 0.8, 1.1])
woq_model_3 = self._test_woq_tune_common(partial_fake_eval, quant_level)
self.assertEqual(self._count_woq_matmul(woq_model_3), 30)

# Expect tuning ends with WOQ algorithm 'GPTQ_G128ASYM'
partial_fake_eval = partial(fake_eval, eval_result_lst=[1, 0.8, 0.8, 0.8, 1.1])
woq_model_4 = self._test_woq_tune_common(partial_fake_eval, quant_level)
self.assertEqual(self._count_woq_matmul(woq_model_4, group_size=128), 31)

# Expect tuning ends with WOQ algorithm 'AWQ_G32ASYM'
partial_fake_eval = partial(fake_eval, eval_result_lst=[1, 0.8, 0.8, 0.8, 0.8, 1.1])
woq_model_5 = self._test_woq_tune_common(partial_fake_eval, quant_level)
self.assertEqual(self._count_woq_matmul(woq_model_5), 31)

# test WOQ tuning with fallback
partial_fake_eval = partial(fake_eval, eval_result_lst=[1, 1.1])
woq_model = self._test_woq_tune_common(
partial_fake_eval, "auto", op_name_dict={"/transformer/h.*/attn/k_proj/MatMul": FP32}
)
self.assertEqual(self._count_woq_matmul(woq_model), 26)

# test 8 bits WOQ
partial_fake_eval = partial(fake_eval, eval_result_lst=[1, 1.1])
woq_model = self._test_woq_tune_common(partial_fake_eval, "auto", op_type_dict={".*": {"weight": {"bits": 8}}})
self.assertEqual(self._count_woq_matmul(woq_model, bits=8), 31)


if __name__ == "__main__":
unittest.main()
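Putting it together, a minimal end-user sketch based on the new test above (the model path, calibration dataloader, and evaluation function are placeholders): with approach="weight_only" and no "algorithm" pinned in op_type_dict, the strategy tries the five WOQ presets automatically and keeps the best one according to eval_func.

from neural_compressor import PostTrainingQuantConfig, quantization
from neural_compressor.config import TuningCriterion

conf = PostTrainingQuantConfig(
    approach="weight_only",
    quant_level="auto",  # or quant_level=1 with the basic strategy
    tuning_criterion=TuningCriterion(max_trials=5),
)
q_model = quantization.fit(
    "/path/to/fp32_model.onnx",          # placeholder ONNX model
    conf,
    calib_dataloader=calib_dataloader,   # placeholder calibration dataloader
    eval_func=eval_func,                 # placeholder accuracy evaluation function
)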