From 935a86c55a9d969d577ae7fc931d4bd23a57c2ba Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Wed, 10 Apr 2024 22:04:11 +0800 Subject: [PATCH 1/4] fix tune_cfg issue for 3.x static quant Signed-off-by: Cheng, Zixuan --- .../torch/algorithms/smooth_quant/utility.py | 161 +------- .../algorithms/static_quant/static_quant.py | 8 +- .../torch/algorithms/static_quant/utility.py | 348 +++++++++++++++--- .../torch/quantization/config.py | 2 +- 4 files changed, 308 insertions(+), 211 deletions(-) diff --git a/neural_compressor/torch/algorithms/smooth_quant/utility.py b/neural_compressor/torch/algorithms/smooth_quant/utility.py index ceb2657b89a..e065ba28200 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/utility.py +++ b/neural_compressor/torch/algorithms/smooth_quant/utility.py @@ -16,13 +16,10 @@ import json import os import re -import subprocess from collections import UserDict -import cpuinfo import intel_extension_for_pytorch as ipex import numpy -import psutil import torch import tqdm from packaging.version import Version @@ -30,6 +27,7 @@ from neural_compressor.torch.algorithms.static_quant import ( TransformerBasedModelBlockPatternDetector, dump_model_op_stats, + generate_activation_observer, get_quantizable_ops_from_cfgs, ipex_config_path, paser_cfgs, @@ -42,99 +40,6 @@ ipex_ver = get_ipex_version() -def generate_activation_observer(scheme, algorithm, smooth_quant=False, smooth_quant_enable=False): # pragma: no cover - """This is a helper method to generate an activation observer. - - Args: - scheme (str): Quantization scheme to be used. - algorithm (str): What algorithm for computing the quantization parameters based on. - - Returns: - An observer. - """ - kl_activation_observer = { - "name": "HistogramObserver", - "bins": 2048, - "upsample_rate": 128, - "dtype": "torch.quint8", - "qscheme": "torch.per_tensor_affine", - "reduce_range": False, - "quant_min": 0, - "quant_max": 255, - } - minmax_activation_observer = { - "name": "MinMaxObserver", - "dtype": "torch.quint8", - "qscheme": "torch.per_tensor_affine", - "reduce_range": False, - "quant_min": 0, - "quant_max": 255, - } - smoothquant_kl_activation_observer = { - "name": "SmoothQuantActivationObserver", - "smooth_quant_enabled": smooth_quant_enable, - "dtype": "torch.quint8", - "qscheme": "torch.per_tensor_affine", - "reduce_range": False, - "quant_min": 0, - "quant_max": 255, - "alpha": 0.5, - "act_observer": kl_activation_observer, - "act_ic_observer": { - "name": "PerChannelMinMaxObserver", - "ch_axis": -1, - "dtype": "torch.quint8", - "qscheme": "torch.per_channel_affine", - "reduce_range": False, - "quant_min": 0, - "quant_max": 255, - }, - } - smoothquant_minmax_activation_observer = { - "name": "SmoothQuantActivationObserver", - "smooth_quant_enabled": smooth_quant_enable, - "dtype": "torch.quint8", - "qscheme": "torch.per_tensor_affine", - "reduce_range": False, - "quant_min": 0, - "quant_max": 255, - "alpha": 0.5, - "act_observer": minmax_activation_observer, - "act_ic_observer": { - "name": "PerChannelMinMaxObserver", - "ch_axis": -1, - "dtype": "torch.quint8", - "qscheme": "torch.per_channel_affine", - "reduce_range": False, - "quant_min": 0, - "quant_max": 255, - }, - } - REDUCE_RANGE = False if CpuInfo().vnni else True - if REDUCE_RANGE: - minmax_activation_observer["reduce_range"] = REDUCE_RANGE - kl_activation_observer["reduce_range"] = REDUCE_RANGE - if scheme == "sym": - minmax_activation_observer["qscheme"] = "torch.per_tensor_symmetric" - minmax_activation_observer["dtype"] = "torch.qint8" - 
minmax_activation_observer["quant_min"] = -128 - minmax_activation_observer["quant_max"] = 127 - kl_activation_observer["qscheme"] = "torch.per_tensor_symmetric" - kl_activation_observer["dtype"] = "torch.qint8" - kl_activation_observer["quant_min"] = -128 - kl_activation_observer["quant_max"] = 127 - if smooth_quant and smooth_quant_enable: - if algorithm == "kl": - return smoothquant_kl_activation_observer - if algorithm == "minmax": - return smoothquant_minmax_activation_observer - else: - if algorithm == "kl": - return kl_activation_observer - if algorithm == "minmax": - return minmax_activation_observer - - def check_cfg_and_qconfig( tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name, smooth_quant=False ): # pragma: no cover @@ -2275,67 +2180,3 @@ def forward(self, x): output = self.orig_layer(x) self.output = output return output - - -class CpuInfo(object): # pragma: no cover - """Get CPU Info.""" - - def __init__(self): - """Get whether the cpu numerical format is bf16, the number of sockets, cores and cores per socket.""" - self._bf16 = False - self._vnni = False - info = cpuinfo.get_cpu_info() - if "arch" in info and "X86" in info["arch"]: - cpuid = cpuinfo.CPUID() - max_extension_support = cpuid.get_max_extension_support() - if max_extension_support >= 7: - ecx = cpuid._run_asm( - b"\x31\xC9", # xor ecx, ecx - b"\xB8\x07\x00\x00\x00" b"\x0f\xa2" b"\x89\xC8" b"\xC3", # mov eax, 7 # cpuid # mov ax, cx # ret - ) - self._vnni = bool(ecx & (1 << 11)) - eax = cpuid._run_asm( - b"\xB9\x01\x00\x00\x00", # mov ecx, 1 - b"\xB8\x07\x00\x00\x00" b"\x0f\xa2" b"\xC3", # mov eax, 7 # cpuid # ret - ) - self._bf16 = bool(eax & (1 << 5)) - if "arch" in info and "ARM" in info["arch"]: # pragma: no cover - self._sockets = 1 - else: - self._sockets = self.get_number_of_sockets() - self._cores = psutil.cpu_count(logical=False) - self._cores_per_socket = int(self._cores / self._sockets) - - @property - def bf16(self): - """Get whether it is bf16.""" - return self._bf16 - - @property - def vnni(self): - """Get whether it is vnni.""" - return self._vnni - - @property - def cores_per_socket(self): - """Get the cores per socket.""" - return self._cores_per_socket - - def get_number_of_sockets(self) -> int: - """Get number of sockets in platform.""" - cmd = "cat /proc/cpuinfo | grep 'physical id' | sort -u | wc -l" - if psutil.WINDOWS: - cmd = r'wmic cpu get DeviceID | C:\Windows\System32\find.exe /C "CPU"' - - with subprocess.Popen( - args=cmd, - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - universal_newlines=False, - ) as proc: - proc.wait() - if proc.stdout: - for line in proc.stdout: - return int(line.decode("utf-8", errors="ignore").strip()) - return 0 diff --git a/neural_compressor/torch/algorithms/static_quant/static_quant.py b/neural_compressor/torch/algorithms/static_quant/static_quant.py index b3dccdafb00..c34c5265573 100644 --- a/neural_compressor/torch/algorithms/static_quant/static_quant.py +++ b/neural_compressor/torch/algorithms/static_quant/static_quant.py @@ -51,6 +51,12 @@ def static_quantize(model, tune_cfg, run_fn, example_inputs, inplace=True): Returns: A quantized model. 
""" + _, cfgs, default_cfgs, fuse_ops, op_infos_from_cfgs, output_tensor_id_op_name = get_quantizable_ops_recursively( + model, example_inputs + ) + qscheme = cfg_to_qconfig( + tune_cfg, cfgs, default_cfgs, fuse_ops, op_infos_from_cfgs, output_tensor_id_op_name + ) # update json file in ipex_config_path model.eval() if ipex_ver.release >= Version("1.12.0").release: @@ -80,8 +86,6 @@ def static_quantize(model, tune_cfg, run_fn, example_inputs, inplace=True): else: # pragma: no cover # for IPEX version < 1.12 - _, cfgs, default_cfgs, fuse_ops = get_quantizable_ops_recursively(model, example_inputs) - qscheme = cfg_to_qconfig(tune_cfg, cfgs, default_cfgs, fuse_ops) ipex_conf = ipex.quantization.QuantConf( configure_file=ipex_config_path, qscheme=qscheme ) # pylint: disable=E1101 diff --git a/neural_compressor/torch/algorithms/static_quant/utility.py b/neural_compressor/torch/algorithms/static_quant/utility.py index cdfd3cb72d0..0e0895024b6 100644 --- a/neural_compressor/torch/algorithms/static_quant/utility.py +++ b/neural_compressor/torch/algorithms/static_quant/utility.py @@ -16,8 +16,11 @@ import json import os import re +import subprocess from typing import Dict, List, Union +import cpuinfo +import psutil import torch from packaging.version import Version @@ -63,57 +66,238 @@ ] -def cfg_to_qconfig(tune_cfg, cfgs, default_cfgs, fuse_ops): # pragma: no cover +def cfg_to_qconfig( + tune_cfg, cfgs, default_cfgs, fuse_ops, op_infos_from_cfgs, output_tensor_id_op_name +): # pragma: no cover assert cfgs is not None, "No configure for IPEX int8 model..." - for key in tune_cfg["op"]: - try: - scheme = tune_cfg["op"][key]["activation"]["scheme"] - except: - scheme = "asym" - if scheme not in ["asym", "sym"]: - scheme = "asym" - break - for key in tune_cfg["op"]: - value = tune_cfg["op"][key] - pattern = get_pattern(key, fuse_ops) - assert isinstance(value, dict) - assert "activation" in value - if value["activation"]["dtype"] == "fp32": - if "weight" in value: - assert value["weight"]["dtype"] == "fp32" - for op_cfg in cfgs: - if op_cfg["id"] == key[0]: - if key[1] in ["relu_", "add_"]: - continue - num_inputs = len(op_cfg["inputs_quantized"]) - num_outputs = len(op_cfg["outputs_quantized"]) - for i_num in range(num_inputs): - op_cfg["inputs_quantized"][i_num] = False - for o_num in range(num_outputs): - op_cfg["outputs_quantized"][o_num] = False - if pattern: - if pattern[1] in ["relu_", "add_"]: + if ipex_ver.release < Version("1.12.0").release: # pragma: no cover + for key in tune_cfg["op"]: + try: + scheme = tune_cfg["op"][key]["activation"]["scheme"] + except: + scheme = "asym" + if scheme not in ["asym", "sym"]: + scheme = "asym" + break + for key in tune_cfg["op"]: + value = tune_cfg["op"][key] + pattern = get_pattern(key, fuse_ops) + assert isinstance(value, dict) + assert "activation" in value + if value["activation"]["dtype"] == "fp32": + if "weight" in value: + assert value["weight"]["dtype"] == "fp32" + for op_cfg in cfgs: + if op_cfg["id"] == key[0]: + if key[1] in ["relu_", "add_"]: continue - tune_cfg["op"][pattern]["activation"]["dtype"] = "fp32" - if "weight" in tune_cfg["op"][pattern]: - tune_cfg["op"][pattern]["weight"]["dtype"] = "fp32" + num_inputs = len(op_cfg["inputs_quantized"]) + num_outputs = len(op_cfg["outputs_quantized"]) + for i_num in range(num_inputs): + op_cfg["inputs_quantized"][i_num] = False + for o_num in range(num_outputs): + op_cfg["outputs_quantized"][o_num] = False + if pattern: + if pattern[1] in ["relu_", "add_"]: + continue + 
tune_cfg["op"][pattern]["activation"]["dtype"] = "fp32" + if "weight" in tune_cfg["op"][pattern]: + tune_cfg["op"][pattern]["weight"]["dtype"] = "fp32" + else: + for op_cfg in cfgs: + if op_cfg["id"] == key[0]: + if key[1] in ["relu_", "add_"]: + continue + num_inputs = len(op_cfg["inputs_quantized"]) + num_outputs = len(op_cfg["outputs_quantized"]) + for i_num in range(num_inputs): + op_cfg["inputs_quantized"][i_num] = default_cfgs[key[0]]["inputs_quantized"][i_num] + for o_num in range(num_outputs): + op_cfg["outputs_quantized"][o_num] = default_cfgs[key[0]]["outputs_quantized"][o_num] + with open(ipex_config_path, "w") as write_f: + json.dump(cfgs, write_f) + if scheme == "asym": + return torch.per_tensor_affine else: - for op_cfg in cfgs: - if op_cfg["id"] == key[0]: - if key[1] in ["relu_", "add_"]: - continue - num_inputs = len(op_cfg["inputs_quantized"]) - num_outputs = len(op_cfg["outputs_quantized"]) - for i_num in range(num_inputs): - op_cfg["inputs_quantized"][i_num] = default_cfgs[key[0]]["inputs_quantized"][i_num] - for o_num in range(num_outputs): - op_cfg["outputs_quantized"][o_num] = default_cfgs[key[0]]["outputs_quantized"][o_num] - with open(ipex_config_path, "w") as write_f: - json.dump(cfgs, write_f) - if scheme == "asym": - return torch.per_tensor_affine + return torch.per_tensor_symmetric + + else: + op_infos = copy.deepcopy(op_infos_from_cfgs) + cfgs = check_cfg_and_qconfig(tune_cfg["op"], cfgs, op_infos, output_tensor_id_op_name, smooth_quant=False) + + with open(ipex_config_path, "w") as write_f: + json.dump(cfgs, write_f, indent=4) + return None + + +def check_cfg_and_qconfig( + tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name, smooth_quant=False +): # pragma: no cover + """Check configs and quantization configs. + + Args: + tune_cfg (dict): dictionary of quantization configuration. + cfgs (dict): the input configs. + op_infos_from_cfgs (dict): op infos from configs. + output_tensor_ids_op_name (dict): dictionary of output tensor op names. + + Returns: + cfgs (dict). 
+ """ + for op_name in tune_cfg: + inc_op_cfg = tune_cfg[op_name] + for i, name in enumerate(op_name[0]): + # to int8 + ipex_op_cfg = op_infos_from_cfgs[name] + input_tensor_infos = ipex_op_cfg["input_tensor_infos"] + if op_name[1] == "Linear" or op_name[1] == "Linear&add": # record op_name for possible op-wise fallback + logger.debug(f"ipex_op_cfg['fqn'] - op_name {ipex_op_cfg['fqn']} {op_name}") + for index, input_tensor_info in enumerate(input_tensor_infos): + if "force_dtype" not in input_tensor_info.keys(): + continue + if ( + input_tensor_info["force_dtype"] == "torch.qint8" + or input_tensor_info["force_dtype"] == "torch.quint8" + ): + # int8 -> int8 + if inc_op_cfg["weight"]["dtype"] == "int8": + inc_scheme = inc_op_cfg["activation"]["scheme"] + inc_algorithm = inc_op_cfg["activation"]["algorithm"] + ipex_op_cfg["input_tensor_infos"] = input_tensor_infos + if ( + "op_type" in ipex_op_cfg + and ipex_op_cfg["op_type"] == "" + ): + smooth_quant_enable = True + else: + smooth_quant_enable = False + activation_observer = generate_activation_observer( + inc_scheme, inc_algorithm, smooth_quant, smooth_quant_enable + ) + if not smooth_quant: + if inc_scheme == "sym": + input_tensor_infos[index]["force_dtype"] = "torch.qint8" + if inc_scheme == "asym": + input_tensor_infos[index]["force_dtype"] = "torch.quint8" + ipex_op_cfg["activation_observer"] = activation_observer + # int8 -> fp32 + else: + input_tensor_infos[index]["force_dtype"] = "torch.float32" + # modify pre_op output inf_dtype + if i == 0: + input_tensor_id = input_tensor_info["id"] + input_tensor_dtype = input_tensor_info["force_dtype"] + if input_tensor_id in output_tensor_ids_op_name.keys(): + pre_op_name = output_tensor_ids_op_name[input_tensor_id] + pre_op_module = pre_op_name[0][0] + pre_op_state = pre_op_name[0][1] + pre_op_index = pre_op_name[0][2] + pre_op_infos = cfgs[pre_op_module][pre_op_state][pre_op_index] + pre_op_output_infos = pre_op_infos["output_tensor_infos"] + for index, pre_op_output in enumerate(pre_op_output_infos): + if pre_op_output["id"] == input_tensor_id: + pre_op_output_infos[index]["inf_dtype"] = input_tensor_dtype + else: + pass + pre_op_infos["output_tensor_infos"] = pre_op_output_infos + cfgs[pre_op_module][pre_op_state][pre_op_index] = pre_op_infos + else: + pass + cfgs[name[0]][name[1]][name[2]] = ipex_op_cfg + return cfgs + + +def generate_activation_observer(scheme, algorithm, smooth_quant=False, smooth_quant_enable=False): # pragma: no cover + """This is a helper method to generate an activation observer. + + Args: + scheme (str): Quantization scheme to be used. + algorithm (str): What algorithm for computing the quantization parameters based on. + + Returns: + An observer. 
+ """ + kl_activation_observer = { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": False, + "quant_min": 0, + "quant_max": 255, + } + minmax_activation_observer = { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": False, + "quant_min": 0, + "quant_max": 255, + } + smoothquant_kl_activation_observer = { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": smooth_quant_enable, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": False, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": kl_activation_observer, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": False, + "quant_min": 0, + "quant_max": 255, + }, + } + smoothquant_minmax_activation_observer = { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": smooth_quant_enable, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": False, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": minmax_activation_observer, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": False, + "quant_min": 0, + "quant_max": 255, + }, + } + REDUCE_RANGE = False if CpuInfo().vnni else True + if REDUCE_RANGE: + minmax_activation_observer["reduce_range"] = REDUCE_RANGE + kl_activation_observer["reduce_range"] = REDUCE_RANGE + if scheme == "sym": + minmax_activation_observer["qscheme"] = "torch.per_tensor_symmetric" + minmax_activation_observer["dtype"] = "torch.qint8" + minmax_activation_observer["quant_min"] = -128 + minmax_activation_observer["quant_max"] = 127 + kl_activation_observer["qscheme"] = "torch.per_tensor_symmetric" + kl_activation_observer["dtype"] = "torch.qint8" + kl_activation_observer["quant_min"] = -128 + kl_activation_observer["quant_max"] = 127 + if smooth_quant and smooth_quant_enable: + if algorithm == "kl": + return smoothquant_kl_activation_observer + if algorithm == "minmax": + return smoothquant_minmax_activation_observer else: - return torch.per_tensor_symmetric + if algorithm == "kl": + return kl_activation_observer + if algorithm == "minmax": + return minmax_activation_observer def get_quantizable_ops_recursively(model, example_inputs): # pragma: no cover @@ -176,6 +360,9 @@ def get_quantizable_ops_recursively(model, example_inputs): # pragma: no cover cfgs = json.load(f) default_cfgs = {} fuse_ops = [] + op_infos_from_cfgs = {} + output_tensor_id_op_name = {} + if ipex_ver.release < Version("1.12.0").release: # pragma: no cover default_cfgs = copy.deepcopy(cfgs) fuse_ops = get_fuse_ops(cfgs) @@ -191,6 +378,7 @@ def get_quantizable_ops_recursively(model, example_inputs): # pragma: no cover break if not re_flag: quantizable_ops.append((op_cfg["id"], op_cfg["name"])) + else: ( ops_name, @@ -242,7 +430,7 @@ def get_quantizable_ops_recursively(model, example_inputs): # pragma: no cover logger.info(attention_block) logger.info("FFN Blocks : ") logger.info(ffn_blocks) - return quantizable_ops, cfgs, default_cfgs, fuse_ops + return quantizable_ops, cfgs, default_cfgs, fuse_ops, op_infos_from_cfgs, output_tensor_id_op_name def simple_inference(q_model, example_inputs, iterations=1): @@ -671,3 +859,67 @@ def 
_group_block(detect_result): if ffn_block: ffn_block_lst.append(ffn_block) return attention_block_lst, ffn_block_lst + + +class CpuInfo(object): # pragma: no cover + """Get CPU Info.""" + + def __init__(self): + """Get whether the cpu numerical format is bf16, the number of sockets, cores and cores per socket.""" + self._bf16 = False + self._vnni = False + info = cpuinfo.get_cpu_info() + if "arch" in info and "X86" in info["arch"]: + cpuid = cpuinfo.CPUID() + max_extension_support = cpuid.get_max_extension_support() + if max_extension_support >= 7: + ecx = cpuid._run_asm( + b"\x31\xC9", # xor ecx, ecx + b"\xB8\x07\x00\x00\x00" b"\x0f\xa2" b"\x89\xC8" b"\xC3", # mov eax, 7 # cpuid # mov ax, cx # ret + ) + self._vnni = bool(ecx & (1 << 11)) + eax = cpuid._run_asm( + b"\xB9\x01\x00\x00\x00", # mov ecx, 1 + b"\xB8\x07\x00\x00\x00" b"\x0f\xa2" b"\xC3", # mov eax, 7 # cpuid # ret + ) + self._bf16 = bool(eax & (1 << 5)) + if "arch" in info and "ARM" in info["arch"]: # pragma: no cover + self._sockets = 1 + else: + self._sockets = self.get_number_of_sockets() + self._cores = psutil.cpu_count(logical=False) + self._cores_per_socket = int(self._cores / self._sockets) + + @property + def bf16(self): + """Get whether it is bf16.""" + return self._bf16 + + @property + def vnni(self): + """Get whether it is vnni.""" + return self._vnni + + @property + def cores_per_socket(self): + """Get the cores per socket.""" + return self._cores_per_socket + + def get_number_of_sockets(self) -> int: + """Get number of sockets in platform.""" + cmd = "cat /proc/cpuinfo | grep 'physical id' | sort -u | wc -l" + if psutil.WINDOWS: + cmd = r'wmic cpu get DeviceID | C:\Windows\System32\find.exe /C "CPU"' + + with subprocess.Popen( + args=cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=False, + ) as proc: + proc.wait() + if proc.stdout: + for line in proc.stdout: + return int(line.decode("utf-8", errors="ignore").strip()) + return 0 diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 29a8177e9be..38ecdd46b8d 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -818,7 +818,7 @@ def register_supported_configs(cls) -> List[OperatorConfig]: def get_model_info(model: torch.nn.Module, example_inputs) -> List[Tuple[str, Callable]]: from neural_compressor.torch.algorithms.static_quant import get_quantizable_ops_recursively - model_info, _, _, _ = get_quantizable_ops_recursively(model, example_inputs=example_inputs) + model_info, _, _, _, _, _ = get_quantizable_ops_recursively(model, example_inputs=example_inputs) return model_info @classmethod From bea7d972e726443bda992f0dceb4249772573e16 Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Thu, 11 Apr 2024 15:29:52 +0800 Subject: [PATCH 2/4] refine script Signed-off-by: Cheng, Zixuan --- .../torch/algorithms/static_quant/utility.py | 194 +++--------------- 1 file changed, 29 insertions(+), 165 deletions(-) diff --git a/neural_compressor/torch/algorithms/static_quant/utility.py b/neural_compressor/torch/algorithms/static_quant/utility.py index 0e0895024b6..bd45cff0890 100644 --- a/neural_compressor/torch/algorithms/static_quant/utility.py +++ b/neural_compressor/torch/algorithms/static_quant/utility.py @@ -16,11 +16,8 @@ import json import os import re -import subprocess from typing import Dict, List, Union -import cpuinfo -import psutil import torch from packaging.version import Version @@ -30,7 +27,7 @@ 
except: pass -from neural_compressor.common.utils import DEFAULT_WORKSPACE +from neural_compressor.common.utils import DEFAULT_WORKSPACE, CpuInfo from neural_compressor.torch.utils import get_ipex_version, get_torch_version, logger version = get_torch_version() @@ -123,29 +120,27 @@ def cfg_to_qconfig( else: op_infos = copy.deepcopy(op_infos_from_cfgs) - cfgs = check_cfg_and_qconfig(tune_cfg["op"], cfgs, op_infos, output_tensor_id_op_name, smooth_quant=False) + cfgs = check_cfg_and_qconfig(tune_cfg["op"], cfgs, op_infos, output_tensor_id_op_name) with open(ipex_config_path, "w") as write_f: json.dump(cfgs, write_f, indent=4) return None -def check_cfg_and_qconfig( - tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name, smooth_quant=False -): # pragma: no cover +def check_cfg_and_qconfig(user_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name): # pragma: no cover """Check configs and quantization configs. Args: - tune_cfg (dict): dictionary of quantization configuration. - cfgs (dict): the input configs. - op_infos_from_cfgs (dict): op infos from configs. - output_tensor_ids_op_name (dict): dictionary of output tensor op names. + user_cfg (dict): quantization configuration for ops. + cfgs (dict): configs loaded from ipex config path. + op_infos_from_cfgs (dict): dict containing configs that have been parsed for each op. + output_tensor_ids_op_name (dict): dict containing op names corresponding to 'op_infos_from_cfgs'. Returns: - cfgs (dict). + cfgs (dict): updated configs. """ - for op_name in tune_cfg: - inc_op_cfg = tune_cfg[op_name] + for op_name in user_cfg: + inc_op_cfg = user_cfg[op_name] for i, name in enumerate(op_name[0]): # to int8 ipex_op_cfg = op_infos_from_cfgs[name] @@ -164,21 +159,11 @@ def check_cfg_and_qconfig( inc_scheme = inc_op_cfg["activation"]["scheme"] inc_algorithm = inc_op_cfg["activation"]["algorithm"] ipex_op_cfg["input_tensor_infos"] = input_tensor_infos - if ( - "op_type" in ipex_op_cfg - and ipex_op_cfg["op_type"] == "" - ): - smooth_quant_enable = True - else: - smooth_quant_enable = False - activation_observer = generate_activation_observer( - inc_scheme, inc_algorithm, smooth_quant, smooth_quant_enable - ) - if not smooth_quant: - if inc_scheme == "sym": - input_tensor_infos[index]["force_dtype"] = "torch.qint8" - if inc_scheme == "asym": - input_tensor_infos[index]["force_dtype"] = "torch.quint8" + activation_observer = generate_activation_observer(inc_scheme, inc_algorithm) + if inc_scheme == "sym": + input_tensor_infos[index]["force_dtype"] = "torch.qint8" + if inc_scheme == "asym": + input_tensor_infos[index]["force_dtype"] = "torch.quint8" ipex_op_cfg["activation_observer"] = activation_observer # int8 -> fp32 else: @@ -207,74 +192,23 @@ def check_cfg_and_qconfig( return cfgs -def generate_activation_observer(scheme, algorithm, smooth_quant=False, smooth_quant_enable=False): # pragma: no cover - """This is a helper method to generate an activation observer. +def generate_activation_observer(scheme, algorithm): # pragma: no cover + """This is a helper method to generate a dict containing activation observer info. Args: scheme (str): Quantization scheme to be used. algorithm (str): What algorithm for computing the quantization parameters based on. Returns: - An observer. 
+ A dict containing observer info.zs """ - kl_activation_observer = { - "name": "HistogramObserver", - "bins": 2048, - "upsample_rate": 128, - "dtype": "torch.quint8", - "qscheme": "torch.per_tensor_affine", - "reduce_range": False, - "quant_min": 0, - "quant_max": 255, - } - minmax_activation_observer = { - "name": "MinMaxObserver", - "dtype": "torch.quint8", - "qscheme": "torch.per_tensor_affine", - "reduce_range": False, - "quant_min": 0, - "quant_max": 255, - } - smoothquant_kl_activation_observer = { - "name": "SmoothQuantActivationObserver", - "smooth_quant_enabled": smooth_quant_enable, - "dtype": "torch.quint8", - "qscheme": "torch.per_tensor_affine", - "reduce_range": False, - "quant_min": 0, - "quant_max": 255, - "alpha": 0.5, - "act_observer": kl_activation_observer, - "act_ic_observer": { - "name": "PerChannelMinMaxObserver", - "ch_axis": -1, - "dtype": "torch.quint8", - "qscheme": "torch.per_channel_affine", - "reduce_range": False, - "quant_min": 0, - "quant_max": 255, - }, - } - smoothquant_minmax_activation_observer = { - "name": "SmoothQuantActivationObserver", - "smooth_quant_enabled": smooth_quant_enable, - "dtype": "torch.quint8", - "qscheme": "torch.per_tensor_affine", - "reduce_range": False, - "quant_min": 0, - "quant_max": 255, - "alpha": 0.5, - "act_observer": minmax_activation_observer, - "act_ic_observer": { - "name": "PerChannelMinMaxObserver", - "ch_axis": -1, - "dtype": "torch.quint8", - "qscheme": "torch.per_channel_affine", - "reduce_range": False, - "quant_min": 0, - "quant_max": 255, - }, - } + from intel_extension_for_pytorch.quantization._utils import _get_observer_setting + + kl_activation_observer = _get_observer_setting(torch.quantization.HistogramObserver(reduce_range=False)) + minmax_activation_observer = _get_observer_setting( + torch.quantization.MinMaxObserver(qscheme=torch.per_tensor_affine, dtype=torch.quint8) + ) + REDUCE_RANGE = False if CpuInfo().vnni else True if REDUCE_RANGE: minmax_activation_observer["reduce_range"] = REDUCE_RANGE @@ -288,16 +222,10 @@ def generate_activation_observer(scheme, algorithm, smooth_quant=False, smooth_q kl_activation_observer["dtype"] = "torch.qint8" kl_activation_observer["quant_min"] = -128 kl_activation_observer["quant_max"] = 127 - if smooth_quant and smooth_quant_enable: - if algorithm == "kl": - return smoothquant_kl_activation_observer - if algorithm == "minmax": - return smoothquant_minmax_activation_observer - else: - if algorithm == "kl": - return kl_activation_observer - if algorithm == "minmax": - return minmax_activation_observer + if algorithm == "kl": + return kl_activation_observer + if algorithm == "minmax": + return minmax_activation_observer def get_quantizable_ops_recursively(model, example_inputs): # pragma: no cover @@ -859,67 +787,3 @@ def _group_block(detect_result): if ffn_block: ffn_block_lst.append(ffn_block) return attention_block_lst, ffn_block_lst - - -class CpuInfo(object): # pragma: no cover - """Get CPU Info.""" - - def __init__(self): - """Get whether the cpu numerical format is bf16, the number of sockets, cores and cores per socket.""" - self._bf16 = False - self._vnni = False - info = cpuinfo.get_cpu_info() - if "arch" in info and "X86" in info["arch"]: - cpuid = cpuinfo.CPUID() - max_extension_support = cpuid.get_max_extension_support() - if max_extension_support >= 7: - ecx = cpuid._run_asm( - b"\x31\xC9", # xor ecx, ecx - b"\xB8\x07\x00\x00\x00" b"\x0f\xa2" b"\x89\xC8" b"\xC3", # mov eax, 7 # cpuid # mov ax, cx # ret - ) - self._vnni = bool(ecx & (1 << 11)) - eax = 
cpuid._run_asm( - b"\xB9\x01\x00\x00\x00", # mov ecx, 1 - b"\xB8\x07\x00\x00\x00" b"\x0f\xa2" b"\xC3", # mov eax, 7 # cpuid # ret - ) - self._bf16 = bool(eax & (1 << 5)) - if "arch" in info and "ARM" in info["arch"]: # pragma: no cover - self._sockets = 1 - else: - self._sockets = self.get_number_of_sockets() - self._cores = psutil.cpu_count(logical=False) - self._cores_per_socket = int(self._cores / self._sockets) - - @property - def bf16(self): - """Get whether it is bf16.""" - return self._bf16 - - @property - def vnni(self): - """Get whether it is vnni.""" - return self._vnni - - @property - def cores_per_socket(self): - """Get the cores per socket.""" - return self._cores_per_socket - - def get_number_of_sockets(self) -> int: - """Get number of sockets in platform.""" - cmd = "cat /proc/cpuinfo | grep 'physical id' | sort -u | wc -l" - if psutil.WINDOWS: - cmd = r'wmic cpu get DeviceID | C:\Windows\System32\find.exe /C "CPU"' - - with subprocess.Popen( - args=cmd, - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - universal_newlines=False, - ) as proc: - proc.wait() - if proc.stdout: - for line in proc.stdout: - return int(line.decode("utf-8", errors="ignore").strip()) - return 0 From a29addc9c908b7dd15f71f04f3c1a1a08cdf7a04 Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Thu, 11 Apr 2024 16:27:16 +0800 Subject: [PATCH 3/4] minor fix Signed-off-by: Cheng, Zixuan --- .../torch/algorithms/static_quant/utility.py | 45 +++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/neural_compressor/torch/algorithms/static_quant/utility.py b/neural_compressor/torch/algorithms/static_quant/utility.py index bd45cff0890..6656435e209 100644 --- a/neural_compressor/torch/algorithms/static_quant/utility.py +++ b/neural_compressor/torch/algorithms/static_quant/utility.py @@ -159,7 +159,16 @@ def check_cfg_and_qconfig(user_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_ inc_scheme = inc_op_cfg["activation"]["scheme"] inc_algorithm = inc_op_cfg["activation"]["algorithm"] ipex_op_cfg["input_tensor_infos"] = input_tensor_infos - activation_observer = generate_activation_observer(inc_scheme, inc_algorithm) + if ( + "op_type" in ipex_op_cfg + and ipex_op_cfg["op_type"] == "" + ): + smooth_quant_enable = True + else: + smooth_quant_enable = False + activation_observer = generate_activation_observer( + inc_scheme, inc_algorithm, smooth_quant=False, smooth_quant_enable=smooth_quant_enable + ) if inc_scheme == "sym": input_tensor_infos[index]["force_dtype"] = "torch.qint8" if inc_scheme == "asym": @@ -192,7 +201,7 @@ def check_cfg_and_qconfig(user_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_ return cfgs -def generate_activation_observer(scheme, algorithm): # pragma: no cover +def generate_activation_observer(scheme, algorithm, smooth_quant=False, smooth_quant_enable=False): # pragma: no cover """This is a helper method to generate a dict containing activation observer info. 
Args: @@ -202,11 +211,25 @@ def generate_activation_observer(scheme, algorithm): # pragma: no cover Returns: A dict containing observer info.zs """ + from intel_extension_for_pytorch.quantization._smooth_quant import SmoothQuantActivationObserver from intel_extension_for_pytorch.quantization._utils import _get_observer_setting + from torch.quantization import HistogramObserver, MinMaxObserver - kl_activation_observer = _get_observer_setting(torch.quantization.HistogramObserver(reduce_range=False)) + kl_activation_observer = _get_observer_setting(HistogramObserver(reduce_range=False)) minmax_activation_observer = _get_observer_setting( - torch.quantization.MinMaxObserver(qscheme=torch.per_tensor_affine, dtype=torch.quint8) + MinMaxObserver(qscheme=torch.per_tensor_affine, dtype=torch.quint8) + ) + smoothquant_kl_activation_observer = _get_observer_setting( + SmoothQuantActivationObserver( + reduce_range=False, + smooth_quant_enabled=smooth_quant_enable, + ) + ) + smoothquant_minmax_activation_observer = _get_observer_setting( + SmoothQuantActivationObserver( + reduce_range=False, + smooth_quant_enabled=smooth_quant_enable, + ) ) REDUCE_RANGE = False if CpuInfo().vnni else True @@ -222,10 +245,16 @@ def generate_activation_observer(scheme, algorithm): # pragma: no cover kl_activation_observer["dtype"] = "torch.qint8" kl_activation_observer["quant_min"] = -128 kl_activation_observer["quant_max"] = 127 - if algorithm == "kl": - return kl_activation_observer - if algorithm == "minmax": - return minmax_activation_observer + if smooth_quant and smooth_quant_enable: + if algorithm == "kl": + return smoothquant_kl_activation_observer + if algorithm == "minmax": + return smoothquant_minmax_activation_observer + else: + if algorithm == "kl": + return kl_activation_observer + if algorithm == "minmax": + return minmax_activation_observer def get_quantizable_ops_recursively(model, example_inputs): # pragma: no cover From 5e32461b1a2ffa566cc565fde595589487c94718 Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Mon, 15 Apr 2024 17:47:45 +0800 Subject: [PATCH 4/4] remove ipex<1.12 Signed-off-by: Cheng, Zixuan --- .../torch/algorithms/smooth_quant/utility.py | 129 +---------- .../algorithms/static_quant/static_quant.py | 68 +++--- .../torch/algorithms/static_quant/utility.py | 208 ++++-------------- .../torch/quantization/config.py | 2 +- 4 files changed, 70 insertions(+), 337 deletions(-) diff --git a/neural_compressor/torch/algorithms/smooth_quant/utility.py b/neural_compressor/torch/algorithms/smooth_quant/utility.py index e065ba28200..3448d705ea7 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/utility.py +++ b/neural_compressor/torch/algorithms/smooth_quant/utility.py @@ -28,11 +28,9 @@ TransformerBasedModelBlockPatternDetector, dump_model_op_stats, generate_activation_observer, - get_quantizable_ops_from_cfgs, + get_quantizable_ops_recursively, ipex_config_path, - paser_cfgs, simple_inference, - unify_op_type_mapping_ipex, ) from neural_compressor.torch.utils import get_ipex_version, get_torch_version, logger @@ -128,131 +126,6 @@ def cfg_to_qconfig( return None -def get_quantizable_ops_recursively(model, example_inputs): # pragma: no cover - """Get all quantizable ops from model. - - Args: - model (object): input model - example_inputs (dict|list|tuple|torch.Tensor): used to trace torch model. - Returns: - quantizable_ops (list): list of tuples of op_name and op_type. 
- cfgs (dict): dict of configuration - """ - quantizable_ops = [] - # group ops by position for transform-based model - detector = TransformerBasedModelBlockPatternDetector(model) - detect_result = detector.detect_block() - attention_block = detect_result.get("attention_blocks", None) - ffn_blocks = detect_result.get("ffn_blocks", None) - logger.info(f"Attention Blocks: {len(attention_block)}") - logger.info(f"FFN Blocks: {len(ffn_blocks)}") - if not os.path.exists(ipex_config_path): - assert isinstance(model, torch.nn.Module), "The model passed in is not the instance of torch.nn.Module" - - if hasattr(model, "save_qconf_summary"): # pragma: no cover - os.makedirs(os.path.dirname(ipex_config_path), exist_ok=True) - model.save_qconf_summary(qconf_summary=ipex_config_path) - else: - model.eval() - - # create a quantization config file for intel pytorch extension model - os.makedirs(os.path.dirname(ipex_config_path), exist_ok=True) - assert example_inputs is not None, "IPEX need q_dataloader or example_inputs to prepare the model" - from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig - - if ipex_ver.release >= Version("2.1").release: - # HistogramObserver will cause a performance issue. - # static_qconfig = ipex.quantization.default_static_qconfig_mapping - qconfig = QConfig( - activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8), - weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric), - ) - from torch.ao.quantization import QConfigMapping - - static_qconfig = QConfigMapping().set_global(qconfig) - else: - static_qconfig = QConfig( - activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8), - weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric), - ) - - if isinstance(example_inputs, dict): - model = ipex.quantization.prepare(model, static_qconfig, example_kwarg_inputs=example_inputs, inplace=True) - else: - model = ipex.quantization.prepare(model, static_qconfig, example_inputs=example_inputs, inplace=True) - simple_inference(model, example_inputs, iterations=1) - model.save_qconf_summary(qconf_summary=ipex_config_path) - - map_op_name_to_fqn = {} - with open(ipex_config_path, "r") as f: - cfgs = json.load(f) - if ipex_ver.release < Version("1.12.0").release: # pragma: no cover - for op_cfg in cfgs: - if op_cfg["name"] in unify_op_type_mapping_ipex: - quantizable_ops.append((op_cfg["id"], unify_op_type_mapping_ipex[op_cfg["name"]])) - else: - re_flag = False - for pattern, unify_op_type in unify_op_type_mapping_ipex["re"].items(): - if re.match(pattern, op_cfg["name"]): - re_flag = True - quantizable_ops.append((op_cfg["id"], unify_op_type)) - break - if not re_flag: - quantizable_ops.append((op_cfg["id"], op_cfg["name"])) - else: - ( - ops_name, - op_infos_from_cfgs, - input_tensor_id_op_name, - output_tensor_id_op_name, - ) = paser_cfgs(cfgs) - quantizable_op_names = get_quantizable_ops_from_cfgs(ops_name, op_infos_from_cfgs, input_tensor_id_op_name) - for name in quantizable_op_names: - # name : list - if len(name) == 1: - module_key = name[0][0] - op_cfg_id = name[0][2] - ipex_op_type = cfgs[module_key]["q_op_infos"][op_cfg_id]["op_type"] - module_fqn = cfgs[module_key]["q_op_infos"][op_cfg_id].get("fqn", None) - - if ipex_op_type in unify_op_type_mapping_ipex: - quantizable_ops.append((tuple(name), unify_op_type_mapping_ipex[ipex_op_type])) - map_op_name_to_fqn[(tuple(name), ipex_op_type)] = 
module_fqn - else: - re_flag = False - for pattern, unify_op_type in unify_op_type_mapping_ipex["re"].items(): - if re.match(pattern, ipex_op_type): - re_flag = True - quantizable_ops.append((tuple(name), unify_op_type)) - map_op_name_to_fqn[(tuple(name), unify_op_type)] = module_fqn - break - if not re_flag: - quantizable_ops.append((tuple(name), ipex_op_type)) - map_op_name_to_fqn[(tuple(name), ipex_op_type)] = module_fqn - else: - op_type = "" - for op_name in name: - module_key = op_name[0] - op_cfg_id = op_name[2] - single_op_type = cfgs[module_key]["q_op_infos"][op_cfg_id]["op_type"] - if single_op_type in unify_op_type_mapping_ipex: - single_op_type = unify_op_type_mapping_ipex[single_op_type] - op_type += "&" + single_op_type if op_type else single_op_type - quantizable_ops.append((tuple(name), op_type)) - _module_key = name[0][0] - _op_cfg_id = name[0][2] - module_fqn = cfgs[_module_key]["q_op_infos"][_op_cfg_id]["fqn"] - map_op_name_to_fqn[(tuple(name), op_type)] = module_fqn - - logger.debug("Map op name to fqn: ") - logger.debug(map_op_name_to_fqn) - logger.info("Attention Blocks : ") - logger.info(attention_block) - logger.info("FFN Blocks : ") - logger.info(ffn_blocks) - return quantizable_ops, cfgs, op_infos_from_cfgs, output_tensor_id_op_name - - def get_parent(node, all_parents=False): # pragma: no cover if node.inputs() is None: return None diff --git a/neural_compressor/torch/algorithms/static_quant/static_quant.py b/neural_compressor/torch/algorithms/static_quant/static_quant.py index c34c5265573..626d0f60a2e 100644 --- a/neural_compressor/torch/algorithms/static_quant/static_quant.py +++ b/neural_compressor/torch/algorithms/static_quant/static_quant.py @@ -51,54 +51,38 @@ def static_quantize(model, tune_cfg, run_fn, example_inputs, inplace=True): Returns: A quantized model. """ - _, cfgs, default_cfgs, fuse_ops, op_infos_from_cfgs, output_tensor_id_op_name = get_quantizable_ops_recursively( - model, example_inputs - ) - qscheme = cfg_to_qconfig( - tune_cfg, cfgs, default_cfgs, fuse_ops, op_infos_from_cfgs, output_tensor_id_op_name - ) # update json file in ipex_config_path + _, cfgs, op_infos_from_cfgs, output_tensor_id_op_name = get_quantizable_ops_recursively(model, example_inputs) + cfg_to_qconfig(tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_id_op_name) # update json file in ipex_config_path model.eval() - if ipex_ver.release >= Version("1.12.0").release: - # Check save_qconf_summary part is a workaround for IPEX bug. 
- # Sometimes the prepared model from get_op_capablitiy loss this attribute - if not hasattr(model, "save_qconf_summary") or not hasattr(model, "load_qconf_summary"): - from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig - - if ipex_ver.release >= Version("2.1").release: - static_qconfig = ipex.quantization.default_static_qconfig_mapping - else: - static_qconfig = QConfig( - activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8), - weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric), - ) - if isinstance(example_inputs, dict): - model = ipex.quantization.prepare( - model, static_qconfig, example_kwarg_inputs=example_inputs, inplace=inplace - ) - else: - model = ipex.quantization.prepare(model, static_qconfig, example_inputs=example_inputs, inplace=inplace) - - model.load_qconf_summary(qconf_summary=ipex_config_path) - run_fn(model) - model.save_qconf_summary(qconf_summary=ipex_config_path) - model = _ipex_post_quant_process(model, example_inputs, inplace=inplace) - - else: # pragma: no cover - # for IPEX version < 1.12 - ipex_conf = ipex.quantization.QuantConf( - configure_file=ipex_config_path, qscheme=qscheme - ) # pylint: disable=E1101 - run_fn(model) - ipex_conf.save(ipex_config_path) - ipex_conf = ipex.quantization.QuantConf(ipex_config_path) # pylint: disable=E1101 - model = ipex.quantization.convert(model, ipex_conf, example_inputs, inplace=True) # pylint: disable=E1121 + # Check save_qconf_summary part is a workaround for IPEX bug. + # Sometimes the prepared model from get_op_capablitiy loss this attribute + if not hasattr(model, "save_qconf_summary") or not hasattr(model, "load_qconf_summary"): + from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig + + if ipex_ver.release >= Version("2.1").release: + static_qconfig = ipex.quantization.default_static_qconfig_mapping + else: + static_qconfig = QConfig( + activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8), + weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric), + ) + if isinstance(example_inputs, dict): + model = ipex.quantization.prepare( + model, static_qconfig, example_kwarg_inputs=example_inputs, inplace=inplace + ) + else: + model = ipex.quantization.prepare(model, static_qconfig, example_inputs=example_inputs, inplace=inplace) + + model.load_qconf_summary(qconf_summary=ipex_config_path) + run_fn(model) + model.save_qconf_summary(qconf_summary=ipex_config_path) + model = _ipex_post_quant_process(model, example_inputs, inplace=inplace) with open(ipex_config_path, "r") as f: model.tune_cfg = json.load(f) model.ipex_config_path = ipex_config_path - if ipex_ver.release >= Version("1.12.0").release: - dump_model_op_stats(tune_cfg) + dump_model_op_stats(tune_cfg) return model diff --git a/neural_compressor/torch/algorithms/static_quant/utility.py b/neural_compressor/torch/algorithms/static_quant/utility.py index 6656435e209..dd073f50aab 100644 --- a/neural_compressor/torch/algorithms/static_quant/utility.py +++ b/neural_compressor/torch/algorithms/static_quant/utility.py @@ -63,68 +63,12 @@ ] -def cfg_to_qconfig( - tune_cfg, cfgs, default_cfgs, fuse_ops, op_infos_from_cfgs, output_tensor_id_op_name -): # pragma: no cover +def cfg_to_qconfig(tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_id_op_name): # pragma: no cover assert cfgs is not None, "No configure for IPEX int8 model..." 
- if ipex_ver.release < Version("1.12.0").release: # pragma: no cover - for key in tune_cfg["op"]: - try: - scheme = tune_cfg["op"][key]["activation"]["scheme"] - except: - scheme = "asym" - if scheme not in ["asym", "sym"]: - scheme = "asym" - break - for key in tune_cfg["op"]: - value = tune_cfg["op"][key] - pattern = get_pattern(key, fuse_ops) - assert isinstance(value, dict) - assert "activation" in value - if value["activation"]["dtype"] == "fp32": - if "weight" in value: - assert value["weight"]["dtype"] == "fp32" - for op_cfg in cfgs: - if op_cfg["id"] == key[0]: - if key[1] in ["relu_", "add_"]: - continue - num_inputs = len(op_cfg["inputs_quantized"]) - num_outputs = len(op_cfg["outputs_quantized"]) - for i_num in range(num_inputs): - op_cfg["inputs_quantized"][i_num] = False - for o_num in range(num_outputs): - op_cfg["outputs_quantized"][o_num] = False - if pattern: - if pattern[1] in ["relu_", "add_"]: - continue - tune_cfg["op"][pattern]["activation"]["dtype"] = "fp32" - if "weight" in tune_cfg["op"][pattern]: - tune_cfg["op"][pattern]["weight"]["dtype"] = "fp32" - else: - for op_cfg in cfgs: - if op_cfg["id"] == key[0]: - if key[1] in ["relu_", "add_"]: - continue - num_inputs = len(op_cfg["inputs_quantized"]) - num_outputs = len(op_cfg["outputs_quantized"]) - for i_num in range(num_inputs): - op_cfg["inputs_quantized"][i_num] = default_cfgs[key[0]]["inputs_quantized"][i_num] - for o_num in range(num_outputs): - op_cfg["outputs_quantized"][o_num] = default_cfgs[key[0]]["outputs_quantized"][o_num] - with open(ipex_config_path, "w") as write_f: - json.dump(cfgs, write_f) - if scheme == "asym": - return torch.per_tensor_affine - else: - return torch.per_tensor_symmetric - - else: - op_infos = copy.deepcopy(op_infos_from_cfgs) - cfgs = check_cfg_and_qconfig(tune_cfg["op"], cfgs, op_infos, output_tensor_id_op_name) - - with open(ipex_config_path, "w") as write_f: - json.dump(cfgs, write_f, indent=4) - return None + op_infos = copy.deepcopy(op_infos_from_cfgs) + cfgs = check_cfg_and_qconfig(tune_cfg["op"], cfgs, op_infos, output_tensor_id_op_name) + with open(ipex_config_path, "w") as write_f: + json.dump(cfgs, write_f, indent=4) def check_cfg_and_qconfig(user_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name): # pragma: no cover @@ -315,71 +259,49 @@ def get_quantizable_ops_recursively(model, example_inputs): # pragma: no cover map_op_name_to_fqn = {} with open(ipex_config_path, "r") as f: cfgs = json.load(f) - default_cfgs = {} - fuse_ops = [] - op_infos_from_cfgs = {} - output_tensor_id_op_name = {} - - if ipex_ver.release < Version("1.12.0").release: # pragma: no cover - default_cfgs = copy.deepcopy(cfgs) - fuse_ops = get_fuse_ops(cfgs) - for op_cfg in cfgs: - if op_cfg["name"] in unify_op_type_mapping_ipex: - quantizable_ops.append((op_cfg["id"], unify_op_type_mapping_ipex[op_cfg["name"]])) + ( + ops_name, + op_infos_from_cfgs, + input_tensor_id_op_name, + output_tensor_id_op_name, + ) = paser_cfgs(cfgs) + quantizable_op_names = get_quantizable_ops_from_cfgs(ops_name, op_infos_from_cfgs, input_tensor_id_op_name) + for name in quantizable_op_names: + # name : list + if len(name) == 1: + module_key = name[0][0] + op_cfg_id = name[0][2] + ipex_op_type = cfgs[module_key]["q_op_infos"][op_cfg_id]["op_type"] + module_fqn = cfgs[module_key]["q_op_infos"][op_cfg_id].get("fqn", None) + + if ipex_op_type in unify_op_type_mapping_ipex: + quantizable_ops.append((tuple(name), unify_op_type_mapping_ipex[ipex_op_type])) + map_op_name_to_fqn[(tuple(name), ipex_op_type)] = module_fqn 
else: re_flag = False for pattern, unify_op_type in unify_op_type_mapping_ipex["re"].items(): - if re.match(pattern, op_cfg["name"]): + if re.match(pattern, ipex_op_type): re_flag = True - quantizable_ops.append((op_cfg["id"], unify_op_type)) + quantizable_ops.append((tuple(name), unify_op_type)) + map_op_name_to_fqn[(tuple(name), unify_op_type)] = module_fqn break if not re_flag: - quantizable_ops.append((op_cfg["id"], op_cfg["name"])) - - else: - ( - ops_name, - op_infos_from_cfgs, - input_tensor_id_op_name, - output_tensor_id_op_name, - ) = paser_cfgs(cfgs) - quantizable_op_names = get_quantizable_ops_from_cfgs(ops_name, op_infos_from_cfgs, input_tensor_id_op_name) - for name in quantizable_op_names: - # name : list - if len(name) == 1: - module_key = name[0][0] - op_cfg_id = name[0][2] - ipex_op_type = cfgs[module_key]["q_op_infos"][op_cfg_id]["op_type"] - module_fqn = cfgs[module_key]["q_op_infos"][op_cfg_id].get("fqn", None) - - if ipex_op_type in unify_op_type_mapping_ipex: - quantizable_ops.append((tuple(name), unify_op_type_mapping_ipex[ipex_op_type])) + quantizable_ops.append((tuple(name), ipex_op_type)) map_op_name_to_fqn[(tuple(name), ipex_op_type)] = module_fqn - else: - re_flag = False - for pattern, unify_op_type in unify_op_type_mapping_ipex["re"].items(): - if re.match(pattern, ipex_op_type): - re_flag = True - quantizable_ops.append((tuple(name), unify_op_type)) - map_op_name_to_fqn[(tuple(name), unify_op_type)] = module_fqn - break - if not re_flag: - quantizable_ops.append((tuple(name), ipex_op_type)) - map_op_name_to_fqn[(tuple(name), ipex_op_type)] = module_fqn - else: - op_type = "" - for op_name in name: - module_key = op_name[0] - op_cfg_id = op_name[2] - single_op_type = cfgs[module_key]["q_op_infos"][op_cfg_id]["op_type"] - if single_op_type in unify_op_type_mapping_ipex: - single_op_type = unify_op_type_mapping_ipex[single_op_type] - op_type += "&" + single_op_type if op_type else single_op_type - quantizable_ops.append((tuple(name), op_type)) - _module_key = name[0][0] - _op_cfg_id = name[0][2] - module_fqn = cfgs[_module_key]["q_op_infos"][_op_cfg_id]["fqn"] - map_op_name_to_fqn[(tuple(name), op_type)] = module_fqn + else: + op_type = "" + for op_name in name: + module_key = op_name[0] + op_cfg_id = op_name[2] + single_op_type = cfgs[module_key]["q_op_infos"][op_cfg_id]["op_type"] + if single_op_type in unify_op_type_mapping_ipex: + single_op_type = unify_op_type_mapping_ipex[single_op_type] + op_type += "&" + single_op_type if op_type else single_op_type + quantizable_ops.append((tuple(name), op_type)) + _module_key = name[0][0] + _op_cfg_id = name[0][2] + module_fqn = cfgs[_module_key]["q_op_infos"][_op_cfg_id]["fqn"] + map_op_name_to_fqn[(tuple(name), op_type)] = module_fqn logger.debug("Map op name to fqn: ") logger.debug(map_op_name_to_fqn) @@ -387,7 +309,7 @@ def get_quantizable_ops_recursively(model, example_inputs): # pragma: no cover logger.info(attention_block) logger.info("FFN Blocks : ") logger.info(ffn_blocks) - return quantizable_ops, cfgs, default_cfgs, fuse_ops, op_infos_from_cfgs, output_tensor_id_op_name + return quantizable_ops, cfgs, op_infos_from_cfgs, output_tensor_id_op_name def simple_inference(q_model, example_inputs, iterations=1): @@ -454,42 +376,6 @@ def dump_model_op_stats(tune_cfg): ).print_stat() -def get_fuse_ops(default_cfgs): # pragma: no cover - elt_wise = ["relu", "sigmoid", "gelu"] - inplace_ops = ["relu_", "add_"] - op_patterns = [] - num_ops = len(default_cfgs) - for cur_id in range(num_ops): - cur_op = 
default_cfgs[cur_id]["name"] - if cur_op == "dropout": - continue - inputs = default_cfgs[cur_id]["inputs_flow"] - num_input = len(inputs) - pre_ops = {} - for i_num in range(num_input): - inp = inputs[i_num] - for pre_id in range(cur_id): - pre_op = default_cfgs[pre_id]["name"] - pre_out = default_cfgs[pre_id]["outputs_flow"] - num_out = len(pre_out) - for o_num in range(num_out): - if pre_out[o_num] == inp: - if cur_op in inplace_ops and (pre_op in ["conv2d", "conv3d", "linear"]): - op_patterns.append([(pre_id, pre_op), (cur_id, cur_op)]) - if cur_op in elt_wise and (pre_op in ["conv2d", "conv3d", "linear", "add"]): - op_patterns.append([(pre_id, pre_op), (cur_id, cur_op)]) - if cur_op == "add": - pre_ops[i_num] = [pre_id, pre_op] - if len(pre_ops) > 0: - for key, value in pre_ops.items(): - if ( - value[1] in ["conv2d", "conv3d", "linear"] - and default_cfgs[cur_id]["inputs_quantized"][key] is False - ): - op_patterns.append([(value[0], value[1]), (cur_id, cur_op)]) - return op_patterns - - def get_depth(d) -> int: """Query the depth of the dict.""" if isinstance(d, dict): @@ -636,16 +522,6 @@ def get_quantizable_ops_from_cfgs(ops_name, op_infos_from_cfgs, input_tensor_ids return quantizable_ops -def get_pattern(fallback_op, fuse_ops): # pragma: no cover - for fuse_pattern in fuse_ops: - if fuse_pattern[0] == fallback_op: - if fuse_pattern[1] in ["relu_", "add_"]: - return None - else: - return fuse_pattern[1] - return None - - class Statistics: # pragma: no cover """The statistics printer.""" diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 38ecdd46b8d..29a8177e9be 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -818,7 +818,7 @@ def register_supported_configs(cls) -> List[OperatorConfig]: def get_model_info(model: torch.nn.Module, example_inputs) -> List[Tuple[str, Callable]]: from neural_compressor.torch.algorithms.static_quant import get_quantizable_ops_recursively - model_info, _, _, _, _, _ = get_quantizable_ops_recursively(model, example_inputs=example_inputs) + model_info, _, _, _ = get_quantizable_ops_recursively(model, example_inputs=example_inputs) return model_info @classmethod
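
For reference, here is a minimal sketch (not part of the patch itself) of the IPEX static-quantization flow that the reworked static_quantize and its helpers rely on once the IPEX < 1.12 branch is dropped. The toy model, calibration loop, and JSON path below are illustrative assumptions; only the IPEX calls (default_static_qconfig_mapping, prepare, save_qconf_summary / load_qconf_summary, convert) mirror what the patched code uses.

import intel_extension_for_pytorch as ipex
import torch


class ToyModel(torch.nn.Module):  # hypothetical stand-in for the user model
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(8, 8)

    def forward(self, x):
        return self.fc(x)


model = ToyModel().eval()
example_inputs = torch.randn(2, 8)
qconf_summary = "./ipex_config.json"  # stands in for ipex_config_path

# Prepare with the default static qconfig mapping (the IPEX >= 2.1 path in the patch).
static_qconfig = ipex.quantization.default_static_qconfig_mapping
prepared = ipex.quantization.prepare(
    model, static_qconfig, example_inputs=example_inputs, inplace=False
)

# get_quantizable_ops_recursively() runs one inference pass and dumps the summary;
# cfg_to_qconfig() then rewrites that JSON with the per-op choices taken from tune_cfg.
prepared(example_inputs)
prepared.save_qconf_summary(qconf_summary=qconf_summary)

# static_quantize() reloads the (edited) summary, calibrates via run_fn, saves again,
# and converts; the patch additionally jit-traces and freezes in _ipex_post_quant_process.
prepared.load_qconf_summary(qconf_summary=qconf_summary)
for _ in range(2):  # calibration loop, i.e. what run_fn does
    prepared(example_inputs)
prepared.save_qconf_summary(qconf_summary=qconf_summary)

quantized = ipex.quantization.convert(prepared)
with torch.no_grad():
    _ = quantized(example_inputs)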