Enable the tuning of WOQ algorithm #1328

Merged 22 commits on Oct 20, 2023
Commits
cb696fb
support WOQ algo tuning
Kaihui-intel Oct 17, 2023
1bb5d85
add WoqTuningParams docstring
Kaihui-intel Oct 17, 2023
84f4bc4
Merge branch 'kaihui/wo_tuning' of https://github.com/intel/neural-co…
Kaihui-intel Oct 17, 2023
9afdc8c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2023
25df777
Merge branch 'kaihui/wo_tuning' of https://github.com/intel/neural-co…
Kaihui-intel Oct 17, 2023
e5119ba
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2023
d0b1d4a
fix docstring
Kaihui-intel Oct 17, 2023
5f8cda8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2023
073f2cb
support woq to auto & add ut
Kaihui-intel Oct 18, 2023
f9b7d43
Merge branch 'kaihui/wo_tuning' of https://github.com/intel/neural-co…
Kaihui-intel Oct 18, 2023
223f9eb
add docstring for util
Kaihui-intel Oct 18, 2023
1bc7c62
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 18, 2023
1b8ac4a
Merge branch 'kaihui/wo_tuning' of https://github.com/intel/neural-co…
Kaihui-intel Oct 18, 2023
0aa5321
support woq tuning for onnxrt
yuwenzho Oct 19, 2023
ce2dac0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 19, 2023
d87ada6
update UT of onnxrt woq tuning
yuwenzho Oct 19, 2023
b411551
Merge branch 'master' into kaihui/wo_tuning
yuwenzho Oct 19, 2023
9e5537f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 19, 2023
1369c7f
update dataloader check for onnxrt woq
yuwenzho Oct 19, 2023
e95fb8b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 19, 2023
8ba8272
Merge branch 'master' into kaihui/wo_tuning
yuwenzho Oct 20, 2023
b409d1c
remove recipes from woq docstring
Kaihui-intel Oct 20, 2023
8 changes: 6 additions & 2 deletions neural_compressor/adaptor/ox_utils/weight_only.py
@@ -244,7 +244,7 @@ def rtn_quantize(

dtype = weight.dtype

if node.name in weight_config:
if node.name in weight_config and "group_size" in weight_config[node.name]:
num_bits = weight_config[node.name]["bits"]
group_size = weight_config[node.name]["group_size"]
scheme = weight_config[node.name]["scheme"]
@@ -328,7 +328,11 @@ def apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits,
weight = []
org_out = []
for node in nodes:
if node.name in weight_config and weight_config.get(node.name, "fp32") != "fp32":
if (
node.name in weight_config
and weight_config.get(node.name, "fp32") != "fp32"
and "group_size" in weight_config[node.name]
):
num_bits = weight_config[node.name]["bits"]
group_size = weight_config[node.name]["group_size"]
scheme = weight_config[node.name]["scheme"]
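For reference, the new "group_size" checks guard against per-node entries that carry only a precision string rather than full WOQ settings. A minimal sketch of the weight_config shape these branches expect (node names are hypothetical):

# Hypothetical weight_config as consumed by rtn_quantize / apply_awq_scale above:
# a node entry is either the string "fp32" (keep the op in fp32) or a dict with
# the full WOQ settings; entries without "group_size" are now skipped.
weight_config = {
    "/layers.0/attn/MatMul": {"bits": 4, "group_size": 32, "scheme": "sym"},  # quantized
    "/lm_head/MatMul": "fp32",  # kept in fp32, no group_size present
}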
6 changes: 6 additions & 0 deletions neural_compressor/strategy/auto.py
@@ -120,6 +120,12 @@ def next_tune_cfg(self):
op_tuning_cfg["calib_sampling_size"] = calib_sampling_size_lst[0]
if not self.cur_best_tuning_cfg:
self.cur_best_tuning_cfg = deepcopy(op_tuning_cfg)

# try to tune a WeightOnlyQuant algorithm
if self._should_tuning_woq_algo():
for tune_cfg in self.tuning_woq_algo(tuning_space, deepcopy(self.cur_best_tuning_cfg)):
yield tune_cfg

# try to tune sq alpha
if self._should_tuning_sq_alpha(self.config.recipes):
for tune_cfg in self.tuning_sq_alpha(tuning_space, deepcopy(self.cur_best_tuning_cfg), self.config.recipes):
6 changes: 6 additions & 0 deletions neural_compressor/strategy/basic.py
@@ -312,6 +312,12 @@ def next_tune_cfg(self):
stage1_max = 1e9 # TODO set a more appropriate value
if not self.cur_best_tuning_cfg:
self.cur_best_tuning_cfg = deepcopy(initial_op_tuning_cfg)

# try to tune a WeightOnlyQuant algorithm
if self._should_tuning_woq_algo():
for tune_cfg in self.tuning_woq_algo(tuning_space, deepcopy(self.cur_best_tuning_cfg)):
yield tune_cfg

# try to tune sq alpha
if self._should_tuning_sq_alpha(self.config.recipes):
for tune_cfg in self.tuning_sq_alpha(
38 changes: 37 additions & 1 deletion neural_compressor/strategy/strategy.py
@@ -59,7 +59,7 @@
from .utils.tuning_sampler import tuning_sampler_dict
from .utils.tuning_space import TuningSpace
from .utils.tuning_structs import OpTuningConfig
from .utils.utility import build_slave_faker_model, quant_options
from .utils.utility import build_slave_faker_model, check_key_exist, quant_options

STRATEGIES = {}

@@ -1153,6 +1153,41 @@ def tuning_sq_alpha(self, tuning_space, tuning_cfg, recipes):
for tune_cfg in sq_sampler:
yield tune_cfg

def _should_tuning_woq_algo(self):
"""Currently, it's only available for the ORT backend with approach is weight_only.

It will be triggered when
a) quant_level is auto or quant_level is 1 && strategy is basic
b) and the "algorithm" is not set in op_type_dict
c) and woq will only trigger once
"""
return (
"onnx" in self.framework.lower()
and "weight_only" in self.config.approach
and not check_key_exist(self.config.op_type_dict, "algorithm")
and not check_key_exist(self.tuning_history, "woq_tuning_cfg")
)
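As a usage sketch, a configuration that would satisfy these trigger conditions might look as follows (a minimal sketch assuming the standard PostTrainingQuantConfig / quantization.fit entry points; the model path, dataloader, and eval_func are placeholders):

from neural_compressor import PostTrainingQuantConfig, quantization

# ONNX Runtime backend + weight_only approach, with no "algorithm" pinned in
# op_type_dict and no earlier "woq_tuning_cfg" in the tuning history, so
# _should_tuning_woq_algo() evaluates to True and the WOQ algorithms are tuned.
conf = PostTrainingQuantConfig(approach="weight_only")
q_model = quantization.fit(
    "model.onnx",                  # placeholder path to an ONNX model
    conf,
    calib_dataloader=dataloader,   # user-supplied calibration dataloader
    eval_func=eval_func,           # user-supplied accuracy function
)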

def tuning_woq_algo(self, tuning_space, tuning_cfg):
"""Tuning smooth quant's alpha.
yuwenzho marked this conversation as resolved.
Show resolved Hide resolved

Args:
tuning_space: tuning space
tuning_cfg: the initial tuning config

Yields:
tuning config
"""
logger.info("[STRATEGY] Start tuning Weight Only Quant' algo.")
woq_sampler = tuning_sampler_dict.get_class("woq_algorithm")(tuning_space, [], tuning_cfg)
for tune_cfg in woq_sampler:
yield tune_cfg

logger.info(
"[Strategy] The best tuning config with WeightOnlyQuant is" f"{self.cur_best_tuning_cfg['woq_tuning_cfg']}."
)

def initial_dynamic_cfg_based_on_static_cfg(self, op_static_cfg: OpTuningConfig):
"""Init the dynamic tuning config according to the static config.

@@ -1322,6 +1357,7 @@ def _tune_cfg_converter(self, op_tuning_cfg):
# For not tuning recipe, tune cfg use it directly
tune_cfg["recipe_cfgs"].update(self._not_tuning_recipes_values)
tune_cfg["trial_number"] = deepcopy(self.trials_count)
tune_cfg.setdefault("woq_tuning_cfg", op_tuning_cfg.get("woq_tuning_cfg"))
# The sq-related args comes from user config, current best tuning config
# TODO simplify the logic for transforming the arguments
# update the sq-related args from self.cur_best_tuning_cfg
25 changes: 25 additions & 0 deletions neural_compressor/strategy/utils/constant.py
@@ -16,6 +16,8 @@
# limitations under the License.
"""Strategy constant."""

from enum import Enum

PRECISION_LIST = ["bf16", "fp16", "fp32"]
QUANT_MODE_SET = {"static", "dynamic"}
LOWER_BIT_LIST = ["int4"]
@@ -56,3 +58,26 @@
"last_conv_or_matmul_quantization",
"pre_post_process_quantization",
}


class WoqTuningParams(Enum):
"""This enumeration class represents the different tuning parameters for the weight only quant (WOQ) algorithm.

Args:
Enum (Enum): base enumeration class

Attributes:
RTN (int): Represents the RTN algorithm, which is a type of WOQ algorithm.
GPTQ (int): Represents the GPTQ algorithm, which is a type of WOQ algorithm.
GPTQ_DISABLE_LAST_MATMUL (int): Represents the GPTQ algorithm with the last matrix multiplication disabled.
GPTQ_GROUP_SIZE_32 (int): Represents the GPTQ algorithm with a group size of 32.
GPTQ_GROUP_SIZE_128 (int): Represents the GPTQ algorithm with a group size of 128.
AWQ (int): Represents the AWQ algorithm, which is a type of WOQ algorithm.
"""

RTN = 1
GPTQ = 2
GPTQ_DISABLE_LAST_MATMUL = 3
GPTQ_GROUP_SIZE_32 = 4
GPTQ_GROUP_SIZE_128 = 5
AWQ = 6
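Illustratively, each member corresponds to one WOQ trial. A hypothetical summary of the knobs a trial would exercise (the knob names are assumptions; the adaptor-side interpretation is outside this diff):

# Hypothetical mapping; only the group sizes and the disabled last MatMul
# are stated by the enum itself.
WOQ_TRIAL_SETTINGS = {
    WoqTuningParams.RTN: {"algorithm": "RTN"},
    WoqTuningParams.GPTQ: {"algorithm": "GPTQ"},
    WoqTuningParams.GPTQ_DISABLE_LAST_MATMUL: {"algorithm": "GPTQ", "last_matmul": "fp32"},
    WoqTuningParams.GPTQ_GROUP_SIZE_32: {"algorithm": "GPTQ", "group_size": 32},
    WoqTuningParams.GPTQ_GROUP_SIZE_128: {"algorithm": "GPTQ", "group_size": 128},
    WoqTuningParams.AWQ: {"algorithm": "AWQ"},
}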
32 changes: 32 additions & 0 deletions neural_compressor/strategy/utils/tuning_sampler.py
@@ -23,6 +23,7 @@
from typing import Any, Dict, List, Tuple, Union

from ...utils import logger
from ..utils.constant import WoqTuningParams
from .tuning_space import TuningSpace, pattern_to_internal, quant_mode_from_pattern
from .tuning_structs import OpTuningConfig
from .utility import ClassRegister
@@ -609,3 +610,34 @@ def __iter__(self):
recipe_cfgs["smooth_quant_args"] = {"alpha": alpha}
logger.debug(f"[STRATEGY] set smooth quant alpha with: {alpha:.4f}")
yield new_tune_cfg


@tuning_sampler_dict("woq_algorithm")
class WeightOnlyQuantSampler(TuningSampler):
"""Not displayed in API Docs."""

def __init__(
self,
tuning_space: TuningSpace,
tuning_order_lst: List[TuningOrder],
initial_op_tuning_cfg: Dict,
):
"""Init tuning sampler.

Args:
tuning_space: The tuning space.
tuning_order_lst: The traverse orders.
initial_op_tuning_cfg: The initialized tuning config.
"""
super().__init__(tuning_space, tuning_order_lst, initial_op_tuning_cfg)

def __iter__(self):
"""Yield the next tuning config.

Yields:
The next tuning config.
"""
new_tune_cfg = copy.deepcopy(self.initial_op_tuning_cfg)
for algo in WoqTuningParams:
new_tune_cfg["woq_tuning_cfg"] = algo
yield new_tune_cfg
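A brief usage sketch of the new sampler (tuning_space and initial_op_tuning_cfg are assumed to be already built by the strategy):

from neural_compressor.strategy.utils.constant import WoqTuningParams
from neural_compressor.strategy.utils.tuning_sampler import tuning_sampler_dict

# The sampler yields one trial per WoqTuningParams member, reusing a single deep
# copy of the initial config and updating its "woq_tuning_cfg" field each time.
sampler_cls = tuning_sampler_dict.get_class("woq_algorithm")
for tune_cfg in sampler_cls(tuning_space, [], initial_op_tuning_cfg):
    assert isinstance(tune_cfg["woq_tuning_cfg"], WoqTuningParams)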
31 changes: 31 additions & 0 deletions neural_compressor/strategy/utils/utility.py
@@ -126,6 +126,37 @@ def get_adaptor_name(adaptor):
return ""


def check_key_exist(data, key):
"""Recursively checks if a key exists in a dictionary or list.

Args:
data (dict or list): The dictionary or list to search.
key (any): The key to search for.

Returns:
bool: True if the key exists in the data structure, False otherwise.

Examples:
>>> check_key_exist({'a': 1, 'b': {'c': 2}}, 'c')
True
>>> check_key_exist([{'a': 1}, {'b': 2}], 'b')
True
>>> check_key_exist({'a': 1, 'b': [1, 2, 3]}, 'c')
False
"""
if isinstance(data, dict):
if key in data:
return True
for value in data.values():
if check_key_exist(value, key):
return True
elif isinstance(data, list):
for item in data:
if check_key_exist(item, key):
return True
return False


def build_slave_faker_model():
"""Slave does not have a model, so construct a fake model.
