Integrate Auto-Round to INC 3.x (#1647)
Signed-off-by: Kaihui-intel <[email protected]>
Signed-off-by: chensuyue <[email protected]>
1 parent d8e60b8, commit 2694bbf
Showing 12 changed files with 552 additions and 4 deletions.
233 changes: 233 additions & 0 deletions
neural_compressor/torch/algorithms/weight_only/autoround.py
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from auto_round import AutoRound  # pylint: disable=E0401
from auto_round.calib_dataset import CALIB_DATASETS  # pylint: disable=E0401

from neural_compressor.torch.utils import logger


@torch.no_grad()
def get_autoround_default_run_fn(
    model,
    tokenizer,
    dataset_name="NeelNanda/pile-10k",
    n_samples=512,
    seqlen=2048,
    seed=42,
    bs=8,
    dataset_split: str = "train",
    dataloader=None,
):
    """Perform calibration for quantization.

    This method calibrates the model for quantization by feeding it a specified
    number of properly formatted samples from the calibration dataset. If fewer
    samples than requested are processed, it logs a warning. If no samples are
    processed at all, it logs an error and exits.

    Args:
        model: The model to calibrate.
        tokenizer: The tokenizer used to build the default calibration dataloader.
        dataset_name (str): Name of the calibration dataset (default is "NeelNanda/pile-10k").
        n_samples (int): The number of samples to use for calibration (default is 512).
        seqlen (int): Sequence length of the calibration samples (default is 2048).
        seed (int): Random seed for sampling (default is 42).
        bs (int): Batch size for calibration (default is 8).
        dataset_split (str): Dataset split to draw calibration samples from (default is "train").
        dataloader: An optional pre-built calibration dataloader (default is None).
    """
    if dataloader is None:
        get_dataloader = CALIB_DATASETS.get(dataset_name, CALIB_DATASETS["NeelNanda/pile-10k"])
        dataloader = get_dataloader(
            tokenizer,
            seqlen,
            seed=seed,
            bs=bs,
            split=dataset_split,
            dataset_name=dataset_name,
        )
    total_cnt = 0
    for data in dataloader:
        if data is None:
            continue
        if isinstance(data, torch.Tensor):
            data_new = data.to(model.device)
            input_ids = data_new
        else:
            data_new = {}
            for key in data.keys():
                data_new[key] = data[key].to(model.device)
            input_ids = data_new["input_ids"]
        # if input_ids.shape[-1] < seqlen:
        #     continue
        # Trim the last batch so that no more than n_samples samples are counted.
        if total_cnt + input_ids.shape[0] > n_samples:
            input_ids = input_ids[: n_samples - total_cnt, ...]
        try:
            if isinstance(data_new, torch.Tensor):
                model(data_new)
            elif isinstance(data_new, dict):
                model(**data_new)
            else:
                # Handle cases where data_new is neither a Tensor nor a dict
                raise NotImplementedError(f"Handling not implemented for data type {type(data)}")
        except Exception as error:
            logger.error(error)
        total_cnt += input_ids.shape[0]
        if total_cnt >= n_samples:
            break
    if total_cnt == 0:
        logger.error(
            "No data has been cached. Please provide more data with sequence length >= {} in the ".format(seqlen)
            + "dataloader or decrease the sequence length."
        )
        exit()
    elif total_cnt < n_samples:
        logger.warning(
            "An insufficient number of samples may affect the quality of quantization. "
            "Effective sample size: {}, target sample size: {}".format(total_cnt, n_samples)
        )


class InputCaptureModule(torch.nn.Module):
    """A stand-in module that records every input it receives instead of running a real forward pass."""

    def __init__(self) -> None:
        super().__init__()
        self.data_pairs = []
        self.device = "cpu"

    def forward(self, *args, **kwargs):
        if kwargs and len(args) == 0:
            # Handle cases where input data is a dict
            self.data_pairs.append(kwargs)
        elif args and len(args) == 1:
            # Handle cases where input data is a Tensor
            self.data_pairs.append(args[0])
        else:
            logger.error("Input data is neither a single Tensor nor a dict; it cannot be captured.")


def recover_dataloader_from_calib_fn(run_fn, run_args):
    """Run the calibration function on an InputCaptureModule and wrap the captured inputs in a DataLoader."""
    input_capture_model = InputCaptureModule()
    input_capture_model.eval()
    run_fn(input_capture_model, *run_args)
    dataloader = torch.utils.data.DataLoader(input_capture_model.data_pairs)
    return dataloader
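# Illustrative note (sketch, not part of the original commit): this capture-and-replay pattern lets
# AutoRound reuse any user-supplied calibration function. Assuming a user-defined
# `my_run_fn(model, data)` that calls `model(batch)` on each batch in `data`, usage looks like:
#
#     dataloader = recover_dataloader_from_calib_fn(my_run_fn, (my_data,))
#     for batch in dataloader:
#         ...  # replays the inputs that `my_run_fn` fed to the capture model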


def autoround_quantize(
    model,
    weight_config: dict = {},
    enable_full_range: bool = False,  ##for symmetric, TODO support later
    batch_size: int = 8,
    amp: bool = True,
    device=None,
    lr_scheduler=None,
    use_quant_input: bool = True,
    enable_minmax_tuning: bool = True,
    lr: float = None,
    minmax_lr: float = None,
    low_gpu_mem_usage: bool = True,
    iters: int = 200,
    seqlen: int = 2048,
    n_samples: int = 512,
    sampler: str = "rand",
    seed: int = 42,
    n_blocks: int = 1,
    gradient_accumulate_steps: int = 1,
    not_use_best_mse: bool = False,
    dynamic_max_gap: int = -1,
    scale_dtype="fp16",
    run_fn=None,
    run_args=None,
):
    """The entry point of the AutoRound weight-only quantization.

    Args:
        model: The PyTorch model to be quantized.
        weight_config (dict): Configuration for weight quantization (default is an empty dictionary), e.g.
            weight_config={
                'layer1':  # layer_name
                    {
                        'data_type': 'int',
                        'bits': 4,
                        'group_size': 32,
                        'sym': False,
                    },
                ...
            }
            keys:
                data_type (str): The data type to be used (default is "int").
                bits (int): Number of bits for quantization (default is 4).
                group_size (int): Size of the quantization group (default is 128).
                sym (bool): Whether to use symmetric quantization (default is False).
        enable_full_range (bool): Whether to enable full-range quantization (default is False).
        batch_size (int): Batch size for tuning (default is 8).
        amp (bool): Whether to use automatic mixed precision (default is True). Automatically detected and set.
        device: The device to be used for tuning (default is None). Automatically detected and set.
        lr_scheduler: The learning rate scheduler to be used.
        use_quant_input (bool): Whether to use the output of the previously quantized block as the input for
            the current block (default is True).
        enable_minmax_tuning (bool): Whether to enable min-max tuning (default is True).
        lr (float): The learning rate (default is None).
        minmax_lr (float): The learning rate for min-max tuning (default is None).
        low_gpu_mem_usage (bool): Whether to reduce GPU memory usage (default is True).
        iters (int): Number of tuning iterations (default is 200).
        seqlen (int): Length of the calibration sequences (default is 2048).
        n_samples (int): Number of calibration samples (default is 512).
        sampler (str): The sampling method (default is "rand").
        seed (int): The random seed (default is 42).
        n_blocks (int): Number of blocks tuned together (default is 1).
        gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1).
        not_use_best_mse (bool): Whether to skip using the parameters with the best MSE loss seen during
            tuning (default is False).
        dynamic_max_gap (int): The dynamic maximum gap (default is -1).
        scale_dtype (str): The data type of the quantization scale to be used (default is "fp16"); different
            kernels have different choices.
        run_fn: A calibration function for calibrating the model. Defaults to None.
        run_args: Positional arguments for `run_fn`. Defaults to None.

    Returns:
        The quantized (QDQ) model and the resolved per-layer weight configuration.
    """
    if run_fn is None or run_fn == get_autoround_default_run_fn:
        assert run_args is not None, "Please provide a tokenizer for the AutoRound default calibration."
        run_fn = get_autoround_default_run_fn
    # Replay the calibration function through an InputCaptureModule to rebuild a dataloader for AutoRound.
    dataloader = recover_dataloader_from_calib_fn(run_fn, run_args)

    rounder = AutoRound(
        model=model,
        tokenizer=None,
        bits=4,
        group_size=128,
        sym=False,
        weight_config=weight_config,
        enable_full_range=enable_full_range,  ##for symmetric, TODO support later
        batch_size=batch_size,
        amp=amp,
        device=device,
        lr_scheduler=lr_scheduler,
        dataloader=dataloader,
        use_quant_input=use_quant_input,
        enable_minmax_tuning=enable_minmax_tuning,
        lr=lr,
        minmax_lr=minmax_lr,
        low_gpu_mem_usage=low_gpu_mem_usage,
        iters=iters,
        seqlen=seqlen,
        n_samples=n_samples,
        sampler=sampler,
        seed=seed,
        n_blocks=n_blocks,
        gradient_accumulate_steps=gradient_accumulate_steps,
        not_use_best_mse=not_use_best_mse,
        dynamic_max_gap=dynamic_max_gap,
        data_type="int",
        scale_dtype=scale_dtype,
        run_fn=run_fn,
        run_args=run_args,
    )
    qdq_model, weight_config = rounder.quantize()
    return qdq_model, weight_config
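Below is a minimal usage sketch of the new entry point. It is illustrative rather than part of the commit: it assumes a small Hugging Face causal LM, and the model name and the layer name in weight_config are placeholders.

import transformers

from neural_compressor.torch.algorithms.weight_only.autoround import (
    autoround_quantize,
    get_autoround_default_run_fn,
)

model_name = "facebook/opt-125m"  # placeholder model
model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# Optional per-layer override; layers not listed here use the 4-bit, group_size=128
# defaults that autoround_quantize passes to AutoRound.
weight_config = {
    "model.decoder.layers.0.self_attn.q_proj": {  # placeholder layer name
        "data_type": "int",
        "bits": 4,
        "group_size": 32,
        "sym": False,
    },
}

# run_args holds the positional arguments of get_autoround_default_run_fn after `model`,
# so passing only the tokenizer keeps the NeelNanda/pile-10k calibration defaults.
qdq_model, layer_config = autoround_quantize(
    model,
    weight_config=weight_config,
    iters=200,
    seqlen=2048,
    n_samples=512,
    run_fn=get_autoround_default_run_fn,
    run_args=(tokenizer,),
)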