change use_optimum_format=True and add bias #1431

Merged
merged 9 commits on Dec 6, 2023
15 changes: 8 additions & 7 deletions docs/source/quantization_weight_only.md
@@ -93,18 +93,19 @@ To support low memory inference, Neural Compressor implemented WeightOnlyLinear,
**Export arguments**
| export args | default value | comments |
|:----------:|:-------------:|:-------------------------------------------------------------------:|
| qweight_config_path | None | If you need to export the model with fp32_model and a JSON file, set the path of qconfig.json |
| use_optimum_format | True | Whether to use the popular format used in [Optimum](https://github.com/huggingface/optimum/blob/e0927976d06d163ed09fe5bd80d013e1cfa0c463/docs/source/llm_quantization/usage_guides/quantization.mdx#L5) |
| sym_full_range | False | Whether to leverage the full compression range under symmetric quantization |
| compression_dtype | torch.int32 | Data type for compressed dtype, select from [torch.int8\|16\|32\|64] |
| compression_dim | 1 | 0 means output channel while 1 means input channel |
| scale_dtype | torch.float32 | Data type for scale and bias |
| use_hf_format | False | Whether to use the popular format present on HuggingFace hub |
| compression_dtype | torch.int32 | Data type of the compressed weight; select from [torch.int8\|16\|32\|64]. It's torch.int32 when use_optimum_format=True |
| compression_dim | 1 | 0 means output channel while 1 means input channel. It's 1 for weight and 0 for zero-point when use_optimum_format=True |
| scale_dtype | torch.float32 | Data type for scale and bias. It's torch.float16 when use_optimum_format=True |
| qweight_config_path | None | Set the path of qconfig.json if you want to export the model with a JSON file |
| gptq_config_path | None | If you need to export a GPTQ-quantized model with fp32_model and a JSON file, set the path of gptq_config.json |

**Note:** HuggingFace format is quite special; the main differences are as follows:
**Note:** The format used in Optimum is compatible with transformers, which makes it easy to use. However, this format is rather special; the main differences are as follows (a brief sketch illustrating points 1 and 2 appears after the list):

> 1: Compression Dimension: weight = 1, zero = 0, and both are transposed.
> 2: Zero Point: zero_point -= 1 before compression. zero_point is always required, even for symmetric quantization.
> 3: Group Index: Use the same number for a group instead of recording channel order.
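
For illustration, here is a minimal sketch of points 1 and 2 for a 4-bit symmetric case. The helper below is hypothetical and only demonstrates the convention; it is not part of Neural Compressor's API, and it uses a single group per output channel for simplicity.

```python
import torch

def to_optimum_zero_point(int_weight, zp, bits=4):
    """Hypothetical helper mirroring points 1 and 2 above."""
    if zp is None:
        # Symmetric scheme: shift signed weights into the unsigned range and
        # synthesize a mid-range zero point, since zero_point is always required.
        int_weight = int_weight.to(torch.int32) + 2 ** (bits - 1)
        zp = torch.full((int_weight.shape[0], 1), 2 ** (bits - 1), dtype=torch.int32)
    zp = zp - 1              # zero_point -= 1 before compression
    return int_weight, zp.T  # zero point is compressed along dim 0, i.e. transposed

weight = torch.randint(-8, 8, (4, 32))  # int4-range weight, shape [out_features, in_features]
w_unsigned, zp_t = to_optimum_zero_point(weight, None)
print(int(w_unsigned.min()), int(w_unsigned.max()), tuple(zp_t.shape))  # values lie in [0, 15]; zp has shape (1, 4)
```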


### **User Code Example**
4 changes: 3 additions & 1 deletion neural_compressor/adaptor/pytorch.py
@@ -4573,10 +4573,12 @@ def rtn_quantize(self, model, tune_cfg):
enable_full_range = self.recipes["rtn_args"].get("enable_full_range", False)
enable_mse_search = self.recipes["rtn_args"].get("enable_mse_search", False)
group_dim = self.recipes["rtn_args"].get("group_dim", 1)
return_int = self.recipes["rtn_args"].get("return_int", False)
else: # pragma: no cover
enable_full_range = False
enable_mse_search = False
group_dim = 1
return_int = False
from .torch_utils.util import fetch_module, set_module
from .torch_utils.weight_only import rtn_quantize

@@ -4614,7 +4616,7 @@ def rtn_quantize(self, model, tune_cfg):
num_bits,
group_size,
scheme,
return_int=False,
return_int=return_int,
data_type=dtype,
enable_full_range=enable_full_range,
enable_mse_search=enable_mse_search,
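
For context, the `rtn_args` recipe keys read above (including the newly honored `return_int`) would typically be supplied through the quantization config. A hedged sketch, assuming the 2.x `PostTrainingQuantConfig` entry point; the recipe key names are taken from this diff, and the toy model is illustrative:

```python
import torch
from neural_compressor import PostTrainingQuantConfig, quantization

model = torch.nn.Sequential(torch.nn.Linear(64, 64))  # toy float model

conf = PostTrainingQuantConfig(
    approach="weight_only",
    recipes={
        "rtn_args": {
            "enable_full_range": False,
            "enable_mse_search": False,
            "group_dim": 1,
            "return_int": False,  # new recipe key consumed by the adaptor code above
        }
    },
)
q_model = quantization.fit(model, conf)  # RTN is data-free, so no calibration dataloader is passed
```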
80 changes: 41 additions & 39 deletions neural_compressor/adaptor/torch_utils/model_wrapper.py
@@ -217,10 +217,10 @@ def __init__(
compression_dim=1,
g_idx=False,
device="cpu",
use_hf_format=False,
use_optimum_format=True,
):
super().__init__()
self.use_hf_format = use_hf_format
self.use_optimum_format = use_optimum_format
self.dtype = dtype
if "int" not in self.dtype: # for nf4, fp4
from neural_compressor.adaptor.torch_utils.weight_only import FLOAT_MAPPING, INT_MAPPING
@@ -245,13 +245,13 @@ def __init__(
dtype_bits_mapping = {torch.int8: 8, torch.int16: 16, torch.int32: 32, torch.int64: 64}
self.compress_bits = dtype_bits_mapping[compression_dtype]
self.n_pack = self.compress_bits // self.bits
self.compressed_dtype = compression_dtype
self.float_type = scale_dtype
# K is input channel, N is output channel
assert compression_dim in [0, 1], (
"Only support 0 or 1 as compression dimension, " + "0 is output channel, 1 is input channel."
)
if self.use_hf_format:
if self.use_optimum_format:
self.float_type = torch.float16
self.compressed_dtype = torch.int32
self.register_buffer(
"scales",
torch.zeros(
@@ -276,7 +276,10 @@ def __init__(
).to(device),
)
self.qzeros = self.qzeros.T
self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device))
else:
self.compressed_dtype = compression_dtype
self.float_type = scale_dtype
self.register_buffer(
"scales",
torch.zeros(
@@ -316,18 +319,18 @@ def __init__(
dtype=self.compressed_dtype,
).to(device),
)
if bias:
self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device))
else:
self.bias = None
if g_idx:
self.register_buffer("g_idx", torch.zeros(in_features, dtype=torch.int32).to(device))
else:
self.g_idx = None
if bias:
self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device))
else:
self.bias = None

def pack(self, int_weight, scale, zp, bias, g_idx=None):
int_weight = int_weight.to(self.device)
if self.use_hf_format and zp is None:
if self.use_optimum_format and zp is None:
# to avoid overflow
int_weight = int_weight.type(torch.int32)
shift_bias = 2 ** (self.bits - 1)
@@ -339,13 +342,13 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
if g_idx is not None:
assert hasattr(self, "g_idx"), "g_idx is not set when initializing."
self.g_idx = g_idx.type(torch.int32).to(self.device)
if self.use_hf_format:
if self.use_optimum_format:
invperm = torch.argsort(self.g_idx)
self.g_idx = invperm // self.groupsize
self.g_idx = self.g_idx.type(torch.int32).to(self.device)
assert scale.shape == self.scales.shape, "Scale shape is mismatched."
self.scales = scale.type(self.float_type).to(self.device)
if not self.use_hf_format and self.compression_dim == 0:
if not self.use_optimum_format and self.compression_dim == 0:
int_weight = int_weight.T
self.qweight = self.qweight.T
origin_shape = int_weight.shape
@@ -362,14 +365,14 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
tmp[:, e] &= mask
tmp[:, e] = tmp[:, e] << (self.bits * e)
self.qweight[:, j] |= tmp[:, e]
if not self.use_hf_format and self.compression_dim == 0:
if not self.use_optimum_format and self.compression_dim == 0:
self.qweight = self.qweight.T

if zp is not None:
zp = zp.to(self.device)
if self.use_hf_format:
if self.use_optimum_format:
zp -= 1
if self.use_hf_format or self.compression_dim == 0:
if self.use_optimum_format or self.compression_dim == 0:
zp = zp.T
self.qzeros = self.qzeros.T
assert hasattr(self, "qzeros"), "zp is not set when initializing."
@@ -382,23 +385,19 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
tmp[:, e] &= mask
tmp[:, e] = tmp[:, e] << (self.bits * e)
self.qzeros[:, j] |= tmp[:, e]
if self.use_hf_format or self.compression_dim == 0:
if self.use_optimum_format or self.compression_dim == 0:
self.qzeros = self.qzeros.T
if self.use_hf_format:
if self.use_optimum_format:
self.scales = self.scales.T
self.qweight = self.qweight.T
self.g_idx = self.g_idx
self.qzeros = self.qzeros.T

def recover(self):
logger.debug(f"Recovering {self} weight")
if self.use_hf_format:
# Prevent broken id links of self.scales and self.scales
self.scales = self.scales.T
self.qweight = self.qweight.T
self.g_idx = self.g_idx
self.qzeros = self.qzeros.T
device = self.scales.device
scales = self.scales.T if self.use_optimum_format else self.scales
qweight = self.qweight.T if self.use_optimum_format else self.qweight

device = scales.device
fp32_weight = torch.zeros(self.out_features, self.in_features, dtype=self.float_type).to(device)
if self.g_idx is None:
# used for recovering fp32_weight
@@ -410,8 +409,7 @@ def recover(self):
weight_dtype = torch.int8
# unpack weight
weight = torch.zeros(self.out_features, self.in_features, dtype=weight_dtype).to(device)
qweight = self.qweight
if not self.use_hf_format and self.compression_dim == 0:
if not self.use_optimum_format and self.compression_dim == 0:
weight = weight.T
qweight = qweight.T
origin_shape = weight.shape
@@ -427,7 +425,7 @@ def recover(self):
if weight_dtype == torch.uint8:
tmp &= mask # remove sign bit
weight[:, index] = tmp.type(weight_dtype)
if not self.use_hf_format and self.compression_dim == 0:
if not self.use_optimum_format and self.compression_dim == 0:
weight = weight.T
if "int" not in self.dtype:
new_weight = torch.zeros(self.out_features, self.in_features).to(device)
@@ -437,9 +435,9 @@ def recover(self):
# unpack zero_point
if hasattr(self, "qzeros"):
zp_dtype = self.compressed_dtype # to avoid overflow when weight-zp
zp = torch.zeros(self.scales.shape, dtype=zp_dtype).to(device)
qzeros = self.qzeros
if self.use_hf_format or self.compression_dim == 0:
zp = torch.zeros(scales.shape, dtype=zp_dtype).to(device)
qzeros = self.qzeros.T if self.use_optimum_format else self.qzeros
if self.use_optimum_format or self.compression_dim == 0:
zp = zp.T
qzeros = qzeros.T
origin_shape = zp.shape
@@ -454,30 +452,34 @@ def recover(self):
tmp = tmp >> self.compress_bits - self.bits
tmp &= mask
zp[:, index] = tmp.type(zp_dtype)
if self.use_hf_format or self.compression_dim == 0:
if self.use_optimum_format or self.compression_dim == 0:
zp = zp.T
if self.use_hf_format:
if self.use_optimum_format:
# zp -= 1 may cause zp == -1, after recover it becomes 2**self.bits - 1
zp += 1
zp = torch.where(zp > (2**self.bits - 1), 0, zp)
# recover fp32 weight with int_weight, scale, and zero_point
for idx in range(self.in_features):
fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.g_idx[idx]]) * self.scales[:, self.g_idx[idx]]
fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.g_idx[idx]]) * scales[:, self.g_idx[idx]]
else:
# recover fp32 weight with int_weight, scale
for idx in range(self.in_features):
fp32_weight[:, idx] = weight[:, idx] * self.scales[:, self.g_idx[idx]]
fp32_weight[:, idx] = weight[:, idx] * scales[:, self.g_idx[idx]]
return fp32_weight

def forward(self, input):
weight = self.recover()
device = self.scales.device
if weight.dtype == torch.float16 and device.type == "cpu":
weight = weight.float()
self.bias = self.bias.float() if self.bias is not None else None
if level == DEBUG:
if not hasattr(self, "weight"):
self.weight = self.recover()
self.weight = weight
input = input.type(self.weight.dtype)
logger.debug(f"Calculating {self}")
return F.linear(input, self.weight, self.bias)
else:
weight = self.recover()
input = input.type(weight.dtype)
return F.linear(input, weight, self.bias)

Expand All @@ -489,8 +491,8 @@ def extra_repr(self) -> str:
self.groupsize,
self.bias is not None,
)
if self.use_hf_format:
tmp_str += ", use_hf_format=True"
if self.use_optimum_format:
tmp_str += ", use_optimum_format=True"
return tmp_str


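
As a rough illustration of the scheme implemented by `pack()` and `recover()` above: `n_pack = compress_bits // bits` values are masked and OR-ed into each compressed word, and unpacking shifts each slot back down before masking off the sign extension. A minimal standalone sketch (4-bit values packed into `torch.int32`), independent of the class itself:

```python
import torch

bits, compress_bits = 4, 32
n_pack = compress_bits // bits          # 8 values per int32 word
mask = torch.tensor(2**bits - 1, dtype=torch.int32)

values = torch.randint(0, 2**bits, (16,), dtype=torch.int32)  # unsigned 4-bit values

# pack: OR each value into its slot, mirroring the shift-and-or loop in pack()
packed = torch.zeros(values.numel() // n_pack, dtype=torch.int32)
for j in range(packed.numel()):
    for e in range(n_pack):
        packed[j] |= (values[j * n_pack + e] & mask) << (bits * e)

# recover: move each slot to the top, shift it back down, and mask, as in recover()
unpacked = torch.zeros_like(values)
for j in range(packed.numel()):
    for e in range(n_pack):
        tmp = packed[j] << (compress_bits - bits * (e + 1))
        unpacked[j * n_pack + e] = (tmp >> (compress_bits - bits)) & mask

assert torch.equal(values, unpacked)
```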
4 changes: 2 additions & 2 deletions neural_compressor/adaptor/torch_utils/weight_only.py
@@ -396,7 +396,7 @@ def rtn_quantize(
compression_dim = kwargs.get("compression_dim", 1)
scale_dtype = kwargs.get("scale_dtype", torch.float32)
device = kwargs.get("device", "cpu")
use_hf_format = kwargs.get("use_hf_format", False)
use_optimum_format = kwargs.get("use_optimum_format", True)
for name, m in model.named_modules():
if m.__class__.__name__ not in supported_layers:
continue
@@ -452,7 +452,7 @@ def rtn_quantize(
compression_dim=compression_dim,
scale_dtype=scale_dtype,
device=device,
use_hf_format=use_hf_format,
use_optimum_format=use_optimum_format,
)
new_module.pack(int_weight, scale, zp, m.bias)
if name == "":
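
A hedged usage sketch of this internal helper with the new default. The positional arguments mirror the call site in `pytorch.py` above; the toy model and any other defaults are assumptions:

```python
import torch
from neural_compressor.adaptor.torch_utils.weight_only import rtn_quantize

model = torch.nn.Sequential(torch.nn.Linear(64, 64))  # toy float model

# return_int=True swaps each Linear for a packed WeightOnlyLinear; use_optimum_format
# now defaults to True, so the packed layout follows the Optimum convention.
q_model = rtn_quantize(
    model,
    4,       # num_bits
    32,      # group_size
    "sym",   # scheme
    return_int=True,
    use_optimum_format=True,  # read via kwargs.get("use_optimum_format", True) above
)
print(q_model)
```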
10 changes: 5 additions & 5 deletions neural_compressor/model/torch_model.py
@@ -459,7 +459,7 @@ def export_compressed_model(
scale_dtype=torch.float32,
gptq_config_path=None,
device="cpu",
use_hf_format=False,
use_optimum_format=True,
):
"""Convert Linear to WeightOnlyLinear for low memory inference.

@@ -475,7 +475,7 @@ def export_compressed_model(
Defaults to torch.float32.
gptq_config_path (str, optional): Path of gptq_config.json. Defaults to None.
device (str, optional): choose device for compression. Defaults to cpu.
use_hf_format (bool, optional): use the popular huggingface compression format.
use_optimum_format (bool, optional): use the popular format used in Optimum (accepted on the HuggingFace hub).
1: compression_dim: weight = 1, zeros = 0, and both are transposed.
2: zeros -= 1 before compression, matching the Optimum convention.
3: g_idx: use the same number for one group instead of recording the channel order.
@@ -520,7 +520,7 @@ def export_compressed_model(
compression_dim=compression_dim,
scale_dtype=scale_dtype,
device=device,
use_hf_format=use_hf_format,
use_optimum_format=use_optimum_format,
)
set_module(self.model, k, new_module)
continue
@@ -551,7 +551,7 @@ def export_compressed_model(
compression_dim=compression_dim,
scale_dtype=scale_dtype,
device=device,
use_hf_format=use_hf_format,
use_optimum_format=use_optimum_format,
)
new_module.pack(int_weight, gptq_scale, gptq_zp, m.bias, gptq_perm)
set_module(self.model, k, new_module)
@@ -578,7 +578,7 @@ def export_compressed_model(
compression_dim=compression_dim,
scale_dtype=scale_dtype,
device=device,
use_hf_format=use_hf_format,
use_optimum_format=use_optimum_format,
)
set_module(self.model, k, mod)
return self.model
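
Finally, a hedged end-to-end sketch of `export_compressed_model` with the new default: after a weight-only quantization pass, the exported module packs weights in the Optimum layout unless `use_optimum_format=False` is passed. The toy model, default RTN settings, and file name are illustrative assumptions:

```python
import torch
from neural_compressor import PostTrainingQuantConfig, quantization

model = torch.nn.Sequential(torch.nn.Linear(64, 64))   # toy float model
conf = PostTrainingQuantConfig(approach="weight_only")  # default RTN settings assumed

q_model = quantization.fit(model, conf)
compressed = q_model.export_compressed_model(use_optimum_format=True)  # True is now the default
torch.save(compressed.state_dict(), "compressed_toy_model.pt")
```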