change use_optimum_format=True and add bias #1431

Merged
merged 9 commits on Dec 6, 2023
15 changes: 8 additions & 7 deletions docs/source/quantization_weight_only.md
@@ -93,18 +93,19 @@ To support low memory inference, Neural Compressor implemented WeightOnlyLinear,
**Export arguments**
| export args | default value | comments |
|:----------:|:-------------:|:-------------------------------------------------------------------:|
| qweight_config_path | None | If you need to export the model with fp32_model and a JSON file, set the path of qconfig.json |
| use_optimum_format | True | Whether to use the popular format used in [Optimum](https://github.com/huggingface/optimum/blob/e0927976d06d163ed09fe5bd80d013e1cfa0c463/docs/source/llm_quantization/usage_guides/quantization.mdx#L5) |
| sym_full_range | False | Whether to leverage the full compression range under symmetric quantization |
| compression_dtype | torch.int32 | Data type for compressed dtype, select from [torch.int8\|16\|32\|64] |
| compression_dim | 1 | 0 means output channel while 1 means input channel |
| scale_dtype | torch.float32 | Data type for scale and bias |
| use_hf_format | False | Whether to use the popular format present on HuggingFace hub |
| compression_dtype | torch.int32 | Data type of the compressed weight; select from [torch.int8\|16\|32\|64]. It's torch.int32 when use_optimum_format=True |
| compression_dim | 1 | 0 means output channel while 1 means input channel. It's 1 for weight and 0 for zero-point when use_optimum_format=True |
| scale_dtype | torch.float32 | Data type for scale and bias. It's torch.float16 when use_optimum_format=True |
| qweight_config_path | None | Set the path of qconfig.json if you want to export the model with a JSON file |
| gptq_config_path | None | If you need to export a GPTQ-quantized model with fp32_model and a JSON file, set the path of gptq_config.json |

**Note:** HuggingFace format is quite special; the main differences are as follows:
**Note:** The format used in Optimum is compatible with transformers, which makes it easy to use. However, this format is rather special; the main differences are as follows (a brief sketch illustrating points 1 and 2 appears after the list):

> 1: Compression Dimension: weight = 1, zero = 0, and both are transposed.
> 2: Zero Point: zero_point -= 1 before compression. zero_point is always required, even for symmetric quantization.
> 3: Group Index: Use the same number for a group instead of recording channel order.
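
For illustration, here is a minimal sketch of points 1 and 2 for a 4-bit symmetric case. The helper below is hypothetical and only demonstrates the convention; it is not part of Neural Compressor's API, and it uses a single group per output channel for simplicity.

```python
import torch

def to_optimum_zero_point(int_weight, zp, bits=4):
    """Hypothetical helper mirroring points 1 and 2 above."""
    if zp is None:
        # Symmetric scheme: shift signed weights into the unsigned range and
        # synthesize a mid-range zero point, since zero_point is always required.
        int_weight = int_weight.to(torch.int32) + 2 ** (bits - 1)
        zp = torch.full((int_weight.shape[0], 1), 2 ** (bits - 1), dtype=torch.int32)
    zp = zp - 1              # zero_point -= 1 before compression
    return int_weight, zp.T  # zero point is compressed along dim 0, i.e. transposed

weight = torch.randint(-8, 8, (4, 32))  # int4-range weight, shape [out_features, in_features]
w_unsigned, zp_t = to_optimum_zero_point(weight, None)
print(int(w_unsigned.min()), int(w_unsigned.max()), tuple(zp_t.shape))  # values lie in [0, 15]; zp has shape (1, 4)
```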


### **User Code Example**
4 changes: 3 additions & 1 deletion neural_compressor/adaptor/pytorch.py
@@ -4573,10 +4573,12 @@ def rtn_quantize(self, model, tune_cfg):
enable_full_range = self.recipes["rtn_args"].get("enable_full_range", False)
enable_mse_search = self.recipes["rtn_args"].get("enable_mse_search", False)
group_dim = self.recipes["rtn_args"].get("group_dim", 1)
return_int = self.recipes["rtn_args"].get("return_int", False)
else: # pragma: no cover
enable_full_range = False
enable_mse_search = False
group_dim = 1
return_int = False
from .torch_utils.util import fetch_module, set_module
from .torch_utils.weight_only import rtn_quantize

@@ -4614,7 +4616,7 @@ def rtn_quantize(self, model, tune_cfg):
num_bits,
group_size,
scheme,
return_int=False,
return_int=return_int,
data_type=dtype,
enable_full_range=enable_full_range,
enable_mse_search=enable_mse_search,
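
For context, the `rtn_args` recipe keys read above (including the newly honored `return_int`) would typically be supplied through the quantization config. A hedged sketch, assuming the 2.x `PostTrainingQuantConfig` entry point; the recipe key names are taken from this diff, and the toy model is illustrative:

```python
import torch
from neural_compressor import PostTrainingQuantConfig, quantization

model = torch.nn.Sequential(torch.nn.Linear(64, 64))  # toy float model

conf = PostTrainingQuantConfig(
    approach="weight_only",
    recipes={
        "rtn_args": {
            "enable_full_range": False,
            "enable_mse_search": False,
            "group_dim": 1,
            "return_int": False,  # new recipe key consumed by the adaptor code above
        }
    },
)
q_model = quantization.fit(model, conf)  # RTN is data-free, so no calibration dataloader is passed
```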
80 changes: 41 additions & 39 deletions neural_compressor/adaptor/torch_utils/model_wrapper.py
@@ -217,10 +217,10 @@ def __init__(
compression_dim=1,
g_idx=False,
device="cpu",
use_hf_format=False,
use_optimum_format=True,
):
super().__init__()
self.use_hf_format = use_hf_format
self.use_optimum_format = use_optimum_format
self.dtype = dtype
if "int" not in self.dtype: # for nf4, fp4
from neural_compressor.adaptor.torch_utils.weight_only import FLOAT_MAPPING, INT_MAPPING
@@ -245,13 +245,13 @@ def __init__(
dtype_bits_mapping = {torch.int8: 8, torch.int16: 16, torch.int32: 32, torch.int64: 64}
self.compress_bits = dtype_bits_mapping[compression_dtype]
self.n_pack = self.compress_bits // self.bits
self.compressed_dtype = compression_dtype
self.float_type = scale_dtype
# K is input channel, N is output channel
assert compression_dim in [0, 1], (
"Only support 0 or 1 as compression dimension, " + "0 is output channel, 1 is input channel."
)
if self.use_hf_format:
if self.use_optimum_format:
self.float_type = torch.float16
self.compressed_dtype = torch.int32
self.register_buffer(
"scales",
torch.zeros(
@@ -276,7 +276,10 @@ def __init__(
).to(device),
)
self.qzeros = self.qzeros.T
self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device))
else:
self.compressed_dtype = compression_dtype
self.float_type = scale_dtype
self.register_buffer(
"scales",
torch.zeros(
@@ -316,18 +319,18 @@ def __init__(
dtype=self.compressed_dtype,
).to(device),
)
if bias:
self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device))
else:
self.bias = None
if g_idx:
self.register_buffer("g_idx", torch.zeros(in_features, dtype=torch.int32).to(device))
else:
self.g_idx = None
if bias:
self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device))
else:
self.bias = None

def pack(self, int_weight, scale, zp, bias, g_idx=None):
int_weight = int_weight.to(self.device)
if self.use_hf_format and zp is None:
if self.use_optimum_format and zp is None:
# to avoid overflow
int_weight = int_weight.type(torch.int32)
shift_bias = 2 ** (self.bits - 1)
@@ -339,13 +342,13 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
if g_idx is not None:
assert hasattr(self, "g_idx"), "g_idx is not set when initializing."
self.g_idx = g_idx.type(torch.int32).to(self.device)
if self.use_hf_format:
if self.use_optimum_format:
invperm = torch.argsort(self.g_idx)
self.g_idx = invperm // self.groupsize
self.g_idx = self.g_idx.type(torch.int32).to(self.device)
assert scale.shape == self.scales.shape, "Scale shape is mismatched."
self.scales = scale.type(self.float_type).to(self.device)
if not self.use_hf_format and self.compression_dim == 0:
if not self.use_optimum_format and self.compression_dim == 0:
int_weight = int_weight.T
self.qweight = self.qweight.T
origin_shape = int_weight.shape
@@ -362,14 +365,14 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
tmp[:, e] &= mask
tmp[:, e] = tmp[:, e] << (self.bits * e)
self.qweight[:, j] |= tmp[:, e]
if not self.use_hf_format and self.compression_dim == 0:
if not self.use_optimum_format and self.compression_dim == 0:
self.qweight = self.qweight.T

if zp is not None:
zp = zp.to(self.device)
if self.use_hf_format:
if self.use_optimum_format:
zp -= 1
if self.use_hf_format or self.compression_dim == 0:
if self.use_optimum_format or self.compression_dim == 0:
zp = zp.T
self.qzeros = self.qzeros.T
assert hasattr(self, "qzeros"), "zp is not set when initializing."
@@ -382,23 +385,19 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
tmp[:, e] &= mask
tmp[:, e] = tmp[:, e] << (self.bits * e)
self.qzeros[:, j] |= tmp[:, e]
if self.use_hf_format or self.compression_dim == 0:
if self.use_optimum_format or self.compression_dim == 0:
self.qzeros = self.qzeros.T
if self.use_hf_format:
if self.use_optimum_format:
self.scales = self.scales.T
self.qweight = self.qweight.T
self.g_idx = self.g_idx
self.qzeros = self.qzeros.T

def recover(self):
logger.debug(f"Recovering {self} weight")
if self.use_hf_format:
# Prevent broken id links of self.scales and self.scales
self.scales = self.scales.T
self.qweight = self.qweight.T
self.g_idx = self.g_idx
self.qzeros = self.qzeros.T
device = self.scales.device
scales = self.scales.T if self.use_optimum_format else self.scales
qweight = self.qweight.T if self.use_optimum_format else self.qweight

device = scales.device
fp32_weight = torch.zeros(self.out_features, self.in_features, dtype=self.float_type).to(device)
if self.g_idx is None:
# used for recovering fp32_weight
@@ -410,8 +409,7 @@ def recover(self):
weight_dtype = torch.int8
# unpack weight
weight = torch.zeros(self.out_features, self.in_features, dtype=weight_dtype).to(device)
qweight = self.qweight
if not self.use_hf_format and self.compression_dim == 0:
if not self.use_optimum_format and self.compression_dim == 0:
weight = weight.T
qweight = qweight.T
origin_shape = weight.shape
@@ -427,7 +425,7 @@ def recover(self):
if weight_dtype == torch.uint8:
tmp &= mask # remove sign bit
weight[:, index] = tmp.type(weight_dtype)
if not self.use_hf_format and self.compression_dim == 0:
if not self.use_optimum_format and self.compression_dim == 0:
weight = weight.T
if "int" not in self.dtype:
new_weight = torch.zeros(self.out_features, self.in_features).to(device)
@@ -437,9 +435,9 @@ def recover(self):
# unpack zero_point
if hasattr(self, "qzeros"):
zp_dtype = self.compressed_dtype # to avoid overflow when weight-zp
zp = torch.zeros(self.scales.shape, dtype=zp_dtype).to(device)
qzeros = self.qzeros
if self.use_hf_format or self.compression_dim == 0:
zp = torch.zeros(scales.shape, dtype=zp_dtype).to(device)
qzeros = self.qzeros.T if self.use_optimum_format else self.qzeros
if self.use_optimum_format or self.compression_dim == 0:
zp = zp.T
qzeros = qzeros.T
origin_shape = zp.shape
@@ -454,30 +452,34 @@ def recover(self):
tmp = tmp >> self.compress_bits - self.bits
tmp &= mask
zp[:, index] = tmp.type(zp_dtype)
if self.use_hf_format or self.compression_dim == 0:
if self.use_optimum_format or self.compression_dim == 0:
zp = zp.T
if self.use_hf_format:
if self.use_optimum_format:
# zp -= 1 may cause zp == -1, after recover it becomes 2**self.bits - 1
zp += 1
zp = torch.where(zp > (2**self.bits - 1), 0, zp)
# recover fp32 weight with int_weight, scale, and zero_point
for idx in range(self.in_features):
fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.g_idx[idx]]) * self.scales[:, self.g_idx[idx]]
fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.g_idx[idx]]) * scales[:, self.g_idx[idx]]
else:
# recover fp32 weight with int_weight, scale
for idx in range(self.in_features):
fp32_weight[:, idx] = weight[:, idx] * self.scales[:, self.g_idx[idx]]
fp32_weight[:, idx] = weight[:, idx] * scales[:, self.g_idx[idx]]
return fp32_weight

def forward(self, input):
weight = self.recover()
device = self.scales.device
if weight.dtype == torch.float16 and device.type == "cpu":
weight = weight.float()
self.bias = self.bias.float() if self.bias is not None else None
if level == DEBUG:
if not hasattr(self, "weight"):
self.weight = self.recover()
self.weight = weight
input = input.type(self.weight.dtype)
logger.debug(f"Calculating {self}")
return F.linear(input, self.weight, self.bias)
else:
weight = self.recover()
input = input.type(weight.dtype)
return F.linear(input, weight, self.bias)

Expand All @@ -489,8 +491,8 @@ def extra_repr(self) -> str:
self.groupsize,
self.bias is not None,
)
if self.use_hf_format:
tmp_str += ", use_hf_format=True"
if self.use_optimum_format:
tmp_str += ", use_optimum_format=True"
return tmp_str


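
As a rough illustration of the scheme implemented by `pack()` and `recover()` above: `n_pack = compress_bits // bits` values are masked and OR-ed into each compressed word, and unpacking shifts each slot back down before masking off the sign extension. A minimal standalone sketch (4-bit values packed into `torch.int32`), independent of the class itself:

```python
import torch

bits, compress_bits = 4, 32
n_pack = compress_bits // bits          # 8 values per int32 word
mask = torch.tensor(2**bits - 1, dtype=torch.int32)

values = torch.randint(0, 2**bits, (16,), dtype=torch.int32)  # unsigned 4-bit values

# pack: OR each value into its slot, mirroring the shift-and-or loop in pack()
packed = torch.zeros(values.numel() // n_pack, dtype=torch.int32)
for j in range(packed.numel()):
    for e in range(n_pack):
        packed[j] |= (values[j * n_pack + e] & mask) << (bits * e)

# recover: move each slot to the top, shift it back down, and mask, as in recover()
unpacked = torch.zeros_like(values)
for j in range(packed.numel()):
    for e in range(n_pack):
        tmp = packed[j] << (compress_bits - bits * (e + 1))
        unpacked[j * n_pack + e] = (tmp >> (compress_bits - bits)) & mask

assert torch.equal(values, unpacked)
```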
4 changes: 2 additions & 2 deletions neural_compressor/adaptor/torch_utils/weight_only.py
@@ -396,7 +396,7 @@ def rtn_quantize(
compression_dim = kwargs.get("compression_dim", 1)
scale_dtype = kwargs.get("scale_dtype", torch.float32)
device = kwargs.get("device", "cpu")
use_hf_format = kwargs.get("use_hf_format", False)
use_optimum_format = kwargs.get("use_optimum_format", True)
for name, m in model.named_modules():
if m.__class__.__name__ not in supported_layers:
continue
@@ -452,7 +452,7 @@ def rtn_quantize(
compression_dim=compression_dim,
scale_dtype=scale_dtype,
device=device,
use_hf_format=use_hf_format,
use_optimum_format=use_optimum_format,
)
new_module.pack(int_weight, scale, zp, m.bias)
if name == "":
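
A hedged usage sketch of this internal helper with the new default. The positional arguments mirror the call site in `pytorch.py` above; the toy model and any other defaults are assumptions:

```python
import torch
from neural_compressor.adaptor.torch_utils.weight_only import rtn_quantize

model = torch.nn.Sequential(torch.nn.Linear(64, 64))  # toy float model

# return_int=True swaps each Linear for a packed WeightOnlyLinear; use_optimum_format
# now defaults to True, so the packed layout follows the Optimum convention.
q_model = rtn_quantize(
    model,
    4,       # num_bits
    32,      # group_size
    "sym",   # scheme
    return_int=True,
    use_optimum_format=True,  # read via kwargs.get("use_optimum_format", True) above
)
print(q_model)
```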
10 changes: 5 additions & 5 deletions neural_compressor/model/torch_model.py
@@ -459,7 +459,7 @@ def export_compressed_model(
scale_dtype=torch.float32,
gptq_config_path=None,
device="cpu",
use_hf_format=False,
use_optimum_format=True,
):
"""Convert Linear to WeightOnlyLinear for low memory inference.

@@ -475,7 +475,7 @@ def export_compressed_model(
Defaults to torch.float32.
gptq_config_path (str, optional): Path of gptq_config.json. Defaults to None.
device (str, optional): choose device for compression. Defaults to cpu.
use_hf_format (bool, optional): use the popular huggingface compression format.
use_optimum_format (bool, optional): use the popular format used in Optimum (accepted on the HuggingFace hub).
1: compression_dim: weight = 1, zeros = 0, and both are transposed.
2: zeros -= 1 before compression, matching the Optimum convention.
3: g_idx: use the same number for one group instead of recording the channel order.
@@ -520,7 +520,7 @@ def export_compressed_model(
compression_dim=compression_dim,
scale_dtype=scale_dtype,
device=device,
use_hf_format=use_hf_format,
use_optimum_format=use_optimum_format,
)
set_module(self.model, k, new_module)
continue
@@ -551,7 +551,7 @@ def export_compressed_model(
compression_dim=compression_dim,
scale_dtype=scale_dtype,
device=device,
use_hf_format=use_hf_format,
use_optimum_format=use_optimum_format,
)
new_module.pack(int_weight, gptq_scale, gptq_zp, m.bias, gptq_perm)
set_module(self.model, k, new_module)
@@ -578,7 +578,7 @@ def export_compressed_model(
compression_dim=compression_dim,
scale_dtype=scale_dtype,
device=device,
use_hf_format=use_hf_format,
use_optimum_format=use_optimum_format,
)
set_module(self.model, k, mod)
return self.model
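
Finally, a hedged end-to-end sketch of `export_compressed_model` with the new default: after a weight-only quantization pass, the exported module packs weights in the Optimum layout unless `use_optimum_format=False` is passed. The toy model, default RTN settings, and file name are illustrative assumptions:

```python
import torch
from neural_compressor import PostTrainingQuantConfig, quantization

model = torch.nn.Sequential(torch.nn.Linear(64, 64))   # toy float model
conf = PostTrainingQuantConfig(approach="weight_only")  # default RTN settings assumed

q_model = quantization.fit(model, conf)
compressed = q_model.export_compressed_model(use_optimum_format=True)  # True is now the default
torch.save(compressed.state_dict(), "compressed_toy_model.pt")
```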