Support MatMulFpQ4 for onnxruntime 1.16.0 #1293

Merged 4 commits on Oct 12, 2023
2 changes: 1 addition & 1 deletion neural_compressor/adaptor/onnxrt.py
@@ -1690,7 +1690,7 @@ def _dump_model_op_stats(self, model, tune_cfg):
 
         dtype_set = set()
         for node in model.nodes():
-            if node.op_type == "MatMulWithQuantWeight":
+            if node.op_type == "MatMulFpQ4":
                 optype = "MatMul"
             else:
                 optype = node.op_type
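This first hunk only affects reporting: MatMulFpQ4 nodes are folded back under MatMul in the op statistics table. A minimal sketch of that grouping, with a hypothetical stand-in node class that is not part of the patch:

from collections import Counter

class FakeNode:
    # Stand-in for an ONNX NodeProto; only op_type matters here.
    def __init__(self, op_type):
        self.op_type = op_type

nodes = [FakeNode("MatMulFpQ4"), FakeNode("MatMul"), FakeNode("Gemm")]
stats = Counter("MatMul" if n.op_type == "MatMulFpQ4" else n.op_type for n in nodes)
print(stats)  # Counter({'MatMul': 2, 'Gemm': 1})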
115 changes: 42 additions & 73 deletions neural_compressor/adaptor/ox_utils/weight_only.py
@@ -20,63 +20,28 @@
 import logging
 import math
 import os
+import struct
 import sys
 
 import numpy as np
 import onnx
 from onnx import helper, numpy_helper
 from onnx import onnx_pb as onnx_proto
+from packaging.version import Version
 
 from neural_compressor.model.model import BaseModel
 from neural_compressor.model.onnx_model import ONNXModel
 from neural_compressor.utils.utility import LazyImport
 
 ort = LazyImport("onnxruntime")
 logger = logging.getLogger("neural_compressor")
 
 
-WEIGHT_ONLY_OP_SUPPORTED = False
-
-
-def check_op_support_status():
-    """Check whether weight-only op is supported."""
-    input_tensor = helper.make_tensor_value_info("input", 1, [1, 32])
-    output_tensor = helper.make_tensor_value_info("output", 1, [1, 64])
-    initializers = []
-    # weight shape (32, 64)
-    packed_weight = np.random.randint(0, high=16, size=(64, 1, 16), dtype="uint8")
-    initializers.append(onnx.helper.make_tensor("weight", 2, packed_weight.shape, packed_weight.flatten().tolist()))
-    scale = np.random.random((64, 1)).astype("float32")
-    initializers.append(onnx.helper.make_tensor("scale", 1, scale.shape, scale.flatten().tolist()))
-
-    kwargs = {}
-    kwargs["K"] = 32
-    kwargs["N"] = 64
-    kwargs["bits"] = 4
-    kwargs["block_size"] = 32
-    node = onnx.helper.make_node(
-        "MatMulWithQuantWeight",
-        inputs=["input", "weight", "scale"],
-        outputs=["output"],
-        name="test",
-        domain="com.microsoft",
-        **kwargs,
-    )
-
-    global WEIGHT_ONLY_OP_SUPPORTED
-    graph = helper.make_graph([node], "test", [input_tensor], [output_tensor], initializer=initializers)
-    model = helper.make_model(graph)
-    try:
-        ort.InferenceSession(model.SerializeToString(), providers=["CPUExecutionProvider"])
-        WEIGHT_ONLY_OP_SUPPORTED = True
-    except:
-        WEIGHT_ONLY_OP_SUPPORTED = False
+ONNXRT116_VERSION = Version("1.16.0")
 
 
 def make_matmul_weight_only_node(
     node, weight_shape, num_bits, group_size, k_blocks, q_weight, scale, zero_point
 ):  # pragma: no cover
-    """Build MatMulWithQuantWeight node.
+    """Build MatMulFpQ4 node.
 
     Args:
         node: original matmul node
@@ -89,46 +54,49 @@ def make_matmul_weight_only_node(
         zero_point (array): zero point
 
     Returns:
-        matmul_weight_only_node: MatMulWithQuantWeight node
-        new_inits: initializers of the MatMulWithQuantWeight node
+        matmul_weight_only_node: MatMulFpQ4 node
+        new_inits: initializers of the MatMulFpQ4 node
     """
-    blob_size = group_size // 2
+    if zero_point is not None:
+        blob_size = group_size // 2 + 4 + 1
+        offset = 5
+    else:
+        blob_size = group_size // 2 + 4
+        offset = 4
 
     packed = np.zeros((q_weight.shape[0], blob_size), dtype="uint8")
     for i in range(q_weight.shape[0]):
-        for k in range(0, group_size, 2):
-            packed[i][k // 2] = q_weight[i][k] | q_weight[i][k + 1] << 4
+        bf = struct.pack("f", scale[i])
+        packed[i][0] = bf[0]
+        packed[i][1] = bf[1]
+        packed[i][2] = bf[2]
+        packed[i][3] = bf[3]
 
-    packed = np.reshape(packed, (-1, k_blocks, blob_size))
-    scale = np.reshape(scale, (-1, k_blocks)).astype("float32")
+        if zero_point is not None:
+            packed[i][4] = zero_point[i]
+
+        packed[i][offset:] = np.bitwise_or(
+            q_weight[i][: group_size // 2], np.left_shift(q_weight[i][group_size // 2 :], num_bits)
+        )
 
+    packed = packed.reshape(-1)
     q_weight_tensor = onnx.helper.make_tensor(
         name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)),
         data_type=2,
         dims=packed.shape,
         vals=packed.tobytes(),
         raw=True,
     )
-    scale_tensor = onnx.helper.make_tensor(
-        name=node.input[1] + "_scale", data_type=1, dims=scale.shape, vals=scale.tobytes(), raw=True
+    shape_tensor = onnx.helper.make_tensor(
+        name=node.input[1] + "_shape", data_type=7, dims=(2,), vals=np.array(weight_shape, dtype="int64")
     )
-    input_names = [node.input[0], q_weight_tensor.name, scale_tensor.name]
-    new_inits = [q_weight_tensor, scale_tensor]
-
-    if zero_point is not None:
-        zero_point = np.reshape(zero_point, (-1, k_blocks)).astype("uint8")
-        zp_tensor = onnx.helper.make_tensor(
-            name=node.input[1] + "_zp", data_type=2, dims=zero_point.shape, vals=zero_point.tobytes(), raw=True
-        )
-        input_names.append(zp_tensor.name)
-        new_inits.append(zp_tensor)
+    input_names = [node.input[0], q_weight_tensor.name, shape_tensor.name]
+    new_inits = [q_weight_tensor, shape_tensor]
 
     kwargs = {}
-    kwargs["K"] = weight_shape[0]
-    kwargs["N"] = weight_shape[1]
-    kwargs["bits"] = num_bits
-    kwargs["block_size"] = group_size
+    kwargs["blk_quant_type"] = 1 if zero_point is not None else 0
    matmul_weight_only_node = onnx.helper.make_node(
-        "MatMulWithQuantWeight",
+        "MatMulFpQ4",
         inputs=input_names,
         outputs=node.output,
         name=node.name + "_Q" + str(num_bits) if node.name else "_Q" + str(num_bits),
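The packing loop above writes one blob per quantized row group: a 4-byte float32 scale, an optional 1-byte zero point (when blk_quant_type is 1), then group_size // 2 bytes in which byte j holds quantized value j in the low nibble and value j + group_size // 2 in the high nibble. A minimal round-trip sketch of that layout, assuming the zero-point variant (names and values here are illustrative, not from the patch):

import struct
import numpy as np

group_size = 32
q = np.random.randint(0, 16, size=group_size).astype("uint8")  # 4-bit codes
scale, zero_point = 0.01, 8

# Blob layout: [scale f32 | zero point u8 | 16 bytes of packed nibbles]
blob = np.zeros(group_size // 2 + 4 + 1, dtype="uint8")
blob[0:4] = np.frombuffer(struct.pack("f", scale), dtype="uint8")
blob[4] = zero_point
blob[5:] = np.bitwise_or(q[: group_size // 2], np.left_shift(q[group_size // 2 :], 4))

# Unpack and verify the round trip.
assert struct.unpack("f", blob[0:4].tobytes())[0] == np.float32(scale)
low = np.bitwise_and(blob[5:], 0x0F)
high = np.right_shift(blob[5:], 4)
assert np.array_equal(np.concatenate([low, high]), q)

Because the packed blob is flattened to 1-D, the weight's original (K, N) shape now travels separately in the _shape initializer (int64, data_type 7) instead of in K/N node attributes as before.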
@@ -260,7 +228,6 @@ def rtn_quantize(
     Returns:
         model: fake quantized ONNXModel
     """
-    check_op_support_status()
     model = model if isinstance(model, BaseModel) else ONNXModel(model)
     new_nodes = []
     remove_nodes = []
@@ -290,8 +257,8 @@
 
         weight = pad_tensor(weight, group_size, k_blocks)
 
-        if WEIGHT_ONLY_OP_SUPPORTED and num_bits == 4 and group_size == 32:  # pragma: no cover
-            # currently MatMulWithQuantWeights only support 4 bits and 32 group_size
+        if Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32:  # pragma: no cover
+            # currently MatMulFpQ4 only support 4 bits and 32 group_size
             q_weight, scale, zp = quant_tensor(
                 weight.T, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1)
             )
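Since onnxruntime 1.16.0 is the first release to ship the com.microsoft MatMulFpQ4 kernel, a plain version compare can replace the old check_op_support_status probe, which built a throwaway model and tried to create an InferenceSession. The gate used throughout this file, as a standalone sketch (the helper name is mine, not the patch's):

import onnxruntime as ort
from packaging.version import Version

ONNXRT116_VERSION = Version("1.16.0")

def use_matmul_fp_q4(num_bits: int, group_size: int) -> bool:
    # True when the real MatMulFpQ4 op can carry the quantized weight;
    # the kernel currently supports only 4 bits with a group size of 32.
    return Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32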
@@ -394,7 +361,9 @@ def apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits,
             weight = weight.T * scales
             weight = pad_tensor(weight, group_size, (org_w_shape[0] + group_size - 1) // group_size).T
 
-            if WEIGHT_ONLY_OP_SUPPORTED and num_bits == 4 and group_size == 32:  # pragma: no cover
+            if (
+                Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32
+            ):  # pragma: no cover
                 q_weight = qdq_tensor(weight, num_bits, group_size, scheme, "uint") / np.expand_dims(
                     scales, axis=-1
                 )
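On this path qdq_tensor only fake-quantizes the scaled weight so the AWQ search can score it, using unsigned quantization when the real op will eventually carry the weight. A simplified single-group sketch of such a quantize-dequantize round trip (qdq_uint4 is illustrative; the patch's qdq_tensor additionally handles grouping and clip ratios):

import numpy as np

def qdq_uint4(w: np.ndarray) -> np.ndarray:
    # Fake-quantize one group to asymmetric uint4, then dequantize back.
    scale = (w.max() - w.min()) / 15  # 15 = 2**4 - 1 quantization levels
    zero_point = np.round(-w.min() / scale)
    q = np.clip(np.round(w / scale) + zero_point, 0, 15)
    return scale * (q - zero_point)

w = np.random.randn(32).astype("float32")
print(np.abs(w - qdq_uint4(w)).max())  # worst-case rounding error is about scale / 2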
@@ -535,8 +504,10 @@ def apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, g
             for i_s in range(10):
                 ratio = 1 - i_s / 100
                 weight = copy.deepcopy(org_weight)
-                if WEIGHT_ONLY_OP_SUPPORTED and num_bits == 4 and group_size == 32:  # pragma: no cover
-                    # currently MatMulWithQuantWeights only support 4 bits and 32 group_size
+                if (
+                    Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32
+                ):  # pragma: no cover
+                    # currently MatMulFpQ4 only support 4 bits and 32 group_size
                     weight = qdq_tensor(weight, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1))
                 else:
                     weight = qdq_tensor(weight, num_bits, group_size, scheme, "int", ratios.get(node.input[1], 1))
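The surrounding loop is a clip-ratio search: it tries shrinking the quantization range in 1% steps from 1.00 down to 0.91 and keeps whichever ratio best preserves the layer output. A compact sketch of that idea (search_clip_ratio and the qdq callable are illustrative, not the patch's API):

import copy
import numpy as np

def search_clip_ratio(weight, inp, qdq):
    # Return the ratio in {1.00, 0.99, ..., 0.91} minimizing output MSE.
    best_ratio, best_err = 1.0, float("inf")
    org_out = inp @ weight
    for i_s in range(10):
        ratio = 1 - i_s / 100
        q_out = inp @ qdq(copy.deepcopy(weight), ratio)
        err = float(np.mean((org_out - q_out) ** 2))
        if err < best_err:
            best_ratio, best_err = ratio, err
    return best_ratio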
@@ -644,7 +615,6 @@ def awq_quantize(
     Returns:
         model: fake quantized ONNXModel
     """
-    check_op_support_status()
     model = model if isinstance(model, BaseModel) else ONNXModel(model)
     output_dicts = {}
     full_ratio = {}
@@ -918,7 +888,6 @@ def gptq_quantize(
     Returns:
         model: fake quantized ONNXModel
     """
-    check_op_support_status()
     model = model if isinstance(model, BaseModel) else ONNXModel(model)
     output_dicts = {}
 
@@ -1013,8 +982,8 @@ def gptq_quantize(
 
         weight_tensor = model.get_initializer(node.input[1])
         init_share_num = model.get_initializer_share_num(node.input[1])
-        if WEIGHT_ONLY_OP_SUPPORTED and num_bits == 4 and group_size == 32:  # pragma: no cover
-            # currently MatMulWithQuantWeights only support 4 bits and 32 group_size
+        if Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32:  # pragma: no cover
+            # currently MatMulFpQ4 only support 4 bits and 32 group_size
             org_shape = weight.shape
             k_blocks = (org_shape[0] + group_size - 1) // group_size
             q_weight = pad_tensor(q_weight, group_size, k_blocks)
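pad_tensor is called with k_blocks computed by ceiling division, so the K dimension always divides evenly into group_size chunks before packing. A sketch of its assumed behavior, matching the call sites above (the real helper lives elsewhere in this file and may differ in detail):

import numpy as np

def pad_tensor(weight: np.ndarray, group_size: int, k_blocks: int) -> np.ndarray:
    # Zero-pad the K (row) dimension of a K x N weight up to k_blocks * group_size.
    pad_len = k_blocks * group_size - weight.shape[0]
    if pad_len > 0:
        weight = np.pad(weight, ((0, pad_len), (0, 0)), "constant")
    return weight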