[FQ2I] Support converting dense -> add to qnn.dense -> add -> requantize #13578

Merged · 12 commits · Dec 9, 2022
5 changes: 4 additions & 1 deletion python/tvm/relay/frontend/onnx.py
@@ -1406,7 +1406,10 @@ def _impl_v1(cls, inputs, attr, params):
     inputs[0] *= _expr.const(alpha, dtype=dtype)
     out = _op.nn.dense(inputs[0], inputs[1], units=channels)
     if len(inputs) == 3:
-        out = out + _expr.const(beta, dtype=dtype) * inputs[2]
+        if beta != 1.0:
+            out += _expr.const(float(beta), dtype=dtype) * inputs[2]
+        else:
+            out += inputs[2]
     return out


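For context, ONNX Gemm computes Y = alpha * A * B' + beta * C (with transB=1, the layout `nn.dense` expects). When beta == 1.0 the scalar multiply is the identity, so the importer can now emit a plain add — exactly the `dense` -> `add` shape the FQ2I changes below pattern-match. A minimal NumPy sketch of that semantics (illustrative only; `gemm_ref` is a hypothetical helper, not a TVM API):

import numpy as np

def gemm_ref(a, b, c=None, alpha=1.0, beta=1.0):
    out = alpha * (a @ b.T)
    if c is not None:
        # beta == 1.0 makes the scalar multiply a no-op, so the graph can
        # use a plain add instead of const(beta) * c followed by an add.
        out = out + (c if beta == 1.0 else beta * c)
    return out

a = np.random.rand(4, 8).astype("float32")
b = np.random.rand(16, 8).astype("float32")
c = np.random.rand(16).astype("float32")
assert np.allclose(gemm_ref(a, b, c), a @ b.T + c)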
31 changes: 31 additions & 0 deletions python/tvm/relay/transform/fake_quantization_to_integer.py
@@ -502,6 +502,37 @@ def register_binary_qnn(op_name, op):

 def binary(expr, type_map):
     left, right, left_t, right_t, out_t = get_binary_types(expr, type_map)
+
+    if (
+        op_name == "add"
+        and approx_equal(left_t.scale, right_t.scale)
+        and approx_equal(left_t.zero_point, right_t.zero_point)
+        and tvm.ir.structural_equal(left_t.dtype, right_t.dtype)
+        and left_t.dtype == "int32"
+        and approx_equal(left_t.scale, out_t.scale)
+        and approx_equal(left_t.zero_point, out_t.zero_point)
+        and np.all(out_t.zero_point.data.numpy() == 0)
+    ):
+        # If this add op comes after conv2d or dense, out_t.scale and out_t.zero_point
+        # can be a vector, which is not supported by QNN binary operators.
+        # In particular, the pattern of an `add` op following `dense`, where the addition is
+        # really a bias addition, can come up often. We identify that pattern and convert it to
+        # `qnn.dense` -> `add`.
+        # To avoid overflow, we do this conversion only when the input data type is 32 bit (bias
+        # addition is typically done in 32 bit).
+        return [left + right, left_t]
+
+    assert (
+        len(out_t.scale.data.shape) == 0
+    ), "The output scale needs to be a scalar, but got a tensor of shape {}".format(
+        out_t.scale.data.shape
+    )
+    assert (
+        len(out_t.zero_point.data.shape) == 0
+    ), "The output zero point needs to be a scalar, but got a tensor of shape {}".format(
+        out_t.zero_point.data.shape
+    )
+
     out = op(
         left,
         right,
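The key arithmetic fact behind `return [left + right, left_t]` is that when both operands share a scale and a zero point of 0, addition in the integer domain is exact, so no `qnn.add` or requantize is needed at this node — the input's affine params simply propagate. A quick NumPy check of that identity (a sketch, not TVM code):

import numpy as np

s = np.array([0.5, 0.25], dtype="float32")  # a per-channel scale works too
a_q = np.array([100, -200], dtype="int64")  # stand-ins for the int32 values
b_q = np.array([7, 13], dtype="int64")

# (a_q + b_q) * s == a_q * s + b_q * s by distributivity, so adding the
# quantized values and keeping the input qparams loses nothing.
assert np.allclose((a_q + b_q) * s, a_q * s + b_q * s)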
2 changes: 1 addition & 1 deletion src/relay/transforms/fake_quantization_to_integer.cc
@@ -193,7 +193,7 @@ class SubgraphMutator : public ExprMutator {
       return Mutate(expr);
     } catch (std::exception& e) {
       if (hard_fail_) {
-        throw e;
+        LOG(FATAL) << e.what();
       } else {
         DLOG(INFO) << "Ran into an error rewriting a subgraph, skipping" << expr << std::endl;
         return expr;
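A note on this one-line change: in C++, `throw e;` inside a `catch (std::exception& e)` block rethrows a copy whose static type is `std::exception`, slicing away the derived exception type (a bare `throw;` would preserve it). Logging `e.what()` via `LOG(FATAL)` surfaces the original message while still aborting, which appears to be the intent of the hard-fail path.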
43 changes: 36 additions & 7 deletions tests/python/relay/test_pass_fake_quantization_to_integer.py
@@ -154,6 +154,41 @@ def test_fake_quantize_dense_per_channel():
     compare_fq_to_int(op, [x_np, w_np], allow_rounding_error=True)


+def test_fake_quantize_dense_bias():
+    out_dtype = "int8"
+    x = relay.var("x", shape=[128, 64], dtype="int8")
+    w = relay.var("w", shape=[256, 64], dtype="int8")
+    bias = relay.var("bias", shape=[256], dtype="int32")
+    one = relay.const(1.0)
+    zero = relay.const(0)
+    w_scale = np.random.random([256]).astype("float32")
+
+    op = relay.op.nn.dense(
+        relay.qnn.op.dequantize(x, relay.const(2.0), zero),
+        relay.qnn.op.dequantize(
+            w,
+            relay.const(w_scale),
+            zero,
+            axis=0,
+        ),
+        units=256,
+    )
+
+    op += relay.qnn.op.dequantize(
+        bias,
+        relay.const(2.0 * w_scale),
+        zero,
+    )
+
+    op = relay.qnn.op.quantize(op, one, zero, out_dtype=out_dtype)
+
+    x_np = np.random.randint(-128, 127, size=[128, 64], dtype="int8")
+    w_np = np.random.randint(-128, 127, size=[256, 64], dtype="int8")
+    bias_np = np.random.randint(-128, 127, size=[256], dtype="int32")
+
+    compare_fq_to_int(op, [x_np, w_np, bias_np], allow_rounding_error=True)
+
+
def test_fake_quantize_batch_matmul():
for out_dtype in ["int8", "uint8"]:
x = relay.var("x", shape=[1, 128, 64], dtype="int8")
@@ -976,15 +1011,9 @@ def test_fq_qat_positive_nothing_to_do():
     op1 = relay.qnn.op.quantize(
         relay.const(1.0), relay.const(12.0), relay.const(0), out_dtype="int32"
     )
-    op2 = relay.qnn.op.add(
+    op2 = relay.op.add(
         op0,
         op1,
-        relay.const(12.0),
-        relay.const(0),
-        relay.const(12.0),
-        relay.const(0),
-        relay.const(12.0),
-        relay.const(0),
     )
     expected_expr = relay.qnn.op.requantize(
         op2, relay.const(12.0), relay.const(0), relay.const(1.0), relay.const(0), out_dtype="int8"
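This expected-output change follows directly from the new rewrite: both inputs already share scale 12.0 and zero point 0 in int32, so the add stays a plain `relay.op.add` in the integer domain, and a single trailing `qnn.requantize` maps the result to the output quantization params, instead of a `qnn.add` carrying all six params itself.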