From 4bcce1dcf1c6dc17430ed47a83138c44c15c482d Mon Sep 17 00:00:00 2001
From: yintong-lu <108845308+yintong-lu@users.noreply.github.com>
Date: Tue, 18 Jul 2023 15:56:48 +0800
Subject: [PATCH] fix sq skip_connection bugs (#1011)

Signed-off-by: Lu, Yintong
---
 .../adaptor/torch_utils/smooth_quant.py |  68 ++++++------
 test/algorithm/test_smooth_quant.py     | 100 ++++++++++++++++++
 2 files changed, 137 insertions(+), 31 deletions(-)

diff --git a/neural_compressor/adaptor/torch_utils/smooth_quant.py b/neural_compressor/adaptor/torch_utils/smooth_quant.py
index 00a383e4714..575c3d2724a 100644
--- a/neural_compressor/adaptor/torch_utils/smooth_quant.py
+++ b/neural_compressor/adaptor/torch_utils/smooth_quant.py
@@ -835,17 +835,15 @@ def _trace(self, op_types):
         return absorb_to_layer, no_absorb_layers
 
 
-def get_parent(node):
-    if node.inputs() == None:
-        return None
-    return list(node.inputs())[0].node()
-
-def get_parents(node):
+def get_parent(node, all_parents=False):
     if node.inputs() == None:
         return None
     elif len(list(node.inputs())) == 0:
         return None
-    return list(node.inputs())
+    if not all_parents:
+        return list(node.inputs())[0].node()
+    else:
+        return list(node.inputs())
 
 
 class GraphTrace:
@@ -912,32 +910,51 @@ def get_nodes(self, traced_model, op_types=['Linear']):
                     break
         return nodes
 
-    def get_prev_absorb_layer(self, nodes, dict_parent_kind=None):
+    def get_prev_absorb_layer(self, nodes):
         prev_absorb_layer = []
         for node in nodes:
             parent = get_parent(node)
-            parent_scopeName = parent.scopeName()
             while 1:
                 if parent.kind() in self.skip_ops_to_find_absorb:
                     parent = get_parent(parent)
                     continue
                 if parent.kind() in self.could_absorb_layers:
-                    if dict_parent_kind:
-                        parent_out_kinds = set(dict_parent_kind[parent_scopeName])
-                        parent_out_kinds.discard('aten::size')
-                        if parent_out_kinds == parent_out_kinds.intersection(self.could_absorb_layers):
-                            prev_absorb_layer.append(parent)
-                        elif parent_out_kinds.intersection(self.skip_ops_to_find_absorb):
-                            prev_absorb_layer.append(parent) ##TODO: check other scenarios
-                        else: # When parent to multiple ops, sq transformation could be wrong.
-                            prev_absorb_layer.append(None)
-                    else:
+
+                    parent_out_kinds = []
+                    for val_user in list(parent.outputs())[0].uses():
+                        next_node = val_user.user
+                        parent_out_kinds.append(next_node.kind())
+                    parent_out_kinds = set(parent_out_kinds)
+                    parent_out_kinds.discard('aten::size')
+
+                    if parent_out_kinds == parent_out_kinds.intersection(self.could_absorb_layers):
                         prev_absorb_layer.append(parent)
+                    elif parent_out_kinds.intersection(self.skip_ops_to_find_absorb):
+                        res = self.skip_op_absorb_helper(parent)
+                        prev_absorb_layer.append(parent if res else None)
+                    else:  # When the parent outputs to multiple ops, the sq transformation could be wrong.
+                        prev_absorb_layer.append(None)
                 else:
                     prev_absorb_layer.append(None)
                 break
         return prev_absorb_layer
+
+    def skip_op_absorb_helper(self, parent_node):
+        for val_user in list(parent_node.outputs())[0].uses():
+            next_node = val_user.user
+            if next_node.kind() == 'aten::size':
+                continue
+            elif next_node.kind() in self.could_absorb_layers:
+                continue
+            elif next_node.kind() in self.skip_ops_to_find_absorb:
+                node_res = self.skip_op_absorb_helper(next_node)
+                if not node_res:
+                    return False
+            else:
+                return False
+        return True
+
     def mapping_torch_module_to_aten(self, op_types):
         res = []
         for op in op_types:
@@ -953,21 +970,10 @@ def get_absorb_to_layer(self, model, example_input, op_types):
         if traced_model == None:
             return None, None
 
-        dict_parent_kind = defaultdict(list)
-        for node in traced_model.graph.nodes():
-            parents_list = get_parents(node)
-            node_kind, node_scopeName = node.kind(), node.scopeName()
-            if parents_list: #save input_kinds of all parent nodes
-                for parent_ in parents_list:
-                    parent = parent_.node()
-                    parent_kind = parent.kind()
-                    if 'prim' not in parent_kind and parent.scopeName() != node_scopeName:
-                        dict_parent_kind[parent.scopeName()].append(node_kind)
-
         aten_op_types = self.mapping_torch_module_to_aten(op_types)
         nodes_types = self.get_nodes(traced_model, aten_op_types)
         nodes = [node_type[0] for node_type in nodes_types]
-        nodes_prev_absorb = self.get_prev_absorb_layer(nodes, dict_parent_kind)
+        nodes_prev_absorb = self.get_prev_absorb_layer(nodes)
         absorb_to_layer = {}
         no_absorb_layers = []
         for index, absorb in enumerate(nodes_prev_absorb):
diff --git a/test/algorithm/test_smooth_quant.py b/test/algorithm/test_smooth_quant.py
index fc90b7f3bd5..c8ab47340fc 100644
--- a/test/algorithm/test_smooth_quant.py
+++ b/test/algorithm/test_smooth_quant.py
@@ -3,6 +3,11 @@
 import numpy as np
 import shutil
 import torch
+import sys
+import math
+
+sys.path.append('./')
+
 from neural_compressor.data import Datasets, DATALOADERS
 from neural_compressor.data.dataloaders.pytorch_dataloader import PyTorchDataLoader
 from neural_compressor.adaptor.torch_utils.smooth_quant import TorchSmoothQuant
@@ -701,5 +706,100 @@ def __iter__(self):
             output2 = q_model.model(input_ids)
 
 
+class TestSqSkipOp(unittest.TestCase):
+    @classmethod
+    def setUpClass(self):
+        class RandDataloader:
+            def __init__(self):
+                pass
+            def __iter__(self):
+                yield torch.rand((1, 4))
+
+        self.linear_dl = RandDataloader()
+
+    @classmethod
+    def test_sq_skip_op_auto(self):
+        class Model(torch.nn.Module):
+            def __init__(self):
+                super(Model, self).__init__()
+                self.linear0 = nn.Linear(4, 4, bias=False)
+                self.layernorm1 = nn.LayerNorm(4)
+                self.linear1 = nn.Linear(4, 4, bias=False)
+                self.ac1 = nn.ReLU()
+                self.ac2 = nn.LeakyReLU()
+                self.linear2 = nn.Linear(4, 4, bias=True)
+                self.linear3 = nn.Linear(4, 2, bias=True)
+                self.ac3 = nn.Sigmoid()
+
+            def forward(self, x):
+                x = self.linear0(x)
+                x1 = self.layernorm1(x)
+                x_l1 = self.linear1(x1)
+                x_ac1 = self.ac1(x1)
+                x_ac2 = self.ac2(x_ac1)
+                x_l2 = self.linear2(x1)
+                x = x_l1 * x_l2 + x_ac2
+                x = self.linear3(x)
+                x = self.ac3(x)
+                return x
+
+        model = Model()
+        sq = TorchSmoothQuant(model, self.linear_dl)
+        sq.transform(alpha='auto', calib_iter=1, folding=True)
+        # the layernorm cannot be used for sq-absorb because it outputs to an add op.
+        assert len(sq.absorb_to_layer) == 0
+
+class TestSqSkipOp_attn(unittest.TestCase):
+    @classmethod
+    def setUpClass(self):
+        class RandDataloader:
+            def __init__(self):
+                pass
+            def __iter__(self):
+                yield torch.rand((1, 4))
+        self.linear_dl = RandDataloader()
+
+    @classmethod
+    def test_sq_skip_op_attn_auto(self):
+        class Model(torch.nn.Module):
+            def __init__(self):
+                super(Model, self).__init__()
+                self.hidden_size = 4
+                self.linear0 = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
+                self.layernorm1 = nn.LayerNorm(self.hidden_size)
+                self.dim_k, self.dim_v = 8, 4
+                self.linear_q = nn.Linear(self.hidden_size, self.dim_k, bias=False)
+                self.linear_k = nn.Linear(self.hidden_size, self.dim_k, bias=False)
+                self.linear_v = nn.Linear(self.hidden_size, self.dim_v, bias=False)
+                self.ac1 = nn.ReLU()
+                self.ac2 = nn.LeakyReLU()
+                self.linear3 = nn.Linear(self.hidden_size, 3, bias=True)
+                self.ac3 = nn.Sigmoid()
+
+            def forward(self, x):
+                x = self.linear0(x)
+                x = self.layernorm1(x)
+                q = self.linear_q(x)
+                k = self.linear_k(x)
+                v = self.linear_v(x)
+                score = torch.matmul(q, k.transpose(1, 0)) / math.sqrt(self.dim_k)
+                score = torch.softmax(score, dim=-1)
+                attn = torch.matmul(score, v)
+                x_ac1 = self.ac1(x)
+                x_ac2 = self.ac2(x_ac1)
+                x = attn + x_ac2
+                x = self.linear3(x)
+                x = self.ac3(x)
+                return x
+
+
+        model = Model()
+        sq = TorchSmoothQuant(model, self.linear_dl)
+        sq.transform(alpha='auto', calib_iter=1, folding=True)
+        # the layernorm cannot be used for sq-absorb because it outputs to an add op.
+        assert len(sq.absorb_to_layer) == 0
+
+
 if __name__ == '__main__':
     unittest.main()