fix sq skip_connection bugs (#1011)
Signed-off-by: Lu, Yintong <[email protected]>
yintong-lu authored Jul 18, 2023
1 parent 79be8b9 commit 4bcce1d
Showing 2 changed files with 137 additions and 31 deletions.
68 changes: 37 additions & 31 deletions neural_compressor/adaptor/torch_utils/smooth_quant.py
@@ -835,17 +835,15 @@ def _trace(self, op_types):
return absorb_to_layer, no_absorb_layers


def get_parent(node):
if node.inputs() == None:
return None
return list(node.inputs())[0].node()

def get_parents(node):
def get_parent(node, all_parents=False):
if node.inputs() == None:
return None
elif len(list(node.inputs())) == 0:
return None
return list(node.inputs())
if not all_parents:
return list(node.inputs())[0].node()
else:
return list(node.inputs())


class GraphTrace:
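The two old helpers are merged into a single get_parent with an all_parents flag. Note the asymmetry: the default path returns the producing node of the first input, while all_parents=True returns the raw input Values, so callers still have to call .node() on each element. A minimal usage sketch, assuming a TorchScript-traced toy model (the model and variable names below are illustrative, not part of the commit):

import torch
import torch.nn as nn

from neural_compressor.adaptor.torch_utils.smooth_quant import get_parent

# Illustrative sketch: trace a tiny model and inspect each aten node's producers
# with the unified helper defined above.
model = nn.Sequential(nn.LayerNorm(4), nn.Linear(4, 4, bias=False))
traced = torch.jit.trace(model, torch.rand(1, 4))

for node in traced.graph.nodes():
    if not node.kind().startswith('aten::'):
        continue
    first = get_parent(node)                     # producing node of the first input
    values = get_parent(node, all_parents=True)  # all input Values, or None if there are none
    print(node.kind(), '<-', first.kind())
    if values is not None:
        # with all_parents=True the caller still converts Values back to nodes
        print('  producers:', [v.node().kind() for v in values])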
@@ -912,32 +910,51 @@ def get_nodes(self, traced_model, op_types=['Linear']):
break
return nodes

def get_prev_absorb_layer(self, nodes, dict_parent_kind=None):
def get_prev_absorb_layer(self, nodes):
prev_absorb_layer = []
for node in nodes:
parent = get_parent(node)
parent_scopeName = parent.scopeName()
while 1:
if parent.kind() in self.skip_ops_to_find_absorb:
parent = get_parent(parent)
continue
if parent.kind() in self.could_absorb_layers:
if dict_parent_kind:
parent_out_kinds = set(dict_parent_kind[parent_scopeName])
parent_out_kinds.discard('aten::size')
if parent_out_kinds == parent_out_kinds.intersection(self.could_absorb_layers):
prev_absorb_layer.append(parent)
elif parent_out_kinds.intersection(self.skip_ops_to_find_absorb):
prev_absorb_layer.append(parent) ##TODO: check other scenarios
else: # When parent to multiple ops, sq transformation could be wrong.
prev_absorb_layer.append(None)
else:

parent_out_kinds = []
for val_user in list(parent.outputs())[0].uses():
next_node = val_user.user
parent_out_kinds.append(next_node.kind())
parent_out_kinds = set(parent_out_kinds)
parent_out_kinds.discard('aten::size')

if parent_out_kinds == parent_out_kinds.intersection(self.could_absorb_layers):
prev_absorb_layer.append(parent)
elif parent_out_kinds.intersection(self.skip_ops_to_find_absorb):
res = self.skip_op_absorb_helper(parent)
prev_absorb_layer.append(parent) if res else prev_absorb_layer.append(None)
else: # When parent to multiple ops, sq transformation could be wrong.
prev_absorb_layer.append(None)
else:
prev_absorb_layer.append(None)
break
return prev_absorb_layer


def skip_op_absorb_helper(self, parent_node):
for val_user in list(parent_node.outputs())[0].uses():
next_node = val_user.user
if next_node.kind() == 'aten::size':
continue
elif next_node.kind() in self.could_absorb_layers:
continue
elif next_node.kind() in self.skip_ops_to_find_absorb:
node_res = self.skip_op_absorb_helper(next_node)
if not node_res:
return False
else:
return False
return True

def mapping_torch_module_to_aten(self, op_types):
res = []
for op in op_types:
@@ -953,21 +970,10 @@ def get_absorb_to_layer(self, model, example_input, op_types):
if traced_model == None:
return None, None

dict_parent_kind = defaultdict(list)
for node in traced_model.graph.nodes():
parents_list = get_parents(node)
node_kind, node_scopeName = node.kind(), node.scopeName()
if parents_list: #save input_kinds of all parent nodes
for parent_ in parents_list:
parent = parent_.node()
parent_kind = parent.kind()
if 'prim' not in parent_kind and parent.scopeName() != node_scopeName:
dict_parent_kind[parent.scopeName()].append(node_kind)

aten_op_types = self.mapping_torch_module_to_aten(op_types)
nodes_types = self.get_nodes(traced_model, aten_op_types)
nodes = [node_type[0] for node_type in nodes_types]
nodes_prev_absorb = self.get_prev_absorb_layer(nodes, dict_parent_kind)
nodes_prev_absorb = self.get_prev_absorb_layer(nodes)
absorb_to_layer = {}
no_absorb_layers = []
for index, absorb in enumerate(nodes_prev_absorb):
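The heart of the fix is the consumer check in get_prev_absorb_layer: a candidate parent may absorb the SmoothQuant scale only if every consumer of its output (ignoring aten::size) is itself an absorbable layer, or is a skip op whose own consumers recursively satisfy the same condition, which is what skip_op_absorb_helper walks. If the output also feeds something else, e.g. the add of a skip connection, scaling the parent would change the model's output, so None is recorded instead. A self-contained sketch of that rule on a toy graph (the ToyNode class and the two op-kind sets are illustrative stand-ins, not the library's actual sets):

from dataclasses import dataclass, field
from typing import List

@dataclass
class ToyNode:
    """Toy stand-in for a torch._C.Node: an op kind plus the nodes consuming its output."""
    kind: str
    users: List["ToyNode"] = field(default_factory=list)

COULD_ABSORB = {"aten::linear", "aten::layer_norm"}    # illustrative subset
SKIP_OPS = {"aten::to", "aten::relu", "aten::dropout"} # illustrative subset

def can_absorb(parent: ToyNode) -> bool:
    # Every consumer (ignoring aten::size) must be absorbable, or a skip op whose
    # consumers recursively pass the same check; anything else blocks absorption.
    for user in parent.users:
        if user.kind == "aten::size" or user.kind in COULD_ABSORB:
            continue
        if user.kind in SKIP_OPS:
            if not can_absorb(user):
                return False
            continue
        return False  # e.g. aten::add on a skip connection
    return True

# layer_norm -> {linear, relu -> linear}: every path ends in an absorbable layer
ln_ok = ToyNode("aten::layer_norm",
                [ToyNode("aten::linear"), ToyNode("aten::relu", [ToyNode("aten::linear")])])
print(can_absorb(ln_ok))   # True

# layer_norm -> {linear, add}: the add blocks absorption, as in the new unit tests
ln_bad = ToyNode("aten::layer_norm", [ToyNode("aten::linear"), ToyNode("aten::add")])
print(can_absorb(ln_bad))  # False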
100 changes: 100 additions & 0 deletions test/algorithm/test_smooth_quant.py
@@ -3,6 +3,11 @@
import numpy as np
import shutil
import torch
import sys
import math

sys.path.append('./')

from neural_compressor.data import Datasets, DATALOADERS
from neural_compressor.data.dataloaders.pytorch_dataloader import PyTorchDataLoader
from neural_compressor.adaptor.torch_utils.smooth_quant import TorchSmoothQuant
@@ -701,5 +706,100 @@ def __iter__(self):
output2 = q_model.model(input_ids)


class TestSqSkipOp(unittest.TestCase):
@classmethod
def setUpClass(self):
class RandDataloader:
def __init__(self):
pass
def __iter__(self):
yield torch.rand((1, 4))

self.linear_dl = RandDataloader()

@classmethod
def test_sq_skip_op_auto(self):
class Model(torch.nn.Module):
def __init__(self):
super(Model, self).__init__()
self.linear0 = nn.Linear(4, 4, bias=False)
self.layernorm1 = nn.LayerNorm(4)
self.linear1 = nn.Linear(4, 4, bias=False)
self.ac1 = nn.ReLU()
self.ac2 = nn.LeakyReLU()
self.linear2 = nn.Linear(4, 4, bias=True)
self.linear3 = nn.Linear(4, 2, bias=True)
self.ac3 = nn.Sigmoid()

def forward(self, x):
x = self.linear0(x)
x1 = self.layernorm1(x)
x_l1 = self.linear1(x1)
x_ac1 = self.ac1(x1)
x_ac2 = self.ac2(x_ac1)
x_l2 = self.linear2(x1)
x = x_l1 * x_l2 + x_ac2
x = self.linear3(x)
x = self.ac3(x)
return x

model = Model()
sq = TorchSmoothQuant(model, self.linear_dl)
sq.transform(alpha='auto', calib_iter=1, folding=True)
        # the layernorm could not be used for sq-absorb because its output feeds an add op.
assert len(sq.absorb_to_layer) == 0

class TestSqSkipOp_attn(unittest.TestCase):
@classmethod
def setUpClass(self):
class RandDataloader:
def __init__(self):
pass
def __iter__(self):
yield torch.rand((1, 4))
self.linear_dl = RandDataloader()

@classmethod
def test_sq_skip_op_attn_auto(self):
class Model(torch.nn.Module):
def __init__(self):
super(Model, self).__init__()
self.hidden_size = 4
self.linear0 = nn.Linear(self.hidden_size, self.hidden_size,bias=False)
self.layernorm1 = nn.LayerNorm(self.hidden_size)
self.dim_k, self.dim_v = 8, 4
self.linear_q = nn.Linear(self.hidden_size, self.dim_k, bias=False)
self.linear_k = nn.Linear(self.hidden_size, self.dim_k, bias=False)
self.linear_v = nn.Linear(self.hidden_size, self.dim_v, bias=False)
self.ac1 = nn.ReLU()
self.ac2 = nn.LeakyReLU()
self.linear3 = nn.Linear(self.hidden_size, 3, bias=True)
self.ac3 = nn.Sigmoid()

def forward(self, x):
x = self.linear0(x)
x = self.layernorm1(x)
q = self.linear_q(x)
k = self.linear_k(x)
v = self.linear_v(x)
score = torch.matmul(q, k.transpose(1, 0)) / math.sqrt(self.dim_k)
score = torch.softmax(score, dim=-1)
attn = torch.matmul(score, v)
x_ac1 = self.ac1(x)
x_ac2 = self.ac2(x_ac1)
x = attn + x_ac2
x = self.linear3(x)
x = self.ac3(x)
return x


model = Model()
sq = TorchSmoothQuant(model, self.linear_dl)
sq.transform(alpha='auto', calib_iter=1, folding=True)
        # the layernorm could not be used for sq-absorb because its output feeds an add op.
assert len(sq.absorb_to_layer) == 0
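
Both new tests cover the negative case, where the add of the skip connection downstream of the LayerNorm blocks absorption and absorb_to_layer stays empty. For contrast, a hypothetical positive case would have the LayerNorm feed only Linear layers, so it should be reported as an absorb layer. The sketch below is illustrative only; the class name and the expected assertion are assumptions, not part of the commit:

class TestSqAbsorbPositive(unittest.TestCase):
    # Hypothetical counterpart: the LayerNorm output feeds only Linear layers,
    # so it is expected to show up as a SmoothQuant absorb layer.
    def test_sq_layernorm_absorb(self):
        class Model(torch.nn.Module):
            def __init__(self):
                super(Model, self).__init__()
                self.norm = nn.LayerNorm(4)
                self.linear_q = nn.Linear(4, 4, bias=False)
                self.linear_k = nn.Linear(4, 4, bias=False)

            def forward(self, x):
                x = self.norm(x)
                return self.linear_q(x) + self.linear_k(x)

        class RandDataloader:
            def __iter__(self):
                yield torch.rand((1, 4))

        model = Model()
        sq = TorchSmoothQuant(model, RandDataloader())
        sq.transform(alpha='auto', calib_iter=1, folding=True)
        # The norm's only consumers are Linear layers, so absorption is expected here.
        assert len(sq.absorb_to_layer) > 0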



if __name__ == '__main__':
unittest.main()
