fix ort woq failure with None model_path
Signed-off-by: yuwenzho <[email protected]>
yuwenzho committed Nov 23, 2023
1 parent 0f8bf5e commit cbd0a41
Showing 2 changed files with 46 additions and 8 deletions.
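The fix is a single pattern applied in four places (rtn_quantize, apply_awq_scale, apply_awq_clip, gptq_quantize): an ONNXModel built from an in-memory ModelProto has model_path set to None, and the old code passed os.path.dirname(model.model_path) straight to numpy_helper.to_array, which raises TypeError. A minimal runnable sketch of the guard — safe_base_dir is a hypothetical helper name for illustration; the commit simply inlines the conditional at the top of each function:

import os

import numpy as np
from onnx import numpy_helper

def safe_base_dir(model_path):
    # model_path is None when the model was loaded as an in-memory ModelProto;
    # os.path.dirname(None) raises TypeError, so fall back to "".
    return os.path.dirname(model_path) if model_path is not None else ""

# "" matches to_array's default base_dir; base_dir is only used to resolve
# external-data files, which an in-memory model does not reference.
tensor = numpy_helper.from_array(np.ones((2, 2), dtype=np.float32))
weight = numpy_helper.to_array(tensor, base_dir=safe_base_dir(None))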
20 changes: 12 additions & 8 deletions neural_compressor/adaptor/ox_utils/weight_only.py
@@ -312,6 +312,7 @@ def rtn_quantize(
model: fake quantized ONNXModel
"""
model = model if isinstance(model, BaseModel) else ONNXModel(model)
+ base_dir = os.path.dirname(model.model_path) if model.model_path is not None else ""
new_nodes = []
remove_nodes = []
for node in model.nodes():
@@ -321,7 +322,7 @@ def rtn_quantize(
and weight_config.get(node.name, {}) != "fp32"
):
weight_tensor = model.get_initializer(node.input[1])
- weight = numpy_helper.to_array(weight_tensor, base_dir=os.path.dirname(model.model_path)).copy()
+ weight = numpy_helper.to_array(weight_tensor, base_dir=base_dir).copy()
if len(weight.shape) != 2:
continue

@@ -401,6 +402,7 @@ def apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits,
new_added_mul_nodes = []
replace_input = []
updated_nodes = []
+ base_dir = os.path.dirname(model.model_path) if model.model_path is not None else ""

for parent, nodes in absorb_pairs.items():
if any([node.input[0] not in output_dicts for node in nodes]):
@@ -434,7 +436,7 @@ def apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits,
if weight_config.get(node.name, {}) == "fp32":
continue

- weight = numpy_helper.to_array(model.get_initializer(node.input[1]), os.path.dirname(model.model_path))
+ weight = numpy_helper.to_array(model.get_initializer(node.input[1]), base_dir)
if len(weight.shape) != 2:
continue

@@ -476,7 +478,7 @@ def apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits,

init_share_num = model.get_initializer_share_num(node.input[1])
weight_tensor = model.get_initializer(node.input[1])
- tensor = numpy_helper.to_array(weight_tensor, os.path.dirname(model.model_path))
+ tensor = numpy_helper.to_array(weight_tensor, base_dir)

tensor = tensor.T * best_scale
tensor = (tensor.T).astype("float32")
Expand All @@ -497,7 +499,7 @@ def apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits,
) == len(nodes):
for idx in [1, 2]:
tensor = numpy_helper.to_array(
- model.get_initializer(parent.input[idx]), os.path.dirname(model.model_path)
+ model.get_initializer(parent.input[idx]), base_dir
)
new_tensor = tensor / np.reshape(best_scale, (1, -1))
model.set_initializer(parent.input[idx], new_tensor.astype(tensor.dtype), raw=True)
@@ -511,7 +513,7 @@ def apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits,
): # pragma: no cover
for inp in parent.input:
if model.get_initializer(inp) is not None:
- tensor = numpy_helper.to_array(model.get_initializer(inp), os.path.dirname(model.model_path))
+ tensor = numpy_helper.to_array(model.get_initializer(inp), base_dir)
new_tensor = tensor / np.reshape(best_scale, (1, -1))
model.set_initializer(inp, new_tensor.astype(tensor.dtype), raw=True)
updated_nodes.append(parent.name)
@@ -520,7 +522,7 @@ def apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits,
elif parent.op_type in ["Conv", "FusedConv"] and len(model.input_name_to_nodes[nodes[0].input[0]]) == len(
nodes
): # pragma: no cover
- tensor = numpy_helper.to_array(model.get_initializer(parent.input[2]), os.path.dirname(model.model_path))
+ tensor = numpy_helper.to_array(model.get_initializer(parent.input[2]), base_dir)
new_tensor = tensor / np.reshape(best_scale, (1, -1))
model.set_initializer(parent.input[2], new_tensor.astype(tensor.dtype), raw=True)
updated_nodes.append(parent.name)
@@ -558,6 +560,7 @@ def apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits,

def apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, group_size, scheme):
"""Apply clip for weight by checking mse."""
+ base_dir = os.path.dirname(model.model_path) if model.model_path is not None else ""
ratios = {}
for parent, nodes in absorb_pairs.items():
if any([node.input[0] not in output_dicts for node in nodes]):
@@ -577,7 +580,7 @@ def apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, g
scheme = weight_config[node.name]["scheme"]

org_weight = numpy_helper.to_array(
- model.get_initializer(node.input[1]), base_dir=os.path.dirname(model.model_path)
+ model.get_initializer(node.input[1]), base_dir=base_dir
)
org_w_shape = org_weight.shape # ic, oc
group_size = group_size if group_size != -1 else org_w_shape[0]
@@ -983,6 +986,7 @@ def gptq_quantize(
model: fake quantized ONNXModel
"""
model = model if isinstance(model, BaseModel) else ONNXModel(model)
+ base_dir = os.path.dirname(model.model_path) if model.model_path is not None else ""
output_dicts = {}

inputs, so = prepare_inputs(model, n_samples, dataloader)
@@ -1028,7 +1032,7 @@ def gptq_quantize(
and weight_config.get(node.name, {}).get("algorithm", "GPTQ") == "GPTQ"
):
weight = numpy_helper.to_array(
- model.get_initializer(model.get_node(node.name).input[1]), os.path.dirname(model.model_path)
+ model.get_initializer(model.get_node(node.name).input[1]), base_dir
).copy()
if len(weight.shape) != 2:
continue
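For context, this is the call pattern the commit repairs: running weight-only quantization on a ModelProto that was loaded in memory, so no model path is ever attached. A sketch mirroring the new test below — the file name and per-node config are illustrative:

import onnx

from neural_compressor.adaptor.ox_utils.weight_only import rtn_quantize

model = onnx.load("model.onnx")  # plain ModelProto; wrapping it in ONNXModel leaves model_path as None
config = {
    node.name: {"bits": 4, "group_size": 32, "scheme": "sym"}
    for node in model.graph.node
    if node.op_type == "MatMul"
}

# Before this fix the call crashed in os.path.dirname(None); with the guard
# it falls back to base_dir="" and quantizes the in-memory weights directly.
q_model = rtn_quantize(model, config)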
34 changes: 34 additions & 0 deletions test/adaptor/onnxrt_adaptor/test_weight_only_adaptor.py
@@ -10,6 +10,7 @@
from transformers import AutoTokenizer

from neural_compressor import PostTrainingQuantConfig, quantization
+ from neural_compressor.adaptor.ox_utils.weight_only import awq_quantize, gptq_quantize, rtn_quantize
from neural_compressor.utils.constant import FP32


@@ -340,6 +341,39 @@ def fake_eval(model, eval_result_lst):
woq_model = self._test_woq_tune_common(partial_fake_eval, "auto", op_type_dict={".*": {"weight": {"bits": 8}}})
self.assertEqual(self._count_woq_matmul(woq_model, bits=8), 31)

+ def test_woq_with_ModelProto_input(self):
+     from neural_compressor.model.onnx_model import ONNXModel
+
+     # Quantize MatMuls whose weight is an initializer; leave the rest fp32.
+     onnx_model = ONNXModel(self.gptj_model)
+     q4_node_config = {}
+     template_config_q4 = {"bits": 4, "group_size": 32, "scheme": "sym"}
+     template_config_fp32 = "fp32"
+     for node in self.gptj_model.graph.node:
+         if node.op_type in ["MatMul"]:
+             if any(onnx_model.get_initializer(i) is not None for i in node.input):
+                 q4_node_config[node.name] = template_config_q4
+             else:
+                 q4_node_config[node.name] = template_config_fp32
+
+     q_model = rtn_quantize(self.gptj_model, q4_node_config)
+     for data, _ in self.gptj_dataloader:
+         q_out = Inference(q_model.model, data)
+         org_out = Inference(self.gptj_model, data)
+         for q, org in zip(q_out, org_out):
+             self.assertTrue((np.abs(q - org) < 0.5).all())
+
+     q_model = gptq_quantize(self.gptj_model, self.gptj_dataloader, q4_node_config)
+     for data, _ in self.gptj_dataloader:
+         q_out = Inference(q_model.model, data)
+         org_out = Inference(self.gptj_model, data)
+         for q, org in zip(q_out, org_out):
+             self.assertTrue((np.abs(q - org) < 0.5).all())
+
+     q_model = awq_quantize(self.gptj_model, self.gptj_dataloader, q4_node_config)
+     for data, _ in self.gptj_dataloader:
+         q_out = Inference(q_model.model, data)
+         org_out = Inference(self.gptj_model, data)
+         for q, org in zip(q_out, org_out):
+             self.assertTrue((np.abs(q - org) < 0.5).all())

if __name__ == "__main__":
unittest.main()
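
The new test relies on an Inference helper defined elsewhere in this test file, outside the diff. A plausible sketch, assuming it is a thin onnxruntime wrapper that serves the proto straight from memory — the same no-file-path scenario the fix covers:

import onnxruntime as ort

def Inference(model, data):
    # Build the session from serialized bytes so no on-disk model is required.
    sess = ort.InferenceSession(model.SerializeToString(), providers=["CPUExecutionProvider"])
    return sess.run(None, data)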
