From 5592acc60562b7fccb308af0eaaba9cad53004a5 Mon Sep 17 00:00:00 2001 From: zehao-intel Date: Tue, 2 Jul 2024 14:18:51 +0800 Subject: [PATCH 1/2] Remove Gelu Fusion for TF Newapi (#1886) Signed-off-by: zehao-intel --- .../utils/graph_rewriter/generic/fuse_gelu.py | 3 +- .../ptq/newapi/test_graph_fuse_gelu_newapi.py | 413 ------------------ 2 files changed, 1 insertion(+), 415 deletions(-) delete mode 100644 test/3x/tensorflow/quantization/ptq/newapi/test_graph_fuse_gelu_newapi.py diff --git a/neural_compressor/tensorflow/quantization/utils/graph_rewriter/generic/fuse_gelu.py b/neural_compressor/tensorflow/quantization/utils/graph_rewriter/generic/fuse_gelu.py index 4c1984138ab..ac963c63ce9 100644 --- a/neural_compressor/tensorflow/quantization/utils/graph_rewriter/generic/fuse_gelu.py +++ b/neural_compressor/tensorflow/quantization/utils/graph_rewriter/generic/fuse_gelu.py @@ -22,7 +22,6 @@ from neural_compressor.tensorflow.quantization.utils.graph_rewriter.graph_base import GraphRewriterBase from neural_compressor.tensorflow.quantization.utils.graph_util import GraphAnalyzer from neural_compressor.tensorflow.quantization.utils.graph_util import GraphRewriterHelper as Helper -from neural_compressor.tensorflow.utils import SPR_BASE_VERSIONS class FuseGeluOptimizer(GraphRewriterBase): # pragma: no cover @@ -30,7 +29,7 @@ class FuseGeluOptimizer(GraphRewriterBase): # pragma: no cover def do_transformation(self): """Execute the fusion from small ops to Gelu.""" - if not (tf.version.VERSION in ("1.15.0-up2", "1.15.0-up3") or tf.version.VERSION in SPR_BASE_VERSIONS): + if tf.version.VERSION not in ("1.15.0-up2", "1.15.0-up3"): return self.model cur_graph = GraphAnalyzer() diff --git a/test/3x/tensorflow/quantization/ptq/newapi/test_graph_fuse_gelu_newapi.py b/test/3x/tensorflow/quantization/ptq/newapi/test_graph_fuse_gelu_newapi.py deleted file mode 100644 index e0194700fc3..00000000000 --- a/test/3x/tensorflow/quantization/ptq/newapi/test_graph_fuse_gelu_newapi.py +++ /dev/null @@ -1,413 +0,0 @@ -# -# -*- coding: utf-8 -*- -# -import unittest - -import tensorflow as tf -from tensorflow.compat.v1 import graph_util - -from neural_compressor.tensorflow.quantization.utils.graph_rewriter.generic.fuse_gelu import FuseGeluOptimizer -from neural_compressor.tensorflow.utils import disable_random - - -class TestGeluFusion(unittest.TestCase): - def gelu(self, input_tensor, mul_value=0.5, addv2_value=1.0, sqrt_value=2.0): - cdf = mul_value * (addv2_value + tf.math.erf(input_tensor / tf.sqrt(sqrt_value))) - return input_tensor * cdf - - def gelu_enable_approximation( - self, - input_tensor, - another_mul_value=0.5, - mul1_value=0.044715, - addv2_value=1.0, - mul2_value=0.7978845608028654, - pow_value=3, - ): - coeff = tf.cast(mul1_value, input_tensor.dtype) - return ( - another_mul_value - * input_tensor - * (addv2_value + tf.tanh(mul2_value * (input_tensor + coeff * tf.pow(input_tensor, pow_value)))) - ) - - def gelu_enable_approximation_varaint( - self, - input_tensor, - another_mul_value=0.5, - mul1_value=0.044715, - addv2_value=1.0, - mul2_value=0.7978845608028654, - pow_value=3, - ): - coeff = tf.cast(mul1_value, input_tensor.dtype) - cdf = another_mul_value * ( - addv2_value + tf.tanh(mul2_value * (input_tensor + coeff * tf.pow(input_tensor, pow_value))) - ) - - return input_tensor * cdf - - def gelu_disable_approximation( - self, - input_tensor, - another_add_value=0.5, - mul1_value=0.044715, - addv2_value=1.0, - mul2_value=0.7978845608028654, - pow_value=3, - ): - coeff = tf.cast(mul1_value, 
input_tensor.dtype) - return (another_add_value + input_tensor) * ( - addv2_value + tf.tanh(mul2_value * (input_tensor + coeff * tf.pow(input_tensor, pow_value))) - ) - - @disable_random() - def test_gelu_disable_approximation_fusion(self): - x = tf.compat.v1.placeholder(tf.float32, [1, 224, 224, 3], name="input") - - conv_weights = tf.compat.v1.get_variable( - "weight", [3, 3, 3, 32], initializer=tf.compat.v1.random_normal_initializer() - ) - conv_bias = tf.compat.v1.get_variable("bias", [32], initializer=tf.compat.v1.random_normal_initializer()) - conv1 = tf.nn.conv2d(x, conv_weights, strides=[1, 1, 1, 1], padding="SAME") - conv_bias = tf.math.add(conv1, conv_bias) - - gelu = self.gelu_disable_approximation(conv_bias) - relu = tf.nn.relu(gelu) - with tf.compat.v1.Session() as sess: - sess.run(tf.compat.v1.global_variables_initializer()) - output_graph_def = graph_util.convert_variables_to_constants( - sess=sess, input_graph_def=sess.graph_def, output_node_names=[relu.name.split(":")[0]] - ) - - output_graph_def = FuseGeluOptimizer(output_graph_def).do_transformation() - - found_gelu = False - for i in output_graph_def.node: - if i.op == "Gelu": - found_gelu = True - break - - self.assertEqual(found_gelu, False) - - @disable_random() - def test_gelu_approximation_fusion(self): - x = tf.compat.v1.placeholder(tf.float32, [1, 224, 224, 3], name="input") - - conv_weights = tf.compat.v1.get_variable( - "weight", [3, 3, 3, 32], initializer=tf.compat.v1.random_normal_initializer() - ) - conv_bias = tf.compat.v1.get_variable("bias", [32], initializer=tf.compat.v1.random_normal_initializer()) - conv1 = tf.nn.conv2d(x, conv_weights, strides=[1, 1, 1, 1], padding="SAME") - conv_bias = tf.math.add(conv1, conv_bias) - - gelu = self.gelu_enable_approximation(conv_bias) - relu = tf.nn.relu(gelu) - with tf.compat.v1.Session() as sess: - sess.run(tf.compat.v1.global_variables_initializer()) - output_graph_def = graph_util.convert_variables_to_constants( - sess=sess, input_graph_def=sess.graph_def, output_node_names=[relu.name.split(":")[0]] - ) - - output_graph_def = FuseGeluOptimizer(output_graph_def).do_transformation() - - found_gelu = False - for i in output_graph_def.node: - if i.op == "Gelu": - found_gelu = True - break - - self.assertEqual(found_gelu, True) - - @disable_random() - def test_gelu_approximation_fusion_varaint(self): - x = tf.compat.v1.placeholder(tf.float32, [1, 224, 224, 3], name="input") - - conv_weights = tf.compat.v1.get_variable( - "weight", [3, 3, 3, 32], initializer=tf.compat.v1.random_normal_initializer() - ) - conv_bias = tf.compat.v1.get_variable("bias", [32], initializer=tf.compat.v1.random_normal_initializer()) - conv1 = tf.nn.conv2d(x, conv_weights, strides=[1, 1, 1, 1], padding="SAME") - conv_bias = tf.math.add(conv1, conv_bias) - - gelu = self.gelu_enable_approximation_varaint(conv_bias) - relu = tf.nn.relu(gelu) - with tf.compat.v1.Session() as sess: - sess.run(tf.compat.v1.global_variables_initializer()) - output_graph_def = graph_util.convert_variables_to_constants( - sess=sess, input_graph_def=sess.graph_def, output_node_names=[relu.name.split(":")[0]] - ) - - output_graph_def = FuseGeluOptimizer(output_graph_def).do_transformation() - - found_gelu = False - for i in output_graph_def.node: - if i.op == "Gelu": - found_gelu = True - break - - self.assertEqual(found_gelu, True) - - @disable_random() - def test_gelu_approximation_fusion_with_invalid_pow_value(self): - x = tf.compat.v1.placeholder(tf.float32, [1, 224, 224, 3], name="input") - - conv_weights = 
tf.compat.v1.get_variable( - "weight", [3, 3, 3, 32], initializer=tf.compat.v1.random_normal_initializer() - ) - conv_bias = tf.compat.v1.get_variable("bias", [32], initializer=tf.compat.v1.random_normal_initializer()) - conv1 = tf.nn.conv2d(x, conv_weights, strides=[1, 1, 1, 1], padding="SAME") - conv_bias = tf.math.add(conv1, conv_bias) - - gelu = self.gelu_enable_approximation(conv_bias, pow_value=1.0) - relu = tf.nn.relu(gelu) - with tf.compat.v1.Session() as sess: - sess.run(tf.compat.v1.global_variables_initializer()) - output_graph_def = graph_util.convert_variables_to_constants( - sess=sess, input_graph_def=sess.graph_def, output_node_names=[relu.name.split(":")[0]] - ) - - output_graph_def = FuseGeluOptimizer(output_graph_def).do_transformation() - - found_gelu = False - for i in output_graph_def.node: - if i.op == "Gelu": - found_gelu = True - break - - self.assertEqual(found_gelu, False) - - @disable_random() - def test_gelu_approximation_fusion_with_invalid_mul2_value(self): - x = tf.compat.v1.placeholder(tf.float32, [1, 224, 224, 3], name="input") - - conv_weights = tf.compat.v1.get_variable( - "weight", [3, 3, 3, 32], initializer=tf.compat.v1.random_normal_initializer() - ) - conv_bias = tf.compat.v1.get_variable("bias", [32], initializer=tf.compat.v1.random_normal_initializer()) - conv1 = tf.nn.conv2d(x, conv_weights, strides=[1, 1, 1, 1], padding="SAME") - conv_bias = tf.math.add(conv1, conv_bias) - - gelu = self.gelu_enable_approximation(conv_bias, mul2_value=1.0) - relu = tf.nn.relu(gelu) - with tf.compat.v1.Session() as sess: - sess.run(tf.compat.v1.global_variables_initializer()) - output_graph_def = graph_util.convert_variables_to_constants( - sess=sess, input_graph_def=sess.graph_def, output_node_names=[relu.name.split(":")[0]] - ) - - output_graph_def = FuseGeluOptimizer(output_graph_def).do_transformation() - - found_gelu = False - for i in output_graph_def.node: - if i.op == "Gelu": - found_gelu = True - break - - self.assertEqual(found_gelu, False) - - @disable_random() - def test_gelu_approximation_fusion_with_invalid_addv2_value(self): - x = tf.compat.v1.placeholder(tf.float32, [1, 224, 224, 3], name="input") - - conv_weights = tf.compat.v1.get_variable( - "weight", [3, 3, 3, 32], initializer=tf.compat.v1.random_normal_initializer() - ) - conv_bias = tf.compat.v1.get_variable("bias", [32], initializer=tf.compat.v1.random_normal_initializer()) - conv1 = tf.nn.conv2d(x, conv_weights, strides=[1, 1, 1, 1], padding="SAME") - conv_bias = tf.math.add(conv1, conv_bias) - - gelu = self.gelu_enable_approximation(conv_bias, addv2_value=12.0) - relu = tf.nn.relu(gelu) - with tf.compat.v1.Session() as sess: - sess.run(tf.compat.v1.global_variables_initializer()) - output_graph_def = graph_util.convert_variables_to_constants( - sess=sess, input_graph_def=sess.graph_def, output_node_names=[relu.name.split(":")[0]] - ) - - output_graph_def = FuseGeluOptimizer(output_graph_def).do_transformation() - - found_gelu = False - for i in output_graph_def.node: - if i.op == "Gelu": - found_gelu = True - break - - self.assertEqual(found_gelu, False) - - @disable_random() - def test_gelu_approximation_fusion_with_invalid_mul1_value(self): - x = tf.compat.v1.placeholder(tf.float32, [1, 224, 224, 3], name="input") - - conv_weights = tf.compat.v1.get_variable( - "weight", [3, 3, 3, 32], initializer=tf.compat.v1.random_normal_initializer() - ) - conv_bias = tf.compat.v1.get_variable("bias", [32], initializer=tf.compat.v1.random_normal_initializer()) - conv1 = tf.nn.conv2d(x, conv_weights, 
strides=[1, 1, 1, 1], padding="SAME") - conv_bias = tf.math.add(conv1, conv_bias) - - gelu = self.gelu_enable_approximation(conv_bias, mul1_value=1.0) - relu = tf.nn.relu(gelu) - - with tf.compat.v1.Session() as sess: - sess.run(tf.compat.v1.global_variables_initializer()) - output_graph_def = graph_util.convert_variables_to_constants( - sess=sess, input_graph_def=sess.graph_def, output_node_names=[relu.name.split(":")[0]] - ) - - output_graph_def = FuseGeluOptimizer(output_graph_def).do_transformation() - - found_gelu = False - for i in output_graph_def.node: - if i.op == "Gelu": - found_gelu = True - break - - self.assertEqual(found_gelu, False) - - @disable_random() - def test_gelu_approximation_fusion_with_invalid_another_mul(self): - x = tf.compat.v1.placeholder(tf.float32, [1, 224, 224, 3], name="input") - - conv_weights = tf.compat.v1.get_variable( - "weight", [3, 3, 3, 32], initializer=tf.compat.v1.random_normal_initializer() - ) - conv_bias = tf.compat.v1.get_variable("bias", [32], initializer=tf.compat.v1.random_normal_initializer()) - conv1 = tf.nn.conv2d(x, conv_weights, strides=[1, 1, 1, 1], padding="SAME") - conv_bias = tf.math.add(conv1, conv_bias) - - gelu = self.gelu_enable_approximation(conv_bias, another_mul_value=1.0) - relu = tf.nn.relu(gelu) - - with tf.compat.v1.Session() as sess: - sess.run(tf.compat.v1.global_variables_initializer()) - output_graph_def = graph_util.convert_variables_to_constants( - sess=sess, input_graph_def=sess.graph_def, output_node_names=[relu.name.split(":")[0]] - ) - - output_graph_def = FuseGeluOptimizer(output_graph_def).do_transformation() - - found_gelu = False - for i in output_graph_def.node: - if i.op == "Gelu": - found_gelu = True - break - - self.assertEqual(found_gelu, False) - - @disable_random() - def test_gelu_fusion_with_invalid_sqrt(self): - x = tf.compat.v1.placeholder(tf.float32, [1, 224, 224, 3], name="input") - - conv_weights = tf.compat.v1.get_variable( - "weight", [3, 3, 3, 32], initializer=tf.compat.v1.random_normal_initializer() - ) - conv_bias = tf.compat.v1.get_variable("bias", [32], initializer=tf.compat.v1.random_normal_initializer()) - conv1 = tf.nn.conv2d(x, conv_weights, strides=[1, 1, 1, 1], padding="SAME") - conv_bias = tf.math.add(conv1, conv_bias) - - gelu = self.gelu(conv_bias, sqrt_value=1.0) - with tf.compat.v1.Session() as sess: - sess.run(tf.compat.v1.global_variables_initializer()) - output_graph_def = graph_util.convert_variables_to_constants( - sess=sess, input_graph_def=sess.graph_def, output_node_names=[gelu.name.split(":")[0]] - ) - - output_graph_def = FuseGeluOptimizer(output_graph_def).do_transformation() - - found_gelu = False - for i in output_graph_def.node: - if i.op == "Gelu": - found_gelu = True - break - - self.assertEqual(found_gelu, False) - - @disable_random() - def test_gelu_fusion_with_invalid_addv2(self): - x = tf.compat.v1.placeholder(tf.float32, [1, 224, 224, 3], name="input") - - conv_weights = tf.compat.v1.get_variable( - "weight", [3, 3, 3, 32], initializer=tf.compat.v1.random_normal_initializer() - ) - conv_bias = tf.compat.v1.get_variable("bias", [32], initializer=tf.compat.v1.random_normal_initializer()) - conv1 = tf.nn.conv2d(x, conv_weights, strides=[1, 1, 1, 1], padding="SAME") - conv_bias = tf.math.add(conv1, conv_bias) - - gelu = self.gelu(conv_bias, addv2_value=10.0) - with tf.compat.v1.Session() as sess: - sess.run(tf.compat.v1.global_variables_initializer()) - output_graph_def = graph_util.convert_variables_to_constants( - sess=sess, input_graph_def=sess.graph_def, 
output_node_names=[gelu.name.split(":")[0]] - ) - - output_graph_def = FuseGeluOptimizer(output_graph_def).do_transformation() - - found_gelu = False - for i in output_graph_def.node: - if i.op == "Gelu": - found_gelu = True - break - - self.assertEqual(found_gelu, False) - - @disable_random() - def test_gelu_fusion_with_invalid_mul(self): - x = tf.compat.v1.placeholder(tf.float32, [1, 224, 224, 3], name="input") - - conv_weights = tf.compat.v1.get_variable( - "weight", [3, 3, 3, 32], initializer=tf.compat.v1.random_normal_initializer() - ) - conv_bias = tf.compat.v1.get_variable("bias", [32], initializer=tf.compat.v1.random_normal_initializer()) - conv1 = tf.nn.conv2d(x, conv_weights, strides=[1, 1, 1, 1], padding="SAME") - conv_bias = tf.math.add(conv1, conv_bias) - - gelu = self.gelu(conv_bias, mul_value=1.0) - with tf.compat.v1.Session() as sess: - sess.run(tf.compat.v1.global_variables_initializer()) - output_graph_def = graph_util.convert_variables_to_constants( - sess=sess, input_graph_def=sess.graph_def, output_node_names=[gelu.name.split(":")[0]] - ) - - output_graph_def = FuseGeluOptimizer(output_graph_def).do_transformation() - - found_gelu = False - for i in output_graph_def.node: - if i.op == "Gelu": - found_gelu = True - break - - self.assertEqual(found_gelu, False) - - @disable_random() - def test_gelu_fusion(self): - x = tf.compat.v1.placeholder(tf.float32, [1, 224, 224, 3], name="input") - - conv_weights = tf.compat.v1.get_variable( - "weight", [3, 3, 3, 32], initializer=tf.compat.v1.random_normal_initializer() - ) - conv_bias = tf.compat.v1.get_variable("bias", [32], initializer=tf.compat.v1.random_normal_initializer()) - conv1 = tf.nn.conv2d(x, conv_weights, strides=[1, 1, 1, 1], padding="SAME") - conv_bias = tf.math.add(conv1, conv_bias) - - gelu = self.gelu(conv_bias) - relu = tf.nn.relu(gelu) - with tf.compat.v1.Session() as sess: - sess.run(tf.compat.v1.global_variables_initializer()) - output_graph_def = graph_util.convert_variables_to_constants( - sess=sess, input_graph_def=sess.graph_def, output_node_names=[relu.name.split(":")[0]] - ) - - output_graph_def = FuseGeluOptimizer(output_graph_def).do_transformation() - - found_gelu = False - for i in output_graph_def.node: - if i.op == "Gelu": - found_gelu = True - break - - self.assertEqual(found_gelu, True) - - -if __name__ == "__main__": - unittest.main() From 63b29126b7c1958939af388d48e56fcceb85db6f Mon Sep 17 00:00:00 2001 From: Yi Liu <106061964+yiliu30@users.noreply.github.com> Date: Tue, 2 Jul 2024 14:46:02 +0800 Subject: [PATCH 2/2] Refine HQQ UTs (#1888) Signed-off-by: yiliu30 --- .../weight_only/hqq/test_hqq_cuda.py | 130 ------------------ .../{hqq/test_hqq_cpu.py => test_hqq.py} | 30 +++- 2 files changed, 23 insertions(+), 137 deletions(-) delete mode 100644 test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py rename test/3x/torch/quantization/weight_only/{hqq/test_hqq_cpu.py => test_hqq.py} (88%) diff --git a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py b/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py deleted file mode 100644 index 777daf0e60b..00000000000 --- a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py +++ /dev/null @@ -1,130 +0,0 @@ -from copy import deepcopy - -import pytest -import torch -from transformers import AutoModelForCausalLM - -from neural_compressor.torch.algorithms.weight_only.hqq.config import HQQModuleConfig, QTensorConfig, hqq_global_option -from neural_compressor.torch.algorithms.weight_only.hqq.core import HQQLinear -from 
neural_compressor.torch.algorithms.weight_only.hqq.utility import see_cuda_memory_usage -from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator - - -def _common_cuda_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False, scale_quant_group_size=128): - # Parse config - weight_qconfig = QTensorConfig( - nbits=nbits, channel_wise=True, group_size=group_size, optimize=True, round_zero=True if nbits == 4 else False - ) - zero_qconfig = None - if quant_zero: - zero_qconfig = QTensorConfig(nbits=8, channel_wise=False, group_size=None, optimize=False) - scale_qconfig = None - if quant_scale: - scale_qconfig = QTensorConfig(nbits=8, channel_wise=True, group_size=scale_quant_group_size, optimize=False) - hqq_quant_config = HQQModuleConfig(weight=weight_qconfig, scale=scale_qconfig, zero=zero_qconfig) - device = torch.cuda.current_device() - - # Create HQQ Linear - bs = 4 - in_features = 64 - out_features = 128 - see_cuda_memory_usage(message="Before create float linear") - float_linear = torch.nn.Linear(in_features=in_features, out_features=out_features) - if hqq_global_option.use_half: - float_linear = float_linear.half() - see_cuda_memory_usage(message="After create float linear") - float_linear.to(device) - float_linear_copy = deepcopy(float_linear) - see_cuda_memory_usage(message="After copy the float linear") - hqq_linear = HQQLinear.from_float(float_linear_copy, quant_config=hqq_quant_config) - see_cuda_memory_usage(message="After create hqq linear") - - # Forward - input = torch.randn(bs, in_features, device=device) - if hqq_global_option.use_half: - input = input.half() - float_output = float_linear(input) - input_for_hqq = deepcopy(input) - hqq_output = hqq_linear(input_for_hqq) - hqq_output_2 = hqq_linear(input_for_hqq) - float_qdq_diff = 0.1 # hard code it first - torch.allclose(float_output, hqq_output, atol=float_qdq_diff) - torch.allclose(hqq_output, hqq_output_2) - del float_linear, hqq_linear - del float_output, hqq_output, hqq_output_2 - see_cuda_memory_usage("At the end of test") - - -@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires a GPU") -class TestHQQCUDA: - @classmethod - def setup_class(cls): - torch.manual_seed(0) - torch.cuda.manual_seed(0) - hqq_global_option.use_half = True - - def test_hqq_quant(self): - from neural_compressor.torch.quantization import convert, get_default_hqq_config, prepare, quantize - - fp32_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") - example_inputs = torch.tensor( - [[10, 20, 30, 40, 50, 60]], dtype=torch.long, device=auto_detect_accelerator().current_device() - ) - # test_default_config - quant_config = get_default_hqq_config() - - # prepare + convert API - model = prepare(deepcopy(fp32_model), quant_config) - model = convert(model) - q_label_1 = model(example_inputs)[0] - - # quantize API - model = quantize(deepcopy(fp32_model), quant_config) - q_label_2 = model(example_inputs)[0] - - # compare the results of calling `convert` + `prepare` and calling `quantize` - assert torch.all( - q_label_1.eq(q_label_2) - ), "The results of calling `convert` + `prepare` and calling `quantize` should be equal." 
- - @pytest.mark.parametrize( - "nbits, group_size, quant_zero, quant_scale, scale_quant_group_size", - [ - (4, 64, True, False, 128), - (4, 64, False, False, 128), - (4, 64, True, True, 128), - (4, 64, False, True, 128), - (8, 64, True, False, 128), - (8, 64, False, False, 128), - (8, 64, True, True, 128), - (8, 64, False, True, 128), - (4, 64, True, False, 64), - (4, 64, False, False, 64), - (4, 64, True, True, 64), - (4, 64, False, True, 64), - ], - ) - def test_hqq_module_cuda( - self, - nbits, - group_size, - quant_zero, - quant_scale, - scale_quant_group_size, - ): - _common_cuda_test( - nbits=nbits, - group_size=group_size, - quant_zero=quant_zero, - quant_scale=quant_scale, - scale_quant_group_size=scale_quant_group_size, - ) - - -# _common_cuda_test( -# nbits=4, -# group_size=64, -# quant_zero=False, -# quant_scale=False, -# scale_quant_group_size=128 -# ) diff --git a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cpu.py b/test/3x/torch/quantization/weight_only/test_hqq.py similarity index 88% rename from test/3x/torch/quantization/weight_only/hqq/test_hqq_cpu.py rename to test/3x/torch/quantization/weight_only/test_hqq.py index 9a0290ffe29..1d68a553859 100644 --- a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cpu.py +++ b/test/3x/torch/quantization/weight_only/test_hqq.py @@ -6,6 +6,7 @@ import transformers from transformers import AutoModelForCausalLM +from neural_compressor.common.utils import logger from neural_compressor.torch.algorithms.weight_only.hqq.config import HQQModuleConfig, QTensorConfig, hqq_global_option from neural_compressor.torch.algorithms.weight_only.hqq.core import HQQLinear from neural_compressor.torch.quantization import HQQConfig, convert, get_default_hqq_config, prepare, quantize @@ -14,7 +15,9 @@ device = accelerator.current_device_name() -def _common_cpu_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False, scale_quant_group_size=128): +def _common_hqq_test( + nbits=4, group_size=64, quant_zero=True, quant_scale=False, scale_quant_group_size=128, device=None +): # Parse config weight_qconfig = QTensorConfig( nbits=nbits, channel_wise=True, group_size=group_size, optimize=True, round_zero=True if nbits == 4 else False @@ -26,7 +29,6 @@ def _common_cpu_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False, if quant_scale: scale_qconfig = QTensorConfig(nbits=8, channel_wise=True, group_size=scale_quant_group_size, optimize=False) hqq_quant_config = HQQModuleConfig(weight=weight_qconfig, scale=scale_qconfig, zero=zero_qconfig) - device = "cpu" # Create HQQ Linear bs = 4 @@ -34,7 +36,7 @@ def _common_cpu_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False, out_features = 128 float_linear = torch.nn.Linear(in_features=in_features, out_features=out_features) if hqq_global_option.use_half: - print(f"hqq_global_option use half: {hqq_global_option.use_half}") + logger.info(f"hqq_global_option use half: {hqq_global_option.use_half}") float_linear = float_linear.half() float_linear.to(device) float_linear_copy = deepcopy(float_linear) @@ -54,7 +56,7 @@ def _common_cpu_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False, del float_output, hqq_output, hqq_output_2 -class TestHQQCPU: +class TestHQQ: @classmethod def setup_class(cls): @@ -137,6 +139,7 @@ def test_quant_lm_head(self, force_use_cpu, force_not_half): id(model.model.decoder.embed_tokens.weight) == lm_head_id ), "The tied lm_head weight is not deep copied, please check!" 
+ @pytest.mark.parametrize("device_name", ["cuda", "cpu"]) @pytest.mark.parametrize( "nbits, group_size, quant_zero, quant_scale, scale_quant_group_size", [ @@ -155,13 +158,26 @@ def test_quant_lm_head(self, force_use_cpu, force_not_half): (4, -1, False, True, 64), ], ) - def test_hqq_module_cpu( - self, force_use_cpu, force_not_half, nbits, group_size, quant_zero, quant_scale, scale_quant_group_size + def test_hqq_module( + self, + nbits, + group_size, + quant_zero, + quant_scale, + scale_quant_group_size, + device_name, ): - _common_cpu_test( + if device_name == "cuda" and not torch.cuda.is_available(): + pytest.skip("Skipping CUDA test because cuda is not available") + if device_name == "cpu": + os.environ["FORCE_DEVICE"] = "cpu" + hqq_global_option.use_half = False + + _common_hqq_test( nbits=nbits, group_size=group_size, quant_zero=quant_zero, quant_scale=quant_scale, scale_quant_group_size=scale_quant_group_size, + device=torch.device(device_name), )
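
Note for reviewers: the second commit folds the former CPU-only and CUDA-only HQQ tests into one `test_hqq_module` parametrized over `device_name`, skipping the CUDA cases when no GPU is present. Below is a minimal standalone sketch of that device-dispatch pattern, not part of the patch itself; `_toy_linear_roundtrip` is a hypothetical stand-in for `_common_hqq_test`, and the skip/parametrize calls mirror the ones added above.

# sketch_device_dispatch.py -- illustrative only, assumes pytest and torch are installed
import pytest
import torch


def _toy_linear_roundtrip(device: torch.device) -> None:
    # Stand-in workload: run a small linear layer on the requested device
    # (the real test quantizes an HQQLinear and compares outputs).
    linear = torch.nn.Linear(16, 8).to(device)
    out = linear(torch.randn(4, 16, device=device))
    assert out.shape == (4, 8)


@pytest.mark.parametrize("device_name", ["cuda", "cpu"])
def test_device_dispatch(device_name):
    # Same guard as the merged test_hqq_module: CUDA cases are skipped,
    # not failed, on machines without a GPU.
    if device_name == "cuda" and not torch.cuda.is_available():
        pytest.skip("Skipping CUDA case because cuda is not available")
    _toy_linear_roundtrip(torch.device(device_name))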