Enhance 3.x torch UT (#1787)
Enhance 3.x torch UT
---------

Signed-off-by: yuwenzho <[email protected]>
yuwenzho authored May 13, 2024
1 parent a4675c7 commit 7c0b700
Showing 8 changed files with 144 additions and 510 deletions.
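The pattern applied across these test files is to fold the separate "with_new_api" tests into the base tests, so each test compares the one-shot `quantize` API against the two-step `prepare` + `convert` API on the same FP32 model. Below is a minimal sketch of that pattern, assembled from the HQQ calls that appear verbatim in the diffs that follow; only the imports are filled in, and the CPU tests additionally pin `hqq_global_option.use_half = False`, which is omitted here.

from copy import deepcopy

import torch
from transformers import AutoModelForCausalLM

from neural_compressor.torch.quantization import convert, get_default_hqq_config, prepare, quantize

fp32_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
example_inputs = torch.tensor([[10, 20, 30, 40, 50, 60]], dtype=torch.long, device="cpu")
quant_config = get_default_hqq_config()

# prepare + convert API
model = prepare(deepcopy(fp32_model), quant_config)
model = convert(model)
q_label_1 = model(example_inputs)[0]

# quantize API
model = quantize(deepcopy(fp32_model), quant_config)
q_label_2 = model(example_inputs)[0]

# both flows are expected to produce identical outputs
assert torch.all(q_label_1.eq(q_label_2))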
130 changes: 9 additions & 121 deletions test/3x/torch/quantization/test_static_quant.py
@@ -47,127 +47,6 @@ def setup_class(self):
def teardown_class(self):
pass

@pytest.mark.skipif(not is_ipex_available(), reason="Requires IPEX")
def test_static_quant_default(self):
fp32_model = copy.deepcopy(self.fp32_model)
quant_config = get_default_static_config()
example_inputs = self.input
q_model = quantize(fp32_model, quant_config=quant_config, run_fn=run_fn, example_inputs=example_inputs)
assert q_model is not None, "Quantization failed!"

@pytest.mark.skipif(not is_ipex_available(), reason="Requires IPEX")
def test_static_quant_fallback(self):
fp32_model = copy.deepcopy(self.fp32_model)
quant_config = get_default_static_config()
example_inputs = self.input
# fallback by op_type
quant_config.set_local(torch.nn.modules.linear.Linear, StaticQuantConfig(w_dtype="fp32", act_dtype="fp32"))
q_model = quantize(fp32_model, quant_config=quant_config, run_fn=run_fn, example_inputs=example_inputs)
assert q_model is not None, "Quantization failed!"

# fallback by op_name
quant_config.set_local("fc1", StaticQuantConfig(w_dtype="fp32", act_dtype="fp32"))
q_model = quantize(fp32_model, quant_config=quant_config, run_fn=run_fn, example_inputs=example_inputs)
assert q_model is not None, "Quantization failed!"

@pytest.mark.skipif(not is_ipex_available(), reason="Requires IPEX")
@pytest.mark.parametrize(
"act_sym, act_algo",
[
(True, "kl"),
(True, "minmax"),
(False, "kl"),
(False, "minmax"),
],
)
def test_static_quant_params(self, act_sym, act_algo):
fp32_model = copy.deepcopy(self.fp32_model)
quant_config = StaticQuantConfig(act_sym=act_sym, act_algo=act_algo)
example_inputs = self.input
q_model = quantize(fp32_model, quant_config=quant_config, run_fn=run_fn, example_inputs=example_inputs)
assert q_model is not None, "Quantization failed!"

@pytest.mark.skipif(not is_ipex_available(), reason="Requires IPEX")
def test_static_quant_accuracy(self):
class M(torch.nn.Module):
def __init__(self):
super().__init__()
self.linear = torch.nn.Linear(2, 2, False)

def forward(self, x):
x = self.linear(x)
x = x + x
return x

model = M()

def run_fn(model):
model(torch.randn(3, 2))

fp32_model = copy.deepcopy(model)
fp32_model.linear.weight = torch.nn.Parameter(torch.tensor([[0.0, 1.0], [1.0, 0.0]]))
example_inputs = torch.zeros(3, 2)
quant_config = StaticQuantConfig(act_sym=True, act_algo="kl")
q_model = quantize(fp32_model, quant_config=quant_config, run_fn=run_fn, example_inputs=example_inputs)
output1 = fp32_model(example_inputs)
output2 = q_model(example_inputs)
# set a big atol to avoid random issue
assert torch.allclose(output1, output2, atol=2e-2), "Accuracy gap atol > 0.02 is unexpected. Please check."

@pytest.mark.skipif(not is_ipex_available(), reason="Requires IPEX")
def test_static_quant_save_load(self):
from intel_extension_for_pytorch.quantization import convert, prepare

example_inputs = torch.zeros(1, 30)
try:
qconfig = ipex.quantization.default_static_qconfig_mapping
except:
from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig

qconfig = QConfig(
activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8),
weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric),
)
user_model = copy.deepcopy(self.fp32_model)
user_model = prepare(user_model.eval(), qconfig, example_inputs=example_inputs, inplace=True)

def run_fn(model):
model(example_inputs)

run_fn(user_model)
with torch.no_grad():
user_model = convert(user_model.eval(), inplace=True).eval()
user_model(example_inputs)
user_model = torch.jit.trace(user_model.eval(), example_inputs, strict=False)
user_model = torch.jit.freeze(user_model.eval())
user_model(example_inputs)
user_model(example_inputs)
ipex_out = user_model(example_inputs)

fp32_model = copy.deepcopy(self.fp32_model)
quant_config = get_default_static_config()
q_model = quantize(fp32_model, quant_config=quant_config, run_fn=run_fn, example_inputs=example_inputs)
assert q_model is not None, "Quantization failed!"
inc_out = q_model(example_inputs)
# set a big atol to avoid random issue
assert torch.allclose(inc_out, ipex_out, atol=2e-02), "Unexpected result. Please double check."
q_model.save("saved_results")

from neural_compressor.torch.quantization import load

# load
loaded_model = load("saved_results")
assert isinstance(loaded_model, torch.jit.ScriptModule)


class TestStaticQuantWithNewAPI:
def setup_class(self):
self.fp32_model = build_simple_torch_model()
self.input = torch.randn(1, 30)

def teardown_class(self):
pass

@pytest.mark.skipif(not is_ipex_available(), reason="Requires IPEX")
def test_static_quant_default(self):
fp32_model = copy.deepcopy(self.fp32_model)
@@ -292,3 +171,12 @@ def run_fn(model):
# load
loaded_model = load("saved_results")
assert isinstance(loaded_model, torch.jit.ScriptModule)

@pytest.mark.skipif(not is_ipex_available(), reason="Requires IPEX")
def test_static_quant_with_quantize_API(self):
# quantize API
fp32_model = copy.deepcopy(self.fp32_model)
quant_config = get_default_static_config()
example_inputs = self.input
q_model = quantize(fp32_model, quant_config=quant_config, run_fn=run_fn, example_inputs=example_inputs)
assert q_model is not None, "Quantization failed!"
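For contrast with the `quantize` call in the test above, here is a rough sketch of the two-step static-quantization flow. The prepare/calibrate/convert ordering follows the usage visible elsewhere in this commit, but the exact `prepare` signature (in particular the `example_inputs` keyword) and the stand-in model are assumptions, not lines from this diff; like the tests, running it requires IPEX.

import torch

from neural_compressor.torch.quantization import convert, get_default_static_config, prepare

# stand-in for the build_simple_torch_model() helper used by these tests
fp32_model = torch.nn.Sequential(torch.nn.Linear(30, 50), torch.nn.ReLU(), torch.nn.Linear(50, 30))
example_inputs = torch.randn(1, 30)
quant_config = get_default_static_config()

def run_fn(model):
    # calibration pass over representative inputs (the test file defines its own run_fn)
    model(example_inputs)

# prepare + convert API: calibrate between the two steps
prepared_model = prepare(fp32_model, quant_config=quant_config, example_inputs=example_inputs)  # example_inputs kwarg assumed
run_fn(prepared_model)
q_model = convert(prepared_model)
assert q_model is not None, "Quantization failed!"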
30 changes: 14 additions & 16 deletions test/3x/torch/quantization/weight_only/hqq/test_hqq_cpu.py
@@ -65,29 +65,27 @@ def force_not_half(self, monkeypatch):
monkeypatch.setattr(hqq_global_option, "use_half", False)

def test_hqq_quant(self, force_use_cpu, force_not_half):
from neural_compressor.torch.quantization import get_default_hqq_config, quantize
from neural_compressor.torch.quantization import convert, get_default_hqq_config, prepare, quantize

hqq_global_option.use_half = False
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
fp32_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
example_inputs = torch.tensor([[10, 20, 30, 40, 50, 60]], dtype=torch.long, device="cpu")
# test_default_config
quant_config = get_default_hqq_config()
model = quantize(model, quant_config)
q_label = model(example_inputs)[0]
print(q_label)

def test_hqq_quant_with_new_api(self, force_use_cpu, force_not_half):
from neural_compressor.torch.quantization import convert, get_default_hqq_config, prepare

hqq_global_option.use_half = False
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
example_inputs = torch.tensor([[10, 20, 30, 40, 50, 60]], dtype=torch.long, device="cpu")
# test_default_config
quant_config = get_default_hqq_config()
model = prepare(model, quant_config)
# prepare + convert API
model = prepare(deepcopy(fp32_model), quant_config)
model = convert(model)
q_label = model(example_inputs)[0]
print(q_label)
q_label_1 = model(example_inputs)[0]

# quantize API
model = quantize(deepcopy(fp32_model), quant_config)
q_label_2 = model(example_inputs)[0]

# compare the results of calling `prepare` + `convert` and calling `quantize`
assert torch.all(
q_label_1.eq(q_label_2)
), "The results of calling `prepare` + `convert` and calling `quantize` should be equal."

@pytest.mark.parametrize(
"nbits, group_size, quant_zero, quant_scale, scale_quant_group_size",
69 changes: 12 additions & 57 deletions test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py
@@ -64,73 +64,28 @@ def setup_class(cls):
hqq_global_option.use_half = True

def test_hqq_quant(self):
from neural_compressor.torch.quantization import get_default_hqq_config, quantize
from neural_compressor.torch.quantization import convert, get_default_hqq_config, prepare, quantize

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
fp32_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
example_inputs = torch.tensor(
[[10, 20, 30, 40, 50, 60]], dtype=torch.long, device=auto_detect_accelerator().current_device()
)
# test_default_config
quant_config = get_default_hqq_config()
model = quantize(model, quant_config)
q_label = model(example_inputs)[0]
print(q_label)

def test_hqq_quant_with_new_api(self):
from neural_compressor.torch.quantization import convert, get_default_hqq_config, prepare

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
example_inputs = torch.tensor(
[[10, 20, 30, 40, 50, 60]], dtype=torch.long, device=auto_detect_accelerator().current_device()
)
# test_default_config
quant_config = get_default_hqq_config()
model = prepare(model, quant_config)
# prepare + convert API
model = prepare(deepcopy(fp32_model), quant_config)
model = convert(model)
q_label = model(example_inputs)[0]
print(q_label)

@pytest.mark.parametrize(
"nbits, group_size, quant_zero, quant_scale, scale_quant_group_size",
[
(4, 64, True, False, 128),
(4, 64, False, False, 128),
(4, 64, True, True, 128),
(4, 64, False, True, 128),
(8, 64, True, False, 128),
(8, 64, False, False, 128),
(8, 64, True, True, 128),
(8, 64, False, True, 128),
(4, 64, True, False, 64),
(4, 64, False, False, 64),
(4, 64, True, True, 64),
(4, 64, False, True, 64),
],
)
def test_hqq_module_cuda(
self,
nbits,
group_size,
quant_zero,
quant_scale,
scale_quant_group_size,
):
_common_cuda_test(
nbits=nbits,
group_size=group_size,
quant_zero=quant_zero,
quant_scale=quant_scale,
scale_quant_group_size=scale_quant_group_size,
)
q_label_1 = model(example_inputs)[0]

# quantize API
model = quantize(deepcopy(fp32_model), quant_config)
q_label_2 = model(example_inputs)[0]

@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires a GPU")
class TestHQQCUDAWithNewAPI:
@classmethod
def setup_class(cls):
torch.manual_seed(0)
torch.cuda.manual_seed(0)
hqq_global_option.use_half = True
# compare the results of calling `prepare` + `convert` and calling `quantize`
assert torch.all(
q_label_1.eq(q_label_2)
), "The results of calling `prepare` + `convert` and calling `quantize` should be equal."

@pytest.mark.parametrize(
"nbits, group_size, quant_zero, quant_scale, scale_quant_group_size",
66 changes: 26 additions & 40 deletions test/3x/torch/quantization/weight_only/test_autoround.py
@@ -49,29 +49,20 @@ def test_autoround(self):
quant_config = AutoRoundConfig(n_samples=20, seqlen=10, iters=10, scale_dtype="fp32")
logger.info(f"Test AutoRound with config {quant_config}")

qdq_model = quantize(
model=gpt_j_model,
quant_config=quant_config,
run_fn=get_autoround_default_run_fn,
run_args=(
tokenizer,
"NeelNanda/pile-10k",
20,
10,
),
)
"""run_args of get_autoround_default_run_fn:
run_fn = get_autoround_default_run_fn
run_args = (
tokenizer,
dataset_name="NeelNanda/pile-10k",
n_samples=512,
seqlen=2048,
seed=42,
bs=8,
dataset_split: str = "train",
dataloader=None,
"""

q_model = qdq_model
"NeelNanda/pile-10k",
20,
10,
)
fp32_model = gpt_j_model

# prepare + convert API
model = prepare(model=fp32_model, quant_config=quant_config)
run_fn(model, *run_args)
q_model = convert(model)

out2 = q_model(inp)
assert torch.allclose(out1[0], out2[0], atol=1e-1)
assert "transformer.h.0.attn.k_proj" in q_model.autoround_config.keys()
@@ -116,33 +107,28 @@ def test_quantizer(self):
assert "scale" in q_model.autoround_config["transformer.h.0.attn.k_proj"].keys()
assert torch.float32 == q_model.autoround_config["transformer.h.0.attn.k_proj"]["scale_dtype"]

def test_prepare_and_convert_api(self):
def test_autoround_with_quantize_API(self):
inp = torch.ones([1, 10], dtype=torch.long)
gpt_j_model = copy.deepcopy(self.gptj)
tokenizer = transformers.AutoTokenizer.from_pretrained(
"hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True
)

out1 = gpt_j_model(inp)

quant_config = get_default_AutoRound_config()
logger.info(f"Test AutoRound with config {quant_config}")

run_fn = get_autoround_default_run_fn
run_args = (
tokenizer,
"NeelNanda/pile-10k",
20,
10,
# quantize API
q_model = quantize(
model=gpt_j_model,
quant_config=quant_config,
run_fn=get_autoround_default_run_fn,
run_args=(
tokenizer,
"NeelNanda/pile-10k",
20,
10,
),
)
fp32_model = gpt_j_model

# quantizer execute
model = prepare(model=fp32_model, quant_config=quant_config)
run_fn(model, *run_args)
q_model = convert(model)

out2 = q_model(inp)
assert torch.allclose(out1[0], out2[0], atol=1e-1)
assert "transformer.h.0.attn.k_proj" in q_model.autoround_config.keys()
assert "scale" in q_model.autoround_config["transformer.h.0.attn.k_proj"].keys()
assert torch.float32 == q_model.autoround_config["transformer.h.0.attn.k_proj"]["scale_dtype"]