From 4cf0620c6e4e4ccc5a26d5a1b72afd6d9d73156d Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 25 Jun 2024 16:12:35 +0800 Subject: [PATCH 01/38] support rtn & gptq(draft) Signed-off-by: Kaihui-intel --- .../torch/algorithms/layer_wise/utils.py | 2 + .../torch/algorithms/weight_only/gptq.py | 130 +++++++++++------- .../torch/algorithms/weight_only/modules.py | 3 +- .../torch/algorithms/weight_only/rtn.py | 40 +++++- .../torch/quantization/algorithm_entry.py | 1 + .../quantization/weight_only/test_gptq.py | 36 +++-- .../quantization/weight_only/test_rtn.py | 8 +- 7 files changed, 153 insertions(+), 67 deletions(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index 464a25cdee0..93c41fa9fc1 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -214,6 +214,8 @@ def _get_path(pretrained_model_name_or_path): path = dowload_hf_model(pretrained_model_name_or_path) return path +get_path = _get_path + def load_value(model, param_name, path): if "lm_head" in param_name and getattr(model.config, "tie_word_embeddings", True): diff --git a/neural_compressor/torch/algorithms/weight_only/gptq.py b/neural_compressor/torch/algorithms/weight_only/gptq.py index 4e2c19a8815..53217e35ee3 100644 --- a/neural_compressor/torch/algorithms/weight_only/gptq.py +++ b/neural_compressor/torch/algorithms/weight_only/gptq.py @@ -224,11 +224,13 @@ def __init__( # device self.device = get_accelerator(kwargs.pop("device", "auto")).current_device_name() - self.model.to(self.device) + if not use_layer_wise: + self.model.to(self.device) self.is_ready = False self.use_layer_wise = use_layer_wise - self.model_path = model_path + if use_layer_wise: + self.prepare_layer_wise(model_path) # dataloader self.use_max_length = use_max_length @@ -237,6 +239,18 @@ def __init__( self.dataloader = [] self.nsamples = nsamples + def prepare_layer_wise(self, model_path): + from neural_compressor.torch.algorithms.layer_wise import LWQ_WORKSPACE, get_path, register_weight_hooks + import os + os.makedirs(LWQ_WORKSPACE, exist_ok=True) + if model_path == "": + model_path = self.model.path + assert model_path, "model_path should not be None." + self.model_path = get_path(model_path) + register_weight_hooks( + self.model, self.model_path, device=self.device, clean_weight=True, saved_path=LWQ_WORKSPACE + ) + def get_full_layer_name(self, sub_layer_name, block_idx): transformer_name = self.gptq_related_blocks["transformers_name"] return ".".join([transformer_name, str(block_idx), sub_layer_name]) @@ -394,7 +408,6 @@ def execute_quantization(self, means=None, stds=None): # Step1: prepare quantization (calibration datasets) logger.info("Begin ====>") - model_path = self.model_path # Step2: run gptq quantization in a transformer block-wise manner. 
gptq_config = {} @@ -430,8 +443,8 @@ def execute_quantization(self, means=None, stds=None): weight_config_this_layer = self.get_layer_config(full_layer_name) if self.use_layer_wise: # pragma: no cover from neural_compressor.torch.algorithms.layer_wise import load_value - - W = load_value(self.model, full_layer_name + ".weight", model_path) + # import pdb; pdb.set_trace() + W = load_value(self.model, full_layer_name + ".weight", self.model_path) else: W = sub_layers[layer_name].weight.data.clone() @@ -467,12 +480,23 @@ def tmp(_, inp, out): weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx)) logger.info(f"Quantizing layer {layer_name}") if self.use_layer_wise: # pragma: no cover - from neural_compressor.torch.algorithms.layer_wise import load_value + from neural_compressor.torch.algorithms.layer_wise import load_value, set_module_tensor_to_device full_layer_name = self.get_full_layer_name(layer_name, block_idx) - W = load_value(self.model, full_layer_name + ".weight", model_path) + for n, p in sub_layers[layer_name].named_parameters(): + param_name = full_layer_name + "." + n + # breakpoint() + if n == "weight": + W = load_value(self.model, full_layer_name + ".weight", self.model_path) + else: + value = load_value(self.model, param_name, self.model_path) + set_module_tensor_to_device(self.model, param_name, self.device, value) + else: W = sub_layers[layer_name].weight.data.clone() + + + accelerator.mark_step() if "hpu" in self.device: W = W.to("cpu") @@ -484,30 +508,8 @@ def tmp(_, inp, out): act_order=weight_config_this_layer["act_order"], static_groups=weight_config_this_layer["static_groups"], ) - if self.use_layer_wise: # pragma: no cover - from neural_compressor.torch.algorithms.layer_wise import ( - LWQ_WORKSPACE, - clean_module_weight, - load_value, - set_module_tensor_to_device, - ) - - sub_layer = sub_layers[layer_name] - full_layer_name = self.get_full_layer_name(layer_name, block_idx) - for n, p in sub_layer.named_parameters(): - param_name = full_layer_name + "." 
+ n - if n == "weight": - set_module_tensor_to_device(self.model, param_name, self.device, Q) - else: - value = load_value(self.model, param_name, model_path) - set_module_tensor_to_device(self.model, param_name, self.device, value) - # sub_layer.weight.data = Q - torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") - clean_module_weight(sub_layer) - del Q - gc.collect() - else: - sub_layers[layer_name].weight.data = Q + + # Step 2.5: export to compressed model gptq_config[self.get_full_layer_name(layer_name, block_idx)] = {"scale": scale} if not weight_config_this_layer["sym"]: gptq_config[self.get_full_layer_name(layer_name, block_idx)]["zero"] = zp @@ -515,24 +517,7 @@ def tmp(_, inp, out): gptq_config[self.get_full_layer_name(layer_name, block_idx)]["perm"] = gptq_for_this_block[ layer_name ].perm - gptq_for_this_block[layer_name].free() - - # Step 2.5: replace output data with quantized weights - outs = [] - batch_num = self.cache_key_arguments.pop("batch_num") - for j in range(batch_num): - cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j) - cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j) - out = transformer_block(*cache_positional_batch, **cache_keyword_batch) - out = self.track_hidden_states(out) - outs.append(out) - self.cache_key_arguments["batch_num"] = batch_num - if self.use_layer_wise: # pragma: no cover - self.gptq_related_blocks["transformers"][block_idx] = transformer_block - else: - self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() - # Step 2.6: export to compressed model - for layer_name in sub_layers: + weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx)) gptq_scale = gptq_config[self.get_full_layer_name(layer_name, block_idx)]["scale"] if not weight_config_this_layer["sym"]: @@ -543,7 +528,6 @@ def tmp(_, inp, out): gptq_perm = gptq_config[self.get_full_layer_name(layer_name, block_idx)]["perm"] else: gptq_perm = None - Q = sub_layers[layer_name].weight.data if weight_config_this_layer["act_order"]: Q.copy_(Q[:, gptq_perm]) if is_transformers_imported() and isinstance(sub_layers[layer_name], transformers.Conv1D): @@ -584,7 +568,52 @@ def tmp(_, inp, out): device=self.device, ) new_module.pack(int_weight, gptq_scale, gptq_zp, sub_layers[layer_name].bias, gptq_perm) + + + if self.use_layer_wise: # pragma: no cover + from neural_compressor.torch.algorithms.layer_wise import ( + LWQ_WORKSPACE, + clean_module_weight, + load_value, + set_module_tensor_to_device, + ) + + # sub_layer = sub_layers[layer_name] + # full_layer_name = self.get_full_layer_name(layer_name, block_idx) + # for n, p in sub_layer.named_parameters(): + # param_name = full_layer_name + "." 
+ n + # # breakpoint() + # if n == "weight": + # set_module_tensor_to_device(self.model, param_name, self.device, Q) + # else: + # value = load_value(self.model, param_name, model_path) + # set_module_tensor_to_device(self.model, param_name, self.device, value) + # sub_layer.weight.data = Q + # torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") + torch.save(new_module.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") + clean_module_weight(new_module) + del Q + gc.collect() set_module(transformer_block, layer_name, new_module) + + gptq_for_this_block[layer_name].free() + + # Step 2.6: replace output data with quantized weights + outs = [] + batch_num = self.cache_key_arguments.pop("batch_num") + for j in range(batch_num): + cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j) + cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j) + out = transformer_block(*cache_positional_batch, **cache_keyword_batch) + out = self.track_hidden_states(out) + outs.append(out) + self.cache_key_arguments["batch_num"] = batch_num + if self.use_layer_wise: # pragma: no cover + self.gptq_related_blocks["transformers"][block_idx] = transformer_block + else: + self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() + + del gptq_for_this_block torch.cuda.empty_cache() # iteratively replace the input with output, thus layerwise quantization can continue. @@ -999,6 +1028,7 @@ def prepare( def convert(self, model, *args, **kwargs): self.gptq_quantizer.model = model self.gptq_quantizer.remove_prepare_for_calibration() + q_model, gptq_config = self.gptq_quantizer.execute_quantization() q_model.gptq_config = gptq_config logger.info("GPTQ quantizing done.") diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py index 18cf6e46e55..30c40cfa9c3 100644 --- a/neural_compressor/torch/algorithms/weight_only/modules.py +++ b/neural_compressor/torch/algorithms/weight_only/modules.py @@ -175,7 +175,8 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None): self.scales = self.scales.T.contiguous() self.qweight = self.qweight.T.contiguous() self.qzeros = self.qzeros.T.contiguous() - int_weight = int_weight.to(self.device) + if int_weight.device.type != "meta": + int_weight = int_weight.to(self.device) if self.use_optimum_format and zp is None: # to avoid overflow int_weight = int_weight.type(torch.int32) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index fc083191ffe..20040438c00 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -22,6 +22,7 @@ from collections import OrderedDict import torch +import gc from neural_compressor.torch.algorithms import Quantizer from neural_compressor.torch.utils import get_accelerator, is_transformers_imported, logger, set_module @@ -89,10 +90,6 @@ def convert( weight_config = self.quant_config device = get_accelerator(kwargs.pop("device", "auto")).current_device_name() - # Put model on device explicitly - # TODO: refine it later, Put module on device one by one instead of the whole model - model.to(device) - assert isinstance(model, torch.nn.Module), "only support torch module" if is_transformers_imported(): supported_layers = (torch.nn.Linear, transformers.Conv1D) @@ -130,6 +127,7 @@ def convert( use_full_range = 
weight_config[name]["use_full_range"] use_mse_search = weight_config[name]["use_mse_search"] use_layer_wise = weight_config[name]["use_layer_wise"] + model_path = weight_config[name]["model_path"] use_optimum_format = kwargs.get("use_optimum_format", True) # double quant config double_quant_config = { @@ -154,6 +152,24 @@ def convert( continue logger.debug(f"RTN quantized module:{name, m}") logger.debug(log_msg) + + if use_layer_wise: + from neural_compressor.common.utils import DEFAULT_WORKSPACE + from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module, load_value + import os + lwq_workspace = os.path.join(DEFAULT_WORKSPACE, "lwq_tmpdir") + os.makedirs(lwq_workspace, exist_ok=True) + model_path = get_path(model_path) + + # load weight + # breakpoint() + load_module(model, name, model_path, device=device) + # load_value(model, name + ".weight", model_path) + else: + # Put model on device explicitly + # TODO: refine it later, Put module on device one by one instead of the whole model + model.to(device) + # for only group_dim is 0 or only `transformers.Conv1D`, we need transpose weight. if is_transformers_imported(): transpose = (group_dim == 0) ^ (isinstance(m, transformers.Conv1D)) @@ -202,8 +218,24 @@ def convert( device=device, ) new_module.pack(int_weight, scale, zp, m.bias) + + # import pdb; pdb.set_trace() + if use_layer_wise: + # save and clean weight + from neural_compressor.torch.algorithms.layer_wise.utils import clean_module_weight + + torch.save(new_module.state_dict(), os.path.join(lwq_workspace, f"{name}.pt")) + clean_module_weight(new_module) + del m + gc.collect() if name == "": return new_module else: set_module(model, name, new_module) + + if use_layer_wise: + # register hooks + from neural_compressor.torch.algorithms.layer_wise.utils import register_weight_hooks + + register_weight_hooks(model, model_path, device=device, clean_weight=True) return model diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 733e4409b91..1850829104f 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -84,6 +84,7 @@ def rtn_entry( "use_full_range": quant_config.use_full_range, "use_mse_search": quant_config.use_mse_search, "use_layer_wise": quant_config.use_layer_wise, + "model_path": quant_config.model_path, "use_double_quant": quant_config.use_double_quant, "double_quant_dtype": quant_config.double_quant_dtype, "double_quant_bits": quant_config.double_quant_bits, diff --git a/test/3x/torch/quantization/weight_only/test_gptq.py b/test/3x/torch/quantization/weight_only/test_gptq.py index be408af2564..3f258204c75 100644 --- a/test/3x/torch/quantization/weight_only/test_gptq.py +++ b/test/3x/torch/quantization/weight_only/test_gptq.py @@ -28,8 +28,10 @@ def run_fn(model): # GPTQ uses ValueError to reduce computation when collecting input data of the first block # It's special for UTs, no need to add this wrapper in examples. 
with pytest.raises(ValueError): - model(torch.tensor([[10, 20, 30]], dtype=torch.long).to(device)) - model(torch.tensor([[40, 50, 60]], dtype=torch.long).to(device)) + # model(torch.tensor([[10, 20, 30]], dtype=torch.long).to(device)) + # model(torch.tensor([[40, 50, 60]], dtype=torch.long).to(device)) + model(torch.tensor([[10, 20, 30]], dtype=torch.long)) + model(torch.tensor([[40, 50, 60]], dtype=torch.long)) class TestGPTQQuant: @@ -170,14 +172,28 @@ def test_act_order(self): # compare atol, this case is an ideal case. assert atol_false > atol_true, "act_order=True doesn't help accuracy, maybe is reasonable, please double check." - # def test_layer_wise(self): - # model = copy.deepcopy(self.tiny_gptj) - # quant_config = GPTQConfig( - # use_layer_wise=True, - # ) - # model = quantize(model, quant_config, run_fn=run_fn) - # TODO: (Xin) not implemented - + def test_layer_wise(self): + # model = copy.deepcopy(self.tiny_gptj) + model = copy.deepcopy(self.tiny_gptj) + quant_config = GPTQConfig() + model = prepare(model, quant_config) + run_fn(model) + model = convert(model) + q_label = model(self.example_inputs)[0] + + from neural_compressor.torch.algorithms.layer_wise import load_empty_model + model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM", torchscript=True) + + + quant_config = GPTQConfig( + use_layer_wise=True, + model_path="hf-internal-testing/tiny-random-GPTJForCausalLM" + ) + model = quantize(model, quant_config, run_fn=run_fn) + out = model(self.example_inputs)[0] + atol_true = (out - q_label).amax() + print(out, atol_true) + @pytest.mark.parametrize("dtype", ["nf4", "int4"]) @pytest.mark.parametrize("double_quant_bits", [6]) @pytest.mark.parametrize("double_quant_group_size", [8, 256]) diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index f82185cc82e..889aa902b87 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -139,13 +139,17 @@ def test_mse_search(self): assert torch.allclose(atol_false, atol_true, atol=0.012), "atol is very close, double checked the logic." def test_layer_wise(self): - model = copy.deepcopy(self.tiny_gptj) + # model = copy.deepcopy(self.tiny_gptj) + from neural_compressor.torch.algorithms.layer_wise import load_empty_model + model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") quant_config = RTNConfig( use_layer_wise=True, + model_path="hf-internal-testing/tiny-random-GPTJForCausalLM", ) model = prepare(model, quant_config) model = convert(model) - # TODO: (Xin) not implemented + out = model(self.example_inputs)[0] + assert torch.equal(out, self.q_label), "use_layer_wise=True output should be same. Please double check." 
@pytest.mark.parametrize( "dtype", From a1d9e1045bb7485ffc82dd6f3eb2de928bec02a2 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 25 Jun 2024 16:53:02 +0800 Subject: [PATCH 02/38] clean code Signed-off-by: Kaihui-intel --- .../torch/algorithms/weight_only/gptq.py | 5 ----- .../torch/algorithms/weight_only/rtn.py | 18 ++++++++---------- .../torch/quantization/algorithm_entry.py | 3 +-- .../quantization/weight_only/test_gptq.py | 3 +-- 4 files changed, 10 insertions(+), 19 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/gptq.py b/neural_compressor/torch/algorithms/weight_only/gptq.py index 53217e35ee3..08745ddceaa 100644 --- a/neural_compressor/torch/algorithms/weight_only/gptq.py +++ b/neural_compressor/torch/algorithms/weight_only/gptq.py @@ -443,7 +443,6 @@ def execute_quantization(self, means=None, stds=None): weight_config_this_layer = self.get_layer_config(full_layer_name) if self.use_layer_wise: # pragma: no cover from neural_compressor.torch.algorithms.layer_wise import load_value - # import pdb; pdb.set_trace() W = load_value(self.model, full_layer_name + ".weight", self.model_path) else: W = sub_layers[layer_name].weight.data.clone() @@ -485,7 +484,6 @@ def tmp(_, inp, out): full_layer_name = self.get_full_layer_name(layer_name, block_idx) for n, p in sub_layers[layer_name].named_parameters(): param_name = full_layer_name + "." + n - # breakpoint() if n == "weight": W = load_value(self.model, full_layer_name + ".weight", self.model_path) else: @@ -495,8 +493,6 @@ def tmp(_, inp, out): else: W = sub_layers[layer_name].weight.data.clone() - - accelerator.mark_step() if "hpu" in self.device: W = W.to("cpu") @@ -568,7 +564,6 @@ def tmp(_, inp, out): device=self.device, ) new_module.pack(int_weight, gptq_scale, gptq_zp, sub_layers[layer_name].bias, gptq_perm) - if self.use_layer_wise: # pragma: no cover from neural_compressor.torch.algorithms.layer_wise import ( diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 20040438c00..fb823abea82 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -65,6 +65,7 @@ def convert( quantile=1.0, use_full_range=False, use_mse_search=False, + use_layer_wise=False, *args, **kwargs, ): @@ -90,6 +91,11 @@ def convert( weight_config = self.quant_config device = get_accelerator(kwargs.pop("device", "auto")).current_device_name() + # Put model on device explicitly + # TODO: refine it later, Put module on device one by one instead of the whole model + if not use_layer_wise: + model.to(device) + assert isinstance(model, torch.nn.Module), "only support torch module" if is_transformers_imported(): supported_layers = (torch.nn.Linear, transformers.Conv1D) @@ -126,7 +132,6 @@ def convert( group_dim = weight_config[name]["group_dim"] use_full_range = weight_config[name]["use_full_range"] use_mse_search = weight_config[name]["use_mse_search"] - use_layer_wise = weight_config[name]["use_layer_wise"] model_path = weight_config[name]["model_path"] use_optimum_format = kwargs.get("use_optimum_format", True) # double quant config @@ -162,14 +167,8 @@ def convert( model_path = get_path(model_path) # load weight - # breakpoint() load_module(model, name, model_path, device=device) - # load_value(model, name + ".weight", model_path) - else: - # Put model on device explicitly - # TODO: refine it later, Put module on device one by one instead of the whole model - model.to(device) - + # for only 
group_dim is 0 or only `transformers.Conv1D`, we need transpose weight. if is_transformers_imported(): transpose = (group_dim == 0) ^ (isinstance(m, transformers.Conv1D)) @@ -218,8 +217,7 @@ def convert( device=device, ) new_module.pack(int_weight, scale, zp, m.bias) - - # import pdb; pdb.set_trace() + if use_layer_wise: # save and clean weight from neural_compressor.torch.algorithms.layer_wise.utils import clean_module_weight diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 1850829104f..01b496ee9a3 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -83,7 +83,6 @@ def rtn_entry( "group_dim": quant_config.group_dim, "use_full_range": quant_config.use_full_range, "use_mse_search": quant_config.use_mse_search, - "use_layer_wise": quant_config.use_layer_wise, "model_path": quant_config.model_path, "use_double_quant": quant_config.use_double_quant, "double_quant_dtype": quant_config.double_quant_dtype, @@ -93,7 +92,7 @@ def rtn_entry( } quantizer = get_quantizer(model, quantizer_cls=RTNQuantizer, quant_config=weight_config) - model = quantizer.execute(model, mode=mode) + model = quantizer.execute(model, mode=mode, use_layer_wise=quant_config.use_layer_wise) model.qconfig = configs_mapping model.save = MethodType(save, model) postprocess_model(model, mode, quantizer) diff --git a/test/3x/torch/quantization/weight_only/test_gptq.py b/test/3x/torch/quantization/weight_only/test_gptq.py index 3f258204c75..447b0c03343 100644 --- a/test/3x/torch/quantization/weight_only/test_gptq.py +++ b/test/3x/torch/quantization/weight_only/test_gptq.py @@ -191,8 +191,7 @@ def test_layer_wise(self): ) model = quantize(model, quant_config, run_fn=run_fn) out = model(self.example_inputs)[0] - atol_true = (out - q_label).amax() - print(out, atol_true) + assert torch.equal(out, q_label), "use_layer_wise=True output should be same. Please double check." @pytest.mark.parametrize("dtype", ["nf4", "int4"]) @pytest.mark.parametrize("double_quant_bits", [6]) From b4e93f3625d240ff93e7e143c5739d2ed9c88d08 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 25 Jun 2024 16:58:17 +0800 Subject: [PATCH 03/38] clean gptq Signed-off-by: Kaihui-intel --- .../torch/algorithms/weight_only/gptq.py | 12 ------------ test/3x/torch/quantization/weight_only/test_gptq.py | 13 ++++++------- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/gptq.py b/neural_compressor/torch/algorithms/weight_only/gptq.py index 08745ddceaa..4b914e506b7 100644 --- a/neural_compressor/torch/algorithms/weight_only/gptq.py +++ b/neural_compressor/torch/algorithms/weight_only/gptq.py @@ -573,18 +573,6 @@ def tmp(_, inp, out): set_module_tensor_to_device, ) - # sub_layer = sub_layers[layer_name] - # full_layer_name = self.get_full_layer_name(layer_name, block_idx) - # for n, p in sub_layer.named_parameters(): - # param_name = full_layer_name + "." 
+ n - # # breakpoint() - # if n == "weight": - # set_module_tensor_to_device(self.model, param_name, self.device, Q) - # else: - # value = load_value(self.model, param_name, model_path) - # set_module_tensor_to_device(self.model, param_name, self.device, value) - # sub_layer.weight.data = Q - # torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") torch.save(new_module.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") clean_module_weight(new_module) del Q diff --git a/test/3x/torch/quantization/weight_only/test_gptq.py b/test/3x/torch/quantization/weight_only/test_gptq.py index 447b0c03343..432a5446bc9 100644 --- a/test/3x/torch/quantization/weight_only/test_gptq.py +++ b/test/3x/torch/quantization/weight_only/test_gptq.py @@ -28,10 +28,8 @@ def run_fn(model): # GPTQ uses ValueError to reduce computation when collecting input data of the first block # It's special for UTs, no need to add this wrapper in examples. with pytest.raises(ValueError): - # model(torch.tensor([[10, 20, 30]], dtype=torch.long).to(device)) - # model(torch.tensor([[40, 50, 60]], dtype=torch.long).to(device)) - model(torch.tensor([[10, 20, 30]], dtype=torch.long)) - model(torch.tensor([[40, 50, 60]], dtype=torch.long)) + model(torch.tensor([[10, 20, 30]], dtype=torch.long).to(device)) + model(torch.tensor([[40, 50, 60]], dtype=torch.long).to(device)) class TestGPTQQuant: @@ -182,14 +180,15 @@ def test_layer_wise(self): q_label = model(self.example_inputs)[0] from neural_compressor.torch.algorithms.layer_wise import load_empty_model - model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM", torchscript=True) - + model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") quant_config = GPTQConfig( use_layer_wise=True, model_path="hf-internal-testing/tiny-random-GPTJForCausalLM" ) - model = quantize(model, quant_config, run_fn=run_fn) + model = prepare(model, quant_config) + run_fn(model) + model = convert(model) out = model(self.example_inputs)[0] assert torch.equal(out, q_label), "use_layer_wise=True output should be same. Please double check." 
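
Note on usage (not part of the patch): the layer-wise tests above already show the intended flow, and the following is only a minimal sketch that reuses the APIs and the tiny test checkpoint referenced in the UTs (`load_empty_model`, `RTNConfig`/`GPTQConfig` with `use_layer_wise=True` and `model_path`, `prepare`/`convert`); it is not an additional supported entry point.

    import torch
    from neural_compressor.torch.algorithms.layer_wise import load_empty_model
    from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert

    # Shell model: weights stay on the "meta" device; real tensors are streamed
    # from model_path one module at a time while quantizing.
    model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"
    model = load_empty_model(model_name)

    # Layer-wise RTN: each supported layer is loaded from disk, quantized,
    # packed into a WeightOnlyLinear, saved to the LWQ workspace, then offloaded.
    quant_config = RTNConfig(use_layer_wise=True, model_path=model_name)
    model = prepare(model, quant_config)
    model = convert(model)

    # Layer-wise GPTQ follows the same pattern but needs a calibration run_fn
    # between prepare() and convert(), as in test_gptq.py::test_layer_wise:
    #   model = prepare(model, GPTQConfig(use_layer_wise=True, model_path=model_name))
    #   run_fn(model)          # feeds calibration batches
    #   model = convert(model)

    out = model(torch.tensor([[10, 20, 30]], dtype=torch.long))[0]
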
From a3a061e227285063d64d366136ffa1a71080f86b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 25 Jun 2024 09:03:20 +0000 Subject: [PATCH 04/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../torch/algorithms/layer_wise/utils.py | 1 + .../torch/algorithms/weight_only/gptq.py | 20 ++++++++++--------- .../torch/algorithms/weight_only/rtn.py | 14 +++++++------ .../quantization/weight_only/test_gptq.py | 12 +++++------ .../quantization/weight_only/test_rtn.py | 3 ++- 5 files changed, 27 insertions(+), 23 deletions(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index 93c41fa9fc1..f02c0d2de3a 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -214,6 +214,7 @@ def _get_path(pretrained_model_name_or_path): path = dowload_hf_model(pretrained_model_name_or_path) return path + get_path = _get_path diff --git a/neural_compressor/torch/algorithms/weight_only/gptq.py b/neural_compressor/torch/algorithms/weight_only/gptq.py index 4b914e506b7..30b4d07be6a 100644 --- a/neural_compressor/torch/algorithms/weight_only/gptq.py +++ b/neural_compressor/torch/algorithms/weight_only/gptq.py @@ -240,8 +240,10 @@ def __init__( self.nsamples = nsamples def prepare_layer_wise(self, model_path): - from neural_compressor.torch.algorithms.layer_wise import LWQ_WORKSPACE, get_path, register_weight_hooks import os + + from neural_compressor.torch.algorithms.layer_wise import LWQ_WORKSPACE, get_path, register_weight_hooks + os.makedirs(LWQ_WORKSPACE, exist_ok=True) if model_path == "": model_path = self.model.path @@ -250,7 +252,7 @@ def prepare_layer_wise(self, model_path): register_weight_hooks( self.model, self.model_path, device=self.device, clean_weight=True, saved_path=LWQ_WORKSPACE ) - + def get_full_layer_name(self, sub_layer_name, block_idx): transformer_name = self.gptq_related_blocks["transformers_name"] return ".".join([transformer_name, str(block_idx), sub_layer_name]) @@ -443,6 +445,7 @@ def execute_quantization(self, means=None, stds=None): weight_config_this_layer = self.get_layer_config(full_layer_name) if self.use_layer_wise: # pragma: no cover from neural_compressor.torch.algorithms.layer_wise import load_value + W = load_value(self.model, full_layer_name + ".weight", self.model_path) else: W = sub_layers[layer_name].weight.data.clone() @@ -489,10 +492,10 @@ def tmp(_, inp, out): else: value = load_value(self.model, param_name, self.model_path) set_module_tensor_to_device(self.model, param_name, self.device, value) - + else: W = sub_layers[layer_name].weight.data.clone() - + accelerator.mark_step() if "hpu" in self.device: W = W.to("cpu") @@ -504,7 +507,7 @@ def tmp(_, inp, out): act_order=weight_config_this_layer["act_order"], static_groups=weight_config_this_layer["static_groups"], ) - + # Step 2.5: export to compressed model gptq_config[self.get_full_layer_name(layer_name, block_idx)] = {"scale": scale} if not weight_config_this_layer["sym"]: @@ -513,7 +516,7 @@ def tmp(_, inp, out): gptq_config[self.get_full_layer_name(layer_name, block_idx)]["perm"] = gptq_for_this_block[ layer_name ].perm - + weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx)) gptq_scale = gptq_config[self.get_full_layer_name(layer_name, block_idx)]["scale"] if not weight_config_this_layer["sym"]: @@ 
-564,7 +567,7 @@ def tmp(_, inp, out): device=self.device, ) new_module.pack(int_weight, gptq_scale, gptq_zp, sub_layers[layer_name].bias, gptq_perm) - + if self.use_layer_wise: # pragma: no cover from neural_compressor.torch.algorithms.layer_wise import ( LWQ_WORKSPACE, @@ -595,8 +598,7 @@ def tmp(_, inp, out): self.gptq_related_blocks["transformers"][block_idx] = transformer_block else: self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() - - + del gptq_for_this_block torch.cuda.empty_cache() # iteratively replace the input with output, thus layerwise quantization can continue. diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index fb823abea82..dfe9d18522f 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -19,10 +19,10 @@ # limitations under the License. +import gc from collections import OrderedDict import torch -import gc from neural_compressor.torch.algorithms import Quantizer from neural_compressor.torch.utils import get_accelerator, is_transformers_imported, logger, set_module @@ -157,18 +157,20 @@ def convert( continue logger.debug(f"RTN quantized module:{name, m}") logger.debug(log_msg) - + if use_layer_wise: + import os + from neural_compressor.common.utils import DEFAULT_WORKSPACE from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module, load_value - import os + lwq_workspace = os.path.join(DEFAULT_WORKSPACE, "lwq_tmpdir") os.makedirs(lwq_workspace, exist_ok=True) model_path = get_path(model_path) - + # load weight load_module(model, name, model_path, device=device) - + # for only group_dim is 0 or only `transformers.Conv1D`, we need transpose weight. if is_transformers_imported(): transpose = (group_dim == 0) ^ (isinstance(m, transformers.Conv1D)) @@ -230,7 +232,7 @@ def convert( return new_module else: set_module(model, name, new_module) - + if use_layer_wise: # register hooks from neural_compressor.torch.algorithms.layer_wise.utils import register_weight_hooks diff --git a/test/3x/torch/quantization/weight_only/test_gptq.py b/test/3x/torch/quantization/weight_only/test_gptq.py index 432a5446bc9..13d25eff188 100644 --- a/test/3x/torch/quantization/weight_only/test_gptq.py +++ b/test/3x/torch/quantization/weight_only/test_gptq.py @@ -178,20 +178,18 @@ def test_layer_wise(self): run_fn(model) model = convert(model) q_label = model(self.example_inputs)[0] - + from neural_compressor.torch.algorithms.layer_wise import load_empty_model + model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") - - quant_config = GPTQConfig( - use_layer_wise=True, - model_path="hf-internal-testing/tiny-random-GPTJForCausalLM" - ) + + quant_config = GPTQConfig(use_layer_wise=True, model_path="hf-internal-testing/tiny-random-GPTJForCausalLM") model = prepare(model, quant_config) run_fn(model) model = convert(model) out = model(self.example_inputs)[0] assert torch.equal(out, q_label), "use_layer_wise=True output should be same. Please double check." 
- + @pytest.mark.parametrize("dtype", ["nf4", "int4"]) @pytest.mark.parametrize("double_quant_bits", [6]) @pytest.mark.parametrize("double_quant_group_size", [8, 256]) diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index 889aa902b87..6a8ac4bab96 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -141,6 +141,7 @@ def test_mse_search(self): def test_layer_wise(self): # model = copy.deepcopy(self.tiny_gptj) from neural_compressor.torch.algorithms.layer_wise import load_empty_model + model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") quant_config = RTNConfig( use_layer_wise=True, @@ -149,7 +150,7 @@ def test_layer_wise(self): model = prepare(model, quant_config) model = convert(model) out = model(self.example_inputs)[0] - assert torch.equal(out, self.q_label), "use_layer_wise=True output should be same. Please double check." + assert torch.equal(out, self.q_label), "use_layer_wise=True output should be same. Please double check." @pytest.mark.parametrize( "dtype", From 02ee1f8144820a3b6de9d8fcdd2d148dfa60ec9c Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 25 Jun 2024 17:05:39 +0800 Subject: [PATCH 05/38] del unused line Signed-off-by: Kaihui-intel --- test/3x/torch/quantization/weight_only/test_rtn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index 6a8ac4bab96..94f7c2954b1 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -139,7 +139,6 @@ def test_mse_search(self): assert torch.allclose(atol_false, atol_true, atol=0.012), "atol is very close, double checked the logic." 
def test_layer_wise(self): - # model = copy.deepcopy(self.tiny_gptj) from neural_compressor.torch.algorithms.layer_wise import load_empty_model model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") From 060ea50169712d9316698b78cb37bd3cb86777b6 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Wed, 26 Jun 2024 09:34:24 +0800 Subject: [PATCH 06/38] fix load import Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/layer_wise/load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/load.py b/neural_compressor/torch/algorithms/layer_wise/load.py index 09700044a8f..a883bfe3848 100644 --- a/neural_compressor/torch/algorithms/layer_wise/load.py +++ b/neural_compressor/torch/algorithms/layer_wise/load.py @@ -32,7 +32,7 @@ _open_zipfile_reader, ) -from neural_compressor.adaptor.torch_utils.layer_wise_quant import modified_pickle as pickle +from neural_compressor.torch.algorithms.layer_wise import modified_pickle as pickle from .utils import torch From 1a60731343c76cc2d9d4eddb3548286f96eb3944 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Wed, 26 Jun 2024 10:04:58 +0800 Subject: [PATCH 07/38] fix rtn model_path Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/weight_only/rtn.py | 5 ++++- .../torch/quantization/algorithm_entry.py | 10 +++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index dfe9d18522f..9738bc13846 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -66,6 +66,7 @@ def convert( use_full_range=False, use_mse_search=False, use_layer_wise=False, + model_path="", *args, **kwargs, ): @@ -132,7 +133,6 @@ def convert( group_dim = weight_config[name]["group_dim"] use_full_range = weight_config[name]["use_full_range"] use_mse_search = weight_config[name]["use_mse_search"] - model_path = weight_config[name]["model_path"] use_optimum_format = kwargs.get("use_optimum_format", True) # double quant config double_quant_config = { @@ -166,6 +166,9 @@ def convert( lwq_workspace = os.path.join(DEFAULT_WORKSPACE, "lwq_tmpdir") os.makedirs(lwq_workspace, exist_ok=True) + if model_path == "": + model_path = self.model.path + assert model_path, "model_path should not be None." 
model_path = get_path(model_path) # load weight diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 01b496ee9a3..678dc9a0a13 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -83,16 +83,20 @@ def rtn_entry( "group_dim": quant_config.group_dim, "use_full_range": quant_config.use_full_range, "use_mse_search": quant_config.use_mse_search, - "model_path": quant_config.model_path, "use_double_quant": quant_config.use_double_quant, "double_quant_dtype": quant_config.double_quant_dtype, "double_quant_bits": quant_config.double_quant_bits, "double_quant_scheme": "sym" if quant_config.double_quant_use_sym else "asym", "double_quant_group_size": quant_config.double_quant_group_size, } - + kwargs.update( + { + "use_layer_wise": quant_config.use_layer_wise, + "model_path": quant_config.model_path, + } + ) quantizer = get_quantizer(model, quantizer_cls=RTNQuantizer, quant_config=weight_config) - model = quantizer.execute(model, mode=mode, use_layer_wise=quant_config.use_layer_wise) + model = quantizer.execute(model, mode=mode, *args, **kwargs) model.qconfig = configs_mapping model.save = MethodType(save, model) postprocess_model(model, mode, quantizer) From 04e1923d7436b73bd61666e3a6696a0eecb42c05 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Jun 2024 02:07:34 +0000 Subject: [PATCH 08/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/quantization/algorithm_entry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 678dc9a0a13..07898ed1dd3 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -96,7 +96,7 @@ def rtn_entry( } ) quantizer = get_quantizer(model, quantizer_cls=RTNQuantizer, quant_config=weight_config) - model = quantizer.execute(model, mode=mode, *args, **kwargs) + model = quantizer.execute(model, mode=mode, *args, **kwargs) model.qconfig = configs_mapping model.save = MethodType(save, model) postprocess_model(model, mode, quantizer) From 8f27d4781e277cde11e4ffa749e63bba03ede567 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Wed, 26 Jun 2024 10:56:42 +0800 Subject: [PATCH 09/38] update rtn model Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/weight_only/rtn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 9738bc13846..4dcf046e05f 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -167,7 +167,7 @@ def convert( lwq_workspace = os.path.join(DEFAULT_WORKSPACE, "lwq_tmpdir") os.makedirs(lwq_workspace, exist_ok=True) if model_path == "": - model_path = self.model.path + model_path = model.path assert model_path, "model_path should not be None." 
model_path = get_path(model_path) From 5a3f0906c6ff03d33cd6182014400f1bad3a1014 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Wed, 26 Jun 2024 15:53:01 +0800 Subject: [PATCH 10/38] fix clean module Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/layer_wise/utils.py | 8 +++++++- neural_compressor/torch/algorithms/weight_only/rtn.py | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index f02c0d2de3a..bb1e2f8ebcc 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -278,12 +278,18 @@ def hook(module, input, output): return handle -def clean_module_weight(module): +def clean_module_weight(module, woq_type=False): if isinstance(module, QDQLayer): submodule = module.module else: submodule = module + if woq_type is True: + for n, m in submodule._buffers.items(): + old_value = getattr(submodule, n) + with torch.no_grad(): + submodule._buffers[n] = torch.zeros(old_value.shape, device="meta") + for n, m in submodule.named_parameters(): is_buffer = n in submodule._buffers old_value = getattr(submodule, n) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 4dcf046e05f..aa8f4f70ed9 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -228,7 +228,8 @@ def convert( from neural_compressor.torch.algorithms.layer_wise.utils import clean_module_weight torch.save(new_module.state_dict(), os.path.join(lwq_workspace, f"{name}.pt")) - clean_module_weight(new_module) + clean_module_weight(new_module, woq_type=True) + clean_module_weight(m) del m gc.collect() if name == "": From 14bd733bbf8abfe06f460ff68649fe0b0df5b8c9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Jun 2024 07:55:37 +0000 Subject: [PATCH 11/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/algorithms/layer_wise/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index bb1e2f8ebcc..095d27779aa 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -289,7 +289,7 @@ def clean_module_weight(module, woq_type=False): old_value = getattr(submodule, n) with torch.no_grad(): submodule._buffers[n] = torch.zeros(old_value.shape, device="meta") - + for n, m in submodule.named_parameters(): is_buffer = n in submodule._buffers old_value = getattr(submodule, n) From 4ce74db461e8eb6d34462084e388b8ba113773a6 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Wed, 26 Jun 2024 18:45:33 +0800 Subject: [PATCH 12/38] fix layerwise woq forward Signed-off-by: Kaihui-intel --- .../torch/algorithms/layer_wise/utils.py | 25 +++++++++++-------- .../torch/algorithms/weight_only/rtn.py | 2 +- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index bb1e2f8ebcc..8f4272597e4 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -25,12 
+25,13 @@ from accelerate.utils import set_module_tensor_to_device from transformers import AutoConfig, AutoModelForCausalLM from transformers.models.auto.auto_factory import _BaseAutoModelClass +from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear from neural_compressor.common import options from .load import load -LWQ_WORKSPACE = os.path.join(options.workspace, "layer_wise_tmp") +LWQ_WORKSPACE = os.path.join(options.workspace, "lwq_tmpdir") class QDQLayer(torch.nn.Module): @@ -250,13 +251,17 @@ def hook(module, input): state_dict = None if os.path.exists(os.path.join(LWQ_WORKSPACE, f"{name}.pt")): state_dict = torch.load(os.path.join(LWQ_WORKSPACE, f"{name}.pt")) - for n, p in module.named_parameters(): - param_name = name + "." + n - if state_dict: - value = state_dict[n] - else: - value = load_value(model, param_name, path) - set_module_tensor_to_device(model, param_name, device, value) + if isinstance(module, WeightOnlyLinear): + for n, p in module._buffers.items(): + setattr(module, n, state_dict[n]) + else: + for n, p in module.named_parameters(): + param_name = name + "." + n + if state_dict: + value = state_dict[n] + else: + value = load_value(model, param_name, path) + set_module_tensor_to_device(model, param_name, device, value) return hook @@ -278,13 +283,13 @@ def hook(module, input, output): return handle -def clean_module_weight(module, woq_type=False): +def clean_module_weight(module): if isinstance(module, QDQLayer): submodule = module.module else: submodule = module - if woq_type is True: + if isinstance(module, WeightOnlyLinear): for n, m in submodule._buffers.items(): old_value = getattr(submodule, n) with torch.no_grad(): diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index aa8f4f70ed9..bf539f46a6e 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -228,7 +228,7 @@ def convert( from neural_compressor.torch.algorithms.layer_wise.utils import clean_module_weight torch.save(new_module.state_dict(), os.path.join(lwq_workspace, f"{name}.pt")) - clean_module_weight(new_module, woq_type=True) + clean_module_weight(new_module) clean_module_weight(m) del m gc.collect() From b700d39617ba7b77b10f768211986429189ac3d0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Jun 2024 10:48:26 +0000 Subject: [PATCH 13/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/algorithms/layer_wise/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index bd1ee39998f..2722a891144 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -25,9 +25,9 @@ from accelerate.utils import set_module_tensor_to_device from transformers import AutoConfig, AutoModelForCausalLM from transformers.models.auto.auto_factory import _BaseAutoModelClass -from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear from neural_compressor.common import options +from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear from .load import load @@ -253,7 +253,7 @@ def hook(module, input): state_dict = 
torch.load(os.path.join(LWQ_WORKSPACE, f"{name}.pt")) if isinstance(module, WeightOnlyLinear): for n, p in module._buffers.items(): - setattr(module, n, state_dict[n]) + setattr(module, n, state_dict[n]) else: for n, p in module.named_parameters(): param_name = name + "." + n From 96d0e05ab04a47276aac3fb7ea2d618da345fb1f Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Wed, 26 Jun 2024 19:09:07 +0800 Subject: [PATCH 14/38] fix import Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/weight_only/rtn.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index bf539f46a6e..b1331050419 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -20,6 +20,7 @@ import gc +import os from collections import OrderedDict import torch @@ -159,13 +160,10 @@ def convert( logger.debug(log_msg) if use_layer_wise: - import os - from neural_compressor.common.utils import DEFAULT_WORKSPACE - from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module, load_value + from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module, LWQ_WORKSPACE - lwq_workspace = os.path.join(DEFAULT_WORKSPACE, "lwq_tmpdir") - os.makedirs(lwq_workspace, exist_ok=True) + os.makedirs(LWQ_WORKSPACE, exist_ok=True) if model_path == "": model_path = model.path assert model_path, "model_path should not be None." @@ -227,7 +225,7 @@ def convert( # save and clean weight from neural_compressor.torch.algorithms.layer_wise.utils import clean_module_weight - torch.save(new_module.state_dict(), os.path.join(lwq_workspace, f"{name}.pt")) + torch.save(new_module.state_dict(), os.path.join(LWQ_WORKSPACE, f"{name}.pt")) clean_module_weight(new_module) clean_module_weight(m) del m From 7b2d3268bf3d66ed22672b7d2bd2641bc3647dec Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Jun 2024 11:11:50 +0000 Subject: [PATCH 15/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/algorithms/weight_only/rtn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index b1331050419..faf7f43a3c3 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -161,7 +161,7 @@ def convert( if use_layer_wise: from neural_compressor.common.utils import DEFAULT_WORKSPACE - from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module, LWQ_WORKSPACE + from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE, get_path, load_module os.makedirs(LWQ_WORKSPACE, exist_ok=True) if model_path == "": From 77cde5c1d9e6014342029a61dd164ee0492a246c Mon Sep 17 00:00:00 2001 From: sdp Date: Tue, 2 Jul 2024 23:30:32 -0700 Subject: [PATCH 16/38] update clean module & add timestep Signed-off-by: sdp --- .../weight_only/run_clm_no_trainer.py | 42 ++++++++++---- .../torch/algorithms/weight_only/rtn.py | 58 +++++++++++++++++-- 2 files changed, 84 insertions(+), 16 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py 
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index 8655c47a8da..21964f5d5b2 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -212,17 +212,35 @@ def get_user_model(): if args.quantize: # dataset - user_model, tokenizer = get_user_model() - calib_dataset = load_dataset(args.dataset, split="train") + if 0: + user_model, tokenizer = get_user_model() + use_layer_wise =False + # user_model.save_pretrained("./saved",max_shard_size="20GB", safe_serialization=False) + else: + from neural_compressor.torch.algorithms.layer_wise import load_empty_model + user_model = load_empty_model(args.model) + # user_model = AutoModelForCausalLM.from_pretrained( + # args.model, + # #trust_remote_code=args.trust_remote_code, + # low_cpu_mem_usage=True, + # torch_dtype="auto" + # ) + #from accelerate import init_empty_weights, load_checkpoint_and_dispatch + #tokenizer = AutoTokenizer.from_pretrained(args.model) + # checkpoint_file = "/home/sdp/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/f5db02db724555f92da89c216ac04704f23d4590" + #checkpoint_file = "./saved" #if checkpoint_file in "./saved" else checkpoint_file + #user_model = load_checkpoint_and_dispatch(user_model, checkpoint=checkpoint_file, device_mp="auto", offload_folder=checkpoint_file) + use_layer_wise = True + #calib_dataset = load_dataset(args.dataset, split="train") # calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/') # use this if trouble with connecting to HF - calib_dataset = calib_dataset.shuffle(seed=args.seed) - calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) - calib_dataloader = DataLoader( - calib_evaluator.dataset, - batch_size=calib_size, - shuffle=False, - collate_fn=calib_evaluator.collate_batch, - ) + #calib_dataset = calib_dataset.shuffle(seed=args.seed) + #calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) + #calib_dataloader = DataLoader( + # calib_evaluator.dataset, + # batch_size=calib_size, + # shuffle=False, + # collate_fn=calib_evaluator.collate_batch, + #) # 3.x api from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize @@ -255,8 +273,9 @@ def get_user_model(): double_quant_dtype=args.double_quant_dtype, double_quant_use_sym=args.double_quant_use_sym, double_quant_group_size=args.double_quant_group_size, + use_layer_wise=use_layer_wise, ) - quant_config.set_local("lm_head", RTNConfig(dtype="fp32")) + quant_config.set_local("lm_head", RTNConfig(use_layer_wise=use_layer_wise, dtype="fp32")) user_model = prepare(model=user_model, quant_config=quant_config) user_model = convert(model=user_model) elif args.woq_algo == "GPTQ": @@ -315,6 +334,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): run_fn_for_gptq(user_model, dataloader_for_calibration) user_model = convert(user_model) + exit(0) user_model.save(args.output_dir) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index faf7f43a3c3..676ab871bc0 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -92,12 +92,23 @@ def convert( """ 
weight_config = self.quant_config device = get_accelerator(kwargs.pop("device", "auto")).current_device_name() + if use_layer_wise: + from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE, get_path, load_module + + os.makedirs(LWQ_WORKSPACE, exist_ok=True) # Put model on device explicitly # TODO: refine it later, Put module on device one by one instead of the whole model - if not use_layer_wise: - model.to(device) + #if not use_layer_wise: + # model.to(device) + total_time = 0.0 + total_load_time = 0.0 + total_save_time = 0.0 + total_quant_time = 0.0 + total_quant_int_time = 0.0 + total_set_module_time = 0.0 + import time assert isinstance(model, torch.nn.Module), "only support torch module" if is_transformers_imported(): supported_layers = (torch.nn.Linear, transformers.Conv1D) @@ -113,6 +124,7 @@ def convert( } use_optimum_format = kwargs.get("use_optimum_format", True) for name, m in model.named_modules(): + if not isinstance(m, supported_layers): continue if name in weight_config: # pragma: no cover @@ -159,7 +171,8 @@ def convert( logger.debug(f"RTN quantized module:{name, m}") logger.debug(log_msg) - if use_layer_wise: + if use_layer_wise and True: + start_load = time.time() from neural_compressor.common.utils import DEFAULT_WORKSPACE from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE, get_path, load_module @@ -171,6 +184,9 @@ def convert( # load weight load_module(model, name, model_path, device=device) + load_time = time.time() - start_load + total_load_time += load_time + logger.info(load_time) # for only group_dim is 0 or only `transformers.Conv1D`, we need transpose weight. if is_transformers_imported(): @@ -183,6 +199,7 @@ def convert( weight = m.weight.detach() if use_mse_search: quantile = search_clip(m, bits, group_size, scheme, dtype, use_full_range) + start_quant = time.time() int_weight, scale, zp = quant_tensor( weight, dtype=dtype, @@ -194,6 +211,8 @@ def convert( full_range=use_full_range, **double_quant_config, ) + quant_int_time = time.time() - start_quant + total_quant_int_time += quant_int_time int_weight = int_weight.t_().contiguous() if transpose else int_weight scale = scale.t_().contiguous() if transpose else scale zp = zp.t_().contiguous() if transpose and zp is not None else zp @@ -219,22 +238,51 @@ def convert( use_optimum_format=use_optimum_format, device=device, ) + if name in ["model.layers.11.mlp.up_proj", "model.layers.16.mlp.gate_proj"]: + print("will break") + #breakpoint() + logger.info(name) new_module.pack(int_weight, scale, zp, m.bias) if use_layer_wise: # save and clean weight from neural_compressor.torch.algorithms.layer_wise.utils import clean_module_weight + from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE, get_path, load_module + + import time + start = time.time() torch.save(new_module.state_dict(), os.path.join(LWQ_WORKSPACE, f"{name}.pt")) - clean_module_weight(new_module) - clean_module_weight(m) + save_time = time.time() - start + logger.info(f"save time {save_time}") + total_save_time += save_time + start = time.time() + #clean_module_weight(new_module) + new_module = new_module.to_empty(device=torch.device("meta")) + m = m.to_empty(device=torch.device("meta")) + #clean_module_weight(m) + layer_time = time.time() - start + total_time += layer_time + logger.info(layer_time) del m gc.collect() if name == "": return new_module else: + start_set = time.time() set_module(model, name, new_module) + set_module_time = time.time() - start_set + total_set_module_time 
+= set_module_time + quant_time = time.time() - start_quant - save_time - layer_time + logger.info(f"quant time {quant_time}") + total_quant_time += quant_time + logger.info(f"load time: {total_load_time}") + logger.info(f"save time: {total_save_time}") + logger.info(f"clean time: {total_time}") + logger.info(f"quant time: {total_quant_time}") + logger.info(f"quant int time: {total_quant_int_time}") + logger.info(f"set module time: {total_set_module_time}") if use_layer_wise: # register hooks from neural_compressor.torch.algorithms.layer_wise.utils import register_weight_hooks From 6cf8ff3463b3f1b5e7e2806a9e9960a00d68aae7 Mon Sep 17 00:00:00 2001 From: sdp Date: Wed, 10 Jul 2024 22:00:01 -0700 Subject: [PATCH 17/38] add numba pack Signed-off-by: sdp --- .../torch/algorithms/weight_only/modules.py | 322 +++++++++++++++++- .../torch/algorithms/weight_only/rtn.py | 4 +- 2 files changed, 311 insertions(+), 15 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py index 30c40cfa9c3..1ad7339e314 100644 --- a/neural_compressor/torch/algorithms/weight_only/modules.py +++ b/neural_compressor/torch/algorithms/weight_only/modules.py @@ -23,6 +23,7 @@ import torch from torch.autograd import Function from torch.nn import functional as F +import numba from neural_compressor.torch.utils import accelerator, logger @@ -300,25 +301,320 @@ def unpack_tensor_with_torch(self, packed_tensor): unpacked_tensor[:, index].copy_(tmp.type(target_dtype)) accelerator.synchronize() return unpacked_tensor + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b4_c32( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 7] << 28) + | (raw_array[:, i * n_pack + 6] << 24) + | (raw_array[:, i * n_pack + 5] << 20) + | (raw_array[:, i * n_pack + 4] << 16) + | (raw_array[:, i * n_pack + 3] << 12) + | (raw_array[:, i * n_pack + 2] << 8) + | (raw_array[:, i * n_pack + 1] << 4) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b4_c16( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 3] << 12) + | (raw_array[:, i * n_pack + 2] << 8) + | (raw_array[:, i * n_pack + 1] << 4) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b4_c8( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 1] << 4) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b4_c64( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 15] << 60) + | (raw_array[:, i * n_pack + 14] << 56) + | (raw_array[:, i * n_pack + 13] << 52) + | (raw_array[:, i * n_pack + 12] << 48) + | (raw_array[:, i * n_pack + 11] << 44) + | (raw_array[:, i * n_pack + 10] << 40) + | (raw_array[:, i * n_pack + 9] << 36) + | (raw_array[:, i * n_pack + 8] 
<< 32) + | (raw_array[:, i * n_pack + 7] << 28) + | (raw_array[:, i * n_pack + 6] << 24) + | (raw_array[:, i * n_pack + 5] << 20) + | (raw_array[:, i * n_pack + 4] << 16) + | (raw_array[:, i * n_pack + 3] << 12) + | (raw_array[:, i * n_pack + 2] << 8) + | (raw_array[:, i * n_pack + 1] << 4) + | raw_array[:, i * n_pack] + ) + return packed_array - def pack_tensor_with_numpy(self, raw_tensor): + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b8_c32( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 3] << 24) + | (raw_array[:, i * n_pack + 2] << 16) + | (raw_array[:, i * n_pack + 1] << 8) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b8_c16( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 3] << 24) + | (raw_array[:, i * n_pack + 2] << 16) + | (raw_array[:, i * n_pack + 1] << 8) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b8_c8( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = raw_array[:, i * n_pack] + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b8_c64( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 7] << 56) + | (raw_array[:, i * n_pack + 6] << 48) + | (raw_array[:, i * n_pack + 5] << 40) + | (raw_array[:, i * n_pack + 4] << 32) + | (raw_array[:, i * n_pack + 3] << 24) + | (raw_array[:, i * n_pack + 2] << 16) + | (raw_array[:, i * n_pack + 1] << 8) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b2_c32( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b2_c32( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 15] << 30) + | (raw_array[:, i * n_pack + 14] << 28) + | (raw_array[:, i * n_pack + 13] << 26) + | (raw_array[:, i * n_pack + 12] << 24) + | (raw_array[:, i * n_pack + 11] << 22) + | (raw_array[:, i * n_pack + 10] << 20) + | (raw_array[:, i * n_pack + 9] << 18) + | (raw_array[:, i * n_pack + 8] << 16) + | (raw_array[:, i * n_pack + 7] << 14) + | (raw_array[:, i * n_pack + 6] << 12) + | (raw_array[:, i * n_pack + 5] << 10) + | (raw_array[:, i * n_pack + 4] << 8) + | (raw_array[:, i * n_pack + 3] << 6) + | (raw_array[:, i * n_pack + 2] << 4) + | (raw_array[:, i * n_pack + 1] << 2) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b2_c16( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + 
packed_array[:, i] = ( + (raw_array[:, i * n_pack + 8] << 16) + | (raw_array[:, i * n_pack + 7] << 14) + | (raw_array[:, i * n_pack + 6] << 12) + | (raw_array[:, i * n_pack + 5] << 10) + | (raw_array[:, i * n_pack + 4] << 8) + | (raw_array[:, i * n_pack + 3] << 6) + | (raw_array[:, i * n_pack + 2] << 4) + | (raw_array[:, i * n_pack + 1] << 2) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b2_c8( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 3] << 6) + | (raw_array[:, i * n_pack + 2] << 4) + | (raw_array[:, i * n_pack + 1] << 2) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b2_c64( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 31] << 62) + | (raw_array[:, i * n_pack + 30] << 60) + | (raw_array[:, i * n_pack + 29] << 58) + | (raw_array[:, i * n_pack + 28] << 56) + | (raw_array[:, i * n_pack + 27] << 54) + | (raw_array[:, i * n_pack + 26] << 52) + | (raw_array[:, i * n_pack + 25] << 50) + | (raw_array[:, i * n_pack + 24] << 48) + | (raw_array[:, i * n_pack + 23] << 46) + | (raw_array[:, i * n_pack + 22] << 44) + | (raw_array[:, i * n_pack + 21] << 42) + | (raw_array[:, i * n_pack + 20] << 40) + | (raw_array[:, i * n_pack + 19] << 38) + | (raw_array[:, i * n_pack + 18] << 36) + | (raw_array[:, i * n_pack + 17] << 34) + | (raw_array[:, i * n_pack + 16] << 32) + | (raw_array[:, i * n_pack + 15] << 30) + | (raw_array[:, i * n_pack + 14] << 28) + | (raw_array[:, i * n_pack + 13] << 26) + | (raw_array[:, i * n_pack + 12] << 24) + | (raw_array[:, i * n_pack + 11] << 22) + | (raw_array[:, i * n_pack + 10] << 20) + | (raw_array[:, i * n_pack + 9] << 18) + | (raw_array[:, i * n_pack + 8] << 16) + | (raw_array[:, i * n_pack + 7] << 14) + | (raw_array[:, i * n_pack + 6] << 12) + | (raw_array[:, i * n_pack + 5] << 10) + | (raw_array[:, i * n_pack + 4] << 8) + | (raw_array[:, i * n_pack + 3] << 6) + | (raw_array[:, i * n_pack + 2] << 4) + | (raw_array[:, i * n_pack + 1] << 2) + | raw_array[:, i * n_pack] + ) + return packed_array + + def pack_array_with_numba1( + self, raw_array: np.ndarray, n_pack: int, bits: int, compress_bits: int, compression_dtype=np.int32 + ) -> np.ndarray: + """Packs the input array by combining elements into a specified bit-width format using NumPy. + + Args: + raw_array (np.ndarray): The array to be packed. Shape: [out_features, in_features] or [1, in_features]. + n_pack (int): The number of elements to be packed together. + bits (int): The number of bits for each element. + compress_bits (int): The number of bits for each element of the compressed array, supported 2, 4, 8. + compression_dtype (np.dtype, optional): The data type of the compressed array. Defaults to np.int32. + + Returns: + np.ndarray: The packed array. 
+ """ + out_features, in_features = raw_array.shape + new_in_features = (in_features + n_pack - 1) // n_pack + packed_array = np.zeros((out_features, new_in_features), dtype=compression_dtype) + raw_array = raw_array.astype(compression_dtype) + + pack_method_name = f"pack_array_with_numba_b{bits}_c{compress_bits}" + pack_method = getattr(self, pack_method_name) + return pack_method(raw_array, packed_array, n_pack, new_in_features) + + @staticmethod + @numba.jit(nopython=True) + def pack_array_with_numba( + raw_tensor: np.ndarray, n_pack: int, bits: int, compression_dtype=np.int32 + ) -> np.ndarray: + """Packs the input tensor by combining elements into a specified bit-width format using NumPy. + Args: + raw_tensor (np.ndarray): The tensor to be packed. Shape: [out_features, in_features] or [1, in_features]. + n_pack (int): The number of elements to be packed together. + bits (int): The number of bits for each element. + compression_dtype (np.dtype, optional): The data type of the compressed tensor. Defaults to np.int32. + Returns: + np.ndarray: The packed tensor. + """ + out_features, in_features = raw_tensor.shape + new_in_features = (in_features + n_pack - 1) // n_pack + packed_tensor = np.zeros((out_features, new_in_features), dtype=compression_dtype) + raw_tensor = raw_tensor.astype(compression_dtype) + + if bits == 4: + for i in range(new_in_features): + packed_tensor[:, i] = ( + (raw_tensor[:, i * n_pack + 7] << 28) + | (raw_tensor[:, i * n_pack + 6] << 24) + | (raw_tensor[:, i * n_pack + 5] << 20) + | (raw_tensor[:, i * n_pack + 4] << 16) + | (raw_tensor[:, i * n_pack + 3] << 12) + | (raw_tensor[:, i * n_pack + 2] << 8) + | (raw_tensor[:, i * n_pack + 1] << 4) + | raw_tensor[:, i * n_pack] + ) + + return packed_tensor + + def pack_tensor_with_reshape(self, raw_tensor): raw_array = raw_tensor.cpu().numpy() target_len = np.ceil(raw_array.shape[1] / self.n_pack).astype(int) target_dtype = torch.tensor(0, dtype=self.compression_dtype).numpy().dtype - packed_array = np.zeros((raw_array.shape[0], target_len), dtype=target_dtype) - mask = np.uint8(2**self.bits - 1) - for j in range(packed_array.shape[1]): - start = self.n_pack * j - end = self.n_pack * (j + 1) - tmp = raw_array[:, start:end].astype(target_dtype) - tmp &= mask - for e in range(tmp.shape[1]): - tmp[:, e] = np.left_shift(tmp[:, e], self.bits * e) - packed_array[:, j] |= tmp[:, e] - accelerator.synchronize() - packed_tensor = torch.from_numpy(packed_array).to(device=raw_tensor.device) + reshaped = raw_array.reshape(-1, self.n_pack) + packed_array = np.zeros(reshaped.shape[0], dtype=target_dtype) + for i in range(self.n_pack): + packed_array |= (reshaped[:, i].astype(target_dtype) << (self.bits * i)) + + packed_tensor = torch.from_numpy(packed_array.reshape((raw_array.shape[0], target_len))).to(device=raw_tensor.device) return packed_tensor + def pack_tensor_with_numpy(self, raw_tensor): + # breakpoint() + if self.bits not in [2, 4, 8]: + return self.pack_tensor_with_reshape(raw_tensor) + compression_dtype = torch.tensor(0, dtype=self.compression_dtype).numpy().dtype + packed_array = self.pack_array_with_numba(raw_tensor.cpu().numpy(), self.n_pack, self.bits, compression_dtype) + # packed_array = self.pack_array_with_numba(raw_tensor.cpu().numpy(), self.n_pack, self.bits, self.compress_bits, compression_dtype) + return torch.from_numpy(packed_array).to(device=raw_tensor.device) + def unpack_tensor_with_numpy(self, packed_tensor): packed_array = packed_tensor.cpu().numpy() target_dtype = np.int8 if not hasattr(self, "qzeros") or 
"int" not in self.dtype else np.uint8 diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 676ab871bc0..5143f22df60 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -108,6 +108,8 @@ def convert( total_quant_time = 0.0 total_quant_int_time = 0.0 total_set_module_time = 0.0 + save_time = 0.0 + layer_time = 0.0 import time assert isinstance(model, torch.nn.Module), "only support torch module" if is_transformers_imported(): @@ -264,8 +266,6 @@ def convert( layer_time = time.time() - start total_time += layer_time logger.info(layer_time) - del m - gc.collect() if name == "": return new_module else: From 0e388c0b1f06d6095d1477fe7a8772445d0aede9 Mon Sep 17 00:00:00 2001 From: sdp Date: Wed, 10 Jul 2024 22:59:44 -0700 Subject: [PATCH 18/38] mimor fix numba Signed-off-by: sdp --- .../torch/algorithms/weight_only/modules.py | 12 +++++------- test/3x/torch/quantization/weight_only/test_rtn.py | 4 ++-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py index 1ad7339e314..2fba8b7e6a0 100644 --- a/neural_compressor/torch/algorithms/weight_only/modules.py +++ b/neural_compressor/torch/algorithms/weight_only/modules.py @@ -468,8 +468,7 @@ def pack_array_with_numba_b2_c16( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 8] << 16) - | (raw_array[:, i * n_pack + 7] << 14) + (raw_array[:, i * n_pack + 7] << 14) | (raw_array[:, i * n_pack + 6] << 12) | (raw_array[:, i * n_pack + 5] << 10) | (raw_array[:, i * n_pack + 4] << 8) @@ -536,7 +535,7 @@ def pack_array_with_numba_b2_c64( ) return packed_array - def pack_array_with_numba1( + def pack_array_with_numba( self, raw_array: np.ndarray, n_pack: int, bits: int, compress_bits: int, compression_dtype=np.int32 ) -> np.ndarray: """Packs the input array by combining elements into a specified bit-width format using NumPy. @@ -562,7 +561,7 @@ def pack_array_with_numba1( @staticmethod @numba.jit(nopython=True) - def pack_array_with_numba( + def pack_array_with_numba_yi( raw_tensor: np.ndarray, n_pack: int, bits: int, compression_dtype=np.int32 ) -> np.ndarray: """Packs the input tensor by combining elements into a specified bit-width format using NumPy. 
@@ -607,12 +606,11 @@ def pack_tensor_with_reshape(self, raw_tensor): return packed_tensor def pack_tensor_with_numpy(self, raw_tensor): - # breakpoint() if self.bits not in [2, 4, 8]: return self.pack_tensor_with_reshape(raw_tensor) compression_dtype = torch.tensor(0, dtype=self.compression_dtype).numpy().dtype - packed_array = self.pack_array_with_numba(raw_tensor.cpu().numpy(), self.n_pack, self.bits, compression_dtype) - # packed_array = self.pack_array_with_numba(raw_tensor.cpu().numpy(), self.n_pack, self.bits, self.compress_bits, compression_dtype) + # packed_array = self.pack_array_with_numba_yi(raw_tensor.cpu().numpy(), self.n_pack, self.bits, compression_dtype) + packed_array = self.pack_array_with_numba(raw_tensor.cpu().numpy(), self.n_pack, self.bits, self.compress_bits, compression_dtype) return torch.from_numpy(packed_array).to(device=raw_tensor.device) def unpack_tensor_with_numpy(self, packed_tensor): diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index 94f7c2954b1..0623a58d3be 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -22,8 +22,8 @@ class ModelConv1d(torch.nn.Module): def __init__(self): super(ModelConv1d, self).__init__() - self.fc1 = transformers.Conv1D(50, 32) - self.fc2 = torch.nn.Linear(50, 32) + self.fc1 = transformers.Conv1D(64, 32) + self.fc2 = torch.nn.Linear(64, 32) self.fc3 = torch.nn.Linear(32, 5) def forward(self, x): From b0ccd622d4c70d0603f0103f2f4cff9c38467a35 Mon Sep 17 00:00:00 2001 From: sdp Date: Wed, 10 Jul 2024 23:45:35 -0700 Subject: [PATCH 19/38] apply mask Signed-off-by: sdp --- .../torch/algorithms/weight_only/modules.py | 223 +++++++++--------- 1 file changed, 108 insertions(+), 115 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py index 2fba8b7e6a0..1bb6a4321b4 100644 --- a/neural_compressor/torch/algorithms/weight_only/modules.py +++ b/neural_compressor/torch/algorithms/weight_only/modules.py @@ -309,14 +309,14 @@ def pack_array_with_numba_b4_c32( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 7] << 28) - | (raw_array[:, i * n_pack + 6] << 24) - | (raw_array[:, i * n_pack + 5] << 20) - | (raw_array[:, i * n_pack + 4] << 16) - | (raw_array[:, i * n_pack + 3] << 12) - | (raw_array[:, i * n_pack + 2] << 8) - | (raw_array[:, i * n_pack + 1] << 4) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 7] & 0b1111) << 28) + | ((raw_array[:, i * n_pack + 6] & 0b1111) << 24) + | ((raw_array[:, i * n_pack + 5] & 0b1111) << 20) + | ((raw_array[:, i * n_pack + 4] & 0b1111) << 16) + | ((raw_array[:, i * n_pack + 3] & 0b1111) << 12) + | ((raw_array[:, i * n_pack + 2] & 0b1111) << 8) + | ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) + | (raw_array[:, i * n_pack] & 0b1111) ) return packed_array @@ -327,10 +327,10 @@ def pack_array_with_numba_b4_c16( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 3] << 12) - | (raw_array[:, i * n_pack + 2] << 8) - | (raw_array[:, i * n_pack + 1] << 4) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 3] & 0b1111) << 12) + | ((raw_array[:, i * n_pack + 2] & 0b1111) << 8) + | ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) + | (raw_array[:, i * n_pack] & 0b1111) ) return packed_array @@ -341,8 +341,8 @@ def pack_array_with_numba_b4_c8( ) -> np.ndarray: for i in 
range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 1] << 4) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) + | (raw_array[:, i * n_pack] & 0b1111) ) return packed_array @@ -353,22 +353,22 @@ def pack_array_with_numba_b4_c64( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 15] << 60) - | (raw_array[:, i * n_pack + 14] << 56) - | (raw_array[:, i * n_pack + 13] << 52) - | (raw_array[:, i * n_pack + 12] << 48) - | (raw_array[:, i * n_pack + 11] << 44) - | (raw_array[:, i * n_pack + 10] << 40) - | (raw_array[:, i * n_pack + 9] << 36) - | (raw_array[:, i * n_pack + 8] << 32) - | (raw_array[:, i * n_pack + 7] << 28) - | (raw_array[:, i * n_pack + 6] << 24) - | (raw_array[:, i * n_pack + 5] << 20) - | (raw_array[:, i * n_pack + 4] << 16) - | (raw_array[:, i * n_pack + 3] << 12) - | (raw_array[:, i * n_pack + 2] << 8) - | (raw_array[:, i * n_pack + 1] << 4) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 15] & 0b1111) << 60) + | ((raw_array[:, i * n_pack + 14] & 0b1111) << 56) + | ((raw_array[:, i * n_pack + 13] & 0b1111) << 52) + | ((raw_array[:, i * n_pack + 12] & 0b1111) << 48) + | ((raw_array[:, i * n_pack + 11] & 0b1111) << 44) + | ((raw_array[:, i * n_pack + 10] & 0b1111) << 40) + | ((raw_array[:, i * n_pack + 9] & 0b1111) << 36) + | ((raw_array[:, i * n_pack + 8] & 0b1111) << 32) + | ((raw_array[:, i * n_pack + 7] & 0b1111) << 28) + | ((raw_array[:, i * n_pack + 6] & 0b1111) << 24) + | ((raw_array[:, i * n_pack + 5] & 0b1111) << 20) + | ((raw_array[:, i * n_pack + 4] & 0b1111) << 16) + | ((raw_array[:, i * n_pack + 3] & 0b1111) << 12) + | ((raw_array[:, i * n_pack + 2] & 0b1111) << 8) + | ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) + | (raw_array[:, i * n_pack] & 0b1111) ) return packed_array @@ -380,10 +380,10 @@ def pack_array_with_numba_b8_c32( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 3] << 24) - | (raw_array[:, i * n_pack + 2] << 16) - | (raw_array[:, i * n_pack + 1] << 8) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 3] & 0b11111111) << 24) + | ((raw_array[:, i * n_pack + 2] & 0b11111111) << 16) + | ((raw_array[:, i * n_pack + 1] & 0b11111111) << 8) + | (raw_array[:, i * n_pack] & 0b11111111) ) return packed_array @@ -394,10 +394,10 @@ def pack_array_with_numba_b8_c16( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 3] << 24) - | (raw_array[:, i * n_pack + 2] << 16) - | (raw_array[:, i * n_pack + 1] << 8) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 3] & 0b11111111) << 24) + | ((raw_array[:, i * n_pack + 2] & 0b11111111) << 16) + | ((raw_array[:, i * n_pack + 1] & 0b11111111) << 8) + | (raw_array[:, i * n_pack] & 0b11111111) ) return packed_array @@ -407,7 +407,7 @@ def pack_array_with_numba_b8_c8( raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int ) -> np.ndarray: for i in range(new_in_features): - packed_array[:, i] = raw_array[:, i * n_pack] + packed_array[:, i] = (raw_array[:, i * n_pack] & 0b11111111) return packed_array @staticmethod @@ -417,24 +417,17 @@ def pack_array_with_numba_b8_c64( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 7] << 56) - | (raw_array[:, i * n_pack + 6] << 48) - | (raw_array[:, i * n_pack + 5] << 40) - | (raw_array[:, i * n_pack + 4] << 32) - | (raw_array[:, i * n_pack + 3] << 24) - | (raw_array[:, i * n_pack + 
2] << 16) - | (raw_array[:, i * n_pack + 1] << 8) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 7] & 0b11111111) << 56) + | ((raw_array[:, i * n_pack + 6] & 0b11111111) << 48) + | ((raw_array[:, i * n_pack + 5] & 0b11111111) << 40) + | ((raw_array[:, i * n_pack + 4] & 0b11111111) << 32) + | ((raw_array[:, i * n_pack + 3] & 0b11111111) << 24) + | ((raw_array[:, i * n_pack + 2] & 0b11111111) << 16) + | ((raw_array[:, i * n_pack + 1] & 0b11111111) << 8) + | (raw_array[:, i * n_pack] & 0b11111111) ) return packed_array - @staticmethod - @numba.jit(nopython=True, parallel=True) - def pack_array_with_numba_b2_c32( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int - ) -> np.ndarray: - return packed_array - @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b2_c32( @@ -442,22 +435,22 @@ def pack_array_with_numba_b2_c32( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 15] << 30) - | (raw_array[:, i * n_pack + 14] << 28) - | (raw_array[:, i * n_pack + 13] << 26) - | (raw_array[:, i * n_pack + 12] << 24) - | (raw_array[:, i * n_pack + 11] << 22) - | (raw_array[:, i * n_pack + 10] << 20) - | (raw_array[:, i * n_pack + 9] << 18) - | (raw_array[:, i * n_pack + 8] << 16) - | (raw_array[:, i * n_pack + 7] << 14) - | (raw_array[:, i * n_pack + 6] << 12) - | (raw_array[:, i * n_pack + 5] << 10) - | (raw_array[:, i * n_pack + 4] << 8) - | (raw_array[:, i * n_pack + 3] << 6) - | (raw_array[:, i * n_pack + 2] << 4) - | (raw_array[:, i * n_pack + 1] << 2) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 15] & 0b11) << 30) + | ((raw_array[:, i * n_pack + 14] & 0b11) << 28) + | ((raw_array[:, i * n_pack + 13] & 0b11) << 26) + | ((raw_array[:, i * n_pack + 12] & 0b11) << 24) + | ((raw_array[:, i * n_pack + 11] & 0b11) << 22) + | ((raw_array[:, i * n_pack + 10] & 0b11) << 20) + | ((raw_array[:, i * n_pack + 9] & 0b11) << 18) + | ((raw_array[:, i * n_pack + 8] & 0b11) << 16) + | ((raw_array[:, i * n_pack + 7] & 0b11) << 14) + | ((raw_array[:, i * n_pack + 6] & 0b11) << 12) + | ((raw_array[:, i * n_pack + 5] & 0b11) << 10) + | ((raw_array[:, i * n_pack + 4] & 0b11) << 8) + | ((raw_array[:, i * n_pack + 3] & 0b11) << 6) + | ((raw_array[:, i * n_pack + 2] & 0b11) << 4) + | ((raw_array[:, i * n_pack + 1] & 0b11) << 2) + | (raw_array[:, i * n_pack] & 0b11) ) return packed_array @@ -468,14 +461,14 @@ def pack_array_with_numba_b2_c16( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 7] << 14) - | (raw_array[:, i * n_pack + 6] << 12) - | (raw_array[:, i * n_pack + 5] << 10) - | (raw_array[:, i * n_pack + 4] << 8) - | (raw_array[:, i * n_pack + 3] << 6) - | (raw_array[:, i * n_pack + 2] << 4) - | (raw_array[:, i * n_pack + 1] << 2) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 7] & 0b11) << 14) + | ((raw_array[:, i * n_pack + 6] & 0b11) << 12) + | ((raw_array[:, i * n_pack + 5] & 0b11) << 10) + | ((raw_array[:, i * n_pack + 4] & 0b11) << 8) + | ((raw_array[:, i * n_pack + 3] & 0b11) << 6) + | ((raw_array[:, i * n_pack + 2] & 0b11) << 4) + | ((raw_array[:, i * n_pack + 1] & 0b11) << 2) + | (raw_array[:, i * n_pack] & 0b11) ) return packed_array @@ -486,10 +479,10 @@ def pack_array_with_numba_b2_c8( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 3] << 6) - | (raw_array[:, i * n_pack + 2] << 4) - | (raw_array[:, i * n_pack + 1] << 2) - | raw_array[:, i * n_pack] + 
((raw_array[:, i * n_pack + 3] & 0b11) << 6) + | ((raw_array[:, i * n_pack + 2] & 0b11) << 4) + | ((raw_array[:, i * n_pack + 1] & 0b11) << 2) + | (raw_array[:, i * n_pack] & 0b11) ) return packed_array @@ -500,38 +493,38 @@ def pack_array_with_numba_b2_c64( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 31] << 62) - | (raw_array[:, i * n_pack + 30] << 60) - | (raw_array[:, i * n_pack + 29] << 58) - | (raw_array[:, i * n_pack + 28] << 56) - | (raw_array[:, i * n_pack + 27] << 54) - | (raw_array[:, i * n_pack + 26] << 52) - | (raw_array[:, i * n_pack + 25] << 50) - | (raw_array[:, i * n_pack + 24] << 48) - | (raw_array[:, i * n_pack + 23] << 46) - | (raw_array[:, i * n_pack + 22] << 44) - | (raw_array[:, i * n_pack + 21] << 42) - | (raw_array[:, i * n_pack + 20] << 40) - | (raw_array[:, i * n_pack + 19] << 38) - | (raw_array[:, i * n_pack + 18] << 36) - | (raw_array[:, i * n_pack + 17] << 34) - | (raw_array[:, i * n_pack + 16] << 32) - | (raw_array[:, i * n_pack + 15] << 30) - | (raw_array[:, i * n_pack + 14] << 28) - | (raw_array[:, i * n_pack + 13] << 26) - | (raw_array[:, i * n_pack + 12] << 24) - | (raw_array[:, i * n_pack + 11] << 22) - | (raw_array[:, i * n_pack + 10] << 20) - | (raw_array[:, i * n_pack + 9] << 18) - | (raw_array[:, i * n_pack + 8] << 16) - | (raw_array[:, i * n_pack + 7] << 14) - | (raw_array[:, i * n_pack + 6] << 12) - | (raw_array[:, i * n_pack + 5] << 10) - | (raw_array[:, i * n_pack + 4] << 8) - | (raw_array[:, i * n_pack + 3] << 6) - | (raw_array[:, i * n_pack + 2] << 4) - | (raw_array[:, i * n_pack + 1] << 2) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 31] & 0b11) << 62) + | ((raw_array[:, i * n_pack + 30] & 0b11) << 60) + | ((raw_array[:, i * n_pack + 29] & 0b11) << 58) + | ((raw_array[:, i * n_pack + 28] & 0b11) << 56) + | ((raw_array[:, i * n_pack + 27] & 0b11) << 54) + | ((raw_array[:, i * n_pack + 26] & 0b11) << 52) + | ((raw_array[:, i * n_pack + 25] & 0b11) << 50) + | ((raw_array[:, i * n_pack + 24] & 0b11) << 48) + | ((raw_array[:, i * n_pack + 23] & 0b11) << 46) + | ((raw_array[:, i * n_pack + 22] & 0b11) << 44) + | ((raw_array[:, i * n_pack + 21] & 0b11) << 42) + | ((raw_array[:, i * n_pack + 20] & 0b11) << 40) + | ((raw_array[:, i * n_pack + 19] & 0b11) << 38) + | ((raw_array[:, i * n_pack + 18] & 0b11) << 36) + | ((raw_array[:, i * n_pack + 17] & 0b11) << 34) + | ((raw_array[:, i * n_pack + 16] & 0b11) << 32) + | ((raw_array[:, i * n_pack + 15] & 0b11) << 30) + | ((raw_array[:, i * n_pack + 14] & 0b11) << 28) + | ((raw_array[:, i * n_pack + 13] & 0b11) << 26) + | ((raw_array[:, i * n_pack + 12] & 0b11) << 24) + | ((raw_array[:, i * n_pack + 11] & 0b11) << 22) + | ((raw_array[:, i * n_pack + 10] & 0b11) << 20) + | ((raw_array[:, i * n_pack + 9] & 0b11) << 18) + | ((raw_array[:, i * n_pack + 8] & 0b11) << 16) + | ((raw_array[:, i * n_pack + 7] & 0b11) << 14) + | ((raw_array[:, i * n_pack + 6] & 0b11) << 12) + | ((raw_array[:, i * n_pack + 5] & 0b11) << 10) + | ((raw_array[:, i * n_pack + 4] & 0b11) << 8) + | ((raw_array[:, i * n_pack + 3] & 0b11) << 6) + | ((raw_array[:, i * n_pack + 2] & 0b11) << 4) + | ((raw_array[:, i * n_pack + 1] & 0b11) << 2) + | (raw_array[:, i * n_pack] & 0b11) ) return packed_array @@ -582,7 +575,7 @@ def pack_array_with_numba_yi( for i in range(new_in_features): packed_tensor[:, i] = ( (raw_tensor[:, i * n_pack + 7] << 28) - | (raw_tensor[:, i * n_pack + 6] << 24) + | (raw_tensor[:, i * n_pack + 6] << 24) | (raw_tensor[:, i * n_pack + 5] << 20) | 
(raw_tensor[:, i * n_pack + 4] << 16) | (raw_tensor[:, i * n_pack + 3] << 12) From 0f7de684bf589fb6679c2f5b1e21516c490a1790 Mon Sep 17 00:00:00 2001 From: sdp Date: Thu, 11 Jul 2024 01:32:06 -0700 Subject: [PATCH 20/38] support gptq Signed-off-by: sdp --- .../torch/algorithms/weight_only/gptq.py | 98 ++++++++++--------- 1 file changed, 54 insertions(+), 44 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/gptq.py b/neural_compressor/torch/algorithms/weight_only/gptq.py index 30b4d07be6a..942e38b73d3 100644 --- a/neural_compressor/torch/algorithms/weight_only/gptq.py +++ b/neural_compressor/torch/algorithms/weight_only/gptq.py @@ -482,20 +482,12 @@ def tmp(_, inp, out): weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx)) logger.info(f"Quantizing layer {layer_name}") if self.use_layer_wise: # pragma: no cover - from neural_compressor.torch.algorithms.layer_wise import load_value, set_module_tensor_to_device + from neural_compressor.torch.algorithms.layer_wise import load_value full_layer_name = self.get_full_layer_name(layer_name, block_idx) - for n, p in sub_layers[layer_name].named_parameters(): - param_name = full_layer_name + "." + n - if n == "weight": - W = load_value(self.model, full_layer_name + ".weight", self.model_path) - else: - value = load_value(self.model, param_name, self.model_path) - set_module_tensor_to_device(self.model, param_name, self.device, value) - + W = load_value(self.model, full_layer_name + ".weight", self.model_path) else: W = sub_layers[layer_name].weight.data.clone() - accelerator.mark_step() if "hpu" in self.device: W = W.to("cpu") @@ -507,8 +499,30 @@ def tmp(_, inp, out): act_order=weight_config_this_layer["act_order"], static_groups=weight_config_this_layer["static_groups"], ) + if self.use_layer_wise: # pragma: no cover + from neural_compressor.torch.algorithms.layer_wise import ( + LWQ_WORKSPACE, + clean_module_weight, + load_value, + set_module_tensor_to_device, + ) - # Step 2.5: export to compressed model + sub_layer = sub_layers[layer_name] + full_layer_name = self.get_full_layer_name(layer_name, block_idx) + for n, p in sub_layer.named_parameters(): + param_name = full_layer_name + "." 
+ n + if n == "weight": + set_module_tensor_to_device(self.model, param_name, self.device, Q) + else: + value = load_value(self.model, param_name, self.model_path) + set_module_tensor_to_device(self.model, param_name, self.device, value) + # sub_layer.weight.data = Q + torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") + clean_module_weight(sub_layer) + del Q + gc.collect() + else: + sub_layers[layer_name].weight.data = Q gptq_config[self.get_full_layer_name(layer_name, block_idx)] = {"scale": scale} if not weight_config_this_layer["sym"]: gptq_config[self.get_full_layer_name(layer_name, block_idx)]["zero"] = zp @@ -516,7 +530,24 @@ def tmp(_, inp, out): gptq_config[self.get_full_layer_name(layer_name, block_idx)]["perm"] = gptq_for_this_block[ layer_name ].perm + gptq_for_this_block[layer_name].free() + # Step 2.5: replace output data with quantized weights + outs = [] + batch_num = self.cache_key_arguments.pop("batch_num") + for j in range(batch_num): + cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j) + cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j) + out = transformer_block(*cache_positional_batch, **cache_keyword_batch) + out = self.track_hidden_states(out) + outs.append(out) + self.cache_key_arguments["batch_num"] = batch_num + if self.use_layer_wise: # pragma: no cover + self.gptq_related_blocks["transformers"][block_idx] = transformer_block + else: + self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() + # Step 2.6: export to compressed model + for layer_name in sub_layers: weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx)) gptq_scale = gptq_config[self.get_full_layer_name(layer_name, block_idx)]["scale"] if not weight_config_this_layer["sym"]: @@ -527,6 +558,13 @@ def tmp(_, inp, out): gptq_perm = gptq_config[self.get_full_layer_name(layer_name, block_idx)]["perm"] else: gptq_perm = None + if self.use_layer_wise: + state_dict = torch.load(LWQ_WORKSPACE + f"/{self.get_full_layer_name(layer_name, block_idx)}.pt") + Q = state_dict["weight"].data + bias = state_dict["bias"] if "bias" in state_dict.keys() else None + + else: + Q = sub_layers[layer_name].weight.data if weight_config_this_layer["act_order"]: Q.copy_(Q[:, gptq_perm]) if is_transformers_imported() and isinstance(sub_layers[layer_name], transformers.Conv1D): @@ -555,6 +593,9 @@ def tmp(_, inp, out): scale = scale.t_().contiguous() zp = zp.t_().contiguous() if zp is not None else zp + if not self.use_layer_wise: + bias = sub_layers[layer_name].bias + new_module = WeightOnlyLinear( in_features, out_features, @@ -562,43 +603,12 @@ def tmp(_, inp, out): bits=weight_config_this_layer["bits"], group_size=weight_config_this_layer["group_size"], zp=gptq_zp is not None, - bias=sub_layers[layer_name].bias is not None, + bias=bias is not None, g_idx=gptq_perm is not None, device=self.device, ) - new_module.pack(int_weight, gptq_scale, gptq_zp, sub_layers[layer_name].bias, gptq_perm) - - if self.use_layer_wise: # pragma: no cover - from neural_compressor.torch.algorithms.layer_wise import ( - LWQ_WORKSPACE, - clean_module_weight, - load_value, - set_module_tensor_to_device, - ) - - torch.save(new_module.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") - clean_module_weight(new_module) - del Q - gc.collect() + new_module.pack(int_weight, gptq_scale, gptq_zp, bias, gptq_perm) set_module(transformer_block, layer_name, new_module) - - 
gptq_for_this_block[layer_name].free() - - # Step 2.6: replace output data with quantized weights - outs = [] - batch_num = self.cache_key_arguments.pop("batch_num") - for j in range(batch_num): - cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j) - cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j) - out = transformer_block(*cache_positional_batch, **cache_keyword_batch) - out = self.track_hidden_states(out) - outs.append(out) - self.cache_key_arguments["batch_num"] = batch_num - if self.use_layer_wise: # pragma: no cover - self.gptq_related_blocks["transformers"][block_idx] = transformer_block - else: - self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() - del gptq_for_this_block torch.cuda.empty_cache() # iteratively replace the input with output, thus layerwise quantization can continue. From 83c6a9b2f5d879f450b3d23f764c0cdf5aaf6150 Mon Sep 17 00:00:00 2001 From: sdp Date: Thu, 11 Jul 2024 20:23:25 -0700 Subject: [PATCH 21/38] keep q_model in memory Signed-off-by: sdp --- .../torch/algorithms/layer_wise/utils.py | 18 ++--- .../torch/algorithms/weight_only/rtn.py | 78 ++++--------------- 2 files changed, 24 insertions(+), 72 deletions(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index 2722a891144..bbe59de3fe4 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -251,17 +251,13 @@ def hook(module, input): state_dict = None if os.path.exists(os.path.join(LWQ_WORKSPACE, f"{name}.pt")): state_dict = torch.load(os.path.join(LWQ_WORKSPACE, f"{name}.pt")) - if isinstance(module, WeightOnlyLinear): - for n, p in module._buffers.items(): - setattr(module, n, state_dict[n]) - else: - for n, p in module.named_parameters(): - param_name = name + "." + n - if state_dict: - value = state_dict[n] - else: - value = load_value(model, param_name, path) - set_module_tensor_to_device(model, param_name, device, value) + for n, p in module.named_parameters(): + param_name = name + "." 
+ n + if state_dict: + value = state_dict[n] + else: + value = load_value(model, param_name, path) + set_module_tensor_to_device(model, param_name, device, value) return hook diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 5143f22df60..5ac0deb594c 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -92,15 +92,11 @@ def convert( """ weight_config = self.quant_config device = get_accelerator(kwargs.pop("device", "auto")).current_device_name() - if use_layer_wise: - from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE, get_path, load_module - - os.makedirs(LWQ_WORKSPACE, exist_ok=True) # Put model on device explicitly # TODO: refine it later, Put module on device one by one instead of the whole model - #if not use_layer_wise: - # model.to(device) + if not use_layer_wise: + model.to(device) total_time = 0.0 total_load_time = 0.0 @@ -125,9 +121,21 @@ def convert( "double_quant_group_size": kwargs.get("double_quant_group_size", 256), } use_optimum_format = kwargs.get("use_optimum_format", True) + + if use_layer_wise: + from neural_compressor.common.utils import DEFAULT_WORKSPACE + from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module, register_weight_hooks + + if model_path == "": + model_path = model.path + assert model_path, "model_path should not be None." + model_path = get_path(model_path) + + register_weight_hooks(model, model_path, device=device, clean_weight=True) + for name, m in model.named_modules(): - if not isinstance(m, supported_layers): + if not isinstance(m, supported_layers): continue if name in weight_config: # pragma: no cover # initialize op configuration @@ -173,22 +181,10 @@ def convert( logger.debug(f"RTN quantized module:{name, m}") logger.debug(log_msg) - if use_layer_wise and True: - start_load = time.time() - from neural_compressor.common.utils import DEFAULT_WORKSPACE - from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE, get_path, load_module - os.makedirs(LWQ_WORKSPACE, exist_ok=True) - if model_path == "": - model_path = model.path - assert model_path, "model_path should not be None." - model_path = get_path(model_path) - - # load weight + + if use_layer_wise: load_module(model, name, model_path, device=device) - load_time = time.time() - start_load - total_load_time += load_time - logger.info(load_time) # for only group_dim is 0 or only `transformers.Conv1D`, we need transpose weight. 
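# A condensed sketch of the layer-wise RTN flow this hunk converges on (not the
# literal patch; `quantize_and_pack` is a hypothetical stand-in for the
# quant_tensor + WeightOnlyLinear.pack steps below):
#
#     register_weight_hooks(model, model_path, device=device, clean_weight=True)
#     for name, m in model.named_modules():
#         if not isinstance(m, supported_layers):
#             continue
#         load_module(model, name, model_path, device=device)  # materialize fp32 weight from disk
#         new_module = quantize_and_pack(m)                    # hypothetical: quant_tensor + pack
#         m = m.to_empty(device=torch.device("meta"))          # drop the fp32 copy, keep q_model in memory
#         set_module(model, name, new_module)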
if is_transformers_imported(): @@ -240,52 +236,12 @@ def convert( use_optimum_format=use_optimum_format, device=device, ) - if name in ["model.layers.11.mlp.up_proj", "model.layers.16.mlp.gate_proj"]: - print("will break") - #breakpoint() - logger.info(name) new_module.pack(int_weight, scale, zp, m.bias) if use_layer_wise: - # save and clean weight - from neural_compressor.torch.algorithms.layer_wise.utils import clean_module_weight - from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE, get_path, load_module - - import time - - start = time.time() - torch.save(new_module.state_dict(), os.path.join(LWQ_WORKSPACE, f"{name}.pt")) - save_time = time.time() - start - logger.info(f"save time {save_time}") - total_save_time += save_time - start = time.time() - #clean_module_weight(new_module) - new_module = new_module.to_empty(device=torch.device("meta")) m = m.to_empty(device=torch.device("meta")) - #clean_module_weight(m) - layer_time = time.time() - start - total_time += layer_time - logger.info(layer_time) if name == "": return new_module else: - start_set = time.time() set_module(model, name, new_module) - set_module_time = time.time() - start_set - total_set_module_time += set_module_time - quant_time = time.time() - start_quant - save_time - layer_time - logger.info(f"quant time {quant_time}") - total_quant_time += quant_time - - logger.info(f"load time: {total_load_time}") - logger.info(f"save time: {total_save_time}") - logger.info(f"clean time: {total_time}") - logger.info(f"quant time: {total_quant_time}") - logger.info(f"quant int time: {total_quant_int_time}") - logger.info(f"set module time: {total_set_module_time}") - if use_layer_wise: - # register hooks - from neural_compressor.torch.algorithms.layer_wise.utils import register_weight_hooks - - register_weight_hooks(model, model_path, device=device, clean_weight=True) return model From c543783eacb80207f49b50a4e7ca3366dc965a9e Mon Sep 17 00:00:00 2001 From: sdp Date: Thu, 11 Jul 2024 21:40:26 -0700 Subject: [PATCH 22/38] fix master conflict Signed-off-by: sdp --- .../torch/algorithms/weight_only/gptq.py | 3 ++- .../torch/algorithms/weight_only/rtn.py | 14 +++++++------- neural_compressor/torch/quantization/config.py | 10 +++++----- test/3x/torch/quantization/weight_only/test_rtn.py | 3 +-- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/gptq.py b/neural_compressor/torch/algorithms/weight_only/gptq.py index 82b73c4213b..89882f965b7 100644 --- a/neural_compressor/torch/algorithms/weight_only/gptq.py +++ b/neural_compressor/torch/algorithms/weight_only/gptq.py @@ -1045,7 +1045,8 @@ def convert(self, model, *args, **kwargs): self.gptq_quantizer.remove_prepare_for_calibration() q_model, gptq_config = self.gptq_quantizer.execute_quantization() - q_model = q_model.to(self.model_device) + if not self.gptq_quantizer.use_layer_wise: + q_model = q_model.to(self.model_device) q_model.gptq_config = gptq_config logger.info("GPTQ quantizing done.") return q_model diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 6a1de840b2d..0554fd74383 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -146,7 +146,8 @@ def convert( if dtype == "fp32": continue # Move modules to the accelerator device layer-by-layer - m.to(device) + if not use_layer_wise: + m.to(device) ### FP8 cast part if dtype in ["fp8_e5m2", 
"fp8_e5m2fnuz", "fp8_e4m3fn", "fp8_e4m3fnuz"]: logger.debug("Cast module {} to FP8 using qdq mode, no scaling".format(name)) @@ -200,7 +201,6 @@ def convert( weight = m.weight.detach() if use_mse_search: quantile = search_clip(m, bits, group_size, scheme, dtype, use_full_range) - start_quant = time.time() int_weight, scale, zp = quant_tensor( weight, dtype=dtype, @@ -212,8 +212,6 @@ def convert( full_range=use_full_range, **double_quant_config, ) - quant_int_time = time.time() - start_quant - total_quant_int_time += quant_int_time int_weight = int_weight.t_().contiguous() if transpose else int_weight scale = scale.t_().contiguous() if transpose else scale zp = zp.t_().contiguous() if transpose and zp is not None else zp @@ -248,7 +246,9 @@ def convert( else: set_module(model, name, new_module) # Move modules back to the model device layer-by-layer - m.to(model_device) - new_module.to(model_device) - model.to(model_device) + if not use_layer_wise: + m.to(model_device) + new_module.to(model_device) + if not use_layer_wise: + model.to(model_device) return model diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 9014f1576a3..1f60fe83647 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -200,7 +200,7 @@ def to_config_mapping( self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None ) -> OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: if not self.quant_lm_head: - self.set_local(LM_HEAD_NAMES, RTNConfig(dtype="fp32")) + self.set_local(LM_HEAD_NAMES, RTNConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path)) config_mapping = super().to_config_mapping(config_list, model_info) return config_mapping @@ -363,7 +363,7 @@ def to_config_mapping( self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None ) -> OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: if not self.quant_lm_head: - self.set_local(LM_HEAD_NAMES, GPTQConfig(dtype="fp32")) + self.set_local(LM_HEAD_NAMES, GPTQConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path)) config_mapping = super().to_config_mapping(config_list, model_info) return config_mapping @@ -385,7 +385,7 @@ def get_config_set_for_tuning(cls) -> Union[None, "GPTQConfig", List["GPTQConfig @classmethod def get_predefined_configs(cls) -> Dict[torch_utils.ProcessorType, "GPTQConfig"]: pre_defined_configs: Dict[torch_utils.ProcessorType, GPTQConfig] = {} - pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True) + pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True)#, model_path=self.model_path) pre_defined_configs[torch_utils.ProcessorType.Server] = cls() return pre_defined_configs @@ -508,7 +508,7 @@ def to_config_mapping( self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None ) -> OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: if not self.quant_lm_head: - self.set_local(LM_HEAD_NAMES, AWQConfig(dtype="fp32")) + self.set_local(LM_HEAD_NAMES, AWQConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path)) config_mapping = super().to_config_mapping(config_list, model_info) return config_mapping @@ -815,7 +815,7 @@ def get_config_set_for_tuning(cls) -> Union[None, "AutoRoundConfig", List["AutoR @classmethod def get_predefined_configs(cls) -> Dict[torch_utils.ProcessorType, 
"AutoRoundConfig"]: pre_defined_configs: Dict[torch_utils.ProcessorType, AutoRoundConfig] = {} - pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True) + pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True, model_path=self.model_path) pre_defined_configs[torch_utils.ProcessorType.Server] = cls() return pre_defined_configs diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index b3a379be15f..d4e1ae2f4e6 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -44,7 +44,7 @@ def setup_class(self): self.label = self.tiny_gptj(self.example_inputs)[0] # test_default_config model = copy.deepcopy(self.tiny_gptj) - quant_config = get_default_rtn_config() + quant_config = get_default_rtn_config("Server") model = prepare(model, quant_config) model = convert(model) # record q_label for comparison @@ -172,7 +172,6 @@ def test_layer_wise(self): model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") quant_config = RTNConfig( use_layer_wise=True, - model_path="hf-internal-testing/tiny-random-GPTJForCausalLM", ) model = prepare(model, quant_config) model = convert(model) From 159aa34d363f721593d84746dcb69eb479849e3e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jul 2024 04:43:16 +0000 Subject: [PATCH 23/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../torch/algorithms/weight_only/gptq.py | 4 +- .../torch/algorithms/weight_only/modules.py | 77 ++++++++++--------- .../torch/algorithms/weight_only/rtn.py | 10 +-- .../torch/quantization/config.py | 16 +++- 4 files changed, 58 insertions(+), 49 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/gptq.py b/neural_compressor/torch/algorithms/weight_only/gptq.py index 89882f965b7..eae9f7c3a84 100644 --- a/neural_compressor/torch/algorithms/weight_only/gptq.py +++ b/neural_compressor/torch/algorithms/weight_only/gptq.py @@ -581,7 +581,7 @@ def tmp(_, inp, out): state_dict = torch.load(LWQ_WORKSPACE + f"/{self.get_full_layer_name(layer_name, block_idx)}.pt") Q = state_dict["weight"].data bias = state_dict["bias"] if "bias" in state_dict.keys() else None - + else: Q = sub_layers[layer_name].weight.data if weight_config_this_layer["act_order"]: @@ -614,7 +614,7 @@ def tmp(_, inp, out): if not self.use_layer_wise: bias = sub_layers[layer_name].bias - + new_module = WeightOnlyLinear( in_features, out_features, diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py index 47d51560612..7b0aae9589b 100644 --- a/neural_compressor/torch/algorithms/weight_only/modules.py +++ b/neural_compressor/torch/algorithms/weight_only/modules.py @@ -19,11 +19,11 @@ # since the model classes inherit torch.nn.Module. 
import math +import numba import numpy as np import torch from torch.autograd import Function from torch.nn import functional as F -import numba from neural_compressor.torch.utils import accelerator, logger @@ -301,11 +301,11 @@ def unpack_tensor_with_torch(self, packed_tensor): unpacked_tensor[:, index].copy_(tmp.type(target_dtype)) accelerator.synchronize() return unpacked_tensor - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b4_c32( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -319,11 +319,11 @@ def pack_array_with_numba_b4_c32( | (raw_array[:, i * n_pack] & 0b1111) ) return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b4_c16( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -333,23 +333,20 @@ def pack_array_with_numba_b4_c16( | (raw_array[:, i * n_pack] & 0b1111) ) return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b4_c8( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): - packed_array[:, i] = ( - ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) - | (raw_array[:, i * n_pack] & 0b1111) - ) + packed_array[:, i] = ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) | (raw_array[:, i * n_pack] & 0b1111) return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b4_c64( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -372,11 +369,10 @@ def pack_array_with_numba_b4_c64( ) return packed_array - @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b8_c32( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -386,11 +382,11 @@ def pack_array_with_numba_b8_c32( | (raw_array[:, i * n_pack] & 0b11111111) ) return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b8_c16( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -400,20 +396,20 @@ def pack_array_with_numba_b8_c16( | (raw_array[:, i * n_pack] & 0b11111111) ) return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b8_c8( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): - packed_array[:, i] = (raw_array[:, i * n_pack] & 0b11111111) + packed_array[:, i] = 
raw_array[:, i * n_pack] & 0b11111111 return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b8_c64( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -427,11 +423,11 @@ def pack_array_with_numba_b8_c64( | (raw_array[:, i * n_pack] & 0b11111111) ) return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b2_c32( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -457,7 +453,7 @@ def pack_array_with_numba_b2_c32( @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b2_c16( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -471,11 +467,11 @@ def pack_array_with_numba_b2_c16( | (raw_array[:, i * n_pack] & 0b11) ) return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b2_c8( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -485,11 +481,11 @@ def pack_array_with_numba_b2_c8( | (raw_array[:, i * n_pack] & 0b11) ) return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b2_c64( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -527,7 +523,7 @@ def pack_array_with_numba_b2_c64( | (raw_array[:, i * n_pack] & 0b11) ) return packed_array - + def pack_array_with_numba( self, raw_array: np.ndarray, n_pack: int, bits: int, compress_bits: int, compression_dtype=np.int32 ) -> np.ndarray: @@ -547,17 +543,18 @@ def pack_array_with_numba( new_in_features = (in_features + n_pack - 1) // n_pack packed_array = np.zeros((out_features, new_in_features), dtype=compression_dtype) raw_array = raw_array.astype(compression_dtype) - + pack_method_name = f"pack_array_with_numba_b{bits}_c{compress_bits}" pack_method = getattr(self, pack_method_name) return pack_method(raw_array, packed_array, n_pack, new_in_features) - + @staticmethod @numba.jit(nopython=True) def pack_array_with_numba_yi( raw_tensor: np.ndarray, n_pack: int, bits: int, compression_dtype=np.int32 ) -> np.ndarray: """Packs the input tensor by combining elements into a specified bit-width format using NumPy. + Args: raw_tensor (np.ndarray): The tensor to be packed. Shape: [out_features, in_features] or [1, in_features]. n_pack (int): The number of elements to be packed together. 
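For context on the dispatch just reformatted above: pack_tensor_with_numpy now routes 2/4/8-bit weights to the specialized numba kernels and falls back to pack_tensor_with_reshape for any other bit width. A standalone NumPy sketch of that fallback (hypothetical names; masking added in the spirit of PATCH 19, and in_features assumed divisible by n_pack as in the method itself):

    import numpy as np

    def pack_with_reshape(raw, bits, compression_dtype=np.int32):
        n_pack = 8 * np.dtype(compression_dtype).itemsize // bits
        out_features, in_features = raw.shape
        # one row per packed group, then OR the shifted values position by position
        grouped = raw.astype(compression_dtype).reshape(-1, n_pack)
        packed = np.zeros(grouped.shape[0], dtype=compression_dtype)
        mask = (1 << bits) - 1
        for i in range(n_pack):
            packed |= (grouped[:, i] & mask) << (bits * i)
        return packed.reshape(out_features, in_features // n_pack)

    q = np.random.randint(0, 2**3, size=(4, 30), dtype=np.int64)  # e.g. 3-bit values
    assert pack_with_reshape(q, bits=3).shape == (4, 3)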
@@ -575,7 +572,7 @@ def pack_array_with_numba_yi( for i in range(new_in_features): packed_tensor[:, i] = ( (raw_tensor[:, i * n_pack + 7] << 28) - | (raw_tensor[:, i * n_pack + 6] << 24) + | (raw_tensor[:, i * n_pack + 6] << 24) | (raw_tensor[:, i * n_pack + 5] << 20) | (raw_tensor[:, i * n_pack + 4] << 16) | (raw_tensor[:, i * n_pack + 3] << 12) @@ -585,7 +582,7 @@ def pack_array_with_numba_yi( ) return packed_tensor - + def pack_tensor_with_reshape(self, raw_tensor): raw_array = raw_tensor.cpu().numpy() target_len = np.ceil(raw_array.shape[1] / self.n_pack).astype(int) @@ -593,9 +590,11 @@ def pack_tensor_with_reshape(self, raw_tensor): reshaped = raw_array.reshape(-1, self.n_pack) packed_array = np.zeros(reshaped.shape[0], dtype=target_dtype) for i in range(self.n_pack): - packed_array |= (reshaped[:, i].astype(target_dtype) << (self.bits * i)) - - packed_tensor = torch.from_numpy(packed_array.reshape((raw_array.shape[0], target_len))).to(device=raw_tensor.device) + packed_array |= reshaped[:, i].astype(target_dtype) << (self.bits * i) + + packed_tensor = torch.from_numpy(packed_array.reshape((raw_array.shape[0], target_len))).to( + device=raw_tensor.device + ) return packed_tensor def pack_tensor_with_numpy(self, raw_tensor): @@ -603,7 +602,9 @@ def pack_tensor_with_numpy(self, raw_tensor): return self.pack_tensor_with_reshape(raw_tensor) compression_dtype = torch.tensor(0, dtype=self.compression_dtype).numpy().dtype # packed_array = self.pack_array_with_numba_yi(raw_tensor.cpu().numpy(), self.n_pack, self.bits, compression_dtype) - packed_array = self.pack_array_with_numba(raw_tensor.cpu().numpy(), self.n_pack, self.bits, self.compress_bits, compression_dtype) + packed_array = self.pack_array_with_numba( + raw_tensor.cpu().numpy(), self.n_pack, self.bits, self.compress_bits, compression_dtype + ) return torch.from_numpy(packed_array).to(device=raw_tensor.device) def unpack_tensor_with_numpy(self, packed_tensor): diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 0554fd74383..c04327a62f4 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -124,7 +124,7 @@ def convert( "double_quant_group_size": kwargs.get("double_quant_group_size", 256), } use_optimum_format = kwargs.get("use_optimum_format", True) - + if use_layer_wise: from neural_compressor.common.utils import DEFAULT_WORKSPACE from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module, register_weight_hooks @@ -135,10 +135,10 @@ def convert( model_path = get_path(model_path) register_weight_hooks(model, model_path, device=device, clean_weight=True) - + for name, m in model.named_modules(): - - if not isinstance(m, supported_layers): + + if not isinstance(m, supported_layers): continue if name in weight_config: # pragma: no cover # initialize op configuration @@ -186,7 +186,7 @@ def convert( continue logger.debug(f"RTN quantized module:{name, m}") logger.debug(log_msg) - + if use_layer_wise: load_module(model, name, model_path, device=device) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 1f60fe83647..90e0119769e 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -200,7 +200,9 @@ def to_config_mapping( self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None ) -> OrderedDictType[Union[str, str], 
OrderedDictType[str, BaseConfig]]: if not self.quant_lm_head: - self.set_local(LM_HEAD_NAMES, RTNConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path)) + self.set_local( + LM_HEAD_NAMES, RTNConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path) + ) config_mapping = super().to_config_mapping(config_list, model_info) return config_mapping @@ -363,7 +365,9 @@ def to_config_mapping( self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None ) -> OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: if not self.quant_lm_head: - self.set_local(LM_HEAD_NAMES, GPTQConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path)) + self.set_local( + LM_HEAD_NAMES, GPTQConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path) + ) config_mapping = super().to_config_mapping(config_list, model_info) return config_mapping @@ -385,7 +389,9 @@ def get_config_set_for_tuning(cls) -> Union[None, "GPTQConfig", List["GPTQConfig @classmethod def get_predefined_configs(cls) -> Dict[torch_utils.ProcessorType, "GPTQConfig"]: pre_defined_configs: Dict[torch_utils.ProcessorType, GPTQConfig] = {} - pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True)#, model_path=self.model_path) + pre_defined_configs[torch_utils.ProcessorType.Client] = cls( + use_layer_wise=True + ) # , model_path=self.model_path) pre_defined_configs[torch_utils.ProcessorType.Server] = cls() return pre_defined_configs @@ -508,7 +514,9 @@ def to_config_mapping( self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None ) -> OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: if not self.quant_lm_head: - self.set_local(LM_HEAD_NAMES, AWQConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path)) + self.set_local( + LM_HEAD_NAMES, AWQConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path) + ) config_mapping = super().to_config_mapping(config_list, model_info) return config_mapping From 809c0fb2e66584043ac18b247d241bffb0f936a0 Mon Sep 17 00:00:00 2001 From: sdp Date: Thu, 11 Jul 2024 21:44:59 -0700 Subject: [PATCH 24/38] update numba requirements_pt Signed-off-by: sdp --- requirements_pt.txt | 1 + test/3x/torch/quantization/weight_only/test_gptq.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_pt.txt b/requirements_pt.txt index 94667b64665..0ed9fabde27 100644 --- a/requirements_pt.txt +++ b/requirements_pt.txt @@ -1,4 +1,5 @@ numpy < 2.0 +numba peft==0.10.0 prettytable psutil diff --git a/test/3x/torch/quantization/weight_only/test_gptq.py b/test/3x/torch/quantization/weight_only/test_gptq.py index 1974bc33222..93b09aec02b 100644 --- a/test/3x/torch/quantization/weight_only/test_gptq.py +++ b/test/3x/torch/quantization/weight_only/test_gptq.py @@ -176,7 +176,6 @@ def test_act_order(self): assert atol_false > atol_true, "act_order=True doesn't help accuracy, maybe is reasonable, please double check." 
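For orientation, the use_layer_wise/model_path options threaded through the RTN/GPTQ/AWQ configs above are driven from user code roughly as below. This is a hedged sketch, not part of the patch: the calibration loop, calib_batches, and the chosen model id are illustrative placeholders; only the prepare/convert flow and the GPTQConfig keywords already shown in this series are assumed.

    from neural_compressor.torch.quantization import GPTQConfig, prepare, convert

    # Layer-wise GPTQ sketch: per-layer state_dict is read from model_path instead of
    # keeping the whole fp32 model resident. Model id reused from the unit tests.
    quant_config = GPTQConfig(
        use_layer_wise=True,
        model_path="hf-internal-testing/tiny-random-GPTJForCausalLM",
    )
    model = prepare(model, quant_config)

    def run_fn(model):                # placeholder calibration function
        for batch in calib_batches:   # any small calibration set (assumed to exist)
            model(batch)

    run_fn(model)
    model = convert(model)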
def test_layer_wise(self): - # model = copy.deepcopy(self.tiny_gptj) model = copy.deepcopy(self.tiny_gptj) quant_config = GPTQConfig() model = prepare(model, quant_config) From 308c7fc6392312b2f8741019d815d49508c3ca22 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jul 2024 04:48:44 +0000 Subject: [PATCH 25/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- requirements_pt.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_pt.txt b/requirements_pt.txt index 0ed9fabde27..5f18aead98d 100644 --- a/requirements_pt.txt +++ b/requirements_pt.txt @@ -1,5 +1,5 @@ -numpy < 2.0 numba +numpy < 2.0 peft==0.10.0 prettytable psutil From 5d80e9bc005c9ec1b85ce9258ecd9a5c256e782a Mon Sep 17 00:00:00 2001 From: sdp Date: Thu, 11 Jul 2024 22:13:03 -0700 Subject: [PATCH 26/38] fix awq config Signed-off-by: sdp --- neural_compressor/torch/quantization/config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 90e0119769e..868c44c4746 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -444,6 +444,7 @@ def __init__( use_full_range: bool = False, use_mse_search: bool = False, use_layer_wise: bool = False, + model_path: str = "", # double quant use_double_quant: bool = False, double_quant_dtype: str = "int", @@ -469,6 +470,7 @@ def __init__( use_full_range (bool): Enables full range for activations, default is False. use_mse_search (bool): Enables mean squared error (MSE) search, default is False. use_layer_wise (bool): Enables quantize model per layer. Defaults to False. + model_path (str): Model path that is used to load state_dict per layer. use_double_quant (bool): Enables double quantization, default is False. double_quant_dtype (str): Data type for double_quant scale, default is "int". double_quant_bits (int): Number of bits used to represent double_quant scale, default is 4. @@ -489,6 +491,7 @@ def __init__( self.use_full_range = use_full_range self.use_mse_search = use_mse_search self.use_layer_wise = use_layer_wise + self.model_path = model_path # double quant self.use_double_quant = use_double_quant self.double_quant_bits = double_quant_bits From c4af34434d25df02d93af961749001ede51bb6cc Mon Sep 17 00:00:00 2001 From: sdp Date: Thu, 11 Jul 2024 22:54:14 -0700 Subject: [PATCH 27/38] remove pack_with_reshpe Signed-off-by: sdp --- .../torch/algorithms/weight_only/modules.py | 60 +++++-------------- 1 file changed, 14 insertions(+), 46 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py index 7b0aae9589b..503a469b0c7 100644 --- a/neural_compressor/torch/algorithms/weight_only/modules.py +++ b/neural_compressor/torch/algorithms/weight_only/modules.py @@ -548,60 +548,28 @@ def pack_array_with_numba( pack_method = getattr(self, pack_method_name) return pack_method(raw_array, packed_array, n_pack, new_in_features) - @staticmethod - @numba.jit(nopython=True) - def pack_array_with_numba_yi( - raw_tensor: np.ndarray, n_pack: int, bits: int, compression_dtype=np.int32 - ) -> np.ndarray: - """Packs the input tensor by combining elements into a specified bit-width format using NumPy. - - Args: - raw_tensor (np.ndarray): The tensor to be packed. Shape: [out_features, in_features] or [1, in_features]. 
- n_pack (int): The number of elements to be packed together. - bits (int): The number of bits for each element. - compression_dtype (np.dtype, optional): The data type of the compressed tensor. Defaults to np.int32. - Returns: - np.ndarray: The packed tensor. - """ - out_features, in_features = raw_tensor.shape - new_in_features = (in_features + n_pack - 1) // n_pack - packed_tensor = np.zeros((out_features, new_in_features), dtype=compression_dtype) - raw_tensor = raw_tensor.astype(compression_dtype) - - if bits == 4: - for i in range(new_in_features): - packed_tensor[:, i] = ( - (raw_tensor[:, i * n_pack + 7] << 28) - | (raw_tensor[:, i * n_pack + 6] << 24) - | (raw_tensor[:, i * n_pack + 5] << 20) - | (raw_tensor[:, i * n_pack + 4] << 16) - | (raw_tensor[:, i * n_pack + 3] << 12) - | (raw_tensor[:, i * n_pack + 2] << 8) - | (raw_tensor[:, i * n_pack + 1] << 4) - | raw_tensor[:, i * n_pack] - ) - - return packed_tensor - - def pack_tensor_with_reshape(self, raw_tensor): + def pack_tensor_with_numpy_impl(self, raw_tensor): raw_array = raw_tensor.cpu().numpy() target_len = np.ceil(raw_array.shape[1] / self.n_pack).astype(int) target_dtype = torch.tensor(0, dtype=self.compression_dtype).numpy().dtype - reshaped = raw_array.reshape(-1, self.n_pack) - packed_array = np.zeros(reshaped.shape[0], dtype=target_dtype) - for i in range(self.n_pack): - packed_array |= reshaped[:, i].astype(target_dtype) << (self.bits * i) - - packed_tensor = torch.from_numpy(packed_array.reshape((raw_array.shape[0], target_len))).to( - device=raw_tensor.device - ) + packed_array = np.zeros((raw_array.shape[0], target_len), dtype=target_dtype) + mask = np.uint8(2**self.bits - 1) + for j in range(packed_array.shape[1]): + start = self.n_pack * j + end = self.n_pack * (j + 1) + tmp = raw_array[:, start:end].astype(target_dtype) + tmp &= mask + for e in range(tmp.shape[1]): + tmp[:, e] = np.left_shift(tmp[:, e], self.bits * e) + packed_array[:, j] |= tmp[:, e] + accelerator.synchronize() + packed_tensor = torch.from_numpy(packed_array).to(device=raw_tensor.device) return packed_tensor def pack_tensor_with_numpy(self, raw_tensor): if self.bits not in [2, 4, 8]: - return self.pack_tensor_with_reshape(raw_tensor) + return self.pack_tensor_with_numpy_impl(raw_tensor) compression_dtype = torch.tensor(0, dtype=self.compression_dtype).numpy().dtype - # packed_array = self.pack_array_with_numba_yi(raw_tensor.cpu().numpy(), self.n_pack, self.bits, compression_dtype) packed_array = self.pack_array_with_numba( raw_tensor.cpu().numpy(), self.n_pack, self.bits, self.compress_bits, compression_dtype ) From e99ee19efd5ccb796adda546ac505ddba1ecb649 Mon Sep 17 00:00:00 2001 From: sdp Date: Thu, 11 Jul 2024 23:20:30 -0700 Subject: [PATCH 28/38] recover ar Signed-off-by: sdp --- neural_compressor/torch/quantization/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 868c44c4746..aaa1c5e60ca 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -826,7 +826,7 @@ def get_config_set_for_tuning(cls) -> Union[None, "AutoRoundConfig", List["AutoR @classmethod def get_predefined_configs(cls) -> Dict[torch_utils.ProcessorType, "AutoRoundConfig"]: pre_defined_configs: Dict[torch_utils.ProcessorType, AutoRoundConfig] = {} - pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True, model_path=self.model_path) + 
pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True) pre_defined_configs[torch_utils.ProcessorType.Server] = cls() return pre_defined_configs From 1dd01a0eaf09dd211ddf0228434b7a84ba7a27fa Mon Sep 17 00:00:00 2001 From: sdp Date: Thu, 11 Jul 2024 23:46:08 -0700 Subject: [PATCH 29/38] revert eg Signed-off-by: sdp --- .../weight_only/run_clm_no_trainer.py | 42 +++++-------------- 1 file changed, 11 insertions(+), 31 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index 96ecc12cdc3..abd8228354e 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -212,35 +212,17 @@ def get_user_model(): if args.quantize: # dataset - if 0: - user_model, tokenizer = get_user_model() - use_layer_wise =False - # user_model.save_pretrained("./saved",max_shard_size="20GB", safe_serialization=False) - else: - from neural_compressor.torch.algorithms.layer_wise import load_empty_model - user_model = load_empty_model(args.model) - # user_model = AutoModelForCausalLM.from_pretrained( - # args.model, - # #trust_remote_code=args.trust_remote_code, - # low_cpu_mem_usage=True, - # torch_dtype="auto" - # ) - #from accelerate import init_empty_weights, load_checkpoint_and_dispatch - #tokenizer = AutoTokenizer.from_pretrained(args.model) - # checkpoint_file = "/home/sdp/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/f5db02db724555f92da89c216ac04704f23d4590" - #checkpoint_file = "./saved" #if checkpoint_file in "./saved" else checkpoint_file - #user_model = load_checkpoint_and_dispatch(user_model, checkpoint=checkpoint_file, device_mp="auto", offload_folder=checkpoint_file) - use_layer_wise = True - #calib_dataset = load_dataset(args.dataset, split="train") + user_model, tokenizer = get_user_model() + calib_dataset = load_dataset(args.dataset, split="train") # calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/') # use this if trouble with connecting to HF - #calib_dataset = calib_dataset.shuffle(seed=args.seed) - #calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) - #calib_dataloader = DataLoader( - # calib_evaluator.dataset, - # batch_size=calib_size, - # shuffle=False, - # collate_fn=calib_evaluator.collate_batch, - #) + calib_dataset = calib_dataset.shuffle(seed=args.seed) + calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) + calib_dataloader = DataLoader( + calib_evaluator.dataset, + batch_size=calib_size, + shuffle=False, + collate_fn=calib_evaluator.collate_batch, + ) # 3.x api from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize @@ -273,9 +255,8 @@ def get_user_model(): double_quant_dtype=args.double_quant_dtype, double_quant_use_sym=args.double_quant_use_sym, double_quant_group_size=args.double_quant_group_size, - use_layer_wise=use_layer_wise, ) - quant_config.set_local("lm_head", RTNConfig(use_layer_wise=use_layer_wise, dtype="fp32")) + quant_config.set_local("lm_head", RTNConfig(dtype="fp32")) user_model = prepare(model=user_model, quant_config=quant_config) user_model = 
convert(model=user_model) elif args.woq_algo == "GPTQ": @@ -331,7 +312,6 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): run_fn_for_gptq(user_model, dataloader_for_calibration) user_model = convert(user_model) - exit(0) user_model.save(args.output_dir) From 8dbf793d308956a7e80d04cbdff2290e2704e916 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Fri, 12 Jul 2024 15:48:51 +0800 Subject: [PATCH 30/38] install py 3x deps Signed-off-by: chensuyue Signed-off-by: chensuyue --- .azure-pipelines/scripts/codeScan/pylint/pylint.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.azure-pipelines/scripts/codeScan/pylint/pylint.sh b/.azure-pipelines/scripts/codeScan/pylint/pylint.sh index 062bfe414ee..5631dcc0917 100644 --- a/.azure-pipelines/scripts/codeScan/pylint/pylint.sh +++ b/.azure-pipelines/scripts/codeScan/pylint/pylint.sh @@ -20,6 +20,7 @@ apt-get install -y --no-install-recommends --fix-missing \ build-essential pip install -r /neural-compressor/requirements.txt +pip install -r /neural-compressor/requirements_pt.txt pip install cmake pip install torch \ From 0ea77fd1ffdbf184abe7521a35f753079fb7bb54 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 14:02:57 +0800 Subject: [PATCH 31/38] enhance import&add pack ut Signed-off-by: Kaihui-intel --- neural_compressor/torch/__init__.py | 1 + test/3x/torch/quantization/weight_only/test_rtn.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/neural_compressor/torch/__init__.py b/neural_compressor/torch/__init__.py index 28f108cb636..fa59ad3b280 100644 --- a/neural_compressor/torch/__init__.py +++ b/neural_compressor/torch/__init__.py @@ -11,3 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from neural_compressor.torch.algorithms.layer_wise import load_empty_model \ No newline at end of file diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index d4e1ae2f4e6..293f11f6b8b 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -167,7 +167,7 @@ def test_quant_lm_head(self): ), "The tied lm_head weight is not deep copied, please check!" def test_layer_wise(self): - from neural_compressor.torch.algorithms.layer_wise import load_empty_model + from neural_compressor.torch import load_empty_model model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") quant_config = RTNConfig( From eec87ac28e2ef8f735b9e070d3cfbfbded4981f8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 06:08:15 +0000 Subject: [PATCH 32/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/__init__.py b/neural_compressor/torch/__init__.py index fa59ad3b280..72d063553fa 100644 --- a/neural_compressor/torch/__init__.py +++ b/neural_compressor/torch/__init__.py @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
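The load_empty_model re-export added above is the user-facing entry point for the layer-wise path: the model shell is created without materializing weights, and RTN then loads each layer's state_dict from model_path only while that layer is being quantized. A minimal sketch, assuming the import location introduced in this commit and reusing the model id from the tests; the keyword values are illustrative:

    from neural_compressor.torch import load_empty_model
    from neural_compressor.torch.quantization import RTNConfig, prepare, convert

    model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"
    model = load_empty_model(model_name)        # shell only, weights are not materialized here
    quant_config = RTNConfig(use_layer_wise=True, model_path=model_name)
    model = prepare(model, quant_config)
    model = convert(model)                      # per-layer weights are fetched from model_path during conversion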
-from neural_compressor.torch.algorithms.layer_wise import load_empty_model \ No newline at end of file +from neural_compressor.torch.algorithms.layer_wise import load_empty_model From 36a4a29173ad1b131adac82194208163444002de Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 14:13:11 +0800 Subject: [PATCH 33/38] add pack ut file Signed-off-by: Kaihui-intel --- .../algorithms/weight_only/test_woq_module.py | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 test/3x/torch/algorithms/weight_only/test_woq_module.py diff --git a/test/3x/torch/algorithms/weight_only/test_woq_module.py b/test/3x/torch/algorithms/weight_only/test_woq_module.py new file mode 100644 index 00000000000..3dbb4c77a9c --- /dev/null +++ b/test/3x/torch/algorithms/weight_only/test_woq_module.py @@ -0,0 +1,48 @@ +import pytest +import copy +import torch +from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear +from neural_compressor.torch.algorithms.weight_only.utility import quant_tensor +class TestWeightOnlyLinear: + @pytest.mark.parametrize( + "bits, compression_dtype", + [ + (8, torch.int8), + (8, torch.int16), + (8, torch.int32), + (8, torch.int64), + (4, torch.int8), + (4, torch.int16), + (4, torch.int32), + (4, torch.int64), + (2, torch.int8), + (2, torch.int16), + (2, torch.int32), + (2, torch.int64), + ], + ) + def test_pack_with_numba(self, bits, compression_dtype): + m = torch.nn.Linear(64, 32) + dtype = "int" + weight = m.weight.detach() + int_weight, scale, zp = quant_tensor( + weight, + dtype=dtype, + bits=bits, + return_int=True, + group_size=32, + ) + new_module = WeightOnlyLinear( + m.in_features, + m.out_features, + dtype=dtype, + bits=bits, + group_size=32, + zp=zp is not None, + bias=m.bias is not None, + use_optimum_format=False, + compression_dtype=compression_dtype, + ) + new_module.pack(int_weight, scale, zp, m.bias) + unpacked_int_weight = new_module.unpack_tensor(new_module.qweight) + assert torch.equal(unpacked_int_weight, int_weight) \ No newline at end of file From 86008f476f90f4d8c57c58c5f7059d29a9531488 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 06:15:49 +0000 Subject: [PATCH 34/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../algorithms/weight_only/test_woq_module.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/test/3x/torch/algorithms/weight_only/test_woq_module.py b/test/3x/torch/algorithms/weight_only/test_woq_module.py index 3dbb4c77a9c..0f06f358beb 100644 --- a/test/3x/torch/algorithms/weight_only/test_woq_module.py +++ b/test/3x/torch/algorithms/weight_only/test_woq_module.py @@ -1,8 +1,12 @@ -import pytest import copy + +import pytest import torch + from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear from neural_compressor.torch.algorithms.weight_only.utility import quant_tensor + + class TestWeightOnlyLinear: @pytest.mark.parametrize( "bits, compression_dtype", @@ -26,12 +30,12 @@ def test_pack_with_numba(self, bits, compression_dtype): dtype = "int" weight = m.weight.detach() int_weight, scale, zp = quant_tensor( - weight, - dtype=dtype, - bits=bits, - return_int=True, - group_size=32, - ) + weight, + dtype=dtype, + bits=bits, + return_int=True, + group_size=32, + ) new_module = WeightOnlyLinear( m.in_features, m.out_features, @@ -45,4 +49,4 @@ def test_pack_with_numba(self, bits, 
compression_dtype): ) new_module.pack(int_weight, scale, zp, m.bias) unpacked_int_weight = new_module.unpack_tensor(new_module.qweight) - assert torch.equal(unpacked_int_weight, int_weight) \ No newline at end of file + assert torch.equal(unpacked_int_weight, int_weight) From 93a86f296eb0e820608b3545b52f81e704ffb7b1 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 15:55:54 +0800 Subject: [PATCH 35/38] move load_empty_model to torch.utils Signed-off-by: Kaihui-intel --- neural_compressor/torch/utils/__init__.py | 1 + test/3x/torch/quantization/weight_only/test_gptq.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/neural_compressor/torch/utils/__init__.py b/neural_compressor/torch/utils/__init__.py index dab02a017c6..ca802ba2145 100644 --- a/neural_compressor/torch/utils/__init__.py +++ b/neural_compressor/torch/utils/__init__.py @@ -15,3 +15,4 @@ from .environ import * from .constants import * from .utility import * +from neural_compressor.torch.algorithms.layer_wise import load_empty_model \ No newline at end of file diff --git a/test/3x/torch/quantization/weight_only/test_gptq.py b/test/3x/torch/quantization/weight_only/test_gptq.py index 93b09aec02b..8608e1801a4 100644 --- a/test/3x/torch/quantization/weight_only/test_gptq.py +++ b/test/3x/torch/quantization/weight_only/test_gptq.py @@ -183,7 +183,7 @@ def test_layer_wise(self): model = convert(model) q_label = model(self.example_inputs)[0] - from neural_compressor.torch.algorithms.layer_wise import load_empty_model + from neural_compressor.torch.utils import load_empty_model model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") From 19b1c4d3fb464bec8b7a3aad6d28394969577ae2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 07:58:36 +0000 Subject: [PATCH 36/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/utils/__init__.py b/neural_compressor/torch/utils/__init__.py index ca802ba2145..25aadaa6d66 100644 --- a/neural_compressor/torch/utils/__init__.py +++ b/neural_compressor/torch/utils/__init__.py @@ -15,4 +15,4 @@ from .environ import * from .constants import * from .utility import * -from neural_compressor.torch.algorithms.layer_wise import load_empty_model \ No newline at end of file +from neural_compressor.torch.algorithms.layer_wise import load_empty_model From f17c64027a156c47d93ef11a048e853a7521e34c Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 16:14:41 +0800 Subject: [PATCH 37/38] remove torch import Signed-off-by: Kaihui-intel --- neural_compressor/torch/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/neural_compressor/torch/__init__.py b/neural_compressor/torch/__init__.py index 72d063553fa..28f108cb636 100644 --- a/neural_compressor/torch/__init__.py +++ b/neural_compressor/torch/__init__.py @@ -11,4 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
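The packing kernels and the round-trip test above all rely on the same bit-layout invariant: with bits-bit values in a compress_bits-bit container, n_pack = compress_bits // bits values occupy disjoint bit fields, element k sitting at bit offset bits * k. A standalone NumPy illustration of that layout (unsigned container chosen here for clarity; the in-tree kernels use signed dtypes and rely on two's-complement wraparound for the top field):

    import numpy as np

    bits, n_pack = 4, 8                                      # eight 4-bit values per 32-bit lane
    raw = np.array([[1, 2, 3, 4, 5, 6, 7, 15]], dtype=np.uint32)

    packed = np.zeros((1, 1), dtype=np.uint32)
    for k in range(n_pack):
        # mirrors the shift/OR chain in pack_array_with_numba_b4_c32
        packed[:, 0] |= (raw[:, k] & 0b1111) << (bits * k)

    unpacked = np.stack([(packed[:, 0] >> (bits * k)) & 0b1111 for k in range(n_pack)], axis=1)
    assert np.array_equal(unpacked, raw)                     # lossless round trip, as test_pack_with_numba checks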
-from neural_compressor.torch.algorithms.layer_wise import load_empty_model From fa39f6f041565a99f2422818249fa1a5842d6955 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 18:28:05 +0800 Subject: [PATCH 38/38] fix ut import Signed-off-by: Kaihui-intel --- test/3x/torch/quantization/weight_only/test_rtn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index 293f11f6b8b..cc4a0df6172 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -167,7 +167,7 @@ def test_quant_lm_head(self): ), "The tied lm_head weight is not deep copied, please check!" def test_layer_wise(self): - from neural_compressor.torch import load_empty_model + from neural_compressor.torch.utils import load_empty_model model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") quant_config = RTNConfig(