From c3a4bbcd17ae98a6234126c7be8e0b3aa1825698 Mon Sep 17 00:00:00 2001
From: n1ck-guo <heng.guo@intel.com>
Date: Wed, 3 Jul 2024 03:14:15 -0400
Subject: [PATCH 1/2] add new features for layer-wise quantization

Signed-off-by: n1ck-guo <heng.guo@intel.com>
---
 .../torch/algorithms/layer_wise/utils.py      | 172 +++++++++++++++++-
 1 file changed, 164 insertions(+), 8 deletions(-)

diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py
index 464a25cdee0..3258fef60d2 100644
--- a/neural_compressor/torch/algorithms/layer_wise/utils.py
+++ b/neural_compressor/torch/algorithms/layer_wise/utils.py
@@ -16,9 +16,13 @@
 # limitations under the License.
 """Utils for layer wise quantization."""
 
+import os
 import gc
 import json
-import os
+import pickle
+from functools import partial
+import logging
+from collections import OrderedDict
 
 import torch
 from accelerate import init_empty_weights
@@ -26,11 +30,12 @@
 from transformers import AutoConfig, AutoModelForCausalLM
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 
-from neural_compressor.common import options
-
 from .load import load
 
-LWQ_WORKSPACE = os.path.join(options.workspace, "layer_wise_tmp")
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(filename)s L%(lineno)d: %(message)s")
+logger = logging.getLogger("layer_wise_tools")
+
+LWQ_WORKSPACE = os.path.join("layer_wise_tmp")
 
 
 class QDQLayer(torch.nn.Module):
@@ -121,7 +126,7 @@ def dowload_hf_model(repo_id, cache_dir=None, repo_type=None, revision=None):
         return file_path
 
 
-def load_empty_model(pretrained_model_name_or_path, cls=AutoModelForCausalLM, **kwargs):
+def load_empty_model(pretrained_model_name_or_path, cls=AutoModelForCausalLM, save_path=None, **kwargs):
     """Load a empty model."""
     is_local = os.path.isdir(pretrained_model_name_or_path)
     if is_local:  # pragma: no cover
@@ -139,6 +144,10 @@ def load_empty_model(pretrained_model_name_or_path, cls=AutoModelForCausalLM, **
     model.tie_weights()
     model.eval()
     model.path = pretrained_model_name_or_path
+
+    if save_path is None:
+        save_path = LWQ_WORKSPACE
+    convert_model(model, save_path)
     return model
 
 
@@ -163,6 +172,40 @@ def update_module(model, module_name, new_module):
         setattr(super_module, module_name.split(".")[-1], new_module)
 
 
+def get_layers_before_block(model):
+    """get the embed layers before blocks."""
+    return_layers = []
+    block_name = None
+    def _forward(module, name, *args, **kwargs):
+        if name == block_name:
+        # if 'DecoderLayer' in name:
+            raise NotImplementedError
+        if len(module._modules) == 0:
+            return_layers.append((name, module))
+        return module.ori_forward(*args, **kwargs)
+
+    for n, m in model.named_modules():
+        if isinstance(m, torch.nn.ModuleList):
+            block_name = n + '.' + m.named_children().__next__()[0]
+        m.ori_forward = m.forward
+        m.forward = partial(_forward, m, n)
+    
+    try:
+        model.forward(
+            input_ids=torch.zeros((1,1), device='meta', dtype=torch.int),
+            attention_mask=torch.zeros((1,1), device='meta', dtype=torch.int)
+            )
+    except NotImplementedError:
+        pass
+
+    for n, m in model.named_modules():
+        m.forward = m.ori_forward
+        del m.ori_forward
+    
+    return return_layers
+
+
+
 def load_layer_wise_quantized_model(path):  # pragma: no cover
     """Load layer wise quantized model."""
     model = torch.load(os.path.join(path, "model_arch.pt"))
@@ -207,6 +250,8 @@ def load_tensor(path, tensor_name=None, prefix=None):
 
 
 def _get_path(pretrained_model_name_or_path):
+    if pretrained_model_name_or_path is None:
+        return None
     is_local = os.path.isdir(pretrained_model_name_or_path)
     if is_local:  # pragma: no cover
         path = pretrained_model_name_or_path
@@ -216,6 +261,7 @@ def _get_path(pretrained_model_name_or_path):
 
 
 def load_value(model, param_name, path):
+    logger.debug(f'load value for layer: {param_name}')
     if "lm_head" in param_name and getattr(model.config, "tie_word_embeddings", True):
         input_embeddings = model.get_input_embeddings()
         modules = get_named_children(model)
@@ -244,9 +290,10 @@ def register_weight_hooks(model, path, device="cpu", clean_weight=True, saved_pa
 
     def forward_pre_hook(name):
         def hook(module, input):
+            logger.debug(f"{name} forward hood load value")
             state_dict = None
-            if os.path.exists(os.path.join(LWQ_WORKSPACE, f"{name}.pt")):
-                state_dict = torch.load(os.path.join(LWQ_WORKSPACE, f"{name}.pt"))
+            if os.path.exists(os.path.join(saved_path, f"{name}.pt")):
+                state_dict = torch.load(os.path.join(saved_path, f"{name}.pt"))
             for n, p in module.named_parameters():
                 param_name = name + "." + n
                 if state_dict:
@@ -254,11 +301,13 @@ def hook(module, input):
                 else:
                     value = load_value(model, param_name, path)
                 set_module_tensor_to_device(model, param_name, device, value)
-
+            module = module.to(device)
+            
         return hook
 
     def forward_hook(name):
         def hook(module, input, output):
+            logger.debug(f"{name} forward hood clean value")
             if saved_path:
                 file_path = os.path.join(saved_path, f"{name}.pt")
                 torch.save(module.state_dict(), file_path)
@@ -294,3 +343,110 @@ def clean_module_weight(module):
                 new_value = param_cls(new_value, requires_grad=old_value.requires_grad, **kwargs).to("meta")
                 submodule._parameters[n] = new_value
     gc.collect()
+
+
+def convert_model(empty_model, saved_path=LWQ_WORKSPACE):
+    def _get_value(name, n):
+        state_dict = None
+        if os.path.exists(os.path.join(saved_path, f"{name}.pt")):
+            state_dict = torch.load(os.path.join(saved_path, f"{name}.pt"))
+        param_name = name + "." + n
+        if state_dict:
+            value = state_dict[n]
+        else:
+            value = load_value(empty_model, param_name, empty_model.path)
+        return value
+
+    def _update(module):
+        state_dict = None
+        if os.path.exists(os.path.join(saved_path, f"{name}.pt")):
+            state_dict = torch.load(os.path.join(saved_path, f"{name}.pt"))
+        for n, p in module.named_parameters():
+            if str(p.device) != 'meta':
+                continue
+            param_name = name + "." + n
+            if state_dict:
+                value = state_dict[n]
+            else:
+                value = load_value(empty_model, param_name, saved_path)
+            set_module_tensor_to_device(empty_model, param_name, 'cpu', value)
+        file_path = os.path.join(saved_path, f"{name}.pt")
+        torch.save(module.state_dict(), file_path)
+
+    def _layer_wise_to(module, name, device_or_dtype):
+        if isinstance(device_or_dtype, torch.dtype):
+            return module.ori_to(device_or_dtype)
+        elif len(module._modules) == 0:
+            # skip method type
+            if len(module._parameters) == 0 or module.weight.device.type != 'meta':
+                return module.ori_to(device_or_dtype)
+            else:
+                for n, _ in module.named_parameters():
+                    param_name = name + "." + n
+                    value = load_value(empty_model, param_name, empty_model.path)
+                    dtype = None
+                    if hasattr(module, 'dtype'):
+                        dtype = module.dtype
+                    set_module_tensor_to_device(module, n, device_or_dtype, value, dtype=dtype)
+                return module.ori_to(device_or_dtype)
+        else:
+            for n, m in module.named_children():
+                m.to(device_or_dtype)
+            return module
+
+    modules = get_named_children(empty_model)
+    for name, module in modules:
+        if hasattr(module, 'weight'):
+            # delattr(module, 'weight')
+            # module.weight = partial(_get_value, name, 'weight')()
+            module.get_weight = partial(_get_value, name, 'weight')
+        if hasattr(module, 'bias') and module.bias is not None:
+            module.get_bias = partial(_get_value, name, 'bias')
+        module.update = partial(_update, module)
+    
+    def _repalce_to(module, name):
+        if len(module._modules) > 0:
+            for n, m in module.named_children():
+                if len(name) > 0:
+                    n = name + '.' + n
+                _repalce_to(m, n)
+        module.ori_to = module.to
+        module.to = partial(_layer_wise_to, module, name)
+    _repalce_to(empty_model, '')
+
+def load_model_with_hooks(
+        pretrained_model_name_or_path,
+        cls=AutoModelForCausalLM,
+        device="cpu",
+        clean_weight=True,
+        saved_path=None, 
+        **kwargs):
+    if saved_path is None:
+        saved_path = LWQ_WORKSPACE
+    empty_model = load_empty_model(pretrained_model_name_or_path, cls=cls, **kwargs)
+    register_weight_hooks(empty_model, empty_model.path, device, clean_weight, saved_path)
+    return empty_model
+
+
+def layer_wise_save(model, path):
+    os.makedirs(path, exist_ok=True)
+    file_path = os.path.join(path, 'layer_wise_model.bin')
+    modules = get_named_children(model)
+    with open(file_path, 'wb') as f:
+        for name, module in modules:
+            output = OrderedDict()
+            if hasattr(module, "get_weight"):
+                output[f"{name}.weight"] = module.get_weight()
+            if hasattr(module, "get_bias"):
+                output[f"{name}.bias"] = module.get_bias()
+            output = pickle.dumps(output)
+            f.write(output + b'split_tag')
+
+def layer_wise_load(path):
+    file_path = os.path.join(path, 'layer_wise_model.bin')
+    state_dict = OrderedDict()
+    data = open(file_path, 'rb').read().split(b'split_tag')
+    for d in data:
+        if len(d) > 0:
+            d = pickle.loads(d)
+            state_dict.update(d)
\ No newline at end of file

From 71887bce2f3df47a7063829ff30c27701a4fb787 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 3 Jul 2024 07:18:35 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../torch/algorithms/layer_wise/utils.py      | 73 +++++++++----------
 1 file changed, 36 insertions(+), 37 deletions(-)

diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py
index 3258fef60d2..974d744f45d 100644
--- a/neural_compressor/torch/algorithms/layer_wise/utils.py
+++ b/neural_compressor/torch/algorithms/layer_wise/utils.py
@@ -16,13 +16,13 @@
 # limitations under the License.
 """Utils for layer wise quantization."""
 
-import os
 import gc
 import json
-import pickle
-from functools import partial
 import logging
+import os
+import pickle
 from collections import OrderedDict
+from functools import partial
 
 import torch
 from accelerate import init_empty_weights
@@ -173,12 +173,13 @@ def update_module(model, module_name, new_module):
 
 
 def get_layers_before_block(model):
-    """get the embed layers before blocks."""
+    """Get the embed layers before blocks."""
     return_layers = []
     block_name = None
+
     def _forward(module, name, *args, **kwargs):
         if name == block_name:
-        # if 'DecoderLayer' in name:
+            # if 'DecoderLayer' in name:
             raise NotImplementedError
         if len(module._modules) == 0:
             return_layers.append((name, module))
@@ -186,24 +187,23 @@ def _forward(module, name, *args, **kwargs):
 
     for n, m in model.named_modules():
         if isinstance(m, torch.nn.ModuleList):
-            block_name = n + '.' + m.named_children().__next__()[0]
+            block_name = n + "." + m.named_children().__next__()[0]
         m.ori_forward = m.forward
         m.forward = partial(_forward, m, n)
-    
+
     try:
         model.forward(
-            input_ids=torch.zeros((1,1), device='meta', dtype=torch.int),
-            attention_mask=torch.zeros((1,1), device='meta', dtype=torch.int)
-            )
+            input_ids=torch.zeros((1, 1), device="meta", dtype=torch.int),
+            attention_mask=torch.zeros((1, 1), device="meta", dtype=torch.int),
+        )
     except NotImplementedError:
         pass
 
     for n, m in model.named_modules():
         m.forward = m.ori_forward
         del m.ori_forward
-    
-    return return_layers
 
+    return return_layers
 
 
 def load_layer_wise_quantized_model(path):  # pragma: no cover
@@ -261,7 +261,7 @@ def _get_path(pretrained_model_name_or_path):
 
 
 def load_value(model, param_name, path):
-    logger.debug(f'load value for layer: {param_name}')
+    logger.debug(f"load value for layer: {param_name}")
     if "lm_head" in param_name and getattr(model.config, "tie_word_embeddings", True):
         input_embeddings = model.get_input_embeddings()
         modules = get_named_children(model)
@@ -302,7 +302,7 @@ def hook(module, input):
                     value = load_value(model, param_name, path)
                 set_module_tensor_to_device(model, param_name, device, value)
             module = module.to(device)
-            
+
         return hook
 
     def forward_hook(name):
@@ -362,14 +362,14 @@ def _update(module):
         if os.path.exists(os.path.join(saved_path, f"{name}.pt")):
             state_dict = torch.load(os.path.join(saved_path, f"{name}.pt"))
         for n, p in module.named_parameters():
-            if str(p.device) != 'meta':
+            if str(p.device) != "meta":
                 continue
             param_name = name + "." + n
             if state_dict:
                 value = state_dict[n]
             else:
                 value = load_value(empty_model, param_name, saved_path)
-            set_module_tensor_to_device(empty_model, param_name, 'cpu', value)
+            set_module_tensor_to_device(empty_model, param_name, "cpu", value)
         file_path = os.path.join(saved_path, f"{name}.pt")
         torch.save(module.state_dict(), file_path)
 
@@ -378,14 +378,14 @@ def _layer_wise_to(module, name, device_or_dtype):
             return module.ori_to(device_or_dtype)
         elif len(module._modules) == 0:
             # skip method type
-            if len(module._parameters) == 0 or module.weight.device.type != 'meta':
+            if len(module._parameters) == 0 or module.weight.device.type != "meta":
                 return module.ori_to(device_or_dtype)
             else:
                 for n, _ in module.named_parameters():
                     param_name = name + "." + n
                     value = load_value(empty_model, param_name, empty_model.path)
                     dtype = None
-                    if hasattr(module, 'dtype'):
+                    if hasattr(module, "dtype"):
                         dtype = module.dtype
                     set_module_tensor_to_device(module, n, device_or_dtype, value, dtype=dtype)
                 return module.ori_to(device_or_dtype)
@@ -396,31 +396,29 @@ def _layer_wise_to(module, name, device_or_dtype):
 
     modules = get_named_children(empty_model)
     for name, module in modules:
-        if hasattr(module, 'weight'):
+        if hasattr(module, "weight"):
             # delattr(module, 'weight')
             # module.weight = partial(_get_value, name, 'weight')()
-            module.get_weight = partial(_get_value, name, 'weight')
-        if hasattr(module, 'bias') and module.bias is not None:
-            module.get_bias = partial(_get_value, name, 'bias')
+            module.get_weight = partial(_get_value, name, "weight")
+        if hasattr(module, "bias") and module.bias is not None:
+            module.get_bias = partial(_get_value, name, "bias")
         module.update = partial(_update, module)
-    
+
     def _repalce_to(module, name):
         if len(module._modules) > 0:
             for n, m in module.named_children():
                 if len(name) > 0:
-                    n = name + '.' + n
+                    n = name + "." + n
                 _repalce_to(m, n)
         module.ori_to = module.to
         module.to = partial(_layer_wise_to, module, name)
-    _repalce_to(empty_model, '')
+
+    _repalce_to(empty_model, "")
+
 
 def load_model_with_hooks(
-        pretrained_model_name_or_path,
-        cls=AutoModelForCausalLM,
-        device="cpu",
-        clean_weight=True,
-        saved_path=None, 
-        **kwargs):
+    pretrained_model_name_or_path, cls=AutoModelForCausalLM, device="cpu", clean_weight=True, saved_path=None, **kwargs
+):
     if saved_path is None:
         saved_path = LWQ_WORKSPACE
     empty_model = load_empty_model(pretrained_model_name_or_path, cls=cls, **kwargs)
@@ -430,9 +428,9 @@ def load_model_with_hooks(
 
 def layer_wise_save(model, path):
     os.makedirs(path, exist_ok=True)
-    file_path = os.path.join(path, 'layer_wise_model.bin')
+    file_path = os.path.join(path, "layer_wise_model.bin")
     modules = get_named_children(model)
-    with open(file_path, 'wb') as f:
+    with open(file_path, "wb") as f:
         for name, module in modules:
             output = OrderedDict()
             if hasattr(module, "get_weight"):
@@ -440,13 +438,14 @@ def layer_wise_save(model, path):
             if hasattr(module, "get_bias"):
                 output[f"{name}.bias"] = module.get_bias()
             output = pickle.dumps(output)
-            f.write(output + b'split_tag')
+            f.write(output + b"split_tag")
+
 
 def layer_wise_load(path):
-    file_path = os.path.join(path, 'layer_wise_model.bin')
+    file_path = os.path.join(path, "layer_wise_model.bin")
     state_dict = OrderedDict()
-    data = open(file_path, 'rb').read().split(b'split_tag')
+    data = open(file_path, "rb").read().split(b"split_tag")
     for d in data:
         if len(d) > 0:
             d = pickle.loads(d)
-            state_dict.update(d)
\ No newline at end of file
+            state_dict.update(d)