From 4cf0620c6e4e4ccc5a26d5a1b72afd6d9d73156d Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 25 Jun 2024 16:12:35 +0800 Subject: [PATCH 01/38] support rtn & gptq(draft) Signed-off-by: Kaihui-intel --- .../torch/algorithms/layer_wise/utils.py | 2 + .../torch/algorithms/weight_only/gptq.py | 130 +++++++++++------- .../torch/algorithms/weight_only/modules.py | 3 +- .../torch/algorithms/weight_only/rtn.py | 40 +++++- .../torch/quantization/algorithm_entry.py | 1 + .../quantization/weight_only/test_gptq.py | 36 +++-- .../quantization/weight_only/test_rtn.py | 8 +- 7 files changed, 153 insertions(+), 67 deletions(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index 464a25cdee0..93c41fa9fc1 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -214,6 +214,8 @@ def _get_path(pretrained_model_name_or_path): path = dowload_hf_model(pretrained_model_name_or_path) return path +get_path = _get_path + def load_value(model, param_name, path): if "lm_head" in param_name and getattr(model.config, "tie_word_embeddings", True): diff --git a/neural_compressor/torch/algorithms/weight_only/gptq.py b/neural_compressor/torch/algorithms/weight_only/gptq.py index 4e2c19a8815..53217e35ee3 100644 --- a/neural_compressor/torch/algorithms/weight_only/gptq.py +++ b/neural_compressor/torch/algorithms/weight_only/gptq.py @@ -224,11 +224,13 @@ def __init__( # device self.device = get_accelerator(kwargs.pop("device", "auto")).current_device_name() - self.model.to(self.device) + if not use_layer_wise: + self.model.to(self.device) self.is_ready = False self.use_layer_wise = use_layer_wise - self.model_path = model_path + if use_layer_wise: + self.prepare_layer_wise(model_path) # dataloader self.use_max_length = use_max_length @@ -237,6 +239,18 @@ def __init__( self.dataloader = [] self.nsamples = nsamples + def prepare_layer_wise(self, model_path): + from neural_compressor.torch.algorithms.layer_wise import LWQ_WORKSPACE, get_path, register_weight_hooks + import os + os.makedirs(LWQ_WORKSPACE, exist_ok=True) + if model_path == "": + model_path = self.model.path + assert model_path, "model_path should not be None." + self.model_path = get_path(model_path) + register_weight_hooks( + self.model, self.model_path, device=self.device, clean_weight=True, saved_path=LWQ_WORKSPACE + ) + def get_full_layer_name(self, sub_layer_name, block_idx): transformer_name = self.gptq_related_blocks["transformers_name"] return ".".join([transformer_name, str(block_idx), sub_layer_name]) @@ -394,7 +408,6 @@ def execute_quantization(self, means=None, stds=None): # Step1: prepare quantization (calibration datasets) logger.info("Begin ====>") - model_path = self.model_path # Step2: run gptq quantization in a transformer block-wise manner. 
gptq_config = {} @@ -430,8 +443,8 @@ def execute_quantization(self, means=None, stds=None): weight_config_this_layer = self.get_layer_config(full_layer_name) if self.use_layer_wise: # pragma: no cover from neural_compressor.torch.algorithms.layer_wise import load_value - - W = load_value(self.model, full_layer_name + ".weight", model_path) + # import pdb; pdb.set_trace() + W = load_value(self.model, full_layer_name + ".weight", self.model_path) else: W = sub_layers[layer_name].weight.data.clone() @@ -467,12 +480,23 @@ def tmp(_, inp, out): weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx)) logger.info(f"Quantizing layer {layer_name}") if self.use_layer_wise: # pragma: no cover - from neural_compressor.torch.algorithms.layer_wise import load_value + from neural_compressor.torch.algorithms.layer_wise import load_value, set_module_tensor_to_device full_layer_name = self.get_full_layer_name(layer_name, block_idx) - W = load_value(self.model, full_layer_name + ".weight", model_path) + for n, p in sub_layers[layer_name].named_parameters(): + param_name = full_layer_name + "." + n + # breakpoint() + if n == "weight": + W = load_value(self.model, full_layer_name + ".weight", self.model_path) + else: + value = load_value(self.model, param_name, self.model_path) + set_module_tensor_to_device(self.model, param_name, self.device, value) + else: W = sub_layers[layer_name].weight.data.clone() + + + accelerator.mark_step() if "hpu" in self.device: W = W.to("cpu") @@ -484,30 +508,8 @@ def tmp(_, inp, out): act_order=weight_config_this_layer["act_order"], static_groups=weight_config_this_layer["static_groups"], ) - if self.use_layer_wise: # pragma: no cover - from neural_compressor.torch.algorithms.layer_wise import ( - LWQ_WORKSPACE, - clean_module_weight, - load_value, - set_module_tensor_to_device, - ) - - sub_layer = sub_layers[layer_name] - full_layer_name = self.get_full_layer_name(layer_name, block_idx) - for n, p in sub_layer.named_parameters(): - param_name = full_layer_name + "." 
+ n - if n == "weight": - set_module_tensor_to_device(self.model, param_name, self.device, Q) - else: - value = load_value(self.model, param_name, model_path) - set_module_tensor_to_device(self.model, param_name, self.device, value) - # sub_layer.weight.data = Q - torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") - clean_module_weight(sub_layer) - del Q - gc.collect() - else: - sub_layers[layer_name].weight.data = Q + + # Step 2.5: export to compressed model gptq_config[self.get_full_layer_name(layer_name, block_idx)] = {"scale": scale} if not weight_config_this_layer["sym"]: gptq_config[self.get_full_layer_name(layer_name, block_idx)]["zero"] = zp @@ -515,24 +517,7 @@ def tmp(_, inp, out): gptq_config[self.get_full_layer_name(layer_name, block_idx)]["perm"] = gptq_for_this_block[ layer_name ].perm - gptq_for_this_block[layer_name].free() - - # Step 2.5: replace output data with quantized weights - outs = [] - batch_num = self.cache_key_arguments.pop("batch_num") - for j in range(batch_num): - cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j) - cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j) - out = transformer_block(*cache_positional_batch, **cache_keyword_batch) - out = self.track_hidden_states(out) - outs.append(out) - self.cache_key_arguments["batch_num"] = batch_num - if self.use_layer_wise: # pragma: no cover - self.gptq_related_blocks["transformers"][block_idx] = transformer_block - else: - self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() - # Step 2.6: export to compressed model - for layer_name in sub_layers: + weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx)) gptq_scale = gptq_config[self.get_full_layer_name(layer_name, block_idx)]["scale"] if not weight_config_this_layer["sym"]: @@ -543,7 +528,6 @@ def tmp(_, inp, out): gptq_perm = gptq_config[self.get_full_layer_name(layer_name, block_idx)]["perm"] else: gptq_perm = None - Q = sub_layers[layer_name].weight.data if weight_config_this_layer["act_order"]: Q.copy_(Q[:, gptq_perm]) if is_transformers_imported() and isinstance(sub_layers[layer_name], transformers.Conv1D): @@ -584,7 +568,52 @@ def tmp(_, inp, out): device=self.device, ) new_module.pack(int_weight, gptq_scale, gptq_zp, sub_layers[layer_name].bias, gptq_perm) + + + if self.use_layer_wise: # pragma: no cover + from neural_compressor.torch.algorithms.layer_wise import ( + LWQ_WORKSPACE, + clean_module_weight, + load_value, + set_module_tensor_to_device, + ) + + # sub_layer = sub_layers[layer_name] + # full_layer_name = self.get_full_layer_name(layer_name, block_idx) + # for n, p in sub_layer.named_parameters(): + # param_name = full_layer_name + "." 
+ n + # # breakpoint() + # if n == "weight": + # set_module_tensor_to_device(self.model, param_name, self.device, Q) + # else: + # value = load_value(self.model, param_name, model_path) + # set_module_tensor_to_device(self.model, param_name, self.device, value) + # sub_layer.weight.data = Q + # torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") + torch.save(new_module.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") + clean_module_weight(new_module) + del Q + gc.collect() set_module(transformer_block, layer_name, new_module) + + gptq_for_this_block[layer_name].free() + + # Step 2.6: replace output data with quantized weights + outs = [] + batch_num = self.cache_key_arguments.pop("batch_num") + for j in range(batch_num): + cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j) + cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j) + out = transformer_block(*cache_positional_batch, **cache_keyword_batch) + out = self.track_hidden_states(out) + outs.append(out) + self.cache_key_arguments["batch_num"] = batch_num + if self.use_layer_wise: # pragma: no cover + self.gptq_related_blocks["transformers"][block_idx] = transformer_block + else: + self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() + + del gptq_for_this_block torch.cuda.empty_cache() # iteratively replace the input with output, thus layerwise quantization can continue. @@ -999,6 +1028,7 @@ def prepare( def convert(self, model, *args, **kwargs): self.gptq_quantizer.model = model self.gptq_quantizer.remove_prepare_for_calibration() + q_model, gptq_config = self.gptq_quantizer.execute_quantization() q_model.gptq_config = gptq_config logger.info("GPTQ quantizing done.") diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py index 18cf6e46e55..30c40cfa9c3 100644 --- a/neural_compressor/torch/algorithms/weight_only/modules.py +++ b/neural_compressor/torch/algorithms/weight_only/modules.py @@ -175,7 +175,8 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None): self.scales = self.scales.T.contiguous() self.qweight = self.qweight.T.contiguous() self.qzeros = self.qzeros.T.contiguous() - int_weight = int_weight.to(self.device) + if int_weight.device.type != "meta": + int_weight = int_weight.to(self.device) if self.use_optimum_format and zp is None: # to avoid overflow int_weight = int_weight.type(torch.int32) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index fc083191ffe..20040438c00 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -22,6 +22,7 @@ from collections import OrderedDict import torch +import gc from neural_compressor.torch.algorithms import Quantizer from neural_compressor.torch.utils import get_accelerator, is_transformers_imported, logger, set_module @@ -89,10 +90,6 @@ def convert( weight_config = self.quant_config device = get_accelerator(kwargs.pop("device", "auto")).current_device_name() - # Put model on device explicitly - # TODO: refine it later, Put module on device one by one instead of the whole model - model.to(device) - assert isinstance(model, torch.nn.Module), "only support torch module" if is_transformers_imported(): supported_layers = (torch.nn.Linear, transformers.Conv1D) @@ -130,6 +127,7 @@ def convert( use_full_range = 
weight_config[name]["use_full_range"] use_mse_search = weight_config[name]["use_mse_search"] use_layer_wise = weight_config[name]["use_layer_wise"] + model_path = weight_config[name]["model_path"] use_optimum_format = kwargs.get("use_optimum_format", True) # double quant config double_quant_config = { @@ -154,6 +152,24 @@ def convert( continue logger.debug(f"RTN quantized module:{name, m}") logger.debug(log_msg) + + if use_layer_wise: + from neural_compressor.common.utils import DEFAULT_WORKSPACE + from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module, load_value + import os + lwq_workspace = os.path.join(DEFAULT_WORKSPACE, "lwq_tmpdir") + os.makedirs(lwq_workspace, exist_ok=True) + model_path = get_path(model_path) + + # load weight + # breakpoint() + load_module(model, name, model_path, device=device) + # load_value(model, name + ".weight", model_path) + else: + # Put model on device explicitly + # TODO: refine it later, Put module on device one by one instead of the whole model + model.to(device) + # for only group_dim is 0 or only `transformers.Conv1D`, we need transpose weight. if is_transformers_imported(): transpose = (group_dim == 0) ^ (isinstance(m, transformers.Conv1D)) @@ -202,8 +218,24 @@ def convert( device=device, ) new_module.pack(int_weight, scale, zp, m.bias) + + # import pdb; pdb.set_trace() + if use_layer_wise: + # save and clean weight + from neural_compressor.torch.algorithms.layer_wise.utils import clean_module_weight + + torch.save(new_module.state_dict(), os.path.join(lwq_workspace, f"{name}.pt")) + clean_module_weight(new_module) + del m + gc.collect() if name == "": return new_module else: set_module(model, name, new_module) + + if use_layer_wise: + # register hooks + from neural_compressor.torch.algorithms.layer_wise.utils import register_weight_hooks + + register_weight_hooks(model, model_path, device=device, clean_weight=True) return model diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 733e4409b91..1850829104f 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -84,6 +84,7 @@ def rtn_entry( "use_full_range": quant_config.use_full_range, "use_mse_search": quant_config.use_mse_search, "use_layer_wise": quant_config.use_layer_wise, + "model_path": quant_config.model_path, "use_double_quant": quant_config.use_double_quant, "double_quant_dtype": quant_config.double_quant_dtype, "double_quant_bits": quant_config.double_quant_bits, diff --git a/test/3x/torch/quantization/weight_only/test_gptq.py b/test/3x/torch/quantization/weight_only/test_gptq.py index be408af2564..3f258204c75 100644 --- a/test/3x/torch/quantization/weight_only/test_gptq.py +++ b/test/3x/torch/quantization/weight_only/test_gptq.py @@ -28,8 +28,10 @@ def run_fn(model): # GPTQ uses ValueError to reduce computation when collecting input data of the first block # It's special for UTs, no need to add this wrapper in examples. 
with pytest.raises(ValueError): - model(torch.tensor([[10, 20, 30]], dtype=torch.long).to(device)) - model(torch.tensor([[40, 50, 60]], dtype=torch.long).to(device)) + # model(torch.tensor([[10, 20, 30]], dtype=torch.long).to(device)) + # model(torch.tensor([[40, 50, 60]], dtype=torch.long).to(device)) + model(torch.tensor([[10, 20, 30]], dtype=torch.long)) + model(torch.tensor([[40, 50, 60]], dtype=torch.long)) class TestGPTQQuant: @@ -170,14 +172,28 @@ def test_act_order(self): # compare atol, this case is an ideal case. assert atol_false > atol_true, "act_order=True doesn't help accuracy, maybe is reasonable, please double check." - # def test_layer_wise(self): - # model = copy.deepcopy(self.tiny_gptj) - # quant_config = GPTQConfig( - # use_layer_wise=True, - # ) - # model = quantize(model, quant_config, run_fn=run_fn) - # TODO: (Xin) not implemented - + def test_layer_wise(self): + # model = copy.deepcopy(self.tiny_gptj) + model = copy.deepcopy(self.tiny_gptj) + quant_config = GPTQConfig() + model = prepare(model, quant_config) + run_fn(model) + model = convert(model) + q_label = model(self.example_inputs)[0] + + from neural_compressor.torch.algorithms.layer_wise import load_empty_model + model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM", torchscript=True) + + + quant_config = GPTQConfig( + use_layer_wise=True, + model_path="hf-internal-testing/tiny-random-GPTJForCausalLM" + ) + model = quantize(model, quant_config, run_fn=run_fn) + out = model(self.example_inputs)[0] + atol_true = (out - q_label).amax() + print(out, atol_true) + @pytest.mark.parametrize("dtype", ["nf4", "int4"]) @pytest.mark.parametrize("double_quant_bits", [6]) @pytest.mark.parametrize("double_quant_group_size", [8, 256]) diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index f82185cc82e..889aa902b87 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -139,13 +139,17 @@ def test_mse_search(self): assert torch.allclose(atol_false, atol_true, atol=0.012), "atol is very close, double checked the logic." def test_layer_wise(self): - model = copy.deepcopy(self.tiny_gptj) + # model = copy.deepcopy(self.tiny_gptj) + from neural_compressor.torch.algorithms.layer_wise import load_empty_model + model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") quant_config = RTNConfig( use_layer_wise=True, + model_path="hf-internal-testing/tiny-random-GPTJForCausalLM", ) model = prepare(model, quant_config) model = convert(model) - # TODO: (Xin) not implemented + out = model(self.example_inputs)[0] + assert torch.equal(out, self.q_label), "use_layer_wise=True output should be same. Please double check." 
@pytest.mark.parametrize( "dtype", From a1d9e1045bb7485ffc82dd6f3eb2de928bec02a2 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 25 Jun 2024 16:53:02 +0800 Subject: [PATCH 02/38] clean code Signed-off-by: Kaihui-intel --- .../torch/algorithms/weight_only/gptq.py | 5 ----- .../torch/algorithms/weight_only/rtn.py | 18 ++++++++---------- .../torch/quantization/algorithm_entry.py | 3 +-- .../quantization/weight_only/test_gptq.py | 3 +-- 4 files changed, 10 insertions(+), 19 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/gptq.py b/neural_compressor/torch/algorithms/weight_only/gptq.py index 53217e35ee3..08745ddceaa 100644 --- a/neural_compressor/torch/algorithms/weight_only/gptq.py +++ b/neural_compressor/torch/algorithms/weight_only/gptq.py @@ -443,7 +443,6 @@ def execute_quantization(self, means=None, stds=None): weight_config_this_layer = self.get_layer_config(full_layer_name) if self.use_layer_wise: # pragma: no cover from neural_compressor.torch.algorithms.layer_wise import load_value - # import pdb; pdb.set_trace() W = load_value(self.model, full_layer_name + ".weight", self.model_path) else: W = sub_layers[layer_name].weight.data.clone() @@ -485,7 +484,6 @@ def tmp(_, inp, out): full_layer_name = self.get_full_layer_name(layer_name, block_idx) for n, p in sub_layers[layer_name].named_parameters(): param_name = full_layer_name + "." + n - # breakpoint() if n == "weight": W = load_value(self.model, full_layer_name + ".weight", self.model_path) else: @@ -495,8 +493,6 @@ def tmp(_, inp, out): else: W = sub_layers[layer_name].weight.data.clone() - - accelerator.mark_step() if "hpu" in self.device: W = W.to("cpu") @@ -568,7 +564,6 @@ def tmp(_, inp, out): device=self.device, ) new_module.pack(int_weight, gptq_scale, gptq_zp, sub_layers[layer_name].bias, gptq_perm) - if self.use_layer_wise: # pragma: no cover from neural_compressor.torch.algorithms.layer_wise import ( diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 20040438c00..fb823abea82 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -65,6 +65,7 @@ def convert( quantile=1.0, use_full_range=False, use_mse_search=False, + use_layer_wise=False, *args, **kwargs, ): @@ -90,6 +91,11 @@ def convert( weight_config = self.quant_config device = get_accelerator(kwargs.pop("device", "auto")).current_device_name() + # Put model on device explicitly + # TODO: refine it later, Put module on device one by one instead of the whole model + if not use_layer_wise: + model.to(device) + assert isinstance(model, torch.nn.Module), "only support torch module" if is_transformers_imported(): supported_layers = (torch.nn.Linear, transformers.Conv1D) @@ -126,7 +132,6 @@ def convert( group_dim = weight_config[name]["group_dim"] use_full_range = weight_config[name]["use_full_range"] use_mse_search = weight_config[name]["use_mse_search"] - use_layer_wise = weight_config[name]["use_layer_wise"] model_path = weight_config[name]["model_path"] use_optimum_format = kwargs.get("use_optimum_format", True) # double quant config @@ -162,14 +167,8 @@ def convert( model_path = get_path(model_path) # load weight - # breakpoint() load_module(model, name, model_path, device=device) - # load_value(model, name + ".weight", model_path) - else: - # Put model on device explicitly - # TODO: refine it later, Put module on device one by one instead of the whole model - model.to(device) - + # for only 
group_dim is 0 or only `transformers.Conv1D`, we need transpose weight. if is_transformers_imported(): transpose = (group_dim == 0) ^ (isinstance(m, transformers.Conv1D)) @@ -218,8 +217,7 @@ def convert( device=device, ) new_module.pack(int_weight, scale, zp, m.bias) - - # import pdb; pdb.set_trace() + if use_layer_wise: # save and clean weight from neural_compressor.torch.algorithms.layer_wise.utils import clean_module_weight diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 1850829104f..01b496ee9a3 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -83,7 +83,6 @@ def rtn_entry( "group_dim": quant_config.group_dim, "use_full_range": quant_config.use_full_range, "use_mse_search": quant_config.use_mse_search, - "use_layer_wise": quant_config.use_layer_wise, "model_path": quant_config.model_path, "use_double_quant": quant_config.use_double_quant, "double_quant_dtype": quant_config.double_quant_dtype, @@ -93,7 +92,7 @@ def rtn_entry( } quantizer = get_quantizer(model, quantizer_cls=RTNQuantizer, quant_config=weight_config) - model = quantizer.execute(model, mode=mode) + model = quantizer.execute(model, mode=mode, use_layer_wise=quant_config.use_layer_wise) model.qconfig = configs_mapping model.save = MethodType(save, model) postprocess_model(model, mode, quantizer) diff --git a/test/3x/torch/quantization/weight_only/test_gptq.py b/test/3x/torch/quantization/weight_only/test_gptq.py index 3f258204c75..447b0c03343 100644 --- a/test/3x/torch/quantization/weight_only/test_gptq.py +++ b/test/3x/torch/quantization/weight_only/test_gptq.py @@ -191,8 +191,7 @@ def test_layer_wise(self): ) model = quantize(model, quant_config, run_fn=run_fn) out = model(self.example_inputs)[0] - atol_true = (out - q_label).amax() - print(out, atol_true) + assert torch.equal(out, q_label), "use_layer_wise=True output should be same. Please double check." @pytest.mark.parametrize("dtype", ["nf4", "int4"]) @pytest.mark.parametrize("double_quant_bits", [6]) From b4e93f3625d240ff93e7e143c5739d2ed9c88d08 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 25 Jun 2024 16:58:17 +0800 Subject: [PATCH 03/38] clean gptq Signed-off-by: Kaihui-intel --- .../torch/algorithms/weight_only/gptq.py | 12 ------------ test/3x/torch/quantization/weight_only/test_gptq.py | 13 ++++++------- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/gptq.py b/neural_compressor/torch/algorithms/weight_only/gptq.py index 08745ddceaa..4b914e506b7 100644 --- a/neural_compressor/torch/algorithms/weight_only/gptq.py +++ b/neural_compressor/torch/algorithms/weight_only/gptq.py @@ -573,18 +573,6 @@ def tmp(_, inp, out): set_module_tensor_to_device, ) - # sub_layer = sub_layers[layer_name] - # full_layer_name = self.get_full_layer_name(layer_name, block_idx) - # for n, p in sub_layer.named_parameters(): - # param_name = full_layer_name + "." 
+ n - # # breakpoint() - # if n == "weight": - # set_module_tensor_to_device(self.model, param_name, self.device, Q) - # else: - # value = load_value(self.model, param_name, model_path) - # set_module_tensor_to_device(self.model, param_name, self.device, value) - # sub_layer.weight.data = Q - # torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") torch.save(new_module.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") clean_module_weight(new_module) del Q diff --git a/test/3x/torch/quantization/weight_only/test_gptq.py b/test/3x/torch/quantization/weight_only/test_gptq.py index 447b0c03343..432a5446bc9 100644 --- a/test/3x/torch/quantization/weight_only/test_gptq.py +++ b/test/3x/torch/quantization/weight_only/test_gptq.py @@ -28,10 +28,8 @@ def run_fn(model): # GPTQ uses ValueError to reduce computation when collecting input data of the first block # It's special for UTs, no need to add this wrapper in examples. with pytest.raises(ValueError): - # model(torch.tensor([[10, 20, 30]], dtype=torch.long).to(device)) - # model(torch.tensor([[40, 50, 60]], dtype=torch.long).to(device)) - model(torch.tensor([[10, 20, 30]], dtype=torch.long)) - model(torch.tensor([[40, 50, 60]], dtype=torch.long)) + model(torch.tensor([[10, 20, 30]], dtype=torch.long).to(device)) + model(torch.tensor([[40, 50, 60]], dtype=torch.long).to(device)) class TestGPTQQuant: @@ -182,14 +180,15 @@ def test_layer_wise(self): q_label = model(self.example_inputs)[0] from neural_compressor.torch.algorithms.layer_wise import load_empty_model - model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM", torchscript=True) - + model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") quant_config = GPTQConfig( use_layer_wise=True, model_path="hf-internal-testing/tiny-random-GPTJForCausalLM" ) - model = quantize(model, quant_config, run_fn=run_fn) + model = prepare(model, quant_config) + run_fn(model) + model = convert(model) out = model(self.example_inputs)[0] assert torch.equal(out, q_label), "use_layer_wise=True output should be same. Please double check." 
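
Note on usage (not part of the patch): the layer-wise tests above already show the intended flow, and the following is only a minimal sketch that reuses the APIs and the tiny test checkpoint referenced in the UTs (`load_empty_model`, `RTNConfig`/`GPTQConfig` with `use_layer_wise=True` and `model_path`, `prepare`/`convert`); it is not an additional supported entry point.

    import torch
    from neural_compressor.torch.algorithms.layer_wise import load_empty_model
    from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert

    # Shell model: weights stay on the "meta" device; real tensors are streamed
    # from model_path one module at a time while quantizing.
    model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"
    model = load_empty_model(model_name)

    # Layer-wise RTN: each supported layer is loaded from disk, quantized,
    # packed into a WeightOnlyLinear, saved to the LWQ workspace, then offloaded.
    quant_config = RTNConfig(use_layer_wise=True, model_path=model_name)
    model = prepare(model, quant_config)
    model = convert(model)

    # Layer-wise GPTQ follows the same pattern but needs a calibration run_fn
    # between prepare() and convert(), as in test_gptq.py::test_layer_wise:
    #   model = prepare(model, GPTQConfig(use_layer_wise=True, model_path=model_name))
    #   run_fn(model)          # feeds calibration batches
    #   model = convert(model)

    out = model(torch.tensor([[10, 20, 30]], dtype=torch.long))[0]
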
From a3a061e227285063d64d366136ffa1a71080f86b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 25 Jun 2024 09:03:20 +0000 Subject: [PATCH 04/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../torch/algorithms/layer_wise/utils.py | 1 + .../torch/algorithms/weight_only/gptq.py | 20 ++++++++++--------- .../torch/algorithms/weight_only/rtn.py | 14 +++++++------ .../quantization/weight_only/test_gptq.py | 12 +++++------ .../quantization/weight_only/test_rtn.py | 3 ++- 5 files changed, 27 insertions(+), 23 deletions(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index 93c41fa9fc1..f02c0d2de3a 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -214,6 +214,7 @@ def _get_path(pretrained_model_name_or_path): path = dowload_hf_model(pretrained_model_name_or_path) return path + get_path = _get_path diff --git a/neural_compressor/torch/algorithms/weight_only/gptq.py b/neural_compressor/torch/algorithms/weight_only/gptq.py index 4b914e506b7..30b4d07be6a 100644 --- a/neural_compressor/torch/algorithms/weight_only/gptq.py +++ b/neural_compressor/torch/algorithms/weight_only/gptq.py @@ -240,8 +240,10 @@ def __init__( self.nsamples = nsamples def prepare_layer_wise(self, model_path): - from neural_compressor.torch.algorithms.layer_wise import LWQ_WORKSPACE, get_path, register_weight_hooks import os + + from neural_compressor.torch.algorithms.layer_wise import LWQ_WORKSPACE, get_path, register_weight_hooks + os.makedirs(LWQ_WORKSPACE, exist_ok=True) if model_path == "": model_path = self.model.path @@ -250,7 +252,7 @@ def prepare_layer_wise(self, model_path): register_weight_hooks( self.model, self.model_path, device=self.device, clean_weight=True, saved_path=LWQ_WORKSPACE ) - + def get_full_layer_name(self, sub_layer_name, block_idx): transformer_name = self.gptq_related_blocks["transformers_name"] return ".".join([transformer_name, str(block_idx), sub_layer_name]) @@ -443,6 +445,7 @@ def execute_quantization(self, means=None, stds=None): weight_config_this_layer = self.get_layer_config(full_layer_name) if self.use_layer_wise: # pragma: no cover from neural_compressor.torch.algorithms.layer_wise import load_value + W = load_value(self.model, full_layer_name + ".weight", self.model_path) else: W = sub_layers[layer_name].weight.data.clone() @@ -489,10 +492,10 @@ def tmp(_, inp, out): else: value = load_value(self.model, param_name, self.model_path) set_module_tensor_to_device(self.model, param_name, self.device, value) - + else: W = sub_layers[layer_name].weight.data.clone() - + accelerator.mark_step() if "hpu" in self.device: W = W.to("cpu") @@ -504,7 +507,7 @@ def tmp(_, inp, out): act_order=weight_config_this_layer["act_order"], static_groups=weight_config_this_layer["static_groups"], ) - + # Step 2.5: export to compressed model gptq_config[self.get_full_layer_name(layer_name, block_idx)] = {"scale": scale} if not weight_config_this_layer["sym"]: @@ -513,7 +516,7 @@ def tmp(_, inp, out): gptq_config[self.get_full_layer_name(layer_name, block_idx)]["perm"] = gptq_for_this_block[ layer_name ].perm - + weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx)) gptq_scale = gptq_config[self.get_full_layer_name(layer_name, block_idx)]["scale"] if not weight_config_this_layer["sym"]: @@ 
-564,7 +567,7 @@ def tmp(_, inp, out): device=self.device, ) new_module.pack(int_weight, gptq_scale, gptq_zp, sub_layers[layer_name].bias, gptq_perm) - + if self.use_layer_wise: # pragma: no cover from neural_compressor.torch.algorithms.layer_wise import ( LWQ_WORKSPACE, @@ -595,8 +598,7 @@ def tmp(_, inp, out): self.gptq_related_blocks["transformers"][block_idx] = transformer_block else: self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() - - + del gptq_for_this_block torch.cuda.empty_cache() # iteratively replace the input with output, thus layerwise quantization can continue. diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index fb823abea82..dfe9d18522f 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -19,10 +19,10 @@ # limitations under the License. +import gc from collections import OrderedDict import torch -import gc from neural_compressor.torch.algorithms import Quantizer from neural_compressor.torch.utils import get_accelerator, is_transformers_imported, logger, set_module @@ -157,18 +157,20 @@ def convert( continue logger.debug(f"RTN quantized module:{name, m}") logger.debug(log_msg) - + if use_layer_wise: + import os + from neural_compressor.common.utils import DEFAULT_WORKSPACE from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module, load_value - import os + lwq_workspace = os.path.join(DEFAULT_WORKSPACE, "lwq_tmpdir") os.makedirs(lwq_workspace, exist_ok=True) model_path = get_path(model_path) - + # load weight load_module(model, name, model_path, device=device) - + # for only group_dim is 0 or only `transformers.Conv1D`, we need transpose weight. if is_transformers_imported(): transpose = (group_dim == 0) ^ (isinstance(m, transformers.Conv1D)) @@ -230,7 +232,7 @@ def convert( return new_module else: set_module(model, name, new_module) - + if use_layer_wise: # register hooks from neural_compressor.torch.algorithms.layer_wise.utils import register_weight_hooks diff --git a/test/3x/torch/quantization/weight_only/test_gptq.py b/test/3x/torch/quantization/weight_only/test_gptq.py index 432a5446bc9..13d25eff188 100644 --- a/test/3x/torch/quantization/weight_only/test_gptq.py +++ b/test/3x/torch/quantization/weight_only/test_gptq.py @@ -178,20 +178,18 @@ def test_layer_wise(self): run_fn(model) model = convert(model) q_label = model(self.example_inputs)[0] - + from neural_compressor.torch.algorithms.layer_wise import load_empty_model + model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") - - quant_config = GPTQConfig( - use_layer_wise=True, - model_path="hf-internal-testing/tiny-random-GPTJForCausalLM" - ) + + quant_config = GPTQConfig(use_layer_wise=True, model_path="hf-internal-testing/tiny-random-GPTJForCausalLM") model = prepare(model, quant_config) run_fn(model) model = convert(model) out = model(self.example_inputs)[0] assert torch.equal(out, q_label), "use_layer_wise=True output should be same. Please double check." 
- + @pytest.mark.parametrize("dtype", ["nf4", "int4"]) @pytest.mark.parametrize("double_quant_bits", [6]) @pytest.mark.parametrize("double_quant_group_size", [8, 256]) diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index 889aa902b87..6a8ac4bab96 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -141,6 +141,7 @@ def test_mse_search(self): def test_layer_wise(self): # model = copy.deepcopy(self.tiny_gptj) from neural_compressor.torch.algorithms.layer_wise import load_empty_model + model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") quant_config = RTNConfig( use_layer_wise=True, @@ -149,7 +150,7 @@ def test_layer_wise(self): model = prepare(model, quant_config) model = convert(model) out = model(self.example_inputs)[0] - assert torch.equal(out, self.q_label), "use_layer_wise=True output should be same. Please double check." + assert torch.equal(out, self.q_label), "use_layer_wise=True output should be same. Please double check." @pytest.mark.parametrize( "dtype", From 02ee1f8144820a3b6de9d8fcdd2d148dfa60ec9c Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 25 Jun 2024 17:05:39 +0800 Subject: [PATCH 05/38] del unused line Signed-off-by: Kaihui-intel --- test/3x/torch/quantization/weight_only/test_rtn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index 6a8ac4bab96..94f7c2954b1 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -139,7 +139,6 @@ def test_mse_search(self): assert torch.allclose(atol_false, atol_true, atol=0.012), "atol is very close, double checked the logic." 
def test_layer_wise(self): - # model = copy.deepcopy(self.tiny_gptj) from neural_compressor.torch.algorithms.layer_wise import load_empty_model model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") From 060ea50169712d9316698b78cb37bd3cb86777b6 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Wed, 26 Jun 2024 09:34:24 +0800 Subject: [PATCH 06/38] fix load import Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/layer_wise/load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/load.py b/neural_compressor/torch/algorithms/layer_wise/load.py index 09700044a8f..a883bfe3848 100644 --- a/neural_compressor/torch/algorithms/layer_wise/load.py +++ b/neural_compressor/torch/algorithms/layer_wise/load.py @@ -32,7 +32,7 @@ _open_zipfile_reader, ) -from neural_compressor.adaptor.torch_utils.layer_wise_quant import modified_pickle as pickle +from neural_compressor.torch.algorithms.layer_wise import modified_pickle as pickle from .utils import torch From 1a60731343c76cc2d9d4eddb3548286f96eb3944 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Wed, 26 Jun 2024 10:04:58 +0800 Subject: [PATCH 07/38] fix rtn model_path Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/weight_only/rtn.py | 5 ++++- .../torch/quantization/algorithm_entry.py | 10 +++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index dfe9d18522f..9738bc13846 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -66,6 +66,7 @@ def convert( use_full_range=False, use_mse_search=False, use_layer_wise=False, + model_path="", *args, **kwargs, ): @@ -132,7 +133,6 @@ def convert( group_dim = weight_config[name]["group_dim"] use_full_range = weight_config[name]["use_full_range"] use_mse_search = weight_config[name]["use_mse_search"] - model_path = weight_config[name]["model_path"] use_optimum_format = kwargs.get("use_optimum_format", True) # double quant config double_quant_config = { @@ -166,6 +166,9 @@ def convert( lwq_workspace = os.path.join(DEFAULT_WORKSPACE, "lwq_tmpdir") os.makedirs(lwq_workspace, exist_ok=True) + if model_path == "": + model_path = self.model.path + assert model_path, "model_path should not be None." 
model_path = get_path(model_path) # load weight diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 01b496ee9a3..678dc9a0a13 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -83,16 +83,20 @@ def rtn_entry( "group_dim": quant_config.group_dim, "use_full_range": quant_config.use_full_range, "use_mse_search": quant_config.use_mse_search, - "model_path": quant_config.model_path, "use_double_quant": quant_config.use_double_quant, "double_quant_dtype": quant_config.double_quant_dtype, "double_quant_bits": quant_config.double_quant_bits, "double_quant_scheme": "sym" if quant_config.double_quant_use_sym else "asym", "double_quant_group_size": quant_config.double_quant_group_size, } - + kwargs.update( + { + "use_layer_wise": quant_config.use_layer_wise, + "model_path": quant_config.model_path, + } + ) quantizer = get_quantizer(model, quantizer_cls=RTNQuantizer, quant_config=weight_config) - model = quantizer.execute(model, mode=mode, use_layer_wise=quant_config.use_layer_wise) + model = quantizer.execute(model, mode=mode, *args, **kwargs) model.qconfig = configs_mapping model.save = MethodType(save, model) postprocess_model(model, mode, quantizer) From 04e1923d7436b73bd61666e3a6696a0eecb42c05 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Jun 2024 02:07:34 +0000 Subject: [PATCH 08/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/quantization/algorithm_entry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 678dc9a0a13..07898ed1dd3 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -96,7 +96,7 @@ def rtn_entry( } ) quantizer = get_quantizer(model, quantizer_cls=RTNQuantizer, quant_config=weight_config) - model = quantizer.execute(model, mode=mode, *args, **kwargs) + model = quantizer.execute(model, mode=mode, *args, **kwargs) model.qconfig = configs_mapping model.save = MethodType(save, model) postprocess_model(model, mode, quantizer) From 8f27d4781e277cde11e4ffa749e63bba03ede567 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Wed, 26 Jun 2024 10:56:42 +0800 Subject: [PATCH 09/38] update rtn model Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/weight_only/rtn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 9738bc13846..4dcf046e05f 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -167,7 +167,7 @@ def convert( lwq_workspace = os.path.join(DEFAULT_WORKSPACE, "lwq_tmpdir") os.makedirs(lwq_workspace, exist_ok=True) if model_path == "": - model_path = self.model.path + model_path = model.path assert model_path, "model_path should not be None." 
model_path = get_path(model_path) From 5a3f0906c6ff03d33cd6182014400f1bad3a1014 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Wed, 26 Jun 2024 15:53:01 +0800 Subject: [PATCH 10/38] fix clean module Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/layer_wise/utils.py | 8 +++++++- neural_compressor/torch/algorithms/weight_only/rtn.py | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index f02c0d2de3a..bb1e2f8ebcc 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -278,12 +278,18 @@ def hook(module, input, output): return handle -def clean_module_weight(module): +def clean_module_weight(module, woq_type=False): if isinstance(module, QDQLayer): submodule = module.module else: submodule = module + if woq_type is True: + for n, m in submodule._buffers.items(): + old_value = getattr(submodule, n) + with torch.no_grad(): + submodule._buffers[n] = torch.zeros(old_value.shape, device="meta") + for n, m in submodule.named_parameters(): is_buffer = n in submodule._buffers old_value = getattr(submodule, n) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 4dcf046e05f..aa8f4f70ed9 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -228,7 +228,8 @@ def convert( from neural_compressor.torch.algorithms.layer_wise.utils import clean_module_weight torch.save(new_module.state_dict(), os.path.join(lwq_workspace, f"{name}.pt")) - clean_module_weight(new_module) + clean_module_weight(new_module, woq_type=True) + clean_module_weight(m) del m gc.collect() if name == "": From 14bd733bbf8abfe06f460ff68649fe0b0df5b8c9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Jun 2024 07:55:37 +0000 Subject: [PATCH 11/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/algorithms/layer_wise/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index bb1e2f8ebcc..095d27779aa 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -289,7 +289,7 @@ def clean_module_weight(module, woq_type=False): old_value = getattr(submodule, n) with torch.no_grad(): submodule._buffers[n] = torch.zeros(old_value.shape, device="meta") - + for n, m in submodule.named_parameters(): is_buffer = n in submodule._buffers old_value = getattr(submodule, n) From 4ce74db461e8eb6d34462084e388b8ba113773a6 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Wed, 26 Jun 2024 18:45:33 +0800 Subject: [PATCH 12/38] fix layerwise woq forward Signed-off-by: Kaihui-intel --- .../torch/algorithms/layer_wise/utils.py | 25 +++++++++++-------- .../torch/algorithms/weight_only/rtn.py | 2 +- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index bb1e2f8ebcc..8f4272597e4 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -25,12 
+25,13 @@ from accelerate.utils import set_module_tensor_to_device from transformers import AutoConfig, AutoModelForCausalLM from transformers.models.auto.auto_factory import _BaseAutoModelClass +from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear from neural_compressor.common import options from .load import load -LWQ_WORKSPACE = os.path.join(options.workspace, "layer_wise_tmp") +LWQ_WORKSPACE = os.path.join(options.workspace, "lwq_tmpdir") class QDQLayer(torch.nn.Module): @@ -250,13 +251,17 @@ def hook(module, input): state_dict = None if os.path.exists(os.path.join(LWQ_WORKSPACE, f"{name}.pt")): state_dict = torch.load(os.path.join(LWQ_WORKSPACE, f"{name}.pt")) - for n, p in module.named_parameters(): - param_name = name + "." + n - if state_dict: - value = state_dict[n] - else: - value = load_value(model, param_name, path) - set_module_tensor_to_device(model, param_name, device, value) + if isinstance(module, WeightOnlyLinear): + for n, p in module._buffers.items(): + setattr(module, n, state_dict[n]) + else: + for n, p in module.named_parameters(): + param_name = name + "." + n + if state_dict: + value = state_dict[n] + else: + value = load_value(model, param_name, path) + set_module_tensor_to_device(model, param_name, device, value) return hook @@ -278,13 +283,13 @@ def hook(module, input, output): return handle -def clean_module_weight(module, woq_type=False): +def clean_module_weight(module): if isinstance(module, QDQLayer): submodule = module.module else: submodule = module - if woq_type is True: + if isinstance(module, WeightOnlyLinear): for n, m in submodule._buffers.items(): old_value = getattr(submodule, n) with torch.no_grad(): diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index aa8f4f70ed9..bf539f46a6e 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -228,7 +228,7 @@ def convert( from neural_compressor.torch.algorithms.layer_wise.utils import clean_module_weight torch.save(new_module.state_dict(), os.path.join(lwq_workspace, f"{name}.pt")) - clean_module_weight(new_module, woq_type=True) + clean_module_weight(new_module) clean_module_weight(m) del m gc.collect() From b700d39617ba7b77b10f768211986429189ac3d0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Jun 2024 10:48:26 +0000 Subject: [PATCH 13/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/algorithms/layer_wise/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index bd1ee39998f..2722a891144 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -25,9 +25,9 @@ from accelerate.utils import set_module_tensor_to_device from transformers import AutoConfig, AutoModelForCausalLM from transformers.models.auto.auto_factory import _BaseAutoModelClass -from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear from neural_compressor.common import options +from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear from .load import load @@ -253,7 +253,7 @@ def hook(module, input): state_dict = 
torch.load(os.path.join(LWQ_WORKSPACE, f"{name}.pt")) if isinstance(module, WeightOnlyLinear): for n, p in module._buffers.items(): - setattr(module, n, state_dict[n]) + setattr(module, n, state_dict[n]) else: for n, p in module.named_parameters(): param_name = name + "." + n From 96d0e05ab04a47276aac3fb7ea2d618da345fb1f Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Wed, 26 Jun 2024 19:09:07 +0800 Subject: [PATCH 14/38] fix import Signed-off-by: Kaihui-intel --- neural_compressor/torch/algorithms/weight_only/rtn.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index bf539f46a6e..b1331050419 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -20,6 +20,7 @@ import gc +import os from collections import OrderedDict import torch @@ -159,13 +160,10 @@ def convert( logger.debug(log_msg) if use_layer_wise: - import os - from neural_compressor.common.utils import DEFAULT_WORKSPACE - from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module, load_value + from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module, LWQ_WORKSPACE - lwq_workspace = os.path.join(DEFAULT_WORKSPACE, "lwq_tmpdir") - os.makedirs(lwq_workspace, exist_ok=True) + os.makedirs(LWQ_WORKSPACE, exist_ok=True) if model_path == "": model_path = model.path assert model_path, "model_path should not be None." @@ -227,7 +225,7 @@ def convert( # save and clean weight from neural_compressor.torch.algorithms.layer_wise.utils import clean_module_weight - torch.save(new_module.state_dict(), os.path.join(lwq_workspace, f"{name}.pt")) + torch.save(new_module.state_dict(), os.path.join(LWQ_WORKSPACE, f"{name}.pt")) clean_module_weight(new_module) clean_module_weight(m) del m From 7b2d3268bf3d66ed22672b7d2bd2641bc3647dec Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Jun 2024 11:11:50 +0000 Subject: [PATCH 15/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/algorithms/weight_only/rtn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index b1331050419..faf7f43a3c3 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -161,7 +161,7 @@ def convert( if use_layer_wise: from neural_compressor.common.utils import DEFAULT_WORKSPACE - from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module, LWQ_WORKSPACE + from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE, get_path, load_module os.makedirs(LWQ_WORKSPACE, exist_ok=True) if model_path == "": From 77cde5c1d9e6014342029a61dd164ee0492a246c Mon Sep 17 00:00:00 2001 From: sdp Date: Tue, 2 Jul 2024 23:30:32 -0700 Subject: [PATCH 16/38] update clean module & add timestep Signed-off-by: sdp --- .../weight_only/run_clm_no_trainer.py | 42 ++++++++++---- .../torch/algorithms/weight_only/rtn.py | 58 +++++++++++++++++-- 2 files changed, 84 insertions(+), 16 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py 
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index 8655c47a8da..21964f5d5b2 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -212,17 +212,35 @@ def get_user_model(): if args.quantize: # dataset - user_model, tokenizer = get_user_model() - calib_dataset = load_dataset(args.dataset, split="train") + if 0: + user_model, tokenizer = get_user_model() + use_layer_wise =False + # user_model.save_pretrained("./saved",max_shard_size="20GB", safe_serialization=False) + else: + from neural_compressor.torch.algorithms.layer_wise import load_empty_model + user_model = load_empty_model(args.model) + # user_model = AutoModelForCausalLM.from_pretrained( + # args.model, + # #trust_remote_code=args.trust_remote_code, + # low_cpu_mem_usage=True, + # torch_dtype="auto" + # ) + #from accelerate import init_empty_weights, load_checkpoint_and_dispatch + #tokenizer = AutoTokenizer.from_pretrained(args.model) + # checkpoint_file = "/home/sdp/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/f5db02db724555f92da89c216ac04704f23d4590" + #checkpoint_file = "./saved" #if checkpoint_file in "./saved" else checkpoint_file + #user_model = load_checkpoint_and_dispatch(user_model, checkpoint=checkpoint_file, device_mp="auto", offload_folder=checkpoint_file) + use_layer_wise = True + #calib_dataset = load_dataset(args.dataset, split="train") # calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/') # use this if trouble with connecting to HF - calib_dataset = calib_dataset.shuffle(seed=args.seed) - calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) - calib_dataloader = DataLoader( - calib_evaluator.dataset, - batch_size=calib_size, - shuffle=False, - collate_fn=calib_evaluator.collate_batch, - ) + #calib_dataset = calib_dataset.shuffle(seed=args.seed) + #calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) + #calib_dataloader = DataLoader( + # calib_evaluator.dataset, + # batch_size=calib_size, + # shuffle=False, + # collate_fn=calib_evaluator.collate_batch, + #) # 3.x api from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize @@ -255,8 +273,9 @@ def get_user_model(): double_quant_dtype=args.double_quant_dtype, double_quant_use_sym=args.double_quant_use_sym, double_quant_group_size=args.double_quant_group_size, + use_layer_wise=use_layer_wise, ) - quant_config.set_local("lm_head", RTNConfig(dtype="fp32")) + quant_config.set_local("lm_head", RTNConfig(use_layer_wise=use_layer_wise, dtype="fp32")) user_model = prepare(model=user_model, quant_config=quant_config) user_model = convert(model=user_model) elif args.woq_algo == "GPTQ": @@ -315,6 +334,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): run_fn_for_gptq(user_model, dataloader_for_calibration) user_model = convert(user_model) + exit(0) user_model.save(args.output_dir) diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index faf7f43a3c3..676ab871bc0 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -92,12 +92,23 @@ def convert( """ 
weight_config = self.quant_config device = get_accelerator(kwargs.pop("device", "auto")).current_device_name() + if use_layer_wise: + from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE, get_path, load_module + + os.makedirs(LWQ_WORKSPACE, exist_ok=True) # Put model on device explicitly # TODO: refine it later, Put module on device one by one instead of the whole model - if not use_layer_wise: - model.to(device) + #if not use_layer_wise: + # model.to(device) + total_time = 0.0 + total_load_time = 0.0 + total_save_time = 0.0 + total_quant_time = 0.0 + total_quant_int_time = 0.0 + total_set_module_time = 0.0 + import time assert isinstance(model, torch.nn.Module), "only support torch module" if is_transformers_imported(): supported_layers = (torch.nn.Linear, transformers.Conv1D) @@ -113,6 +124,7 @@ def convert( } use_optimum_format = kwargs.get("use_optimum_format", True) for name, m in model.named_modules(): + if not isinstance(m, supported_layers): continue if name in weight_config: # pragma: no cover @@ -159,7 +171,8 @@ def convert( logger.debug(f"RTN quantized module:{name, m}") logger.debug(log_msg) - if use_layer_wise: + if use_layer_wise and True: + start_load = time.time() from neural_compressor.common.utils import DEFAULT_WORKSPACE from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE, get_path, load_module @@ -171,6 +184,9 @@ def convert( # load weight load_module(model, name, model_path, device=device) + load_time = time.time() - start_load + total_load_time += load_time + logger.info(load_time) # for only group_dim is 0 or only `transformers.Conv1D`, we need transpose weight. if is_transformers_imported(): @@ -183,6 +199,7 @@ def convert( weight = m.weight.detach() if use_mse_search: quantile = search_clip(m, bits, group_size, scheme, dtype, use_full_range) + start_quant = time.time() int_weight, scale, zp = quant_tensor( weight, dtype=dtype, @@ -194,6 +211,8 @@ def convert( full_range=use_full_range, **double_quant_config, ) + quant_int_time = time.time() - start_quant + total_quant_int_time += quant_int_time int_weight = int_weight.t_().contiguous() if transpose else int_weight scale = scale.t_().contiguous() if transpose else scale zp = zp.t_().contiguous() if transpose and zp is not None else zp @@ -219,22 +238,51 @@ def convert( use_optimum_format=use_optimum_format, device=device, ) + if name in ["model.layers.11.mlp.up_proj", "model.layers.16.mlp.gate_proj"]: + print("will break") + #breakpoint() + logger.info(name) new_module.pack(int_weight, scale, zp, m.bias) if use_layer_wise: # save and clean weight from neural_compressor.torch.algorithms.layer_wise.utils import clean_module_weight + from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE, get_path, load_module + + import time + start = time.time() torch.save(new_module.state_dict(), os.path.join(LWQ_WORKSPACE, f"{name}.pt")) - clean_module_weight(new_module) - clean_module_weight(m) + save_time = time.time() - start + logger.info(f"save time {save_time}") + total_save_time += save_time + start = time.time() + #clean_module_weight(new_module) + new_module = new_module.to_empty(device=torch.device("meta")) + m = m.to_empty(device=torch.device("meta")) + #clean_module_weight(m) + layer_time = time.time() - start + total_time += layer_time + logger.info(layer_time) del m gc.collect() if name == "": return new_module else: + start_set = time.time() set_module(model, name, new_module) + set_module_time = time.time() - start_set + total_set_module_time 
+= set_module_time + quant_time = time.time() - start_quant - save_time - layer_time + logger.info(f"quant time {quant_time}") + total_quant_time += quant_time + logger.info(f"load time: {total_load_time}") + logger.info(f"save time: {total_save_time}") + logger.info(f"clean time: {total_time}") + logger.info(f"quant time: {total_quant_time}") + logger.info(f"quant int time: {total_quant_int_time}") + logger.info(f"set module time: {total_set_module_time}") if use_layer_wise: # register hooks from neural_compressor.torch.algorithms.layer_wise.utils import register_weight_hooks From 6cf8ff3463b3f1b5e7e2806a9e9960a00d68aae7 Mon Sep 17 00:00:00 2001 From: sdp Date: Wed, 10 Jul 2024 22:00:01 -0700 Subject: [PATCH 17/38] add numba pack Signed-off-by: sdp --- .../torch/algorithms/weight_only/modules.py | 322 +++++++++++++++++- .../torch/algorithms/weight_only/rtn.py | 4 +- 2 files changed, 311 insertions(+), 15 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py index 30c40cfa9c3..1ad7339e314 100644 --- a/neural_compressor/torch/algorithms/weight_only/modules.py +++ b/neural_compressor/torch/algorithms/weight_only/modules.py @@ -23,6 +23,7 @@ import torch from torch.autograd import Function from torch.nn import functional as F +import numba from neural_compressor.torch.utils import accelerator, logger @@ -300,25 +301,320 @@ def unpack_tensor_with_torch(self, packed_tensor): unpacked_tensor[:, index].copy_(tmp.type(target_dtype)) accelerator.synchronize() return unpacked_tensor + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b4_c32( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 7] << 28) + | (raw_array[:, i * n_pack + 6] << 24) + | (raw_array[:, i * n_pack + 5] << 20) + | (raw_array[:, i * n_pack + 4] << 16) + | (raw_array[:, i * n_pack + 3] << 12) + | (raw_array[:, i * n_pack + 2] << 8) + | (raw_array[:, i * n_pack + 1] << 4) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b4_c16( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 3] << 12) + | (raw_array[:, i * n_pack + 2] << 8) + | (raw_array[:, i * n_pack + 1] << 4) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b4_c8( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 1] << 4) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b4_c64( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 15] << 60) + | (raw_array[:, i * n_pack + 14] << 56) + | (raw_array[:, i * n_pack + 13] << 52) + | (raw_array[:, i * n_pack + 12] << 48) + | (raw_array[:, i * n_pack + 11] << 44) + | (raw_array[:, i * n_pack + 10] << 40) + | (raw_array[:, i * n_pack + 9] << 36) + | (raw_array[:, i * n_pack + 8] 
<< 32) + | (raw_array[:, i * n_pack + 7] << 28) + | (raw_array[:, i * n_pack + 6] << 24) + | (raw_array[:, i * n_pack + 5] << 20) + | (raw_array[:, i * n_pack + 4] << 16) + | (raw_array[:, i * n_pack + 3] << 12) + | (raw_array[:, i * n_pack + 2] << 8) + | (raw_array[:, i * n_pack + 1] << 4) + | raw_array[:, i * n_pack] + ) + return packed_array - def pack_tensor_with_numpy(self, raw_tensor): + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b8_c32( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 3] << 24) + | (raw_array[:, i * n_pack + 2] << 16) + | (raw_array[:, i * n_pack + 1] << 8) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b8_c16( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 3] << 24) + | (raw_array[:, i * n_pack + 2] << 16) + | (raw_array[:, i * n_pack + 1] << 8) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b8_c8( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = raw_array[:, i * n_pack] + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b8_c64( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 7] << 56) + | (raw_array[:, i * n_pack + 6] << 48) + | (raw_array[:, i * n_pack + 5] << 40) + | (raw_array[:, i * n_pack + 4] << 32) + | (raw_array[:, i * n_pack + 3] << 24) + | (raw_array[:, i * n_pack + 2] << 16) + | (raw_array[:, i * n_pack + 1] << 8) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b2_c32( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b2_c32( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 15] << 30) + | (raw_array[:, i * n_pack + 14] << 28) + | (raw_array[:, i * n_pack + 13] << 26) + | (raw_array[:, i * n_pack + 12] << 24) + | (raw_array[:, i * n_pack + 11] << 22) + | (raw_array[:, i * n_pack + 10] << 20) + | (raw_array[:, i * n_pack + 9] << 18) + | (raw_array[:, i * n_pack + 8] << 16) + | (raw_array[:, i * n_pack + 7] << 14) + | (raw_array[:, i * n_pack + 6] << 12) + | (raw_array[:, i * n_pack + 5] << 10) + | (raw_array[:, i * n_pack + 4] << 8) + | (raw_array[:, i * n_pack + 3] << 6) + | (raw_array[:, i * n_pack + 2] << 4) + | (raw_array[:, i * n_pack + 1] << 2) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b2_c16( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + 
packed_array[:, i] = ( + (raw_array[:, i * n_pack + 8] << 16) + | (raw_array[:, i * n_pack + 7] << 14) + | (raw_array[:, i * n_pack + 6] << 12) + | (raw_array[:, i * n_pack + 5] << 10) + | (raw_array[:, i * n_pack + 4] << 8) + | (raw_array[:, i * n_pack + 3] << 6) + | (raw_array[:, i * n_pack + 2] << 4) + | (raw_array[:, i * n_pack + 1] << 2) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b2_c8( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 3] << 6) + | (raw_array[:, i * n_pack + 2] << 4) + | (raw_array[:, i * n_pack + 1] << 2) + | raw_array[:, i * n_pack] + ) + return packed_array + + @staticmethod + @numba.jit(nopython=True, parallel=True) + def pack_array_with_numba_b2_c64( + raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + ) -> np.ndarray: + for i in range(new_in_features): + packed_array[:, i] = ( + (raw_array[:, i * n_pack + 31] << 62) + | (raw_array[:, i * n_pack + 30] << 60) + | (raw_array[:, i * n_pack + 29] << 58) + | (raw_array[:, i * n_pack + 28] << 56) + | (raw_array[:, i * n_pack + 27] << 54) + | (raw_array[:, i * n_pack + 26] << 52) + | (raw_array[:, i * n_pack + 25] << 50) + | (raw_array[:, i * n_pack + 24] << 48) + | (raw_array[:, i * n_pack + 23] << 46) + | (raw_array[:, i * n_pack + 22] << 44) + | (raw_array[:, i * n_pack + 21] << 42) + | (raw_array[:, i * n_pack + 20] << 40) + | (raw_array[:, i * n_pack + 19] << 38) + | (raw_array[:, i * n_pack + 18] << 36) + | (raw_array[:, i * n_pack + 17] << 34) + | (raw_array[:, i * n_pack + 16] << 32) + | (raw_array[:, i * n_pack + 15] << 30) + | (raw_array[:, i * n_pack + 14] << 28) + | (raw_array[:, i * n_pack + 13] << 26) + | (raw_array[:, i * n_pack + 12] << 24) + | (raw_array[:, i * n_pack + 11] << 22) + | (raw_array[:, i * n_pack + 10] << 20) + | (raw_array[:, i * n_pack + 9] << 18) + | (raw_array[:, i * n_pack + 8] << 16) + | (raw_array[:, i * n_pack + 7] << 14) + | (raw_array[:, i * n_pack + 6] << 12) + | (raw_array[:, i * n_pack + 5] << 10) + | (raw_array[:, i * n_pack + 4] << 8) + | (raw_array[:, i * n_pack + 3] << 6) + | (raw_array[:, i * n_pack + 2] << 4) + | (raw_array[:, i * n_pack + 1] << 2) + | raw_array[:, i * n_pack] + ) + return packed_array + + def pack_array_with_numba1( + self, raw_array: np.ndarray, n_pack: int, bits: int, compress_bits: int, compression_dtype=np.int32 + ) -> np.ndarray: + """Packs the input array by combining elements into a specified bit-width format using NumPy. + + Args: + raw_array (np.ndarray): The array to be packed. Shape: [out_features, in_features] or [1, in_features]. + n_pack (int): The number of elements to be packed together. + bits (int): The number of bits for each element. + compress_bits (int): The number of bits for each element of the compressed array, supported 2, 4, 8. + compression_dtype (np.dtype, optional): The data type of the compressed array. Defaults to np.int32. + + Returns: + np.ndarray: The packed array. 
+ """ + out_features, in_features = raw_array.shape + new_in_features = (in_features + n_pack - 1) // n_pack + packed_array = np.zeros((out_features, new_in_features), dtype=compression_dtype) + raw_array = raw_array.astype(compression_dtype) + + pack_method_name = f"pack_array_with_numba_b{bits}_c{compress_bits}" + pack_method = getattr(self, pack_method_name) + return pack_method(raw_array, packed_array, n_pack, new_in_features) + + @staticmethod + @numba.jit(nopython=True) + def pack_array_with_numba( + raw_tensor: np.ndarray, n_pack: int, bits: int, compression_dtype=np.int32 + ) -> np.ndarray: + """Packs the input tensor by combining elements into a specified bit-width format using NumPy. + Args: + raw_tensor (np.ndarray): The tensor to be packed. Shape: [out_features, in_features] or [1, in_features]. + n_pack (int): The number of elements to be packed together. + bits (int): The number of bits for each element. + compression_dtype (np.dtype, optional): The data type of the compressed tensor. Defaults to np.int32. + Returns: + np.ndarray: The packed tensor. + """ + out_features, in_features = raw_tensor.shape + new_in_features = (in_features + n_pack - 1) // n_pack + packed_tensor = np.zeros((out_features, new_in_features), dtype=compression_dtype) + raw_tensor = raw_tensor.astype(compression_dtype) + + if bits == 4: + for i in range(new_in_features): + packed_tensor[:, i] = ( + (raw_tensor[:, i * n_pack + 7] << 28) + | (raw_tensor[:, i * n_pack + 6] << 24) + | (raw_tensor[:, i * n_pack + 5] << 20) + | (raw_tensor[:, i * n_pack + 4] << 16) + | (raw_tensor[:, i * n_pack + 3] << 12) + | (raw_tensor[:, i * n_pack + 2] << 8) + | (raw_tensor[:, i * n_pack + 1] << 4) + | raw_tensor[:, i * n_pack] + ) + + return packed_tensor + + def pack_tensor_with_reshape(self, raw_tensor): raw_array = raw_tensor.cpu().numpy() target_len = np.ceil(raw_array.shape[1] / self.n_pack).astype(int) target_dtype = torch.tensor(0, dtype=self.compression_dtype).numpy().dtype - packed_array = np.zeros((raw_array.shape[0], target_len), dtype=target_dtype) - mask = np.uint8(2**self.bits - 1) - for j in range(packed_array.shape[1]): - start = self.n_pack * j - end = self.n_pack * (j + 1) - tmp = raw_array[:, start:end].astype(target_dtype) - tmp &= mask - for e in range(tmp.shape[1]): - tmp[:, e] = np.left_shift(tmp[:, e], self.bits * e) - packed_array[:, j] |= tmp[:, e] - accelerator.synchronize() - packed_tensor = torch.from_numpy(packed_array).to(device=raw_tensor.device) + reshaped = raw_array.reshape(-1, self.n_pack) + packed_array = np.zeros(reshaped.shape[0], dtype=target_dtype) + for i in range(self.n_pack): + packed_array |= (reshaped[:, i].astype(target_dtype) << (self.bits * i)) + + packed_tensor = torch.from_numpy(packed_array.reshape((raw_array.shape[0], target_len))).to(device=raw_tensor.device) return packed_tensor + def pack_tensor_with_numpy(self, raw_tensor): + # breakpoint() + if self.bits not in [2, 4, 8]: + return self.pack_tensor_with_reshape(raw_tensor) + compression_dtype = torch.tensor(0, dtype=self.compression_dtype).numpy().dtype + packed_array = self.pack_array_with_numba(raw_tensor.cpu().numpy(), self.n_pack, self.bits, compression_dtype) + # packed_array = self.pack_array_with_numba(raw_tensor.cpu().numpy(), self.n_pack, self.bits, self.compress_bits, compression_dtype) + return torch.from_numpy(packed_array).to(device=raw_tensor.device) + def unpack_tensor_with_numpy(self, packed_tensor): packed_array = packed_tensor.cpu().numpy() target_dtype = np.int8 if not hasattr(self, "qzeros") or 
"int" not in self.dtype else np.uint8 diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 676ab871bc0..5143f22df60 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -108,6 +108,8 @@ def convert( total_quant_time = 0.0 total_quant_int_time = 0.0 total_set_module_time = 0.0 + save_time = 0.0 + layer_time = 0.0 import time assert isinstance(model, torch.nn.Module), "only support torch module" if is_transformers_imported(): @@ -264,8 +266,6 @@ def convert( layer_time = time.time() - start total_time += layer_time logger.info(layer_time) - del m - gc.collect() if name == "": return new_module else: From 0e388c0b1f06d6095d1477fe7a8772445d0aede9 Mon Sep 17 00:00:00 2001 From: sdp Date: Wed, 10 Jul 2024 22:59:44 -0700 Subject: [PATCH 18/38] mimor fix numba Signed-off-by: sdp --- .../torch/algorithms/weight_only/modules.py | 12 +++++------- test/3x/torch/quantization/weight_only/test_rtn.py | 4 ++-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py index 1ad7339e314..2fba8b7e6a0 100644 --- a/neural_compressor/torch/algorithms/weight_only/modules.py +++ b/neural_compressor/torch/algorithms/weight_only/modules.py @@ -468,8 +468,7 @@ def pack_array_with_numba_b2_c16( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 8] << 16) - | (raw_array[:, i * n_pack + 7] << 14) + (raw_array[:, i * n_pack + 7] << 14) | (raw_array[:, i * n_pack + 6] << 12) | (raw_array[:, i * n_pack + 5] << 10) | (raw_array[:, i * n_pack + 4] << 8) @@ -536,7 +535,7 @@ def pack_array_with_numba_b2_c64( ) return packed_array - def pack_array_with_numba1( + def pack_array_with_numba( self, raw_array: np.ndarray, n_pack: int, bits: int, compress_bits: int, compression_dtype=np.int32 ) -> np.ndarray: """Packs the input array by combining elements into a specified bit-width format using NumPy. @@ -562,7 +561,7 @@ def pack_array_with_numba1( @staticmethod @numba.jit(nopython=True) - def pack_array_with_numba( + def pack_array_with_numba_yi( raw_tensor: np.ndarray, n_pack: int, bits: int, compression_dtype=np.int32 ) -> np.ndarray: """Packs the input tensor by combining elements into a specified bit-width format using NumPy. 
@@ -607,12 +606,11 @@ def pack_tensor_with_reshape(self, raw_tensor): return packed_tensor def pack_tensor_with_numpy(self, raw_tensor): - # breakpoint() if self.bits not in [2, 4, 8]: return self.pack_tensor_with_reshape(raw_tensor) compression_dtype = torch.tensor(0, dtype=self.compression_dtype).numpy().dtype - packed_array = self.pack_array_with_numba(raw_tensor.cpu().numpy(), self.n_pack, self.bits, compression_dtype) - # packed_array = self.pack_array_with_numba(raw_tensor.cpu().numpy(), self.n_pack, self.bits, self.compress_bits, compression_dtype) + # packed_array = self.pack_array_with_numba_yi(raw_tensor.cpu().numpy(), self.n_pack, self.bits, compression_dtype) + packed_array = self.pack_array_with_numba(raw_tensor.cpu().numpy(), self.n_pack, self.bits, self.compress_bits, compression_dtype) return torch.from_numpy(packed_array).to(device=raw_tensor.device) def unpack_tensor_with_numpy(self, packed_tensor): diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index 94f7c2954b1..0623a58d3be 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -22,8 +22,8 @@ class ModelConv1d(torch.nn.Module): def __init__(self): super(ModelConv1d, self).__init__() - self.fc1 = transformers.Conv1D(50, 32) - self.fc2 = torch.nn.Linear(50, 32) + self.fc1 = transformers.Conv1D(64, 32) + self.fc2 = torch.nn.Linear(64, 32) self.fc3 = torch.nn.Linear(32, 5) def forward(self, x): From b0ccd622d4c70d0603f0103f2f4cff9c38467a35 Mon Sep 17 00:00:00 2001 From: sdp Date: Wed, 10 Jul 2024 23:45:35 -0700 Subject: [PATCH 19/38] apply mask Signed-off-by: sdp --- .../torch/algorithms/weight_only/modules.py | 223 +++++++++--------- 1 file changed, 108 insertions(+), 115 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py index 2fba8b7e6a0..1bb6a4321b4 100644 --- a/neural_compressor/torch/algorithms/weight_only/modules.py +++ b/neural_compressor/torch/algorithms/weight_only/modules.py @@ -309,14 +309,14 @@ def pack_array_with_numba_b4_c32( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 7] << 28) - | (raw_array[:, i * n_pack + 6] << 24) - | (raw_array[:, i * n_pack + 5] << 20) - | (raw_array[:, i * n_pack + 4] << 16) - | (raw_array[:, i * n_pack + 3] << 12) - | (raw_array[:, i * n_pack + 2] << 8) - | (raw_array[:, i * n_pack + 1] << 4) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 7] & 0b1111) << 28) + | ((raw_array[:, i * n_pack + 6] & 0b1111) << 24) + | ((raw_array[:, i * n_pack + 5] & 0b1111) << 20) + | ((raw_array[:, i * n_pack + 4] & 0b1111) << 16) + | ((raw_array[:, i * n_pack + 3] & 0b1111) << 12) + | ((raw_array[:, i * n_pack + 2] & 0b1111) << 8) + | ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) + | (raw_array[:, i * n_pack] & 0b1111) ) return packed_array @@ -327,10 +327,10 @@ def pack_array_with_numba_b4_c16( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 3] << 12) - | (raw_array[:, i * n_pack + 2] << 8) - | (raw_array[:, i * n_pack + 1] << 4) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 3] & 0b1111) << 12) + | ((raw_array[:, i * n_pack + 2] & 0b1111) << 8) + | ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) + | (raw_array[:, i * n_pack] & 0b1111) ) return packed_array @@ -341,8 +341,8 @@ def pack_array_with_numba_b4_c8( ) -> np.ndarray: for i in 
range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 1] << 4) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) + | (raw_array[:, i * n_pack] & 0b1111) ) return packed_array @@ -353,22 +353,22 @@ def pack_array_with_numba_b4_c64( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 15] << 60) - | (raw_array[:, i * n_pack + 14] << 56) - | (raw_array[:, i * n_pack + 13] << 52) - | (raw_array[:, i * n_pack + 12] << 48) - | (raw_array[:, i * n_pack + 11] << 44) - | (raw_array[:, i * n_pack + 10] << 40) - | (raw_array[:, i * n_pack + 9] << 36) - | (raw_array[:, i * n_pack + 8] << 32) - | (raw_array[:, i * n_pack + 7] << 28) - | (raw_array[:, i * n_pack + 6] << 24) - | (raw_array[:, i * n_pack + 5] << 20) - | (raw_array[:, i * n_pack + 4] << 16) - | (raw_array[:, i * n_pack + 3] << 12) - | (raw_array[:, i * n_pack + 2] << 8) - | (raw_array[:, i * n_pack + 1] << 4) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 15] & 0b1111) << 60) + | ((raw_array[:, i * n_pack + 14] & 0b1111) << 56) + | ((raw_array[:, i * n_pack + 13] & 0b1111) << 52) + | ((raw_array[:, i * n_pack + 12] & 0b1111) << 48) + | ((raw_array[:, i * n_pack + 11] & 0b1111) << 44) + | ((raw_array[:, i * n_pack + 10] & 0b1111) << 40) + | ((raw_array[:, i * n_pack + 9] & 0b1111) << 36) + | ((raw_array[:, i * n_pack + 8] & 0b1111) << 32) + | ((raw_array[:, i * n_pack + 7] & 0b1111) << 28) + | ((raw_array[:, i * n_pack + 6] & 0b1111) << 24) + | ((raw_array[:, i * n_pack + 5] & 0b1111) << 20) + | ((raw_array[:, i * n_pack + 4] & 0b1111) << 16) + | ((raw_array[:, i * n_pack + 3] & 0b1111) << 12) + | ((raw_array[:, i * n_pack + 2] & 0b1111) << 8) + | ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) + | (raw_array[:, i * n_pack] & 0b1111) ) return packed_array @@ -380,10 +380,10 @@ def pack_array_with_numba_b8_c32( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 3] << 24) - | (raw_array[:, i * n_pack + 2] << 16) - | (raw_array[:, i * n_pack + 1] << 8) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 3] & 0b11111111) << 24) + | ((raw_array[:, i * n_pack + 2] & 0b11111111) << 16) + | ((raw_array[:, i * n_pack + 1] & 0b11111111) << 8) + | (raw_array[:, i * n_pack] & 0b11111111) ) return packed_array @@ -394,10 +394,10 @@ def pack_array_with_numba_b8_c16( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 3] << 24) - | (raw_array[:, i * n_pack + 2] << 16) - | (raw_array[:, i * n_pack + 1] << 8) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 3] & 0b11111111) << 24) + | ((raw_array[:, i * n_pack + 2] & 0b11111111) << 16) + | ((raw_array[:, i * n_pack + 1] & 0b11111111) << 8) + | (raw_array[:, i * n_pack] & 0b11111111) ) return packed_array @@ -407,7 +407,7 @@ def pack_array_with_numba_b8_c8( raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int ) -> np.ndarray: for i in range(new_in_features): - packed_array[:, i] = raw_array[:, i * n_pack] + packed_array[:, i] = (raw_array[:, i * n_pack] & 0b11111111) return packed_array @staticmethod @@ -417,24 +417,17 @@ def pack_array_with_numba_b8_c64( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 7] << 56) - | (raw_array[:, i * n_pack + 6] << 48) - | (raw_array[:, i * n_pack + 5] << 40) - | (raw_array[:, i * n_pack + 4] << 32) - | (raw_array[:, i * n_pack + 3] << 24) - | (raw_array[:, i * n_pack + 
2] << 16) - | (raw_array[:, i * n_pack + 1] << 8) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 7] & 0b11111111) << 56) + | ((raw_array[:, i * n_pack + 6] & 0b11111111) << 48) + | ((raw_array[:, i * n_pack + 5] & 0b11111111) << 40) + | ((raw_array[:, i * n_pack + 4] & 0b11111111) << 32) + | ((raw_array[:, i * n_pack + 3] & 0b11111111) << 24) + | ((raw_array[:, i * n_pack + 2] & 0b11111111) << 16) + | ((raw_array[:, i * n_pack + 1] & 0b11111111) << 8) + | (raw_array[:, i * n_pack] & 0b11111111) ) return packed_array - @staticmethod - @numba.jit(nopython=True, parallel=True) - def pack_array_with_numba_b2_c32( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int - ) -> np.ndarray: - return packed_array - @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b2_c32( @@ -442,22 +435,22 @@ def pack_array_with_numba_b2_c32( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 15] << 30) - | (raw_array[:, i * n_pack + 14] << 28) - | (raw_array[:, i * n_pack + 13] << 26) - | (raw_array[:, i * n_pack + 12] << 24) - | (raw_array[:, i * n_pack + 11] << 22) - | (raw_array[:, i * n_pack + 10] << 20) - | (raw_array[:, i * n_pack + 9] << 18) - | (raw_array[:, i * n_pack + 8] << 16) - | (raw_array[:, i * n_pack + 7] << 14) - | (raw_array[:, i * n_pack + 6] << 12) - | (raw_array[:, i * n_pack + 5] << 10) - | (raw_array[:, i * n_pack + 4] << 8) - | (raw_array[:, i * n_pack + 3] << 6) - | (raw_array[:, i * n_pack + 2] << 4) - | (raw_array[:, i * n_pack + 1] << 2) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 15] & 0b11) << 30) + | ((raw_array[:, i * n_pack + 14] & 0b11) << 28) + | ((raw_array[:, i * n_pack + 13] & 0b11) << 26) + | ((raw_array[:, i * n_pack + 12] & 0b11) << 24) + | ((raw_array[:, i * n_pack + 11] & 0b11) << 22) + | ((raw_array[:, i * n_pack + 10] & 0b11) << 20) + | ((raw_array[:, i * n_pack + 9] & 0b11) << 18) + | ((raw_array[:, i * n_pack + 8] & 0b11) << 16) + | ((raw_array[:, i * n_pack + 7] & 0b11) << 14) + | ((raw_array[:, i * n_pack + 6] & 0b11) << 12) + | ((raw_array[:, i * n_pack + 5] & 0b11) << 10) + | ((raw_array[:, i * n_pack + 4] & 0b11) << 8) + | ((raw_array[:, i * n_pack + 3] & 0b11) << 6) + | ((raw_array[:, i * n_pack + 2] & 0b11) << 4) + | ((raw_array[:, i * n_pack + 1] & 0b11) << 2) + | (raw_array[:, i * n_pack] & 0b11) ) return packed_array @@ -468,14 +461,14 @@ def pack_array_with_numba_b2_c16( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 7] << 14) - | (raw_array[:, i * n_pack + 6] << 12) - | (raw_array[:, i * n_pack + 5] << 10) - | (raw_array[:, i * n_pack + 4] << 8) - | (raw_array[:, i * n_pack + 3] << 6) - | (raw_array[:, i * n_pack + 2] << 4) - | (raw_array[:, i * n_pack + 1] << 2) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 7] & 0b11) << 14) + | ((raw_array[:, i * n_pack + 6] & 0b11) << 12) + | ((raw_array[:, i * n_pack + 5] & 0b11) << 10) + | ((raw_array[:, i * n_pack + 4] & 0b11) << 8) + | ((raw_array[:, i * n_pack + 3] & 0b11) << 6) + | ((raw_array[:, i * n_pack + 2] & 0b11) << 4) + | ((raw_array[:, i * n_pack + 1] & 0b11) << 2) + | (raw_array[:, i * n_pack] & 0b11) ) return packed_array @@ -486,10 +479,10 @@ def pack_array_with_numba_b2_c8( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 3] << 6) - | (raw_array[:, i * n_pack + 2] << 4) - | (raw_array[:, i * n_pack + 1] << 2) - | raw_array[:, i * n_pack] + 
((raw_array[:, i * n_pack + 3] & 0b11) << 6) + | ((raw_array[:, i * n_pack + 2] & 0b11) << 4) + | ((raw_array[:, i * n_pack + 1] & 0b11) << 2) + | (raw_array[:, i * n_pack] & 0b11) ) return packed_array @@ -500,38 +493,38 @@ def pack_array_with_numba_b2_c64( ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( - (raw_array[:, i * n_pack + 31] << 62) - | (raw_array[:, i * n_pack + 30] << 60) - | (raw_array[:, i * n_pack + 29] << 58) - | (raw_array[:, i * n_pack + 28] << 56) - | (raw_array[:, i * n_pack + 27] << 54) - | (raw_array[:, i * n_pack + 26] << 52) - | (raw_array[:, i * n_pack + 25] << 50) - | (raw_array[:, i * n_pack + 24] << 48) - | (raw_array[:, i * n_pack + 23] << 46) - | (raw_array[:, i * n_pack + 22] << 44) - | (raw_array[:, i * n_pack + 21] << 42) - | (raw_array[:, i * n_pack + 20] << 40) - | (raw_array[:, i * n_pack + 19] << 38) - | (raw_array[:, i * n_pack + 18] << 36) - | (raw_array[:, i * n_pack + 17] << 34) - | (raw_array[:, i * n_pack + 16] << 32) - | (raw_array[:, i * n_pack + 15] << 30) - | (raw_array[:, i * n_pack + 14] << 28) - | (raw_array[:, i * n_pack + 13] << 26) - | (raw_array[:, i * n_pack + 12] << 24) - | (raw_array[:, i * n_pack + 11] << 22) - | (raw_array[:, i * n_pack + 10] << 20) - | (raw_array[:, i * n_pack + 9] << 18) - | (raw_array[:, i * n_pack + 8] << 16) - | (raw_array[:, i * n_pack + 7] << 14) - | (raw_array[:, i * n_pack + 6] << 12) - | (raw_array[:, i * n_pack + 5] << 10) - | (raw_array[:, i * n_pack + 4] << 8) - | (raw_array[:, i * n_pack + 3] << 6) - | (raw_array[:, i * n_pack + 2] << 4) - | (raw_array[:, i * n_pack + 1] << 2) - | raw_array[:, i * n_pack] + ((raw_array[:, i * n_pack + 31] & 0b11) << 62) + | ((raw_array[:, i * n_pack + 30] & 0b11) << 60) + | ((raw_array[:, i * n_pack + 29] & 0b11) << 58) + | ((raw_array[:, i * n_pack + 28] & 0b11) << 56) + | ((raw_array[:, i * n_pack + 27] & 0b11) << 54) + | ((raw_array[:, i * n_pack + 26] & 0b11) << 52) + | ((raw_array[:, i * n_pack + 25] & 0b11) << 50) + | ((raw_array[:, i * n_pack + 24] & 0b11) << 48) + | ((raw_array[:, i * n_pack + 23] & 0b11) << 46) + | ((raw_array[:, i * n_pack + 22] & 0b11) << 44) + | ((raw_array[:, i * n_pack + 21] & 0b11) << 42) + | ((raw_array[:, i * n_pack + 20] & 0b11) << 40) + | ((raw_array[:, i * n_pack + 19] & 0b11) << 38) + | ((raw_array[:, i * n_pack + 18] & 0b11) << 36) + | ((raw_array[:, i * n_pack + 17] & 0b11) << 34) + | ((raw_array[:, i * n_pack + 16] & 0b11) << 32) + | ((raw_array[:, i * n_pack + 15] & 0b11) << 30) + | ((raw_array[:, i * n_pack + 14] & 0b11) << 28) + | ((raw_array[:, i * n_pack + 13] & 0b11) << 26) + | ((raw_array[:, i * n_pack + 12] & 0b11) << 24) + | ((raw_array[:, i * n_pack + 11] & 0b11) << 22) + | ((raw_array[:, i * n_pack + 10] & 0b11) << 20) + | ((raw_array[:, i * n_pack + 9] & 0b11) << 18) + | ((raw_array[:, i * n_pack + 8] & 0b11) << 16) + | ((raw_array[:, i * n_pack + 7] & 0b11) << 14) + | ((raw_array[:, i * n_pack + 6] & 0b11) << 12) + | ((raw_array[:, i * n_pack + 5] & 0b11) << 10) + | ((raw_array[:, i * n_pack + 4] & 0b11) << 8) + | ((raw_array[:, i * n_pack + 3] & 0b11) << 6) + | ((raw_array[:, i * n_pack + 2] & 0b11) << 4) + | ((raw_array[:, i * n_pack + 1] & 0b11) << 2) + | (raw_array[:, i * n_pack] & 0b11) ) return packed_array @@ -582,7 +575,7 @@ def pack_array_with_numba_yi( for i in range(new_in_features): packed_tensor[:, i] = ( (raw_tensor[:, i * n_pack + 7] << 28) - | (raw_tensor[:, i * n_pack + 6] << 24) + | (raw_tensor[:, i * n_pack + 6] << 24) | (raw_tensor[:, i * n_pack + 5] << 20) | 
(raw_tensor[:, i * n_pack + 4] << 16) | (raw_tensor[:, i * n_pack + 3] << 12) From 0f7de684bf589fb6679c2f5b1e21516c490a1790 Mon Sep 17 00:00:00 2001 From: sdp Date: Thu, 11 Jul 2024 01:32:06 -0700 Subject: [PATCH 20/38] support gptq Signed-off-by: sdp --- .../torch/algorithms/weight_only/gptq.py | 98 ++++++++++--------- 1 file changed, 54 insertions(+), 44 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/gptq.py b/neural_compressor/torch/algorithms/weight_only/gptq.py index 30b4d07be6a..942e38b73d3 100644 --- a/neural_compressor/torch/algorithms/weight_only/gptq.py +++ b/neural_compressor/torch/algorithms/weight_only/gptq.py @@ -482,20 +482,12 @@ def tmp(_, inp, out): weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx)) logger.info(f"Quantizing layer {layer_name}") if self.use_layer_wise: # pragma: no cover - from neural_compressor.torch.algorithms.layer_wise import load_value, set_module_tensor_to_device + from neural_compressor.torch.algorithms.layer_wise import load_value full_layer_name = self.get_full_layer_name(layer_name, block_idx) - for n, p in sub_layers[layer_name].named_parameters(): - param_name = full_layer_name + "." + n - if n == "weight": - W = load_value(self.model, full_layer_name + ".weight", self.model_path) - else: - value = load_value(self.model, param_name, self.model_path) - set_module_tensor_to_device(self.model, param_name, self.device, value) - + W = load_value(self.model, full_layer_name + ".weight", self.model_path) else: W = sub_layers[layer_name].weight.data.clone() - accelerator.mark_step() if "hpu" in self.device: W = W.to("cpu") @@ -507,8 +499,30 @@ def tmp(_, inp, out): act_order=weight_config_this_layer["act_order"], static_groups=weight_config_this_layer["static_groups"], ) + if self.use_layer_wise: # pragma: no cover + from neural_compressor.torch.algorithms.layer_wise import ( + LWQ_WORKSPACE, + clean_module_weight, + load_value, + set_module_tensor_to_device, + ) - # Step 2.5: export to compressed model + sub_layer = sub_layers[layer_name] + full_layer_name = self.get_full_layer_name(layer_name, block_idx) + for n, p in sub_layer.named_parameters(): + param_name = full_layer_name + "." 
+ n + if n == "weight": + set_module_tensor_to_device(self.model, param_name, self.device, Q) + else: + value = load_value(self.model, param_name, self.model_path) + set_module_tensor_to_device(self.model, param_name, self.device, value) + # sub_layer.weight.data = Q + torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") + clean_module_weight(sub_layer) + del Q + gc.collect() + else: + sub_layers[layer_name].weight.data = Q gptq_config[self.get_full_layer_name(layer_name, block_idx)] = {"scale": scale} if not weight_config_this_layer["sym"]: gptq_config[self.get_full_layer_name(layer_name, block_idx)]["zero"] = zp @@ -516,7 +530,24 @@ def tmp(_, inp, out): gptq_config[self.get_full_layer_name(layer_name, block_idx)]["perm"] = gptq_for_this_block[ layer_name ].perm + gptq_for_this_block[layer_name].free() + # Step 2.5: replace output data with quantized weights + outs = [] + batch_num = self.cache_key_arguments.pop("batch_num") + for j in range(batch_num): + cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j) + cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j) + out = transformer_block(*cache_positional_batch, **cache_keyword_batch) + out = self.track_hidden_states(out) + outs.append(out) + self.cache_key_arguments["batch_num"] = batch_num + if self.use_layer_wise: # pragma: no cover + self.gptq_related_blocks["transformers"][block_idx] = transformer_block + else: + self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() + # Step 2.6: export to compressed model + for layer_name in sub_layers: weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx)) gptq_scale = gptq_config[self.get_full_layer_name(layer_name, block_idx)]["scale"] if not weight_config_this_layer["sym"]: @@ -527,6 +558,13 @@ def tmp(_, inp, out): gptq_perm = gptq_config[self.get_full_layer_name(layer_name, block_idx)]["perm"] else: gptq_perm = None + if self.use_layer_wise: + state_dict = torch.load(LWQ_WORKSPACE + f"/{self.get_full_layer_name(layer_name, block_idx)}.pt") + Q = state_dict["weight"].data + bias = state_dict["bias"] if "bias" in state_dict.keys() else None + + else: + Q = sub_layers[layer_name].weight.data if weight_config_this_layer["act_order"]: Q.copy_(Q[:, gptq_perm]) if is_transformers_imported() and isinstance(sub_layers[layer_name], transformers.Conv1D): @@ -555,6 +593,9 @@ def tmp(_, inp, out): scale = scale.t_().contiguous() zp = zp.t_().contiguous() if zp is not None else zp + if not self.use_layer_wise: + bias = sub_layers[layer_name].bias + new_module = WeightOnlyLinear( in_features, out_features, @@ -562,43 +603,12 @@ def tmp(_, inp, out): bits=weight_config_this_layer["bits"], group_size=weight_config_this_layer["group_size"], zp=gptq_zp is not None, - bias=sub_layers[layer_name].bias is not None, + bias=bias is not None, g_idx=gptq_perm is not None, device=self.device, ) - new_module.pack(int_weight, gptq_scale, gptq_zp, sub_layers[layer_name].bias, gptq_perm) - - if self.use_layer_wise: # pragma: no cover - from neural_compressor.torch.algorithms.layer_wise import ( - LWQ_WORKSPACE, - clean_module_weight, - load_value, - set_module_tensor_to_device, - ) - - torch.save(new_module.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") - clean_module_weight(new_module) - del Q - gc.collect() + new_module.pack(int_weight, gptq_scale, gptq_zp, bias, gptq_perm) set_module(transformer_block, layer_name, new_module) - - 
gptq_for_this_block[layer_name].free() - - # Step 2.6: replace output data with quantized weights - outs = [] - batch_num = self.cache_key_arguments.pop("batch_num") - for j in range(batch_num): - cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j) - cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j) - out = transformer_block(*cache_positional_batch, **cache_keyword_batch) - out = self.track_hidden_states(out) - outs.append(out) - self.cache_key_arguments["batch_num"] = batch_num - if self.use_layer_wise: # pragma: no cover - self.gptq_related_blocks["transformers"][block_idx] = transformer_block - else: - self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() - del gptq_for_this_block torch.cuda.empty_cache() # iteratively replace the input with output, thus layerwise quantization can continue. From 83c6a9b2f5d879f450b3d23f764c0cdf5aaf6150 Mon Sep 17 00:00:00 2001 From: sdp Date: Thu, 11 Jul 2024 20:23:25 -0700 Subject: [PATCH 21/38] keep q_model in memory Signed-off-by: sdp --- .../torch/algorithms/layer_wise/utils.py | 18 ++--- .../torch/algorithms/weight_only/rtn.py | 78 ++++--------------- 2 files changed, 24 insertions(+), 72 deletions(-) diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py index 2722a891144..bbe59de3fe4 100644 --- a/neural_compressor/torch/algorithms/layer_wise/utils.py +++ b/neural_compressor/torch/algorithms/layer_wise/utils.py @@ -251,17 +251,13 @@ def hook(module, input): state_dict = None if os.path.exists(os.path.join(LWQ_WORKSPACE, f"{name}.pt")): state_dict = torch.load(os.path.join(LWQ_WORKSPACE, f"{name}.pt")) - if isinstance(module, WeightOnlyLinear): - for n, p in module._buffers.items(): - setattr(module, n, state_dict[n]) - else: - for n, p in module.named_parameters(): - param_name = name + "." + n - if state_dict: - value = state_dict[n] - else: - value = load_value(model, param_name, path) - set_module_tensor_to_device(model, param_name, device, value) + for n, p in module.named_parameters(): + param_name = name + "." 
+ n + if state_dict: + value = state_dict[n] + else: + value = load_value(model, param_name, path) + set_module_tensor_to_device(model, param_name, device, value) return hook diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 5143f22df60..5ac0deb594c 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -92,15 +92,11 @@ def convert( """ weight_config = self.quant_config device = get_accelerator(kwargs.pop("device", "auto")).current_device_name() - if use_layer_wise: - from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE, get_path, load_module - - os.makedirs(LWQ_WORKSPACE, exist_ok=True) # Put model on device explicitly # TODO: refine it later, Put module on device one by one instead of the whole model - #if not use_layer_wise: - # model.to(device) + if not use_layer_wise: + model.to(device) total_time = 0.0 total_load_time = 0.0 @@ -125,9 +121,21 @@ def convert( "double_quant_group_size": kwargs.get("double_quant_group_size", 256), } use_optimum_format = kwargs.get("use_optimum_format", True) + + if use_layer_wise: + from neural_compressor.common.utils import DEFAULT_WORKSPACE + from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module, register_weight_hooks + + if model_path == "": + model_path = model.path + assert model_path, "model_path should not be None." + model_path = get_path(model_path) + + register_weight_hooks(model, model_path, device=device, clean_weight=True) + for name, m in model.named_modules(): - if not isinstance(m, supported_layers): + if not isinstance(m, supported_layers): continue if name in weight_config: # pragma: no cover # initialize op configuration @@ -173,22 +181,10 @@ def convert( logger.debug(f"RTN quantized module:{name, m}") logger.debug(log_msg) - if use_layer_wise and True: - start_load = time.time() - from neural_compressor.common.utils import DEFAULT_WORKSPACE - from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE, get_path, load_module - os.makedirs(LWQ_WORKSPACE, exist_ok=True) - if model_path == "": - model_path = model.path - assert model_path, "model_path should not be None." - model_path = get_path(model_path) - - # load weight + + if use_layer_wise: load_module(model, name, model_path, device=device) - load_time = time.time() - start_load - total_load_time += load_time - logger.info(load_time) # for only group_dim is 0 or only `transformers.Conv1D`, we need transpose weight. 
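# A condensed sketch of the layer-wise RTN flow this hunk converges on (not the
# literal patch; `quantize_and_pack` is a hypothetical stand-in for the
# quant_tensor + WeightOnlyLinear.pack steps below):
#
#     register_weight_hooks(model, model_path, device=device, clean_weight=True)
#     for name, m in model.named_modules():
#         if not isinstance(m, supported_layers):
#             continue
#         load_module(model, name, model_path, device=device)  # materialize fp32 weight from disk
#         new_module = quantize_and_pack(m)                    # hypothetical: quant_tensor + pack
#         m = m.to_empty(device=torch.device("meta"))          # drop the fp32 copy, keep q_model in memory
#         set_module(model, name, new_module)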
if is_transformers_imported(): @@ -240,52 +236,12 @@ def convert( use_optimum_format=use_optimum_format, device=device, ) - if name in ["model.layers.11.mlp.up_proj", "model.layers.16.mlp.gate_proj"]: - print("will break") - #breakpoint() - logger.info(name) new_module.pack(int_weight, scale, zp, m.bias) if use_layer_wise: - # save and clean weight - from neural_compressor.torch.algorithms.layer_wise.utils import clean_module_weight - from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE, get_path, load_module - - import time - - start = time.time() - torch.save(new_module.state_dict(), os.path.join(LWQ_WORKSPACE, f"{name}.pt")) - save_time = time.time() - start - logger.info(f"save time {save_time}") - total_save_time += save_time - start = time.time() - #clean_module_weight(new_module) - new_module = new_module.to_empty(device=torch.device("meta")) m = m.to_empty(device=torch.device("meta")) - #clean_module_weight(m) - layer_time = time.time() - start - total_time += layer_time - logger.info(layer_time) if name == "": return new_module else: - start_set = time.time() set_module(model, name, new_module) - set_module_time = time.time() - start_set - total_set_module_time += set_module_time - quant_time = time.time() - start_quant - save_time - layer_time - logger.info(f"quant time {quant_time}") - total_quant_time += quant_time - - logger.info(f"load time: {total_load_time}") - logger.info(f"save time: {total_save_time}") - logger.info(f"clean time: {total_time}") - logger.info(f"quant time: {total_quant_time}") - logger.info(f"quant int time: {total_quant_int_time}") - logger.info(f"set module time: {total_set_module_time}") - if use_layer_wise: - # register hooks - from neural_compressor.torch.algorithms.layer_wise.utils import register_weight_hooks - - register_weight_hooks(model, model_path, device=device, clean_weight=True) return model From c543783eacb80207f49b50a4e7ca3366dc965a9e Mon Sep 17 00:00:00 2001 From: sdp Date: Thu, 11 Jul 2024 21:40:26 -0700 Subject: [PATCH 22/38] fix master conflict Signed-off-by: sdp --- .../torch/algorithms/weight_only/gptq.py | 3 ++- .../torch/algorithms/weight_only/rtn.py | 14 +++++++------- neural_compressor/torch/quantization/config.py | 10 +++++----- test/3x/torch/quantization/weight_only/test_rtn.py | 3 +-- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/gptq.py b/neural_compressor/torch/algorithms/weight_only/gptq.py index 82b73c4213b..89882f965b7 100644 --- a/neural_compressor/torch/algorithms/weight_only/gptq.py +++ b/neural_compressor/torch/algorithms/weight_only/gptq.py @@ -1045,7 +1045,8 @@ def convert(self, model, *args, **kwargs): self.gptq_quantizer.remove_prepare_for_calibration() q_model, gptq_config = self.gptq_quantizer.execute_quantization() - q_model = q_model.to(self.model_device) + if not self.gptq_quantizer.use_layer_wise: + q_model = q_model.to(self.model_device) q_model.gptq_config = gptq_config logger.info("GPTQ quantizing done.") return q_model diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 6a1de840b2d..0554fd74383 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -146,7 +146,8 @@ def convert( if dtype == "fp32": continue # Move modules to the accelerator device layer-by-layer - m.to(device) + if not use_layer_wise: + m.to(device) ### FP8 cast part if dtype in ["fp8_e5m2", 
"fp8_e5m2fnuz", "fp8_e4m3fn", "fp8_e4m3fnuz"]: logger.debug("Cast module {} to FP8 using qdq mode, no scaling".format(name)) @@ -200,7 +201,6 @@ def convert( weight = m.weight.detach() if use_mse_search: quantile = search_clip(m, bits, group_size, scheme, dtype, use_full_range) - start_quant = time.time() int_weight, scale, zp = quant_tensor( weight, dtype=dtype, @@ -212,8 +212,6 @@ def convert( full_range=use_full_range, **double_quant_config, ) - quant_int_time = time.time() - start_quant - total_quant_int_time += quant_int_time int_weight = int_weight.t_().contiguous() if transpose else int_weight scale = scale.t_().contiguous() if transpose else scale zp = zp.t_().contiguous() if transpose and zp is not None else zp @@ -248,7 +246,9 @@ def convert( else: set_module(model, name, new_module) # Move modules back to the model device layer-by-layer - m.to(model_device) - new_module.to(model_device) - model.to(model_device) + if not use_layer_wise: + m.to(model_device) + new_module.to(model_device) + if not use_layer_wise: + model.to(model_device) return model diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 9014f1576a3..1f60fe83647 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -200,7 +200,7 @@ def to_config_mapping( self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None ) -> OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: if not self.quant_lm_head: - self.set_local(LM_HEAD_NAMES, RTNConfig(dtype="fp32")) + self.set_local(LM_HEAD_NAMES, RTNConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path)) config_mapping = super().to_config_mapping(config_list, model_info) return config_mapping @@ -363,7 +363,7 @@ def to_config_mapping( self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None ) -> OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: if not self.quant_lm_head: - self.set_local(LM_HEAD_NAMES, GPTQConfig(dtype="fp32")) + self.set_local(LM_HEAD_NAMES, GPTQConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path)) config_mapping = super().to_config_mapping(config_list, model_info) return config_mapping @@ -385,7 +385,7 @@ def get_config_set_for_tuning(cls) -> Union[None, "GPTQConfig", List["GPTQConfig @classmethod def get_predefined_configs(cls) -> Dict[torch_utils.ProcessorType, "GPTQConfig"]: pre_defined_configs: Dict[torch_utils.ProcessorType, GPTQConfig] = {} - pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True) + pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True)#, model_path=self.model_path) pre_defined_configs[torch_utils.ProcessorType.Server] = cls() return pre_defined_configs @@ -508,7 +508,7 @@ def to_config_mapping( self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None ) -> OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: if not self.quant_lm_head: - self.set_local(LM_HEAD_NAMES, AWQConfig(dtype="fp32")) + self.set_local(LM_HEAD_NAMES, AWQConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path)) config_mapping = super().to_config_mapping(config_list, model_info) return config_mapping @@ -815,7 +815,7 @@ def get_config_set_for_tuning(cls) -> Union[None, "AutoRoundConfig", List["AutoR @classmethod def get_predefined_configs(cls) -> Dict[torch_utils.ProcessorType, 
"AutoRoundConfig"]: pre_defined_configs: Dict[torch_utils.ProcessorType, AutoRoundConfig] = {} - pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True) + pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True, model_path=self.model_path) pre_defined_configs[torch_utils.ProcessorType.Server] = cls() return pre_defined_configs diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index b3a379be15f..d4e1ae2f4e6 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -44,7 +44,7 @@ def setup_class(self): self.label = self.tiny_gptj(self.example_inputs)[0] # test_default_config model = copy.deepcopy(self.tiny_gptj) - quant_config = get_default_rtn_config() + quant_config = get_default_rtn_config("Server") model = prepare(model, quant_config) model = convert(model) # record q_label for comparison @@ -172,7 +172,6 @@ def test_layer_wise(self): model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") quant_config = RTNConfig( use_layer_wise=True, - model_path="hf-internal-testing/tiny-random-GPTJForCausalLM", ) model = prepare(model, quant_config) model = convert(model) From 159aa34d363f721593d84746dcb69eb479849e3e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jul 2024 04:43:16 +0000 Subject: [PATCH 23/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../torch/algorithms/weight_only/gptq.py | 4 +- .../torch/algorithms/weight_only/modules.py | 77 ++++++++++--------- .../torch/algorithms/weight_only/rtn.py | 10 +-- .../torch/quantization/config.py | 16 +++- 4 files changed, 58 insertions(+), 49 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/gptq.py b/neural_compressor/torch/algorithms/weight_only/gptq.py index 89882f965b7..eae9f7c3a84 100644 --- a/neural_compressor/torch/algorithms/weight_only/gptq.py +++ b/neural_compressor/torch/algorithms/weight_only/gptq.py @@ -581,7 +581,7 @@ def tmp(_, inp, out): state_dict = torch.load(LWQ_WORKSPACE + f"/{self.get_full_layer_name(layer_name, block_idx)}.pt") Q = state_dict["weight"].data bias = state_dict["bias"] if "bias" in state_dict.keys() else None - + else: Q = sub_layers[layer_name].weight.data if weight_config_this_layer["act_order"]: @@ -614,7 +614,7 @@ def tmp(_, inp, out): if not self.use_layer_wise: bias = sub_layers[layer_name].bias - + new_module = WeightOnlyLinear( in_features, out_features, diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py index 47d51560612..7b0aae9589b 100644 --- a/neural_compressor/torch/algorithms/weight_only/modules.py +++ b/neural_compressor/torch/algorithms/weight_only/modules.py @@ -19,11 +19,11 @@ # since the model classes inherit torch.nn.Module. 
import math +import numba import numpy as np import torch from torch.autograd import Function from torch.nn import functional as F -import numba from neural_compressor.torch.utils import accelerator, logger @@ -301,11 +301,11 @@ def unpack_tensor_with_torch(self, packed_tensor): unpacked_tensor[:, index].copy_(tmp.type(target_dtype)) accelerator.synchronize() return unpacked_tensor - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b4_c32( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -319,11 +319,11 @@ def pack_array_with_numba_b4_c32( | (raw_array[:, i * n_pack] & 0b1111) ) return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b4_c16( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -333,23 +333,20 @@ def pack_array_with_numba_b4_c16( | (raw_array[:, i * n_pack] & 0b1111) ) return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b4_c8( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): - packed_array[:, i] = ( - ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) - | (raw_array[:, i * n_pack] & 0b1111) - ) + packed_array[:, i] = ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) | (raw_array[:, i * n_pack] & 0b1111) return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b4_c64( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -372,11 +369,10 @@ def pack_array_with_numba_b4_c64( ) return packed_array - @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b8_c32( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -386,11 +382,11 @@ def pack_array_with_numba_b8_c32( | (raw_array[:, i * n_pack] & 0b11111111) ) return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b8_c16( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -400,20 +396,20 @@ def pack_array_with_numba_b8_c16( | (raw_array[:, i * n_pack] & 0b11111111) ) return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b8_c8( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): - packed_array[:, i] = (raw_array[:, i * n_pack] & 0b11111111) + packed_array[:, i] = 
raw_array[:, i * n_pack] & 0b11111111 return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b8_c64( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -427,11 +423,11 @@ def pack_array_with_numba_b8_c64( | (raw_array[:, i * n_pack] & 0b11111111) ) return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b2_c32( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -457,7 +453,7 @@ def pack_array_with_numba_b2_c32( @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b2_c16( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -471,11 +467,11 @@ def pack_array_with_numba_b2_c16( | (raw_array[:, i * n_pack] & 0b11) ) return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b2_c8( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -485,11 +481,11 @@ def pack_array_with_numba_b2_c8( | (raw_array[:, i * n_pack] & 0b11) ) return packed_array - + @staticmethod @numba.jit(nopython=True, parallel=True) def pack_array_with_numba_b2_c64( - raw_array: np.ndarray, packed_array:np.ndarray, n_pack: int, new_in_features:int + raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: for i in range(new_in_features): packed_array[:, i] = ( @@ -527,7 +523,7 @@ def pack_array_with_numba_b2_c64( | (raw_array[:, i * n_pack] & 0b11) ) return packed_array - + def pack_array_with_numba( self, raw_array: np.ndarray, n_pack: int, bits: int, compress_bits: int, compression_dtype=np.int32 ) -> np.ndarray: @@ -547,17 +543,18 @@ def pack_array_with_numba( new_in_features = (in_features + n_pack - 1) // n_pack packed_array = np.zeros((out_features, new_in_features), dtype=compression_dtype) raw_array = raw_array.astype(compression_dtype) - + pack_method_name = f"pack_array_with_numba_b{bits}_c{compress_bits}" pack_method = getattr(self, pack_method_name) return pack_method(raw_array, packed_array, n_pack, new_in_features) - + @staticmethod @numba.jit(nopython=True) def pack_array_with_numba_yi( raw_tensor: np.ndarray, n_pack: int, bits: int, compression_dtype=np.int32 ) -> np.ndarray: """Packs the input tensor by combining elements into a specified bit-width format using NumPy. + Args: raw_tensor (np.ndarray): The tensor to be packed. Shape: [out_features, in_features] or [1, in_features]. n_pack (int): The number of elements to be packed together. 
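For context on the dispatch just reformatted above: pack_tensor_with_numpy now routes 2/4/8-bit weights to the specialized numba kernels and falls back to pack_tensor_with_reshape for any other bit width. A standalone NumPy sketch of that fallback (hypothetical names; masking added in the spirit of PATCH 19, and in_features assumed divisible by n_pack as in the method itself):

    import numpy as np

    def pack_with_reshape(raw, bits, compression_dtype=np.int32):
        n_pack = 8 * np.dtype(compression_dtype).itemsize // bits
        out_features, in_features = raw.shape
        # one row per packed group, then OR the shifted values position by position
        grouped = raw.astype(compression_dtype).reshape(-1, n_pack)
        packed = np.zeros(grouped.shape[0], dtype=compression_dtype)
        mask = (1 << bits) - 1
        for i in range(n_pack):
            packed |= (grouped[:, i] & mask) << (bits * i)
        return packed.reshape(out_features, in_features // n_pack)

    q = np.random.randint(0, 2**3, size=(4, 30), dtype=np.int64)  # e.g. 3-bit values
    assert pack_with_reshape(q, bits=3).shape == (4, 3)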
@@ -575,7 +572,7 @@ def pack_array_with_numba_yi( for i in range(new_in_features): packed_tensor[:, i] = ( (raw_tensor[:, i * n_pack + 7] << 28) - | (raw_tensor[:, i * n_pack + 6] << 24) + | (raw_tensor[:, i * n_pack + 6] << 24) | (raw_tensor[:, i * n_pack + 5] << 20) | (raw_tensor[:, i * n_pack + 4] << 16) | (raw_tensor[:, i * n_pack + 3] << 12) @@ -585,7 +582,7 @@ def pack_array_with_numba_yi( ) return packed_tensor - + def pack_tensor_with_reshape(self, raw_tensor): raw_array = raw_tensor.cpu().numpy() target_len = np.ceil(raw_array.shape[1] / self.n_pack).astype(int) @@ -593,9 +590,11 @@ def pack_tensor_with_reshape(self, raw_tensor): reshaped = raw_array.reshape(-1, self.n_pack) packed_array = np.zeros(reshaped.shape[0], dtype=target_dtype) for i in range(self.n_pack): - packed_array |= (reshaped[:, i].astype(target_dtype) << (self.bits * i)) - - packed_tensor = torch.from_numpy(packed_array.reshape((raw_array.shape[0], target_len))).to(device=raw_tensor.device) + packed_array |= reshaped[:, i].astype(target_dtype) << (self.bits * i) + + packed_tensor = torch.from_numpy(packed_array.reshape((raw_array.shape[0], target_len))).to( + device=raw_tensor.device + ) return packed_tensor def pack_tensor_with_numpy(self, raw_tensor): @@ -603,7 +602,9 @@ def pack_tensor_with_numpy(self, raw_tensor): return self.pack_tensor_with_reshape(raw_tensor) compression_dtype = torch.tensor(0, dtype=self.compression_dtype).numpy().dtype # packed_array = self.pack_array_with_numba_yi(raw_tensor.cpu().numpy(), self.n_pack, self.bits, compression_dtype) - packed_array = self.pack_array_with_numba(raw_tensor.cpu().numpy(), self.n_pack, self.bits, self.compress_bits, compression_dtype) + packed_array = self.pack_array_with_numba( + raw_tensor.cpu().numpy(), self.n_pack, self.bits, self.compress_bits, compression_dtype + ) return torch.from_numpy(packed_array).to(device=raw_tensor.device) def unpack_tensor_with_numpy(self, packed_tensor): diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 0554fd74383..c04327a62f4 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -124,7 +124,7 @@ def convert( "double_quant_group_size": kwargs.get("double_quant_group_size", 256), } use_optimum_format = kwargs.get("use_optimum_format", True) - + if use_layer_wise: from neural_compressor.common.utils import DEFAULT_WORKSPACE from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module, register_weight_hooks @@ -135,10 +135,10 @@ def convert( model_path = get_path(model_path) register_weight_hooks(model, model_path, device=device, clean_weight=True) - + for name, m in model.named_modules(): - - if not isinstance(m, supported_layers): + + if not isinstance(m, supported_layers): continue if name in weight_config: # pragma: no cover # initialize op configuration @@ -186,7 +186,7 @@ def convert( continue logger.debug(f"RTN quantized module:{name, m}") logger.debug(log_msg) - + if use_layer_wise: load_module(model, name, model_path, device=device) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 1f60fe83647..90e0119769e 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -200,7 +200,9 @@ def to_config_mapping( self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None ) -> OrderedDictType[Union[str, str], 
OrderedDictType[str, BaseConfig]]: if not self.quant_lm_head: - self.set_local(LM_HEAD_NAMES, RTNConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path)) + self.set_local( + LM_HEAD_NAMES, RTNConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path) + ) config_mapping = super().to_config_mapping(config_list, model_info) return config_mapping @@ -363,7 +365,9 @@ def to_config_mapping( self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None ) -> OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: if not self.quant_lm_head: - self.set_local(LM_HEAD_NAMES, GPTQConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path)) + self.set_local( + LM_HEAD_NAMES, GPTQConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path) + ) config_mapping = super().to_config_mapping(config_list, model_info) return config_mapping @@ -385,7 +389,9 @@ def get_config_set_for_tuning(cls) -> Union[None, "GPTQConfig", List["GPTQConfig @classmethod def get_predefined_configs(cls) -> Dict[torch_utils.ProcessorType, "GPTQConfig"]: pre_defined_configs: Dict[torch_utils.ProcessorType, GPTQConfig] = {} - pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True)#, model_path=self.model_path) + pre_defined_configs[torch_utils.ProcessorType.Client] = cls( + use_layer_wise=True + ) # , model_path=self.model_path) pre_defined_configs[torch_utils.ProcessorType.Server] = cls() return pre_defined_configs @@ -508,7 +514,9 @@ def to_config_mapping( self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None ) -> OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: if not self.quant_lm_head: - self.set_local(LM_HEAD_NAMES, AWQConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path)) + self.set_local( + LM_HEAD_NAMES, AWQConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path) + ) config_mapping = super().to_config_mapping(config_list, model_info) return config_mapping From 809c0fb2e66584043ac18b247d241bffb0f936a0 Mon Sep 17 00:00:00 2001 From: sdp Date: Thu, 11 Jul 2024 21:44:59 -0700 Subject: [PATCH 24/38] update numba requirements_pt Signed-off-by: sdp --- requirements_pt.txt | 1 + test/3x/torch/quantization/weight_only/test_gptq.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_pt.txt b/requirements_pt.txt index 94667b64665..0ed9fabde27 100644 --- a/requirements_pt.txt +++ b/requirements_pt.txt @@ -1,4 +1,5 @@ numpy < 2.0 +numba peft==0.10.0 prettytable psutil diff --git a/test/3x/torch/quantization/weight_only/test_gptq.py b/test/3x/torch/quantization/weight_only/test_gptq.py index 1974bc33222..93b09aec02b 100644 --- a/test/3x/torch/quantization/weight_only/test_gptq.py +++ b/test/3x/torch/quantization/weight_only/test_gptq.py @@ -176,7 +176,6 @@ def test_act_order(self): assert atol_false > atol_true, "act_order=True doesn't help accuracy, maybe is reasonable, please double check." 
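For orientation, the use_layer_wise/model_path options threaded through the RTN/GPTQ/AWQ configs above are driven from user code roughly as below. This is a hedged sketch, not part of the patch: the calibration loop, calib_batches, and the chosen model id are illustrative placeholders; only the prepare/convert flow and the GPTQConfig keywords already shown in this series are assumed.

    from neural_compressor.torch.quantization import GPTQConfig, prepare, convert

    # Layer-wise GPTQ sketch: per-layer state_dict is read from model_path instead of
    # keeping the whole fp32 model resident. Model id reused from the unit tests.
    quant_config = GPTQConfig(
        use_layer_wise=True,
        model_path="hf-internal-testing/tiny-random-GPTJForCausalLM",
    )
    model = prepare(model, quant_config)

    def run_fn(model):                # placeholder calibration function
        for batch in calib_batches:   # any small calibration set (assumed to exist)
            model(batch)

    run_fn(model)
    model = convert(model)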
def test_layer_wise(self): - # model = copy.deepcopy(self.tiny_gptj) model = copy.deepcopy(self.tiny_gptj) quant_config = GPTQConfig() model = prepare(model, quant_config) From 308c7fc6392312b2f8741019d815d49508c3ca22 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jul 2024 04:48:44 +0000 Subject: [PATCH 25/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- requirements_pt.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_pt.txt b/requirements_pt.txt index 0ed9fabde27..5f18aead98d 100644 --- a/requirements_pt.txt +++ b/requirements_pt.txt @@ -1,5 +1,5 @@ -numpy < 2.0 numba +numpy < 2.0 peft==0.10.0 prettytable psutil From 5d80e9bc005c9ec1b85ce9258ecd9a5c256e782a Mon Sep 17 00:00:00 2001 From: sdp Date: Thu, 11 Jul 2024 22:13:03 -0700 Subject: [PATCH 26/38] fix awq config Signed-off-by: sdp --- neural_compressor/torch/quantization/config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 90e0119769e..868c44c4746 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -444,6 +444,7 @@ def __init__( use_full_range: bool = False, use_mse_search: bool = False, use_layer_wise: bool = False, + model_path: str = "", # double quant use_double_quant: bool = False, double_quant_dtype: str = "int", @@ -469,6 +470,7 @@ def __init__( use_full_range (bool): Enables full range for activations, default is False. use_mse_search (bool): Enables mean squared error (MSE) search, default is False. use_layer_wise (bool): Enables quantize model per layer. Defaults to False. + model_path (str): Model path that is used to load state_dict per layer. use_double_quant (bool): Enables double quantization, default is False. double_quant_dtype (str): Data type for double_quant scale, default is "int". double_quant_bits (int): Number of bits used to represent double_quant scale, default is 4. @@ -489,6 +491,7 @@ def __init__( self.use_full_range = use_full_range self.use_mse_search = use_mse_search self.use_layer_wise = use_layer_wise + self.model_path = model_path # double quant self.use_double_quant = use_double_quant self.double_quant_bits = double_quant_bits From c4af34434d25df02d93af961749001ede51bb6cc Mon Sep 17 00:00:00 2001 From: sdp Date: Thu, 11 Jul 2024 22:54:14 -0700 Subject: [PATCH 27/38] remove pack_with_reshpe Signed-off-by: sdp --- .../torch/algorithms/weight_only/modules.py | 60 +++++-------------- 1 file changed, 14 insertions(+), 46 deletions(-) diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py index 7b0aae9589b..503a469b0c7 100644 --- a/neural_compressor/torch/algorithms/weight_only/modules.py +++ b/neural_compressor/torch/algorithms/weight_only/modules.py @@ -548,60 +548,28 @@ def pack_array_with_numba( pack_method = getattr(self, pack_method_name) return pack_method(raw_array, packed_array, n_pack, new_in_features) - @staticmethod - @numba.jit(nopython=True) - def pack_array_with_numba_yi( - raw_tensor: np.ndarray, n_pack: int, bits: int, compression_dtype=np.int32 - ) -> np.ndarray: - """Packs the input tensor by combining elements into a specified bit-width format using NumPy. - - Args: - raw_tensor (np.ndarray): The tensor to be packed. Shape: [out_features, in_features] or [1, in_features]. 
- n_pack (int): The number of elements to be packed together. - bits (int): The number of bits for each element. - compression_dtype (np.dtype, optional): The data type of the compressed tensor. Defaults to np.int32. - Returns: - np.ndarray: The packed tensor. - """ - out_features, in_features = raw_tensor.shape - new_in_features = (in_features + n_pack - 1) // n_pack - packed_tensor = np.zeros((out_features, new_in_features), dtype=compression_dtype) - raw_tensor = raw_tensor.astype(compression_dtype) - - if bits == 4: - for i in range(new_in_features): - packed_tensor[:, i] = ( - (raw_tensor[:, i * n_pack + 7] << 28) - | (raw_tensor[:, i * n_pack + 6] << 24) - | (raw_tensor[:, i * n_pack + 5] << 20) - | (raw_tensor[:, i * n_pack + 4] << 16) - | (raw_tensor[:, i * n_pack + 3] << 12) - | (raw_tensor[:, i * n_pack + 2] << 8) - | (raw_tensor[:, i * n_pack + 1] << 4) - | raw_tensor[:, i * n_pack] - ) - - return packed_tensor - - def pack_tensor_with_reshape(self, raw_tensor): + def pack_tensor_with_numpy_impl(self, raw_tensor): raw_array = raw_tensor.cpu().numpy() target_len = np.ceil(raw_array.shape[1] / self.n_pack).astype(int) target_dtype = torch.tensor(0, dtype=self.compression_dtype).numpy().dtype - reshaped = raw_array.reshape(-1, self.n_pack) - packed_array = np.zeros(reshaped.shape[0], dtype=target_dtype) - for i in range(self.n_pack): - packed_array |= reshaped[:, i].astype(target_dtype) << (self.bits * i) - - packed_tensor = torch.from_numpy(packed_array.reshape((raw_array.shape[0], target_len))).to( - device=raw_tensor.device - ) + packed_array = np.zeros((raw_array.shape[0], target_len), dtype=target_dtype) + mask = np.uint8(2**self.bits - 1) + for j in range(packed_array.shape[1]): + start = self.n_pack * j + end = self.n_pack * (j + 1) + tmp = raw_array[:, start:end].astype(target_dtype) + tmp &= mask + for e in range(tmp.shape[1]): + tmp[:, e] = np.left_shift(tmp[:, e], self.bits * e) + packed_array[:, j] |= tmp[:, e] + accelerator.synchronize() + packed_tensor = torch.from_numpy(packed_array).to(device=raw_tensor.device) return packed_tensor def pack_tensor_with_numpy(self, raw_tensor): if self.bits not in [2, 4, 8]: - return self.pack_tensor_with_reshape(raw_tensor) + return self.pack_tensor_with_numpy_impl(raw_tensor) compression_dtype = torch.tensor(0, dtype=self.compression_dtype).numpy().dtype - # packed_array = self.pack_array_with_numba_yi(raw_tensor.cpu().numpy(), self.n_pack, self.bits, compression_dtype) packed_array = self.pack_array_with_numba( raw_tensor.cpu().numpy(), self.n_pack, self.bits, self.compress_bits, compression_dtype ) From e99ee19efd5ccb796adda546ac505ddba1ecb649 Mon Sep 17 00:00:00 2001 From: sdp Date: Thu, 11 Jul 2024 23:20:30 -0700 Subject: [PATCH 28/38] recover ar Signed-off-by: sdp --- neural_compressor/torch/quantization/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 868c44c4746..aaa1c5e60ca 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -826,7 +826,7 @@ def get_config_set_for_tuning(cls) -> Union[None, "AutoRoundConfig", List["AutoR @classmethod def get_predefined_configs(cls) -> Dict[torch_utils.ProcessorType, "AutoRoundConfig"]: pre_defined_configs: Dict[torch_utils.ProcessorType, AutoRoundConfig] = {} - pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True, model_path=self.model_path) + 
pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True) pre_defined_configs[torch_utils.ProcessorType.Server] = cls() return pre_defined_configs From 1dd01a0eaf09dd211ddf0228434b7a84ba7a27fa Mon Sep 17 00:00:00 2001 From: sdp Date: Thu, 11 Jul 2024 23:46:08 -0700 Subject: [PATCH 29/38] revert eg Signed-off-by: sdp --- .../weight_only/run_clm_no_trainer.py | 42 +++++-------------- 1 file changed, 11 insertions(+), 31 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index 96ecc12cdc3..abd8228354e 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -212,35 +212,17 @@ def get_user_model(): if args.quantize: # dataset - if 0: - user_model, tokenizer = get_user_model() - use_layer_wise =False - # user_model.save_pretrained("./saved",max_shard_size="20GB", safe_serialization=False) - else: - from neural_compressor.torch.algorithms.layer_wise import load_empty_model - user_model = load_empty_model(args.model) - # user_model = AutoModelForCausalLM.from_pretrained( - # args.model, - # #trust_remote_code=args.trust_remote_code, - # low_cpu_mem_usage=True, - # torch_dtype="auto" - # ) - #from accelerate import init_empty_weights, load_checkpoint_and_dispatch - #tokenizer = AutoTokenizer.from_pretrained(args.model) - # checkpoint_file = "/home/sdp/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/f5db02db724555f92da89c216ac04704f23d4590" - #checkpoint_file = "./saved" #if checkpoint_file in "./saved" else checkpoint_file - #user_model = load_checkpoint_and_dispatch(user_model, checkpoint=checkpoint_file, device_mp="auto", offload_folder=checkpoint_file) - use_layer_wise = True - #calib_dataset = load_dataset(args.dataset, split="train") + user_model, tokenizer = get_user_model() + calib_dataset = load_dataset(args.dataset, split="train") # calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/') # use this if trouble with connecting to HF - #calib_dataset = calib_dataset.shuffle(seed=args.seed) - #calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) - #calib_dataloader = DataLoader( - # calib_evaluator.dataset, - # batch_size=calib_size, - # shuffle=False, - # collate_fn=calib_evaluator.collate_batch, - #) + calib_dataset = calib_dataset.shuffle(seed=args.seed) + calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) + calib_dataloader = DataLoader( + calib_evaluator.dataset, + batch_size=calib_size, + shuffle=False, + collate_fn=calib_evaluator.collate_batch, + ) # 3.x api from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize @@ -273,9 +255,8 @@ def get_user_model(): double_quant_dtype=args.double_quant_dtype, double_quant_use_sym=args.double_quant_use_sym, double_quant_group_size=args.double_quant_group_size, - use_layer_wise=use_layer_wise, ) - quant_config.set_local("lm_head", RTNConfig(use_layer_wise=use_layer_wise, dtype="fp32")) + quant_config.set_local("lm_head", RTNConfig(dtype="fp32")) user_model = prepare(model=user_model, quant_config=quant_config) user_model = 
convert(model=user_model) elif args.woq_algo == "GPTQ": @@ -331,7 +312,6 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): run_fn_for_gptq(user_model, dataloader_for_calibration) user_model = convert(user_model) - exit(0) user_model.save(args.output_dir) From 8dbf793d308956a7e80d04cbdff2290e2704e916 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Fri, 12 Jul 2024 15:48:51 +0800 Subject: [PATCH 30/38] install py 3x deps Signed-off-by: chensuyue Signed-off-by: chensuyue --- .azure-pipelines/scripts/codeScan/pylint/pylint.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.azure-pipelines/scripts/codeScan/pylint/pylint.sh b/.azure-pipelines/scripts/codeScan/pylint/pylint.sh index 062bfe414ee..5631dcc0917 100644 --- a/.azure-pipelines/scripts/codeScan/pylint/pylint.sh +++ b/.azure-pipelines/scripts/codeScan/pylint/pylint.sh @@ -20,6 +20,7 @@ apt-get install -y --no-install-recommends --fix-missing \ build-essential pip install -r /neural-compressor/requirements.txt +pip install -r /neural-compressor/requirements_pt.txt pip install cmake pip install torch \ From 0ea77fd1ffdbf184abe7521a35f753079fb7bb54 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 14:02:57 +0800 Subject: [PATCH 31/38] enhance import&add pack ut Signed-off-by: Kaihui-intel --- neural_compressor/torch/__init__.py | 1 + test/3x/torch/quantization/weight_only/test_rtn.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/neural_compressor/torch/__init__.py b/neural_compressor/torch/__init__.py index 28f108cb636..fa59ad3b280 100644 --- a/neural_compressor/torch/__init__.py +++ b/neural_compressor/torch/__init__.py @@ -11,3 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from neural_compressor.torch.algorithms.layer_wise import load_empty_model \ No newline at end of file diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index d4e1ae2f4e6..293f11f6b8b 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -167,7 +167,7 @@ def test_quant_lm_head(self): ), "The tied lm_head weight is not deep copied, please check!" def test_layer_wise(self): - from neural_compressor.torch.algorithms.layer_wise import load_empty_model + from neural_compressor.torch import load_empty_model model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") quant_config = RTNConfig( From eec87ac28e2ef8f735b9e070d3cfbfbded4981f8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 06:08:15 +0000 Subject: [PATCH 32/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/__init__.py b/neural_compressor/torch/__init__.py index fa59ad3b280..72d063553fa 100644 --- a/neural_compressor/torch/__init__.py +++ b/neural_compressor/torch/__init__.py @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
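The load_empty_model re-export added above is the user-facing entry point for the layer-wise path: the model shell is created without materializing weights, and RTN then loads each layer's state_dict from model_path only while that layer is being quantized. A minimal sketch, assuming the import location introduced in this commit and reusing the model id from the tests; the keyword values are illustrative:

    from neural_compressor.torch import load_empty_model
    from neural_compressor.torch.quantization import RTNConfig, prepare, convert

    model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"
    model = load_empty_model(model_name)        # shell only, weights are not materialized here
    quant_config = RTNConfig(use_layer_wise=True, model_path=model_name)
    model = prepare(model, quant_config)
    model = convert(model)                      # per-layer weights are fetched from model_path during conversion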
-from neural_compressor.torch.algorithms.layer_wise import load_empty_model \ No newline at end of file +from neural_compressor.torch.algorithms.layer_wise import load_empty_model From 36a4a29173ad1b131adac82194208163444002de Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 14:13:11 +0800 Subject: [PATCH 33/38] add pack ut file Signed-off-by: Kaihui-intel --- .../algorithms/weight_only/test_woq_module.py | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 test/3x/torch/algorithms/weight_only/test_woq_module.py diff --git a/test/3x/torch/algorithms/weight_only/test_woq_module.py b/test/3x/torch/algorithms/weight_only/test_woq_module.py new file mode 100644 index 00000000000..3dbb4c77a9c --- /dev/null +++ b/test/3x/torch/algorithms/weight_only/test_woq_module.py @@ -0,0 +1,48 @@ +import pytest +import copy +import torch +from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear +from neural_compressor.torch.algorithms.weight_only.utility import quant_tensor +class TestWeightOnlyLinear: + @pytest.mark.parametrize( + "bits, compression_dtype", + [ + (8, torch.int8), + (8, torch.int16), + (8, torch.int32), + (8, torch.int64), + (4, torch.int8), + (4, torch.int16), + (4, torch.int32), + (4, torch.int64), + (2, torch.int8), + (2, torch.int16), + (2, torch.int32), + (2, torch.int64), + ], + ) + def test_pack_with_numba(self, bits, compression_dtype): + m = torch.nn.Linear(64, 32) + dtype = "int" + weight = m.weight.detach() + int_weight, scale, zp = quant_tensor( + weight, + dtype=dtype, + bits=bits, + return_int=True, + group_size=32, + ) + new_module = WeightOnlyLinear( + m.in_features, + m.out_features, + dtype=dtype, + bits=bits, + group_size=32, + zp=zp is not None, + bias=m.bias is not None, + use_optimum_format=False, + compression_dtype=compression_dtype, + ) + new_module.pack(int_weight, scale, zp, m.bias) + unpacked_int_weight = new_module.unpack_tensor(new_module.qweight) + assert torch.equal(unpacked_int_weight, int_weight) \ No newline at end of file From 86008f476f90f4d8c57c58c5f7059d29a9531488 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 06:15:49 +0000 Subject: [PATCH 34/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../algorithms/weight_only/test_woq_module.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/test/3x/torch/algorithms/weight_only/test_woq_module.py b/test/3x/torch/algorithms/weight_only/test_woq_module.py index 3dbb4c77a9c..0f06f358beb 100644 --- a/test/3x/torch/algorithms/weight_only/test_woq_module.py +++ b/test/3x/torch/algorithms/weight_only/test_woq_module.py @@ -1,8 +1,12 @@ -import pytest import copy + +import pytest import torch + from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear from neural_compressor.torch.algorithms.weight_only.utility import quant_tensor + + class TestWeightOnlyLinear: @pytest.mark.parametrize( "bits, compression_dtype", @@ -26,12 +30,12 @@ def test_pack_with_numba(self, bits, compression_dtype): dtype = "int" weight = m.weight.detach() int_weight, scale, zp = quant_tensor( - weight, - dtype=dtype, - bits=bits, - return_int=True, - group_size=32, - ) + weight, + dtype=dtype, + bits=bits, + return_int=True, + group_size=32, + ) new_module = WeightOnlyLinear( m.in_features, m.out_features, @@ -45,4 +49,4 @@ def test_pack_with_numba(self, bits, 
compression_dtype): ) new_module.pack(int_weight, scale, zp, m.bias) unpacked_int_weight = new_module.unpack_tensor(new_module.qweight) - assert torch.equal(unpacked_int_weight, int_weight) \ No newline at end of file + assert torch.equal(unpacked_int_weight, int_weight) From 93a86f296eb0e820608b3545b52f81e704ffb7b1 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 15:55:54 +0800 Subject: [PATCH 35/38] move load_empty_model to torch.utils Signed-off-by: Kaihui-intel --- neural_compressor/torch/utils/__init__.py | 1 + test/3x/torch/quantization/weight_only/test_gptq.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/neural_compressor/torch/utils/__init__.py b/neural_compressor/torch/utils/__init__.py index dab02a017c6..ca802ba2145 100644 --- a/neural_compressor/torch/utils/__init__.py +++ b/neural_compressor/torch/utils/__init__.py @@ -15,3 +15,4 @@ from .environ import * from .constants import * from .utility import * +from neural_compressor.torch.algorithms.layer_wise import load_empty_model \ No newline at end of file diff --git a/test/3x/torch/quantization/weight_only/test_gptq.py b/test/3x/torch/quantization/weight_only/test_gptq.py index 93b09aec02b..8608e1801a4 100644 --- a/test/3x/torch/quantization/weight_only/test_gptq.py +++ b/test/3x/torch/quantization/weight_only/test_gptq.py @@ -183,7 +183,7 @@ def test_layer_wise(self): model = convert(model) q_label = model(self.example_inputs)[0] - from neural_compressor.torch.algorithms.layer_wise import load_empty_model + from neural_compressor.torch.utils import load_empty_model model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") From 19b1c4d3fb464bec8b7a3aad6d28394969577ae2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 07:58:36 +0000 Subject: [PATCH 36/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/utils/__init__.py b/neural_compressor/torch/utils/__init__.py index ca802ba2145..25aadaa6d66 100644 --- a/neural_compressor/torch/utils/__init__.py +++ b/neural_compressor/torch/utils/__init__.py @@ -15,4 +15,4 @@ from .environ import * from .constants import * from .utility import * -from neural_compressor.torch.algorithms.layer_wise import load_empty_model \ No newline at end of file +from neural_compressor.torch.algorithms.layer_wise import load_empty_model From f17c64027a156c47d93ef11a048e853a7521e34c Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 16:14:41 +0800 Subject: [PATCH 37/38] remove torch import Signed-off-by: Kaihui-intel --- neural_compressor/torch/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/neural_compressor/torch/__init__.py b/neural_compressor/torch/__init__.py index 72d063553fa..28f108cb636 100644 --- a/neural_compressor/torch/__init__.py +++ b/neural_compressor/torch/__init__.py @@ -11,4 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
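The packing kernels and the round-trip test above all rely on the same bit-layout invariant: with bits-bit values in a compress_bits-bit container, n_pack = compress_bits // bits values occupy disjoint bit fields, element k sitting at bit offset bits * k. A standalone NumPy illustration of that layout (unsigned container chosen here for clarity; the in-tree kernels use signed dtypes and rely on two's-complement wraparound for the top field):

    import numpy as np

    bits, n_pack = 4, 8                                      # eight 4-bit values per 32-bit lane
    raw = np.array([[1, 2, 3, 4, 5, 6, 7, 15]], dtype=np.uint32)

    packed = np.zeros((1, 1), dtype=np.uint32)
    for k in range(n_pack):
        # mirrors the shift/OR chain in pack_array_with_numba_b4_c32
        packed[:, 0] |= (raw[:, k] & 0b1111) << (bits * k)

    unpacked = np.stack([(packed[:, 0] >> (bits * k)) & 0b1111 for k in range(n_pack)], axis=1)
    assert np.array_equal(unpacked, raw)                     # lossless round trip, as test_pack_with_numba checks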
-from neural_compressor.torch.algorithms.layer_wise import load_empty_model From fa39f6f041565a99f2422818249fa1a5842d6955 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 18:28:05 +0800 Subject: [PATCH 38/38] fix ut import Signed-off-by: Kaihui-intel --- test/3x/torch/quantization/weight_only/test_rtn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index 293f11f6b8b..cc4a0df6172 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -167,7 +167,7 @@ def test_quant_lm_head(self): ), "The tied lm_head weight is not deep copied, please check!" def test_layer_wise(self): - from neural_compressor.torch import load_empty_model + from neural_compressor.torch.utils import load_empty_model model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM") quant_config = RTNConfig(