fix llm scripts and pruning import issue (#1143)
* fix llm scripts and pruning import issue

Signed-off-by: Zhang, Weiwei1 <[email protected]>

* fix import

Signed-off-by: Zhang, Weiwei1 <[email protected]>

* fix typo

Signed-off-by: Zhang, Weiwei1 <[email protected]>

* fix import issue 2

Signed-off-by: Zhang, Weiwei1 <[email protected]>

* fix llm example

Signed-off-by: Zhang, Weiwei1 <[email protected]>

* refine torch method

Signed-off-by: Zhang, Weiwei1 <[email protected]>

* refine torch method2

Signed-off-by: Zhang, Weiwei1 <[email protected]>

* refine torch methods

Signed-off-by: Zhang, Weiwei1 <[email protected]>

* fix progress bar

Signed-off-by: Zhang, Weiwei1 <[email protected]>

---------

Signed-off-by: Zhang, Weiwei1 <[email protected]>
WeiweiZhang1 authored Aug 11, 2023
1 parent d4baed9 commit c83d01d
Showing 11 changed files with 116 additions and 100 deletions.
@@ -761,20 +761,23 @@ def eval_func(model):
        acc, _ = eval_func(model)
        logger.info(f"total_steps:{args.max_pruning_steps} accuracy:{acc}")
    else:
-        logger.info(f"***** Running Evaluation before ffn auto slim*****")
-        accuracy, avg_latency = eval_func(model)
-        logger.info(f"accuracy:{accuracy} avg_latency:{avg_latency}")
-        model = model_slim(model, round_multiplier=32)
-
-        logger.info(f"***** Running Evaluation after ffn auto_slim*****")
-        accuracy, avg_latency = eval_func(model)
-        logger.info(f"accuracy:{accuracy} avg_latency:{avg_latency}")
-
-        if args.output_dir is not None:
-            accelerator.wait_for_everyone()
-            traced_model = trace_model(model, tokenizer)
-            logger.info(f"Save silmed jit model")
-            torch.jit.save(traced_model, args.output_dir+"/slimed_jit_model.pt")
+        if 'bloom' not in model_name:
+            logger.info(f"***** Running Evaluation before ffn auto slim*****")
+            accuracy, avg_latency = eval_func(model)
+            logger.info(f"accuracy:{accuracy} avg_latency:{avg_latency}")
+            model = model_slim(model, round_multiplier=32)
+
+            logger.info(f"***** Running Evaluation after ffn auto_slim*****")
+            accuracy, avg_latency = eval_func(model)
+            logger.info(f"accuracy:{accuracy} avg_latency:{avg_latency}")
+
+            if args.output_dir is not None:
+                accelerator.wait_for_everyone()
+                traced_model = trace_model(model, tokenizer)
+                logger.info(f"Save silmed jit model")
+                torch.jit.save(traced_model, args.output_dir+"/slimed_jit_model.pt")
+        else:
+            logger.info(f"Trace on BLOOM MODEL is not supported yet.")


    if args.with_tracking:
@@ -783,3 +786,4 @@ def eval_func(model):

if __name__ == "__main__":
main()
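
The hunk above guards TorchScript tracing by model family. A minimal sketch of the same guard, assuming an illustrative trace_and_save helper and dummy inputs (neither is the script's own code):

import torch

def trace_and_save(model, tokenizer, model_name, output_dir):
    # Skip BLOOM checkpoints: tracing them is not supported yet,
    # which is exactly what the guard above logs.
    if 'bloom' in model_name:
        print("Trace on BLOOM MODEL is not supported yet.")
        return
    # Trace with a representative input; strict=False tolerates
    # dict-style HuggingFace model outputs.
    example = tokenizer("a sample sentence", return_tensors="pt")
    traced = torch.jit.trace(model, (example["input_ids"],), strict=False)
    torch.jit.save(traced, output_dir + "/slimed_jit_model.pt")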

@@ -1,9 +1,10 @@

#!/bin/bash

set -x

# Set environment
-CUBLAS_WORKSPACE_CONFIG=':4096:8'
+export CUBLAS_WORKSPACE_CONFIG=':4096:8'


CUDA_VISIBLE_DEVICES=4 python \
@@ -14,14 +15,13 @@ CUDA_VISIBLE_DEVICES=4 python \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 16 \
--max_pruning_steps 3002 \
- --weight_decay 0 \
- --block_size 512 \
+ --max_length 512 \
--do_prune \
--auto_config \
--auto_slim \
--output_dir ./sparse_model \
--target_sparsity 0.1 \
--pruning_pattern channelx1 \
--pruning_frequency 500


@@ -1,9 +1,7 @@
#!/bin/bash

set -x
-et -x

# Set environment
-CUBLAS_WORKSPACE_CONFIG=':4096:8'
+export CUBLAS_WORKSPACE_CONFIG=':4096:8'


CUDA_VISIBLE_DEVICES=4 python \
@@ -14,13 +12,14 @@ CUDA_VISIBLE_DEVICES=4 python \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 16 \
--max_pruning_steps 3002 \
- --weight_decay 0 \
- --block_size 512 \
+ --max_length 512 \
--do_prune \
--auto_config \
--auto_slim \
--output_dir ./sparse_model \
--target_sparsity 0.1 \
--pruning_pattern channelx1 \
--pruning_frequency 500


@@ -3,7 +3,7 @@
set -x

# Set environment
-CUBLAS_WORKSPACE_CONFIG=':4096:8'
+export CUBLAS_WORKSPACE_CONFIG=':4096:8'


CUDA_VISIBLE_DEVICES=4 python \
@@ -14,14 +14,15 @@ CUDA_VISIBLE_DEVICES=4 python \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 16 \
--max_pruning_steps 3002 \
- --weight_decay 0 \
- --block_size 512 \
+ --max_length 512 \
--do_prune \
--auto_config \
--auto_slim \
--output_dir ./sparse_model \
--target_sparsity 0.1 \
--pruning_pattern channelx1 \
--pruning_frequency 500



@@ -2,15 +2,15 @@
set -x

# Set environment
-CUBLAS_WORKSPACE_CONFIG=':4096:8'
+export CUBLAS_WORKSPACE_CONFIG=':4096:8'

# Available Models

# Common Large Language Models(LLMs), e.g. OPT, GPT, LLaMA, BLOOM, Dolly, MPT, Falcon, Stable-LM, LaMini-LM, etc.

#cd neural-compressor
python examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py \
- --model_name_or_path /PATH/TO/LLM/
+ --model_name_or_path /PATH/TO/LLM/ \
--calibration_dataset_name wikitext-2-raw-v1 \
--evaluation_dataset_name lambada \
--per_device_train_batch_size 1 \
@@ -23,3 +23,4 @@ python examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py \
--target_sparsity 0.5 \
--pruning_pattern 1x1


@@ -3,7 +3,7 @@
set -x

# Set environment
-CUBLAS_WORKSPACE_CONFIG=':4096:8'
+export CUBLAS_WORKSPACE_CONFIG=':4096:8'


CUDA_VISIBLE_DEVICES=4 python \
@@ -14,14 +14,15 @@ CUDA_VISIBLE_DEVICES=4 python \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 16 \
--max_pruning_steps 3002 \
- --weight_decay 0 \
- --block_size 512 \
+ --max_length 512 \
--do_prune \
--auto_config \
--auto_slim \
--output_dir ./sparse_model \
--target_sparsity 0.1 \
--pruning_pattern channelx1 \
--pruning_frequency 500



4 changes: 2 additions & 2 deletions neural_compressor/compression/pruner/patterns/ninm.py
@@ -20,7 +20,6 @@
SparsityInfo,
ProgressivePatternUtils)
from ..utils import logger, torch, tf, nn
-import transformers


@register_pattern('ptN:M')
@@ -394,7 +393,7 @@ def update_progressive_masks(self, pre_masks, cur_masks, scores, progressive_ste
progressive_step, progressive_configs)

    def fasterprune(self, gpt, blocksize=128, percdamp=.01):
-        """"""
+        import transformers
W = gpt.module.weight.data.clone()
dev = gpt.dev
rows = gpt.rows
@@ -465,3 +464,4 @@ def fasterprune(self, gpt, blocksize=128, percdamp=.01):
if torch.cuda.is_available():
torch.cuda.empty_cache()
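
The ninm.py hunks above carry the "pruning import issue" fix: the module-level import transformers moves inside fasterprune, so importing the pattern module no longer requires transformers to be installed. A minimal sketch of this deferred-import pattern with an illustrative class (the Conv1D check mirrors what fasterprune needs transformers for):

class PatternSketch:
    """Importing this module succeeds even without transformers."""

    def fasterprune(self, module):
        # Deferred import: the dependency is resolved when fasterprune
        # runs, not when the enclosing module is imported.
        import transformers
        weight = module.weight.data.clone()
        if isinstance(module, transformers.Conv1D):
            # GPT-style Conv1D stores its weight transposed.
            weight = weight.t()
        return weight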


3 changes: 2 additions & 1 deletion neural_compressor/compression/pruner/patterns/nxm.py
@@ -24,7 +24,6 @@
ProgressivePatternUtils)

from ..utils import logger, torch, tf, nn
-import transformers

@register_pattern('ptNxM')
class PytorchPatternNxM(PytorchBasePattern):
@@ -447,6 +446,7 @@ def update_progressive_masks(self, pre_masks, cur_masks, scores, progressive_ste
raise NotImplementedError

def fasterprune(self, gpt, blocksize=128, percdamp=.01):
+        import transformers
sparsity = self.target_sparsity_ratio
W = gpt.module.weight.data.clone()
dev = gpt.dev
@@ -815,3 +815,4 @@ def get_masks_global(self, scores, cur_target_sparsity_ratio, pre_masks,
logger.info(f'{key} sparsity is {layer_ratio}')
return masks


16 changes: 9 additions & 7 deletions neural_compressor/compression/pruner/pruners/sparse_gpt.py
@@ -4,7 +4,6 @@
from ..patterns import get_pattern
from ..criteria import get_criterion
from ..regs import get_reg
-import transformers
from ..utils import logger, torch, nn
import gc
import math
@@ -42,9 +41,10 @@ def _init(self):
logger.warning("sparse_gpt pruner fixed the weights, Please DO NOT train or update gradients.")
assert "1x1" in self.pattern.pattern or ":" in self.pattern.pattern, \
"sparse_gpt pruner type only supports 1x1 and N:M patterns."

class SparseGPT():
def __init__(self, module):
+        import transformers
self.module = module
self.dev = self.module.weight.device
W = module.weight.data.clone()
@@ -58,6 +58,7 @@ def __init__(self, module):
self.nsamples = 0

def add_batch(self, inp, blocksize=1024):
+        import transformers
if len(inp.shape) == 2:
inp = inp.unsqueeze(0)
sample_num = inp.shape[0] # batchsize
@@ -83,10 +84,11 @@ def tmp(_, inp):
handles.append(module.register_forward_pre_hook(add_batch(self.gpts[name])))
return handles

-    @torch.no_grad()
    def fasterprune(self, op_names):
-        for name in op_names:
-            logger.info(f"module: {name}\t target ratio: {self.target_sparsity_ratio}")
-            module = self.modules[name]
-            self.pattern.fasterprune(self.gpts[name]) # is there necessary to add a hyperparameter of blocksize
+        with torch.no_grad():
+            for name in op_names:
+                logger.info(f"module: {name}\t target ratio: {self.target_sparsity_ratio}")
+                module = self.modules[name]
+                self.pattern.fasterprune(self.gpts[name]) # is there necessary to add a hyperparameter of blocksize
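
The sparse_gpt.py hunks above (the "refine torch method" commits) swap the @torch.no_grad() decorator for a with torch.no_grad(): block. Both disable gradient tracking for the body, but the decorator evaluates torch.no_grad() while the class body executes, i.e. at import time; the context manager defers that to call time, which matters when torch itself is loaded lazily. A minimal sketch of the two equivalent spellings:

import torch

class Decorated:
    @torch.no_grad()             # evaluated at class-definition time
    def prune(self, w):
        w.mul_(0.5)              # in-place update, no autograd history

class ContextManaged:
    def prune(self, w):
        with torch.no_grad():    # evaluated only when prune() is called
            w.mul_(0.5)

w = torch.ones(3, requires_grad=True)
ContextManaged().prune(w)        # allowed: no gradient is recorded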


55 changes: 30 additions & 25 deletions neural_compressor/compression/pruner/pruning.py
@@ -20,7 +20,7 @@
from neural_compressor.compression.pruner.pruners import get_pruner
from neural_compressor.compression.pruner.utils import logger, torch, collect_layer_inputs, get_layers
from typing import Optional
-from tqdm.auto import tqdm

PRUNINGS = {}

def register_pruning(name):
@@ -182,36 +182,36 @@ def _prepare_pruners(self):
self._model = self._model.to(self.model_dev)
# TODO add get_sparsity_ratio() for sparseGPT

-    @torch.no_grad()
    def _do_pruning(self):
+        from tqdm.auto import tqdm
        layers = self._layers
        self._model = self._model.cpu()
        inputs, inp_dict = collect_layer_inputs(model=self._model, layers=layers, layer_idx=0,
                                                layer_inputs=self._dataloader, device=self.dev)
        if 'cuda' in self.dev.type:
            torch.cuda.empty_cache()

-        for i in tqdm(range(len(layers))):
-            layer = layers[i].to(self.dev)
-            layer_index_str = '.' + str(i) + '.'
-            handles_list = []
-            for pruner in self.pruners:
-                layer_op_names = [key for key in pruner.modules.keys() if layer_index_str in key]
-                handles_list.append(pruner.register_gpt_hook(layer_op_names))
-            for j in range(len(inputs)):
-                layer(inputs[j], **inp_dict)[0]
-            for handles in handles_list:
-                for h in handles:
-                    h.remove()
-            for pruner in self.pruners:
-                layer_op_names = [key for key in pruner.modules.keys() if layer_index_str in key]
-                pruner.fasterprune(layer_op_names)
-            for j in range(len(inputs)):
-                # the weights of current layer have been pruned, get the latest outputs as the inputs for next layer
-                inputs[j] = layer(inputs[j], **inp_dict)[0]
-            layers[i] = layer.cpu()
-            if 'cuda' in self.dev.type:
-                torch.cuda.empty_cache()
+        with torch.no_grad():
+            for i in tqdm(range(len(layers))):
+                layer = layers[i].to(self.dev)
+                layer_index_str = '.' + str(i) + '.'
+                handles_list = []
+                for pruner in self.pruners:
+                    layer_op_names = [key for key in pruner.modules.keys() if layer_index_str in key]
+                    handles_list.append(pruner.register_gpt_hook(layer_op_names))
+                for j in range(len(inputs)):
+                    layer(inputs[j], **inp_dict)[0]
+                for handles in handles_list:
+                    for h in handles:
+                        h.remove()
+                for pruner in self.pruners:
+                    layer_op_names = [key for key in pruner.modules.keys() if layer_index_str in key]
+                    pruner.fasterprune(layer_op_names)
+                for j in range(len(inputs)):
+                    # the weights of current layer have been pruned, get the latest outputs as the inputs for next layer
+                    inputs[j] = layer(inputs[j], **inp_dict)[0]
+                layers[i] = layer.cpu()
+                if 'cuda' in self.dev.type:
+                    torch.cuda.empty_cache()

def on_train_begin(self, dataloader): # pragma: no cover
if self._dataloader is not None:
@@ -240,7 +240,7 @@ def _prepare_pruners(self):
get_sparsity_ratio(self.pruners, self._model)

def _do_pruning(self):
-        progress_bar = tqdm(range(len(self._dataloader.dataset)))
+        from tqdm.auto import tqdm
+        length = len(self._dataloader.dataset)
+        if self._dataloader.batch_sampler is not None:
+            length = len(self._dataloader.batch_sampler)
+        progress_bar = tqdm(range(length))
if self._loss_func is not None:
for inputs, target in self._dataloader:
self.on_step_begin()
@@ -269,3 +273,4 @@ def _do_pruning(self):
# self._prepare_pruners()
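
The last hunk is the "fix progress bar" commit: the training loop runs once per batch, but the bar was sized with len(dataset), i.e. once per sample, so it could never complete. Sizing it with len(batch_sampler) matches the iteration count. A small self-contained illustration (shapes and sizes are arbitrary):

import torch
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm

dataset = TensorDataset(torch.randn(100, 8))
dataloader = DataLoader(dataset, batch_size=16)

length = len(dataloader.dataset)               # 100 samples
if dataloader.batch_sampler is not None:
    length = len(dataloader.batch_sampler)     # 7 batches = ceil(100/16)

progress_bar = tqdm(range(length))
for (batch,) in dataloader:                    # iterates 7 times
    progress_bar.update(1)                     # bar ends exactly at 100%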


