support multi-cards magnitude pruning (#1202)
Signed-off-by: Lv, Kaokao <[email protected]>
Signed-off-by: Xinyu Ye <[email protected]>
Co-authored-by: Xinyu Ye <[email protected]>
1 parent a01ba8b, commit 9096188
Showing 7 changed files with 1,998 additions and 0 deletions.
107 changes: 107 additions & 0 deletions
...es/pytorch/nlp/huggingface_models/language-modeling/pruning/magnitude/README.md
Step-by-Step
============

This example shows how to run weight magnitude pruning for causal language modeling on a single GPU or on multiple GPUs (cards).

# Single GPU

```
export CUDA_VISIBLE_DEVICES=0
bash run.sh \
    --model_name_or_path=facebook/opt-125m \
    --dataset_name=NeelNanda/pile-10k \
    --block_size=128 \
    --output_dir=./test-clm \
    --pruning_type=magnitude \
    --pruning_pattern=4x1 \
    --pruning_frequency=1000
```
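
For intuition about `--pruning_pattern=4x1` and `--pruning_frequency`: every `pruning_frequency` training steps the pruning mask is updated, scoring weights in small fixed-shape blocks by magnitude and zeroing the lowest-scoring blocks until the target sparsity is reached. The snippet below is a minimal PyTorch sketch of block-wise magnitude masking; it groups four consecutive weights along the input dimension purely for illustration (the library's actual block orientation, scoring, and schedule may differ), and it is not the implementation the scripts use.

```
import torch

def block_magnitude_prune(weight: torch.Tensor, sparsity: float, block: int = 4) -> torch.Tensor:
    """Zero the lowest-magnitude blocks of `block` consecutive weights (illustrative only)."""
    rows, cols = weight.shape
    assert cols % block == 0, "weight width must be divisible by the block size"
    blocks = weight.reshape(rows, cols // block, block)  # group consecutive weights
    scores = blocks.abs().sum(dim=-1)                    # one magnitude score per block
    k = max(int(scores.numel() * sparsity), 1)           # number of blocks to drop
    threshold = scores.flatten().kthvalue(k).values      # k-th smallest block score
    mask = (scores > threshold).unsqueeze(-1)            # keep blocks above the threshold
    return (blocks * mask).reshape(rows, cols)

w = torch.randn(8, 16)
w_pruned = block_magnitude_prune(w, sparsity=0.8)
print((w_pruned == 0).float().mean())  # roughly 0.8 of the weights are now zero
```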

# Multi GPU

We use `accelerate` with DeepSpeed ZeRO Stage-2 to conduct weight magnitude pruning on multiple cards.

### Accelerate DeepSpeed Plugin

On your machine(s), just run:

```
accelerate config
```

Answer the questions it asks. It will ask whether you want to use a config file for DeepSpeed, to which you should answer no; then answer the following questions to generate a basic DeepSpeed config. This generates a config file that `accelerate launch` uses automatically to set the proper default options.

For instance:

```
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_config_file: config/zero_stage2_config.json
  zero3_init_flag: true
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 2
use_cpu: false
```
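
Note that `num_processes` should match the number of GPUs you make visible; here it is 2, matching `CUDA_VISIBLE_DEVICES=0,1` in the pruning step below.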
with the contents of `config/zero_stage2_config.json` being:

```
{
    "train_batch_size": 64,
    "train_micro_batch_size_per_gpu": 8,
    "gradient_accumulation_steps": 4,
    "fp16": {
        "enabled": true,
        "min_loss_scale": 1,
        "opt_level": "O2"
    },
    "zero_optimization": {
        "stage": 2,
        "offload_param": {
            "device": "cpu"
        },
        "offload_optimizer": {
            "device": "cpu"
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 5e8,
        "contiguous_gradients": true
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "torch_adam": true,
            "adam_w_mode": true
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": 0.0,
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto",
            "warmup_type": "cosine"
        }
    }
}
```
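
DeepSpeed requires `train_batch_size` to equal `train_micro_batch_size_per_gpu × gradient_accumulation_steps × number of processes`; with the values above, 8 × 4 × 2 = 64. A minimal consistency check, assuming the config file sits at the path used above:

```
import json

# Assumes config/zero_stage2_config.json as above and num_processes: 2 from the accelerate config.
with open("config/zero_stage2_config.json") as f:
    cfg = json.load(f)

num_processes = 2
expected = cfg["train_micro_batch_size_per_gpu"] * cfg["gradient_accumulation_steps"] * num_processes
assert cfg["train_batch_size"] == expected, "DeepSpeed will reject an inconsistent batch-size triple"
```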

### Pruning

```
# 2 GPU cards example
export CUDA_VISIBLE_DEVICES=0,1
bash run_ds.sh \
    --model_name_or_path=facebook/opt-125m \
    --dataset_name=NeelNanda/pile-10k \
    --block_size=128 \
    --output_dir=./test-clm \
    --pruning_type=magnitude \
    --pruning_pattern=4x1 \
    --pruning_frequency=1000
```
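
After a run completes, you can sanity-check the achieved sparsity by counting zeroed weights in the model's `Linear` layers. A minimal check, assuming the pruned model was exported to `./test-clm` in the standard Hugging Face format:

```
import torch
from transformers import AutoModelForCausalLM

# Assumes the pruned model was saved to ./test-clm (the --output_dir above).
model = AutoModelForCausalLM.from_pretrained("./test-clm")

zeros, total = 0, 0
for module in model.modules():
    if isinstance(module, torch.nn.Linear):
        zeros += (module.weight == 0).sum().item()
        total += module.weight.numel()

print(f"overall Linear-layer sparsity: {zeros / total:.2%}")
```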
40 changes: 40 additions & 0 deletions
...nlp/huggingface_models/language-modeling/pruning/magnitude/config/zero_stage2_config.json
{
    "train_batch_size": 64,
    "train_micro_batch_size_per_gpu": 8,
    "gradient_accumulation_steps": 4,
    "fp16": {
        "enabled": true,
        "min_loss_scale": 1,
        "opt_level": "O2"
    },
    "zero_optimization": {
        "stage": 2,
        "offload_param": {
            "device": "cpu"
        },
        "offload_optimizer": {
            "device": "cpu"
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 5e8,
        "contiguous_gradients": true
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "torch_adam": true,
            "adam_w_mode": true
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": 0.0,
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto",
            "warmup_type": "cosine"
        }
    }
}
8 changes: 8 additions & 0 deletions
examples/pytorch/nlp/huggingface_models/language-modeling/pruning/magnitude/requirements.txt
accelerate
datasets
sentencepiece
transformers
torch
tqdm
optimum
einops
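
Install these dependencies before running either script, e.g. with `pip install -r requirements.txt` from this example's directory.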
87 changes: 87 additions & 0 deletions
examples/pytorch/nlp/huggingface_models/language-modeling/pruning/magnitude/run.sh
#!/bin/bash
set -x

function main {

    init_params "$@"
    run_pruning

}

# init params
function init_params {
    dataset_name="NeelNanda/pile-10k"
    model_name_or_path="facebook/opt-125m"
    output_dir="./test-clm"
    per_device_train_batch_size=8
    block_size=128
    gradient_accumulation_steps=4
    num_train_epochs=3
    target_sparsity=0.8
    pruning_type="magnitude"
    pruning_pattern="4x1"
    pruning_frequency=1000
    for var in "$@"
    do
        case $var in
            --dataset_name=*)
                dataset_name=$(echo "$var" | cut -f2 -d=)
            ;;
            --model_name_or_path=*)
                model_name_or_path=$(echo "$var" | cut -f2 -d=)
            ;;
            --output_dir=*)
                output_dir=$(echo "$var" | cut -f2 -d=)
            ;;
            --per_device_train_batch_size=*)
                per_device_train_batch_size=$(echo "$var" | cut -f2 -d=)
            ;;
            --block_size=*)
                block_size=$(echo "$var" | cut -f2 -d=)
            ;;
            --gradient_accumulation_steps=*)
                gradient_accumulation_steps=$(echo "$var" | cut -f2 -d=)
            ;;
            --num_train_epochs=*)
                num_train_epochs=$(echo "$var" | cut -f2 -d=)
            ;;
            --target_sparsity=*)
                target_sparsity=$(echo "$var" | cut -f2 -d=)
            ;;
            --pruning_type=*)
                pruning_type=$(echo "$var" | cut -f2 -d=)
            ;;
            --pruning_pattern=*)
                pruning_pattern=$(echo "$var" | cut -f2 -d=)
            ;;
            --pruning_frequency=*)
                pruning_frequency=$(echo "$var" | cut -f2 -d=)
            ;;
            *)
                echo "Error: No such parameter: ${var}"
                exit 1
            ;;
        esac
    done

}

# run pruning
function run_pruning {
    python run_clm_no_trainer.py \
        --dataset_name $dataset_name \
        --model_name_or_path $model_name_or_path \
        --block_size $block_size \
        --per_device_train_batch_size $per_device_train_batch_size \
        --gradient_accumulation_steps $gradient_accumulation_steps \
        --output_dir $output_dir \
        --do_prune \
        --pruning_type $pruning_type \
        --num_train_epochs $num_train_epochs \
        --target_sparsity $target_sparsity \
        --pruning_pattern $pruning_pattern \
        --pruning_frequency $pruning_frequency

}

main "$@"
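
For reference, the flags passed above imply that `run_clm_no_trainer.py` accepts at least the following argument surface. This is a sketch inferred from the invocation, not the script's actual parser; the real script almost certainly defines more options:

```
import argparse

# Inferred from the run.sh invocation above; names and defaults mirror init_params.
parser = argparse.ArgumentParser(description="CLM magnitude-pruning example (sketch)")
parser.add_argument("--dataset_name", default="NeelNanda/pile-10k")
parser.add_argument("--model_name_or_path", default="facebook/opt-125m")
parser.add_argument("--block_size", type=int, default=128)
parser.add_argument("--per_device_train_batch_size", type=int, default=8)
parser.add_argument("--gradient_accumulation_steps", type=int, default=4)
parser.add_argument("--output_dir", default="./test-clm")
parser.add_argument("--do_prune", action="store_true")
parser.add_argument("--pruning_type", default="magnitude")
parser.add_argument("--num_train_epochs", type=int, default=3)
parser.add_argument("--target_sparsity", type=float, default=0.8)
parser.add_argument("--pruning_pattern", default="4x1")
parser.add_argument("--pruning_frequency", type=int, default=1000)
args = parser.parse_args()
```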