support multi-cards magnitude pruning (#1202)
Signed-off-by: Lv, Kaokao <[email protected]>
Signed-off-by: Xinyu Ye <[email protected]>
Co-authored-by: Xinyu Ye <[email protected]>
lkk12014402 and XinyuYe-Intel authored Sep 1, 2023
1 parent a01ba8b commit 9096188
Showing 7 changed files with 1,998 additions and 0 deletions.
@@ -0,0 +1,107 @@
Step-by-Step
============

# Single GPU

```bash
export CUDA_VISIBLE_DEVICES=0
bash run.sh \
--model_name_or_path=facebook/opt-125m \
--dataset_name=NeelNanda/pile-10k \
--block_size=128 \
--output_dir=./test-clm \
--pruning_type=magnitude \
--pruning_pattern=4x1 \
--pruning_frequency=1000
```

# Multi GPU

We use `accelerate` and DeepSpeed ZeRO Stage-2 to conduct weight magnitude pruning across multiple cards.
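
Before the multi-card setup, a note on what magnitude pruning with a `4x1` pattern does: weights are grouped into small blocks, each block is scored by the magnitude of the weights it contains, and the lowest-scoring blocks are zeroed until the target sparsity is reached. The sketch below only illustrates that idea; the helper name, the block axis, and the thresholding are simplifying assumptions, and the actual implementation lives in `run_clm_no_trainer.py` and the pruning library it uses.

```python
# Illustrative 4x1 block-wise magnitude pruning of one 2-D weight matrix.
# Assumption: a block is 4 consecutive weights along dim 0; the real
# pattern/axis convention is defined by the pruning library.
import torch

def magnitude_prune_4x1(weight: torch.Tensor, target_sparsity: float) -> torch.Tensor:
    out_features, in_features = weight.shape
    assert out_features % 4 == 0, "sketch assumes dim 0 is divisible by 4"
    # Score every 4x1 block by the sum of absolute weights it contains.
    blocks = weight.abs().reshape(out_features // 4, 4, in_features)
    scores = blocks.sum(dim=1)  # shape: (out_features // 4, in_features)
    # Find the score below which `target_sparsity` of the blocks fall,
    # then keep only the blocks above that threshold.
    k = int(scores.numel() * target_sparsity)
    threshold = scores.flatten().kthvalue(k).values if k > 0 else scores.min() - 1
    keep = (scores > threshold).unsqueeze(1).expand_as(blocks).reshape_as(weight)
    return weight * keep

pruned = magnitude_prune_4x1(torch.randn(128, 64), target_sparsity=0.8)
print(f"sparsity: {(pruned == 0).float().mean().item():.2%}")  # roughly 80%
```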

### Accelerate DeepSpeed Plugin

On your machine(s) just run:
```bash
accelerate config
```

and answer the questions asked. It will ask whether you want to use a config file for DeepSpeed; in this example we do, pointing it to `config/zero_stage2_config.json`. Answering the remaining questions produces an accelerate config that is then used automatically to set the proper default options when you run `accelerate launch`.

For instance, the generated config might look like this:

```yaml
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_config_file: config/zero_stage2_config.json
  zero3_init_flag: true
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 2
use_cpu: false
```
with the contents of `config/zero_stage2_config.json` being (note that DeepSpeed requires `train_batch_size` = `train_micro_batch_size_per_gpu` × `gradient_accumulation_steps` × number of GPUs, i.e. 8 × 4 × 2 = 64 for the two-card run below):

```json
{
    "train_batch_size": 64,
    "train_micro_batch_size_per_gpu": 8,
    "gradient_accumulation_steps": 4,
    "fp16": {
        "enabled": true,
        "min_loss_scale": 1,
        "opt_level": "O2"
    },
    "zero_optimization": {
        "stage": 2,
        "offload_param": {
            "device": "cpu"
        },
        "offload_optimizer": {
            "device": "cpu"
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 5e8,
        "contiguous_gradients": true
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "torch_adam": true,
            "adam_w_mode": true
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": 0.0,
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto",
            "warmup_type": "cosine"
        }
    }
}
```

### Pruning

```bash
# example with 2 GPU cards
export CUDA_VISIBLE_DEVICES=0,1
bash run_ds.sh \
--model_name_or_path=facebook/opt-125m \
--dataset_name=NeelNanda/pile-10k \
--block_size=128 \
--output_dir=./test-clm \
--pruning_type=magnitude \
--pruning_pattern=4x1 \
--pruning_frequency=1000
```
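
When the run finishes, you can sanity-check the sparsity that was actually reached. A minimal sketch, assuming the pruned model was exported to `./test-clm` in the standard Hugging Face format (the output path and the restriction to 2-D weights are illustrative assumptions, not guarantees of what the script saves):

```python
# Rough check of the overall weight sparsity in the pruned checkpoint.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("./test-clm")
zeros, total = 0, 0
for param in model.parameters():
    if param.dim() == 2:  # count only the 2-D weight matrices
        zeros += (param == 0).sum().item()
        total += param.numel()
print(f"overall sparsity of 2-D weights: {zeros / total:.2%}")
```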
@@ -0,0 +1,40 @@
{
    "train_batch_size": 64,
    "train_micro_batch_size_per_gpu": 8,
    "gradient_accumulation_steps": 4,
    "fp16": {
        "enabled": true,
        "min_loss_scale": 1,
        "opt_level": "O2"
    },
    "zero_optimization": {
        "stage": 2,
        "offload_param": {
            "device": "cpu"
        },
        "offload_optimizer": {
            "device": "cpu"
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 5e8,
        "contiguous_gradients": true
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "torch_adam": true,
            "adam_w_mode": true
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": 0.0,
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto",
            "warmup_type": "cosine"
        }
    }
}
@@ -0,0 +1,8 @@
accelerate
datasets
sentencepiece
transformers
torch
tqdm
optimum
einops
@@ -0,0 +1,87 @@
#!/bin/bash
set -x

function main {
  init_params "$@"
  run_pruning
}

# init params
function init_params {
  dataset_name="NeelNanda/pile-10k"
  model_name_or_path="facebook/opt-125m"
  output_dir="./test-clm"
  per_device_train_batch_size=8
  block_size=128
  gradient_accumulation_steps=4
  num_train_epochs=3
  target_sparsity=0.8
  pruning_type="magnitude"
  pruning_pattern="4x1"
  pruning_frequency=1000
  for var in "$@"
  do
    case $var in
      --dataset_name=*)
          dataset_name=$(echo $var |cut -f2 -d=)
      ;;
      --model_name_or_path=*)
          model_name_or_path=$(echo $var |cut -f2 -d=)
      ;;
      --output_dir=*)
          output_dir=$(echo $var |cut -f2 -d=)
      ;;
      --per_device_train_batch_size=*)
          per_device_train_batch_size=$(echo $var |cut -f2 -d=)
      ;;
      --block_size=*)
          block_size=$(echo $var |cut -f2 -d=)
      ;;
      --gradient_accumulation_steps=*)
          gradient_accumulation_steps=$(echo $var |cut -f2 -d=)
      ;;
      --num_train_epochs=*)
          num_train_epochs=$(echo $var |cut -f2 -d=)
      ;;
      --target_sparsity=*)
          target_sparsity=$(echo $var |cut -f2 -d=)
      ;;
      --pruning_type=*)
          pruning_type=$(echo $var |cut -f2 -d=)
      ;;
      --pruning_pattern=*)
          pruning_pattern=$(echo $var |cut -f2 -d=)
      ;;
      --pruning_frequency=*)
          pruning_frequency=$(echo $var |cut -f2 -d=)
      ;;
      *)
          echo "Error: No such parameter: ${var}"
          exit 1
      ;;
    esac
  done
}

# run pruning
function run_pruning {
  python run_clm_no_trainer.py \
      --dataset_name $dataset_name \
      --model_name_or_path $model_name_or_path \
      --block_size $block_size \
      --per_device_train_batch_size $per_device_train_batch_size \
      --gradient_accumulation_steps $gradient_accumulation_steps \
      --output_dir $output_dir \
      --do_prune \
      --pruning_type $pruning_type \
      --num_train_epochs $num_train_epochs \
      --target_sparsity $target_sparsity \
      --pruning_pattern $pruning_pattern \
      --pruning_frequency $pruning_frequency
}

main "$@"