Add ZeRO-Offload checkpointing model tests #344

Merged: 4 commits, Sep 2, 2020
Changes from all commits
4 changes: 3 additions & 1 deletion deepspeed/pt/deepspeed_zero_optimizer.py
@@ -166,6 +166,7 @@ def __init__(self,
if dist.get_rank() == 0:
logger.info(f"Reduce bucket size {reduce_bucket_size}")
logger.info(f"Allgather bucket size {allgather_bucket_size}")
logger.info(f"CPU Offload: {cpu_offload}")
# The fused optimizer does all the work. We need this layer for two reasons:
# 1. maintain same user API from apex.fp16_utils
# 2. keep common stuff here in case we need to add new fused optimizer later
@@ -1564,7 +1565,8 @@ def _partition_base_optimizer_state(self, state_key, all_partition_states):
dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions)
return dp_partitions[partition_id]
else:
return all_partition_states[partition_id]
# Assume non-tensor states are not partitioned and equal across ranks, so return first one
return all_partition_states[0]

# Restore base optimizer state from checkpoint by
# 1) Merging optimizer state from checkpoints of all partitions
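For context, a minimal standalone sketch (not taken from this PR; the merge logic and names are simplified) of the behavior this hunk changes: tensor states are merged from all checkpoint shards and re-partitioned, while non-tensor states such as Adam's scalar `step` are assumed identical on every rank, so the first partition's copy is returned instead of indexing by `partition_id`.

```python
# Illustrative sketch only, mimicking the non-tensor branch of
# _partition_base_optimizer_state; the real code flattens and partitions
# tensors via get_data_parallel_partitions.
import torch

def restore_partition_state(all_partition_states, partition_id, num_partitions):
    if torch.is_tensor(all_partition_states[0]):
        merged = torch.cat(all_partition_states)             # merge shards from the checkpoint
        return torch.chunk(merged, num_partitions)[partition_id]
    # Non-tensor state: not partitioned and equal across ranks
    return all_partition_states[0]

# e.g. restore_partition_state([10, 10], partition_id=1, num_partitions=2) -> 10
```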
9 changes: 2 additions & 7 deletions tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json
@@ -3,14 +3,9 @@
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": {
"stage":1
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
"stage": 1
},
"zero_allow_untested_optimizer": true,
"gradient_clipping": 1.0,
"fp16": {
"enabled": true,
9 changes: 2 additions & 7 deletions tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json
@@ -3,17 +3,12 @@
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": {
"stage":2,
"stage": 2,
"reduce_bucket_size": 7000000,
"allgather_bucket_size": 7000000,
"reduce_scatter": true
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
},
"zero_allow_untested_optimizer": true,
"gradient_clipping": 1.0,
"fp16": {
"enabled": true,
21 changes: 21 additions & 0 deletions tests/model/Megatron_GPT2/ds_config_func_bs4_zero2_offload.json
@@ -0,0 +1,21 @@
{
"train_batch_size": 4,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": {
"stage": 2,
"reduce_bucket_size": 7000000,
"allgather_bucket_size": 7000000,
"reduce_scatter": true,
"cpu_offload": true
},
"zero_allow_untested_optimizer": true,
"gradient_clipping": 1.0,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
}
}
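The new offload configs (like the one above) drop the "optimizer" block and instead set "zero_allow_untested_optimizer": true, so the client constructs its own optimizer and hands it to DeepSpeed. A rough usage sketch under that assumption follows; the toy model is a placeholder, the learning rate matches the value removed from the old configs, and the `config` keyword of `deepspeed.initialize` accepting a JSON path reflects the current public API (older releases read the path from `args.deepspeed_config`).

```python
# Hypothetical usage sketch, not part of this PR: wrap a client-side Adam
# optimizer with a ZeRO stage-2 + CPU-offload config like the one above.
import torch
import deepspeed

model = torch.nn.Linear(1024, 1024)
optimizer = torch.optim.Adam(model.parameters(), lr=0.00015)

# "zero_allow_untested_optimizer": true lets ZeRO wrap this externally
# constructed optimizer even though it is not a DeepSpeed fused optimizer.
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    optimizer=optimizer,
    config="ds_config_func_bs4_zero2_offload.json",
)
```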
8 changes: 1 addition & 7 deletions tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json
@@ -3,13 +3,7 @@
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": {
"stage":0
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
"stage": 0
},
"gradient_clipping": 1.0,
"fp16": {
11 changes: 3 additions & 8 deletions tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json
@@ -2,15 +2,10 @@
"train_batch_size": 8,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization":{
"stage":1
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
"zero_optimization": {
"stage": 1
},
"zero_allow_untested_optimizer": true,
"gradient_clipping": 1.0,
"fp16": {
"enabled": true,
10 changes: 2 additions & 8 deletions tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json
@@ -3,17 +3,12 @@
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": {
"stage":2,
"stage": 2,
"reduce_bucket_size": 7000000,
"allgather_bucket_size": 7000000,
"reduce_scatter": true
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
},
"zero_allow_untested_optimizer": true,
"gradient_clipping": 1.0,
"fp16": {
"enabled": true,
@@ -26,5 +21,4 @@
"partition_activations": true,
"contiguous_memory_optimization": true
}

}
25 changes: 25 additions & 0 deletions tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_offload.json
@@ -0,0 +1,25 @@
{
"train_batch_size": 8,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": {
"stage": 2,
"reduce_bucket_size": 7000000,
"allgather_bucket_size": 7000000,
"reduce_scatter": true,
"cpu_offload": true
},
"zero_allow_untested_optimizer": true,
"gradient_clipping": 1.0,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"activation_checkpointing": {
"partition_activations": true,
"contiguous_memory_optimization": true
}
}
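The bs8 offload config additionally keeps the "activation_checkpointing" section. This PR does not touch the activation-checkpointing code path; as a rough sketch under the assumption that the documented `deepspeed.checkpointing` API is used, the two JSON knobs map roughly to the runtime setup below (keyword names follow the public docs and may differ across versions; the `num_checkpoints` value is purely illustrative).

```python
# Assumed mapping, not from this PR: "partition_activations" and
# "contiguous_memory_optimization" correspond to activation partitioning
# and contiguous checkpoint buffers in DeepSpeed's activation checkpointing.
import deepspeed

deepspeed.checkpointing.configure(
    None,                            # mpu object, if model parallelism is used
    partition_activations=True,
    contiguous_checkpointing=True,
    num_checkpoints=4,               # illustrative: number of checkpointed blocks
)

# A submodule `block` could then be run under checkpointing as:
# hidden = deepspeed.checkpointing.checkpoint(block, hidden)
```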
10 changes: 2 additions & 8 deletions tests/model/Megatron_GPT2/ds_config_func_scheduler.json
@@ -3,14 +3,9 @@
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": {
"stage":2
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
"stage": 2
},
"zero_allow_untested_optimizer": true,
"gradient_clipping": 1.0,
"scheduler": {
"type": "WarmupLR",
@@ -20,7 +15,6 @@
"warmup_num_steps": 10
}
},

"fp16": {
"enabled": true,
"loss_scale": 0,
5 changes: 4 additions & 1 deletion tests/model/Megatron_GPT2/ds_config_perf_bs16.json
@@ -2,7 +2,10 @@
"train_batch_size": 16,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": 1,
"zero_optimization": {
"stage": 1
},
"zero_allow_untested_optimizer": true,
"disable_allgather": true,
"optimizer": {
"type": "Adam",
3 changes: 2 additions & 1 deletion tests/model/Megatron_GPT2/ds_config_perf_bs32.json
@@ -3,8 +3,9 @@
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": {
"stage":1
"stage": 1
},
"zero_allow_untested_optimizer": true,
"disable_allgather": true,
"optimizer": {
"type": "Adam",
5 changes: 4 additions & 1 deletion tests/model/Megatron_GPT2/ds_config_perf_bs8.json
@@ -2,7 +2,10 @@
"train_batch_size": 8,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": 1,
"zero_optimization": {
"stage": 1
},
"zero_allow_untested_optimizer": true,
"disable_allgather": true,
"optimizer": {
"type": "Adam",
4 changes: 2 additions & 2 deletions tests/model/Megatron_GPT2/ds_gpt2_test.sh
@@ -91,9 +91,9 @@ gpt_options=" \
${ds_opt} \
${zero_opt} \
"

DEEPSPEED_PORT=29600
work_dir="../../../DeepSpeedExamples/Megatron-LM/"
run_cmd="(cd ${work_dir} && deepspeed --num_nodes $nodes --num_gpus $gpus pretrain_gpt2.py ${gpt_options})"
run_cmd="(cd ${work_dir} && deepspeed --master_port ${DEEPSPEED_PORT} --num_nodes $nodes --num_gpus $gpus pretrain_gpt2.py ${gpt_options})"
echo ${run_cmd}
eval ${run_cmd}
