Skip to content

Commit

Permalink
fix for 16GB v100 nodes (deepspeedai#393)
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffra authored Sep 10, 2020
1 parent 2dea61f commit b1d4bd7
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 1 deletion.
1 change: 1 addition & 0 deletions deepspeed/runtime/zero/stage2.py
Original file line number Diff line number Diff line change
Expand Up @@ -1585,6 +1585,7 @@ def backward(self, loss, retain_graph=False):
if self.cpu_offload:
torch.cuda.current_stream().wait_stream(self.migration_stream)

#TODO: we need to revisit this and remove the magic 4.5x multiplier here
if self.contiguous_gradients:
self.ipg_buffer = []
buf_0 = torch.empty(int(self.reduce_bucket_size * 4.5),
Expand Down
4 changes: 3 additions & 1 deletion tests/unit/test_fp16.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,9 @@ def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
},
"zero_optimization": {
"stage": zero_stage,
- "cpu_offload": use_cpu_offload
+ "cpu_offload": use_cpu_offload,
+ "reduce_bucket_size": 100,
+ "allgather_bucket_size": 100
}
}
args = args_from_dict(tmpdir, config_dict)
Expand Down

0 comments on commit b1d4bd7

Please sign in to comment.