
Commit

attach empty grad to its param to ensure it's copied after reduction (m…
jeffra authored Aug 13, 2020
1 parent 6855ba1 commit e1bea67
Showing 1 changed file with 4 additions and 4 deletions.
8 changes: 4 additions & 4 deletions deepspeed/pt/deepspeed_light.py
@@ -1013,10 +1013,10 @@ def buffered_allreduce_fallback(self, grads=None, elements_per_buffer=500000000)
                     # rank is reducing the same size. In some cases it may make
                     # sense in the future to support the ability to average not
                     # w.r.t. world size but with a different value.
-                    grads.append(
-                        torch.zeros(param.size(),
-                                    dtype=param.dtype,
-                                    device=param.device))
+                    param.grad = torch.zeros(param.size(),
+                                             dtype=param.dtype,
+                                             device=param.device)
+                    grads.append(param.grad.data)
                 else:
                     grad_data = param.grad.data
                     if self.sparse_gradients_enabled(
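
The fix matters because the freshly allocated zero tensor was previously appended to grads without being referenced by the parameter, so whatever was written into it during the buffered allreduce never became visible through param.grad. Below is a minimal sketch of the pattern under simplified assumptions (hypothetical helper names, a flat gradient list, torch.distributed already initialized); it is not the DeepSpeed implementation, which reduces gradients in buffered chunks.

import torch
import torch.distributed as dist

def collect_grads(params):
    # For parameters that received no gradient this step (e.g. unused in the
    # forward pass), attach a zero tensor as param.grad before reduction.
    # Because the tensor handed to the allreduce is the same storage that
    # param.grad points to, the averaged gradient remains visible to the
    # optimizer after reduction.
    grads = []
    for param in params:
        if param.grad is None:
            param.grad = torch.zeros(param.size(),
                                     dtype=param.dtype,
                                     device=param.device)
        grads.append(param.grad.data)
    return grads

def allreduce_and_average(grads):
    # Sum each gradient across ranks in place, then divide by world size so
    # every rank ends up holding the average.
    world_size = dist.get_world_size()
    for g in grads:
        dist.all_reduce(g)
        g.div_(world_size)

With the old code, the standalone torch.zeros(...) tensor was still reduced (so every rank reduced the same number of elements), but nothing pointed back to it from the parameter: param.grad stayed None on ranks where that parameter produced no gradient, and the averaged value was effectively dropped.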

