diff --git a/deepspeed/pt/deepspeed_light.py b/deepspeed/pt/deepspeed_light.py index 20d680796127..c6e7623b1792 100755 --- a/deepspeed/pt/deepspeed_light.py +++ b/deepspeed/pt/deepspeed_light.py @@ -1013,10 +1013,10 @@ def buffered_allreduce_fallback(self, grads=None, elements_per_buffer=500000000) # rank is reducing the same size. In some cases it may make # sense in the future to support the ability to average not # w.r.t. world size but with a different value. - grads.append( - torch.zeros(param.size(), - dtype=param.dtype, - device=param.device)) + param.grad = torch.zeros(param.size(), + dtype=param.dtype, + device=param.device) + grads.append(param.grad.data) else: grad_data = param.grad.data if self.sparse_gradients_enabled(