diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 98d13a94e721..31e049839dbc 100644 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -2292,7 +2292,7 @@ def _get_gradients_for_reduction(self): expert_grads[key] = [] for param_name, param in self.module.named_parameters(): - if param.grad is None: + if param.grad is None and param.requires_grad: # In cases where there is an imbalance of empty grads across # ranks we must create empty grads, this will ensure that every # rank is reducing the same size. In some cases it may make