Fix the contiguous_param_buffer bug about bprop overlap and redundant copy after all-gather.
alpha0422 committed Apr 25, 2024
1 parent c562d3d commit 394f401
Showing 1 changed file with 7 additions and 5 deletions.
12 changes: 7 additions & 5 deletions nemo/core/optim/distributed_adam.py
@@ -371,11 +371,13 @@ def init_param_buffer(self) -> None:
                 param._data = buffer_view.view(param.size())
             else:
                 # Preserve memory format for param here, i.e. NHWC tensors
-                param.data.set_(
-                    source=buffer_view,
-                    storage_offset=0,
-                    size=param.size(),
-                    stride=param.stride(),
+                # `param.data.set_()` failed to change storage.
+                # `param.set_()` invalidates bprop hook.
+                param.data = torch.as_strided(
+                    buffer_view,
+                    param.size(),
+                    param.stride(),
+                    storage_offset=buffer_view.storage_offset(),
                 )

     def try_grad_sync(self, params: Iterable[torch.nn.Parameter]) -> None:
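In the old code, `param.data.set_()` did not actually rebind the parameter to the buffer's storage (per the added comment, it "failed to change storage"), so the gathered values still had to be copied into the parameter after the all-gather, while calling `param.set_()` directly would invalidate the bprop hook used to overlap gradient communication with the backward pass. The fix instead assigns a strided alias of the buffer slice to `param.data`, which keeps the Parameter object, and therefore its hooks, intact. Below is a minimal, self-contained sketch of this aliasing pattern, using a made-up 10-element buffer, a 2x3 parameter, and a placeholder hook (none of which are NeMo's actual objects):

    import torch

    # Toy stand-ins (not NeMo's real layout): a 10-element contiguous buffer
    # and one 2x3 parameter whose data should live at storage offset 4.
    buffer = torch.zeros(10)
    buffer_view = buffer[4:10]                     # per-param slice of the buffer
    param = torch.nn.Parameter(torch.randn(2, 3))
    handle = param.register_hook(lambda g: g)      # placeholder for the bprop-overlap hook

    # Rebind the parameter's data to a strided alias of the buffer slice.
    # Assigning `.data` keeps the Parameter object (and its registered hooks)
    # intact, unlike replacing the parameter tensor itself.
    param.data = torch.as_strided(
        buffer_view,
        param.size(),
        param.stride(),
        storage_offset=buffer_view.storage_offset(),
    )

    # Writes into the buffer (e.g. the all-gather output) are now visible
    # through the parameter directly, with no copy back into `param`.
    buffer.fill_(1.0)
    assert bool((param == 1.0).all())

Note that `torch.as_strided` interprets `storage_offset` relative to the underlying storage rather than to `buffer_view` itself, which is why the commit passes `buffer_view.storage_offset()` in place of the old hard-coded 0.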
