allocate embed norm only on pp0 #261

Merged (3 commits) on Mar 7, 2022
megatron/mpu/layers.py (6 changes: 4 additions & 2 deletions)

@@ -36,7 +36,7 @@
 from .utils import divide
 from .utils import split_tensor_along_last_dim
 from .utils import VocabUtility
-from megatron import get_args
+from megatron import get_args, mpu
 import deepspeed.runtime.activation_checkpointing.checkpointing as ds_checkpointing


@@ -188,7 +188,9 @@ def __init__(self, num_embeddings, embedding_dim,
         # Allocate weights and initialize.
         args = get_args()

-        if args.use_bnb_optimizer or args.embed_layernorm:
+        # only the first stage embedding runs this class' forward. The head's embedding
+        # does its own thing, so don't waste memory allocating LN weights.
+        if mpu.is_pipeline_first_stage() and (args.use_bnb_optimizer or args.embed_layernorm):
             self.norm = torch.nn.LayerNorm(embedding_dim)

         if args.use_bnb_optimizer:
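To make the pattern concrete, below is a minimal, self-contained sketch of what the change accomplishes: the optional embedding LayerNorm is allocated only when the module lives on the first pipeline-parallel stage, so later stages (including the tied output head, which reuses the embedding weights) never carry unused LN parameters. The mpu.is_pipeline_first_stage() helper is the real Megatron call used in the diff; the SketchEmbedding class, its constructor arguments, and the explicit is_first_stage flag are hypothetical simplifications for illustration, not the actual VocabParallelEmbedding implementation.

import torch


class SketchEmbedding(torch.nn.Module):
    # Illustrative stand-in for Megatron's VocabParallelEmbedding (hypothetical,
    # simplified: no tensor/vocab parallelism).
    def __init__(self, num_embeddings, embedding_dim,
                 is_first_stage, embed_layernorm=False):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.empty(num_embeddings, embedding_dim))
        torch.nn.init.normal_(self.weight, mean=0.0, std=0.02)

        # Mirrors the PR's gating: only pipeline stage 0 runs this forward for the
        # input embedding, so only it needs LayerNorm weights. In Megatron the flag
        # would come from mpu.is_pipeline_first_stage(); here it is passed in so the
        # example runs without a distributed setup.
        self.norm = None
        if is_first_stage and embed_layernorm:
            self.norm = torch.nn.LayerNorm(embedding_dim)

    def forward(self, input_ids):
        out = torch.nn.functional.embedding(input_ids, self.weight)
        if self.norm is not None:
            out = self.norm(out)
        return out


# Stage 0 allocates the norm; any later stage (is_first_stage=False) skips it
# and saves the memory, which is the point of the PR.
emb0 = SketchEmbedding(1000, 64, is_first_stage=True, embed_layernorm=True)
embN = SketchEmbedding(1000, 64, is_first_stage=False, embed_layernorm=True)
print(emb0.norm is not None, embN.norm is None)  # True True
print(emb0(torch.tensor([[1, 2, 3]])).shape)     # torch.Size([1, 3, 64])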