diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index db35fc42293ea..652b3b767c942 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -1990,6 +1990,12 @@ def build_transformer_config(self) -> TransformerConfig:
             For attributes in TransformerConfig that are not in the nemo model config, we add custom logic.
         """
 
+        if self.cfg.num_layers % self.cfg.get('pipeline_model_parallel_size', 1) != 0:
+            raise ValueError(
+                f"num_layers ({self.cfg.num_layers}) should be divisible by "
+                f"pipeline_model_parallel_size ({self.cfg.get('pipeline_model_parallel_size', 1)})"
+            )
+
         normalization = self.cfg.get('normalization', 'layernorm').lower()
         layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p'
         if normalization == 'layernorm':
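
For reference, a minimal standalone sketch of the behavior this hunk introduces: transformer layers are partitioned across pipeline-parallel stages, so num_layers must divide evenly by the pipeline size or at least one stage would get an unequal share. The helper name validate_pipeline_split below is hypothetical (not part of NeMo) and only reproduces the same check in isolation:

# Hypothetical standalone illustration (not NeMo code) of the divisibility
# check added in this diff: each pipeline-parallel stage must receive the
# same number of transformer layers.
def validate_pipeline_split(num_layers: int, pipeline_model_parallel_size: int = 1) -> None:
    if num_layers % pipeline_model_parallel_size != 0:
        raise ValueError(
            f"num_layers ({num_layers}) should be divisible by "
            f"pipeline_model_parallel_size ({pipeline_model_parallel_size})"
        )


if __name__ == "__main__":
    # 24 layers over 4 stages: 6 layers per stage, passes silently.
    validate_pipeline_split(num_layers=24, pipeline_model_parallel_size=4)
    try:
        # 24 layers over 5 stages: 24 % 5 != 0, raises ValueError.
        validate_pipeline_split(num_layers=24, pipeline_model_parallel_size=5)
    except ValueError as err:
        print(err)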