Pass FC type along for all FFN types #1196

Merged (4 commits) on May 11, 2024
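In brief: `_validate_config` previously copied `fc_type` into `ffn_config` only in the `mptmlp`/`mptglu` branch, so megablocks and `te_ln_mlp` FFNs never received it; this PR hoists the assignment above the branch so it applies to all FFN types. It also replaces the mutable dict defaults for `attn_config`, `ffn_config`, and `init_config` with `None` plus a deepcopy of the defaults, and adds a gpu-marked construction test for an `mb_moe` model.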
llmfoundry/models/mpt/configuration_mpt.py (23 changes: 15 additions & 8 deletions)
@@ -3,6 +3,7 @@

 """A HuggingFace-style model configuration."""

+import copy
 import warnings
 from typing import Any, Dict, Optional, Union

@@ -55,15 +56,15 @@ def __init__(
         resid_pdrop: float = 0.0,
         emb_pdrop: float = 0.0,
         learned_pos_emb: bool = True,
-        attn_config: Dict = attn_config_defaults,
-        ffn_config: Dict = ffn_config_defaults,
+        attn_config: Optional[Dict] = None,
+        ffn_config: Optional[Dict] = None,
         init_device: str = 'cpu',
         logit_scale: Optional[Union[float, str]] = None,
         no_bias: bool = False,
         embedding_fraction: float = 1.0,
         norm_type: str = 'low_precision_layernorm',
         use_cache: bool = False,
-        init_config: Dict = init_config_defaults,
+        init_config: Optional[Dict] = None,
         fc_type: str = 'torch',
         tie_word_embeddings: bool = True,
         use_pad_tok_in_ffn: bool = True,
@@ -147,15 +148,21 @@ def __init__(
         self.resid_pdrop = resid_pdrop
         self.emb_pdrop = emb_pdrop
         self.learned_pos_emb = learned_pos_emb
-        self.attn_config = attn_config
-        self.ffn_config = ffn_config
+        self.attn_config = attn_config if attn_config is not None else copy.deepcopy(
+            attn_config_defaults,
+        )
+        self.ffn_config = ffn_config if ffn_config is not None else copy.deepcopy(
+            ffn_config_defaults,
+        )
         self.init_device = init_device
         self.logit_scale = logit_scale
         self.no_bias = no_bias
         self.embedding_fraction = embedding_fraction
         self.norm_type = norm_type
         self.use_cache = use_cache
-        self.init_config = init_config
+        self.init_config = init_config if init_config is not None else copy.deepcopy(
+            init_config_defaults,
+        )
         self.fc_type = fc_type
         self.use_pad_tok_in_ffn = use_pad_tok_in_ffn

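A note on the `Optional[Dict] = None` pattern above: a dict default is evaluated once at function definition time, so the old signature shared a single `attn_config_defaults`/`ffn_config_defaults` object across every `MPTConfig` instance, and an in-place mutation by one instance leaked into all later ones. A minimal standalone sketch of the hazard and the fix (illustrative names, not foundry code):

import copy
from typing import Optional

BAD_DEFAULTS = {'ffn_type': 'mptmlp'}
GOOD_DEFAULTS = {'ffn_type': 'mptmlp'}

def bad_config(ffn_config: dict = BAD_DEFAULTS) -> dict:
    # Every no-argument call hands back the SAME dict object.
    return ffn_config

def good_config(ffn_config: Optional[dict] = None) -> dict:
    # Fresh copy per call, mirroring the Optional[Dict] + deepcopy pattern above.
    return ffn_config if ffn_config is not None else copy.deepcopy(GOOD_DEFAULTS)

a = bad_config()
a['ffn_type'] = 'mb_moe'
assert bad_config()['ffn_type'] == 'mb_moe'   # shared default silently mutated

b = good_config()
b['ffn_type'] = 'mb_moe'
assert good_config()['ffn_type'] == 'mptmlp'  # default stays pristine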
@@ -306,14 +313,14 @@ def _validate_config(self) -> None:
                 + 'pip install flash-attn==1.0.6 --no-build-isolation \n' +
                 'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156',
             )
+
+        self.ffn_config['fc_type'] = self.fc_type
         if self.ffn_config['ffn_type'] == 'mptgeglu':
             raise ValueError(
                 'API CHANGE: `ffn_type=="mptgeglu"` changed to `ffn_type=="mptglu"`. '
                 +
                 'See [#829](https://github.com/mosaicml/llm-foundry/pull/829) for details.',
             )
-        elif self.ffn_config['ffn_type'] in ['mptmlp', 'mptglu']:
-            self.ffn_config['fc_type'] = self.fc_type
         elif self.ffn_config['ffn_type'] in ffns_with_megablocks:
             self.ffn_config['return_bias'] = False
         elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
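The behavioral effect of the `_validate_config` change, sketched as a usage check. This assumes `MPTConfig.__init__` runs `_validate_config` (as it does for this HuggingFace-style config) and that an `mb_moe` config is otherwise valid on the host; the `ffn_config` keys mirror the new test below:

from llmfoundry.models.mpt.configuration_mpt import MPTConfig

cfg = MPTConfig(
    fc_type='torch',
    ffn_config={
        'ffn_type': 'mb_moe',
        'ffn_hidden_size': 1024,
        'ffn_act_fn': {'name': 'gelu'},
        'moe_world_size': 1,
    },
)

# Previously only 'mptmlp'/'mptglu' received fc_type; the megablocks
# branch left ffn_config['fc_type'] unset.
assert cfg.ffn_config['fc_type'] == 'torch'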
tests/models/test_model.py (31 changes: 31 additions & 0 deletions)

@@ -979,6 +979,37 @@ def test_mpt_creation(
     assert block.resid_ffn_dropout.p == 0.2


+@pytest.mark.gpu
+def test_mb_mpt_creation():
+    # Test that the config constructs the model as expected.
+    hf_config = MPTConfig(
+        init_device='cpu',
+        d_model=128,
+        n_heads=4,
+        n_layers=2,
+        expansion_ratio=2,
+        max_seq_len=2048,
+        emb_pdrop=0.1,
+        resid_pdrop=0.2,
+        attn_config={
+            'attn_impl': 'torch',
+        },
+        norm_type='low_precision_layernorm',
+        no_bias=True,
+        tie_word_embeddings=False,
+        ffn_config={
+            'ffn_type': 'mb_moe',
+            'ffn_hidden_size': 1024,
+            'ffn_act_fn': {
+                'name': 'gelu',
+            },
+            'moe_world_size': 1,
+        },
+    )
+
+    _ = MPTForCausalLM(hf_config)
+
+
 @pytest.mark.gpu
 @pytest.mark.parametrize('attention_impl', ['flash', 'torch'])
 @pytest.mark.parametrize(
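The new `test_mb_mpt_creation` carries `@pytest.mark.gpu`, presumably because the `mb_moe` path pulls in megablocks' CUDA kernels. Building `MPTForCausalLM` end-to-end doubles as a regression check that the hoisted `fc_type` assignment reaches a megablocks FFN config; on a GPU machine it can be selected with something like `pytest -m gpu -k test_mb_mpt_creation tests/models/test_model.py`.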