allenai · Muennighoff · Jun 20, 2024 · Jun 20, 2024 · Jun 20, 2024 · Jun 20, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Added `OLMoE`: Configurations & modeling for training Mixture-of-Experts models.
 - Added support for document masking via flash-attn during training with `--data.generate_doc_lengths`.
 - Added config options for `model.norm_after`, `model.scale_emb_init`, and `auxiliary_loss_multiplier` (used with zloss).
 - Added scripts for running experiments on qk_norm, norm reordering, and zloss.

diff --git a/configs/official/OLMoE-7B-A1B.yaml b/configs/official/OLMoE-7B-A1B.yaml
diff --git a/olmo/config.py b/olmo/config.py
@@ -19,6 +19,7 @@
 
 import numpy as np
 import torch
+import torch.nn.functional as F
 from omegaconf import DictConfig, ListConfig
 from omegaconf import OmegaConf as om
 from omegaconf.errors import OmegaConfBaseException
@@ -198,6 +199,11 @@ class BlockType(StrEnum):
     implementations of operations like attention to imitate the behavior of Llama.
     """
 
+    moe = "moe"
+    """
+    A block for OLMoE-style Mixture-of-Experts models.
+    """
+
 
 class InitFnType(StrEnum):
     mitchell = "mitchell"
@@ -457,6 +463,61 @@ class ModelConfig(BaseConfig):
     See :data:`TrainConfig.precision` instead.
     """
 
+    moe_num_experts: Optional[int] = 8
+    """
+    The number of experts to use in the MoE block.
+    """
+
+    moe_top_k: Optional[int] = 2
+    """
+    The number of experts to select for each token.
+    """
+
+    moe_mlp_impl: Optional[str] = "sparse"
+    """
+    Choose "grouped" for grouped GEMM installable via `pip install git+https://git@github.com/tgale96/grouped_gemm.git@66c7195e35e8c4f22fa6a014037ef511bfa397cb`.
+    """
+
+    moe_log_expert_assignment: Optional[bool] = True
+    """
+    Whether to log the expert assignment.
+    """
+
+    moe_shared_expert: Optional[bool] = False
+    """
+    Whether to have an always-used expert like in [DeepSeekMoE](https://arxiv.org/abs/2401.06066).
+    """
+
+    moe_lbl_in_fp32: Optional[bool] = False
+    """
+    Whether to perform load balancing in FP32.
+    """
+
+    moe_interleave: Optional[bool] = False
+    """
+    Interleave sequential with MoE blocks starting with sequential.
+    """
+
+    moe_loss_weight: Optional[float] = 0.1
+    """
+    The weight to use for the MoE load balancing loss.
+    """
+
+    moe_zloss_weight: Optional[float] = None
+    """
+    Weight for MoE router z-loss where None means no router z-loss. 0.001 is a common value.
+    """
+
+    moe_dropless: Optional[bool] = True
+    """
+    Whether to use [dMoE](https://arxiv.org/abs/2211.15841).
+    """
+
+    moe_capacity_factor: Optional[float] = 1.25
+    """
+    The capacity factor to use in the MoE block. Only applies if not using dMoE.
+    """
+
     scale_emb_init: bool = False
     """
     If ``True``, embeddings are scaled up by ``sqrt(d_model)`` during initialization.
@@ -1273,3 +1334,41 @@ def update_legacy_settings(cls, config: D) -> D:
                 new_config.optimizer = OptimizerConfig.update_legacy_settings(new_config.optimizer)
 
         return new_config
+
+
+def config_to_moe_args(config: ModelConfig) -> Dict[str, Any]:
+    from .model import Activation
+    from megablocks.layers.arguments import Arguments as MoEArgs
+
+    hidden_size = (
+        config.mlp_hidden_size if config.mlp_hidden_size is not None else config.mlp_ratio * config.d_model
+    )
+    act = Activation.build(config)
+    num_layers = config.n_layers // 2 if config.moe_interleave else config.n_layers
+    kwargs = {
+        "activation_fn": F.silu if "swiglu" in config.activation_type.lower() else Activation.build(config),
+        "mlp_type": "glu" if "glu" in config.activation_type.lower() else "mlp",
+        "mlp_impl": config.moe_mlp_impl,
+        "hidden_size": config.d_model,
+        "ffn_hidden_size": int(act.output_multiplier * hidden_size),
+        "moe_num_experts": config.moe_num_experts,
+        "num_layers": num_layers,
+        # Handled by FSDP (https://github.com/databricks/megablocks/issues/57#issuecomment-1854594483)
+        "moe_weight_parallelism": False,
+        "moe_expert_model_parallelism": False,
+        "moe_top_k": config.moe_top_k,
+        "moe_capacity_factor": config.moe_capacity_factor,
+        "moe_loss_weight": config.moe_loss_weight,
+        "device": config.init_device,
+        # Handled by FSDP
+        "bf16": False,
+        "fp16": False,
+        "bias": config.include_bias,
+        "return_bias": False,
+        "shared_expert": config.moe_shared_expert,
+        "moe_lbl_in_fp32": config.moe_lbl_in_fp32,
+    }
+    if config.moe_zloss_weight:
+        kwargs["moe_zloss_weight"] = config.moe_zloss_weight
+
+    return MoEArgs(**kwargs)
diff --git a/olmo/initialization.py b/olmo/initialization.py
@@ -13,9 +13,15 @@ def init_normal(
     # weights
     if init_cutoff_factor is not None:
         cutoff_value = init_cutoff_factor * std
-        nn.init.trunc_normal_(module.weight, mean=0.0, std=std, a=-cutoff_value, b=cutoff_value)
+        if hasattr(module, "weight"):
+            nn.init.trunc_normal_(module.weight, mean=0.0, std=std, a=-cutoff_value, b=cutoff_value)
+        else:
+            nn.init.trunc_normal_(module, mean=0.0, std=std, a=-cutoff_value, b=cutoff_value)
     else:
-        nn.init.normal_(module.weight, mean=0.0, std=std)
+        if hasattr(module, "weight"):
+            nn.init.normal_(module.weight, mean=0.0, std=std)
+        else:
+            nn.init.normal_(module, mean=0.0, std=std)
 
     # biases
     if isinstance(module, nn.Linear) and module.bias is not None: