From 199058ed07a65b304bc31a5555d04f0d569065bf Mon Sep 17 00:00:00 2001 From: luoxiang Date: Mon, 13 Jan 2025 17:49:37 +0800 Subject: [PATCH 1/2] fix dsv3 gate bias error --- vllm/model_executor/layers/fused_moe/fused_moe.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 3ea6217d7c0ef..3850ceac7c5ad 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -496,8 +496,9 @@ def grouped_topk(hidden_states: torch.Tensor, else: raise ValueError(f"Unsupported scoring function: {scoring_func}") + original_scores = scores if e_score_correction_bias is not None: - scores.add_(e_score_correction_bias.unsqueeze(0)) + scores = scores + e_score_correction_bias.unsqueeze(0) num_token = scores.shape[0] group_scores = scores.view(num_token, num_expert_group, @@ -510,10 +511,12 @@ def grouped_topk(hidden_states: torch.Tensor, num_token, num_expert_group, scores.shape[-1] // num_expert_group).reshape(num_token, -1) # [n, e] tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e] - topk_weights, topk_ids = torch.topk(tmp_scores, - k=topk, - dim=-1, - sorted=False) + topk_ids = torch.topk(tmp_scores, + k=topk, + dim=-1, + sorted=False)[1] + + topk_weights = original_scores.gather(1, topk_ids) if renormalize: topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) From d9bb640b815ab61a1ad1870e3c5b669a51e1c57c Mon Sep 17 00:00:00 2001 From: mgoin Date: Mon, 13 Jan 2025 18:05:04 +0000 Subject: [PATCH 2/2] Add comments and optimize for no-bias case Signed-off-by: mgoin --- .../layers/fused_moe/fused_moe.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 3850ceac7c5ad..308c1d6ac6db1 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -496,8 +496,10 @@ def grouped_topk(hidden_states: torch.Tensor, else: raise ValueError(f"Unsupported scoring function: {scoring_func}") - original_scores = scores if e_score_correction_bias is not None: + # Store original scores before applying correction bias. We use biased + # scores for expert selection but original scores for routing weights + original_scores = scores scores = scores + e_score_correction_bias.unsqueeze(0) num_token = scores.shape[0] @@ -511,12 +513,16 @@ def grouped_topk(hidden_states: torch.Tensor, num_token, num_expert_group, scores.shape[-1] // num_expert_group).reshape(num_token, -1) # [n, e] tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e] - topk_ids = torch.topk(tmp_scores, - k=topk, - dim=-1, - sorted=False)[1] - topk_weights = original_scores.gather(1, topk_ids) + if e_score_correction_bias is not None: + topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)[1] + # Use original unbiased scores for the routing weights + topk_weights = original_scores.gather(1, topk_ids) + else: + topk_weights, topk_ids = torch.topk(tmp_scores, + k=topk, + dim=-1, + sorted=False) if renormalize: topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)