From 199058ed07a65b304bc31a5555d04f0d569065bf Mon Sep 17 00:00:00 2001
From: luoxiang <sunflowerinaries@gmail.com>
Date: Mon, 13 Jan 2025 17:49:37 +0800
Subject: [PATCH 1/2] Fix dsv3 gate bias: use unbiased scores for routing weights

---
 vllm/model_executor/layers/fused_moe/fused_moe.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 3ea6217d7c0ef..3850ceac7c5ad 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -496,8 +496,9 @@ def grouped_topk(hidden_states: torch.Tensor,
     else:
         raise ValueError(f"Unsupported scoring function: {scoring_func}")
 
+    original_scores = scores
     if e_score_correction_bias is not None:
-        scores.add_(e_score_correction_bias.unsqueeze(0))
+        scores = scores + e_score_correction_bias.unsqueeze(0)
 
     num_token = scores.shape[0]
     group_scores = scores.view(num_token, num_expert_group,
@@ -510,10 +511,12 @@ def grouped_topk(hidden_states: torch.Tensor,
         num_token, num_expert_group,
         scores.shape[-1] // num_expert_group).reshape(num_token, -1)  # [n, e]
     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
-    topk_weights, topk_ids = torch.topk(tmp_scores,
-                                        k=topk,
-                                        dim=-1,
-                                        sorted=False)
+    topk_ids = torch.topk(tmp_scores,
+                            k=topk,
+                            dim=-1,
+                            sorted=False)[1]
+
+    topk_weights = original_scores.gather(1, topk_ids)
 
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

From d9bb640b815ab61a1ad1870e3c5b669a51e1c57c Mon Sep 17 00:00:00 2001
From: mgoin <michael@neuralmagic.com>
Date: Mon, 13 Jan 2025 18:05:04 +0000
Subject: [PATCH 2/2] Add comments and optimize for no-bias case

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 .../layers/fused_moe/fused_moe.py              | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 3850ceac7c5ad..308c1d6ac6db1 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -496,8 +496,10 @@ def grouped_topk(hidden_states: torch.Tensor,
     else:
         raise ValueError(f"Unsupported scoring function: {scoring_func}")
 
-    original_scores = scores
     if e_score_correction_bias is not None:
+        # Store original scores before applying correction bias. We use biased
+        # scores for expert selection but original scores for routing weights
+        original_scores = scores
         scores = scores + e_score_correction_bias.unsqueeze(0)
 
     num_token = scores.shape[0]
@@ -511,12 +513,16 @@ def grouped_topk(hidden_states: torch.Tensor,
         num_token, num_expert_group,
         scores.shape[-1] // num_expert_group).reshape(num_token, -1)  # [n, e]
     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
-    topk_ids = torch.topk(tmp_scores,
-                            k=topk,
-                            dim=-1,
-                            sorted=False)[1]
 
-    topk_weights = original_scores.gather(1, topk_ids)
+    if e_score_correction_bias is not None:
+        topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)[1]
+        # Use original unbiased scores for the routing weights
+        topk_weights = original_scores.gather(1, topk_ids)
+    else:
+        topk_weights, topk_ids = torch.topk(tmp_scores,
+                                            k=topk,
+                                            dim=-1,
+                                            sorted=False)
 
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)