From 8fe22e5ec87325bdf86835baa16879a14cb9dea6 Mon Sep 17 00:00:00 2001
From: jzh6229
Date: Thu, 26 Jun 2025 15:15:26 +0800
Subject: [PATCH] fix 2.0.0 deepseek

---
 mindspeed_llm/core/transformer/moe/router.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/mindspeed_llm/core/transformer/moe/router.py b/mindspeed_llm/core/transformer/moe/router.py
index cb0be4170..be8741316 100644
--- a/mindspeed_llm/core/transformer/moe/router.py
+++ b/mindspeed_llm/core/transformer/moe/router.py
@@ -555,6 +555,7 @@ def topk_router_forward(self, input: torch.Tensor):
     """
     args = get_args()
     self.hidden = input.shape[-1]
+    _maintain_float32_expert_bias(self)
 
     # add input_jitter to distinguish whether to use
     if args.input_jitter:
@@ -565,3 +566,15 @@ def topk_router_forward(self, input: torch.Tensor):
         scores, indices = self.routing(logits)
 
     return scores, indices
+
+
+def _maintain_float32_expert_bias(self):
+    """
+    Maintain the expert bias in float32.
+
+    When using bf16/fp16, the expert bias gets converted to lower precision in Float16Module.
+    We keep it in float32 to avoid routing errors when updating the expert_bias.
+    """
+    if hasattr(self, 'expert_bias') and self.expert_bias is not None:
+        if self.expert_bias.dtype != torch.float32:
+            self.expert_bias.data = self.expert_bias.data.to(torch.float32)
--
Gitee
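
Why the float32 re-cast matters: the fp16/bf16 wrapper (Float16Module) downcasts module buffers, and the small per-step adjustments applied to expert_bias for load balancing can fall below bf16 resolution, so they are rounded away and the router's top-k selection drifts. The following is a minimal standalone sketch in plain PyTorch, not MindSpeed-LLM code; the per-step update magnitude of 1e-3, the module attribute access pattern, and the helper name are illustrative assumptions that mirror the patch's _maintain_float32_expert_bias guard.

    import torch

    num_experts = 8
    update = 1e-3  # hypothetical per-step bias adjustment magnitude

    bias_fp32 = torch.ones(num_experts, dtype=torch.float32)
    bias_bf16 = torch.ones(num_experts, dtype=torch.bfloat16)

    for _ in range(1000):
        bias_fp32 += update
        # each 1e-3 step is below the bf16 spacing (~7.8e-3) around 1.0,
        # so the in-place add rounds back to 1.0 and the update is lost
        bias_bf16 += update

    print(bias_fp32[0].item())  # roughly 2.0, as expected
    print(bias_bf16[0].item())  # still 1.0: the accumulated updates vanished

    def maintain_float32_expert_bias(module):
        """Illustrative re-statement of the guard: cast a module's expert_bias
        back to float32 if a Float16Module-style wrapper has downcast it."""
        bias = getattr(module, "expert_bias", None)
        if bias is not None and bias.dtype != torch.float32:
            module.expert_bias.data = bias.data.to(torch.float32)

Running the sketch prints about 2.0 for the float32 accumulator and 1.0 for the bf16 one, which is the routing drift the patch avoids by keeping expert_bias in float32 before it is updated.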