From ecf9a5b91288c40172f731c2bcad59fd130e343f Mon Sep 17 00:00:00 2001
From: shengjiayi
Date: Tue, 17 Jun 2025 14:10:08 +0800
Subject: [PATCH 1/2] drop for expert bias

---
 mindspeed_llm/core/transformer/moe/moe_utils.py     | 6 +++---
 mindspeed_llm/features_manager/common/moe_router.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/mindspeed_llm/core/transformer/moe/moe_utils.py b/mindspeed_llm/core/transformer/moe/moe_utils.py
index b50b7c34a..f4e6515a6 100644
--- a/mindspeed_llm/core/transformer/moe/moe_utils.py
+++ b/mindspeed_llm/core/transformer/moe/moe_utils.py
@@ -200,7 +200,7 @@ def topk_softmax_with_capacity(
                 capacity_probs.T.contiguous(),
                 capacity_indices.T.contiguous(),
             )
-            tokens_per_expert_before_capacity = topk_mask.sum(dim=0)
+            tokens_per_expert = topk_mask.sum(dim=0)
         else:
             # Get exceed mask and maskout exceeded probs and indices
             final_mask = torch.logical_and(topk_mask, capacity_mask)
@@ -210,8 +210,8 @@ def topk_softmax_with_capacity(
             final_indices = top_indices.clone().masked_fill_(
                 exceed_mask, torch.iinfo(torch.long).max
             )
-            tokens_per_expert_before_capacity = topk_mask.sum(dim=0)
-        return final_probs, final_indices, tokens_per_expert_before_capacity
+            tokens_per_expert = final_mask.sum(dim=0)
+        return final_probs, final_indices, tokens_per_expert
 
 
 def track_moe_metrics_wrapper(fn):
diff --git a/mindspeed_llm/features_manager/common/moe_router.py b/mindspeed_llm/features_manager/common/moe_router.py
index 156f6934c..1a239c95b 100644
--- a/mindspeed_llm/features_manager/common/moe_router.py
+++ b/mindspeed_llm/features_manager/common/moe_router.py
@@ -9,7 +9,7 @@ class MOERouter(MindSpeedFeature):
         group = parser.add_argument_group(title=self.feature_name)
         group.add_argument('--moe-router-load-balancing-type', type=str,
                            choices=['aux_loss', "group_limited_greedy", "softmax_topk", "pai_megatron_aux_loss",
-                                    "sparsemixer_topk", "noaux_tc"],
+                                    "sparsemixer_topk", "noaux_tc", "none"],
                            default='aux_loss',
                            help='Determines the load balancing strategy for the router. "aux_loss" corresponds '
                                 'to the load balancing loss used in GShard and SwitchTransformer, "sinkhorn" corresponds '
--
Gitee

From dd208decf6798d54cd93f011ec9fc6d9571abb8e Mon Sep 17 00:00:00 2001
From: shengjiayi
Date: Wed, 18 Jun 2025 21:19:37 +0800
Subject: [PATCH 2/2] drop for expert bias

---
 mindspeed_llm/features_manager/common/moe_router.py | 5 +++--
 mindspeed_llm/tasks/megatron_adaptor.py             | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/mindspeed_llm/features_manager/common/moe_router.py b/mindspeed_llm/features_manager/common/moe_router.py
index 1a239c95b..7e9ea6af4 100644
--- a/mindspeed_llm/features_manager/common/moe_router.py
+++ b/mindspeed_llm/features_manager/common/moe_router.py
@@ -77,8 +77,9 @@ class MOERouter(MindSpeedFeature):
             raise ValueError(f'moe_expert_capacity_factor must be set to use moe_pad_expert_input_to_capacity')
         if args.shared_expert_gate_output_dimension != 1 and args.shared_expert_gate_output_dimension != args.hidden_size:
             raise AssertionError('shared expert gate output dimension can only be configured with 1 or hidden_size')
-        if hasattr(args,
-                   'use_fused_moe_token_permute_and_unpermute') and args.use_fused_moe_token_permute_and_unpermute:
+        if (args.moe_pad_expert_input_to_capacity
+                and hasattr(args, 'use_fused_moe_token_permute_and_unpermute')
+                and args.use_fused_moe_token_permute_and_unpermute):
             raise AssertionError(
                 'moe_expert_capacity_factor mode does not support use_fused_moe_token_permute_and_unpermute')
 
diff --git a/mindspeed_llm/tasks/megatron_adaptor.py b/mindspeed_llm/tasks/megatron_adaptor.py
index d6bf26fa5..9214ba6da 100644
--- a/mindspeed_llm/tasks/megatron_adaptor.py
+++ b/mindspeed_llm/tasks/megatron_adaptor.py
@@ -531,7 +531,7 @@ class CoreAdaptation(MegatronAdaptationABC):
             'megatron.core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher.token_permutation',
             alltoall_token_permutation)
 
-        if hasattr(args, 'use_fused_moe_token_permute_and_unpermute') and args.use_fused_moe_token_permute_and_unpermute and not args.moe_expert_capacity_factor:
+        if hasattr(args, 'use_fused_moe_token_permute_and_unpermute') and args.use_fused_moe_token_permute_and_unpermute:
             from mindspeed.core.fusions.npu_moe_token_permute import permute_wrapper
             from mindspeed.core.fusions.npu_moe_token_unpermute import unpermute_wrapper
             MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.permute', permute_wrapper)
--
Gitee