diff --git a/mindspeed_llm/core/transformer/moe/moe_utils.py b/mindspeed_llm/core/transformer/moe/moe_utils.py
index b50b7c34ad8fccfa0bfa76b60c338e32c91b7ea3..f4e6515a65831674680913e65692191567a984ba 100644
--- a/mindspeed_llm/core/transformer/moe/moe_utils.py
+++ b/mindspeed_llm/core/transformer/moe/moe_utils.py
@@ -200,7 +200,7 @@ def topk_softmax_with_capacity(
                 capacity_probs.T.contiguous(),
                 capacity_indices.T.contiguous(),
             )
-            tokens_per_expert_before_capacity = topk_mask.sum(dim=0)
+            tokens_per_expert = topk_mask.sum(dim=0)
         else:
             # Get exceed mask and maskout exceeded probs and indices
             final_mask = torch.logical_and(topk_mask, capacity_mask)
@@ -210,8 +210,8 @@ def topk_softmax_with_capacity(
             final_indices = top_indices.clone().masked_fill_(
                 exceed_mask, torch.iinfo(torch.long).max
             )
-            tokens_per_expert_before_capacity = topk_mask.sum(dim=0)
-        return final_probs, final_indices, tokens_per_expert_before_capacity
+            tokens_per_expert = final_mask.sum(dim=0)
+        return final_probs, final_indices, tokens_per_expert
 
 
 def track_moe_metrics_wrapper(fn):
diff --git a/mindspeed_llm/features_manager/common/moe_router.py b/mindspeed_llm/features_manager/common/moe_router.py
index 156f6934c349d6f69ccaf517b3270767aa960328..7e9ea6af480e05e37bac5f26884bd1f9cca56f39 100644
--- a/mindspeed_llm/features_manager/common/moe_router.py
+++ b/mindspeed_llm/features_manager/common/moe_router.py
@@ -9,7 +9,7 @@ class MOERouter(MindSpeedFeature):
         group = parser.add_argument_group(title=self.feature_name)
         group.add_argument('--moe-router-load-balancing-type', type=str,
                            choices=['aux_loss', "group_limited_greedy", "softmax_topk", "pai_megatron_aux_loss",
-                                    "sparsemixer_topk", "noaux_tc"],
+                                    "sparsemixer_topk", "noaux_tc", "none"],
                            default='aux_loss',
                            help='Determines the load balancing strategy for the router. "aux_loss" corresponds '
                                 'to the load balancing loss used in GShard and SwitchTransformer, "sinkhorn" corresponds '
@@ -77,8 +77,9 @@ class MOERouter(MindSpeedFeature):
             raise ValueError(f'moe_expert_capacity_factor must be set to use moe_pad_expert_input_to_capacity')
         if args.shared_expert_gate_output_dimension != 1 and args.shared_expert_gate_output_dimension != args.hidden_size:
             raise AssertionError('shared expert gate output dimension can only be configured with 1 or hidden_size')
-        if hasattr(args,
-                   'use_fused_moe_token_permute_and_unpermute') and args.use_fused_moe_token_permute_and_unpermute:
+        if (args.moe_pad_expert_input_to_capacity
+                and hasattr(args, 'use_fused_moe_token_permute_and_unpermute')
+                and args.use_fused_moe_token_permute_and_unpermute):
             raise AssertionError(
                 'moe_expert_capacity_factor mode does not support use_fused_moe_token_permute_and_unpermute')
 
diff --git a/mindspeed_llm/tasks/megatron_adaptor.py b/mindspeed_llm/tasks/megatron_adaptor.py
index d6bf26fa509ec873066775f9a123415cf186ad9f..9214ba6daaf3f95a917ed7e682492a78acc827d5 100644
--- a/mindspeed_llm/tasks/megatron_adaptor.py
+++ b/mindspeed_llm/tasks/megatron_adaptor.py
@@ -531,7 +531,7 @@ class CoreAdaptation(MegatronAdaptationABC):
             'megatron.core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher.token_permutation',
             alltoall_token_permutation)
 
-        if hasattr(args, 'use_fused_moe_token_permute_and_unpermute') and args.use_fused_moe_token_permute_and_unpermute and not args.moe_expert_capacity_factor:
+        if hasattr(args, 'use_fused_moe_token_permute_and_unpermute') and args.use_fused_moe_token_permute_and_unpermute:
             from mindspeed.core.fusions.npu_moe_token_permute import permute_wrapper
             from mindspeed.core.fusions.npu_moe_token_unpermute import unpermute_wrapper
             MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.permute', permute_wrapper)