diff --git a/mindspeed_llm/features_manager/__init__.py b/mindspeed_llm/features_manager/__init__.py
index 7a4a7bafb52d7ce14ce68c6862818eeb739d69f5..9a4d08b2740ca57373d6ecb6cfbae9530aabfcda 100644
--- a/mindspeed_llm/features_manager/__init__.py
+++ b/mindspeed_llm/features_manager/__init__.py
@@ -8,7 +8,8 @@
 from mindspeed_llm.features_manager.models.mamba import MambaModel
 from mindspeed_llm.features_manager.communication.coc import AscendCocFeature
 from mindspeed_llm.features_manager.common.data import DataFeature
-from mindspeed_llm.features_manager.common.moe_router import MOERouter
+from mindspeed_llm.features_manager.common.moe_router import MoERouter
+from mindspeed_llm.features_manager.common.moe_tp_extend_ep import MoETpExtendEp
 
 FEATURES_LIST = [
     # MindSpeed Legacy Features
@@ -24,7 +25,8 @@ FEATURES_LIST = [
     RotaryPositionEmbeddingFeature(),
     LanguageModelEmbeddingFeature(),
     MambaModel(),
-    MOERouter(),
+    MoERouter(),
+    MoETpExtendEp(),
     AscendCocFeature()
 
     # MindSpeed-LLM Legacy Features
diff --git a/mindspeed_llm/features_manager/common/moe_router.py b/mindspeed_llm/features_manager/common/moe_router.py
index 156f6934c349d6f69ccaf517b3270767aa960328..74aef5bc57cffca3f6f3dd87c52570a35a9a30b8 100644
--- a/mindspeed_llm/features_manager/common/moe_router.py
+++ b/mindspeed_llm/features_manager/common/moe_router.py
@@ -1,9 +1,9 @@
 from mindspeed.features_manager.feature import MindSpeedFeature
 
 
-class MOERouter(MindSpeedFeature):
+class MoERouter(MindSpeedFeature):
     def __init__(self):
-        super(MOERouter, self).__init__(feature_name="moe_router", optimization_level=0)
+        super(MoERouter, self).__init__(feature_name="moe_router", optimization_level=0)
 
     def register_args(self, parser):
         group = parser.add_argument_group(title=self.feature_name)
diff --git a/mindspeed_llm/features_manager/common/moe_tp_extend_ep.py b/mindspeed_llm/features_manager/common/moe_tp_extend_ep.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8c089d406de300d2a16736543fad96bf869dc0a
--- /dev/null
+++ b/mindspeed_llm/features_manager/common/moe_tp_extend_ep.py
@@ -0,0 +1,18 @@
+from mindspeed.features_manager.feature import MindSpeedFeature
+
+
+class MoETpExtendEp(MindSpeedFeature):
+    def __init__(self):
+        super(MoETpExtendEp, self).__init__(feature_name="moe_tp_extend_ep", optimization_level=0)
+
+    def register_args(self, parser):
+        group = parser.add_argument_group(title=self.feature_name)
+        group.add_argument("--moe-tp-extend-ep", action='store_true',
+                           help="use tp group to extend expert parallelism instead of sharding weight tensors of experts in tp group")
+
+    def register_patches(self, patch_manager, args):
+        # For moe tp extend ep ckpt
+        if args.moe_tp_extend_ep:
+            from mindspeed.core.transformer.moe.moe_layer import base_moe_init_wrapper
+            patch_manager.register_patch('megatron.core.transformer.moe.moe_layer.BaseMoELayer.__init__',
+                                         base_moe_init_wrapper)
diff --git a/mindspeed_llm/tasks/megatron_adaptor.py b/mindspeed_llm/tasks/megatron_adaptor.py
index 5a3c51448614e60099376e84cc0695f81aaaee66..d534ed446a728054f4ebb76cb96201bd97f9ec92 100644
--- a/mindspeed_llm/tasks/megatron_adaptor.py
+++ b/mindspeed_llm/tasks/megatron_adaptor.py
@@ -396,11 +396,6 @@ class CoreAdaptation(MegatronAdaptationABC):
             dualpipe_register_patches(MegatronAdaptation)
 
         args = MegatronAdaptation.get_args()
-        # For moe tp extend ep ckpt
-        if args.moe_tp_extend_ep:
-            from mindspeed.core.transformer.moe.moe_layer import base_moe_init_wrapper
-            MegatronAdaptation.register('megatron.core.transformer.moe.moe_layer.BaseMoELayer.__init__',
-                                        base_moe_init_wrapper)
 
         if args.moe_permutation_async_comm:
             if args.moe_token_dispatcher_type == 'allgather':
diff --git a/mindspeed_llm/training/arguments.py b/mindspeed_llm/training/arguments.py
index c7538a9df0bedfb606ae9e6ccac5980fa6ecf4d0..7801cf823a3d54daa7beea69e8eb2a1e619ad8c4 100644
--- a/mindspeed_llm/training/arguments.py
+++ b/mindspeed_llm/training/arguments.py
@@ -407,8 +407,6 @@ def _add_moe_args(parser):
                        help='moe_alltoall_overlap_comm')
     group.add_argument("--cla-share-factor", type=int, default=1,
                        help="Cross-Layer Attention share kv between cla-share-factor layers")
-    group.add_argument("--moe-tp-extend-ep", action='store_true',
-                       help="use tp group to extend experts parallism instead of sharding weight tensor of experts in tp group")
     group.add_argument("--moe-zero-memory", type=str, default='disable', choices=['disable', 'level0', 'level1'],
                        help="Save activation memory in moe layer.")
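
For reviewers, a minimal sketch of how the relocated flag and patch registration can be exercised in isolation, assuming a hypothetical DummyPatchManager stand-in (the real flow goes through FEATURES_LIST and MindSpeed's patch/adaptation manager):

import argparse

from mindspeed_llm.features_manager.common.moe_tp_extend_ep import MoETpExtendEp


class DummyPatchManager:
    # Hypothetical recorder: collects (target, wrapper) pairs instead of applying patches.
    def __init__(self):
        self.patches = []

    def register_patch(self, target, wrapper):
        self.patches.append((target, wrapper))


feature = MoETpExtendEp()

# The feature now owns its CLI argument, previously registered in _add_moe_args.
parser = argparse.ArgumentParser()
feature.register_args(parser)
args = parser.parse_args(["--moe-tp-extend-ep"])

# The BaseMoELayer.__init__ wrapper is only registered when the flag is set,
# mirroring the branch removed from megatron_adaptor.py.
pm = DummyPatchManager()
feature.register_patches(pm, args)
print(pm.patches)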