diff --git a/examples/mcore/qwen2_moe/evaluate_qwen2_57b_a14b_ptd.sh b/examples/mcore/qwen2_moe/evaluate_qwen2_57b_a14b_ptd.sh
index a74dd0a5bc9e314785d95a247922f849e534ca40..ff564828228e4826789012576632bbc26f711c6d 100644
--- a/examples/mcore/qwen2_moe/evaluate_qwen2_57b_a14b_ptd.sh
+++ b/examples/mcore/qwen2_moe/evaluate_qwen2_57b_a14b_ptd.sh
@@ -40,7 +40,6 @@ MOE_ARGS="
     --moe-permutation-async-comm \
     --moe-token-dispatcher-type allgather \
     --moe-layer-freq -1 \
-    --first-k-dense-replace -1 \
 "
 
 torchrun $DISTRIBUTED_ARGS evaluation.py \
diff --git a/examples/mcore/qwen2_moe/generate_qwen2_57b_a14b_ptd.sh b/examples/mcore/qwen2_moe/generate_qwen2_57b_a14b_ptd.sh
index b87784a9a29e37584ba5c0569b805a7221ec6763..8371f4eb9ddbdf7ed481ca253955a91d00847c71 100644
--- a/examples/mcore/qwen2_moe/generate_qwen2_57b_a14b_ptd.sh
+++ b/examples/mcore/qwen2_moe/generate_qwen2_57b_a14b_ptd.sh
@@ -41,7 +41,6 @@ MOE_ARGS="
    --moe-token-dispatcher-type allgather \
    --moe-aux-loss-coeff 0.001 \
    --moe-layer-freq -1 \
-   --first-k-dense-replace -1 \
 "
 
 torchrun $DISTRIBUTED_ARGS inference.py \
diff --git a/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_32k_ptd.sh b/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_32k_ptd.sh
index 66875e32016f6d949e9d87e54a04bb1bf03a1b6c..fb94606bc06d4d1ff7a05b2351746f10caacdc96 100644
--- a/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_32k_ptd.sh
+++ b/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_32k_ptd.sh
@@ -46,7 +46,6 @@ MOE_ARGS="
    --moe-token-dispatcher-type allgather \
    --moe-aux-loss-coeff 0.001 \
    --moe-layer-freq -1 \
-   --first-k-dense-replace -1 \
 "
 
 ROPE_ARGS="
diff --git a/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4k_A3_ptd.sh b/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4k_A3_ptd.sh
index 5deb27966c33084dc5ea30e8ed6b83c5bfc2d165..88cca356c40e22b4107d921ca1d1aa451b16682e 100644
--- a/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4k_A3_ptd.sh
+++ b/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4k_A3_ptd.sh
@@ -49,7 +49,6 @@ MOE_ARGS="
    --moe-permutation-async-comm \
    --moe-alltoall-overlap-comm \
    --moe-layer-freq -1 \
-   --first-k-dense-replace -1 \
 "
 
 OPTIMIZE_ARGS="
diff --git a/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4k_ptd.sh b/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4k_ptd.sh
index 3a6691496dc4ff78c6f8c807d0a62589866e1226..c1ad862fe10facaf04f22ec9e0041a678f02a1c5 100644
--- a/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4k_ptd.sh
+++ b/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4k_ptd.sh
@@ -47,7 +47,6 @@ MOE_ARGS="
    --moe-token-dispatcher-type allgather \
    --moe-aux-loss-coeff 0.001 \
    --moe-layer-freq -1 \
-   --first-k-dense-replace -1 \
 "
 
 OPTIMIZE_ARGS="
diff --git a/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_full.sh b/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_full.sh
index 135747bb1ab160db010edebbc7435e45e435d142..d1a98cd997b3dafb2e78f71865182da86ae7d384 100644
--- a/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_full.sh
+++ b/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_full.sh
@@ -52,7 +52,6 @@ MOE_ARGS="
    --moe-token-dispatcher-type alltoall_seq \
    --moe-aux-loss-coeff 0.001 \
    --moe-layer-freq -1 \
-   --first-k-dense-replace -1 \
 "
 
 OPTIMIZE_ARGS="
diff --git a/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_full_pack.sh b/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_full_pack.sh
index 3bd692271b4ba56f0a8744967902b01483bd81fb..691375aaa97de7d337c0fcc7186cd414b1cce097 100644
--- a/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_full_pack.sh
+++ b/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_full_pack.sh
@@ -58,7 +58,6 @@ MOE_ARGS="
    --moe-token-dispatcher-type alltoall_seq \
    --moe-aux-loss-coeff 0.001 \
    --moe-layer-freq -1 \
-   --first-k-dense-replace -1 \
 "
 
 OPTIMIZE_ARGS="
diff --git a/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_lora.sh b/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_lora.sh
index b39abfe39d4079acc6a90737a55abb7121aef3b7..b7a567be6f9295c34e4a5a70e2d54e8c901a6f96 100644
--- a/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_lora.sh
+++ b/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_lora.sh
@@ -57,7 +57,6 @@ MOE_ARGS="
    --moe-token-dispatcher-type alltoall_seq \
    --moe-aux-loss-coeff 0.001 \
    --moe-layer-freq -1 \
-   --first-k-dense-replace -1 \
 "
 
 OPTIMIZE_ARGS="
diff --git a/mindspeed_llm/core/transformer/transformer_block.py b/mindspeed_llm/core/transformer/transformer_block.py
index 4748d61eb553648a47d4a0e951b80c9d711c2d57..e43c68322ad9a0018a02f30dbb46b1e1bf905c88 100644
--- a/mindspeed_llm/core/transformer/transformer_block.py
+++ b/mindspeed_llm/core/transformer/transformer_block.py
@@ -107,8 +107,8 @@ def _transformer_block_build_layers(self):
     # For deepseek
     if (
             args.num_experts
-            and args.first_k_dense_replace is not None
-            and args.moe_layer_freq is not None
+            and args.first_k_dense_replace
+            and args.moe_layer_freq
     ):
 
         if (
diff --git a/mindspeed_llm/features_manager/pipeline_parallel/noop_layers.py b/mindspeed_llm/features_manager/pipeline_parallel/noop_layers.py
index 49e266011ae79d8b53a28fa119e71dc34def4b94..093365e8a46c28db2ebf5126e8290b35a90ef105 100644
--- a/mindspeed_llm/features_manager/pipeline_parallel/noop_layers.py
+++ b/mindspeed_llm/features_manager/pipeline_parallel/noop_layers.py
@@ -20,6 +20,6 @@ class NoopLayersFeature(MSNoopLayersFeature):
         if getattr(args, self.feature_name, None):
             # Use existing patch: megatron.core.transformer.transformer_block.TransformerBlock._build_layers
             patch_manager.register_patch("megatron.training.training.num_floating_point_operations",
-                                            mindspeed_calc_flop)
+                                         mindspeed_calc_flop)
             patch_manager.register_patch("megatron.core.transformer.moe.moe_utils.track_moe_metrics",
-                                            mindspeed_track_moe_metrics)
+                                         mindspeed_track_moe_metrics)
diff --git a/mindspeed_llm/features_manager/tokenizer/build_tokenizer.py b/mindspeed_llm/features_manager/tokenizer/build_tokenizer.py
index ec7e61daacd06a78766f95bf6ba72ee87b993a20..bb87bff5f5c6dc632cbd939e5cd65ed1f5a596ef 100644
--- a/mindspeed_llm/features_manager/tokenizer/build_tokenizer.py
+++ b/mindspeed_llm/features_manager/tokenizer/build_tokenizer.py
@@ -1,8 +1,6 @@
 # Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved.
 from pathlib import Path
 
-from argparse import ArgumentParser
-
 from mindspeed.features_manager.tokenizer.build_tokenizer import BuildTokenizerFeature as MindSpeedBuildTokenizerFeature
 
 TEMPLATES_DIR = str(
@@ -13,7 +11,7 @@ TEMPLATES_DIR = str(
 
 
 class BuildTokenizerFeature(MindSpeedBuildTokenizerFeature):
-    def register_args(self, parser: ArgumentParser):
+    def register_args(self, parser):
         self.add_parser_argument_choices_value(parser, "--tokenizer-type", 'PretrainedFromHF')
 
         group = parser.add_argument_group(title=self.feature_name)
diff --git a/mindspeed_llm/features_manager/transformer/flash_attention/fusion_attention_feature.py b/mindspeed_llm/features_manager/transformer/flash_attention/fusion_attention_feature.py
index 8626f0608dd304c67119ccc01a04a23195607603..5716dc1ce833152277a24fdc817d3e3f6557c92e 100644
--- a/mindspeed_llm/features_manager/transformer/flash_attention/fusion_attention_feature.py
+++ b/mindspeed_llm/features_manager/transformer/flash_attention/fusion_attention_feature.py
@@ -1,12 +1,10 @@
 # Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved.
-from argparse import ArgumentParser
-
 from mindspeed.features_manager.transformer.flash_attention.fusion_attention_v1_feature import FusionAttentionFeature as MindSpeedFusionAttentionFeature
 
 
 class FusionAttentionFeature(MindSpeedFusionAttentionFeature):
 
-    def register_args(self, parser: ArgumentParser):
+    def register_args(self, parser):
         group = parser.add_argument_group(title='fusion attention')
         group.add_argument('--shape-order', type=str, default='SBH',
                            choices=['SBH', 'BSH', 'BSND', 'BNSD'],
@@ -22,15 +20,15 @@ class FusionAttentionFeature(MindSpeedFusionAttentionFeature):
         group.add_argument('--interleave-sliding-window', type=int,
                            help='Window size when use interleave sliding window attention.')
 
-    def register_patches(self, pm, args):
+    def register_patches(self, patch_manager, args):
         from mindspeed.core.transformer.attention import attention_init
         from mindspeed_llm.core.transformer.custom_dot_product_attention import CustomDotProductAttention
-    
+
         # Attention
         if int(getattr(args, 'context_parallel_size', 1)) < 2:
-            pm.register_patch('megatron.core.transformer.attention.Attention.__init__',
-                              attention_init)
-            pm.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention',
-                              CustomDotProductAttention)
-            pm.register_patch('megatron.core.transformer.custom_layers.transformer_engine.TEDotProductAttention',
-                              CustomDotProductAttention)
+            patch_manager.register_patch('megatron.core.transformer.attention.Attention.__init__',
+                                         attention_init)
+            patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention',
+                                         CustomDotProductAttention)
+            patch_manager.register_patch('megatron.core.transformer.custom_layers.transformer_engine.TEDotProductAttention',
+                                         CustomDotProductAttention)
diff --git a/mindspeed_llm/features_manager/transformer/mtp.py b/mindspeed_llm/features_manager/transformer/mtp.py
index 1fe128faabbcdd5e7e568ae24312c459ca5df22b..3fbc8009c9d38e99a4620cf4f972e7ede1584d0e 100644
--- a/mindspeed_llm/features_manager/transformer/mtp.py
+++ b/mindspeed_llm/features_manager/transformer/mtp.py
@@ -1,4 +1,3 @@
-from argparse import ArgumentParser
 from mindspeed.features_manager.feature import MindSpeedFeature
 
 
@@ -6,7 +5,7 @@ class MultiTokenPredictionFeature(MindSpeedFeature):
     def __init__(self):
         super(MultiTokenPredictionFeature, self).__init__(feature_name="multi-token-prediction", optimization_level=0)
 
-    def register_args(self, parser: ArgumentParser):
+    def register_args(self, parser):
         group = parser.add_argument_group(title=self.feature_name)
 
         group.add_argument('--recompute-mtp-norm', action='store_true', default=False,
@@ -26,9 +25,8 @@ class MultiTokenPredictionFeature(MindSpeedFeature):
             # Use existing patch: megatron.core.models.common.embeddings.language_model_embedding.LanguageModelEmbedding.__init__
             # mtp compatibility
             megatron.core.transformer.multi_token_prediction.LNImpl = PTNorm
-            patch_manager.register_patch(
-                'megatron.core.transformer.multi_token_prediction.MTPLossLoggingHelper.reduce_loss_in_tracker',
-                mtp_reduce_loss_in_tracker)
+            patch_manager.register_patch('megatron.core.transformer.multi_token_prediction.MTPLossLoggingHelper.reduce_loss_in_tracker',
+                                         mtp_reduce_loss_in_tracker)
 
             # mtp memory optimization
             if args.mtp_mem_efficient_logits:
diff --git a/mindspeed_llm/features_manager/transformer/multi_latent_attention/mla_feature.py b/mindspeed_llm/features_manager/transformer/multi_latent_attention/mla_feature.py
index 48a515de4c796c05ceaa1b00e73c3830e450b4a4..3c04d4b0b4313dfe7d7c9d2ab90eae49a1e2b746 100644
--- a/mindspeed_llm/features_manager/transformer/multi_latent_attention/mla_feature.py
+++ b/mindspeed_llm/features_manager/transformer/multi_latent_attention/mla_feature.py
@@ -1,4 +1,3 @@
-from argparse import ArgumentParser, Namespace
 from mindspeed.features_manager.feature import MindSpeedFeature
 
 
@@ -7,7 +6,7 @@ class MLAFeature(MindSpeedFeature):
     def __init__(self):
         super().__init__('multi-latent-attention', optimization_level=2)
 
-    def register_args(self, parser: ArgumentParser):
+    def register_args(self, parser):
         group = parser.add_argument_group(title='multi latent attention')
 
         group.add_argument('--padded-base-length', type=int, default=128,
@@ -27,25 +26,17 @@ class MLAFeature(MindSpeedFeature):
         group.add_argument('--mla-fa-divide-qk', action='store_true', default=False,
                            help='Flash attn support mla with seperate q and k.')
 
-    def validate_args(self, args: Namespace):
+    def validate_args(self, args):
         if args.multi_latent_attention:
             if args.kv_lora_rank is None:
-                raise AssertionError(
-                    'The parameter kv-lora-rank should be '
-                    'set when use multi_head_latent_attention.'
+                raise AssertionError('The parameter kv-lora-rank should be set when use multi_head_latent_attention.'
                 )
             elif args.v_head_dim is None:
-                raise AssertionError(
-                    'The parameter v-head-dim should be '
-                    'set when use multi_head_latent_attention.'
+                raise AssertionError('The parameter v-head-dim should be set when use multi_head_latent_attention.'
                 )
             elif args.qk_pos_emb_head_dim is None:
-                raise AssertionError(
-                    'The parameter qk-pos-emb-head-dim should be '
-                    'set when use multi_head_latent_attention.'
+                raise AssertionError('The parameter qk-pos-emb-head-dim should be set when use multi_head_latent_attention.'
                 )
             elif args.qk_head_dim is None:
-                raise AssertionError(
-                    'The parameter qk-head-dim should be '
-                    'set when use multi_head_latent_attention.'
+                raise AssertionError('The parameter qk-head-dim should be set when use multi_head_latent_attention.'
                 )
diff --git a/mindspeed_llm/features_manager/transformer/transformer_block.py b/mindspeed_llm/features_manager/transformer/transformer_block.py
index c0ad00f57805e30d4da494e0a853f7ff1dceff25..9a71810b7ab1a794b1efe7fadd1060c5556accf4 100644
--- a/mindspeed_llm/features_manager/transformer/transformer_block.py
+++ b/mindspeed_llm/features_manager/transformer/transformer_block.py
@@ -1,4 +1,3 @@
-from argparse import ArgumentParser
 from mindspeed.features_manager.feature import MindSpeedFeature
 
 
@@ -6,11 +5,35 @@ class TransformerBlockFeature(MindSpeedFeature):
     def __init__(self):
         super(TransformerBlockFeature, self).__init__(feature_name="transformer-block", optimization_level=0)
 
-    def register_args(self, parser: ArgumentParser):
+    def register_args(self, parser):
         group = parser.add_argument_group(title=self.feature_name)
 
         group.add_argument('--first-k-dense-replace', type=int, default=None,
-                            help='Set first k layer as dense layer')
+                           help='Set first k layer as dense layer')
+    def validate_args(self, args):
+        if args.num_experts is None:
+            if args.first_k_dense_replace is not None or args.moe_layer_freq is not None:
+                raise AssertionError('First-k-dense-replace and moe-layer-freq must be None when not using MoEs')
+        else:
+            if args.first_k_dense_replace and args.num_layers <= args.first_k_dense_replace:
+                raise AssertionError('Num-layer ({}) must be greater than first-k-dense-replace ({}) when first-k-dense-replace is set.'.format(args.num_layers,
+                                                                                                                                                 args.first_k_dense_replace))
+            if args.first_k_dense_replace and args.pipeline_model_parallel_size > 1:
+                if args.first_k_dense_replace >= args.num_layers // args.pipeline_model_parallel_size:
+                    raise AssertionError('When using first-k-dense-replace, it is not allowed for all layers within a pp stage to be dense layers.')
+            if args.num_experts is not None and args.use_ascend_mc2 and args.moe_grouped_gemm:
+                raise AssertionError('Moe Grouped Gemm is not supported with mc2 in MOE model.')
+
+        if args.num_layer_list:
+            if len(args.num_layer_list.split(',')) != args.pipeline_model_parallel_size:
+                raise ValueError("len(args.num_layer_list) != args.pipeline_model_parallel_size")
+            if not args.pipeline_model_parallel_size > 1:
+                raise ValueError("Dynamic pipeline model should work with pipeline parallel.")
+            if args.num_layers_per_virtual_pipeline_stage:
+                raise ValueError("Dynamic pipeline model and virtual pipeline cannot be enabled at the same time.")
+
+        if args.use_ascend_mc2 and args.use_ascend_coc:
+            raise AssertionError('--mc2 and coc can not be used together')
 
     def register_patches(self, patch_manager, args):
         from mindspeed_llm.core.transformer.transformer_block import _transformer_block_build_layers, transformer_block_init_wrapper, transformer_block_forward
diff --git a/mindspeed_llm/training/arguments.py b/mindspeed_llm/training/arguments.py
index ac61212f12a832732e0b885fce47c1cf63544da3..7a9b77a81ab04904ecfe2165a51d5236dae94b6f 100644
--- a/mindspeed_llm/training/arguments.py
+++ b/mindspeed_llm/training/arguments.py
@@ -1083,8 +1083,6 @@ def _validate_transformer_block_build_layers(args):
         if args.first_k_dense_replace is not None or args.moe_layer_freq is not None:
             raise AssertionError('First-k-dense-replace and moe-layer-freq must be None when not using MoEs')
     else:
-        if (args.first_k_dense_replace is None) != (args.moe_layer_freq is None):
-            raise AssertionError('First-k-dense-replace and moe-layer-freq must be set together.')
         if args.first_k_dense_replace and args.num_layers <= args.first_k_dense_replace:
             raise AssertionError('Num-layer ({}) must be greater than first-k-dense-replace ({}) when first-k-dense-replace is set.'.format(args.num_layers,
                                                                                                                                              args.first_k_dense_replace))
diff --git a/tests/st/shell_scripts/qwen2_moe_tp1_pp2_ep2_cp2_32k.sh b/tests/st/shell_scripts/qwen2_moe_tp1_pp2_ep2_cp2_32k.sh
index 5fe12858ebe78a62532b8f3c2470b4d41b959fda..d3247c497996ea9511154044616b54db223c6ce1 100644
--- a/tests/st/shell_scripts/qwen2_moe_tp1_pp2_ep2_cp2_32k.sh
+++ b/tests/st/shell_scripts/qwen2_moe_tp1_pp2_ep2_cp2_32k.sh
@@ -41,7 +41,6 @@ MOE_ARGS="
    --n-shared-experts 2 \
    --shared-expert-gate \
    --moe-layer-freq -1 \
-   --first-k-dense-replace -1 \
    --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \
    --moe-intermediate-size 320 \
    --moe-ffn-hidden-size 320 \