From fb0b28731639250d988e75b8b927eb9ee5e4a765 Mon Sep 17 00:00:00 2001
From: guozhihua
Date: Mon, 11 Aug 2025 20:03:09 +0800
Subject: [PATCH] fix qwen2_moe spec

---
 .../qwen2_moe/evaluate_qwen2_57b_a14b_ptd.sh  |  1 -
 .../qwen2_moe/generate_qwen2_57b_a14b_ptd.sh  |  1 -
 .../pretrain_qwen2_57b_a14b_32k_ptd.sh        |  1 -
 .../pretrain_qwen2_57b_a14b_4k_A3_ptd.sh      |  1 -
 .../pretrain_qwen2_57b_a14b_4k_ptd.sh         |  1 -
 .../qwen2_moe/tune_qwen2_57b_a14b_4k_full.sh  |  1 -
 .../tune_qwen2_57b_a14b_4k_full_pack.sh       |  1 -
 .../qwen2_moe/tune_qwen2_57b_a14b_4k_lora.sh  |  1 -
 .../core/transformer/transformer_block.py     |  4 +--
 .../pipeline_parallel/noop_layers.py          |  4 +--
 .../tokenizer/build_tokenizer.py              |  4 +--
 .../fusion_attention_feature.py               | 20 ++++++-------
 .../features_manager/transformer/mtp.py       |  8 ++---
 .../multi_latent_attention/mla_feature.py     | 21 ++++----------
 .../transformer/transformer_block.py          | 29 +++++++++++++++++--
 mindspeed_llm/training/arguments.py           |  2 --
 .../qwen2_moe_tp1_pp2_ep2_cp2_32k.sh          |  1 -
 17 files changed, 49 insertions(+), 52 deletions(-)

diff --git a/examples/mcore/qwen2_moe/evaluate_qwen2_57b_a14b_ptd.sh b/examples/mcore/qwen2_moe/evaluate_qwen2_57b_a14b_ptd.sh
index a74dd0a5bc..ff56482822 100644
--- a/examples/mcore/qwen2_moe/evaluate_qwen2_57b_a14b_ptd.sh
+++ b/examples/mcore/qwen2_moe/evaluate_qwen2_57b_a14b_ptd.sh
@@ -40,7 +40,6 @@ MOE_ARGS="
     --moe-permutation-async-comm \
     --moe-token-dispatcher-type allgather \
     --moe-layer-freq -1 \
-    --first-k-dense-replace -1 \
 "
 
 torchrun $DISTRIBUTED_ARGS evaluation.py \
diff --git a/examples/mcore/qwen2_moe/generate_qwen2_57b_a14b_ptd.sh b/examples/mcore/qwen2_moe/generate_qwen2_57b_a14b_ptd.sh
index b87784a9a2..8371f4eb9d 100644
--- a/examples/mcore/qwen2_moe/generate_qwen2_57b_a14b_ptd.sh
+++ b/examples/mcore/qwen2_moe/generate_qwen2_57b_a14b_ptd.sh
@@ -41,7 +41,6 @@ MOE_ARGS="
    --moe-token-dispatcher-type allgather \
    --moe-aux-loss-coeff 0.001 \
    --moe-layer-freq -1 \
-   --first-k-dense-replace -1 \
 "
 
 torchrun $DISTRIBUTED_ARGS inference.py \
diff --git a/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_32k_ptd.sh b/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_32k_ptd.sh
index 66875e3201..fb94606bc0 100644
--- a/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_32k_ptd.sh
+++ b/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_32k_ptd.sh
@@ -46,7 +46,6 @@ MOE_ARGS="
    --moe-token-dispatcher-type allgather \
    --moe-aux-loss-coeff 0.001 \
    --moe-layer-freq -1 \
-   --first-k-dense-replace -1 \
 "
 
 ROPE_ARGS="
diff --git a/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4k_A3_ptd.sh b/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4k_A3_ptd.sh
index 5deb27966c..88cca356c4 100644
--- a/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4k_A3_ptd.sh
+++ b/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4k_A3_ptd.sh
@@ -49,7 +49,6 @@ MOE_ARGS="
    --moe-permutation-async-comm \
    --moe-alltoall-overlap-comm \
    --moe-layer-freq -1 \
-   --first-k-dense-replace -1 \
 "
 
 OPTIMIZE_ARGS="
diff --git a/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4k_ptd.sh b/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4k_ptd.sh
index 3a6691496d..c1ad862fe1 100644
--- a/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4k_ptd.sh
+++ b/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4k_ptd.sh
@@ -47,7 +47,6 @@ MOE_ARGS="
    --moe-token-dispatcher-type allgather \
    --moe-aux-loss-coeff 0.001 \
    --moe-layer-freq -1 \
-   --first-k-dense-replace -1 \
 "
 
 OPTIMIZE_ARGS="
diff --git a/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_full.sh b/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_full.sh
index 135747bb1a..d1a98cd997 100644
--- a/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_full.sh
+++ b/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_full.sh
@@ -52,7 +52,6 @@ MOE_ARGS="
    --moe-token-dispatcher-type alltoall_seq \
    --moe-aux-loss-coeff 0.001 \
    --moe-layer-freq -1 \
-   --first-k-dense-replace -1 \
 "
 
 OPTIMIZE_ARGS="
diff --git a/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_full_pack.sh b/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_full_pack.sh
index 3bd692271b..691375aaa9 100644
--- a/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_full_pack.sh
+++ b/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_full_pack.sh
@@ -58,7 +58,6 @@ MOE_ARGS="
    --moe-token-dispatcher-type alltoall_seq \
    --moe-aux-loss-coeff 0.001 \
    --moe-layer-freq -1 \
-   --first-k-dense-replace -1 \
 "
 
 OPTIMIZE_ARGS="
diff --git a/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_lora.sh b/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_lora.sh
index b39abfe39d..b7a567be6f 100644
--- a/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_lora.sh
+++ b/examples/mcore/qwen2_moe/tune_qwen2_57b_a14b_4k_lora.sh
@@ -57,7 +57,6 @@ MOE_ARGS="
    --moe-token-dispatcher-type alltoall_seq \
    --moe-aux-loss-coeff 0.001 \
    --moe-layer-freq -1 \
-   --first-k-dense-replace -1 \
 "
 
 OPTIMIZE_ARGS="
diff --git a/mindspeed_llm/core/transformer/transformer_block.py b/mindspeed_llm/core/transformer/transformer_block.py
index 4748d61eb5..e43c68322a 100644
--- a/mindspeed_llm/core/transformer/transformer_block.py
+++ b/mindspeed_llm/core/transformer/transformer_block.py
@@ -107,8 +107,8 @@ def _transformer_block_build_layers(self):
     # For deepseek
     if (
         args.num_experts
-        and args.first_k_dense_replace is not None
-        and args.moe_layer_freq is not None
+        and args.first_k_dense_replace
+        and args.moe_layer_freq
     ):
         if (
diff --git a/mindspeed_llm/features_manager/pipeline_parallel/noop_layers.py b/mindspeed_llm/features_manager/pipeline_parallel/noop_layers.py
index 49e266011a..093365e8a4 100644
--- a/mindspeed_llm/features_manager/pipeline_parallel/noop_layers.py
+++ b/mindspeed_llm/features_manager/pipeline_parallel/noop_layers.py
@@ -20,6 +20,6 @@ class NoopLayersFeature(MSNoopLayersFeature):
         if getattr(args, self.feature_name, None):
             # Use existing patch: megatron.core.transformer.transformer_block.TransformerBlock._build_layers
             patch_manager.register_patch("megatron.training.training.num_floating_point_operations",
-                              mindspeed_calc_flop)
+                                         mindspeed_calc_flop)
             patch_manager.register_patch("megatron.core.transformer.moe.moe_utils.track_moe_metrics",
-                              mindspeed_track_moe_metrics)
+                                         mindspeed_track_moe_metrics)
diff --git a/mindspeed_llm/features_manager/tokenizer/build_tokenizer.py b/mindspeed_llm/features_manager/tokenizer/build_tokenizer.py
index ec7e61daac..bb87bff5f5 100644
--- a/mindspeed_llm/features_manager/tokenizer/build_tokenizer.py
+++ b/mindspeed_llm/features_manager/tokenizer/build_tokenizer.py
@@ -1,8 +1,6 @@
 # Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved.
 
 from pathlib import Path
-from argparse import ArgumentParser
-
 from mindspeed.features_manager.tokenizer.build_tokenizer import BuildTokenizerFeature as MindSpeedBuildTokenizerFeature
 
 TEMPLATES_DIR = str(
@@ -13,7 +11,7 @@ TEMPLATES_DIR = str(
 
 
 class BuildTokenizerFeature(MindSpeedBuildTokenizerFeature):
-    def register_args(self, parser: ArgumentParser):
+    def register_args(self, parser):
         self.add_parser_argument_choices_value(parser, "--tokenizer-type", 'PretrainedFromHF')
 
         group = parser.add_argument_group(title=self.feature_name)
diff --git a/mindspeed_llm/features_manager/transformer/flash_attention/fusion_attention_feature.py b/mindspeed_llm/features_manager/transformer/flash_attention/fusion_attention_feature.py
index 8626f0608d..5716dc1ce8 100644
--- a/mindspeed_llm/features_manager/transformer/flash_attention/fusion_attention_feature.py
+++ b/mindspeed_llm/features_manager/transformer/flash_attention/fusion_attention_feature.py
@@ -1,12 +1,10 @@
 # Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved.
 
-from argparse import ArgumentParser
-
 from mindspeed.features_manager.transformer.flash_attention.fusion_attention_v1_feature import FusionAttentionFeature as MindSpeedFusionAttentionFeature
 
 
 class FusionAttentionFeature(MindSpeedFusionAttentionFeature):
-    def register_args(self, parser: ArgumentParser):
+    def register_args(self, parser):
         group = parser.add_argument_group(title='fusion attention')
         group.add_argument('--shape-order', type=str, default='SBH',
                            choices=['SBH', 'BSH', 'BSND', 'BNSD'],
@@ -22,15 +20,15 @@ class FusionAttentionFeature(MindSpeedFusionAttentionFeature):
         group.add_argument('--interleave-sliding-window', type=int,
                            help='Window size when use interleave sliding window attention.')
 
-    def register_patches(self, pm, args):
+    def register_patches(self, patch_manager, args):
         from mindspeed.core.transformer.attention import attention_init
         from mindspeed_llm.core.transformer.custom_dot_product_attention import CustomDotProductAttention
-
+
         # Attention
         if int(getattr(args, 'context_parallel_size', 1)) < 2:
-            pm.register_patch('megatron.core.transformer.attention.Attention.__init__',
-                              attention_init)
-            pm.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention',
-                              CustomDotProductAttention)
-            pm.register_patch('megatron.core.transformer.custom_layers.transformer_engine.TEDotProductAttention',
-                              CustomDotProductAttention)
+            patch_manager.register_patch('megatron.core.transformer.attention.Attention.__init__',
+                                         attention_init)
+            patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention',
+                                         CustomDotProductAttention)
+            patch_manager.register_patch('megatron.core.transformer.custom_layers.transformer_engine.TEDotProductAttention',
+                                         CustomDotProductAttention)
diff --git a/mindspeed_llm/features_manager/transformer/mtp.py b/mindspeed_llm/features_manager/transformer/mtp.py
index 1fe128faab..3fbc8009c9 100644
--- a/mindspeed_llm/features_manager/transformer/mtp.py
+++ b/mindspeed_llm/features_manager/transformer/mtp.py
@@ -1,4 +1,3 @@
-from argparse import ArgumentParser
 from mindspeed.features_manager.feature import MindSpeedFeature
 
 
@@ -6,7 +5,7 @@ class MultiTokenPredictionFeature(MindSpeedFeature):
     def __init__(self):
         super(MultiTokenPredictionFeature, self).__init__(feature_name="multi-token-prediction", optimization_level=0)
 
-    def register_args(self, parser: ArgumentParser):
+    def register_args(self, parser):
         group = parser.add_argument_group(title=self.feature_name)
 
         group.add_argument('--recompute-mtp-norm', action='store_true', default=False,
@@ -26,9 +25,8 @@ class MultiTokenPredictionFeature(MindSpeedFeature):
         # Use existing patch: megatron.core.models.common.embeddings.language_model_embedding.LanguageModelEmbedding.__init__
         # mtp compatibility
         megatron.core.transformer.multi_token_prediction.LNImpl = PTNorm
-        patch_manager.register_patch(
-            'megatron.core.transformer.multi_token_prediction.MTPLossLoggingHelper.reduce_loss_in_tracker',
-            mtp_reduce_loss_in_tracker)
+        patch_manager.register_patch('megatron.core.transformer.multi_token_prediction.MTPLossLoggingHelper.reduce_loss_in_tracker',
+                                     mtp_reduce_loss_in_tracker)
 
         # mtp memory optimization
         if args.mtp_mem_efficient_logits:
diff --git a/mindspeed_llm/features_manager/transformer/multi_latent_attention/mla_feature.py b/mindspeed_llm/features_manager/transformer/multi_latent_attention/mla_feature.py
index 48a515de4c..3c04d4b0b4 100644
--- a/mindspeed_llm/features_manager/transformer/multi_latent_attention/mla_feature.py
+++ b/mindspeed_llm/features_manager/transformer/multi_latent_attention/mla_feature.py
@@ -1,4 +1,3 @@
-from argparse import ArgumentParser, Namespace
 from mindspeed.features_manager.feature import MindSpeedFeature
 
 
@@ -7,7 +6,7 @@ class MLAFeature(MindSpeedFeature):
     def __init__(self):
         super().__init__('multi-latent-attention', optimization_level=2)
 
-    def register_args(self, parser: ArgumentParser):
+    def register_args(self, parser):
         group = parser.add_argument_group(title='multi latent attention')
 
         group.add_argument('--padded-base-length', type=int, default=128,
@@ -27,25 +26,17 @@
         group.add_argument('--mla-fa-divide-qk', action='store_true', default=False,
                            help='Flash attn support mla with seperate q and k.')
 
-    def validate_args(self, args: Namespace):
+    def validate_args(self, args):
         if args.multi_latent_attention:
             if args.kv_lora_rank is None:
-                raise AssertionError(
-                    'The parameter kv-lora-rank should be '
-                    'set when use multi_head_latent_attention.'
+                raise AssertionError('The parameter kv-lora-rank should be set when use multi_head_latent_attention.'
                 )
             elif args.v_head_dim is None:
-                raise AssertionError(
-                    'The parameter v-head-dim should be '
-                    'set when use multi_head_latent_attention.'
+                raise AssertionError('The parameter v-head-dim should be set when use multi_head_latent_attention.'
                 )
             elif args.qk_pos_emb_head_dim is None:
-                raise AssertionError(
-                    'The parameter qk-pos-emb-head-dim should be '
-                    'set when use multi_head_latent_attention.'
+                raise AssertionError('The parameter qk-pos-emb-head-dim should be set when use multi_head_latent_attention.'
                 )
             elif args.qk_head_dim is None:
-                raise AssertionError(
-                    'The parameter qk-head-dim should be '
-                    'set when use multi_head_latent_attention.'
+                raise AssertionError('The parameter qk-head-dim should be set when use multi_head_latent_attention.'
                 )
diff --git a/mindspeed_llm/features_manager/transformer/transformer_block.py b/mindspeed_llm/features_manager/transformer/transformer_block.py
index c0ad00f578..9a71810b7a 100644
--- a/mindspeed_llm/features_manager/transformer/transformer_block.py
+++ b/mindspeed_llm/features_manager/transformer/transformer_block.py
@@ -1,4 +1,3 @@
-from argparse import ArgumentParser
 from mindspeed.features_manager.feature import MindSpeedFeature
 
 
@@ -6,11 +5,35 @@ class TransformerBlockFeature(MindSpeedFeature):
     def __init__(self):
         super(TransformerBlockFeature, self).__init__(feature_name="transformer-block", optimization_level=0)
 
-    def register_args(self, parser: ArgumentParser):
+    def register_args(self, parser):
         group = parser.add_argument_group(title=self.feature_name)
 
         group.add_argument('--first-k-dense-replace', type=int, default=None,
-                            help='Set first k layer as dense layer')
+                           help='Set first k layer as dense layer')
+    def validate_args(self, args):
+        if args.num_experts is None:
+            if args.first_k_dense_replace is not None or args.moe_layer_freq is not None:
+                raise AssertionError('First-k-dense-replace and moe-layer-freq must be None when not using MoEs')
+        else:
+            if args.first_k_dense_replace and args.num_layers <= args.first_k_dense_replace:
+                raise AssertionError('Num-layer ({}) must be greater than first-k-dense-replace ({}) when first-k-dense-replace is set.'.format(args.num_layers,
+                                                                                                                                                 args.first_k_dense_replace))
+            if args.first_k_dense_replace and args.pipeline_model_parallel_size > 1:
+                if args.first_k_dense_replace >= args.num_layers // args.pipeline_model_parallel_size:
+                    raise AssertionError('When using first-k-dense-replace, it is not allowed for all layers within a pp stage to be dense layers.')
+            if args.num_experts is not None and args.use_ascend_mc2 and args.moe_grouped_gemm:
+                raise AssertionError('Moe Grouped Gemm is not supported with mc2 in MOE model.')
+
+        if args.num_layer_list:
+            if len(args.num_layer_list.split(',')) != args.pipeline_model_parallel_size:
+                raise ValueError("len(args.num_layer_list) != args.pipeline_model_parallel_size")
+            if not args.pipeline_model_parallel_size > 1:
+                raise ValueError("Dynamic pipeline model should work with pipeline parallel.")
+            if args.num_layers_per_virtual_pipeline_stage:
+                raise ValueError("Dynamic pipeline model and virtual pipeline cannot be enabled at the same time.")
+
+        if args.use_ascend_mc2 and args.use_ascend_coc:
+            raise AssertionError('--mc2 and coc can not be used together')
 
     def register_patches(self, patch_manager, args):
         from mindspeed_llm.core.transformer.transformer_block import _transformer_block_build_layers, transformer_block_init_wrapper, transformer_block_forward
diff --git a/mindspeed_llm/training/arguments.py b/mindspeed_llm/training/arguments.py
index ac61212f12..7a9b77a81a 100644
--- a/mindspeed_llm/training/arguments.py
+++ b/mindspeed_llm/training/arguments.py
@@ -1083,8 +1083,6 @@ def _validate_transformer_block_build_layers(args):
         if args.first_k_dense_replace is not None or args.moe_layer_freq is not None:
             raise AssertionError('First-k-dense-replace and moe-layer-freq must be None when not using MoEs')
     else:
-        if (args.first_k_dense_replace is None) != (args.moe_layer_freq is None):
-            raise AssertionError('First-k-dense-replace and moe-layer-freq must be set together.')
         if args.first_k_dense_replace and args.num_layers <= args.first_k_dense_replace:
             raise AssertionError('Num-layer ({}) must be greater than first-k-dense-replace ({}) when first-k-dense-replace is set.'.format(args.num_layers,
                                                                                                                                              args.first_k_dense_replace))
diff --git a/tests/st/shell_scripts/qwen2_moe_tp1_pp2_ep2_cp2_32k.sh b/tests/st/shell_scripts/qwen2_moe_tp1_pp2_ep2_cp2_32k.sh
index 5fe12858eb..d3247c4979 100644
--- a/tests/st/shell_scripts/qwen2_moe_tp1_pp2_ep2_cp2_32k.sh
+++ b/tests/st/shell_scripts/qwen2_moe_tp1_pp2_ep2_cp2_32k.sh
@@ -41,7 +41,6 @@
     --n-shared-experts 2 \
     --shared-expert-gate \
     --moe-layer-freq -1 \
-    --first-k-dense-replace -1 \
     --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \
     --moe-intermediate-size 320 \
     --moe-ffn-hidden-size 320 \
-- 
Gitee
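
Editor's note (not part of the diff): the sketch below restates, as standalone Python, the two constraints on --first-k-dense-replace that the new TransformerBlockFeature.validate_args enforces. The function name and the worked numbers are illustrative assumptions only, not code or defaults from this repository.

# Illustrative sketch only; mirrors the checks added in
# mindspeed_llm/features_manager/transformer/transformer_block.py.
def check_first_k_dense_replace(num_layers, pipeline_model_parallel_size, first_k_dense_replace):
    # The first k layers are dense, so k must be smaller than the total layer count ...
    if num_layers <= first_k_dense_replace:
        raise AssertionError('Num-layer ({}) must be greater than first-k-dense-replace ({}) '
                             'when first-k-dense-replace is set.'.format(num_layers, first_k_dense_replace))
    # ... and, with pipeline parallelism, a whole pp stage must not consist of dense layers only.
    if pipeline_model_parallel_size > 1:
        layers_per_stage = num_layers // pipeline_model_parallel_size
        if first_k_dense_replace >= layers_per_stage:
            raise AssertionError('When using first-k-dense-replace, it is not allowed for all '
                                 'layers within a pp stage to be dense layers.')


# Example: 28 layers over 4 pipeline stages gives 7 layers per stage,
# so first-k-dense-replace must be smaller than 7.
check_first_k_dense_replace(28, 4, first_k_dense_replace=6)      # passes
# check_first_k_dense_replace(28, 4, first_k_dense_replace=7)    # would raise AssertionError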