From 4d7b34b601947bce5b0b9afdca1cfb788a37c408 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Mon, 21 Jul 2025 09:39:30 +0000 Subject: [PATCH 01/16] add mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: flippy航 <654733882@qq.com> --- .../tasks/models/spec/qwen3_mamba_spec.py | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py diff --git a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py new file mode 100644 index 000000000..249231308 --- /dev/null +++ b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py @@ -0,0 +1,105 @@ +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec +from megatron.training import get_args + +from megatron.core.ssm.mamba_block import MambaStack, MambaStackSubmodules +from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules +from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules + +from megatron.core.transformer import ModuleSpec, TransformerLayer, TransformerLayerSubmodules +from mindspeed_llm.core.transformer.custom_layers.transformer_engine import PTNorm + +args = get_args() +num_experts, moe_grouped_gemm, qk_layernorm = args.num_experts, args.moe_grouped_gemm, args.qk_layernorm + +# # Transformer Layer Spec for Gemma using post_mlp_layernorm and post_mlp_layernorm. 
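+#
+# The commented-out ModuleSpec below mirrors the plain (non-hybrid) Qwen3
+# transformer-layer spec and is retained only for reference. The spec the
+# training script actually loads via
+# `--spec mindspeed_llm.tasks.models.spec.qwen3_mamba_spec layer_spec` is the
+# MambaStack spec defined further down, where `--hybrid-override-pattern`
+# selects the layer type per position ("M" = Mamba layer, "*" = attention
+# layer, "-" = MLP layer).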
+# layer_spec = ModuleSpec( +# module=TransformerLayer, +# submodules=TransformerLayerSubmodules( +# input_layernorm=PTNorm, +# self_attention=ModuleSpec( +# module=SelfAttention, +# params={"attn_mask_type": AttnMaskType.causal}, +# submodules=SelfAttentionSubmodules( +# linear_qkv=ColumnParallelLinear, +# core_attention=DotProductAttention, +# linear_proj=RowParallelLinear, +# q_layernorm=PTNorm if qk_layernorm else IdentityOp, +# k_layernorm=PTNorm if qk_layernorm else IdentityOp, +# ), +# ), +# self_attn_bda=get_bias_dropout_add, +# pre_mlp_layernorm=PTNorm, +# mlp=_get_mlp_module_spec( +# use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm +# ), +# mlp_bda=get_bias_dropout_add, +# sharded_state_dict_keys_map={ +# 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', +# 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', +# }, +# ), +# ) + +layer_spec = ModuleSpec( + module=MambaStack, + submodules=MambaStackSubmodules( + mamba_layer=ModuleSpec( + module=MambaLayer, + submodules=MambaLayerSubmodules( + norm=PTNorm, + mixer=ModuleSpec( + module=MambaMixer, + submodules=MambaMixerSubmodules( + in_proj=ColumnParallelLinear, + out_proj=RowParallelLinear, + ), + ), + mamba_bda=get_bias_dropout_add, + ), + ), + attention_layer=ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=PTNorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + ), + ), + self_attn_bda=get_bias_dropout_add, + ), + ), + mlp_layer=ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + pre_mlp_layernorm=PTNorm, + # mlp=ModuleSpec( + # module=MLP, + # submodules=MLPSubmodules( + # linear_fc1=ColumnParallelLinear, + # linear_fc2=RowParallelLinear, + # ), + # ), + mlp=_get_mlp_module_spec( + use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm + ), + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), + ), + ), +) \ No newline at end of file -- Gitee From 0d43db8e4921dcb8ecb2a77749d28c25902eb5d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Mon, 21 Jul 2025 09:50:54 +0000 Subject: [PATCH 02/16] add examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: flippy航 <654733882@qq.com> --- .../pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh | 179 ++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh diff --git a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh new file mode 100644 index 000000000..5cfde0d3c --- /dev/null +++ b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh @@ -0,0 +1,179 @@ +#!/bin/bash + +export HCCL_CONNECT_TIMEOUT=1800 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True +export NPU_ASD_ENABLE=0 + +NPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +# please fill these path configurations +CKPT_SAVE_DIR="your model save ckpt path" +DATA_PATH="your data path" +TOKENIZER_PATH="your tokenizer path" +CKPT_LOAD_DIR="your model ckpt path" + +DATA_PATH="/home/ascend-vllm/dataset/alpaca/Qwen3-30B-A3B-convert-pretrain_text_document" +TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" +CKPT_SAVE_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-mamba" + +TP=1 +PP=1 +EP=2 +CP=1 + +MBS=1 +GBS=256 +SEQ_LENGTH=1024 +TRAIN_ITERS=2000 +CP_TYPE='ulysses_cp_algo' +ROUTER_BALANCING_TYPE='aux_loss' + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +NUM_LAYERS=96 +LAYER_PATTEN="*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" +MAMBA_ARGS=" + --reuse-fp32-param \ + --no-shared-storage \ + --use-distributed-optimizer \ + --use-flash-attn \ + --use-mcore-models \ + --num-layers ${NUM_LAYERS} \ + --mamba-ngroups 8 \ + --mamba-chunk-size 128 \ + --mamba-d-state 128 \ + --mamba-d-conv 4 \ + --mamba-expand 2 \ + --mamba-headdim 64 \ + --tokenizer-model ${TOKENIZER_PATH} \ + --hybrid-attention-ratio 0.26 \ + --hybrid-mlp-ratio 0.5 \ + --hybrid-override-pattern $LAYER_PATTEN \ + --untie-embeddings-and-output-weights \ + --overlap-param-gather \ + --overlap-grad-reduce \ + --norm-epsilon 1e-6 \ +" + +MOE_ARGS=" + --num-experts 128 \ + --moe-router-topk 8 \ + --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \ + --moe-intermediate-size 768 \ + --moe-grouped-gemm \ + --use-fused-moe-token-permute-and-unpermute \ + --moe-permutation-async-comm \ + --moe-token-dispatcher-type alltoall \ + --moe-aux-loss-coeff 0.001 \ +" + +OPTIMIZE_ARGS=" + --use-flash-attn \ + --use-fused-rotary-pos-emb \ + --sequence-parallel \ + --use-rotary-position-embeddings \ + --use-fused-swiglu \ + --use-fused-rmsnorm \ + --no-masked-softmax-fusion \ + --use-distributed-optimizer + --gemm-gradient-accumulation-fusion \ + --recompute-method uniform \ + --recompute-granularity full \ + --recompute-num-layers 1 \ +" + +TRAIN_ARGS=" + --micro-batch-size ${MBS} \ + --global-batch-size ${GBS} \ + --lr 1.25e-6 \ + --lr-decay-style cosine \ + --min-lr 1.25e-7 \ + --weight-decay 1e-1 \ + --lr-warmup-fraction 0.01 \ + --attention-dropout 0.0 \ + --init-method-std 0.01 \ + --hidden-dropout 0.0 \ + --clip-grad 1.0 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --initial-loss-scale 4096 \ + --seed 42 \ + --bf16 \ + --train-iters ${TRAIN_ITERS} \ + --seq-length ${SEQ_LENGTH} \ + --no-shared-storage +" + +MODEL_PARALLEL_ARGS=" + 
--tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --expert-model-parallel-size ${EP} \ + --context-parallel-size ${CP} \ + --context-parallel-algo ${CP_TYPE} \ +" + +GPT_ARGS=" + --use-mcore-models \ + --spec mindspeed_llm.tasks.models.spec.qwen3_mamba_spec layer_spec \ + --kv-channels 128 \ + --qk-layernorm \ + --norm-topk-prob \ + --tokenizer-name-or-path ${TOKENIZER_PATH} \ + --max-position-embeddings ${SEQ_LENGTH} \ + --num-layers 48 \ + --hidden-size 2048 \ + --ffn-hidden-size 6144 \ + --num-attention-heads 32 \ + --tokenizer-type PretrainedFromHF \ + --make-vocab-size-divisible-by 1 \ + --padded-vocab-size 151936 \ + --rotary-base 1000000 \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --position-embedding-type rope \ + --normalization RMSNorm \ + --swiglu \ + --attention-softmax-in-fp32 \ + --no-gradient-accumulation-fusion \ + --group-query-attention \ + --num-query-groups 4 +" + +DATA_ARGS=" + --data-path $DATA_PATH \ + --split 100,0,0 +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval ${TRAIN_ITERS} \ + --eval-interval ${TRAIN_ITERS} \ + --eval-iters 0 \ + --no-load-optim \ + --no-load-rng +" + +torchrun $DISTRIBUTED_ARGS pretrain_mamba.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $MOE_ARGS \ + $OUTPUT_ARGS \ + $OPTIMIZE_ARGS \ + $TRAIN_ARGS \ + $MODEL_PARALLEL_ARGS \ + $MAMBA_ARGS \ + --distributed-backend nccl \ + --save ${CKPT_SAVE_DIR} \ + | tee logs/train_mcore_qwen3_30b_a3b.log \ No newline at end of file -- Gitee From fb2057cce2cc5df6a16d54eb6dfd0982a520f1be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Fri, 18 Jul 2025 15:05:23 +0800 Subject: [PATCH 03/16] support Qwen3-MoE-mamba --- .../pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh | 358 +++++++++--------- .../tasks/models/spec/qwen3_mamba_spec.py | 208 +++++----- 2 files changed, 283 insertions(+), 283 deletions(-) diff --git a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh index 5cfde0d3c..66b12c0e5 100644 --- a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh +++ b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh @@ -1,179 +1,179 @@ -#!/bin/bash - -export HCCL_CONNECT_TIMEOUT=1800 -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True -export NPU_ASD_ENABLE=0 - -NPUS_PER_NODE=8 -MASTER_ADDR=localhost -MASTER_PORT=6000 -NNODES=1 -NODE_RANK=0 -WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) - -# please fill these path configurations -CKPT_SAVE_DIR="your model save ckpt path" -DATA_PATH="your data path" -TOKENIZER_PATH="your tokenizer path" -CKPT_LOAD_DIR="your model ckpt path" - -DATA_PATH="/home/ascend-vllm/dataset/alpaca/Qwen3-30B-A3B-convert-pretrain_text_document" -TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" -CKPT_SAVE_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-mamba" - -TP=1 -PP=1 -EP=2 -CP=1 - -MBS=1 -GBS=256 -SEQ_LENGTH=1024 -TRAIN_ITERS=2000 -CP_TYPE='ulysses_cp_algo' -ROUTER_BALANCING_TYPE='aux_loss' - -DISTRIBUTED_ARGS=" - --nproc_per_node $NPUS_PER_NODE \ - --nnodes $NNODES \ - --node_rank $NODE_RANK \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT -" - -NUM_LAYERS=96 -LAYER_PATTEN="*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" -MAMBA_ARGS=" - --reuse-fp32-param \ - --no-shared-storage \ - --use-distributed-optimizer \ - --use-flash-attn \ - --use-mcore-models \ - --num-layers ${NUM_LAYERS} \ - --mamba-ngroups 8 
\ - --mamba-chunk-size 128 \ - --mamba-d-state 128 \ - --mamba-d-conv 4 \ - --mamba-expand 2 \ - --mamba-headdim 64 \ - --tokenizer-model ${TOKENIZER_PATH} \ - --hybrid-attention-ratio 0.26 \ - --hybrid-mlp-ratio 0.5 \ - --hybrid-override-pattern $LAYER_PATTEN \ - --untie-embeddings-and-output-weights \ - --overlap-param-gather \ - --overlap-grad-reduce \ - --norm-epsilon 1e-6 \ -" - -MOE_ARGS=" - --num-experts 128 \ - --moe-router-topk 8 \ - --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \ - --moe-intermediate-size 768 \ - --moe-grouped-gemm \ - --use-fused-moe-token-permute-and-unpermute \ - --moe-permutation-async-comm \ - --moe-token-dispatcher-type alltoall \ - --moe-aux-loss-coeff 0.001 \ -" - -OPTIMIZE_ARGS=" - --use-flash-attn \ - --use-fused-rotary-pos-emb \ - --sequence-parallel \ - --use-rotary-position-embeddings \ - --use-fused-swiglu \ - --use-fused-rmsnorm \ - --no-masked-softmax-fusion \ - --use-distributed-optimizer - --gemm-gradient-accumulation-fusion \ - --recompute-method uniform \ - --recompute-granularity full \ - --recompute-num-layers 1 \ -" - -TRAIN_ARGS=" - --micro-batch-size ${MBS} \ - --global-batch-size ${GBS} \ - --lr 1.25e-6 \ - --lr-decay-style cosine \ - --min-lr 1.25e-7 \ - --weight-decay 1e-1 \ - --lr-warmup-fraction 0.01 \ - --attention-dropout 0.0 \ - --init-method-std 0.01 \ - --hidden-dropout 0.0 \ - --clip-grad 1.0 \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --initial-loss-scale 4096 \ - --seed 42 \ - --bf16 \ - --train-iters ${TRAIN_ITERS} \ - --seq-length ${SEQ_LENGTH} \ - --no-shared-storage -" - -MODEL_PARALLEL_ARGS=" - --tensor-model-parallel-size ${TP} \ - --pipeline-model-parallel-size ${PP} \ - --expert-model-parallel-size ${EP} \ - --context-parallel-size ${CP} \ - --context-parallel-algo ${CP_TYPE} \ -" - -GPT_ARGS=" - --use-mcore-models \ - --spec mindspeed_llm.tasks.models.spec.qwen3_mamba_spec layer_spec \ - --kv-channels 128 \ - --qk-layernorm \ - --norm-topk-prob \ - --tokenizer-name-or-path ${TOKENIZER_PATH} \ - --max-position-embeddings ${SEQ_LENGTH} \ - --num-layers 48 \ - --hidden-size 2048 \ - --ffn-hidden-size 6144 \ - --num-attention-heads 32 \ - --tokenizer-type PretrainedFromHF \ - --make-vocab-size-divisible-by 1 \ - --padded-vocab-size 151936 \ - --rotary-base 1000000 \ - --untie-embeddings-and-output-weights \ - --disable-bias-linear \ - --position-embedding-type rope \ - --normalization RMSNorm \ - --swiglu \ - --attention-softmax-in-fp32 \ - --no-gradient-accumulation-fusion \ - --group-query-attention \ - --num-query-groups 4 -" - -DATA_ARGS=" - --data-path $DATA_PATH \ - --split 100,0,0 -" - -OUTPUT_ARGS=" - --log-interval 1 \ - --save-interval ${TRAIN_ITERS} \ - --eval-interval ${TRAIN_ITERS} \ - --eval-iters 0 \ - --no-load-optim \ - --no-load-rng -" - -torchrun $DISTRIBUTED_ARGS pretrain_mamba.py \ - $GPT_ARGS \ - $DATA_ARGS \ - $MOE_ARGS \ - $OUTPUT_ARGS \ - $OPTIMIZE_ARGS \ - $TRAIN_ARGS \ - $MODEL_PARALLEL_ARGS \ - $MAMBA_ARGS \ - --distributed-backend nccl \ - --save ${CKPT_SAVE_DIR} \ - | tee logs/train_mcore_qwen3_30b_a3b.log \ No newline at end of file +#!/bin/bash + +export HCCL_CONNECT_TIMEOUT=1800 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True +export NPU_ASD_ENABLE=0 + +NPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +# please fill these path configurations +CKPT_SAVE_DIR="your model save ckpt path" +DATA_PATH="your data path" +TOKENIZER_PATH="your tokenizer path" 
+CKPT_LOAD_DIR="your model ckpt path" + +DATA_PATH="/home/ascend-vllm/dataset/alpaca/Qwen3-30B-A3B-convert-pretrain_text_document" +TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" +CKPT_SAVE_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-mamba" + +TP=1 +PP=1 +EP=2 +CP=1 + +MBS=1 +GBS=256 +SEQ_LENGTH=1024 +TRAIN_ITERS=2000 +CP_TYPE='ulysses_cp_algo' +ROUTER_BALANCING_TYPE='aux_loss' + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +NUM_LAYERS=96 +LAYER_PATTEN="*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" +MAMBA_ARGS=" + --reuse-fp32-param \ + --no-shared-storage \ + --use-distributed-optimizer \ + --use-flash-attn \ + --use-mcore-models \ + --num-layers ${NUM_LAYERS} \ + --mamba-ngroups 8 \ + --mamba-chunk-size 128 \ + --mamba-d-state 128 \ + --mamba-d-conv 4 \ + --mamba-expand 2 \ + --mamba-headdim 64 \ + --tokenizer-model ${TOKENIZER_PATH} \ + --hybrid-attention-ratio 0.26 \ + --hybrid-mlp-ratio 0.5 \ + --hybrid-override-pattern $LAYER_PATTEN \ + --untie-embeddings-and-output-weights \ + --overlap-param-gather \ + --overlap-grad-reduce \ + --norm-epsilon 1e-6 \ +" + +MOE_ARGS=" + --num-experts 128 \ + --moe-router-topk 8 \ + --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \ + --moe-intermediate-size 768 \ + --moe-grouped-gemm \ + --use-fused-moe-token-permute-and-unpermute \ + --moe-permutation-async-comm \ + --moe-token-dispatcher-type alltoall \ + --moe-aux-loss-coeff 0.001 \ +" + +OPTIMIZE_ARGS=" + --use-flash-attn \ + --use-fused-rotary-pos-emb \ + --sequence-parallel \ + --use-rotary-position-embeddings \ + --use-fused-swiglu \ + --use-fused-rmsnorm \ + --no-masked-softmax-fusion \ + --use-distributed-optimizer + --gemm-gradient-accumulation-fusion \ + --recompute-method uniform \ + --recompute-granularity full \ + --recompute-num-layers 1 \ +" + +TRAIN_ARGS=" + --micro-batch-size ${MBS} \ + --global-batch-size ${GBS} \ + --lr 1.25e-6 \ + --lr-decay-style cosine \ + --min-lr 1.25e-7 \ + --weight-decay 1e-1 \ + --lr-warmup-fraction 0.01 \ + --attention-dropout 0.0 \ + --init-method-std 0.01 \ + --hidden-dropout 0.0 \ + --clip-grad 1.0 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --initial-loss-scale 4096 \ + --seed 42 \ + --bf16 \ + --train-iters ${TRAIN_ITERS} \ + --seq-length ${SEQ_LENGTH} \ + --no-shared-storage +" + +MODEL_PARALLEL_ARGS=" + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --expert-model-parallel-size ${EP} \ + --context-parallel-size ${CP} \ + --context-parallel-algo ${CP_TYPE} \ +" + +GPT_ARGS=" + --use-mcore-models \ + --spec mindspeed_llm.tasks.models.spec.qwen3_mamba_spec layer_spec \ + --kv-channels 128 \ + --qk-layernorm \ + --norm-topk-prob \ + --tokenizer-name-or-path ${TOKENIZER_PATH} \ + --max-position-embeddings ${SEQ_LENGTH} \ + --num-layers 48 \ + --hidden-size 2048 \ + --ffn-hidden-size 6144 \ + --num-attention-heads 32 \ + --tokenizer-type PretrainedFromHF \ + --make-vocab-size-divisible-by 1 \ + --padded-vocab-size 151936 \ + --rotary-base 1000000 \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --position-embedding-type rope \ + --normalization RMSNorm \ + --swiglu \ + --attention-softmax-in-fp32 \ + --no-gradient-accumulation-fusion \ + --group-query-attention \ + --num-query-groups 4 +" + +DATA_ARGS=" + --data-path $DATA_PATH \ + --split 100,0,0 +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 
${TRAIN_ITERS} \ + --eval-interval ${TRAIN_ITERS} \ + --eval-iters 0 \ + --no-load-optim \ + --no-load-rng +" + +torchrun $DISTRIBUTED_ARGS pretrain_mamba.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $MOE_ARGS \ + $OUTPUT_ARGS \ + $OPTIMIZE_ARGS \ + $TRAIN_ARGS \ + $MODEL_PARALLEL_ARGS \ + $MAMBA_ARGS \ + --distributed-backend nccl \ + --save ${CKPT_SAVE_DIR} \ + | tee logs/train_mcore_qwen3_30b_a3b.log diff --git a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py index 249231308..577b79873 100644 --- a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py +++ b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py @@ -1,105 +1,105 @@ -from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear -from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules -from megatron.core.transformer.dot_product_attention import DotProductAttention -from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.identity_op import IdentityOp -from megatron.core.transformer.spec_utils import ModuleSpec -from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec -from megatron.training import get_args - -from megatron.core.ssm.mamba_block import MambaStack, MambaStackSubmodules -from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules -from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules - -from megatron.core.transformer import ModuleSpec, TransformerLayer, TransformerLayerSubmodules -from mindspeed_llm.core.transformer.custom_layers.transformer_engine import PTNorm - -args = get_args() -num_experts, moe_grouped_gemm, qk_layernorm = args.num_experts, args.moe_grouped_gemm, args.qk_layernorm - -# # Transformer Layer Spec for Gemma using post_mlp_layernorm and post_mlp_layernorm. 
-# layer_spec = ModuleSpec( -# module=TransformerLayer, -# submodules=TransformerLayerSubmodules( -# input_layernorm=PTNorm, -# self_attention=ModuleSpec( -# module=SelfAttention, -# params={"attn_mask_type": AttnMaskType.causal}, -# submodules=SelfAttentionSubmodules( -# linear_qkv=ColumnParallelLinear, -# core_attention=DotProductAttention, -# linear_proj=RowParallelLinear, -# q_layernorm=PTNorm if qk_layernorm else IdentityOp, -# k_layernorm=PTNorm if qk_layernorm else IdentityOp, -# ), -# ), -# self_attn_bda=get_bias_dropout_add, -# pre_mlp_layernorm=PTNorm, -# mlp=_get_mlp_module_spec( -# use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm -# ), -# mlp_bda=get_bias_dropout_add, -# sharded_state_dict_keys_map={ -# 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', -# 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', -# }, -# ), -# ) - -layer_spec = ModuleSpec( - module=MambaStack, - submodules=MambaStackSubmodules( - mamba_layer=ModuleSpec( - module=MambaLayer, - submodules=MambaLayerSubmodules( - norm=PTNorm, - mixer=ModuleSpec( - module=MambaMixer, - submodules=MambaMixerSubmodules( - in_proj=ColumnParallelLinear, - out_proj=RowParallelLinear, - ), - ), - mamba_bda=get_bias_dropout_add, - ), - ), - attention_layer=ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - input_layernorm=PTNorm, - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=ColumnParallelLinear, - core_attention=DotProductAttention, - linear_proj=RowParallelLinear, - ), - ), - self_attn_bda=get_bias_dropout_add, - ), - ), - mlp_layer=ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - pre_mlp_layernorm=PTNorm, - # mlp=ModuleSpec( - # module=MLP, - # submodules=MLPSubmodules( - # linear_fc1=ColumnParallelLinear, - # linear_fc2=RowParallelLinear, - # ), - # ), - mlp=_get_mlp_module_spec( - use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm - ), - mlp_bda=get_bias_dropout_add, - sharded_state_dict_keys_map={ - 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', - 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', - }, - ), - ), - ), +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec +from megatron.training import get_args + +from megatron.core.ssm.mamba_block import MambaStack, MambaStackSubmodules +from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules +from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules + +from megatron.core.transformer import ModuleSpec, TransformerLayer, TransformerLayerSubmodules +from mindspeed_llm.core.transformer.custom_layers.transformer_engine import PTNorm + +args = get_args() +num_experts, moe_grouped_gemm, qk_layernorm = args.num_experts, args.moe_grouped_gemm, args.qk_layernorm + +# # Transformer Layer Spec for Gemma using post_mlp_layernorm and post_mlp_layernorm. 
+# layer_spec = ModuleSpec( +# module=TransformerLayer, +# submodules=TransformerLayerSubmodules( +# input_layernorm=PTNorm, +# self_attention=ModuleSpec( +# module=SelfAttention, +# params={"attn_mask_type": AttnMaskType.causal}, +# submodules=SelfAttentionSubmodules( +# linear_qkv=ColumnParallelLinear, +# core_attention=DotProductAttention, +# linear_proj=RowParallelLinear, +# q_layernorm=PTNorm if qk_layernorm else IdentityOp, +# k_layernorm=PTNorm if qk_layernorm else IdentityOp, +# ), +# ), +# self_attn_bda=get_bias_dropout_add, +# pre_mlp_layernorm=PTNorm, +# mlp=_get_mlp_module_spec( +# use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm +# ), +# mlp_bda=get_bias_dropout_add, +# sharded_state_dict_keys_map={ +# 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', +# 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', +# }, +# ), +# ) + +layer_spec = ModuleSpec( + module=MambaStack, + submodules=MambaStackSubmodules( + mamba_layer=ModuleSpec( + module=MambaLayer, + submodules=MambaLayerSubmodules( + norm=PTNorm, + mixer=ModuleSpec( + module=MambaMixer, + submodules=MambaMixerSubmodules( + in_proj=ColumnParallelLinear, + out_proj=RowParallelLinear, + ), + ), + mamba_bda=get_bias_dropout_add, + ), + ), + attention_layer=ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=PTNorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + ), + ), + self_attn_bda=get_bias_dropout_add, + ), + ), + mlp_layer=ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + pre_mlp_layernorm=PTNorm, + # mlp=ModuleSpec( + # module=MLP, + # submodules=MLPSubmodules( + # linear_fc1=ColumnParallelLinear, + # linear_fc2=RowParallelLinear, + # ), + # ), + mlp=_get_mlp_module_spec( + use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm + ), + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), + ), + ), ) \ No newline at end of file -- Gitee From e25466431332a6b628400dfa3ae69c674aaf5fdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Mon, 21 Jul 2025 10:48:12 +0800 Subject: [PATCH 04/16] update mamba_spec with qk_layernorm --- mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py index 577b79873..0c463a42b 100644 --- a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py +++ b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py @@ -75,6 +75,8 @@ layer_spec = ModuleSpec( linear_qkv=ColumnParallelLinear, core_attention=DotProductAttention, linear_proj=RowParallelLinear, + q_layernorm=PTNorm if qk_layernorm else IdentityOp, + k_layernorm=PTNorm if qk_layernorm else IdentityOp, ), ), self_attn_bda=get_bias_dropout_add, -- Gitee From 40ae8dd78815ed5442270e0060d2d16014c6d1b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Mon, 21 Jul 2025 18:08:59 +0800 Subject: [PATCH 05/16] add mamba convert hf2mg --- configs/checkpoint/model_cfg.json | 24 ++++- mindspeed_llm/tasks/checkpoint/models.py | 121 +++++++++++++++++++++-- 2 files changed, 138 insertions(+), 
7 deletions(-) diff --git a/configs/checkpoint/model_cfg.json b/configs/checkpoint/model_cfg.json index 458d639c0..f901e51a3 100644 --- a/configs/checkpoint/model_cfg.json +++ b/configs/checkpoint/model_cfg.json @@ -125,7 +125,29 @@ "layers_mlp_experts_up_proj": "model.layers[layer_idx].mlp.experts[expert_idx].up_proj", "layers_mlp_experts_linear_fc2": "model.layers[layer_idx].mlp.experts[expert_idx].down_proj", "final_layernorm": "model.norm", - "output_layer": "lm_head" + "output_layer": "lm_head", + + "mamba_mixer_A_log": "model.layers[layer_idx].mamba.A_log", + "mamba_mixer_D": "model.layers[layer_idx].mamba.D", + "mamba_mixer_conv1d": "model.layers[layer_idx].mamba.conv1d", + "mamba_mixer_dt_bias": "model.layers[layer_idx].mamba.dt_bias", + "mamba_mixer_in_proj": "model.layers[layer_idx].mamba.in_proj", + "mamba_mixer_norm": "model.layers[layer_idx].mamba.norm", + "mamba_mixer_out_proj": "model.layers[layer_idx].mamba.out_proj" + } + }, + "qwen3-moe-mamba": { + "__base__": "qwen3-moe", + "config_set_value":{}, + "config_hf_key_mapping":{}, + "model_hf_key_mapping":{ + "mamba_mixer_A_log": "model.layers[layer_idx].mamba.A_log", + "mamba_mixer_D": "model.layers[layer_idx].mamba.D", + "mamba_mixer_conv1d": "model.layers[layer_idx].mamba.conv1d", + "mamba_mixer_dt_bias": "model.layers[layer_idx].mamba.dt_bias", + "mamba_mixer_in_proj": "model.layers[layer_idx].mamba.in_proj", + "mamba_mixer_norm": "model.layers[layer_idx].mamba.norm", + "mamba_mixer_out_proj": "model.layers[layer_idx].mamba.out_proj" } }, "qwen3": { diff --git a/mindspeed_llm/tasks/checkpoint/models.py b/mindspeed_llm/tasks/checkpoint/models.py index ead74608a..b42eded98 100644 --- a/mindspeed_llm/tasks/checkpoint/models.py +++ b/mindspeed_llm/tasks/checkpoint/models.py @@ -23,6 +23,8 @@ from megatron.core import tensor_parallel from mindspeed_llm.training.utils import parse_args from mindspeed_llm.training import model_provider_func_wrapper from mindspeed_llm.training.checkpointing import load_checkpoint_wrapper +from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols as LayerSymbols + logger.basicConfig(format="") logger.getLogger().setLevel(logger.INFO) @@ -191,7 +193,7 @@ class ModelBase(abc.ABC): logger.info(f"Source output layer weight size: {output_layer_weight.size()} " f"Target output layer weight size: {self.get_output_layer_weight().size()}") output_layer_weight = output_layer_weight[:self.get_output_layer_weight().size(0), :] - self.set_output_layer_weight(data=output_layer_weight) + self.set_output_layer_weight(data=output_layer_weight) # true if self.has_final_layernorm_bias(): final_layernorm_bias = src_model.get_final_layernorm_bias() self.set_final_layernorm_bias(data=final_layernorm_bias) @@ -214,7 +216,9 @@ class ModelBase(abc.ABC): """ For source layer index == destination layer index. 
""" - self.set_layer_state_base(src_model, layer_idx, layer_idx) + # self.set_layer_state_base(src_model, layer_idx, layer_idx) + self.set_qwen_mamba_layer_state_base(src_model, layer_idx, layer_idx) + @staticmethod def is_noop_layer(src_layer_idx): @@ -227,17 +231,29 @@ class ModelBase(abc.ABC): kwargs = {'src_layer_idx': src_layer_idx, "dst_layer_idx": dst_layer_idx} self.set_attn_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) self.set_mlp_state(src_model, **kwargs) + + if self.args_cmd.model_type_hf == "qwen3-moe-mamba": + layer_type = self.args.layer_pattern[self.args.num_layers] + if layer_type == LayerSymbols.MAMBA: + pass + elif layer_type == LayerSymbols.ATTENTION: + # remove + self.set_attn_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) + elif layer_type == LayerSymbols.MLP: + # add pre_mlp_layernorm + self.set_mlp_state(src_model, **kwargs) + if self.args_cmd.save_lora_to_hf: return input_layernorm_weight = src_model.get_layers_input_layernorm_weight(layer_idx=src_layer_idx) self.set_layers_input_layernorm_weight(layer_idx=dst_layer_idx, data=input_layernorm_weight) - if self.args.post_norm: + if self.args.post_norm: # false post_attn_layernorm_weight = src_model.get_layers_self_attention_post_attention_layernorm_weight( layer_idx=src_layer_idx) self.set_layers_self_attention_post_attention_layernorm_weight(layer_idx=dst_layer_idx, data=post_attn_layernorm_weight) - else: + else: # true pre_mlp_layernorm_weight = src_model.get_layers_self_attention_pre_mlp_layernorm_weight( layer_idx=src_layer_idx) self.set_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=dst_layer_idx, @@ -251,6 +267,91 @@ class ModelBase(abc.ABC): pre_mlp_layernorm_bias = src_model.get_layers_self_attention_pre_mlp_layernorm_bias(layer_idx=src_layer_idx) self.set_layers_self_attention_pre_mlp_layernorm_bias(layer_idx=dst_layer_idx, data=pre_mlp_layernorm_bias) + def set_qwen_mamba_layer_state_base(self, src_model, src_layer_idx, dst_layer_idx): + # assert self.args_cmd.model_type_hf == "qwen3-moe-mamba" + self.args.hybrid_override_pattern = "*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" # TODO + dst_layer_idx = src_layer_idx * 2 + layer_type = self.args.hybrid_override_pattern[dst_layer_idx] + + if layer_type == LayerSymbols.MAMBA: + self.set_mamba_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) + + elif layer_type == LayerSymbols.ATTENTION: + # inpyt_layernorm + input_layernorm_weight = src_model.get_layers_input_layernorm_weight(layer_idx=src_layer_idx) + self.set_layers_input_layernorm_weight(layer_idx=src_layer_idx*2, data=input_layernorm_weight) + + # self_attention + self.set_attn_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) + + dst_layer_idx = dst_layer_idx + 1 + assert self.args.hybrid_override_pattern[dst_layer_idx] == LayerSymbols.MLP + if self.args.hybrid_override_pattern[dst_layer_idx] == LayerSymbols.MLP: + # pre_mlp_layernorm / post_attention_layernorm + pre_mlp_layernorm_weight = src_model.get_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=src_layer_idx) + self.set_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=dst_layer_idx, + data=pre_mlp_layernorm_weight) + # mlp + kwargs = {'src_layer_idx': src_layer_idx, "dst_layer_idx": dst_layer_idx} + self.set_mlp_state(src_model, **kwargs) + + # TODO + def set_mamba_state(self, src_layer_idx, dst_layer_idx, src_model): + + # 
mg + + # mixer + # in_proj + # conv1d + # norm + # out_proj + # norm + + + # hf (model.layers.1.input_layernorm.weight) + # + + # mamba_in_proj.weight + + # mamba.A_log + # mamba.D + + # mamba.conv1d.weight + # mamba.conv1d.bias + # mamba.dt_bias + + # mamba.norm.weight + # mamba.out_proj.weight + + + # model.layers.1.mamba.in_proj.weight + in_proj_weight = src_model.get_mamba_mixer_in_proj_weight(layer_idx=src_layer_idx) + # self.set_mamba_mixer_in_proj_weight(layer_idx=dst_layer_idx, data=in_proj_weight) # TODO [4384, 2048] (mg) [9248, 2048] (hf) + + # A_log, D, dt_bias (without weight, just nn.parameter) + A_log = src_model.get_mamba_mixer_A_log(layer_idx=src_layer_idx) + D = src_model.get_mamba_mixer_D(layer_idx=src_layer_idx) + dt_bias = src_model.get_mamba_mixer_dt_bias(layer_idx=src_layer_idx) + + # conv1d + conv1d_weight = src_model.get_mamba_mixer_conv1d_weight(layer_idx=src_layer_idx) + conv1d_bias = src_model.get_mamba_mixer_conv1d_bias(layer_idx=src_layer_idx) + self.set_mamba_mixer_conv1d_weight(layer_idx=dst_layer_idx, data=conv1d_weight) + self.set_mamba_mixer_conv1d_bias(layer_idx=dst_layer_idx, data=conv1d_bias) + + # model.layers.1.mamba.norm.weight + mixer_norm = src_model.get_mamba_mixer_norm_weight(layer_idx=src_layer_idx) + self.set_mamba_mixer_norm_weight(layer_idx=dst_layer_idx, data=mixer_norm) + + # model.layers.1.mamba.out_proj.weight + out_proj_weight = src_model.get_mamba_mixer_out_proj_weight(layer_idx=src_layer_idx) + self.set_mamba_mixer_out_proj_weight(layer_idx=dst_layer_idx, data=out_proj_weight) + + # mamba norm (not found in hf) # TODO + + pass + + def set_attn_state(self, src_layer_idx, dst_layer_idx, src_model): """Set self-attention params.""" if self.args.save_lora_to_hf: @@ -270,7 +371,7 @@ class ModelBase(abc.ABC): self.set_layers_self_attention_q_layernorm_weight(layer_idx=dst_layer_idx, data=q_layernorm) else: q_layernorm = src_model.get_layers_self_attention_q_layernorm_weight(layer_idx=src_layer_idx) - self.set_layers_self_attention_q_layernorm_weight(layer_idx=dst_layer_idx, data=q_layernorm) + self.set_layers_self_attention_q_layernorm_weight(layer_idx=dst_layer_idx, data=q_layernorm) # error k_layernorm = src_model.get_layers_self_attention_k_layernorm_weight(layer_idx=src_layer_idx) self.set_layers_self_attention_k_layernorm_weight(layer_idx=dst_layer_idx, data=k_layernorm) @@ -1401,7 +1502,8 @@ class MegatronMCoreModel(MegatronModel): "layers_mlp_linear_fc1": module_layer + "mlp.linear_fc1", "layers_mlp_linear_fc2": module_layer + "mlp.linear_fc2", "layers_self_attention_post_mlp_layernorm": module_layer + "post_mlp_layernorm", - "final_layernorm": "decoder.final_layernorm", + # "final_layernorm": "decoder.final_layernorm", + "final_layernorm": "decoder.final_norm", "output_layer": "output_layer", "rm_head": "rm_head" } @@ -1418,6 +1520,13 @@ class MegatronMCoreModel(MegatronModel): self.module_mapping["layers_self_attention_linear_qb"] = module_layer + "self_attention.linear_qb" self.module_mapping["layers_self_attention_linear_kvb"] = module_layer + "self_attention.linear_kvb" + # Mamba + self.module_mapping["mamba_mixer_A_log"] = module_layer + "A_log" + self.module_mapping["mamba_mixer_in_proj"] = module_layer + "mixer.in_proj" + self.module_mapping["mamba_mixer_out_proj"] = module_layer + "mixer.out_proj" + self.module_mapping["mamba_mixer_norm"] = module_layer + "mixer.norm" + + # shared experts self.module_mapping[ "layers_mlp_shared_experts_linear_fc1"] = module_layer + "mlp.shared_experts.linear_fc1" -- Gitee From 
89b185f511cdf2446fd0a0af00b6c33bcb810c9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Wed, 23 Jul 2025 17:38:59 +0800 Subject: [PATCH 06/16] Revert "add mamba convert hf2mg" This reverts commit 6b7e1e435ec72e2d488c45e78277311eb1359e82. --- configs/checkpoint/model_cfg.json | 24 +---- mindspeed_llm/tasks/checkpoint/models.py | 121 ++--------------------- 2 files changed, 7 insertions(+), 138 deletions(-) diff --git a/configs/checkpoint/model_cfg.json b/configs/checkpoint/model_cfg.json index f901e51a3..458d639c0 100644 --- a/configs/checkpoint/model_cfg.json +++ b/configs/checkpoint/model_cfg.json @@ -125,29 +125,7 @@ "layers_mlp_experts_up_proj": "model.layers[layer_idx].mlp.experts[expert_idx].up_proj", "layers_mlp_experts_linear_fc2": "model.layers[layer_idx].mlp.experts[expert_idx].down_proj", "final_layernorm": "model.norm", - "output_layer": "lm_head", - - "mamba_mixer_A_log": "model.layers[layer_idx].mamba.A_log", - "mamba_mixer_D": "model.layers[layer_idx].mamba.D", - "mamba_mixer_conv1d": "model.layers[layer_idx].mamba.conv1d", - "mamba_mixer_dt_bias": "model.layers[layer_idx].mamba.dt_bias", - "mamba_mixer_in_proj": "model.layers[layer_idx].mamba.in_proj", - "mamba_mixer_norm": "model.layers[layer_idx].mamba.norm", - "mamba_mixer_out_proj": "model.layers[layer_idx].mamba.out_proj" - } - }, - "qwen3-moe-mamba": { - "__base__": "qwen3-moe", - "config_set_value":{}, - "config_hf_key_mapping":{}, - "model_hf_key_mapping":{ - "mamba_mixer_A_log": "model.layers[layer_idx].mamba.A_log", - "mamba_mixer_D": "model.layers[layer_idx].mamba.D", - "mamba_mixer_conv1d": "model.layers[layer_idx].mamba.conv1d", - "mamba_mixer_dt_bias": "model.layers[layer_idx].mamba.dt_bias", - "mamba_mixer_in_proj": "model.layers[layer_idx].mamba.in_proj", - "mamba_mixer_norm": "model.layers[layer_idx].mamba.norm", - "mamba_mixer_out_proj": "model.layers[layer_idx].mamba.out_proj" + "output_layer": "lm_head" } }, "qwen3": { diff --git a/mindspeed_llm/tasks/checkpoint/models.py b/mindspeed_llm/tasks/checkpoint/models.py index b42eded98..ead74608a 100644 --- a/mindspeed_llm/tasks/checkpoint/models.py +++ b/mindspeed_llm/tasks/checkpoint/models.py @@ -23,8 +23,6 @@ from megatron.core import tensor_parallel from mindspeed_llm.training.utils import parse_args from mindspeed_llm.training import model_provider_func_wrapper from mindspeed_llm.training.checkpointing import load_checkpoint_wrapper -from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols as LayerSymbols - logger.basicConfig(format="") logger.getLogger().setLevel(logger.INFO) @@ -193,7 +191,7 @@ class ModelBase(abc.ABC): logger.info(f"Source output layer weight size: {output_layer_weight.size()} " f"Target output layer weight size: {self.get_output_layer_weight().size()}") output_layer_weight = output_layer_weight[:self.get_output_layer_weight().size(0), :] - self.set_output_layer_weight(data=output_layer_weight) # true + self.set_output_layer_weight(data=output_layer_weight) if self.has_final_layernorm_bias(): final_layernorm_bias = src_model.get_final_layernorm_bias() self.set_final_layernorm_bias(data=final_layernorm_bias) @@ -216,9 +214,7 @@ class ModelBase(abc.ABC): """ For source layer index == destination layer index. 
""" - # self.set_layer_state_base(src_model, layer_idx, layer_idx) - self.set_qwen_mamba_layer_state_base(src_model, layer_idx, layer_idx) - + self.set_layer_state_base(src_model, layer_idx, layer_idx) @staticmethod def is_noop_layer(src_layer_idx): @@ -231,29 +227,17 @@ class ModelBase(abc.ABC): kwargs = {'src_layer_idx': src_layer_idx, "dst_layer_idx": dst_layer_idx} self.set_attn_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) self.set_mlp_state(src_model, **kwargs) - - if self.args_cmd.model_type_hf == "qwen3-moe-mamba": - layer_type = self.args.layer_pattern[self.args.num_layers] - if layer_type == LayerSymbols.MAMBA: - pass - elif layer_type == LayerSymbols.ATTENTION: - # remove - self.set_attn_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) - elif layer_type == LayerSymbols.MLP: - # add pre_mlp_layernorm - self.set_mlp_state(src_model, **kwargs) - if self.args_cmd.save_lora_to_hf: return input_layernorm_weight = src_model.get_layers_input_layernorm_weight(layer_idx=src_layer_idx) self.set_layers_input_layernorm_weight(layer_idx=dst_layer_idx, data=input_layernorm_weight) - if self.args.post_norm: # false + if self.args.post_norm: post_attn_layernorm_weight = src_model.get_layers_self_attention_post_attention_layernorm_weight( layer_idx=src_layer_idx) self.set_layers_self_attention_post_attention_layernorm_weight(layer_idx=dst_layer_idx, data=post_attn_layernorm_weight) - else: # true + else: pre_mlp_layernorm_weight = src_model.get_layers_self_attention_pre_mlp_layernorm_weight( layer_idx=src_layer_idx) self.set_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=dst_layer_idx, @@ -267,91 +251,6 @@ class ModelBase(abc.ABC): pre_mlp_layernorm_bias = src_model.get_layers_self_attention_pre_mlp_layernorm_bias(layer_idx=src_layer_idx) self.set_layers_self_attention_pre_mlp_layernorm_bias(layer_idx=dst_layer_idx, data=pre_mlp_layernorm_bias) - def set_qwen_mamba_layer_state_base(self, src_model, src_layer_idx, dst_layer_idx): - # assert self.args_cmd.model_type_hf == "qwen3-moe-mamba" - self.args.hybrid_override_pattern = "*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" # TODO - dst_layer_idx = src_layer_idx * 2 - layer_type = self.args.hybrid_override_pattern[dst_layer_idx] - - if layer_type == LayerSymbols.MAMBA: - self.set_mamba_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) - - elif layer_type == LayerSymbols.ATTENTION: - # inpyt_layernorm - input_layernorm_weight = src_model.get_layers_input_layernorm_weight(layer_idx=src_layer_idx) - self.set_layers_input_layernorm_weight(layer_idx=src_layer_idx*2, data=input_layernorm_weight) - - # self_attention - self.set_attn_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) - - dst_layer_idx = dst_layer_idx + 1 - assert self.args.hybrid_override_pattern[dst_layer_idx] == LayerSymbols.MLP - if self.args.hybrid_override_pattern[dst_layer_idx] == LayerSymbols.MLP: - # pre_mlp_layernorm / post_attention_layernorm - pre_mlp_layernorm_weight = src_model.get_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=src_layer_idx) - self.set_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=dst_layer_idx, - data=pre_mlp_layernorm_weight) - # mlp - kwargs = {'src_layer_idx': src_layer_idx, "dst_layer_idx": dst_layer_idx} - self.set_mlp_state(src_model, **kwargs) - - # TODO - def set_mamba_state(self, src_layer_idx, dst_layer_idx, src_model): - - # 
mg - - # mixer - # in_proj - # conv1d - # norm - # out_proj - # norm - - - # hf (model.layers.1.input_layernorm.weight) - # - - # mamba_in_proj.weight - - # mamba.A_log - # mamba.D - - # mamba.conv1d.weight - # mamba.conv1d.bias - # mamba.dt_bias - - # mamba.norm.weight - # mamba.out_proj.weight - - - # model.layers.1.mamba.in_proj.weight - in_proj_weight = src_model.get_mamba_mixer_in_proj_weight(layer_idx=src_layer_idx) - # self.set_mamba_mixer_in_proj_weight(layer_idx=dst_layer_idx, data=in_proj_weight) # TODO [4384, 2048] (mg) [9248, 2048] (hf) - - # A_log, D, dt_bias (without weight, just nn.parameter) - A_log = src_model.get_mamba_mixer_A_log(layer_idx=src_layer_idx) - D = src_model.get_mamba_mixer_D(layer_idx=src_layer_idx) - dt_bias = src_model.get_mamba_mixer_dt_bias(layer_idx=src_layer_idx) - - # conv1d - conv1d_weight = src_model.get_mamba_mixer_conv1d_weight(layer_idx=src_layer_idx) - conv1d_bias = src_model.get_mamba_mixer_conv1d_bias(layer_idx=src_layer_idx) - self.set_mamba_mixer_conv1d_weight(layer_idx=dst_layer_idx, data=conv1d_weight) - self.set_mamba_mixer_conv1d_bias(layer_idx=dst_layer_idx, data=conv1d_bias) - - # model.layers.1.mamba.norm.weight - mixer_norm = src_model.get_mamba_mixer_norm_weight(layer_idx=src_layer_idx) - self.set_mamba_mixer_norm_weight(layer_idx=dst_layer_idx, data=mixer_norm) - - # model.layers.1.mamba.out_proj.weight - out_proj_weight = src_model.get_mamba_mixer_out_proj_weight(layer_idx=src_layer_idx) - self.set_mamba_mixer_out_proj_weight(layer_idx=dst_layer_idx, data=out_proj_weight) - - # mamba norm (not found in hf) # TODO - - pass - - def set_attn_state(self, src_layer_idx, dst_layer_idx, src_model): """Set self-attention params.""" if self.args.save_lora_to_hf: @@ -371,7 +270,7 @@ class ModelBase(abc.ABC): self.set_layers_self_attention_q_layernorm_weight(layer_idx=dst_layer_idx, data=q_layernorm) else: q_layernorm = src_model.get_layers_self_attention_q_layernorm_weight(layer_idx=src_layer_idx) - self.set_layers_self_attention_q_layernorm_weight(layer_idx=dst_layer_idx, data=q_layernorm) # error + self.set_layers_self_attention_q_layernorm_weight(layer_idx=dst_layer_idx, data=q_layernorm) k_layernorm = src_model.get_layers_self_attention_k_layernorm_weight(layer_idx=src_layer_idx) self.set_layers_self_attention_k_layernorm_weight(layer_idx=dst_layer_idx, data=k_layernorm) @@ -1502,8 +1401,7 @@ class MegatronMCoreModel(MegatronModel): "layers_mlp_linear_fc1": module_layer + "mlp.linear_fc1", "layers_mlp_linear_fc2": module_layer + "mlp.linear_fc2", "layers_self_attention_post_mlp_layernorm": module_layer + "post_mlp_layernorm", - # "final_layernorm": "decoder.final_layernorm", - "final_layernorm": "decoder.final_norm", + "final_layernorm": "decoder.final_layernorm", "output_layer": "output_layer", "rm_head": "rm_head" } @@ -1520,13 +1418,6 @@ class MegatronMCoreModel(MegatronModel): self.module_mapping["layers_self_attention_linear_qb"] = module_layer + "self_attention.linear_qb" self.module_mapping["layers_self_attention_linear_kvb"] = module_layer + "self_attention.linear_kvb" - # Mamba - self.module_mapping["mamba_mixer_A_log"] = module_layer + "A_log" - self.module_mapping["mamba_mixer_in_proj"] = module_layer + "mixer.in_proj" - self.module_mapping["mamba_mixer_out_proj"] = module_layer + "mixer.out_proj" - self.module_mapping["mamba_mixer_norm"] = module_layer + "mixer.norm" - - # shared experts self.module_mapping[ "layers_mlp_shared_experts_linear_fc1"] = module_layer + "mlp.shared_experts.linear_fc1" -- Gitee From 
b7d0fcfea6fd423ace95a87f1c69549eb14dbeab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Wed, 23 Jul 2025 17:44:44 +0800 Subject: [PATCH 07/16] support Qwen3-30B-A3B-Mamba training --- .../pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh | 13 +++-- mindspeed_llm/core/ssm/mamba_block.py | 5 ++ .../features_manager/models/mamba.py | 2 +- .../tasks/models/spec/qwen3_mamba_spec.py | 56 ++++++++++++++++++- 4 files changed, 68 insertions(+), 8 deletions(-) diff --git a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh index 66b12c0e5..5f59b529b 100644 --- a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh +++ b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh @@ -17,13 +17,15 @@ CKPT_SAVE_DIR="your model save ckpt path" DATA_PATH="your data path" TOKENIZER_PATH="your tokenizer path" CKPT_LOAD_DIR="your model ckpt path" - + + DATA_PATH="/home/ascend-vllm/dataset/alpaca/Qwen3-30B-A3B-convert-pretrain_text_document" TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" CKPT_SAVE_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-mamba" - -TP=1 -PP=1 +CKPT_LOAD_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-tp2-pp2-ep2" + +TP=2 +PP=2 EP=2 CP=1 @@ -44,6 +46,8 @@ DISTRIBUTED_ARGS=" NUM_LAYERS=96 LAYER_PATTEN="*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" +NUM_LAYERS=4 +LAYER_PATTEN="*-M-" MAMBA_ARGS=" --reuse-fp32-param \ --no-shared-storage \ @@ -176,4 +180,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_mamba.py \ $MAMBA_ARGS \ --distributed-backend nccl \ --save ${CKPT_SAVE_DIR} \ + --load ${CKPT_LOAD_DIR} | tee logs/train_mcore_qwen3_30b_a3b.log diff --git a/mindspeed_llm/core/ssm/mamba_block.py b/mindspeed_llm/core/ssm/mamba_block.py index 4c4266b51..3430ff83b 100644 --- a/mindspeed_llm/core/ssm/mamba_block.py +++ b/mindspeed_llm/core/ssm/mamba_block.py @@ -79,6 +79,11 @@ def _mamba_block_method_checkpointed_forward_func( inference_params=None, rotary_pos_emb=rotary_pos_emb, ) + # The attention layer (currently a simplified transformer layer) + # outputs a tuple of (hidden_states, context). Context is intended + # for cross-attention, and is not needed in our model. 
+ if isinstance(hidden_states, tuple): + hidden_states = hidden_states[0] return hidden_states return custom_forward diff --git a/mindspeed_llm/features_manager/models/mamba.py b/mindspeed_llm/features_manager/models/mamba.py index ac482e581..d3fed01e8 100644 --- a/mindspeed_llm/features_manager/models/mamba.py +++ b/mindspeed_llm/features_manager/models/mamba.py @@ -15,7 +15,7 @@ class MambaModel(MindSpeedFeature): group.add_argument('--mamba-d-state', type=int, default=128, help='state dim for mamba') group.add_argument('--mamba-d-conv', type=int, default=4, help='conv channel dim for mamba') group.add_argument('--mamba-expand', type=int, default=1, help='expand scale for mamba') - group.add_argument('--mamba-headdim', type=int, default=80, help='head dim for mamba') + group.add_argument('--mamba-headdim', type=int, default=64, help='head dim for mamba') def register_patches(self, patch_manager, args): from mindspeed_llm.core.ssm.mamba_mixer import mamba_mixer_init_wrapper, mamba_mixer_forward, Mamba2RMSNorm diff --git a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py index 0c463a42b..8578d7442 100644 --- a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py +++ b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py @@ -46,14 +46,64 @@ num_experts, moe_grouped_gemm, qk_layernorm = args.num_experts, args.moe_grouped # }, # ), # ) - + +# layer_spec = ModuleSpec( +# module=MambaStack, +# submodules=MambaStackSubmodules( +# mamba_layer=ModuleSpec( +# module=MambaLayer, +# submodules=MambaLayerSubmodules( +# norm=PTNorm, +# mixer=ModuleSpec( +# module=MambaMixer, +# submodules=MambaMixerSubmodules( +# in_proj=ColumnParallelLinear, +# out_proj=RowParallelLinear, +# ), +# ), +# mamba_bda=get_bias_dropout_add, +# ), +# ), +# attention_layer=ModuleSpec( +# module=TransformerLayer, +# submodules=TransformerLayerSubmodules( +# input_layernorm=PTNorm, +# self_attention=ModuleSpec( +# module=SelfAttention, +# params={"attn_mask_type": AttnMaskType.causal}, +# submodules=SelfAttentionSubmodules( +# linear_qkv=ColumnParallelLinear, +# core_attention=DotProductAttention, +# linear_proj=RowParallelLinear, +# ), +# ), +# self_attn_bda=get_bias_dropout_add, +# ), +# ), +# mlp_layer=ModuleSpec( +# module=TransformerLayer, +# submodules=TransformerLayerSubmodules( +# pre_mlp_layernorm=PTNorm, +# mlp=ModuleSpec( +# module=MLP, +# submodules=MLPSubmodules( +# linear_fc1=ColumnParallelLinear, +# linear_fc2=RowParallelLinear, +# ), +# ), +# mlp_bda=get_bias_dropout_add, +# ), +# ), +# ), +# ) + layer_spec = ModuleSpec( module=MambaStack, submodules=MambaStackSubmodules( mamba_layer=ModuleSpec( module=MambaLayer, submodules=MambaLayerSubmodules( - norm=PTNorm, + # norm=PTNorm, mixer=ModuleSpec( module=MambaMixer, submodules=MambaMixerSubmodules( @@ -67,7 +117,7 @@ layer_spec = ModuleSpec( attention_layer=ModuleSpec( module=TransformerLayer, submodules=TransformerLayerSubmodules( - input_layernorm=PTNorm, + # input_layernorm=PTNorm, self_attention=ModuleSpec( module=SelfAttention, params={"attn_mask_type": AttnMaskType.causal}, -- Gitee From a8549bfb53698513f3d66823aaa03a4df832163d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Wed, 23 Jul 2025 19:18:33 +0800 Subject: [PATCH 08/16] modified config --- .../qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh 
b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh index 5f59b529b..bef9e8346 100644 --- a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh +++ b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh @@ -22,7 +22,7 @@ CKPT_LOAD_DIR="your model ckpt path" DATA_PATH="/home/ascend-vllm/dataset/alpaca/Qwen3-30B-A3B-convert-pretrain_text_document" TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" CKPT_SAVE_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-mamba" -CKPT_LOAD_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-tp2-pp2-ep2" +CKPT_LOAD_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-Mamba2-tp2-pp2-ep2" TP=2 PP=2 @@ -57,10 +57,10 @@ MAMBA_ARGS=" --num-layers ${NUM_LAYERS} \ --mamba-ngroups 8 \ --mamba-chunk-size 128 \ - --mamba-d-state 128 \ + --mamba-d-state 64 \ --mamba-d-conv 4 \ --mamba-expand 2 \ - --mamba-headdim 64 \ + --mamba-headdim 128 \ --tokenizer-model ${TOKENIZER_PATH} \ --hybrid-attention-ratio 0.26 \ --hybrid-mlp-ratio 0.5 \ @@ -179,6 +179,6 @@ torchrun $DISTRIBUTED_ARGS pretrain_mamba.py \ $MODEL_PARALLEL_ARGS \ $MAMBA_ARGS \ --distributed-backend nccl \ + --load ${CKPT_LOAD_DIR} \ --save ${CKPT_SAVE_DIR} \ - --load ${CKPT_LOAD_DIR} | tee logs/train_mcore_qwen3_30b_a3b.log -- Gitee From 4a88d2f5f8e2fa40ed241d0fe09f4531fb9ce478 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Thu, 24 Jul 2025 16:10:20 +0800 Subject: [PATCH 09/16] add layer norm and modified pre_mlp_norm --- .../qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh | 8 ++++---- mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh index bef9e8346..873e79d97 100644 --- a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh +++ b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh @@ -22,7 +22,7 @@ CKPT_LOAD_DIR="your model ckpt path" DATA_PATH="/home/ascend-vllm/dataset/alpaca/Qwen3-30B-A3B-convert-pretrain_text_document" TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" CKPT_SAVE_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-mamba" -CKPT_LOAD_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-Mamba2-tp2-pp2-ep2" +CKPT_LOAD_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-Mamba2-v2-tp2-pp2-ep2" TP=2 PP=2 @@ -55,12 +55,12 @@ MAMBA_ARGS=" --use-flash-attn \ --use-mcore-models \ --num-layers ${NUM_LAYERS} \ - --mamba-ngroups 8 \ + --mamba-ngroups 4 \ --mamba-chunk-size 128 \ - --mamba-d-state 64 \ + --mamba-d-state 128 \ --mamba-d-conv 4 \ --mamba-expand 2 \ - --mamba-headdim 128 \ + --mamba-headdim 64 \ --tokenizer-model ${TOKENIZER_PATH} \ --hybrid-attention-ratio 0.26 \ --hybrid-mlp-ratio 0.5 \ diff --git a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py index 8578d7442..008dd51e3 100644 --- a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py +++ b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py @@ -103,7 +103,7 @@ layer_spec = ModuleSpec( mamba_layer=ModuleSpec( module=MambaLayer, submodules=MambaLayerSubmodules( - # norm=PTNorm, + norm=PTNorm, mixer=ModuleSpec( module=MambaMixer, submodules=MambaMixerSubmodules( @@ -117,7 +117,7 @@ layer_spec = ModuleSpec( attention_layer=ModuleSpec( module=TransformerLayer, submodules=TransformerLayerSubmodules( - # input_layernorm=PTNorm, + input_layernorm=PTNorm, self_attention=ModuleSpec( module=SelfAttention, params={"attn_mask_type": 
AttnMaskType.causal}, -- Gitee From 0cb2785e1d32cf0073d1634a81eacc1e9cc878ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Thu, 24 Jul 2025 16:20:23 +0800 Subject: [PATCH 10/16] remove redundant modified --- mindspeed_llm/features_manager/models/mamba.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindspeed_llm/features_manager/models/mamba.py b/mindspeed_llm/features_manager/models/mamba.py index d3fed01e8..c6cdbe879 100644 --- a/mindspeed_llm/features_manager/models/mamba.py +++ b/mindspeed_llm/features_manager/models/mamba.py @@ -15,7 +15,7 @@ class MambaModel(MindSpeedFeature): group.add_argument('--mamba-d-state', type=int, default=128, help='state dim for mamba') group.add_argument('--mamba-d-conv', type=int, default=4, help='conv channel dim for mamba') group.add_argument('--mamba-expand', type=int, default=1, help='expand scale for mamba') - group.add_argument('--mamba-headdim', type=int, default=64, help='head dim for mamba') + group.add_argument('--mamba-headdim', type=int, default=80, help='head dim for mamba') def register_patches(self, patch_manager, args): from mindspeed_llm.core.ssm.mamba_mixer import mamba_mixer_init_wrapper, mamba_mixer_forward, Mamba2RMSNorm -- Gitee From 55ac1151c7ebf9e71b865c356a60a014c7c844d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Mon, 28 Jul 2025 14:30:20 +0800 Subject: [PATCH 11/16] change head_dim=128 --- .../pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh | 53 +++++++++---------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh index 873e79d97..d3a9b2ab6 100644 --- a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh +++ b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh @@ -1,41 +1,40 @@ #!/bin/bash - + export HCCL_CONNECT_TIMEOUT=1800 export CUDA_DEVICE_MAX_CONNECTIONS=1 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export NPU_ASD_ENABLE=0 - + NPUS_PER_NODE=8 -MASTER_ADDR=localhost +MASTER_ADDR=7.150.14.181 MASTER_PORT=6000 -NNODES=1 -NODE_RANK=0 +NNODES=2 +NODE_RANK=1 WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) - + # please fill these path configurations CKPT_SAVE_DIR="your model save ckpt path" DATA_PATH="your data path" TOKENIZER_PATH="your tokenizer path" CKPT_LOAD_DIR="your model ckpt path" - -DATA_PATH="/home/ascend-vllm/dataset/alpaca/Qwen3-30B-A3B-convert-pretrain_text_document" +DATA_PATH="/home/ascend-vllm/dataset/lsb/enwiki20230101/Qwen3-30B-A3B-convert-pretrain_text_document" TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" CKPT_SAVE_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-mamba" -CKPT_LOAD_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-Mamba2-v2-tp2-pp2-ep2" +CKPT_LOAD_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-Mamba2-v3-tp1-pp4-ep4" -TP=2 -PP=2 -EP=2 +TP=1 +PP=4 +EP=4 CP=1 - + MBS=1 GBS=256 -SEQ_LENGTH=1024 +SEQ_LENGTH=4096 TRAIN_ITERS=2000 CP_TYPE='ulysses_cp_algo' ROUTER_BALANCING_TYPE='aux_loss' - + DISTRIBUTED_ARGS=" --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ @@ -43,11 +42,11 @@ DISTRIBUTED_ARGS=" --master_addr $MASTER_ADDR \ --master_port $MASTER_PORT " - + NUM_LAYERS=96 LAYER_PATTEN="*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" -NUM_LAYERS=4 -LAYER_PATTEN="*-M-" +# NUM_LAYERS=8 +# LAYER_PATTEN="*-M-*-M-" MAMBA_ARGS=" --reuse-fp32-param \ --no-shared-storage \ @@ -60,7 +59,7 @@ MAMBA_ARGS=" 
--mamba-d-state 128 \ --mamba-d-conv 4 \ --mamba-expand 2 \ - --mamba-headdim 64 \ + --mamba-headdim 128 \ --tokenizer-model ${TOKENIZER_PATH} \ --hybrid-attention-ratio 0.26 \ --hybrid-mlp-ratio 0.5 \ @@ -70,7 +69,7 @@ MAMBA_ARGS=" --overlap-grad-reduce \ --norm-epsilon 1e-6 \ " - + MOE_ARGS=" --num-experts 128 \ --moe-router-topk 8 \ @@ -82,7 +81,7 @@ MOE_ARGS=" --moe-token-dispatcher-type alltoall \ --moe-aux-loss-coeff 0.001 \ " - + OPTIMIZE_ARGS=" --use-flash-attn \ --use-fused-rotary-pos-emb \ @@ -97,7 +96,7 @@ OPTIMIZE_ARGS=" --recompute-granularity full \ --recompute-num-layers 1 \ " - + TRAIN_ARGS=" --micro-batch-size ${MBS} \ --global-batch-size ${GBS} \ @@ -119,7 +118,7 @@ TRAIN_ARGS=" --seq-length ${SEQ_LENGTH} \ --no-shared-storage " - + MODEL_PARALLEL_ARGS=" --tensor-model-parallel-size ${TP} \ --pipeline-model-parallel-size ${PP} \ @@ -127,7 +126,7 @@ MODEL_PARALLEL_ARGS=" --context-parallel-size ${CP} \ --context-parallel-algo ${CP_TYPE} \ " - + GPT_ARGS=" --use-mcore-models \ --spec mindspeed_llm.tasks.models.spec.qwen3_mamba_spec layer_spec \ @@ -154,12 +153,12 @@ GPT_ARGS=" --group-query-attention \ --num-query-groups 4 " - + DATA_ARGS=" --data-path $DATA_PATH \ --split 100,0,0 " - + OUTPUT_ARGS=" --log-interval 1 \ --save-interval ${TRAIN_ITERS} \ @@ -168,7 +167,7 @@ OUTPUT_ARGS=" --no-load-optim \ --no-load-rng " - + torchrun $DISTRIBUTED_ARGS pretrain_mamba.py \ $GPT_ARGS \ $DATA_ARGS \ -- Gitee From 9e7539c1694703ff100a03be2d40e7ded97746f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Tue, 29 Jul 2025 13:35:37 +0000 Subject: [PATCH 12/16] add inference_mamba.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: flippy航 <654733882@qq.com> --- inference_mamba.py | 139 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 inference_mamba.py diff --git a/inference_mamba.py b/inference_mamba.py new file mode 100644 index 000000000..ac4b8964f --- /dev/null +++ b/inference_mamba.py @@ -0,0 +1,139 @@ +# coding=utf-8 +# Copyright (c) 2024, HUAWEI CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Union
+
+from mindspeed_llm import megatron_adaptor
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec, \
+    get_gpt_layer_local_spec
+from megatron.core.transformer.spec_utils import import_module
+from megatron.training import get_args, print_rank_0
+from megatron.legacy.model import GPTModel
+from megatron.training.initialize import initialize_megatron
+from megatron.training.arguments import core_transformer_config_from_args
+from megatron.training.yaml_arguments import core_transformer_config_from_yaml
+
+from mindspeed_llm.tasks.inference.infer_base import task_factory
+from mindspeed_llm.tasks.inference.module import GPTModelInfer, MambaModelInfer, MegatronModuleForCausalLM
+from megatron.core.inference_params import InferenceParams
+
+
+def model_provider(pre_process=True, post_process=True) -> Union[MambaModelInfer, GPTModel]:
+    """Builds the model.
+
+    If use_mcore_models is set to True, this returns the mcore Mamba model; otherwise the legacy GPT model.
+
+    Args:
+        pre_process (bool, optional): Set to true if you need to compute embeddings. Defaults to True.
+        post_process (bool, optional): Set to true if you want to compute output logits/loss. Defaults to True.
+
+    Returns:
+        Union[MambaModelInfer, GPTModel]: The returned model
+    """
+    args = get_args()
+    use_te = args.transformer_impl == "transformer_engine"
+
+    if args.sequence_parallel and args.use_kv_cache:
+        raise AssertionError('Use_kv_cache can not be true in sequence_parallel mode.')
+
+    print_rank_0('building GPT model ...')
+    # Experimental loading arguments from yaml
+    if args.yaml_cfg is not None:
+        config = core_transformer_config_from_yaml(args, "language_model")
+    else:
+        config = core_transformer_config_from_args(args)
+
+    if args.spec is not None:
+        mamba_stack_spec = import_module(args.spec)
+    else:
+        raise AssertionError("You must provide a valid Mamba layer spec!")
+ + if args.use_mcore_models: + + model = MambaModelInfer( + config=config, + mamba_stack_spec=mamba_stack_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + mamba_ssm_ngroups=args.mamba_ngroups, + pre_process=pre_process, + hybrid_attention_ratio=args.hybrid_attention_ratio, + hybrid_mlp_ratio=args.hybrid_mlp_ratio, + hybrid_override_pattern=args.hybrid_override_pattern, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent, + rotary_base=args.rotary_base + ) + + else: + if not args.context_parallel_size == 1: + raise ValueError("Context parallelism is only supported with Megatron Core!") + + model = GPTModel( + config, + parallel_output=True if args.sequence_parallel else False, + pre_process=pre_process, + post_process=post_process + ) + + return model + + +def main(): + initialize_megatron(args_defaults={'no_load_rng': True, + 'no_load_optim': True}) + + args = get_args() + + model = MegatronModuleForCausalLM.from_pretrained( + model_provider=model_provider, + pretrained_model_name_or_path=args.load + ) + + task_factory(args, model) + + + # # 生成指定输入 + # import torch + # import numpy as np + # from megatron.training.utils import get_ltor_masks_and_position_ids + # input_ids = torch.tensor([i for i in range(10000, 12048)]).unsqueeze(0).npu() + # eod = 0 + # reset_position_ids = False + # reset_attention_mask = False + # eod_mask_loss = False + # max_batch_size = 1 + # max_sequence_length = 2048 + # attention_mask, loss_mask, _ = get_ltor_masks_and_position_ids( + # input_ids, + # eod, + # reset_position_ids, + # reset_attention_mask, + # eod_mask_loss) + # inference_params = InferenceParams(max_batch_size, max_sequence_length) + # with torch.no_grad(): + # outputs = model.forward(input_ids=input_ids, position_ids=None, attention_mask=attention_mask.npu(), inference_params=inference_params) + # print(outputs.shape, outputs.dtype) + # np.save("./npu_forward_out_mg_qewn3_mamba_logits_fp16.npy", outputs.cpu().numpy()) + + +if __name__ == "__main__": + main() \ No newline at end of file -- Gitee From 151b1e0598f7bb8edac9503f65c9726a031d0c7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Tue, 29 Jul 2025 13:36:24 +0000 Subject: [PATCH 13/16] add examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: flippy航 <654733882@qq.com> --- .../generate_qwen3_30b_a3b_ptd_mamba.sh | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh diff --git a/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh b/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh new file mode 100644 index 000000000..f9ba7c3cd --- /dev/null +++ b/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# The number of parameters is not aligned +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +# please fill these path configurations +TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" +CHECKPOINT="/home/ascend-vllm/model/Qwen3-30B-A3B-Mamba2-v4-tp1-pp8-ep1" + + +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +NPUS_PER_NODE=8 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +TP=1 +PP=8 +EP=1 +SEQ_LENGTH=2048 +ROUTER_BALANCING_TYPE='softmax_topk' + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +MOE_ARGS=" + --num-experts 128 \ + --moe-router-topk 8 \ + --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \ + --moe-intermediate-size 768 \ + --moe-permutation-async-comm \ + --moe-token-dispatcher-type allgather \ + --moe-aux-loss-coeff 0.001 +" + + +NUM_LAYERS=96 +LAYER_PATTEN="*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" +# NUM_LAYERS=8 +# LAYER_PATTEN="*-M-*-M-" +MAMBA_ARGS=" + --reuse-fp32-param \ + --no-shared-storage \ + --use-distributed-optimizer \ + --use-flash-attn \ + --use-mcore-models \ + --num-layers ${NUM_LAYERS} \ + --mamba-ngroups 4 \ + --mamba-chunk-size 128 \ + --mamba-d-state 128 \ + --mamba-d-conv 4 \ + --mamba-expand 2 \ + --mamba-headdim 128 \ + --tokenizer-model ${TOKENIZER_PATH} \ + --hybrid-attention-ratio 0.26 \ + --hybrid-mlp-ratio 0.5 \ + --hybrid-override-pattern $LAYER_PATTEN \ + --untie-embeddings-and-output-weights \ + --overlap-param-gather \ + --overlap-grad-reduce \ + --norm-epsilon 1e-6 \ +" + +torchrun $DISTRIBUTED_ARGS inference_mamba.py \ + $MOE_ARGS \ + $MAMBA_ARGS \ + --use-mcore-models \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --expert-model-parallel-size ${EP} \ + --load ${CHECKPOINT} \ + --moe-grouped-gemm \ + --norm-topk-prob \ + --spec mindspeed_llm.tasks.models.spec.qwen3_mamba_spec layer_spec \ + --kv-channels 128 \ + --qk-layernorm \ + --num-layers ${NUM_LAYERS} \ + --hidden-size 2048 \ + --use-rotary-position-embeddings \ + --num-attention-heads 32 \ + --ffn-hidden-size 8192 \ + --max-position-embeddings 40960 \ + --seq-length ${SEQ_LENGTH} \ + --make-vocab-size-divisible-by 1 \ + --padded-vocab-size 151936 \ + --rotary-base 1000000 \ + --untie-embeddings-and-output-weights \ + --micro-batch-size 1 \ + --disable-bias-linear \ + --swiglu \ + --use-fused-swiglu \ + --use-fused-rmsnorm \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path ${TOKENIZER_PATH} \ + --normalization RMSNorm \ + --position-embedding-type rope \ + --norm-epsilon 1e-6 \ + --hidden-dropout 0 \ + --attention-dropout 0 \ + --tokenizer-not-use-fast \ + --max-new-tokens 256 \ + --no-gradient-accumulation-fusion \ + --attention-softmax-in-fp32 \ + --exit-on-missing-checkpoint \ + --no-masked-softmax-fusion \ + 
--group-query-attention \ + --num-query-groups 4 \ + --seed 42 \ + --bf16 \ + | tee logs/generate_mcore_qwen3_30b_a3b.log -- Gitee From fa4b018190284ffbceb55156343a62d43935de1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Tue, 29 Jul 2025 21:40:01 +0800 Subject: [PATCH 14/16] support mamba inference --- mindspeed_llm/core/ssm/mamba_mixer.py | 2 +- mindspeed_llm/tasks/inference/module.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/mindspeed_llm/core/ssm/mamba_mixer.py b/mindspeed_llm/core/ssm/mamba_mixer.py index 3ae0b6f77..76b12a774 100644 --- a/mindspeed_llm/core/ssm/mamba_mixer.py +++ b/mindspeed_llm/core/ssm/mamba_mixer.py @@ -131,7 +131,7 @@ def mamba_mixer_forward(self, hidden_states, seqlen=None, seq_idx=None, cu_seqle ) state_opts = StateOptions( - return_final_state=True if ssm_state else False + return_final_state=True if ssm_state is not None else False ) state_space_duality = StateSpaceProcessor(config=config) y = state_space_duality.process(inputs, state_opts) diff --git a/mindspeed_llm/tasks/inference/module.py b/mindspeed_llm/tasks/inference/module.py index cc9da9d48..c4e4d9dcb 100644 --- a/mindspeed_llm/tasks/inference/module.py +++ b/mindspeed_llm/tasks/inference/module.py @@ -25,6 +25,7 @@ from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from megatron.core.models.gpt.gpt_model import GPTModel from megatron.training import get_args, global_vars from megatron.core import parallel_state, ModelParallelConfig +from megatron.core.models.mamba import MambaModel class MegatronModuleForCausalLMABC(torch.nn.Module, abc.ABC): @@ -501,5 +502,13 @@ class GPTModelInfer(GPTModel): super().__init__(*args, **kwargs) self.infer_model = MegatronModuleForCausalLM() + def generate(self, input_ids=None, **kwargs): + return self.infer_model.generate(input_ids=input_ids, **kwargs) + +class MambaModelInfer(MambaModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.infer_model = MegatronModuleForCausalLM() + def generate(self, input_ids=None, **kwargs): return self.infer_model.generate(input_ids=input_ids, **kwargs) \ No newline at end of file -- Gitee From 05ddf3ace04fbb897193d06bf157a2b72019f31e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Wed, 30 Jul 2025 09:53:34 +0800 Subject: [PATCH 15/16] replace /r --- .../generate_qwen3_30b_a3b_ptd_mamba.sh | 234 +++++++++--------- 1 file changed, 117 insertions(+), 117 deletions(-) diff --git a/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh b/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh index f9ba7c3cd..c33ceda35 100644 --- a/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh +++ b/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh @@ -1,117 +1,117 @@ -#!/bin/bash - -# The number of parameters is not aligned -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -# please fill these path configurations -TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" -CHECKPOINT="/home/ascend-vllm/model/Qwen3-30B-A3B-Mamba2-v4-tp1-pp8-ep1" - - -# Change for multinode config -MASTER_ADDR=localhost -MASTER_PORT=6000 -NNODES=1 -NODE_RANK=0 -NPUS_PER_NODE=8 -WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) - -TP=1 -PP=8 -EP=1 -SEQ_LENGTH=2048 -ROUTER_BALANCING_TYPE='softmax_topk' - -DISTRIBUTED_ARGS=" - --nproc_per_node $NPUS_PER_NODE \ - --nnodes $NNODES \ - --node_rank $NODE_RANK \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT -" - -MOE_ARGS=" - 
--num-experts 128 \ - --moe-router-topk 8 \ - --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \ - --moe-intermediate-size 768 \ - --moe-permutation-async-comm \ - --moe-token-dispatcher-type allgather \ - --moe-aux-loss-coeff 0.001 -" - - -NUM_LAYERS=96 -LAYER_PATTEN="*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" -# NUM_LAYERS=8 -# LAYER_PATTEN="*-M-*-M-" -MAMBA_ARGS=" - --reuse-fp32-param \ - --no-shared-storage \ - --use-distributed-optimizer \ - --use-flash-attn \ - --use-mcore-models \ - --num-layers ${NUM_LAYERS} \ - --mamba-ngroups 4 \ - --mamba-chunk-size 128 \ - --mamba-d-state 128 \ - --mamba-d-conv 4 \ - --mamba-expand 2 \ - --mamba-headdim 128 \ - --tokenizer-model ${TOKENIZER_PATH} \ - --hybrid-attention-ratio 0.26 \ - --hybrid-mlp-ratio 0.5 \ - --hybrid-override-pattern $LAYER_PATTEN \ - --untie-embeddings-and-output-weights \ - --overlap-param-gather \ - --overlap-grad-reduce \ - --norm-epsilon 1e-6 \ -" - -torchrun $DISTRIBUTED_ARGS inference_mamba.py \ - $MOE_ARGS \ - $MAMBA_ARGS \ - --use-mcore-models \ - --tensor-model-parallel-size ${TP} \ - --pipeline-model-parallel-size ${PP} \ - --expert-model-parallel-size ${EP} \ - --load ${CHECKPOINT} \ - --moe-grouped-gemm \ - --norm-topk-prob \ - --spec mindspeed_llm.tasks.models.spec.qwen3_mamba_spec layer_spec \ - --kv-channels 128 \ - --qk-layernorm \ - --num-layers ${NUM_LAYERS} \ - --hidden-size 2048 \ - --use-rotary-position-embeddings \ - --num-attention-heads 32 \ - --ffn-hidden-size 8192 \ - --max-position-embeddings 40960 \ - --seq-length ${SEQ_LENGTH} \ - --make-vocab-size-divisible-by 1 \ - --padded-vocab-size 151936 \ - --rotary-base 1000000 \ - --untie-embeddings-and-output-weights \ - --micro-batch-size 1 \ - --disable-bias-linear \ - --swiglu \ - --use-fused-swiglu \ - --use-fused-rmsnorm \ - --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path ${TOKENIZER_PATH} \ - --normalization RMSNorm \ - --position-embedding-type rope \ - --norm-epsilon 1e-6 \ - --hidden-dropout 0 \ - --attention-dropout 0 \ - --tokenizer-not-use-fast \ - --max-new-tokens 256 \ - --no-gradient-accumulation-fusion \ - --attention-softmax-in-fp32 \ - --exit-on-missing-checkpoint \ - --no-masked-softmax-fusion \ - --group-query-attention \ - --num-query-groups 4 \ - --seed 42 \ - --bf16 \ - | tee logs/generate_mcore_qwen3_30b_a3b.log +#!/bin/bash + +# The number of parameters is not aligned +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +# please fill these path configurations +TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" +CHECKPOINT="/home/ascend-vllm/model/Qwen3-30B-A3B-Mamba2-v4-tp1-pp8-ep1" + + +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +NPUS_PER_NODE=8 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +TP=1 +PP=8 +EP=1 +SEQ_LENGTH=2048 +ROUTER_BALANCING_TYPE='softmax_topk' + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +MOE_ARGS=" + --num-experts 128 \ + --moe-router-topk 8 \ + --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \ + --moe-intermediate-size 768 \ + --moe-permutation-async-comm \ + --moe-token-dispatcher-type allgather \ + --moe-aux-loss-coeff 0.001 +" + + +NUM_LAYERS=96 +LAYER_PATTEN="*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" +# NUM_LAYERS=8 +# LAYER_PATTEN="*-M-*-M-" +MAMBA_ARGS=" + --reuse-fp32-param \ + 
--no-shared-storage \ + --use-distributed-optimizer \ + --use-flash-attn \ + --use-mcore-models \ + --num-layers ${NUM_LAYERS} \ + --mamba-ngroups 4 \ + --mamba-chunk-size 128 \ + --mamba-d-state 128 \ + --mamba-d-conv 4 \ + --mamba-expand 2 \ + --mamba-headdim 128 \ + --tokenizer-model ${TOKENIZER_PATH} \ + --hybrid-attention-ratio 0.26 \ + --hybrid-mlp-ratio 0.5 \ + --hybrid-override-pattern $LAYER_PATTEN \ + --untie-embeddings-and-output-weights \ + --overlap-param-gather \ + --overlap-grad-reduce \ + --norm-epsilon 1e-6 \ +" + +torchrun $DISTRIBUTED_ARGS inference_mamba.py \ + $MOE_ARGS \ + $MAMBA_ARGS \ + --use-mcore-models \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --expert-model-parallel-size ${EP} \ + --load ${CHECKPOINT} \ + --moe-grouped-gemm \ + --norm-topk-prob \ + --spec mindspeed_llm.tasks.models.spec.qwen3_mamba_spec layer_spec \ + --kv-channels 128 \ + --qk-layernorm \ + --num-layers ${NUM_LAYERS} \ + --hidden-size 2048 \ + --use-rotary-position-embeddings \ + --num-attention-heads 32 \ + --ffn-hidden-size 8192 \ + --max-position-embeddings 40960 \ + --seq-length ${SEQ_LENGTH} \ + --make-vocab-size-divisible-by 1 \ + --padded-vocab-size 151936 \ + --rotary-base 1000000 \ + --untie-embeddings-and-output-weights \ + --micro-batch-size 1 \ + --disable-bias-linear \ + --swiglu \ + --use-fused-swiglu \ + --use-fused-rmsnorm \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path ${TOKENIZER_PATH} \ + --normalization RMSNorm \ + --position-embedding-type rope \ + --norm-epsilon 1e-6 \ + --hidden-dropout 0 \ + --attention-dropout 0 \ + --tokenizer-not-use-fast \ + --max-new-tokens 256 \ + --no-gradient-accumulation-fusion \ + --attention-softmax-in-fp32 \ + --exit-on-missing-checkpoint \ + --no-masked-softmax-fusion \ + --group-query-attention \ + --num-query-groups 4 \ + --seed 42 \ + --bf16 \ + | tee logs/generate_mcore_qwen3_30b_a3b.log -- Gitee From 6d3a6adc7983942f23c1425ba3bda9e7e6b59410 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Fri, 1 Aug 2025 17:15:07 +0800 Subject: [PATCH 16/16] precision alignment for Mamba and MoE --- mindspeed_llm/core/ssm/mamba_mixer.py | 6 +++--- mindspeed_llm/core/transformer/moe/moe_layer.py | 1 + .../tasks/models/ssm/state_space_duality.py | 14 +++++++++----- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/mindspeed_llm/core/ssm/mamba_mixer.py b/mindspeed_llm/core/ssm/mamba_mixer.py index 76b12a774..2d6045378 100644 --- a/mindspeed_llm/core/ssm/mamba_mixer.py +++ b/mindspeed_llm/core/ssm/mamba_mixer.py @@ -22,8 +22,8 @@ def mamba_mixer_init_wrapper(fn): kwargs["expand"] = param_args.mamba_expand kwargs["headdim"] = param_args.mamba_headdim fn(self, *args, **kwargs) - dt_min = kwargs.pop('dt_min', 0.001) - dt_max = kwargs.pop('dt_max', 0.1) + dt_min = kwargs.pop('dt_min', 0.0) + dt_max = kwargs.pop('dt_max', float("inf")) self.use_mem_eff_path = False self.d_ssm = param_args.mamba_d_ssm self.dt_min = dt_min @@ -102,9 +102,9 @@ def mamba_mixer_forward(self, hidden_states, seqlen=None, seq_idx=None, cu_seqle x, B, C = torch.split( xBC, [ - self.d_inner_local, self.ngroups_local * self.d_state, self.ngroups_local * self.d_state, + self.d_inner_local, ], dim=-1, ) diff --git a/mindspeed_llm/core/transformer/moe/moe_layer.py b/mindspeed_llm/core/transformer/moe/moe_layer.py index c4997a29c..52f3de78e 100644 --- a/mindspeed_llm/core/transformer/moe/moe_layer.py +++ b/mindspeed_llm/core/transformer/moe/moe_layer.py @@ -93,6 +93,7 @@ def 
moe_layer_forward(self, hidden_states: torch.Tensor):
 
     # process MoE
     scores, indices = self.router(hidden_states)
+    scores = scores / scores.sum(dim=-1, keepdim=True)
 
     if global_args.moe_revert_type_after_topk:
         (dispatched_input, tokens_per_expert) = self.token_dispatcher.token_permutation(
diff --git a/mindspeed_llm/tasks/models/ssm/state_space_duality.py b/mindspeed_llm/tasks/models/ssm/state_space_duality.py
index 809eeffaf..c68d60087 100644
--- a/mindspeed_llm/tasks/models/ssm/state_space_duality.py
+++ b/mindspeed_llm/tasks/models/ssm/state_space_duality.py
@@ -77,12 +77,14 @@ class StateSpaceProcessor:
 
         # Dimension transformations
         x, dt, A, B, C = self._expand_dims(x, A, dt, B, C)
-        B_exp, C_exp = self._expand_groups_to_heads(B, C)
+        # B_exp, C_exp = self._expand_groups_to_heads(B, C)
+        x_exp, B_exp = self._expand_groups_to_heads(x, B)
         dt_proc = self._process_time_step(dt)
-        D = self._prepare_residual(D, x, pad_size)
+        D = self._prepare_residual(D, x_exp, pad_size)
 
         # Chunk processing
-        x_pad, A_pad, B_pad, C_pad = self._chunk_and_pad(x, dt_proc, A, B_exp, C_exp, pad_size)
+        # x_pad, A_pad, B_pad, C_pad = self._chunk_and_pad(x, dt_proc, A, B_exp, C_exp, pad_size)
+        x_pad, A_pad, B_pad, C_pad = self._chunk_and_pad(x_exp, dt_proc, A, B_exp, C, pad_size)
 
         # Core computations
         Y_diag, states, A_cum, C_br = self._compute_diagonal_blocks(A_pad, B_pad, C_pad, x_pad)
@@ -93,11 +95,13 @@ class StateSpaceProcessor:
         return self._synthesize_output((Y_diag, Y_off, D), (pad_size, seq_len), state_opts)
 
     def _expand_dims(self, x, A, dt, B, C):
-        x = rearrange(x, "b l (h p) -> b l h p", p=self.config['headdim']).contiguous()
+        # x = rearrange(x, "b l (h p) -> b l h p", p=self.config['headdim']).contiguous()
+        C = rearrange(C, "b l (h p) -> b l h p", p=self.config['headdim']).contiguous()
         dt = dt.contiguous()
         A = A.contiguous()
         B = rearrange(B, "b l (g n) -> b l g n", n=self.config['d_state']).contiguous()
-        C = rearrange(C, "b l (g n) -> b l g n", n=self.config['d_state']).contiguous()
+        x = rearrange(x, "b l (g n) -> b l g n", n=self.config['d_state']).contiguous()
+        # C = rearrange(C, "b l (g n) -> b l g n", n=self.config['d_state']).contiguous()
         return x, dt, A, B, C
 
     def _prepare_initial_states(self, states: Optional[torch.Tensor]) -> torch.Tensor:
-- Gitee
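Note on the router-score renormalization introduced in [PATCH 16/16] above: Qwen3-MoE routes each token to 8 of 128 experts, and once the softmax has been taken over all experts the 8 retained probabilities no longer sum to 1. The added line "scores = scores / scores.sum(dim=-1, keepdim=True)" rescales them per token, which is the behaviour the generation script requests with --norm-topk-prob. The sketch below is illustrative only; the helper name topk_router_scores is hypothetical and not part of MindSpeed-LLM, and it assumes a plain softmax-then-top-k router as configured in these scripts.

import torch

def topk_router_scores(logits, k):
    """Softmax over all experts, keep the top-k, then renormalize the kept scores.

    Without the final division the k selected probabilities sum to less than 1
    (the mass of the unselected experts is lost), which changes the scale of the
    combined expert outputs.
    """
    probs = torch.softmax(logits, dim=-1)               # [tokens, num_experts]
    scores, indices = torch.topk(probs, k, dim=-1)      # [tokens, k]
    scores = scores / scores.sum(dim=-1, keepdim=True)  # same rescaling as the line added in moe_layer_forward
    return scores, indices

if __name__ == "__main__":
    logits = torch.randn(4, 128)                  # 4 tokens, 128 experts as in the Qwen3-30B-A3B configs above
    scores, indices = topk_router_scores(logits, k=8)
    print(scores.sum(dim=-1))                     # each row sums to 1.0 after renormalization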