From 4d7b34b601947bce5b0b9afdca1cfb788a37c408 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Mon, 21 Jul 2025 09:39:30 +0000 Subject: [PATCH 01/16] add mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: flippy航 <654733882@qq.com> --- .../tasks/models/spec/qwen3_mamba_spec.py | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py diff --git a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py new file mode 100644 index 000000000..249231308 --- /dev/null +++ b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py @@ -0,0 +1,105 @@ +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec +from megatron.training import get_args + +from megatron.core.ssm.mamba_block import MambaStack, MambaStackSubmodules +from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules +from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules + +from megatron.core.transformer import ModuleSpec, TransformerLayer, TransformerLayerSubmodules +from mindspeed_llm.core.transformer.custom_layers.transformer_engine import PTNorm + +args = get_args() +num_experts, moe_grouped_gemm, qk_layernorm = args.num_experts, args.moe_grouped_gemm, args.qk_layernorm + +# # Transformer Layer Spec for Gemma using post_mlp_layernorm and post_mlp_layernorm. 
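+#
+# The commented-out ModuleSpec below mirrors the plain (non-hybrid) Qwen3
+# transformer-layer spec and is retained only for reference. The spec the
+# training script actually loads via
+# `--spec mindspeed_llm.tasks.models.spec.qwen3_mamba_spec layer_spec` is the
+# MambaStack spec defined further down, where `--hybrid-override-pattern`
+# selects the layer type per position ("M" = Mamba layer, "*" = attention
+# layer, "-" = MLP layer).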
+# layer_spec = ModuleSpec( +# module=TransformerLayer, +# submodules=TransformerLayerSubmodules( +# input_layernorm=PTNorm, +# self_attention=ModuleSpec( +# module=SelfAttention, +# params={"attn_mask_type": AttnMaskType.causal}, +# submodules=SelfAttentionSubmodules( +# linear_qkv=ColumnParallelLinear, +# core_attention=DotProductAttention, +# linear_proj=RowParallelLinear, +# q_layernorm=PTNorm if qk_layernorm else IdentityOp, +# k_layernorm=PTNorm if qk_layernorm else IdentityOp, +# ), +# ), +# self_attn_bda=get_bias_dropout_add, +# pre_mlp_layernorm=PTNorm, +# mlp=_get_mlp_module_spec( +# use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm +# ), +# mlp_bda=get_bias_dropout_add, +# sharded_state_dict_keys_map={ +# 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', +# 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', +# }, +# ), +# ) + +layer_spec = ModuleSpec( + module=MambaStack, + submodules=MambaStackSubmodules( + mamba_layer=ModuleSpec( + module=MambaLayer, + submodules=MambaLayerSubmodules( + norm=PTNorm, + mixer=ModuleSpec( + module=MambaMixer, + submodules=MambaMixerSubmodules( + in_proj=ColumnParallelLinear, + out_proj=RowParallelLinear, + ), + ), + mamba_bda=get_bias_dropout_add, + ), + ), + attention_layer=ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=PTNorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + ), + ), + self_attn_bda=get_bias_dropout_add, + ), + ), + mlp_layer=ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + pre_mlp_layernorm=PTNorm, + # mlp=ModuleSpec( + # module=MLP, + # submodules=MLPSubmodules( + # linear_fc1=ColumnParallelLinear, + # linear_fc2=RowParallelLinear, + # ), + # ), + mlp=_get_mlp_module_spec( + use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm + ), + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), + ), + ), +) \ No newline at end of file -- Gitee From 0d43db8e4921dcb8ecb2a77749d28c25902eb5d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Mon, 21 Jul 2025 09:50:54 +0000 Subject: [PATCH 02/16] add examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: flippy航 <654733882@qq.com> --- .../pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh | 179 ++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh diff --git a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh new file mode 100644 index 000000000..5cfde0d3c --- /dev/null +++ b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh @@ -0,0 +1,179 @@ +#!/bin/bash + +export HCCL_CONNECT_TIMEOUT=1800 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True +export NPU_ASD_ENABLE=0 + +NPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +# please fill these path configurations +CKPT_SAVE_DIR="your model save ckpt path" +DATA_PATH="your data path" +TOKENIZER_PATH="your tokenizer path" +CKPT_LOAD_DIR="your model ckpt path" + +DATA_PATH="/home/ascend-vllm/dataset/alpaca/Qwen3-30B-A3B-convert-pretrain_text_document" +TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" +CKPT_SAVE_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-mamba" + +TP=1 +PP=1 +EP=2 +CP=1 + +MBS=1 +GBS=256 +SEQ_LENGTH=1024 +TRAIN_ITERS=2000 +CP_TYPE='ulysses_cp_algo' +ROUTER_BALANCING_TYPE='aux_loss' + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +NUM_LAYERS=96 +LAYER_PATTEN="*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" +MAMBA_ARGS=" + --reuse-fp32-param \ + --no-shared-storage \ + --use-distributed-optimizer \ + --use-flash-attn \ + --use-mcore-models \ + --num-layers ${NUM_LAYERS} \ + --mamba-ngroups 8 \ + --mamba-chunk-size 128 \ + --mamba-d-state 128 \ + --mamba-d-conv 4 \ + --mamba-expand 2 \ + --mamba-headdim 64 \ + --tokenizer-model ${TOKENIZER_PATH} \ + --hybrid-attention-ratio 0.26 \ + --hybrid-mlp-ratio 0.5 \ + --hybrid-override-pattern $LAYER_PATTEN \ + --untie-embeddings-and-output-weights \ + --overlap-param-gather \ + --overlap-grad-reduce \ + --norm-epsilon 1e-6 \ +" + +MOE_ARGS=" + --num-experts 128 \ + --moe-router-topk 8 \ + --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \ + --moe-intermediate-size 768 \ + --moe-grouped-gemm \ + --use-fused-moe-token-permute-and-unpermute \ + --moe-permutation-async-comm \ + --moe-token-dispatcher-type alltoall \ + --moe-aux-loss-coeff 0.001 \ +" + +OPTIMIZE_ARGS=" + --use-flash-attn \ + --use-fused-rotary-pos-emb \ + --sequence-parallel \ + --use-rotary-position-embeddings \ + --use-fused-swiglu \ + --use-fused-rmsnorm \ + --no-masked-softmax-fusion \ + --use-distributed-optimizer + --gemm-gradient-accumulation-fusion \ + --recompute-method uniform \ + --recompute-granularity full \ + --recompute-num-layers 1 \ +" + +TRAIN_ARGS=" + --micro-batch-size ${MBS} \ + --global-batch-size ${GBS} \ + --lr 1.25e-6 \ + --lr-decay-style cosine \ + --min-lr 1.25e-7 \ + --weight-decay 1e-1 \ + --lr-warmup-fraction 0.01 \ + --attention-dropout 0.0 \ + --init-method-std 0.01 \ + --hidden-dropout 0.0 \ + --clip-grad 1.0 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --initial-loss-scale 4096 \ + --seed 42 \ + --bf16 \ + --train-iters ${TRAIN_ITERS} \ + --seq-length ${SEQ_LENGTH} \ + --no-shared-storage +" + +MODEL_PARALLEL_ARGS=" + 
--tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --expert-model-parallel-size ${EP} \ + --context-parallel-size ${CP} \ + --context-parallel-algo ${CP_TYPE} \ +" + +GPT_ARGS=" + --use-mcore-models \ + --spec mindspeed_llm.tasks.models.spec.qwen3_mamba_spec layer_spec \ + --kv-channels 128 \ + --qk-layernorm \ + --norm-topk-prob \ + --tokenizer-name-or-path ${TOKENIZER_PATH} \ + --max-position-embeddings ${SEQ_LENGTH} \ + --num-layers 48 \ + --hidden-size 2048 \ + --ffn-hidden-size 6144 \ + --num-attention-heads 32 \ + --tokenizer-type PretrainedFromHF \ + --make-vocab-size-divisible-by 1 \ + --padded-vocab-size 151936 \ + --rotary-base 1000000 \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --position-embedding-type rope \ + --normalization RMSNorm \ + --swiglu \ + --attention-softmax-in-fp32 \ + --no-gradient-accumulation-fusion \ + --group-query-attention \ + --num-query-groups 4 +" + +DATA_ARGS=" + --data-path $DATA_PATH \ + --split 100,0,0 +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval ${TRAIN_ITERS} \ + --eval-interval ${TRAIN_ITERS} \ + --eval-iters 0 \ + --no-load-optim \ + --no-load-rng +" + +torchrun $DISTRIBUTED_ARGS pretrain_mamba.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $MOE_ARGS \ + $OUTPUT_ARGS \ + $OPTIMIZE_ARGS \ + $TRAIN_ARGS \ + $MODEL_PARALLEL_ARGS \ + $MAMBA_ARGS \ + --distributed-backend nccl \ + --save ${CKPT_SAVE_DIR} \ + | tee logs/train_mcore_qwen3_30b_a3b.log \ No newline at end of file -- Gitee From fb2057cce2cc5df6a16d54eb6dfd0982a520f1be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Fri, 18 Jul 2025 15:05:23 +0800 Subject: [PATCH 03/16] support Qwen3-MoE-mamba --- .../pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh | 358 +++++++++--------- .../tasks/models/spec/qwen3_mamba_spec.py | 208 +++++----- 2 files changed, 283 insertions(+), 283 deletions(-) diff --git a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh index 5cfde0d3c..66b12c0e5 100644 --- a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh +++ b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh @@ -1,179 +1,179 @@ -#!/bin/bash - -export HCCL_CONNECT_TIMEOUT=1800 -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True -export NPU_ASD_ENABLE=0 - -NPUS_PER_NODE=8 -MASTER_ADDR=localhost -MASTER_PORT=6000 -NNODES=1 -NODE_RANK=0 -WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) - -# please fill these path configurations -CKPT_SAVE_DIR="your model save ckpt path" -DATA_PATH="your data path" -TOKENIZER_PATH="your tokenizer path" -CKPT_LOAD_DIR="your model ckpt path" - -DATA_PATH="/home/ascend-vllm/dataset/alpaca/Qwen3-30B-A3B-convert-pretrain_text_document" -TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" -CKPT_SAVE_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-mamba" - -TP=1 -PP=1 -EP=2 -CP=1 - -MBS=1 -GBS=256 -SEQ_LENGTH=1024 -TRAIN_ITERS=2000 -CP_TYPE='ulysses_cp_algo' -ROUTER_BALANCING_TYPE='aux_loss' - -DISTRIBUTED_ARGS=" - --nproc_per_node $NPUS_PER_NODE \ - --nnodes $NNODES \ - --node_rank $NODE_RANK \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT -" - -NUM_LAYERS=96 -LAYER_PATTEN="*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" -MAMBA_ARGS=" - --reuse-fp32-param \ - --no-shared-storage \ - --use-distributed-optimizer \ - --use-flash-attn \ - --use-mcore-models \ - --num-layers ${NUM_LAYERS} \ - --mamba-ngroups 8 
\ - --mamba-chunk-size 128 \ - --mamba-d-state 128 \ - --mamba-d-conv 4 \ - --mamba-expand 2 \ - --mamba-headdim 64 \ - --tokenizer-model ${TOKENIZER_PATH} \ - --hybrid-attention-ratio 0.26 \ - --hybrid-mlp-ratio 0.5 \ - --hybrid-override-pattern $LAYER_PATTEN \ - --untie-embeddings-and-output-weights \ - --overlap-param-gather \ - --overlap-grad-reduce \ - --norm-epsilon 1e-6 \ -" - -MOE_ARGS=" - --num-experts 128 \ - --moe-router-topk 8 \ - --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \ - --moe-intermediate-size 768 \ - --moe-grouped-gemm \ - --use-fused-moe-token-permute-and-unpermute \ - --moe-permutation-async-comm \ - --moe-token-dispatcher-type alltoall \ - --moe-aux-loss-coeff 0.001 \ -" - -OPTIMIZE_ARGS=" - --use-flash-attn \ - --use-fused-rotary-pos-emb \ - --sequence-parallel \ - --use-rotary-position-embeddings \ - --use-fused-swiglu \ - --use-fused-rmsnorm \ - --no-masked-softmax-fusion \ - --use-distributed-optimizer - --gemm-gradient-accumulation-fusion \ - --recompute-method uniform \ - --recompute-granularity full \ - --recompute-num-layers 1 \ -" - -TRAIN_ARGS=" - --micro-batch-size ${MBS} \ - --global-batch-size ${GBS} \ - --lr 1.25e-6 \ - --lr-decay-style cosine \ - --min-lr 1.25e-7 \ - --weight-decay 1e-1 \ - --lr-warmup-fraction 0.01 \ - --attention-dropout 0.0 \ - --init-method-std 0.01 \ - --hidden-dropout 0.0 \ - --clip-grad 1.0 \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --initial-loss-scale 4096 \ - --seed 42 \ - --bf16 \ - --train-iters ${TRAIN_ITERS} \ - --seq-length ${SEQ_LENGTH} \ - --no-shared-storage -" - -MODEL_PARALLEL_ARGS=" - --tensor-model-parallel-size ${TP} \ - --pipeline-model-parallel-size ${PP} \ - --expert-model-parallel-size ${EP} \ - --context-parallel-size ${CP} \ - --context-parallel-algo ${CP_TYPE} \ -" - -GPT_ARGS=" - --use-mcore-models \ - --spec mindspeed_llm.tasks.models.spec.qwen3_mamba_spec layer_spec \ - --kv-channels 128 \ - --qk-layernorm \ - --norm-topk-prob \ - --tokenizer-name-or-path ${TOKENIZER_PATH} \ - --max-position-embeddings ${SEQ_LENGTH} \ - --num-layers 48 \ - --hidden-size 2048 \ - --ffn-hidden-size 6144 \ - --num-attention-heads 32 \ - --tokenizer-type PretrainedFromHF \ - --make-vocab-size-divisible-by 1 \ - --padded-vocab-size 151936 \ - --rotary-base 1000000 \ - --untie-embeddings-and-output-weights \ - --disable-bias-linear \ - --position-embedding-type rope \ - --normalization RMSNorm \ - --swiglu \ - --attention-softmax-in-fp32 \ - --no-gradient-accumulation-fusion \ - --group-query-attention \ - --num-query-groups 4 -" - -DATA_ARGS=" - --data-path $DATA_PATH \ - --split 100,0,0 -" - -OUTPUT_ARGS=" - --log-interval 1 \ - --save-interval ${TRAIN_ITERS} \ - --eval-interval ${TRAIN_ITERS} \ - --eval-iters 0 \ - --no-load-optim \ - --no-load-rng -" - -torchrun $DISTRIBUTED_ARGS pretrain_mamba.py \ - $GPT_ARGS \ - $DATA_ARGS \ - $MOE_ARGS \ - $OUTPUT_ARGS \ - $OPTIMIZE_ARGS \ - $TRAIN_ARGS \ - $MODEL_PARALLEL_ARGS \ - $MAMBA_ARGS \ - --distributed-backend nccl \ - --save ${CKPT_SAVE_DIR} \ - | tee logs/train_mcore_qwen3_30b_a3b.log \ No newline at end of file +#!/bin/bash + +export HCCL_CONNECT_TIMEOUT=1800 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True +export NPU_ASD_ENABLE=0 + +NPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +# please fill these path configurations +CKPT_SAVE_DIR="your model save ckpt path" +DATA_PATH="your data path" +TOKENIZER_PATH="your tokenizer path" 
+CKPT_LOAD_DIR="your model ckpt path" + +DATA_PATH="/home/ascend-vllm/dataset/alpaca/Qwen3-30B-A3B-convert-pretrain_text_document" +TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" +CKPT_SAVE_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-mamba" + +TP=1 +PP=1 +EP=2 +CP=1 + +MBS=1 +GBS=256 +SEQ_LENGTH=1024 +TRAIN_ITERS=2000 +CP_TYPE='ulysses_cp_algo' +ROUTER_BALANCING_TYPE='aux_loss' + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +NUM_LAYERS=96 +LAYER_PATTEN="*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" +MAMBA_ARGS=" + --reuse-fp32-param \ + --no-shared-storage \ + --use-distributed-optimizer \ + --use-flash-attn \ + --use-mcore-models \ + --num-layers ${NUM_LAYERS} \ + --mamba-ngroups 8 \ + --mamba-chunk-size 128 \ + --mamba-d-state 128 \ + --mamba-d-conv 4 \ + --mamba-expand 2 \ + --mamba-headdim 64 \ + --tokenizer-model ${TOKENIZER_PATH} \ + --hybrid-attention-ratio 0.26 \ + --hybrid-mlp-ratio 0.5 \ + --hybrid-override-pattern $LAYER_PATTEN \ + --untie-embeddings-and-output-weights \ + --overlap-param-gather \ + --overlap-grad-reduce \ + --norm-epsilon 1e-6 \ +" + +MOE_ARGS=" + --num-experts 128 \ + --moe-router-topk 8 \ + --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \ + --moe-intermediate-size 768 \ + --moe-grouped-gemm \ + --use-fused-moe-token-permute-and-unpermute \ + --moe-permutation-async-comm \ + --moe-token-dispatcher-type alltoall \ + --moe-aux-loss-coeff 0.001 \ +" + +OPTIMIZE_ARGS=" + --use-flash-attn \ + --use-fused-rotary-pos-emb \ + --sequence-parallel \ + --use-rotary-position-embeddings \ + --use-fused-swiglu \ + --use-fused-rmsnorm \ + --no-masked-softmax-fusion \ + --use-distributed-optimizer + --gemm-gradient-accumulation-fusion \ + --recompute-method uniform \ + --recompute-granularity full \ + --recompute-num-layers 1 \ +" + +TRAIN_ARGS=" + --micro-batch-size ${MBS} \ + --global-batch-size ${GBS} \ + --lr 1.25e-6 \ + --lr-decay-style cosine \ + --min-lr 1.25e-7 \ + --weight-decay 1e-1 \ + --lr-warmup-fraction 0.01 \ + --attention-dropout 0.0 \ + --init-method-std 0.01 \ + --hidden-dropout 0.0 \ + --clip-grad 1.0 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --initial-loss-scale 4096 \ + --seed 42 \ + --bf16 \ + --train-iters ${TRAIN_ITERS} \ + --seq-length ${SEQ_LENGTH} \ + --no-shared-storage +" + +MODEL_PARALLEL_ARGS=" + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --expert-model-parallel-size ${EP} \ + --context-parallel-size ${CP} \ + --context-parallel-algo ${CP_TYPE} \ +" + +GPT_ARGS=" + --use-mcore-models \ + --spec mindspeed_llm.tasks.models.spec.qwen3_mamba_spec layer_spec \ + --kv-channels 128 \ + --qk-layernorm \ + --norm-topk-prob \ + --tokenizer-name-or-path ${TOKENIZER_PATH} \ + --max-position-embeddings ${SEQ_LENGTH} \ + --num-layers 48 \ + --hidden-size 2048 \ + --ffn-hidden-size 6144 \ + --num-attention-heads 32 \ + --tokenizer-type PretrainedFromHF \ + --make-vocab-size-divisible-by 1 \ + --padded-vocab-size 151936 \ + --rotary-base 1000000 \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --position-embedding-type rope \ + --normalization RMSNorm \ + --swiglu \ + --attention-softmax-in-fp32 \ + --no-gradient-accumulation-fusion \ + --group-query-attention \ + --num-query-groups 4 +" + +DATA_ARGS=" + --data-path $DATA_PATH \ + --split 100,0,0 +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 
${TRAIN_ITERS} \ + --eval-interval ${TRAIN_ITERS} \ + --eval-iters 0 \ + --no-load-optim \ + --no-load-rng +" + +torchrun $DISTRIBUTED_ARGS pretrain_mamba.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $MOE_ARGS \ + $OUTPUT_ARGS \ + $OPTIMIZE_ARGS \ + $TRAIN_ARGS \ + $MODEL_PARALLEL_ARGS \ + $MAMBA_ARGS \ + --distributed-backend nccl \ + --save ${CKPT_SAVE_DIR} \ + | tee logs/train_mcore_qwen3_30b_a3b.log diff --git a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py index 249231308..577b79873 100644 --- a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py +++ b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py @@ -1,105 +1,105 @@ -from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear -from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules -from megatron.core.transformer.dot_product_attention import DotProductAttention -from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.identity_op import IdentityOp -from megatron.core.transformer.spec_utils import ModuleSpec -from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec -from megatron.training import get_args - -from megatron.core.ssm.mamba_block import MambaStack, MambaStackSubmodules -from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules -from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules - -from megatron.core.transformer import ModuleSpec, TransformerLayer, TransformerLayerSubmodules -from mindspeed_llm.core.transformer.custom_layers.transformer_engine import PTNorm - -args = get_args() -num_experts, moe_grouped_gemm, qk_layernorm = args.num_experts, args.moe_grouped_gemm, args.qk_layernorm - -# # Transformer Layer Spec for Gemma using post_mlp_layernorm and post_mlp_layernorm. 
-# layer_spec = ModuleSpec( -# module=TransformerLayer, -# submodules=TransformerLayerSubmodules( -# input_layernorm=PTNorm, -# self_attention=ModuleSpec( -# module=SelfAttention, -# params={"attn_mask_type": AttnMaskType.causal}, -# submodules=SelfAttentionSubmodules( -# linear_qkv=ColumnParallelLinear, -# core_attention=DotProductAttention, -# linear_proj=RowParallelLinear, -# q_layernorm=PTNorm if qk_layernorm else IdentityOp, -# k_layernorm=PTNorm if qk_layernorm else IdentityOp, -# ), -# ), -# self_attn_bda=get_bias_dropout_add, -# pre_mlp_layernorm=PTNorm, -# mlp=_get_mlp_module_spec( -# use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm -# ), -# mlp_bda=get_bias_dropout_add, -# sharded_state_dict_keys_map={ -# 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', -# 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', -# }, -# ), -# ) - -layer_spec = ModuleSpec( - module=MambaStack, - submodules=MambaStackSubmodules( - mamba_layer=ModuleSpec( - module=MambaLayer, - submodules=MambaLayerSubmodules( - norm=PTNorm, - mixer=ModuleSpec( - module=MambaMixer, - submodules=MambaMixerSubmodules( - in_proj=ColumnParallelLinear, - out_proj=RowParallelLinear, - ), - ), - mamba_bda=get_bias_dropout_add, - ), - ), - attention_layer=ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - input_layernorm=PTNorm, - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=ColumnParallelLinear, - core_attention=DotProductAttention, - linear_proj=RowParallelLinear, - ), - ), - self_attn_bda=get_bias_dropout_add, - ), - ), - mlp_layer=ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - pre_mlp_layernorm=PTNorm, - # mlp=ModuleSpec( - # module=MLP, - # submodules=MLPSubmodules( - # linear_fc1=ColumnParallelLinear, - # linear_fc2=RowParallelLinear, - # ), - # ), - mlp=_get_mlp_module_spec( - use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm - ), - mlp_bda=get_bias_dropout_add, - sharded_state_dict_keys_map={ - 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', - 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', - }, - ), - ), - ), +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec +from megatron.training import get_args + +from megatron.core.ssm.mamba_block import MambaStack, MambaStackSubmodules +from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules +from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules + +from megatron.core.transformer import ModuleSpec, TransformerLayer, TransformerLayerSubmodules +from mindspeed_llm.core.transformer.custom_layers.transformer_engine import PTNorm + +args = get_args() +num_experts, moe_grouped_gemm, qk_layernorm = args.num_experts, args.moe_grouped_gemm, args.qk_layernorm + +# # Transformer Layer Spec for Gemma using post_mlp_layernorm and post_mlp_layernorm. 
+# layer_spec = ModuleSpec( +# module=TransformerLayer, +# submodules=TransformerLayerSubmodules( +# input_layernorm=PTNorm, +# self_attention=ModuleSpec( +# module=SelfAttention, +# params={"attn_mask_type": AttnMaskType.causal}, +# submodules=SelfAttentionSubmodules( +# linear_qkv=ColumnParallelLinear, +# core_attention=DotProductAttention, +# linear_proj=RowParallelLinear, +# q_layernorm=PTNorm if qk_layernorm else IdentityOp, +# k_layernorm=PTNorm if qk_layernorm else IdentityOp, +# ), +# ), +# self_attn_bda=get_bias_dropout_add, +# pre_mlp_layernorm=PTNorm, +# mlp=_get_mlp_module_spec( +# use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm +# ), +# mlp_bda=get_bias_dropout_add, +# sharded_state_dict_keys_map={ +# 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', +# 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', +# }, +# ), +# ) + +layer_spec = ModuleSpec( + module=MambaStack, + submodules=MambaStackSubmodules( + mamba_layer=ModuleSpec( + module=MambaLayer, + submodules=MambaLayerSubmodules( + norm=PTNorm, + mixer=ModuleSpec( + module=MambaMixer, + submodules=MambaMixerSubmodules( + in_proj=ColumnParallelLinear, + out_proj=RowParallelLinear, + ), + ), + mamba_bda=get_bias_dropout_add, + ), + ), + attention_layer=ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=PTNorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + ), + ), + self_attn_bda=get_bias_dropout_add, + ), + ), + mlp_layer=ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + pre_mlp_layernorm=PTNorm, + # mlp=ModuleSpec( + # module=MLP, + # submodules=MLPSubmodules( + # linear_fc1=ColumnParallelLinear, + # linear_fc2=RowParallelLinear, + # ), + # ), + mlp=_get_mlp_module_spec( + use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm + ), + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), + ), + ), ) \ No newline at end of file -- Gitee From e25466431332a6b628400dfa3ae69c674aaf5fdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Mon, 21 Jul 2025 10:48:12 +0800 Subject: [PATCH 04/16] update mamba_spec with qk_layernorm --- mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py index 577b79873..0c463a42b 100644 --- a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py +++ b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py @@ -75,6 +75,8 @@ layer_spec = ModuleSpec( linear_qkv=ColumnParallelLinear, core_attention=DotProductAttention, linear_proj=RowParallelLinear, + q_layernorm=PTNorm if qk_layernorm else IdentityOp, + k_layernorm=PTNorm if qk_layernorm else IdentityOp, ), ), self_attn_bda=get_bias_dropout_add, -- Gitee From 40ae8dd78815ed5442270e0060d2d16014c6d1b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Mon, 21 Jul 2025 18:08:59 +0800 Subject: [PATCH 05/16] add mamba convert hf2mg --- configs/checkpoint/model_cfg.json | 24 ++++- mindspeed_llm/tasks/checkpoint/models.py | 121 +++++++++++++++++++++-- 2 files changed, 138 insertions(+), 
7 deletions(-) diff --git a/configs/checkpoint/model_cfg.json b/configs/checkpoint/model_cfg.json index 458d639c0..f901e51a3 100644 --- a/configs/checkpoint/model_cfg.json +++ b/configs/checkpoint/model_cfg.json @@ -125,7 +125,29 @@ "layers_mlp_experts_up_proj": "model.layers[layer_idx].mlp.experts[expert_idx].up_proj", "layers_mlp_experts_linear_fc2": "model.layers[layer_idx].mlp.experts[expert_idx].down_proj", "final_layernorm": "model.norm", - "output_layer": "lm_head" + "output_layer": "lm_head", + + "mamba_mixer_A_log": "model.layers[layer_idx].mamba.A_log", + "mamba_mixer_D": "model.layers[layer_idx].mamba.D", + "mamba_mixer_conv1d": "model.layers[layer_idx].mamba.conv1d", + "mamba_mixer_dt_bias": "model.layers[layer_idx].mamba.dt_bias", + "mamba_mixer_in_proj": "model.layers[layer_idx].mamba.in_proj", + "mamba_mixer_norm": "model.layers[layer_idx].mamba.norm", + "mamba_mixer_out_proj": "model.layers[layer_idx].mamba.out_proj" + } + }, + "qwen3-moe-mamba": { + "__base__": "qwen3-moe", + "config_set_value":{}, + "config_hf_key_mapping":{}, + "model_hf_key_mapping":{ + "mamba_mixer_A_log": "model.layers[layer_idx].mamba.A_log", + "mamba_mixer_D": "model.layers[layer_idx].mamba.D", + "mamba_mixer_conv1d": "model.layers[layer_idx].mamba.conv1d", + "mamba_mixer_dt_bias": "model.layers[layer_idx].mamba.dt_bias", + "mamba_mixer_in_proj": "model.layers[layer_idx].mamba.in_proj", + "mamba_mixer_norm": "model.layers[layer_idx].mamba.norm", + "mamba_mixer_out_proj": "model.layers[layer_idx].mamba.out_proj" } }, "qwen3": { diff --git a/mindspeed_llm/tasks/checkpoint/models.py b/mindspeed_llm/tasks/checkpoint/models.py index ead74608a..b42eded98 100644 --- a/mindspeed_llm/tasks/checkpoint/models.py +++ b/mindspeed_llm/tasks/checkpoint/models.py @@ -23,6 +23,8 @@ from megatron.core import tensor_parallel from mindspeed_llm.training.utils import parse_args from mindspeed_llm.training import model_provider_func_wrapper from mindspeed_llm.training.checkpointing import load_checkpoint_wrapper +from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols as LayerSymbols + logger.basicConfig(format="") logger.getLogger().setLevel(logger.INFO) @@ -191,7 +193,7 @@ class ModelBase(abc.ABC): logger.info(f"Source output layer weight size: {output_layer_weight.size()} " f"Target output layer weight size: {self.get_output_layer_weight().size()}") output_layer_weight = output_layer_weight[:self.get_output_layer_weight().size(0), :] - self.set_output_layer_weight(data=output_layer_weight) + self.set_output_layer_weight(data=output_layer_weight) # true if self.has_final_layernorm_bias(): final_layernorm_bias = src_model.get_final_layernorm_bias() self.set_final_layernorm_bias(data=final_layernorm_bias) @@ -214,7 +216,9 @@ class ModelBase(abc.ABC): """ For source layer index == destination layer index. 
""" - self.set_layer_state_base(src_model, layer_idx, layer_idx) + # self.set_layer_state_base(src_model, layer_idx, layer_idx) + self.set_qwen_mamba_layer_state_base(src_model, layer_idx, layer_idx) + @staticmethod def is_noop_layer(src_layer_idx): @@ -227,17 +231,29 @@ class ModelBase(abc.ABC): kwargs = {'src_layer_idx': src_layer_idx, "dst_layer_idx": dst_layer_idx} self.set_attn_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) self.set_mlp_state(src_model, **kwargs) + + if self.args_cmd.model_type_hf == "qwen3-moe-mamba": + layer_type = self.args.layer_pattern[self.args.num_layers] + if layer_type == LayerSymbols.MAMBA: + pass + elif layer_type == LayerSymbols.ATTENTION: + # remove + self.set_attn_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) + elif layer_type == LayerSymbols.MLP: + # add pre_mlp_layernorm + self.set_mlp_state(src_model, **kwargs) + if self.args_cmd.save_lora_to_hf: return input_layernorm_weight = src_model.get_layers_input_layernorm_weight(layer_idx=src_layer_idx) self.set_layers_input_layernorm_weight(layer_idx=dst_layer_idx, data=input_layernorm_weight) - if self.args.post_norm: + if self.args.post_norm: # false post_attn_layernorm_weight = src_model.get_layers_self_attention_post_attention_layernorm_weight( layer_idx=src_layer_idx) self.set_layers_self_attention_post_attention_layernorm_weight(layer_idx=dst_layer_idx, data=post_attn_layernorm_weight) - else: + else: # true pre_mlp_layernorm_weight = src_model.get_layers_self_attention_pre_mlp_layernorm_weight( layer_idx=src_layer_idx) self.set_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=dst_layer_idx, @@ -251,6 +267,91 @@ class ModelBase(abc.ABC): pre_mlp_layernorm_bias = src_model.get_layers_self_attention_pre_mlp_layernorm_bias(layer_idx=src_layer_idx) self.set_layers_self_attention_pre_mlp_layernorm_bias(layer_idx=dst_layer_idx, data=pre_mlp_layernorm_bias) + def set_qwen_mamba_layer_state_base(self, src_model, src_layer_idx, dst_layer_idx): + # assert self.args_cmd.model_type_hf == "qwen3-moe-mamba" + self.args.hybrid_override_pattern = "*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" # TODO + dst_layer_idx = src_layer_idx * 2 + layer_type = self.args.hybrid_override_pattern[dst_layer_idx] + + if layer_type == LayerSymbols.MAMBA: + self.set_mamba_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) + + elif layer_type == LayerSymbols.ATTENTION: + # inpyt_layernorm + input_layernorm_weight = src_model.get_layers_input_layernorm_weight(layer_idx=src_layer_idx) + self.set_layers_input_layernorm_weight(layer_idx=src_layer_idx*2, data=input_layernorm_weight) + + # self_attention + self.set_attn_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) + + dst_layer_idx = dst_layer_idx + 1 + assert self.args.hybrid_override_pattern[dst_layer_idx] == LayerSymbols.MLP + if self.args.hybrid_override_pattern[dst_layer_idx] == LayerSymbols.MLP: + # pre_mlp_layernorm / post_attention_layernorm + pre_mlp_layernorm_weight = src_model.get_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=src_layer_idx) + self.set_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=dst_layer_idx, + data=pre_mlp_layernorm_weight) + # mlp + kwargs = {'src_layer_idx': src_layer_idx, "dst_layer_idx": dst_layer_idx} + self.set_mlp_state(src_model, **kwargs) + + # TODO + def set_mamba_state(self, src_layer_idx, dst_layer_idx, src_model): + + # 
mg + + # mixer + # in_proj + # conv1d + # norm + # out_proj + # norm + + + # hf (model.layers.1.input_layernorm.weight) + # + + # mamba_in_proj.weight + + # mamba.A_log + # mamba.D + + # mamba.conv1d.weight + # mamba.conv1d.bias + # mamba.dt_bias + + # mamba.norm.weight + # mamba.out_proj.weight + + + # model.layers.1.mamba.in_proj.weight + in_proj_weight = src_model.get_mamba_mixer_in_proj_weight(layer_idx=src_layer_idx) + # self.set_mamba_mixer_in_proj_weight(layer_idx=dst_layer_idx, data=in_proj_weight) # TODO [4384, 2048] (mg) [9248, 2048] (hf) + + # A_log, D, dt_bias (without weight, just nn.parameter) + A_log = src_model.get_mamba_mixer_A_log(layer_idx=src_layer_idx) + D = src_model.get_mamba_mixer_D(layer_idx=src_layer_idx) + dt_bias = src_model.get_mamba_mixer_dt_bias(layer_idx=src_layer_idx) + + # conv1d + conv1d_weight = src_model.get_mamba_mixer_conv1d_weight(layer_idx=src_layer_idx) + conv1d_bias = src_model.get_mamba_mixer_conv1d_bias(layer_idx=src_layer_idx) + self.set_mamba_mixer_conv1d_weight(layer_idx=dst_layer_idx, data=conv1d_weight) + self.set_mamba_mixer_conv1d_bias(layer_idx=dst_layer_idx, data=conv1d_bias) + + # model.layers.1.mamba.norm.weight + mixer_norm = src_model.get_mamba_mixer_norm_weight(layer_idx=src_layer_idx) + self.set_mamba_mixer_norm_weight(layer_idx=dst_layer_idx, data=mixer_norm) + + # model.layers.1.mamba.out_proj.weight + out_proj_weight = src_model.get_mamba_mixer_out_proj_weight(layer_idx=src_layer_idx) + self.set_mamba_mixer_out_proj_weight(layer_idx=dst_layer_idx, data=out_proj_weight) + + # mamba norm (not found in hf) # TODO + + pass + + def set_attn_state(self, src_layer_idx, dst_layer_idx, src_model): """Set self-attention params.""" if self.args.save_lora_to_hf: @@ -270,7 +371,7 @@ class ModelBase(abc.ABC): self.set_layers_self_attention_q_layernorm_weight(layer_idx=dst_layer_idx, data=q_layernorm) else: q_layernorm = src_model.get_layers_self_attention_q_layernorm_weight(layer_idx=src_layer_idx) - self.set_layers_self_attention_q_layernorm_weight(layer_idx=dst_layer_idx, data=q_layernorm) + self.set_layers_self_attention_q_layernorm_weight(layer_idx=dst_layer_idx, data=q_layernorm) # error k_layernorm = src_model.get_layers_self_attention_k_layernorm_weight(layer_idx=src_layer_idx) self.set_layers_self_attention_k_layernorm_weight(layer_idx=dst_layer_idx, data=k_layernorm) @@ -1401,7 +1502,8 @@ class MegatronMCoreModel(MegatronModel): "layers_mlp_linear_fc1": module_layer + "mlp.linear_fc1", "layers_mlp_linear_fc2": module_layer + "mlp.linear_fc2", "layers_self_attention_post_mlp_layernorm": module_layer + "post_mlp_layernorm", - "final_layernorm": "decoder.final_layernorm", + # "final_layernorm": "decoder.final_layernorm", + "final_layernorm": "decoder.final_norm", "output_layer": "output_layer", "rm_head": "rm_head" } @@ -1418,6 +1520,13 @@ class MegatronMCoreModel(MegatronModel): self.module_mapping["layers_self_attention_linear_qb"] = module_layer + "self_attention.linear_qb" self.module_mapping["layers_self_attention_linear_kvb"] = module_layer + "self_attention.linear_kvb" + # Mamba + self.module_mapping["mamba_mixer_A_log"] = module_layer + "A_log" + self.module_mapping["mamba_mixer_in_proj"] = module_layer + "mixer.in_proj" + self.module_mapping["mamba_mixer_out_proj"] = module_layer + "mixer.out_proj" + self.module_mapping["mamba_mixer_norm"] = module_layer + "mixer.norm" + + # shared experts self.module_mapping[ "layers_mlp_shared_experts_linear_fc1"] = module_layer + "mlp.shared_experts.linear_fc1" -- Gitee From 
89b185f511cdf2446fd0a0af00b6c33bcb810c9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Wed, 23 Jul 2025 17:38:59 +0800 Subject: [PATCH 06/16] Revert "add mamba convert hf2mg" This reverts commit 6b7e1e435ec72e2d488c45e78277311eb1359e82. --- configs/checkpoint/model_cfg.json | 24 +---- mindspeed_llm/tasks/checkpoint/models.py | 121 ++--------------------- 2 files changed, 7 insertions(+), 138 deletions(-) diff --git a/configs/checkpoint/model_cfg.json b/configs/checkpoint/model_cfg.json index f901e51a3..458d639c0 100644 --- a/configs/checkpoint/model_cfg.json +++ b/configs/checkpoint/model_cfg.json @@ -125,29 +125,7 @@ "layers_mlp_experts_up_proj": "model.layers[layer_idx].mlp.experts[expert_idx].up_proj", "layers_mlp_experts_linear_fc2": "model.layers[layer_idx].mlp.experts[expert_idx].down_proj", "final_layernorm": "model.norm", - "output_layer": "lm_head", - - "mamba_mixer_A_log": "model.layers[layer_idx].mamba.A_log", - "mamba_mixer_D": "model.layers[layer_idx].mamba.D", - "mamba_mixer_conv1d": "model.layers[layer_idx].mamba.conv1d", - "mamba_mixer_dt_bias": "model.layers[layer_idx].mamba.dt_bias", - "mamba_mixer_in_proj": "model.layers[layer_idx].mamba.in_proj", - "mamba_mixer_norm": "model.layers[layer_idx].mamba.norm", - "mamba_mixer_out_proj": "model.layers[layer_idx].mamba.out_proj" - } - }, - "qwen3-moe-mamba": { - "__base__": "qwen3-moe", - "config_set_value":{}, - "config_hf_key_mapping":{}, - "model_hf_key_mapping":{ - "mamba_mixer_A_log": "model.layers[layer_idx].mamba.A_log", - "mamba_mixer_D": "model.layers[layer_idx].mamba.D", - "mamba_mixer_conv1d": "model.layers[layer_idx].mamba.conv1d", - "mamba_mixer_dt_bias": "model.layers[layer_idx].mamba.dt_bias", - "mamba_mixer_in_proj": "model.layers[layer_idx].mamba.in_proj", - "mamba_mixer_norm": "model.layers[layer_idx].mamba.norm", - "mamba_mixer_out_proj": "model.layers[layer_idx].mamba.out_proj" + "output_layer": "lm_head" } }, "qwen3": { diff --git a/mindspeed_llm/tasks/checkpoint/models.py b/mindspeed_llm/tasks/checkpoint/models.py index b42eded98..ead74608a 100644 --- a/mindspeed_llm/tasks/checkpoint/models.py +++ b/mindspeed_llm/tasks/checkpoint/models.py @@ -23,8 +23,6 @@ from megatron.core import tensor_parallel from mindspeed_llm.training.utils import parse_args from mindspeed_llm.training import model_provider_func_wrapper from mindspeed_llm.training.checkpointing import load_checkpoint_wrapper -from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols as LayerSymbols - logger.basicConfig(format="") logger.getLogger().setLevel(logger.INFO) @@ -193,7 +191,7 @@ class ModelBase(abc.ABC): logger.info(f"Source output layer weight size: {output_layer_weight.size()} " f"Target output layer weight size: {self.get_output_layer_weight().size()}") output_layer_weight = output_layer_weight[:self.get_output_layer_weight().size(0), :] - self.set_output_layer_weight(data=output_layer_weight) # true + self.set_output_layer_weight(data=output_layer_weight) if self.has_final_layernorm_bias(): final_layernorm_bias = src_model.get_final_layernorm_bias() self.set_final_layernorm_bias(data=final_layernorm_bias) @@ -216,9 +214,7 @@ class ModelBase(abc.ABC): """ For source layer index == destination layer index. 
""" - # self.set_layer_state_base(src_model, layer_idx, layer_idx) - self.set_qwen_mamba_layer_state_base(src_model, layer_idx, layer_idx) - + self.set_layer_state_base(src_model, layer_idx, layer_idx) @staticmethod def is_noop_layer(src_layer_idx): @@ -231,29 +227,17 @@ class ModelBase(abc.ABC): kwargs = {'src_layer_idx': src_layer_idx, "dst_layer_idx": dst_layer_idx} self.set_attn_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) self.set_mlp_state(src_model, **kwargs) - - if self.args_cmd.model_type_hf == "qwen3-moe-mamba": - layer_type = self.args.layer_pattern[self.args.num_layers] - if layer_type == LayerSymbols.MAMBA: - pass - elif layer_type == LayerSymbols.ATTENTION: - # remove - self.set_attn_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) - elif layer_type == LayerSymbols.MLP: - # add pre_mlp_layernorm - self.set_mlp_state(src_model, **kwargs) - if self.args_cmd.save_lora_to_hf: return input_layernorm_weight = src_model.get_layers_input_layernorm_weight(layer_idx=src_layer_idx) self.set_layers_input_layernorm_weight(layer_idx=dst_layer_idx, data=input_layernorm_weight) - if self.args.post_norm: # false + if self.args.post_norm: post_attn_layernorm_weight = src_model.get_layers_self_attention_post_attention_layernorm_weight( layer_idx=src_layer_idx) self.set_layers_self_attention_post_attention_layernorm_weight(layer_idx=dst_layer_idx, data=post_attn_layernorm_weight) - else: # true + else: pre_mlp_layernorm_weight = src_model.get_layers_self_attention_pre_mlp_layernorm_weight( layer_idx=src_layer_idx) self.set_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=dst_layer_idx, @@ -267,91 +251,6 @@ class ModelBase(abc.ABC): pre_mlp_layernorm_bias = src_model.get_layers_self_attention_pre_mlp_layernorm_bias(layer_idx=src_layer_idx) self.set_layers_self_attention_pre_mlp_layernorm_bias(layer_idx=dst_layer_idx, data=pre_mlp_layernorm_bias) - def set_qwen_mamba_layer_state_base(self, src_model, src_layer_idx, dst_layer_idx): - # assert self.args_cmd.model_type_hf == "qwen3-moe-mamba" - self.args.hybrid_override_pattern = "*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" # TODO - dst_layer_idx = src_layer_idx * 2 - layer_type = self.args.hybrid_override_pattern[dst_layer_idx] - - if layer_type == LayerSymbols.MAMBA: - self.set_mamba_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) - - elif layer_type == LayerSymbols.ATTENTION: - # inpyt_layernorm - input_layernorm_weight = src_model.get_layers_input_layernorm_weight(layer_idx=src_layer_idx) - self.set_layers_input_layernorm_weight(layer_idx=src_layer_idx*2, data=input_layernorm_weight) - - # self_attention - self.set_attn_state(src_layer_idx=src_layer_idx, dst_layer_idx=dst_layer_idx, src_model=src_model) - - dst_layer_idx = dst_layer_idx + 1 - assert self.args.hybrid_override_pattern[dst_layer_idx] == LayerSymbols.MLP - if self.args.hybrid_override_pattern[dst_layer_idx] == LayerSymbols.MLP: - # pre_mlp_layernorm / post_attention_layernorm - pre_mlp_layernorm_weight = src_model.get_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=src_layer_idx) - self.set_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=dst_layer_idx, - data=pre_mlp_layernorm_weight) - # mlp - kwargs = {'src_layer_idx': src_layer_idx, "dst_layer_idx": dst_layer_idx} - self.set_mlp_state(src_model, **kwargs) - - # TODO - def set_mamba_state(self, src_layer_idx, dst_layer_idx, src_model): - - # 
mg - - # mixer - # in_proj - # conv1d - # norm - # out_proj - # norm - - - # hf (model.layers.1.input_layernorm.weight) - # - - # mamba_in_proj.weight - - # mamba.A_log - # mamba.D - - # mamba.conv1d.weight - # mamba.conv1d.bias - # mamba.dt_bias - - # mamba.norm.weight - # mamba.out_proj.weight - - - # model.layers.1.mamba.in_proj.weight - in_proj_weight = src_model.get_mamba_mixer_in_proj_weight(layer_idx=src_layer_idx) - # self.set_mamba_mixer_in_proj_weight(layer_idx=dst_layer_idx, data=in_proj_weight) # TODO [4384, 2048] (mg) [9248, 2048] (hf) - - # A_log, D, dt_bias (without weight, just nn.parameter) - A_log = src_model.get_mamba_mixer_A_log(layer_idx=src_layer_idx) - D = src_model.get_mamba_mixer_D(layer_idx=src_layer_idx) - dt_bias = src_model.get_mamba_mixer_dt_bias(layer_idx=src_layer_idx) - - # conv1d - conv1d_weight = src_model.get_mamba_mixer_conv1d_weight(layer_idx=src_layer_idx) - conv1d_bias = src_model.get_mamba_mixer_conv1d_bias(layer_idx=src_layer_idx) - self.set_mamba_mixer_conv1d_weight(layer_idx=dst_layer_idx, data=conv1d_weight) - self.set_mamba_mixer_conv1d_bias(layer_idx=dst_layer_idx, data=conv1d_bias) - - # model.layers.1.mamba.norm.weight - mixer_norm = src_model.get_mamba_mixer_norm_weight(layer_idx=src_layer_idx) - self.set_mamba_mixer_norm_weight(layer_idx=dst_layer_idx, data=mixer_norm) - - # model.layers.1.mamba.out_proj.weight - out_proj_weight = src_model.get_mamba_mixer_out_proj_weight(layer_idx=src_layer_idx) - self.set_mamba_mixer_out_proj_weight(layer_idx=dst_layer_idx, data=out_proj_weight) - - # mamba norm (not found in hf) # TODO - - pass - - def set_attn_state(self, src_layer_idx, dst_layer_idx, src_model): """Set self-attention params.""" if self.args.save_lora_to_hf: @@ -371,7 +270,7 @@ class ModelBase(abc.ABC): self.set_layers_self_attention_q_layernorm_weight(layer_idx=dst_layer_idx, data=q_layernorm) else: q_layernorm = src_model.get_layers_self_attention_q_layernorm_weight(layer_idx=src_layer_idx) - self.set_layers_self_attention_q_layernorm_weight(layer_idx=dst_layer_idx, data=q_layernorm) # error + self.set_layers_self_attention_q_layernorm_weight(layer_idx=dst_layer_idx, data=q_layernorm) k_layernorm = src_model.get_layers_self_attention_k_layernorm_weight(layer_idx=src_layer_idx) self.set_layers_self_attention_k_layernorm_weight(layer_idx=dst_layer_idx, data=k_layernorm) @@ -1502,8 +1401,7 @@ class MegatronMCoreModel(MegatronModel): "layers_mlp_linear_fc1": module_layer + "mlp.linear_fc1", "layers_mlp_linear_fc2": module_layer + "mlp.linear_fc2", "layers_self_attention_post_mlp_layernorm": module_layer + "post_mlp_layernorm", - # "final_layernorm": "decoder.final_layernorm", - "final_layernorm": "decoder.final_norm", + "final_layernorm": "decoder.final_layernorm", "output_layer": "output_layer", "rm_head": "rm_head" } @@ -1520,13 +1418,6 @@ class MegatronMCoreModel(MegatronModel): self.module_mapping["layers_self_attention_linear_qb"] = module_layer + "self_attention.linear_qb" self.module_mapping["layers_self_attention_linear_kvb"] = module_layer + "self_attention.linear_kvb" - # Mamba - self.module_mapping["mamba_mixer_A_log"] = module_layer + "A_log" - self.module_mapping["mamba_mixer_in_proj"] = module_layer + "mixer.in_proj" - self.module_mapping["mamba_mixer_out_proj"] = module_layer + "mixer.out_proj" - self.module_mapping["mamba_mixer_norm"] = module_layer + "mixer.norm" - - # shared experts self.module_mapping[ "layers_mlp_shared_experts_linear_fc1"] = module_layer + "mlp.shared_experts.linear_fc1" -- Gitee From 
b7d0fcfea6fd423ace95a87f1c69549eb14dbeab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Wed, 23 Jul 2025 17:44:44 +0800 Subject: [PATCH 07/16] support Qwen3-30B-A3B-Mamba training --- .../pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh | 13 +++-- mindspeed_llm/core/ssm/mamba_block.py | 5 ++ .../features_manager/models/mamba.py | 2 +- .../tasks/models/spec/qwen3_mamba_spec.py | 56 ++++++++++++++++++- 4 files changed, 68 insertions(+), 8 deletions(-) diff --git a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh index 66b12c0e5..5f59b529b 100644 --- a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh +++ b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh @@ -17,13 +17,15 @@ CKPT_SAVE_DIR="your model save ckpt path" DATA_PATH="your data path" TOKENIZER_PATH="your tokenizer path" CKPT_LOAD_DIR="your model ckpt path" - + + DATA_PATH="/home/ascend-vllm/dataset/alpaca/Qwen3-30B-A3B-convert-pretrain_text_document" TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" CKPT_SAVE_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-mamba" - -TP=1 -PP=1 +CKPT_LOAD_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-tp2-pp2-ep2" + +TP=2 +PP=2 EP=2 CP=1 @@ -44,6 +46,8 @@ DISTRIBUTED_ARGS=" NUM_LAYERS=96 LAYER_PATTEN="*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" +NUM_LAYERS=4 +LAYER_PATTEN="*-M-" MAMBA_ARGS=" --reuse-fp32-param \ --no-shared-storage \ @@ -176,4 +180,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_mamba.py \ $MAMBA_ARGS \ --distributed-backend nccl \ --save ${CKPT_SAVE_DIR} \ + --load ${CKPT_LOAD_DIR} | tee logs/train_mcore_qwen3_30b_a3b.log diff --git a/mindspeed_llm/core/ssm/mamba_block.py b/mindspeed_llm/core/ssm/mamba_block.py index 4c4266b51..3430ff83b 100644 --- a/mindspeed_llm/core/ssm/mamba_block.py +++ b/mindspeed_llm/core/ssm/mamba_block.py @@ -79,6 +79,11 @@ def _mamba_block_method_checkpointed_forward_func( inference_params=None, rotary_pos_emb=rotary_pos_emb, ) + # The attention layer (currently a simplified transformer layer) + # outputs a tuple of (hidden_states, context). Context is intended + # for cross-attention, and is not needed in our model. 
+ if isinstance(hidden_states, tuple): + hidden_states = hidden_states[0] return hidden_states return custom_forward diff --git a/mindspeed_llm/features_manager/models/mamba.py b/mindspeed_llm/features_manager/models/mamba.py index ac482e581..d3fed01e8 100644 --- a/mindspeed_llm/features_manager/models/mamba.py +++ b/mindspeed_llm/features_manager/models/mamba.py @@ -15,7 +15,7 @@ class MambaModel(MindSpeedFeature): group.add_argument('--mamba-d-state', type=int, default=128, help='state dim for mamba') group.add_argument('--mamba-d-conv', type=int, default=4, help='conv channel dim for mamba') group.add_argument('--mamba-expand', type=int, default=1, help='expand scale for mamba') - group.add_argument('--mamba-headdim', type=int, default=80, help='head dim for mamba') + group.add_argument('--mamba-headdim', type=int, default=64, help='head dim for mamba') def register_patches(self, patch_manager, args): from mindspeed_llm.core.ssm.mamba_mixer import mamba_mixer_init_wrapper, mamba_mixer_forward, Mamba2RMSNorm diff --git a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py index 0c463a42b..8578d7442 100644 --- a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py +++ b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py @@ -46,14 +46,64 @@ num_experts, moe_grouped_gemm, qk_layernorm = args.num_experts, args.moe_grouped # }, # ), # ) - + +# layer_spec = ModuleSpec( +# module=MambaStack, +# submodules=MambaStackSubmodules( +# mamba_layer=ModuleSpec( +# module=MambaLayer, +# submodules=MambaLayerSubmodules( +# norm=PTNorm, +# mixer=ModuleSpec( +# module=MambaMixer, +# submodules=MambaMixerSubmodules( +# in_proj=ColumnParallelLinear, +# out_proj=RowParallelLinear, +# ), +# ), +# mamba_bda=get_bias_dropout_add, +# ), +# ), +# attention_layer=ModuleSpec( +# module=TransformerLayer, +# submodules=TransformerLayerSubmodules( +# input_layernorm=PTNorm, +# self_attention=ModuleSpec( +# module=SelfAttention, +# params={"attn_mask_type": AttnMaskType.causal}, +# submodules=SelfAttentionSubmodules( +# linear_qkv=ColumnParallelLinear, +# core_attention=DotProductAttention, +# linear_proj=RowParallelLinear, +# ), +# ), +# self_attn_bda=get_bias_dropout_add, +# ), +# ), +# mlp_layer=ModuleSpec( +# module=TransformerLayer, +# submodules=TransformerLayerSubmodules( +# pre_mlp_layernorm=PTNorm, +# mlp=ModuleSpec( +# module=MLP, +# submodules=MLPSubmodules( +# linear_fc1=ColumnParallelLinear, +# linear_fc2=RowParallelLinear, +# ), +# ), +# mlp_bda=get_bias_dropout_add, +# ), +# ), +# ), +# ) + layer_spec = ModuleSpec( module=MambaStack, submodules=MambaStackSubmodules( mamba_layer=ModuleSpec( module=MambaLayer, submodules=MambaLayerSubmodules( - norm=PTNorm, + # norm=PTNorm, mixer=ModuleSpec( module=MambaMixer, submodules=MambaMixerSubmodules( @@ -67,7 +117,7 @@ layer_spec = ModuleSpec( attention_layer=ModuleSpec( module=TransformerLayer, submodules=TransformerLayerSubmodules( - input_layernorm=PTNorm, + # input_layernorm=PTNorm, self_attention=ModuleSpec( module=SelfAttention, params={"attn_mask_type": AttnMaskType.causal}, -- Gitee From a8549bfb53698513f3d66823aaa03a4df832163d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Wed, 23 Jul 2025 19:18:33 +0800 Subject: [PATCH 08/16] modified config --- .../qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh 
b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh index 5f59b529b..bef9e8346 100644 --- a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh +++ b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh @@ -22,7 +22,7 @@ CKPT_LOAD_DIR="your model ckpt path" DATA_PATH="/home/ascend-vllm/dataset/alpaca/Qwen3-30B-A3B-convert-pretrain_text_document" TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" CKPT_SAVE_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-mamba" -CKPT_LOAD_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-tp2-pp2-ep2" +CKPT_LOAD_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-Mamba2-tp2-pp2-ep2" TP=2 PP=2 @@ -57,10 +57,10 @@ MAMBA_ARGS=" --num-layers ${NUM_LAYERS} \ --mamba-ngroups 8 \ --mamba-chunk-size 128 \ - --mamba-d-state 128 \ + --mamba-d-state 64 \ --mamba-d-conv 4 \ --mamba-expand 2 \ - --mamba-headdim 64 \ + --mamba-headdim 128 \ --tokenizer-model ${TOKENIZER_PATH} \ --hybrid-attention-ratio 0.26 \ --hybrid-mlp-ratio 0.5 \ @@ -179,6 +179,6 @@ torchrun $DISTRIBUTED_ARGS pretrain_mamba.py \ $MODEL_PARALLEL_ARGS \ $MAMBA_ARGS \ --distributed-backend nccl \ + --load ${CKPT_LOAD_DIR} \ --save ${CKPT_SAVE_DIR} \ - --load ${CKPT_LOAD_DIR} | tee logs/train_mcore_qwen3_30b_a3b.log -- Gitee From 4a88d2f5f8e2fa40ed241d0fe09f4531fb9ce478 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Thu, 24 Jul 2025 16:10:20 +0800 Subject: [PATCH 09/16] add layer norm and modified pre_mlp_norm --- .../qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh | 8 ++++---- mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh index bef9e8346..873e79d97 100644 --- a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh +++ b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh @@ -22,7 +22,7 @@ CKPT_LOAD_DIR="your model ckpt path" DATA_PATH="/home/ascend-vllm/dataset/alpaca/Qwen3-30B-A3B-convert-pretrain_text_document" TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" CKPT_SAVE_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-mamba" -CKPT_LOAD_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-Mamba2-tp2-pp2-ep2" +CKPT_LOAD_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-Mamba2-v2-tp2-pp2-ep2" TP=2 PP=2 @@ -55,12 +55,12 @@ MAMBA_ARGS=" --use-flash-attn \ --use-mcore-models \ --num-layers ${NUM_LAYERS} \ - --mamba-ngroups 8 \ + --mamba-ngroups 4 \ --mamba-chunk-size 128 \ - --mamba-d-state 64 \ + --mamba-d-state 128 \ --mamba-d-conv 4 \ --mamba-expand 2 \ - --mamba-headdim 128 \ + --mamba-headdim 64 \ --tokenizer-model ${TOKENIZER_PATH} \ --hybrid-attention-ratio 0.26 \ --hybrid-mlp-ratio 0.5 \ diff --git a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py index 8578d7442..008dd51e3 100644 --- a/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py +++ b/mindspeed_llm/tasks/models/spec/qwen3_mamba_spec.py @@ -103,7 +103,7 @@ layer_spec = ModuleSpec( mamba_layer=ModuleSpec( module=MambaLayer, submodules=MambaLayerSubmodules( - # norm=PTNorm, + norm=PTNorm, mixer=ModuleSpec( module=MambaMixer, submodules=MambaMixerSubmodules( @@ -117,7 +117,7 @@ layer_spec = ModuleSpec( attention_layer=ModuleSpec( module=TransformerLayer, submodules=TransformerLayerSubmodules( - # input_layernorm=PTNorm, + input_layernorm=PTNorm, self_attention=ModuleSpec( module=SelfAttention, params={"attn_mask_type": 
AttnMaskType.causal}, -- Gitee From 0cb2785e1d32cf0073d1634a81eacc1e9cc878ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Thu, 24 Jul 2025 16:20:23 +0800 Subject: [PATCH 10/16] remove redundant modified --- mindspeed_llm/features_manager/models/mamba.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindspeed_llm/features_manager/models/mamba.py b/mindspeed_llm/features_manager/models/mamba.py index d3fed01e8..c6cdbe879 100644 --- a/mindspeed_llm/features_manager/models/mamba.py +++ b/mindspeed_llm/features_manager/models/mamba.py @@ -15,7 +15,7 @@ class MambaModel(MindSpeedFeature): group.add_argument('--mamba-d-state', type=int, default=128, help='state dim for mamba') group.add_argument('--mamba-d-conv', type=int, default=4, help='conv channel dim for mamba') group.add_argument('--mamba-expand', type=int, default=1, help='expand scale for mamba') - group.add_argument('--mamba-headdim', type=int, default=64, help='head dim for mamba') + group.add_argument('--mamba-headdim', type=int, default=80, help='head dim for mamba') def register_patches(self, patch_manager, args): from mindspeed_llm.core.ssm.mamba_mixer import mamba_mixer_init_wrapper, mamba_mixer_forward, Mamba2RMSNorm -- Gitee From 55ac1151c7ebf9e71b865c356a60a014c7c844d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Mon, 28 Jul 2025 14:30:20 +0800 Subject: [PATCH 11/16] change head_dim=128 --- .../pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh | 53 +++++++++---------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh index 873e79d97..d3a9b2ab6 100644 --- a/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh +++ b/examples/mcore/qwen3_moe/pretrain_qwen3_30b_a3b_4K_ptd_mamba.sh @@ -1,41 +1,40 @@ #!/bin/bash - + export HCCL_CONNECT_TIMEOUT=1800 export CUDA_DEVICE_MAX_CONNECTIONS=1 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export NPU_ASD_ENABLE=0 - + NPUS_PER_NODE=8 -MASTER_ADDR=localhost +MASTER_ADDR=7.150.14.181 MASTER_PORT=6000 -NNODES=1 -NODE_RANK=0 +NNODES=2 +NODE_RANK=1 WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) - + # please fill these path configurations CKPT_SAVE_DIR="your model save ckpt path" DATA_PATH="your data path" TOKENIZER_PATH="your tokenizer path" CKPT_LOAD_DIR="your model ckpt path" - -DATA_PATH="/home/ascend-vllm/dataset/alpaca/Qwen3-30B-A3B-convert-pretrain_text_document" +DATA_PATH="/home/ascend-vllm/dataset/lsb/enwiki20230101/Qwen3-30B-A3B-convert-pretrain_text_document" TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" CKPT_SAVE_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-mamba" -CKPT_LOAD_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-Mamba2-v2-tp2-pp2-ep2" +CKPT_LOAD_DIR="/home/ascend-vllm/model/Qwen3-30B-A3B-Mamba2-v3-tp1-pp4-ep4" -TP=2 -PP=2 -EP=2 +TP=1 +PP=4 +EP=4 CP=1 - + MBS=1 GBS=256 -SEQ_LENGTH=1024 +SEQ_LENGTH=4096 TRAIN_ITERS=2000 CP_TYPE='ulysses_cp_algo' ROUTER_BALANCING_TYPE='aux_loss' - + DISTRIBUTED_ARGS=" --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ @@ -43,11 +42,11 @@ DISTRIBUTED_ARGS=" --master_addr $MASTER_ADDR \ --master_port $MASTER_PORT " - + NUM_LAYERS=96 LAYER_PATTEN="*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" -NUM_LAYERS=4 -LAYER_PATTEN="*-M-" +# NUM_LAYERS=8 +# LAYER_PATTEN="*-M-*-M-" MAMBA_ARGS=" --reuse-fp32-param \ --no-shared-storage \ @@ -60,7 +59,7 @@ MAMBA_ARGS=" 
--mamba-d-state 128 \ --mamba-d-conv 4 \ --mamba-expand 2 \ - --mamba-headdim 64 \ + --mamba-headdim 128 \ --tokenizer-model ${TOKENIZER_PATH} \ --hybrid-attention-ratio 0.26 \ --hybrid-mlp-ratio 0.5 \ @@ -70,7 +69,7 @@ MAMBA_ARGS=" --overlap-grad-reduce \ --norm-epsilon 1e-6 \ " - + MOE_ARGS=" --num-experts 128 \ --moe-router-topk 8 \ @@ -82,7 +81,7 @@ MOE_ARGS=" --moe-token-dispatcher-type alltoall \ --moe-aux-loss-coeff 0.001 \ " - + OPTIMIZE_ARGS=" --use-flash-attn \ --use-fused-rotary-pos-emb \ @@ -97,7 +96,7 @@ OPTIMIZE_ARGS=" --recompute-granularity full \ --recompute-num-layers 1 \ " - + TRAIN_ARGS=" --micro-batch-size ${MBS} \ --global-batch-size ${GBS} \ @@ -119,7 +118,7 @@ TRAIN_ARGS=" --seq-length ${SEQ_LENGTH} \ --no-shared-storage " - + MODEL_PARALLEL_ARGS=" --tensor-model-parallel-size ${TP} \ --pipeline-model-parallel-size ${PP} \ @@ -127,7 +126,7 @@ MODEL_PARALLEL_ARGS=" --context-parallel-size ${CP} \ --context-parallel-algo ${CP_TYPE} \ " - + GPT_ARGS=" --use-mcore-models \ --spec mindspeed_llm.tasks.models.spec.qwen3_mamba_spec layer_spec \ @@ -154,12 +153,12 @@ GPT_ARGS=" --group-query-attention \ --num-query-groups 4 " - + DATA_ARGS=" --data-path $DATA_PATH \ --split 100,0,0 " - + OUTPUT_ARGS=" --log-interval 1 \ --save-interval ${TRAIN_ITERS} \ @@ -168,7 +167,7 @@ OUTPUT_ARGS=" --no-load-optim \ --no-load-rng " - + torchrun $DISTRIBUTED_ARGS pretrain_mamba.py \ $GPT_ARGS \ $DATA_ARGS \ -- Gitee From 9e7539c1694703ff100a03be2d40e7ded97746f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Tue, 29 Jul 2025 13:35:37 +0000 Subject: [PATCH 12/16] add inference_mamba.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: flippy航 <654733882@qq.com> --- inference_mamba.py | 139 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 inference_mamba.py diff --git a/inference_mamba.py b/inference_mamba.py new file mode 100644 index 000000000..ac4b8964f --- /dev/null +++ b/inference_mamba.py @@ -0,0 +1,139 @@ +# coding=utf-8 +# Copyright (c) 2024, HUAWEI CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Union
+
+from mindspeed_llm import megatron_adaptor
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec, \
+    get_gpt_layer_local_spec
+from megatron.core.transformer.spec_utils import import_module
+from megatron.training import get_args, print_rank_0
+from megatron.legacy.model import GPTModel
+from megatron.training.initialize import initialize_megatron
+from megatron.training.arguments import core_transformer_config_from_args
+from megatron.training.yaml_arguments import core_transformer_config_from_yaml
+
+from mindspeed_llm.tasks.inference.infer_base import task_factory
+from mindspeed_llm.tasks.inference.module import GPTModelInfer, MambaModelInfer, MegatronModuleForCausalLM
+from megatron.core.inference_params import InferenceParams
+
+
+def model_provider(pre_process=True, post_process=True) -> Union[MambaModelInfer, GPTModel]:
+    """Builds the model.
+
+    If use_mcore_models is set to True, this returns the mcore Mamba model; otherwise the legacy GPT model.
+
+    Args:
+        pre_process (bool, optional): Set to true if you need to compute embeddings. Defaults to True.
+        post_process (bool, optional): Set to true if you want to compute output logits/loss. Defaults to True.
+
+    Returns:
+        Union[MambaModelInfer, GPTModel]: The returned model
+    """
+    args = get_args()
+    use_te = args.transformer_impl == "transformer_engine"
+
+    if args.sequence_parallel and args.use_kv_cache:
+        raise AssertionError('Use_kv_cache can not be true in sequence_parallel mode.')
+
+    print_rank_0('building GPT model ...')
+    # Experimental loading arguments from yaml
+    if args.yaml_cfg is not None:
+        config = core_transformer_config_from_yaml(args, "language_model")
+    else:
+        config = core_transformer_config_from_args(args)
+
+    if args.spec is not None:
+        mamba_stack_spec = import_module(args.spec)
+    else:
+        raise AssertionError("You must provide a valid Mamba layer spec!")
+ + if args.use_mcore_models: + + model = MambaModelInfer( + config=config, + mamba_stack_spec=mamba_stack_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + mamba_ssm_ngroups=args.mamba_ngroups, + pre_process=pre_process, + hybrid_attention_ratio=args.hybrid_attention_ratio, + hybrid_mlp_ratio=args.hybrid_mlp_ratio, + hybrid_override_pattern=args.hybrid_override_pattern, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent, + rotary_base=args.rotary_base + ) + + else: + if not args.context_parallel_size == 1: + raise ValueError("Context parallelism is only supported with Megatron Core!") + + model = GPTModel( + config, + parallel_output=True if args.sequence_parallel else False, + pre_process=pre_process, + post_process=post_process + ) + + return model + + +def main(): + initialize_megatron(args_defaults={'no_load_rng': True, + 'no_load_optim': True}) + + args = get_args() + + model = MegatronModuleForCausalLM.from_pretrained( + model_provider=model_provider, + pretrained_model_name_or_path=args.load + ) + + task_factory(args, model) + + + # # 生成指定输入 + # import torch + # import numpy as np + # from megatron.training.utils import get_ltor_masks_and_position_ids + # input_ids = torch.tensor([i for i in range(10000, 12048)]).unsqueeze(0).npu() + # eod = 0 + # reset_position_ids = False + # reset_attention_mask = False + # eod_mask_loss = False + # max_batch_size = 1 + # max_sequence_length = 2048 + # attention_mask, loss_mask, _ = get_ltor_masks_and_position_ids( + # input_ids, + # eod, + # reset_position_ids, + # reset_attention_mask, + # eod_mask_loss) + # inference_params = InferenceParams(max_batch_size, max_sequence_length) + # with torch.no_grad(): + # outputs = model.forward(input_ids=input_ids, position_ids=None, attention_mask=attention_mask.npu(), inference_params=inference_params) + # print(outputs.shape, outputs.dtype) + # np.save("./npu_forward_out_mg_qewn3_mamba_logits_fp16.npy", outputs.cpu().numpy()) + + +if __name__ == "__main__": + main() \ No newline at end of file -- Gitee From 151b1e0598f7bb8edac9503f65c9726a031d0c7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Tue, 29 Jul 2025 13:36:24 +0000 Subject: [PATCH 13/16] add examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: flippy航 <654733882@qq.com> --- .../generate_qwen3_30b_a3b_ptd_mamba.sh | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh diff --git a/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh b/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh new file mode 100644 index 000000000..f9ba7c3cd --- /dev/null +++ b/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# The number of parameters is not aligned +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +# please fill these path configurations +TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" +CHECKPOINT="/home/ascend-vllm/model/Qwen3-30B-A3B-Mamba2-v4-tp1-pp8-ep1" + + +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +NPUS_PER_NODE=8 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +TP=1 +PP=8 +EP=1 +SEQ_LENGTH=2048 +ROUTER_BALANCING_TYPE='softmax_topk' + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +MOE_ARGS=" + --num-experts 128 \ + --moe-router-topk 8 \ + --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \ + --moe-intermediate-size 768 \ + --moe-permutation-async-comm \ + --moe-token-dispatcher-type allgather \ + --moe-aux-loss-coeff 0.001 +" + + +NUM_LAYERS=96 +LAYER_PATTEN="*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" +# NUM_LAYERS=8 +# LAYER_PATTEN="*-M-*-M-" +MAMBA_ARGS=" + --reuse-fp32-param \ + --no-shared-storage \ + --use-distributed-optimizer \ + --use-flash-attn \ + --use-mcore-models \ + --num-layers ${NUM_LAYERS} \ + --mamba-ngroups 4 \ + --mamba-chunk-size 128 \ + --mamba-d-state 128 \ + --mamba-d-conv 4 \ + --mamba-expand 2 \ + --mamba-headdim 128 \ + --tokenizer-model ${TOKENIZER_PATH} \ + --hybrid-attention-ratio 0.26 \ + --hybrid-mlp-ratio 0.5 \ + --hybrid-override-pattern $LAYER_PATTEN \ + --untie-embeddings-and-output-weights \ + --overlap-param-gather \ + --overlap-grad-reduce \ + --norm-epsilon 1e-6 \ +" + +torchrun $DISTRIBUTED_ARGS inference_mamba.py \ + $MOE_ARGS \ + $MAMBA_ARGS \ + --use-mcore-models \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --expert-model-parallel-size ${EP} \ + --load ${CHECKPOINT} \ + --moe-grouped-gemm \ + --norm-topk-prob \ + --spec mindspeed_llm.tasks.models.spec.qwen3_mamba_spec layer_spec \ + --kv-channels 128 \ + --qk-layernorm \ + --num-layers ${NUM_LAYERS} \ + --hidden-size 2048 \ + --use-rotary-position-embeddings \ + --num-attention-heads 32 \ + --ffn-hidden-size 8192 \ + --max-position-embeddings 40960 \ + --seq-length ${SEQ_LENGTH} \ + --make-vocab-size-divisible-by 1 \ + --padded-vocab-size 151936 \ + --rotary-base 1000000 \ + --untie-embeddings-and-output-weights \ + --micro-batch-size 1 \ + --disable-bias-linear \ + --swiglu \ + --use-fused-swiglu \ + --use-fused-rmsnorm \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path ${TOKENIZER_PATH} \ + --normalization RMSNorm \ + --position-embedding-type rope \ + --norm-epsilon 1e-6 \ + --hidden-dropout 0 \ + --attention-dropout 0 \ + --tokenizer-not-use-fast \ + --max-new-tokens 256 \ + --no-gradient-accumulation-fusion \ + --attention-softmax-in-fp32 \ + --exit-on-missing-checkpoint \ + --no-masked-softmax-fusion \ + 
--group-query-attention \ + --num-query-groups 4 \ + --seed 42 \ + --bf16 \ + | tee logs/generate_mcore_qwen3_30b_a3b.log -- Gitee From fa4b018190284ffbceb55156343a62d43935de1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Tue, 29 Jul 2025 21:40:01 +0800 Subject: [PATCH 14/16] support mamba inference --- mindspeed_llm/core/ssm/mamba_mixer.py | 2 +- mindspeed_llm/tasks/inference/module.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/mindspeed_llm/core/ssm/mamba_mixer.py b/mindspeed_llm/core/ssm/mamba_mixer.py index 3ae0b6f77..76b12a774 100644 --- a/mindspeed_llm/core/ssm/mamba_mixer.py +++ b/mindspeed_llm/core/ssm/mamba_mixer.py @@ -131,7 +131,7 @@ def mamba_mixer_forward(self, hidden_states, seqlen=None, seq_idx=None, cu_seqle ) state_opts = StateOptions( - return_final_state=True if ssm_state else False + return_final_state=True if ssm_state is not None else False ) state_space_duality = StateSpaceProcessor(config=config) y = state_space_duality.process(inputs, state_opts) diff --git a/mindspeed_llm/tasks/inference/module.py b/mindspeed_llm/tasks/inference/module.py index cc9da9d48..c4e4d9dcb 100644 --- a/mindspeed_llm/tasks/inference/module.py +++ b/mindspeed_llm/tasks/inference/module.py @@ -25,6 +25,7 @@ from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from megatron.core.models.gpt.gpt_model import GPTModel from megatron.training import get_args, global_vars from megatron.core import parallel_state, ModelParallelConfig +from megatron.core.models.mamba import MambaModel class MegatronModuleForCausalLMABC(torch.nn.Module, abc.ABC): @@ -501,5 +502,13 @@ class GPTModelInfer(GPTModel): super().__init__(*args, **kwargs) self.infer_model = MegatronModuleForCausalLM() + def generate(self, input_ids=None, **kwargs): + return self.infer_model.generate(input_ids=input_ids, **kwargs) + +class MambaModelInfer(MambaModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.infer_model = MegatronModuleForCausalLM() + def generate(self, input_ids=None, **kwargs): return self.infer_model.generate(input_ids=input_ids, **kwargs) \ No newline at end of file -- Gitee From 05ddf3ace04fbb897193d06bf157a2b72019f31e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Wed, 30 Jul 2025 09:53:34 +0800 Subject: [PATCH 15/16] replace /r --- .../generate_qwen3_30b_a3b_ptd_mamba.sh | 234 +++++++++--------- 1 file changed, 117 insertions(+), 117 deletions(-) diff --git a/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh b/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh index f9ba7c3cd..c33ceda35 100644 --- a/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh +++ b/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd_mamba.sh @@ -1,117 +1,117 @@ -#!/bin/bash - -# The number of parameters is not aligned -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -# please fill these path configurations -TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" -CHECKPOINT="/home/ascend-vllm/model/Qwen3-30B-A3B-Mamba2-v4-tp1-pp8-ep1" - - -# Change for multinode config -MASTER_ADDR=localhost -MASTER_PORT=6000 -NNODES=1 -NODE_RANK=0 -NPUS_PER_NODE=8 -WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) - -TP=1 -PP=8 -EP=1 -SEQ_LENGTH=2048 -ROUTER_BALANCING_TYPE='softmax_topk' - -DISTRIBUTED_ARGS=" - --nproc_per_node $NPUS_PER_NODE \ - --nnodes $NNODES \ - --node_rank $NODE_RANK \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT -" - -MOE_ARGS=" - 
--num-experts 128 \ - --moe-router-topk 8 \ - --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \ - --moe-intermediate-size 768 \ - --moe-permutation-async-comm \ - --moe-token-dispatcher-type allgather \ - --moe-aux-loss-coeff 0.001 -" - - -NUM_LAYERS=96 -LAYER_PATTEN="*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" -# NUM_LAYERS=8 -# LAYER_PATTEN="*-M-*-M-" -MAMBA_ARGS=" - --reuse-fp32-param \ - --no-shared-storage \ - --use-distributed-optimizer \ - --use-flash-attn \ - --use-mcore-models \ - --num-layers ${NUM_LAYERS} \ - --mamba-ngroups 4 \ - --mamba-chunk-size 128 \ - --mamba-d-state 128 \ - --mamba-d-conv 4 \ - --mamba-expand 2 \ - --mamba-headdim 128 \ - --tokenizer-model ${TOKENIZER_PATH} \ - --hybrid-attention-ratio 0.26 \ - --hybrid-mlp-ratio 0.5 \ - --hybrid-override-pattern $LAYER_PATTEN \ - --untie-embeddings-and-output-weights \ - --overlap-param-gather \ - --overlap-grad-reduce \ - --norm-epsilon 1e-6 \ -" - -torchrun $DISTRIBUTED_ARGS inference_mamba.py \ - $MOE_ARGS \ - $MAMBA_ARGS \ - --use-mcore-models \ - --tensor-model-parallel-size ${TP} \ - --pipeline-model-parallel-size ${PP} \ - --expert-model-parallel-size ${EP} \ - --load ${CHECKPOINT} \ - --moe-grouped-gemm \ - --norm-topk-prob \ - --spec mindspeed_llm.tasks.models.spec.qwen3_mamba_spec layer_spec \ - --kv-channels 128 \ - --qk-layernorm \ - --num-layers ${NUM_LAYERS} \ - --hidden-size 2048 \ - --use-rotary-position-embeddings \ - --num-attention-heads 32 \ - --ffn-hidden-size 8192 \ - --max-position-embeddings 40960 \ - --seq-length ${SEQ_LENGTH} \ - --make-vocab-size-divisible-by 1 \ - --padded-vocab-size 151936 \ - --rotary-base 1000000 \ - --untie-embeddings-and-output-weights \ - --micro-batch-size 1 \ - --disable-bias-linear \ - --swiglu \ - --use-fused-swiglu \ - --use-fused-rmsnorm \ - --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path ${TOKENIZER_PATH} \ - --normalization RMSNorm \ - --position-embedding-type rope \ - --norm-epsilon 1e-6 \ - --hidden-dropout 0 \ - --attention-dropout 0 \ - --tokenizer-not-use-fast \ - --max-new-tokens 256 \ - --no-gradient-accumulation-fusion \ - --attention-softmax-in-fp32 \ - --exit-on-missing-checkpoint \ - --no-masked-softmax-fusion \ - --group-query-attention \ - --num-query-groups 4 \ - --seed 42 \ - --bf16 \ - | tee logs/generate_mcore_qwen3_30b_a3b.log +#!/bin/bash + +# The number of parameters is not aligned +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +# please fill these path configurations +TOKENIZER_PATH="/home/ascend-vllm/model/Qwen3-30B-A3B" +CHECKPOINT="/home/ascend-vllm/model/Qwen3-30B-A3B-Mamba2-v4-tp1-pp8-ep1" + + +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +NPUS_PER_NODE=8 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +TP=1 +PP=8 +EP=1 +SEQ_LENGTH=2048 +ROUTER_BALANCING_TYPE='softmax_topk' + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +MOE_ARGS=" + --num-experts 128 \ + --moe-router-topk 8 \ + --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \ + --moe-intermediate-size 768 \ + --moe-permutation-async-comm \ + --moe-token-dispatcher-type allgather \ + --moe-aux-loss-coeff 0.001 +" + + +NUM_LAYERS=96 +LAYER_PATTEN="*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-M-*-*-" +# NUM_LAYERS=8 +# LAYER_PATTEN="*-M-*-M-" +MAMBA_ARGS=" + --reuse-fp32-param \ + 
--no-shared-storage \ + --use-distributed-optimizer \ + --use-flash-attn \ + --use-mcore-models \ + --num-layers ${NUM_LAYERS} \ + --mamba-ngroups 4 \ + --mamba-chunk-size 128 \ + --mamba-d-state 128 \ + --mamba-d-conv 4 \ + --mamba-expand 2 \ + --mamba-headdim 128 \ + --tokenizer-model ${TOKENIZER_PATH} \ + --hybrid-attention-ratio 0.26 \ + --hybrid-mlp-ratio 0.5 \ + --hybrid-override-pattern $LAYER_PATTEN \ + --untie-embeddings-and-output-weights \ + --overlap-param-gather \ + --overlap-grad-reduce \ + --norm-epsilon 1e-6 \ +" + +torchrun $DISTRIBUTED_ARGS inference_mamba.py \ + $MOE_ARGS \ + $MAMBA_ARGS \ + --use-mcore-models \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --expert-model-parallel-size ${EP} \ + --load ${CHECKPOINT} \ + --moe-grouped-gemm \ + --norm-topk-prob \ + --spec mindspeed_llm.tasks.models.spec.qwen3_mamba_spec layer_spec \ + --kv-channels 128 \ + --qk-layernorm \ + --num-layers ${NUM_LAYERS} \ + --hidden-size 2048 \ + --use-rotary-position-embeddings \ + --num-attention-heads 32 \ + --ffn-hidden-size 8192 \ + --max-position-embeddings 40960 \ + --seq-length ${SEQ_LENGTH} \ + --make-vocab-size-divisible-by 1 \ + --padded-vocab-size 151936 \ + --rotary-base 1000000 \ + --untie-embeddings-and-output-weights \ + --micro-batch-size 1 \ + --disable-bias-linear \ + --swiglu \ + --use-fused-swiglu \ + --use-fused-rmsnorm \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path ${TOKENIZER_PATH} \ + --normalization RMSNorm \ + --position-embedding-type rope \ + --norm-epsilon 1e-6 \ + --hidden-dropout 0 \ + --attention-dropout 0 \ + --tokenizer-not-use-fast \ + --max-new-tokens 256 \ + --no-gradient-accumulation-fusion \ + --attention-softmax-in-fp32 \ + --exit-on-missing-checkpoint \ + --no-masked-softmax-fusion \ + --group-query-attention \ + --num-query-groups 4 \ + --seed 42 \ + --bf16 \ + | tee logs/generate_mcore_qwen3_30b_a3b.log -- Gitee From 6d3a6adc7983942f23c1425ba3bda9e7e6b59410 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?flippy=E8=88=AA?= <654733882@qq.com> Date: Fri, 1 Aug 2025 17:15:07 +0800 Subject: [PATCH 16/16] precision alignment for Mamba and MoE --- mindspeed_llm/core/ssm/mamba_mixer.py | 6 +++--- mindspeed_llm/core/transformer/moe/moe_layer.py | 1 + .../tasks/models/ssm/state_space_duality.py | 14 +++++++++----- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/mindspeed_llm/core/ssm/mamba_mixer.py b/mindspeed_llm/core/ssm/mamba_mixer.py index 76b12a774..2d6045378 100644 --- a/mindspeed_llm/core/ssm/mamba_mixer.py +++ b/mindspeed_llm/core/ssm/mamba_mixer.py @@ -22,8 +22,8 @@ def mamba_mixer_init_wrapper(fn): kwargs["expand"] = param_args.mamba_expand kwargs["headdim"] = param_args.mamba_headdim fn(self, *args, **kwargs) - dt_min = kwargs.pop('dt_min', 0.001) - dt_max = kwargs.pop('dt_max', 0.1) + dt_min = kwargs.pop('dt_min', 0.0) + dt_max = kwargs.pop('dt_max', float("inf")) self.use_mem_eff_path = False self.d_ssm = param_args.mamba_d_ssm self.dt_min = dt_min @@ -102,9 +102,9 @@ def mamba_mixer_forward(self, hidden_states, seqlen=None, seq_idx=None, cu_seqle x, B, C = torch.split( xBC, [ - self.d_inner_local, self.ngroups_local * self.d_state, self.ngroups_local * self.d_state, + self.d_inner_local, ], dim=-1, ) diff --git a/mindspeed_llm/core/transformer/moe/moe_layer.py b/mindspeed_llm/core/transformer/moe/moe_layer.py index c4997a29c..52f3de78e 100644 --- a/mindspeed_llm/core/transformer/moe/moe_layer.py +++ b/mindspeed_llm/core/transformer/moe/moe_layer.py @@ -93,6 +93,7 @@ def 
moe_layer_forward(self, hidden_states: torch.Tensor):
 
     # process MoE
     scores, indices = self.router(hidden_states)
+    scores = scores / scores.sum(dim=-1, keepdim=True)
 
     if global_args.moe_revert_type_after_topk:
         (dispatched_input, tokens_per_expert) = self.token_dispatcher.token_permutation(
diff --git a/mindspeed_llm/tasks/models/ssm/state_space_duality.py b/mindspeed_llm/tasks/models/ssm/state_space_duality.py
index 809eeffaf..c68d60087 100644
--- a/mindspeed_llm/tasks/models/ssm/state_space_duality.py
+++ b/mindspeed_llm/tasks/models/ssm/state_space_duality.py
@@ -77,12 +77,14 @@ class StateSpaceProcessor:
 
         # Dimension transformations
         x, dt, A, B, C = self._expand_dims(x, A, dt, B, C)
-        B_exp, C_exp = self._expand_groups_to_heads(B, C)
+        # B_exp, C_exp = self._expand_groups_to_heads(B, C)
+        x_exp, B_exp = self._expand_groups_to_heads(x, B)
         dt_proc = self._process_time_step(dt)
-        D = self._prepare_residual(D, x, pad_size)
+        D = self._prepare_residual(D, x_exp, pad_size)
 
         # Chunk processing
-        x_pad, A_pad, B_pad, C_pad = self._chunk_and_pad(x, dt_proc, A, B_exp, C_exp, pad_size)
+        # x_pad, A_pad, B_pad, C_pad = self._chunk_and_pad(x, dt_proc, A, B_exp, C_exp, pad_size)
+        x_pad, A_pad, B_pad, C_pad = self._chunk_and_pad(x_exp, dt_proc, A, B_exp, C, pad_size)
 
         # Core computations
         Y_diag, states, A_cum, C_br = self._compute_diagonal_blocks(A_pad, B_pad, C_pad, x_pad)
@@ -93,11 +95,13 @@ class StateSpaceProcessor:
         return self._synthesize_output((Y_diag, Y_off, D), (pad_size, seq_len), state_opts)
 
     def _expand_dims(self, x, A, dt, B, C):
-        x = rearrange(x, "b l (h p) -> b l h p", p=self.config['headdim']).contiguous()
+        # x = rearrange(x, "b l (h p) -> b l h p", p=self.config['headdim']).contiguous()
+        C = rearrange(C, "b l (h p) -> b l h p", p=self.config['headdim']).contiguous()
         dt = dt.contiguous()
         A = A.contiguous()
         B = rearrange(B, "b l (g n) -> b l g n", n=self.config['d_state']).contiguous()
-        C = rearrange(C, "b l (g n) -> b l g n", n=self.config['d_state']).contiguous()
+        x = rearrange(x, "b l (g n) -> b l g n", n=self.config['d_state']).contiguous()
+        # C = rearrange(C, "b l (g n) -> b l g n", n=self.config['d_state']).contiguous()
         return x, dt, A, B, C
 
     def _prepare_initial_states(self, states: Optional[torch.Tensor]) -> torch.Tensor:
-- Gitee
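Note on the router-score renormalization introduced in [PATCH 16/16] above: Qwen3-MoE routes each token to 8 of 128 experts, and once the softmax has been taken over all experts the 8 retained probabilities no longer sum to 1. The added line "scores = scores / scores.sum(dim=-1, keepdim=True)" rescales them per token, which is the behaviour the generation script requests with --norm-topk-prob. The sketch below is illustrative only; the helper name topk_router_scores is hypothetical and not part of MindSpeed-LLM, and it assumes a plain softmax-then-top-k router as configured in these scripts.

import torch

def topk_router_scores(logits, k):
    """Softmax over all experts, keep the top-k, then renormalize the kept scores.

    Without the final division the k selected probabilities sum to less than 1
    (the mass of the unselected experts is lost), which changes the scale of the
    combined expert outputs.
    """
    probs = torch.softmax(logits, dim=-1)               # [tokens, num_experts]
    scores, indices = torch.topk(probs, k, dim=-1)      # [tokens, k]
    scores = scores / scores.sum(dim=-1, keepdim=True)  # same rescaling as the line added in moe_layer_forward
    return scores, indices

if __name__ == "__main__":
    logits = torch.randn(4, 128)                  # 4 tokens, 128 experts as in the Qwen3-30B-A3B configs above
    scores, indices = topk_router_scores(logits, k=8)
    print(scores.sum(dim=-1))                     # each row sums to 1.0 after renormalization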