From c9a2813300a6c38925ed361019791ad6b1d1185e Mon Sep 17 00:00:00 2001
From: y30062407
Date: Fri, 20 Jun 2025 11:35:52 +0800
Subject: [PATCH] 【mindspore】【bugfix】update dsv3-pretrain sh
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../pretrain_deepseek3_671b_4k_A3_ptd.sh |  1 +
 .../pretrain_deepseek3_671b_4k_A3_ms.sh  | 30 ++++++++++++-------
 .../pretrain_deepseek3_671b_4k_ms.sh     | 21 ++++++++++++-----
 3 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/examples/mcore/deepseek3/pretrain_deepseek3_671b_4k_A3_ptd.sh b/examples/mcore/deepseek3/pretrain_deepseek3_671b_4k_A3_ptd.sh
index 5ba927005..0f9d51679 100644
--- a/examples/mcore/deepseek3/pretrain_deepseek3_671b_4k_A3_ptd.sh
+++ b/examples/mcore/deepseek3/pretrain_deepseek3_671b_4k_A3_ptd.sh
@@ -104,6 +104,7 @@ ROPE_ARGS="
     --rope-scaling-type yarn
 "
 
+
 GPT_ARGS="
     --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \
     --reset-position-ids \
diff --git a/examples/mindspore/deepseek3/pretrain_deepseek3_671b_4k_A3_ms.sh b/examples/mindspore/deepseek3/pretrain_deepseek3_671b_4k_A3_ms.sh
index 02be45d5e..852ce17bd 100644
--- a/examples/mindspore/deepseek3/pretrain_deepseek3_671b_4k_A3_ms.sh
+++ b/examples/mindspore/deepseek3/pretrain_deepseek3_671b_4k_A3_ms.sh
@@ -7,7 +7,7 @@ export HCCL_BUFFSIZE=400
 
 NPUS_PER_NODE=16
 MASTER_ADDR=localhost
-MASTER_PORT=9110
+MASTER_PORT=6000
 NNODES=32
 NODE_RANK=0
 WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
@@ -78,7 +78,19 @@ MOE_ARGS="
 MTP_ARGS="
     --mtp-num-layers 1 \
     --mtp-loss-scaling-factor 0.3 \
+"
+
+DUALPIPE_ARGS="
+    --moe-fb-overlap \
+    --schedules-method dualpipev \
+"
+
+MEM_ARGS="
     --mtp-mem-efficient-logits \
+    --use-distributed-optimizer \
+    --recompute-granularity full \
+    --recompute-method block \
+    --recompute-num-layers 8 \
 "
 
 ROPE_ARGS="
@@ -91,14 +103,8 @@ ROPE_ARGS="
     --rope-scaling-type yarn
 "
 
-MEM_ARGS="
-    --use-distributed-optimizer \
-    --recompute-method uniform \
-    --recompute-granularity full \
-    --recompute-num-layers 1 \
-"
 
-GPT_ARGS="\
+GPT_ARGS="
     --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \
     --reset-position-ids \
     --noop-layers 61,62,63 \
@@ -107,7 +113,6 @@ GPT_ARGS="\
     --use-mcore-models \
     --tensor-model-parallel-size ${TP} \
     --pipeline-model-parallel-size ${PP} \
-    --num-layers-per-virtual-pipeline-stage ${VPP} \
     --expert-model-parallel-size ${EP} \
     --sequence-parallel \
     --context-parallel-size ${CP} \
@@ -164,7 +169,7 @@ DATA_ARGS="
 
 OUTPUT_ARGS="
     --log-interval 1 \
-    --save-interval 1 \
+    --save-interval 2000 \
     --eval-interval 2000 \
     --eval-iters 0 \
     --no-save-optim \
@@ -176,10 +181,13 @@ msrun $DISTRIBUTED_ARGS $basepath/pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     $MLA_ARGS \
+    $DUALPIPE_ARGS \
+    $MEM_ARGS \
     $ROPE_ARGS \
     $MOE_ARGS \
     $MTP_ARGS \
-    $MEM_ARGS \
     --distributed-backend nccl \
+    --save $CKPT_SAVE_DIR \
+    --load $CKPT_LOAD_DIR \
     --ai-framework mindspore \
     2>&1 | tee logs/pretrain_deepseek3_671b_4k_A3_ms.log
diff --git a/examples/mindspore/deepseek3/pretrain_deepseek3_671b_4k_ms.sh b/examples/mindspore/deepseek3/pretrain_deepseek3_671b_4k_ms.sh
index b24d4bb77..0b67b9f3e 100644
--- a/examples/mindspore/deepseek3/pretrain_deepseek3_671b_4k_ms.sh
+++ b/examples/mindspore/deepseek3/pretrain_deepseek3_671b_4k_ms.sh
@@ -8,8 +8,8 @@ export HCCL_BUFFSIZE=400
 basepath=$(cd `dirname $0`; cd ../../../; pwd)
 
 NPUS_PER_NODE=8
-MASTER_ADDR=localhost #主节点IP
-MASTER_PORT=9110
+MASTER_ADDR=localhost #MASTER IP
+MASTER_PORT=6000
 NNODES=64
 NODE_RANK=0
 WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
@@ -23,7 +23,6 @@ TP=4
 PP=8
 EP=8
 CP=1
-VPP=1
 CP_TYPE='ulysses_cp_algo'
 NUM_LAYERS=64
 SEQ_LEN=4096
@@ -50,9 +49,11 @@ MLA_ARGS="
     --kv-lora-rank 512 \
     --v-head-dim 128 \
     --qk-layernorm \
+    --mla-fa-without-pad \
 "
 
 MOE_ARGS="
+    --router-gating-in-fp32 \
     --moe-grouped-gemm \
     --moe-permutation-async-comm \
     --use-fused-moe-token-permute-and-unpermute \
@@ -67,12 +68,12 @@ MOE_ARGS="
     --n-group 8 \
     --topk-group 4 \
     --routed-scaling-factor 2.5 \
+    --moe-aux-loss-coeff 0.0001 \
     --seq-aux \
     --norm-topk-prob \
     --moe-router-score-function sigmoid \
     --moe-router-enable-expert-bias \
     --moe-tp-extend-ep \
-    --moe-alltoall-overlap-comm \
 "
 
 MTP_ARGS="
@@ -91,6 +92,11 @@ ROPE_ARGS="
     --rope-scaling-type yarn
 "
 
+DUALPIPE_ARGS="
+    --moe-fb-overlap \
+    --schedules-method dualpipev \
+"
+
 MEM_ARGS="
     --use-distributed-optimizer \
     --recompute-method uniform \
@@ -99,7 +105,6 @@ MEM_ARGS="
 "
 
 GPT_ARGS="\
-    --no-check-for-nan-in-loss-and-grad \
     --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \
     --no-gradient-accumulation-fusion \
     --reset-position-ids \
@@ -110,7 +115,6 @@ GPT_ARGS="\
     --use-mcore-models \
     --tensor-model-parallel-size ${TP} \
     --pipeline-model-parallel-size ${PP} \
-    --num-layers-per-virtual-pipeline-stage ${VPP} \
     --expert-model-parallel-size ${EP} \
     --sequence-parallel \
     --context-parallel-size ${CP} \
@@ -169,7 +173,7 @@ DATA_ARGS="
 
 OUTPUT_ARGS="
     --log-interval 1 \
-    --save-interval 1 \
+    --save-interval 2000 \
     --eval-interval 2000 \
     --eval-iters 0 \
     --no-save-optim \
@@ -184,7 +188,10 @@ msrun $DISTRIBUTED_ARGS $basepath/pretrain_gpt.py \
     $ROPE_ARGS \
     $MOE_ARGS \
     $MTP_ARGS \
+    $DUALPIPE_ARGS \
     $MEM_ARGS \
     --distributed-backend nccl \
+    --save $CKPT_SAVE_DIR \
+    --load $CKPT_LOAD_DIR \
     --ai-framework mindspore \
     2>&1 | tee logs/ms_pretrain_deepseek3_671b_4k_ptd.log
-- 
Gitee
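
For reference, a minimal sketch of applying this mail-formatted patch and launching the updated A3 MindSpore script; the patch filename and repository root below are assumptions for illustration, not part of the patch itself:

    # Apply the git-format-patch mail with git am (filename assumed).
    cd MindSpeed-LLM
    git am 0001-mindspore-bugfix-update-dsv3-pretrain-sh.patch

    # The patched launch commands now pass --save $CKPT_SAVE_DIR and
    # --load $CKPT_LOAD_DIR, so both variables must point at real
    # checkpoint paths (presumably defined near the top of each script,
    # outside the hunks shown above) before starting a run on the
    # master node (NODE_RANK=0); the remaining nodes run the same
    # script with their own NODE_RANK.
    bash examples/mindspore/deepseek3/pretrain_deepseek3_671b_4k_A3_ms.sh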