From eb3450af1be72b282e26e6ff51d9687a26b27e2f Mon Sep 17 00:00:00 2001
From: HanhuiChen
Date: Wed, 6 Aug 2025 15:49:41 +0800
Subject: [PATCH 1/5] update Qwen3-14b pretrain and tune scripts

---
 .../mcore/qwen3/pretrain_qwen3_14b_4K_ptd.sh  | 15 +++++++++------
 .../mcore/qwen3/tune_qwen3_14b_4K_full_ptd.sh | 18 +++++++++++++++---
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd.sh b/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd.sh
index 918d8f4c0a..acb4f9a688 100644
--- a/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd.sh
+++ b/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd.sh
@@ -17,10 +17,10 @@ CKPT_SAVE_DIR="your model save ckpt path"
 DATA_PATH="your data path"
 TOKENIZER_PATH="your tokenizer path"
 
-TP=4
-PP=2
+TP=8
+PP=1
 CP=1
-MBS=1
+MBS=4
 GBS=128
 SEQ_LENGTH=4096
 TRAIN_ITERS=2000
@@ -37,11 +37,13 @@ DISTRIBUTED_ARGS="
 OPTIMIZE_ARGS="
     --use-flash-attn \
     --use-fused-rotary-pos-emb \
-    --use-rotary-position-embeddings \
     --use-fused-swiglu \
     --use-fused-rmsnorm \
-    --no-masked-softmax-fusion \
-    --use-distributed-optimizer
+    --use-distributed-optimizer \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+    --use-ascend-coc \
+    --coc-fused-kernel \
 "
 
 TRAIN_ARGS="
@@ -82,6 +84,7 @@ GPT_ARGS="
     --max-position-embeddings 40960 \
     --make-vocab-size-divisible-by 1 \
     --padded-vocab-size 151936 \
+    --use-rotary-position-embeddings \
     --rotary-base 1000000 \
     --disable-bias-linear \
     --swiglu \
diff --git a/examples/mcore/qwen3/tune_qwen3_14b_4K_full_ptd.sh b/examples/mcore/qwen3/tune_qwen3_14b_4K_full_ptd.sh
index c8d3c87a47..0b69f14834 100644
--- a/examples/mcore/qwen3/tune_qwen3_14b_4K_full_ptd.sh
+++ b/examples/mcore/qwen3/tune_qwen3_14b_4K_full_ptd.sh
@@ -19,8 +19,8 @@ TOKENIZER_PATH="your tokenizer path"
 
 TP=8
 PP=1
-MBS=1
-GBS=16
+MBS=4
+GBS=128
 SEQ_LENGTH=4096
 TRAIN_ITERS=2000
 
@@ -32,6 +32,18 @@ DISTRIBUTED_ARGS="
     --master_port $MASTER_PORT
 "
 
+OPTIMIZE_ARGS="
+    --use-flash-attn \
+    --use-fused-rotary-pos-emb \
+    --use-fused-swiglu \
+    --use-fused-rmsnorm \
+    --use-distributed-optimizer \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+    --use-ascend-coc \
+    --coc-fused-kernel \
+"
+
 GPT_ARGS="
     --use-mcore-models \
     --tensor-model-parallel-size ${TP} \
@@ -51,7 +63,6 @@ GPT_ARGS="
     --micro-batch-size ${MBS} \
     --global-batch-size ${GBS} \
     --make-vocab-size-divisible-by 1 \
-    --use-flash-attn \
     --padded-vocab-size 151936 \
     --rotary-base 1000000 \
     --disable-bias-linear \
@@ -105,6 +116,7 @@ TUNE_ARGS="
 "
 
 torchrun $DISTRIBUTED_ARGS posttrain_gpt.py \
+    $DISTRIBUTED_ARGS \
     $GPT_ARGS \
     $DATA_ARGS \
     $OUTPUT_ARGS \
-- 
Gitee

From 41e943e58a91d8bb18f06d76de939b27dd9b9f1c Mon Sep 17 00:00:00 2001
From: HanhuiChen
Date: Thu, 7 Aug 2025 09:39:51 +0800
Subject: [PATCH 2/5] revert Qwen3-14b tune script changes

---
 .../mcore/qwen3/tune_qwen3_14b_4K_full_ptd.sh | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/examples/mcore/qwen3/tune_qwen3_14b_4K_full_ptd.sh b/examples/mcore/qwen3/tune_qwen3_14b_4K_full_ptd.sh
index 0b69f14834..c8d3c87a47 100644
--- a/examples/mcore/qwen3/tune_qwen3_14b_4K_full_ptd.sh
+++ b/examples/mcore/qwen3/tune_qwen3_14b_4K_full_ptd.sh
@@ -19,8 +19,8 @@ TOKENIZER_PATH="your tokenizer path"
 
 TP=8
 PP=1
-MBS=4
-GBS=128
+MBS=1
+GBS=16
 SEQ_LENGTH=4096
 TRAIN_ITERS=2000
 
@@ -32,18 +32,6 @@ DISTRIBUTED_ARGS="
     --master_port $MASTER_PORT
 "
 
-OPTIMIZE_ARGS="
-    --use-flash-attn \
-    --use-fused-rotary-pos-emb \
-    --use-fused-swiglu \
-    --use-fused-rmsnorm \
-    --use-distributed-optimizer \
-    --overlap-grad-reduce \
-    --overlap-param-gather \
-    --use-ascend-coc \
-    --coc-fused-kernel \
-"
-
 GPT_ARGS="
     --use-mcore-models \
     --tensor-model-parallel-size ${TP} \
@@ -63,6 +51,7 @@ GPT_ARGS="
     --micro-batch-size ${MBS} \
     --global-batch-size ${GBS} \
     --make-vocab-size-divisible-by 1 \
+    --use-flash-attn \
     --padded-vocab-size 151936 \
     --rotary-base 1000000 \
     --disable-bias-linear \
@@ -116,7 +105,6 @@ TUNE_ARGS="
 "
 
 torchrun $DISTRIBUTED_ARGS posttrain_gpt.py \
-    $DISTRIBUTED_ARGS \
     $GPT_ARGS \
     $DATA_ARGS \
     $OUTPUT_ARGS \
-- 
Gitee

From f777b5f8f5cbb6d31dcfc7dd9f1d31462fd515c2 Mon Sep 17 00:00:00 2001
From: HanhuiChen
Date: Thu, 7 Aug 2025 16:52:48 +0800
Subject: [PATCH 3/5] upload Qwen3-14b pretrain script for A3

---
 .../qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh     | 129 ++++++++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100644 examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh

diff --git a/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh b/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh
new file mode 100644
index 0000000000..3f65bd7973
--- /dev/null
+++ b/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+
+# Change for multinode config
+NPUS_PER_NODE=16
+MASTER_ADDR=localhost
+MASTER_PORT=6015
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+# please fill these path configurations
+CKPT_LOAD_DIR="your model ckpt path"
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+
+TP=8
+PP=1
+CP=1
+MBS=4
+GBS=128
+SEQ_LENGTH=4096
+TRAIN_ITERS=2000
+CP_TYPE='ulysses_cp_algo'
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+OPTIMIZE_ARGS="
+    --use-flash-attn \
+    --use-fused-rotary-pos-emb \
+    --use-fused-swiglu \
+    --use-fused-rmsnorm \
+    --use-distributed-optimizer \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+    --use-ascend-coc \
+    --coc-fused-kernel \
+"
+
+TRAIN_ARGS="
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --lr 1.25e-6 \
+    --min-lr 1.25e-7 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --seed 42 \
+    --bf16 \
+    --train-iters ${TRAIN_ITERS} \
+    --seq-length ${SEQ_LENGTH} \
+"
+
+MODEL_PARALLEL_ARGS="
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --context-parallel-size ${CP} \
+    --context-parallel-algo ${CP_TYPE} \
+"
+
+GPT_ARGS="
+    --use-mcore-models \
+    --spec mindspeed_llm.tasks.models.spec.qwen3_spec layer_spec \
+    --kv-channels 128 \
+    --qk-layernorm \
+    --num-layers 40 \
+    --hidden-size 5120 \
+    --untie-embeddings-and-output-weights \
+    --num-attention-heads 40 \
+    --ffn-hidden-size 17408 \
+    --max-position-embeddings 40960 \
+    --make-vocab-size-divisible-by 1 \
+    --padded-vocab-size 151936 \
+    --use-rotary-position-embeddings \
+    --rotary-base 1000000 \
+    --disable-bias-linear \
+    --swiglu \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --normalization RMSNorm \
+    --position-embedding-type rope \
+    --norm-epsilon 1e-6 \
+    --attention-softmax-in-fp32 \
+    --exit-on-missing-checkpoint \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --no-load-optim \
+    --no-load-rng \
+    --sequence-parallel
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 100,0,0 \
+    --reset-position-ids
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval ${TRAIN_ITERS} \
+    --eval-interval ${TRAIN_ITERS} \
+    --eval-iters 0 \
+    --log-throughput
+"
+
+torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    $OPTIMIZE_ARGS \
+    $TRAIN_ARGS \
+    $MODEL_PARALLEL_ARGS \
+    --distributed-backend nccl \
+    --load ${CKPT_LOAD_DIR} \
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_mcore_qwen3_14b.log
-- 
Gitee

From 8d3411ef4d6caec9c57bf49e45c9a86363465385 Mon Sep 17 00:00:00 2001
From: HanhuiChen
Date: Fri, 8 Aug 2025 10:56:35 +0800
Subject: [PATCH 4/5] remove --reset-position-ids from Qwen3-14b A3 script

---
 examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh b/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh
index 3f65bd7973..ac61480bd5 100644
--- a/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh
+++ b/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh
@@ -105,7 +105,6 @@ GPT_ARGS="
 DATA_ARGS="
     --data-path $DATA_PATH \
     --split 100,0,0 \
-    --reset-position-ids
 "
 
 OUTPUT_ARGS="
-- 
Gitee

From 28f98ef5b3e008aacf003faa85c90ce2bf046f9d Mon Sep 17 00:00:00 2001
From: HanhuiChen
Date: Fri, 8 Aug 2025 15:35:08 +0800
Subject: [PATCH 5/5] remove --exit-on-missing-checkpoint from Qwen3-14b
 pretrain scripts

---
 examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd.sh    | 1 -
 examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh | 1 -
 2 files changed, 2 deletions(-)

diff --git a/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd.sh b/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd.sh
index acb4f9a688..0a1949175d 100644
--- a/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd.sh
+++ b/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd.sh
@@ -95,7 +95,6 @@ GPT_ARGS="
     --norm-epsilon 1e-6 \
     --no-gradient-accumulation-fusion \
     --attention-softmax-in-fp32 \
-    --exit-on-missing-checkpoint \
     --group-query-attention \
     --num-query-groups 8 \
     --no-load-optim \
diff --git a/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh b/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh
index ac61480bd5..83a57e7e85 100644
--- a/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh
+++ b/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh
@@ -94,7 +94,6 @@ GPT_ARGS="
     --position-embedding-type rope \
     --norm-epsilon 1e-6 \
     --attention-softmax-in-fp32 \
-    --exit-on-missing-checkpoint \
     --group-query-attention \
     --num-query-groups 8 \
    --no-load-optim \
-- 
Gitee
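
For reference, a minimal sketch of applying this series and launching the new A3
script. The mbox filename qwen3-14b-series.patch is a placeholder for wherever
you save the patches above, and the commands assume they run from the
MindSpeed-LLM repository root after the path variables at the top of the script
have been filled in:

    # Apply all five patches in order; git am keeps author and message metadata.
    git am qwen3-14b-series.patch

    # The script tees its output into logs/, so create the directory first.
    mkdir -p logs
    bash examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh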