diff --git a/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd.sh b/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd.sh
index 918d8f4c0a412ca0500e7911edacdabd3193dae6..0a1949175d0ce11eb2999d1f532a29a7537563ef 100644
--- a/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd.sh
+++ b/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd.sh
@@ -17,10 +17,10 @@ CKPT_SAVE_DIR="your model save ckpt path"
 DATA_PATH="your data path"
 TOKENIZER_PATH="your tokenizer path"
 
-TP=4
-PP=2
+TP=8
+PP=1
 CP=1
-MBS=1
+MBS=4
 GBS=128
 SEQ_LENGTH=4096
 TRAIN_ITERS=2000
@@ -37,11 +37,13 @@ DISTRIBUTED_ARGS="
 OPTIMIZE_ARGS="
     --use-flash-attn \
     --use-fused-rotary-pos-emb \
-    --use-rotary-position-embeddings \
     --use-fused-swiglu \
     --use-fused-rmsnorm \
-    --no-masked-softmax-fusion \
-    --use-distributed-optimizer
+    --use-distributed-optimizer \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+    --use-ascend-coc \
+    --coc-fused-kernel \
 "
 
 TRAIN_ARGS="
@@ -82,6 +84,7 @@ GPT_ARGS="
     --max-position-embeddings 40960 \
     --make-vocab-size-divisible-by 1 \
     --padded-vocab-size 151936 \
+    --use-rotary-position-embeddings \
     --rotary-base 1000000 \
     --disable-bias-linear \
     --swiglu \
@@ -92,7 +95,6 @@ GPT_ARGS="
     --norm-epsilon 1e-6 \
     --no-gradient-accumulation-fusion \
     --attention-softmax-in-fp32 \
-    --exit-on-missing-checkpoint \
     --group-query-attention \
     --num-query-groups 8 \
     --no-load-optim \
diff --git a/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh b/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh
new file mode 100644
index 0000000000000000000000000000000000000000..83a57e7e85f67e2cd3c3ec6dc038145f7ab3b772
--- /dev/null
+++ b/examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh
@@ -0,0 +1,127 @@
+#!/bin/bash
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+
+# Change for multinode config
+NPUS_PER_NODE=16
+MASTER_ADDR=localhost
+MASTER_PORT=6015
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+# please fill these path configurations
+CKPT_LOAD_DIR="your model ckpt path"
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+
+TP=8
+PP=1
+CP=1
+MBS=4
+GBS=128
+SEQ_LENGTH=4096
+TRAIN_ITERS=2000
+CP_TYPE='ulysses_cp_algo'
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+OPTIMIZE_ARGS="
+    --use-flash-attn \
+    --use-fused-rotary-pos-emb \
+    --use-fused-swiglu \
+    --use-fused-rmsnorm \
+    --use-distributed-optimizer \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+    --use-ascend-coc \
+    --coc-fused-kernel \
+"
+
+TRAIN_ARGS="
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --lr 1.25e-6 \
+    --min-lr 1.25e-7 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --seed 42 \
+    --bf16 \
+    --train-iters ${TRAIN_ITERS} \
+    --seq-length ${SEQ_LENGTH} \
+"
+
+MODEL_PARALLEL_ARGS="
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --context-parallel-size ${CP} \
+    --context-parallel-algo ${CP_TYPE} \
+"
+
+GPT_ARGS="
+    --use-mcore-models \
+    --spec mindspeed_llm.tasks.models.spec.qwen3_spec layer_spec \
+    --kv-channels 128 \
+    --qk-layernorm \
+    --num-layers 40 \
+    --hidden-size 5120 \
+    --untie-embeddings-and-output-weights \
+    --num-attention-heads 40 \
+    --ffn-hidden-size 17408 \
+    --max-position-embeddings 40960 \
+    --make-vocab-size-divisible-by 1 \
+    --padded-vocab-size 151936 \
+    --use-rotary-position-embeddings \
+    --rotary-base 1000000 \
+    --disable-bias-linear \
+    --swiglu \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --normalization RMSNorm \
+    --position-embedding-type rope \
+    --norm-epsilon 1e-6 \
+    --attention-softmax-in-fp32 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --no-load-optim \
+    --no-load-rng \
+    --sequence-parallel
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 100,0,0 \
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval ${TRAIN_ITERS} \
+    --eval-interval ${TRAIN_ITERS} \
+    --eval-iters 0 \
+    --log-throughput
+"
+
+torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    $OPTIMIZE_ARGS \
+    $TRAIN_ARGS \
+    $MODEL_PARALLEL_ARGS \
+    --distributed-backend nccl \
+    --load ${CKPT_LOAD_DIR} \
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_mcore_qwen3_14b.log
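
For reference, the new A3 script is launched the same way as the existing Qwen3-14B example. A minimal usage sketch, assuming the repository checkout directory (here called MindSpeed-LLM) and assuming the placeholder paths (CKPT_LOAD_DIR, CKPT_SAVE_DIR, DATA_PATH, TOKENIZER_PATH) have been filled in; the logs directory must exist because the script pipes output through tee:

    # run from the repository root (hypothetical directory name)
    cd MindSpeed-LLM
    # the script writes to logs/train_mcore_qwen3_14b.log via tee
    mkdir -p logs
    bash examples/mcore/qwen3/pretrain_qwen3_14b_4K_ptd_A3.sh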