diff --git a/examples/mcore/qwen3/pretrain_qwen3_32b_4K_ptd.sh b/examples/mcore/qwen3/pretrain_qwen3_32b_4K_ptd.sh
index f58f0a23bafd9a63e9160ca7d8a6cb9eb5565b01..d83c0735da79df330a3f0c90887459c0041c62ee 100644
--- a/examples/mcore/qwen3/pretrain_qwen3_32b_4K_ptd.sh
+++ b/examples/mcore/qwen3/pretrain_qwen3_32b_4K_ptd.sh
@@ -1,6 +1,5 @@
 export HCCL_CONNECT_TIMEOUT=1800
 export CUDA_DEVICE_MAX_CONNECTIONS=1
-export NPU_ASD_ENABLE=0
 
 NPUS_PER_NODE=8
 MASTER_ADDR=localhost
@@ -17,14 +16,12 @@ CKPT_LOAD_DIR="your model ckpt path"
 
 TP=8
 PP=2
-CP=1
-VPP=4
+VPP=8
 
-MBS=1
-GBS=128
+MBS=2
+GBS=256
 SEQ_LENGTH=4096
 TRAIN_ITERS=2000
-CP_TYPE='ulysses_cp_algo'
 
 DISTRIBUTED_ARGS="
     --nproc_per_node $NPUS_PER_NODE \
@@ -47,7 +44,7 @@ OPTIMIZE_ARGS="
 "
 
 TRAIN_ARGS="
-    --micro-batch-size 1 \
+    --micro-batch-size ${MBS} \
     --global-batch-size ${GBS} \
     --lr 1.25e-6 \
     --lr-decay-style cosine \
@@ -72,12 +69,11 @@ MODEL_PARALLEL_ARGS="
     --tensor-model-parallel-size ${TP} \
     --pipeline-model-parallel-size ${PP} \
     --num-layers-per-virtual-pipeline-stage ${VPP} \
-    --context-parallel-size ${CP} \
-    --context-parallel-algo ${CP_TYPE} \
 "
 
 GPT_ARGS="
     --use-mcore-models \
+    --sequence-parallel \
     --spec mindspeed_llm.tasks.models.spec.qwen3_spec layer_spec \
     --kv-channels 128 \
     --qk-layernorm \
@@ -99,7 +95,8 @@ GPT_ARGS="
     --attention-softmax-in-fp32 \
     --no-gradient-accumulation-fusion \
     --group-query-attention \
-    --num-query-groups 8
+    --num-query-groups 8 \
+    --reset-position-ids
 "
 
 DATA_ARGS="
@@ -113,7 +110,8 @@ OUTPUT_ARGS="
     --eval-interval ${TRAIN_ITERS} \
     --eval-iters 0 \
     --no-load-optim \
-    --no-load-rng
+    --no-load-rng \
+    --log-throughput
 "
 
 torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
@@ -126,4 +124,4 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     --distributed-backend nccl \
     --load ${CKPT_LOAD_DIR} \
     --save ${CKPT_SAVE_DIR} \
-    | tee logs/train_mcore_qwen3_32b.log
\ No newline at end of file
+    | tee logs/train_mcore_qwen3_32b.log
diff --git a/examples/mcore/qwen3/pretrain_qwen3_32b_4K_ptd_A3.sh b/examples/mcore/qwen3/pretrain_qwen3_32b_4K_ptd_A3.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1cfd82b548f19a962b0867907f4e8efd07f4b9aa
--- /dev/null
+++ b/examples/mcore/qwen3/pretrain_qwen3_32b_4K_ptd_A3.sh
@@ -0,0 +1,129 @@
+export HCCL_CONNECT_TIMEOUT=1800
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+NPUS_PER_NODE=16
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+# please fill these path configurations
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+CKPT_LOAD_DIR="your model ckpt path"
+
+TP=8
+PP=2
+VPP=4
+
+MBS=2
+GBS=256
+SEQ_LENGTH=4096
+TRAIN_ITERS=2000
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+OPTIMIZE_ARGS="
+    --use-flash-attn \
+    --use-fused-rotary-pos-emb \
+    --use-rotary-position-embeddings \
+    --use-fused-swiglu \
+    --use-fused-rmsnorm \
+    --no-masked-softmax-fusion \
+    --use-distributed-optimizer \
+    --overlap-grad-reduce \
+    --overlap-param-gather
+"
+
+TRAIN_ARGS="
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --lr 1.25e-6 \
+    --lr-decay-style cosine \
+    --min-lr 1.25e-7 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --attention-dropout 0.0 \
+    --init-method-std 0.01 \
+    --hidden-dropout 0.0 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --initial-loss-scale 4096 \
+    --seed 42 \
+    --bf16 \
+    --train-iters ${TRAIN_ITERS} \
+    --seq-length ${SEQ_LENGTH} \
+    --no-shared-storage
+"
+
+MODEL_PARALLEL_ARGS="
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --num-layers-per-virtual-pipeline-stage ${VPP} \
+"
+
+GPT_ARGS="
+    --use-mcore-models \
+    --sequence-parallel \
+    --spec mindspeed_llm.tasks.models.spec.qwen3_spec layer_spec \
+    --kv-channels 128 \
+    --qk-layernorm \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --max-position-embeddings ${SEQ_LENGTH} \
+    --num-layers 64 \
+    --hidden-size 5120 \
+    --ffn-hidden-size 25600 \
+    --num-attention-heads 64 \
+    --tokenizer-type PretrainedFromHF \
+    --make-vocab-size-divisible-by 1 \
+    --padded-vocab-size 151936 \
+    --rotary-base 1000000 \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+    --position-embedding-type rope \
+    --normalization RMSNorm \
+    --swiglu \
+    --attention-softmax-in-fp32 \
+    --no-gradient-accumulation-fusion \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --reset-position-ids \
+    --use-ascend-coc \
+    --coc-fused-kernel
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 100,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval ${TRAIN_ITERS} \
+    --eval-interval ${TRAIN_ITERS} \
+    --eval-iters 0 \
+    --no-load-optim \
+    --no-load-rng \
+    --log-throughput
+"
+
+torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    $OPTIMIZE_ARGS \
+    $TRAIN_ARGS \
+    $MODEL_PARALLEL_ARGS \
+    --distributed-backend nccl \
+    --load ${CKPT_LOAD_DIR} \
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_mcore_qwen3_32b_4k_A3.log
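Optional pre-flight check (illustrative only, not part of either script above): the snippet below restates the usual Megatron-style divisibility rules for the A3 layout, i.e. TP=8, PP=2, VPP=4 (layers per virtual pipeline stage), MBS=2, GBS=256 on one 16-NPU node. It assumes the variables already defined in pretrain_qwen3_32b_4K_ptd_A3.sh; NUM_LAYERS is hard-coded here to mirror --num-layers 64.

# sanity-check the parallel layout before calling torchrun (hypothetical helper, not in the scripts)
NUM_LAYERS=64                                    # mirrors --num-layers 64 in GPT_ARGS
if (( WORLD_SIZE % (TP * PP) != 0 )); then
    echo "WORLD_SIZE must be divisible by TP * PP" >&2; exit 1
fi
DP=$(( WORLD_SIZE / (TP * PP) ))                 # data-parallel size: 16 / (8 * 2) = 1
if (( GBS % (MBS * DP) != 0 )); then
    echo "GBS must be divisible by MBS * DP" >&2; exit 1
fi
if (( (NUM_LAYERS / PP) % VPP != 0 )); then
    echo "layers per pipeline stage (NUM_LAYERS / PP) must split evenly into VPP-sized chunks" >&2; exit 1
fi
echo "TP=${TP} PP=${PP} DP=${DP} VPP=${VPP}, gradient-accumulation steps per iteration: $(( GBS / (MBS * DP) ))"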