diff --git a/examples/mcore/qwen3/pretrain_qwen3_8b_4K_ptd.sh b/examples/mcore/qwen3/pretrain_qwen3_8b_4K_ptd.sh
index d42111d11717e6b2ade3ba19db210d231ef8ad10..556f82134ad8c99c2c09b836ae94752d730350c7 100644
--- a/examples/mcore/qwen3/pretrain_qwen3_8b_4K_ptd.sh
+++ b/examples/mcore/qwen3/pretrain_qwen3_8b_4K_ptd.sh
@@ -2,7 +2,9 @@
 export HCCL_CONNECT_TIMEOUT=1800
 export CUDA_DEVICE_MAX_CONNECTIONS=1
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
 export NPU_ASD_ENABLE=0
+export TASK_QUEUE_ENABLE=2
 
 NPUS_PER_NODE=8
 MASTER_ADDR=localhost
@@ -18,13 +20,12 @@ TOKENIZER_PATH="your tokenizer path"
 CKPT_LOAD_DIR="your model ckpt path"
 
 TP=1
-PP=4
+PP=2
 CP=1
 MBS=1
 GBS=64
 SEQ_LENGTH=4096
 TRAIN_ITERS=2000
-CP_TYPE='ulysses_cp_algo'
 
 DISTRIBUTED_ARGS="
     --nproc_per_node $NPUS_PER_NODE \
@@ -41,7 +42,11 @@ OPTIMIZE_ARGS="
     --use-fused-swiglu \
     --use-fused-rmsnorm \
     --no-masked-softmax-fusion \
-    --use-distributed-optimizer
+    --use-distributed-optimizer \
+    --reuse-fp32-param \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+    --use-ascend-coc
 "
 
 TRAIN_ARGS="
@@ -62,15 +67,12 @@ TRAIN_ARGS="
     --seed 42 \
     --bf16 \
     --train-iters ${TRAIN_ITERS} \
-    --seq-length ${SEQ_LENGTH} \
-    --no-shared-storage
+    --seq-length ${SEQ_LENGTH}
 "
 
 MODEL_PARALLEL_ARGS="
     --tensor-model-parallel-size ${TP} \
-    --pipeline-model-parallel-size ${PP} \
-    --context-parallel-size ${CP} \
-    --context-parallel-algo ${CP_TYPE} \
+    --pipeline-model-parallel-size ${PP}
 "
 
 GPT_ARGS="
@@ -95,7 +97,8 @@ GPT_ARGS="
     --attention-softmax-in-fp32 \
     --no-gradient-accumulation-fusion \
     --group-query-attention \
-    --num-query-groups 8
+    --num-query-groups 8 \
+    --norm-epsilon 1e-6 \
 "
 
 DATA_ARGS="
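
A minimal sanity-check sketch for the resulting parallel layout; not part of the patch itself. It assumes the single-node, 8-NPU topology configured above and derives the data-parallel size and gradient-accumulation steps using the usual Megatron-style conventions:

#!/bin/bash
# Values copied from the script after this change.
NPUS_PER_NODE=8; NNODES=1
TP=1; PP=2; CP=1          # --context-parallel-size is no longer passed, so CP defaults to 1
MBS=1; GBS=64

WORLD_SIZE=$((NPUS_PER_NODE * NNODES))   # 8
DP=$((WORLD_SIZE / (TP * PP * CP)))      # 8 / (1*2*1) = 4
ACC_STEPS=$((GBS / (MBS * DP)))          # 64 / (1*4) = 16
echo "data-parallel size: ${DP}, grad-accumulation steps per iteration: ${ACC_STEPS}"

With PP halved from 4 to 2, the data-parallel size doubles from 2 to 4, so each iteration now accumulates 16 micro-batches instead of 32 to reach the global batch size of 64.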