From dd5bb185c62ae219218aae8ac39f405fc3ce22b6 Mon Sep 17 00:00:00 2001
From: liuchangkun
Date: Fri, 20 Jun 2025 17:12:47 +0800
Subject: [PATCH] qwen25-32b-16k/32k sft pack

---
 .../qwen25/tune_qwen25_32b_16k_full_pack.sh  | 123 +++++++++++++++++
 .../qwen25/tune_qwen25_32b_32k_full_pack.sh  | 124 ++++++++++++++++++
 2 files changed, 247 insertions(+)
 create mode 100644 examples/mcore/qwen25/tune_qwen25_32b_16k_full_pack.sh
 create mode 100644 examples/mcore/qwen25/tune_qwen25_32b_32k_full_pack.sh

diff --git a/examples/mcore/qwen25/tune_qwen25_32b_16k_full_pack.sh b/examples/mcore/qwen25/tune_qwen25_32b_16k_full_pack.sh
new file mode 100644
index 000000000..e7b170984
--- /dev/null
+++ b/examples/mcore/qwen25/tune_qwen25_32b_16k_full_pack.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# Change for multinode config
+NPUS_PER_NODE=16
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=2
+NODE_RANK=0
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+# please fill these path configurations
+CKPT_LOAD_DIR="your model ckpt path"
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+
+TP=8
+PP=2
+MBS=1
+GBS=64
+SEQ_LEN=16384
+TRAIN_ITERS=2000
+NUM_LAYERS=64
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+TUNE_ARGS="
+    --finetune \
+    --stage sft \
+    --prompt-type qwen \
+    --reset-position-ids \
+    --is-instruction-dataset \
+    --neat-pack \
+    --padded-samples \
+"
+
+GPT_ARGS="
+    --recompute-activation-function \
+    --recompute-activation-function-num-layers 1 \
+    --num-layers-per-virtual-pipeline-stage 2 \
+    --use-mcore-models \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --use-distributed-optimizer \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --sequence-parallel \
+    --num-layers ${NUM_LAYERS} \
+    --hidden-size 5120 \
+    --ffn-hidden-size 27648 \
+    --num-attention-heads 40 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --seq-length ${SEQ_LEN} \
+    --max-position-embeddings ${SEQ_LEN} \
+    --make-vocab-size-divisible-by 1 \
+    --padded-vocab-size 152064 \
+    --rotary-base 1000000 \
+    --train-iters ${TRAIN_ITERS} \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+    --attention-dropout 0.0 \
+    --init-method-std 0.01 \
+    --hidden-dropout 0.0 \
+    --position-embedding-type rope \
+    --normalization RMSNorm \
+    --use-fused-rmsnorm \
+    --norm-epsilon 1e-5 \
+    --swiglu \
+    --use-flash-attn \
+    --use-fused-rotary-pos-emb \
+    --use-rotary-position-embeddings \
+    --use-fused-swiglu \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --lr 7.75e-7 \
+    --min-lr 7.75e-8 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --lr-decay-style cosine \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --add-qkv-bias \
+    --initial-loss-scale 4096 \
+    --no-gradient-accumulation-fusion \
+    --no-load-optim \
+    --no-load-rng \
+    --seed 42 \
+    --bf16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 100,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval ${TRAIN_ITERS} \
+    --eval-interval ${TRAIN_ITERS} \
+    --log-throughput \
+    --eval-iters 0 \
+"
+
+torchrun $DISTRIBUTED_ARGS posttrain_gpt.py \
+    $GPT_ARGS \
+    $TUNE_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --load ${CKPT_LOAD_DIR} \
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/tune_mcore_qwen25_32b_16k_full_pack.log
diff --git a/examples/mcore/qwen25/tune_qwen25_32b_32k_full_pack.sh b/examples/mcore/qwen25/tune_qwen25_32b_32k_full_pack.sh
new file mode 100644
index 000000000..091e6022b
--- /dev/null
+++ b/examples/mcore/qwen25/tune_qwen25_32b_32k_full_pack.sh
@@ -0,0 +1,124 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# Change for multinode config
+NPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=2
+NODE_RANK=0
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+# please fill these path configurations
+CKPT_LOAD_DIR="your model ckpt path"
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+
+TP=8
+PP=2
+MBS=1
+GBS=128
+SEQ_LEN=32768
+TRAIN_ITERS=2000
+NUM_LAYERS=64
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+TUNE_ARGS="
+    --finetune \
+    --stage sft \
+    --prompt-type qwen \
+    --reset-position-ids \
+    --is-instruction-dataset \
+    --neat-pack \
+    --padded-samples \
+"
+
+GPT_ARGS="
+    --recompute-granularity full \
+    --recompute-method block \
+    --recompute-activation-function-num-layers 14 \
+    --recompute-activation-function \
+    --use-mcore-models \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --use-distributed-optimizer \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --sequence-parallel \
+    --num-layers ${NUM_LAYERS} \
+    --hidden-size 5120 \
+    --ffn-hidden-size 27648 \
+    --num-attention-heads 40 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --seq-length ${SEQ_LEN} \
+    --max-position-embeddings ${SEQ_LEN} \
+    --make-vocab-size-divisible-by 1 \
+    --padded-vocab-size 152064 \
+    --rotary-base 1000000 \
+    --train-iters ${TRAIN_ITERS} \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+    --attention-dropout 0.0 \
+    --init-method-std 0.01 \
+    --hidden-dropout 0.0 \
+    --position-embedding-type rope \
+    --normalization RMSNorm \
+    --use-fused-rmsnorm \
+    --norm-epsilon 1e-5 \
+    --swiglu \
+    --use-flash-attn \
+    --use-fused-rotary-pos-emb \
+    --use-rotary-position-embeddings \
+    --use-fused-swiglu \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --lr 7.75e-7 \
+    --min-lr 7.75e-8 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --lr-decay-style cosine \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --add-qkv-bias \
+    --initial-loss-scale 4096 \
+    --no-gradient-accumulation-fusion \
+    --no-load-optim \
+    --no-load-rng \
+    --seed 42 \
+    --bf16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 100,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval ${TRAIN_ITERS} \
+    --eval-interval ${TRAIN_ITERS} \
+    --log-throughput \
+    --eval-iters 0 \
+"
+
+torchrun $DISTRIBUTED_ARGS posttrain_gpt.py \
+    $GPT_ARGS \
+    $TUNE_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --load ${CKPT_LOAD_DIR} \
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/tune_mcore_qwen25_32b_32k_full_pack.log
--
Gitee
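
A minimal two-node launch sketch for the 16k script, assuming the placeholder paths inside it have already been filled in; because MASTER_ADDR, MASTER_PORT, and NODE_RANK are hard-coded at the top of the script rather than read from the environment, they have to be edited on each node before launching:

# on the rank-0 node: set MASTER_ADDR in the script to this node's IP
# (instead of localhost), keep NODE_RANK=0, and make sure the log
# directory used by the final "tee" exists
mkdir -p logs
bash examples/mcore/qwen25/tune_qwen25_32b_16k_full_pack.sh

# on the second node: use the same MASTER_ADDR/MASTER_PORT, set
# NODE_RANK=1 in the script, then launch it the same way
mkdir -p logs
bash examples/mcore/qwen25/tune_qwen25_32b_16k_full_pack.sh

The 32k script is launched identically; only the per-node NPU count, batch size, recompute settings, and sequence length differ.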