From cae418c05f96b87679593a0445e0e32211dc2c5d Mon Sep 17 00:00:00 2001
From: jiangzhihan1
Date: Fri, 28 Feb 2025 14:51:27 +0800
Subject: [PATCH] Add deepseekv3 384die full-parameter fine-tuning script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...ne_deepseek3_671b_4k_384die_full_A3_ptd.sh | 179 ++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 examples/mcore/deepseek3/tune_deepseek3_671b_4k_384die_full_A3_ptd.sh

diff --git a/examples/mcore/deepseek3/tune_deepseek3_671b_4k_384die_full_A3_ptd.sh b/examples/mcore/deepseek3/tune_deepseek3_671b_4k_384die_full_A3_ptd.sh
new file mode 100644
index 0000000000..1231a64a20
--- /dev/null
+++ b/examples/mcore/deepseek3/tune_deepseek3_671b_4k_384die_full_A3_ptd.sh
@@ -0,0 +1,179 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export HCCL_CONNECT_TIMEOUT=3600
+
+GPUS_PER_NODE=16
+MASTER_ADDR=localhost # master node IP
+MASTER_PORT=6000
+NNODES=24
+NODE_RANK=0 # rank of this node: 0..NNODES-1
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+CKPT_LOAD_DIR="your model ckpt path"
+
+TP=4   # tensor model parallel size
+PP=12  # pipeline model parallel size
+EP=8   # expert model parallel size
+CP=1   # context parallel size
+CP_TYPE='ulysses_cp_algo'
+NUM_LAYERS=61
+SEQ_LEN=4096
+MBS=1   # micro batch size
+GBS=384 # global batch size
+
+
+
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+MLA_ARGS="
+    --multi-head-latent-attention \
+    --qk-rope-head-dim 64 \
+    --qk-nope-head-dim 128 \
+    --q-lora-rank 1536 \
+    --kv-lora-rank 512 \
+    --v-head-dim 128 \
+    --qk-layernorm \
+"
+
+MOE_ARGS="
+    --moe-grouped-gemm \
+    --moe-permutation-async-comm \
+    --use-fused-moe-token-permute-and-unpermute \
+    --moe-token-dispatcher-type alltoall \
+    --first-k-dense-replace 3 \
+    --moe-layer-freq 1 \
+    --n-shared-experts 1 \
+    --n-group 8 \
+    --num-experts 256 \
+    --moe-router-topk 8 \
+    --moe-intermediate-size 2048 \
+    --moe-router-load-balancing-type noaux_tc \
+    --topk-group 4 \
+    --routed-scaling-factor 2.5 \
+    --norm-topk-prob \
+    --moe-router-score-function sigmoid \
+    --moe-router-enable-expert-bias \
+"
+
+ROPE_ARGS="
+    --rope-scaling-beta-fast 32 \
+    --rope-scaling-beta-slow 1 \
+    --rope-scaling-factor 40 \
+    --rope-scaling-mscale 1.0 \
+    --rope-scaling-mscale-all-dim 1.0 \
+    --rope-scaling-original-max-position-embeddings 4096 \
+    --rope-scaling-type yarn
+"
+
+GPT_ARGS="
+    --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \
+    --recompute-granularity full \
+    --recompute-method uniform \
+    --recompute-num-layers 1 \
+    --no-shared-storage \
+    --use-distributed-optimizer \
+    --reuse-fp32-param \
+    --use-flash-attn \
+    --shape-order BNSD \
+    --num-layer-list 5,5,5,5,5,5,5,5,5,6,6,4 \
+    --use-mcore-models \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --expert-model-parallel-size ${EP} \
+    --sequence-parallel \
+    --context-parallel-size ${CP} \
+    --context-parallel-algo ${CP_TYPE} \
+    --num-layers ${NUM_LAYERS} \
+    --hidden-size 7168 \
+    --ffn-hidden-size 18432 \
+    --num-attention-heads 128 \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --seq-length ${SEQ_LEN} \
+    --max-position-embeddings 163840 \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --make-vocab-size-divisible-by 1 \
+    --lr 1.0e-5 \
+    --train-iters 2000 \
+    --lr-decay-style cosine \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+    --attention-dropout 0.0 \
+    --init-method-std 0.02 \
+    --hidden-dropout 0.0 \
+    --position-embedding-type rope \
+    --normalization RMSNorm \
+    --use-fused-rotary-pos-emb \
+    --use-rotary-position-embeddings \
+    --use-fused-swiglu \
+    --use-fused-rmsnorm \
+    --swiglu \
+    --no-masked-softmax-fusion \
+    --no-gradient-accumulation-fusion \
+    --attention-softmax-in-fp32 \
+    --min-lr 1.0e-7 \
+    --weight-decay 1e-2 \
+    --lr-warmup-iters 100 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.999 \
+    --initial-loss-scale 65536 \
+    --vocab-size 129280 \
+    --padded-vocab-size 129280 \
+    --rotary-base 10000 \
+    --norm-epsilon 1e-6 \
+    --no-load-optim \
+    --no-load-rng \
+    --bf16 \
+    --distributed-timeout-minutes 120 \
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 100,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 500 \
+    --eval-interval 500 \
+    --eval-iters 0 \
+    --no-save-optim \
+    --no-save-rng \
+    --load ${CKPT_LOAD_DIR} \
+    --save $CKPT_SAVE_DIR \
+"
+
+
+FINETUNE_ARGS="
+    --finetune \
+    --stage sft \
+    --is-instruction-dataset \
+    --variable-seq-lengths \
+    --prompt-type deepseek3 \
+"
+
+
+python -m torch.distributed.launch $DISTRIBUTED_ARGS posttrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    $MLA_ARGS \
+    $ROPE_ARGS \
+    $MOE_ARGS \
+    $FINETUNE_ARGS \
+    --distributed-backend nccl \
+    | tee tune_deepseek3_671b_4k_full_384_ptd.log
+
--
Gitee
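
For reference, a minimal per-node launch sketch (not part of the patch): the script hard-codes MASTER_ADDR and NODE_RANK, so each of the 24 nodes needs its own values before running. The IP 10.0.0.1 and the variable i below are illustrative placeholders only.

    # On node i (i = 0..23), assuming the rank-0 node's IP is 10.0.0.1 (placeholder):
    SCRIPT=examples/mcore/deepseek3/tune_deepseek3_671b_4k_384die_full_A3_ptd.sh
    sed -i "s/^MASTER_ADDR=.*/MASTER_ADDR=10.0.0.1/" "$SCRIPT"   # same value on every node
    sed -i "s/^NODE_RANK=.*/NODE_RANK=$i/" "$SCRIPT"             # unique rank per node
    bash "$SCRIPT"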