From 045e046682ce3f781682810e7e5edbfd9311cfd8 Mon Sep 17 00:00:00 2001
From: kongdeshuo <1670690897@qq.com>
Date: Fri, 4 Jul 2025 10:50:04 +0800
Subject: [PATCH] add glm pretrain case

---
 .../st/baseline_results/glm_pretrain.json |  15 +++
 .../st/shell_scripts/glm_pretrain.sh      | 113 ++++++++++++++++++
 2 files changed, 128 insertions(+)
 create mode 100644 tests/mindspore/st/baseline_results/glm_pretrain.json
 create mode 100644 tests/mindspore/st/shell_scripts/glm_pretrain.sh

diff --git a/tests/mindspore/st/baseline_results/glm_pretrain.json b/tests/mindspore/st/baseline_results/glm_pretrain.json
new file mode 100644
index 000000000..a934a96a3
--- /dev/null
+++ b/tests/mindspore/st/baseline_results/glm_pretrain.json
@@ -0,0 +1,15 @@
+{
+    "lm loss": [
+        7.9374980926513672,
+        7.9206676483154297,
+        7.9393277168273926,
+        7.9209604263305664,
+        7.8812708854675293,
+        7.8973255157470703,
+        7.8921060562133789,
+        7.8614082336425781,
+        7.8985671997070312,
+        7.8953104019165039,
+        7.8852210044860840
+    ]
+}
\ No newline at end of file
diff --git a/tests/mindspore/st/shell_scripts/glm_pretrain.sh b/tests/mindspore/st/shell_scripts/glm_pretrain.sh
new file mode 100644
index 000000000..00956930b
--- /dev/null
+++ b/tests/mindspore/st/shell_scripts/glm_pretrain.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved.
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export HCCL_CONNECT_TIMEOUT=3600
+export HCCL_ALGO="alltoall=level0:NA;level1:pipeline"
+export HCCL_BUFFSIZE=400
+export HCCL_DETERMINISTIC=true
+export ASCEND_LAUNCH_BLOCKING=1
+export NCCL_DETERMINISTIC=1
+
+basepath=$(cd `dirname $0`; cd ../../../../; pwd)
+GPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6142
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+
+DATA_PATH="/data/mindspore/st/test_glm_pretrain/dataset/dataset/alpaca_test_document"
+TOKENIZER_PATH="/data/mindspore/st/test_glm_pretrain/tokenizer"
+CKPT_LOAD_DIR="/data/mindspore/st/test_glm_pretrain/ckpt_ut"
+
+TP=2
+PP=2
+
+DISTRIBUTED_ARGS="
+    --local_worker_num $GPUS_PER_NODE \
+    --worker_num $WORLD_SIZE \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
+    --join True
+"
+
+GPT_ARGS="
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --sequence-parallel \
+    --use-mcore-models \
+    --use-flash-attn \
+    --use-fused-rmsnorm \
+    --use-fused-swiglu \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+    --use-distributed-optimizer \
+    --num-layers 4 \
+    --hidden-size 1024 \
+    --ffn-hidden-size 1024 \
+    --num-attention-heads 32 \
+    --seq-length 1024 \
+    --micro-batch-size 1 \
+    --global-batch-size 4 \
+    --max-position-embeddings 1024 \
+    --padded-vocab-size 4096 \
+    --make-vocab-size-divisible-by 1 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --disable-bias-linear \
+    --add-qkv-bias \
+    --position-embedding-type rope \
+    --use-glm-rope \
+    --rotary-percent 0.5 \
+    --no-rope-fusion \
+    --normalization RMSNorm \
+    --swiglu \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --lr 1.25e-6 \
+    --norm-epsilon 1.5625e-07 \
+    --train-iters 10 \
+    --lr-decay-style cosine \
+    --untie-embeddings-and-output-weights \
+    --attention-dropout 0.0 \
+    --init-method-std 0.01 \
+    --hidden-dropout 0.0 \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --min-lr 1.25e-7 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --initial-loss-scale 4096 \
+    --adam-beta2 0.95 \
+    --no-load-optim \
+    --no-load-rng \
+    --no-gradient-accumulation-fusion \
+    --no-bias-swiglu-fusion \
+    --bf16 \
+    --finetune
+"
+
+DATA_ARGS="
+    --data-path ${DATA_PATH} \
+    --split 100,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 2000 \
+    --eval-interval 2000 \
+    --eval-iters 0 \
+"
+
+msrun ${DISTRIBUTED_ARGS} pretrain_gpt.py \
+    ${GPT_ARGS} \
+    ${DATA_ARGS} \
+    ${OUTPUT_ARGS} \
+    --load ${CKPT_LOAD_DIR} \
+    --distributed-backend nccl \
+    --ai-framework mindspore \
-- 
Gitee
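
A minimal sketch of how the new case might be exercised locally, assuming an 8-NPU host with the dataset, tokenizer, and checkpoint already staged under /data/mindspore/st/test_glm_pretrain as hard-coded in the script; the working directory, the log file name, and the log-scraping step are assumptions, since the patch itself only adds the shell script and its baseline JSON:

    # Assumed manual run from the repository root (the ST harness normally drives this).
    bash tests/mindspore/st/shell_scripts/glm_pretrain.sh 2>&1 | tee glm_pretrain.log
    # The per-iteration "lm loss" values printed in the training log are what the
    # baseline file tests/mindspore/st/baseline_results/glm_pretrain.json records.
    grep "lm loss" glm_pretrain.log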