From 045e046682ce3f781682810e7e5edbfd9311cfd8 Mon Sep 17 00:00:00 2001
From: kongdeshuo <1670690897@qq.com>
Date: Fri, 4 Jul 2025 10:50:04 +0800
Subject: [PATCH] add glm pretrain case

---
 .../st/baseline_results/glm_pretrain.json |  15 +++
 .../st/shell_scripts/glm_pretrain.sh      | 113 ++++++++++++++++++
 2 files changed, 128 insertions(+)
 create mode 100644 tests/mindspore/st/baseline_results/glm_pretrain.json
 create mode 100644 tests/mindspore/st/shell_scripts/glm_pretrain.sh

diff --git a/tests/mindspore/st/baseline_results/glm_pretrain.json b/tests/mindspore/st/baseline_results/glm_pretrain.json
new file mode 100644
index 000000000..a934a96a3
--- /dev/null
+++ b/tests/mindspore/st/baseline_results/glm_pretrain.json
@@ -0,0 +1,15 @@
+{
+    "lm loss": [
+        7.9374980926513672,
+        7.9206676483154297,
+        7.9393277168273926,
+        7.9209604263305664,
+        7.8812708854675293,
+        7.8973255157470703,
+        7.8921060562133789,
+        7.8614082336425781,
+        7.8985671997070312,
+        7.8953104019165039,
+        7.8852210044860840
+    ]
+}
\ No newline at end of file
diff --git a/tests/mindspore/st/shell_scripts/glm_pretrain.sh b/tests/mindspore/st/shell_scripts/glm_pretrain.sh
new file mode 100644
index 000000000..00956930b
--- /dev/null
+++ b/tests/mindspore/st/shell_scripts/glm_pretrain.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved.
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export HCCL_CONNECT_TIMEOUT=3600
+export HCCL_ALGO="alltoall=level0:NA;level1:pipeline"
+export HCCL_BUFFSIZE=400
+export HCCL_DETERMINISTIC=true
+export ASCEND_LAUNCH_BLOCKING=1
+export NCCL_DETERMINISTIC=1
+
+basepath=$(cd `dirname $0`; cd ../../../../; pwd)
+GPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6142
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+
+DATA_PATH="/data/mindspore/st/test_glm_pretrain/dataset/dataset/alpaca_test_document"
+TOKENIZER_PATH="/data/mindspore/st/test_glm_pretrain/tokenizer"
+CKPT_LOAD_DIR="/data/mindspore/st/test_glm_pretrain/ckpt_ut"
+
+TP=2
+PP=2
+
+DISTRIBUTED_ARGS="
+    --local_worker_num $GPUS_PER_NODE \
+    --worker_num $WORLD_SIZE \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
+    --join True
+"
+
+GPT_ARGS="
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --sequence-parallel \
+    --use-mcore-models \
+    --use-flash-attn \
+    --use-fused-rmsnorm \
+    --use-fused-swiglu \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+    --use-distributed-optimizer \
+    --num-layers 4 \
+    --hidden-size 1024 \
+    --ffn-hidden-size 1024 \
+    --num-attention-heads 32 \
+    --seq-length 1024 \
+    --micro-batch-size 1 \
+    --global-batch-size 4 \
+    --max-position-embeddings 1024 \
+    --padded-vocab-size 4096 \
+    --make-vocab-size-divisible-by 1 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --disable-bias-linear \
+    --add-qkv-bias \
+    --position-embedding-type rope \
+    --use-glm-rope \
+    --rotary-percent 0.5 \
+    --no-rope-fusion \
+    --normalization RMSNorm \
+    --swiglu \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --lr 1.25e-6 \
+    --norm-epsilon 1.5625e-07 \
+    --train-iters 10 \
+    --lr-decay-style cosine \
+    --untie-embeddings-and-output-weights \
+    --attention-dropout 0.0 \
+    --init-method-std 0.01 \
+    --hidden-dropout 0.0 \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --min-lr 1.25e-7 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --initial-loss-scale 4096 \
+    --adam-beta2 0.95 \
+    --no-load-optim \
+    --no-load-rng \
+    --no-gradient-accumulation-fusion \
+    --no-bias-swiglu-fusion \
+    --bf16 \
+    --finetune
+"
+
+DATA_ARGS="
+    --data-path ${DATA_PATH} \
+    --split 100,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 2000 \
+    --eval-interval 2000 \
+    --eval-iters 0 \
+"
+
+msrun ${DISTRIBUTED_ARGS} pretrain_gpt.py \
+    ${GPT_ARGS} \
+    ${DATA_ARGS} \
+    ${OUTPUT_ARGS} \
+    --load ${CKPT_LOAD_DIR} \
+    --distributed-backend nccl \
+    --ai-framework mindspore \
-- 
Gitee
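
A minimal sketch of how the new case might be exercised locally, assuming an 8-NPU host with the dataset, tokenizer, and checkpoint already staged under /data/mindspore/st/test_glm_pretrain as hard-coded in the script; the working directory, the log file name, and the log-scraping step are assumptions, since the patch itself only adds the shell script and its baseline JSON:

    # Assumed manual run from the repository root (the ST harness normally drives this).
    bash tests/mindspore/st/shell_scripts/glm_pretrain.sh 2>&1 | tee glm_pretrain.log
    # The per-iteration "lm loss" values printed in the training log are what the
    # baseline file tests/mindspore/st/baseline_results/glm_pretrain.json records.
    grep "lm loss" glm_pretrain.log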