From a1b540e6c816ea78521737ad305448e7dcf2fa0a Mon Sep 17 00:00:00 2001
From: liushengyuan
Date: Fri, 5 Sep 2025 10:54:39 +0800
Subject: [PATCH] [Feature] Performance statistics of the DeepSpeed framework
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../built-in/mm/HunyuanDiT/hydit/ds_config.py |  46 +++++-
 .../mm/HunyuanDiT/hydit/train_deepspeed.py    |   2 +-
 .../mm/HunyuanDiT/test/train_full_16p_bf16.sh | 138 ++++++++++++++++++
 .../mm/HunyuanDiT/test/train_full_8p_bf16.sh  |   2 +-
 .../test/train_full_8p_bf16_zero3.sh          |   2 +-
 5 files changed, 186 insertions(+), 4 deletions(-)
 create mode 100644 PyTorch/built-in/mm/HunyuanDiT/test/train_full_16p_bf16.sh

diff --git a/PyTorch/built-in/mm/HunyuanDiT/hydit/ds_config.py b/PyTorch/built-in/mm/HunyuanDiT/hydit/ds_config.py
index 2428620717..f83f60ed86 100644
--- a/PyTorch/built-in/mm/HunyuanDiT/hydit/ds_config.py
+++ b/PyTorch/built-in/mm/HunyuanDiT/hydit/ds_config.py
@@ -2,7 +2,51 @@ import os
 
 
 def deepspeed_config_from_args(args, global_batch_size):
-    if args.use_zero_stage == 2:
+    if args.use_zero_stage == 1:
+        deepspeed_config = {
+            "train_batch_size": global_batch_size,
+            "train_micro_batch_size_per_gpu": args.batch_size,
+            "gradient_accumulation_steps": args.grad_accu_steps,
+            "steps_per_print": args.log_every,
+            "optimizer": {
+                "type": "AdamW",
+                "params": {
+                    "lr": args.lr,
+                    "betas": [
+                        0.9,
+                        0.999
+                    ],
+                    "eps": 1e-08,
+                    "weight_decay": args.weight_decay
+                }
+            },
+
+            "zero_optimization": {
+                "stage": 1,
+                "reduce_scatter": False,
+                "reduce_bucket_size": 1e9,
+            },
+
+            "gradient_clipping": 1.0,
+            "prescale_gradients": True,
+
+            "fp16": {
+                "enabled": args.use_fp16,
+                "loss_scale": 0,
+                "loss_scale_window": 500,
+                "hysteresis": 2,
+                "min_loss_scale": 1e-3,
+                "initial_scale_power": 15
+            },
+
+            "bf16": {
+                "enabled": False
+            },
+
+            "wall_clock_breakdown": False
+        }
+
+    elif args.use_zero_stage == 2:
         deepspeed_config = {
             "train_batch_size": global_batch_size,
             "train_micro_batch_size_per_gpu": args.batch_size,
diff --git a/PyTorch/built-in/mm/HunyuanDiT/hydit/train_deepspeed.py b/PyTorch/built-in/mm/HunyuanDiT/hydit/train_deepspeed.py
index b67f9a2931..dde014b4fb 100644
--- a/PyTorch/built-in/mm/HunyuanDiT/hydit/train_deepspeed.py
+++ b/PyTorch/built-in/mm/HunyuanDiT/hydit/train_deepspeed.py
@@ -517,7 +517,7 @@ def main(args):
                             f"Lr: {opt.param_groups[0]['lr']:.6g}, "
                             f"Steps/Sec: {steps_per_sec:.2f}, "
                             f"Millisec/Step: {(end_time - start_time) * 1000:.2f},"
-                            f"Samples/Sec: {int(steps_per_sec * batch_size * world_size):d}")
+                            f"SpS: {int(steps_per_sec * batch_size * world_size):d}")
                 # Reset monitoring variables:
                 running_loss = 0
                 log_steps = 0
diff --git a/PyTorch/built-in/mm/HunyuanDiT/test/train_full_16p_bf16.sh b/PyTorch/built-in/mm/HunyuanDiT/test/train_full_16p_bf16.sh
new file mode 100644
index 0000000000..433a30108a
--- /dev/null
+++ b/PyTorch/built-in/mm/HunyuanDiT/test/train_full_16p_bf16.sh
@@ -0,0 +1,138 @@
+# Network name, weight paths, and related parameters; review and adjust for your model
+Network="HunyuanDiT"
+BATCH_SIZE=2
+max_train_steps=5000
+task_flag="dit_g2_full_1024p" # the task flag is used to identify folders.
+resume=./ckpts/t2i/model/ # checkpoint root for resume
+index_file=dataset/porcelain/jsons/porcelain_mt.json # index file for dataloader
+results_dir=./log_EXP # save root for results
+image_size=1024 # training image resolution
+grad_accu_steps=4 # gradient accumulation
+warmup_num_steps=0 # warm-up steps
+lr=0.0001 # learning rate
+ckpt_every=10000 # create a ckpt every a few steps.
+ckpt_latest_every=5000 # create a ckpt named `latest.pt` every a few steps.
+
+export WORLD_SIZE=16
+export MASTER_PORT=29500
+export MASTER_ADDR=127.0.0.1
+
+for para in $*
+do
+    if [[ $para == --batch_size* ]]; then
+        BATCH_SIZE=$(echo ${para#*=})
+    elif [[ $para == --max_train_steps* ]]; then
+        max_train_steps=$(echo ${para#*=})
+    fi
+done
+
+# cd to the directory at the same level as the test folder before running the script, for better compatibility; test_path_dir is the path containing the test folder
+cur_path=$(pwd)
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ]; then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=$(pwd)
+else
+    test_path_dir=${cur_path}/test
+fi
+
+source ${test_path_dir}/env_npu.sh
+
+# Create the DeviceID output directory; no modification needed
+output_path=${cur_path}/test/output/${ASCEND_DEVICE_ID}
+
+mkdir -p ${output_path}
+
+# Training start time; no modification needed
+start_time=$(date +%s)
+echo "start_time: ${start_time}"
+
+model='DiT-g/2'
+params=" \
+    --qk-norm \
+    --model ${model} \
+    --rope-img base512 \
+    --rope-real
+    "
+
+deepspeed --num_gpus ${WORLD_SIZE} --num_nodes 1 --master_port=${MASTER_PORT} hydit/train_deepspeed.py ${params} \
+    --task-flag ${task_flag} \
+    --noise-schedule scaled_linear --beta-start 0.00085 --beta-end 0.03 \
+    --predict-type v_prediction \
+    --uncond-p 0.44 \
+    --uncond-p-t5 0.44 \
+    --index-file ${index_file} \
+    --random-flip \
+    --lr ${lr} \
+    --batch-size ${BATCH_SIZE} \
+    --image-size ${image_size} \
+    --global-seed 999 \
+    --grad-accu-steps ${grad_accu_steps} \
+    --warmup-num-steps ${warmup_num_steps} \
+    --use-flash-attn \
+    --use-fp16 \
+    --use-ema \
+    --ema-dtype fp32 \
+    --results-dir ${results_dir} \
+    --resume-split \
+    --resume ${resume} \
+    --ckpt-every ${ckpt_every} \
+    --ckpt-latest-every ${ckpt_latest_every} \
+    --log-every 1 \
+    --deepspeed \
+    --deepspeed-optimizer \
+    --use-zero-stage 1 \
+    --multireso \
+    --reso-step 64 \
+    --epochs 1400 \
+    --max-training-steps ${max_train_steps} \
+    --norm "layer" \
+    --autocast-dtype "bf16" \
+    > ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+# Training end time; no modification needed
+end_time=$(date +%s)
+e2e_time=$(($end_time - $start_time))
+
+
+# Training case information; no modification needed
+BatchSize=${BATCH_SIZE}
+DeviceType=$(uname -m)
+CaseName=${Network}_bs${BatchSize}_${WORLD_SIZE}'p'_'acc'
+
+# Print results; no modification needed
+echo "------------------ Final result ------------------"
+# Output performance FPS; review and adjust for your model
+
+FPS=$(grep -oPa 'RunningAvgSamplesPerSec=\s*\K[\d.]+' "${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" |
+    tail -n 1 |
+    awk -F "RunningAvgSamplesPerSec=" '{print $1}')
+
+avg_millisec_per_step=$(echo "$WORLD_SIZE * 1000 * $BatchSize / $FPS" | bc)
+
+# Print; no modification needed
+echo "Final Performance AvgSamplesPerSec : $FPS"
+echo "E2E Training Duration sec : $e2e_time"
+echo "avg_millisec_per_step : $avg_millisec_per_step"
+
+
+# Collect performance data; no modification needed
+# Throughput
+ActualFPS=${FPS}
+# Training time per iteration
+TrainingTime=$(awk 'BEGIN{printf "%.2f\n", '${BATCH_SIZE}'*'${WORLD_SIZE}'/'${FPS}'}')
+
+
+# Print key information to ${CaseName}.log; no modification needed
+echo "Network = ${Network}" >${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${WORLD_SIZE}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
+echo "TrainingTime = ${TrainingTime}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
+echo "E2ETrainingTime = ${e2e_time}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
+echo "AvgTrainingTime = ${avg_millisec_per_step}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_avg_millisec_per_step.log
diff --git a/PyTorch/built-in/mm/HunyuanDiT/test/train_full_8p_bf16.sh b/PyTorch/built-in/mm/HunyuanDiT/test/train_full_8p_bf16.sh
index 7f02fe7b9c..eb0960617a 100644
--- a/PyTorch/built-in/mm/HunyuanDiT/test/train_full_8p_bf16.sh
+++ b/PyTorch/built-in/mm/HunyuanDiT/test/train_full_8p_bf16.sh
@@ -123,7 +123,7 @@ echo "avg_millisec_per_step(100-200step) : $avg_millisec_per_step"
 # Throughput
 ActualFPS=${FPS}
 # Training time per iteration
-TrainingTime=$(awk 'BEGIN{printf "%.2f\n", '${BATCH_SIZE}'*8/'${FPS}'}')
+TrainingTime=$(awk 'BEGIN{printf "%.2f\n", '${BATCH_SIZE}'*'${WORLD_SIZE}'/'${FPS}'}')
 
 
 # Print key information to ${CaseName}.log; no modification needed
diff --git a/PyTorch/built-in/mm/HunyuanDiT/test/train_full_8p_bf16_zero3.sh b/PyTorch/built-in/mm/HunyuanDiT/test/train_full_8p_bf16_zero3.sh
index fa6b152818..88e29da160 100644
--- a/PyTorch/built-in/mm/HunyuanDiT/test/train_full_8p_bf16_zero3.sh
+++ b/PyTorch/built-in/mm/HunyuanDiT/test/train_full_8p_bf16_zero3.sh
@@ -119,7 +119,7 @@ echo "avg_millisec_per_step(100-200step) : $avg_millisec_per_step"
 # Throughput
 ActualFPS=${FPS}
 # Training time per iteration
-TrainingTime=$(awk 'BEGIN{printf "%.2f\n", '${BATCH_SIZE}'*8/'${FPS}'}')
+TrainingTime=$(awk 'BEGIN{printf "%.2f\n", '${BATCH_SIZE}'*'${WORLD_SIZE}'/'${FPS}'}')
 
 
 # Print key information to ${CaseName}.log; no modification needed
-- 
Gitee
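
Note on the derived metrics: TrainingTime and avg_millisec_per_step above both reduce to global samples per step divided by RunningAvgSamplesPerSec. A minimal standalone sketch of the arithmetic follows; the FPS value 40.0 is an illustrative assumption (in the scripts it is grepped from the DeepSpeed training log rather than hard-coded), everything else mirrors the script code:

    BATCH_SIZE=2; WORLD_SIZE=16; FPS=40.0   # FPS is an assumed example value
    # Seconds per iteration (TrainingTime): 2 * 16 / 40.0 = 0.80
    awk 'BEGIN{printf "%.2f\n", '${BATCH_SIZE}'*'${WORLD_SIZE}'/'${FPS}'}'
    # Milliseconds per step (avg_millisec_per_step); bc truncates to an
    # integer at its default scale of 0: 16 * 1000 * 2 / 40.0 = 800
    echo "$WORLD_SIZE * 1000 * $BATCH_SIZE / $FPS" | bc

The two expressions agree (0.80 s per step = 800 ms per step), which serves as a quick consistency check on the WORLD_SIZE-based formula that this patch also applies to the 8p scripts.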