From 6cd5ec70ae9539d736bf868ce05bc8ad599c41f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com>
Date: Thu, 7 Apr 2022 02:31:22 +0000
Subject: [PATCH 1/8] BertBase train_full_1p.sh: export RANK=0 before training.

---
 .../BertBase_ID0490_for_PyTorch/test/train_full_1p.sh            | 1 +
 1 file changed, 1 insertion(+)

diff --git a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_1p.sh b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_1p.sh
index abe4444d24..fcbca90608 100644
--- a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_1p.sh
+++ b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_1p.sh
@@ -64,6 +64,7 @@ if [[ $data_path == "" ]];then
     exit 1
 fi
 
+export RANK=0  # NOTE(review): presumably the process rank read by the training code for this single-card (1p) run - confirm
 #训练开始时间，不需要修改
 start_time=$(date +%s)
 
-- 
Gitee


From bcfaa8656891e32be3175bbf98afc64b8eabb8be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com>
Date: Thu, 7 Apr 2022 02:31:56 +0000
Subject: [PATCH 2/8] BertBase train_performance_1p.sh: export RANK=0 before training.

---
 .../BertBase_ID0490_for_PyTorch/test/train_performance_1p.sh     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_1p.sh b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_1p.sh
index 884fd0cc89..f09723a2fa 100644
--- a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_1p.sh
+++ b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_1p.sh
@@ -64,6 +64,7 @@ if [[ $data_path == "" ]];then
     exit 1
 fi
 
+export RANK=0  # NOTE(review): presumably the process rank read by the training code for this single-card (1p) run - confirm
 #训练开始时间，不需要修改
 start_time=$(date +%s)
 
-- 
Gitee


From 48cdacaa117eb42aadcce30c2193dd228e4b1376 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com>
Date: Thu, 7 Apr 2022 02:34:43 +0000
Subject: [PATCH 3/8] main_apex_d76_npu.py: offset the NPU rank by the rank_server environment variable.

---
 .../DistributedResnet50/main_apex_d76_npu.py                  | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py
index 26edd676ce..34329f23a4 100644
--- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py
+++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py
@@ -416,8 +416,10 @@ def main_worker(gpu, ngpus_per_node, args):
             args.rank = args.rank * ngpus_per_node + gpu
 
         if args.device == 'npu':
+            RANK = int(os.environ.get("rank_server", "0"))  # per-server rank offset; 0 when the launcher does not export it
+            print("args.rank+RANK :", args.rank+RANK)
             dist.init_process_group(backend=args.dist_backend,
-                                    world_size=args.world_size, rank=args.rank)
+                                    world_size=args.world_size, rank=args.rank+RANK)
         else:
             dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                     world_size=args.world_size, rank=args.rank)
-- 
Gitee


From 8869c20d2be6fbadf14bb8e0cb108ca0bf1ba17e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com>
Date: Thu, 7 Apr 2022 02:36:17 +0000
Subject: [PATCH 4/8] add train_performance_16p.sh.

---
 .../test/train_performance_16p.sh             | 156 ++++++++++++++++++
 1 file changed, 156 insertions(+)
 create mode 100644 PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh

diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh
new file mode 100644
index 0000000000..d602403ad9
--- /dev/null
+++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh
@@ -0,0 +1,156 @@
+#!/bin/bash
+
+################ Basic configuration; review and adjust per model ################
+# Mandatory fields (must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="ResNet50_ID0095_for_PyTorch"
+# Training batch size
+batch_size=4096
+# Number of NPU devices used for training
+export RANK_SIZE=16
+# Filled in from the command-line arguments parsed below; leave empty here
+data_path=""
+conf_path=""
+server_index=""
+fix_node_ip=""
+
+# Training epochs (the original note says 90, presumably the full-training value)
+train_epochs=3
+# Number of data-loading worker processes
+workers=128
+
+# Parse --key=value arguments; a new argument must first be declared and initialised above
+for para in "$@"
+do
+    if [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --conf_path* ]];then
+        conf_path=${para#*=}
+    elif [[ $para == --server_index* ]];then
+        server_index=${para#*=}
+    elif [[ $para == --fix_node_ip* ]];then
+        fix_node_ip=${para#*=}
+    fi
+done
+
+# data_path, conf_path and server_index are all used unconditionally below, so fail early
+for required in data_path conf_path server_index;do
+    if [[ -z "${!required}" ]];then
+        echo "[Error] para \"${required}\" must be configured"
+        exit 1
+    fi
+done
+
+# Master IP = text between "server_" and the next "_" in the server_*0.info path; linux_num = number of server_*.info files
+one_node_ip=$(find "$conf_path" -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}')
+linux_num=$(find "$conf_path" -name "server_*.info" |wc -l)
+[[ -n "$one_node_ip" ]] || { echo "[Error] no server_*0.info file found under \"${conf_path}\""; exit 1; }
+
+export HCCL_IF_IP=$fix_node_ip
+export MASTER_ADDR=$one_node_ip
+
+############### Locate the directory the training script is run from ###############
+# cd to the directory that contains test/ for compatibility; test_path_dir is the path of test/ itself
+cur_path=$(pwd)
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=$(pwd)
+else
+    test_path_dir=${cur_path}/test
+fi
+
+################# Create the log output directory #################
+ASCEND_DEVICE_ID=0
+output_dir=${test_path_dir}/output/${ASCEND_DEVICE_ID}
+train_log=${output_dir}/train_${ASCEND_DEVICE_ID}.log
+# Always start from an empty directory: the fps/loss files below are appended to with ">>"
+rm -rf "${output_dir}"
+mkdir -p "${output_dir}"
+
+################# Launch the training script #################
+# Training start time
+start_time=$(date +%s)
+# Source the NPU environment unless the environment carries etp_running_flag=true
+check_etp_flag=$(env | grep etp_running_flag)
+etp_flag=${check_etp_flag#*=}
+if [ x"${etp_flag}" != x"true" ];then
+    source "${test_path_dir}/env_npu.sh"
+fi
+
+# rank_server is read by main_apex_d76_npu.py and added to its rank on NPU; both values are 8 x a count
+export rank_server=$(awk -v idx="${server_index}" 'BEGIN{printf "%.0f\n",8*idx}')
+export NPU_WORLD_SIZE=$(awk -v num="${linux_num}" 'BEGIN{printf "%.0f\n",8*num}')
+
+python3.7 ./DistributedResnet50/main_apex_d76_npu.py \
+        --data "${data_path}" \
+        --addr="$one_node_ip" \
+        --seed=49 \
+        --workers=${workers} \
+        --learning-rate=1.6 \
+        --warmup=8 \
+        --label-smoothing=0.1 \
+        --mom=0.9 \
+        --weight-decay=1.0e-04  \
+        --static-loss-scale=128 \
+        --print-freq=1 \
+        --dist-url='tcp://127.0.0.1:50000' \
+        --dist-backend='hccl' \
+        --multiprocessing-distributed \
+        --world-size=2 \
+        --rank=0 \
+        --benchmark=0 \
+        --device='npu' \
+        --epochs=${train_epochs} \
+        --batch-size=${batch_size} > "${train_log}" 2>&1 &
+
+wait
+
+################## Collect training results ################
+# Training end time
+end_time=$(date +%s)
+e2e_time=$(( end_time - start_time ))
+
+# Test-case information
+BatchSize=${batch_size}
+DeviceType=$(uname -m)
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+result_log=${output_dir}/${CaseName}.log
+
+# Print the results
+echo "------------------ Final result ------------------"
+# FPS: average of the 11th field of every "FPS@all" log line; review the field index per model
+grep "FPS@all" "${train_log}" | awk '{print $11}' >> "${output_dir}/train_${CaseName}_fps.log"
+FPS=$(awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}' "${output_dir}/train_${CaseName}_fps.log")
+echo "Final Performance images/sec : $FPS"
+
+# Accuracy: first word after "Acc@1" on the last "* Acc@1" log line; review per model
+train_accuracy=$(grep -a '* Acc@1' "${train_log}"|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}')
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance monitoring summary
+# Performance figures
+# Throughput
+ActualFPS=${FPS}
+# Time per training iteration (batch_size * 1000 / FPS); left empty when no positive FPS was measured
+TrainingTime=$(awk -v bs="${batch_size}" -v fps="${FPS}" 'BEGIN{if (fps+0 > 0) printf "%.2f\n", bs*1000/fps}')
+
+# Extract the Loss value of every "Epoch:" line (excluding Test lines) into train_${CaseName}_loss.txt; review per model
+grep Epoch: "${train_log}"|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> "${output_dir}/train_${CaseName}_loss.txt"
+
+# Loss of the last iteration
+ActualLoss=$(awk 'END {print}' "${output_dir}/train_${CaseName}_loss.txt")
+
+# Write the key figures to ${CaseName}.log
+echo "Network = ${Network}" > "${result_log}"
+echo "RankSize = ${RANK_SIZE}" >> "${result_log}"
+echo "BatchSize = ${BatchSize}" >> "${result_log}"
+echo "DeviceType = ${DeviceType}" >> "${result_log}"
+echo "CaseName = ${CaseName}" >> "${result_log}"
+echo "ActualFPS = ${ActualFPS}" >> "${result_log}"
+echo "TrainingTime = ${TrainingTime}" >> "${result_log}"
+echo "TrainAccuracy = ${train_accuracy}" >> "${result_log}"
+echo "ActualLoss = ${ActualLoss}" >> "${result_log}"
+echo "E2ETrainingTime = ${e2e_time}" >> "${result_log}"
-- 
Gitee


From 0ca299265f51daa4eb1ed8ddc09ad3e65b60c4e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com>
Date: Thu, 7 Apr 2022 02:37:01 +0000
Subject: [PATCH 5/8] ResNet50 train_performance_8p.sh: export rank_server=0.

---
 .../ResNet50_for_PyTorch/test/train_performance_8p.sh           | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh
index 850d3dda9e..b5353c8fab 100644
--- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh
+++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh
@@ -63,6 +63,8 @@ if [ x"${etp_flag}" != x"true" ];then
     source ${test_path_dir}/env_npu.sh
 fi
 
+export rank_server=0  # read by main_apex_d76_npu.py, which adds it to args.rank on NPU; 0 means no rank offset
+
 python3.7 ./DistributedResnet50/main_apex_d76_npu.py \
         --data ${data_path} \
         --addr=$(hostname -I |awk '{print $1}') \
-- 
Gitee


From 5795272857282272759ea0a43ac26358ca757fa0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com>
Date: Thu, 7 Apr 2022 02:38:25 +0000
Subject: [PATCH 6/8] ResNet50 train_full_8p.sh: strip trailing whitespace.

---
 .../ResNet50_for_PyTorch/test/train_full_8p.sh            | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh
index 56b6147f44..50e8908c31 100644
--- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh
+++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh
@@ -51,9 +51,9 @@ do
     #设置环境变量，不需要修改
     export ASCEND_DEVICE_ID=$RANK_ID
     echo "Device ID: $ASCEND_DEVICE_ID"
-    
-    
-    
+
+
+
     #创建DeviceID输出目录，不需要修改
     if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
         rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
@@ -66,7 +66,7 @@ do
 
     #SOLVER.MAX_ITER 82000 \
     #执行训练脚本，以下传参不需要修改，其他需要模型审视修改
-done 
+done
 
 #################启动训练脚本#################
 # 训练开始时间，不需要修改
-- 
Gitee


From df7c5326962221d0a7dc78c07d22b12a2f9088c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com>
Date: Thu, 7 Apr 2022 02:40:13 +0000
Subject: [PATCH 7/8] ResNet50 train_full_8p.sh: restore trailing whitespace (reverts patch 6/8).

---
 .../ResNet50_for_PyTorch/test/train_full_8p.sh            | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh
index 50e8908c31..56b6147f44 100644
--- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh
+++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh
@@ -51,9 +51,9 @@ do
     #设置环境变量，不需要修改
     export ASCEND_DEVICE_ID=$RANK_ID
     echo "Device ID: $ASCEND_DEVICE_ID"
-
-
-
+    
+    
+    
     #创建DeviceID输出目录，不需要修改
     if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
         rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
@@ -66,7 +66,7 @@ do
 
     #SOLVER.MAX_ITER 82000 \
     #执行训练脚本，以下传参不需要修改，其他需要模型审视修改
-done
+done 
 
 #################启动训练脚本#################
 # 训练开始时间，不需要修改
-- 
Gitee


From 4e7f23c24b790f87f3194ad8489ff710f0de4836 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com>
Date: Thu, 7 Apr 2022 02:41:21 +0000
Subject: [PATCH 8/8] ResNet50 train_full_8p.sh: export rank_server=0.

---
 .../cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh
index 56b6147f44..b609a83864 100644
--- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh
+++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh
@@ -68,6 +68,7 @@ do
     #执行训练脚本，以下传参不需要修改，其他需要模型审视修改
 done 
 
+export rank_server=0  # NOTE(review): presumably the rank offset read by DistributedResnet50/main_apex_d76_npu.py; launch command not visible here - confirm
 #################启动训练脚本#################
 # 训练开始时间，不需要修改
 start_time=$(date +%s)
-- 
Gitee