From 34208ed13c949ded0a3f00df7e3b79e92c2fdf1f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Wed, 22 Mar 2023 07:19:33 +0000
Subject: [PATCH 1/9] update
 cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh:
 use RANK_SIZE/RANK_ID and RANK_TABLE_FILE, drop --one_node_ip and CM_* env
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 高兴成 <1358493914@qq.com>
---
 .../test/train_full_8p.sh                     | 23 +++++--------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh
index e17f87b2d..8cd78e2f2 100644
--- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh
+++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh
@@ -43,20 +43,9 @@ do
         name_bind="_bindcore"
 	elif [[ $para == --dynamic_input* ]];then
       dynamic_input=`echo ${para#*=}` 
-    elif [[ $para == --one_node_ip* ]];then
-        one_node_ip=`echo ${para#*=}`
-    fi
+   fi
 done
 
-#8p训练必须参数（本机IP）
-one_node_ip=$one_node_ip
-#新增适配集群环境变量
-export CM_CHIEF_IP=${one_node_ip}   #主节点ip，所有服务器一致
-export CM_CHIEF_PORT=29688          #通信端口，所有服务器一致
-export CM_CHIEF_DEVICE=0            #配置为0，配置主卡，类似于主节点，所有服务器一致
-export CM_WORKER_SIZE=8             #卡数，单机为8，所有服务器一致
-export CM_WORKER_IP=${one_node_ip}  #当前服务器ip，不同环境ip不同
-
 if [[ $data_path  == "" ]];then
     echo "[Error] para \"data_path\" must be config"
     exit 1
@@ -70,8 +59,8 @@ python3 bootstrap.py --work_dir=$cur_path/estimator_working_dir --export_path=$c
 wait
 
 export ASCEND_DEVICE_ID=0
-export RANK_SIZES=8
-#export RANK_TABLE_FILE="${cur_path}/test/8p.json"
+export RANK_SIZE=8
+export RANK_TABLE_FILE="${cur_path}/test/8p.json"
 export JOB_ID=10086
 
 start=$(date +%s)
@@ -80,7 +69,7 @@ start=$(date +%s)
 for i in 0 1 2 3 4 5 6 7
 do
     #设置环境变量
-    export RANK_IDS=$i
+    export RANK_ID=$i
     export ASCEND_DEVICE_ID=$i
     ASCEND_DEVICE_ID=$i
     echo "Device ID: $ASCEND_DEVICE_ID"
@@ -94,8 +83,8 @@ do
     echo $ASCEND_DEVICE_ID
     #(Step3)训练
     corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l`
-    let a=RANK_IDS*${corenum}/8
-    let b=RANK_IDS+1
+    let a=RANK_ID*${corenum}/8
+    let b=RANK_ID+1
     let c=b*${corenum}/8-1
     if [ "x${bind_core}" != x ];then
         bind_core="taskset -c $a-$c"
-- 
Gitee


From b62af686bad0576be3ca8642f7362893ec1020a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Wed, 22 Mar 2023 07:20:03 +0000
Subject: [PATCH 2/9] update
 image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh:
 move the CM_* cluster env exports below the JOB_ID export
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 高兴成 <1358493914@qq.com>
---
 .../test/train_RT2_performance_8p.sh            | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh
index bf5829ca4..8b3c78b5d 100644
--- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh
+++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh
@@ -49,15 +49,6 @@ do
     fi
 done
 
-#8p训练必须参数（本机IP）
-one_node_ip=$one_node_ip
-#新增适配集群环境变量
-export CM_CHIEF_IP=${one_node_ip}   #主节点ip，所有服务器一致
-export CM_CHIEF_PORT=29688          #通信端口，所有服务器一致
-export CM_CHIEF_DEVICE=0            #配置为0，配置主卡，类似于主节点，所有服务器一致
-export CM_WORKER_SIZE=8             #卡数，单机为8，所有服务器一致
-export CM_WORKER_IP=${one_node_ip}  #当前服务器ip，不同环境ip不同
-
 if [[ $data_path  == "" ]];then
     echo "[Error] para \"data_path\" must be config"
     exit 1
@@ -75,6 +66,14 @@ export RANK_SIZES=8
 #export RANK_TABLE_FILE="${cur_path}/test/8p.json"
 export JOB_ID=10086
 
+#8p训练必须参数（本机IP）
+one_node_ip=$one_node_ip
+#新增适配集群环境变量
+export CM_CHIEF_IP=${one_node_ip}   #主节点ip，所有服务器一致
+export CM_CHIEF_PORT=29688          #通信端口，所有服务器一致
+export CM_CHIEF_DEVICE=0            #配置为0，配置主卡，类似于主节点，所有服务器一致
+export CM_WORKER_SIZE=8             #卡数，单机为8，所有服务器一致
+export CM_WORKER_IP=${one_node_ip}  #当前服务器ip，不同环境ip不同
 start=$(date +%s)
 
 # 8P训练模式
-- 
Gitee


From 51a6078d5e12d1d272feeacdce68e577283f3725 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Wed, 22 Mar 2023 07:20:29 +0000
Subject: [PATCH 3/9] update
 image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_16p.sh:
 move node-IP lookup and CM_* env exports after the bootstrap.py step
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 高兴成 <1358493914@qq.com>
---
 .../test/train_performance_16p.sh             | 26 +++++++++----------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_16p.sh b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_16p.sh
index f997f2418..496f7ab50 100644
--- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_16p.sh
+++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_16p.sh
@@ -60,20 +60,6 @@ done
 
 linux_num=$servers_num
 
-if [[ $conf_path == "" ]];then
-    fix_node_ip=$fix_node_ip
-    one_node_ip=$one_node_ip
-else
-    one_node_ip=`find $conf_path -name "server_*_0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'`
-fi
-
-#新增适配集群环境变量
-export CM_CHIEF_IP=${one_node_ip}   #主节点ip，所有服务器一致
-export CM_CHIEF_PORT=29688          #通信端口，所有服务器一致
-export CM_CHIEF_DEVICE=0            #配置为0，配置主卡，类似于主节点，所有服务器一致
-export CM_WORKER_SIZE=16            #卡数，单机为8，多机为8n,所有服务器一致
-export CM_WORKER_IP=${fix_node_ip}  #当前服务器ip，不同环境ip不同
-
 if [[ $data_path  == "" ]];then
     echo "[Error] para \"data_path\" must be config"
     exit 1
@@ -93,7 +79,19 @@ cd $cur_path
 python3 bootstrap.py --work_dir=$cur_path/estimator_working_dir --export_path=$cur_path/outputs/models/000000-bootstrap
 wait
 
+if [[ $conf_path == "" ]];then
+    fix_node_ip=$fix_node_ip
+    one_node_ip=$one_node_ip
+else
+    one_node_ip=`find $conf_path -name "server_*_0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'`
+fi
 
+#新增适配集群环境变量
+export CM_CHIEF_IP=${one_node_ip}   #主节点ip，所有服务器一致
+export CM_CHIEF_PORT=29688          #通信端口，所有服务器一致
+export CM_CHIEF_DEVICE=0            #配置为0，配置主卡，类似于主节点，所有服务器一致
+export CM_WORKER_SIZE=16            #卡数，单机为8，多机为8n,所有服务器一致
+export CM_WORKER_IP=${fix_node_ip}  #当前服务器ip，不同环境ip不同
 start=$(date +%s)
 
 # 8P训练模式
-- 
Gitee


From f32a11de6a73e7dd18dbd84267b4ac3ce165efe9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Wed, 22 Mar 2023 07:25:29 +0000
Subject: [PATCH 4/9] update
 /cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_8p.sh:
 move the CM_* cluster env exports below the JOB_ID export
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 高兴成 <1358493914@qq.com>
---
 .../test/train_performance_8p.sh                 | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_8p.sh b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_8p.sh
index f280a3168..764068710 100644
--- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_8p.sh
+++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_8p.sh
@@ -48,14 +48,6 @@ do
     fi
 done
 
-#8p训练必须参数（本机IP）
-one_node_ip=$one_node_ip
-#新增适配集群环境变量
-export CM_CHIEF_IP=${one_node_ip}   #主节点ip，所有服务器一致
-export CM_CHIEF_PORT=29688          #通信端口，所有服务器一致
-export CM_CHIEF_DEVICE=0            #配置为0，配置主卡，类似于主节点，所有服务器一致
-export CM_WORKER_SIZE=8             #卡数，单机为8，所有服务器一致
-export CM_WORKER_IP=${one_node_ip}  #当前服务器ip，不同环境ip不同
 
 if [[ $data_path  == "" ]];then
     echo "[Error] para \"data_path\" must be config"
@@ -74,6 +66,14 @@ export RANK_SIZES=8
 #export RANK_TABLE_FILE="${cur_path}/test/8p.json"
 export JOB_ID=10086
 
+#8p训练必须参数（本机IP）
+one_node_ip=$one_node_ip
+#新增适配集群环境变量
+export CM_CHIEF_IP=${one_node_ip}   #主节点ip，所有服务器一致
+export CM_CHIEF_PORT=29688          #通信端口，所有服务器一致
+export CM_CHIEF_DEVICE=0            #配置为0，配置主卡，类似于主节点，所有服务器一致
+export CM_WORKER_SIZE=8             #卡数，单机为8，所有服务器一致
+export CM_WORKER_IP=${one_node_ip}  #当前服务器ip，不同环境ip不同
 start=$(date +%s)
 
 # 8P训练模式
-- 
Gitee


From 9abee80d41b1151e2f53ea0f88835eb3589cabaa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Wed, 22 Mar 2023 07:29:35 +0000
Subject: [PATCH 5/9] update
 image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh:
 use RANK_SIZE and RANK_TABLE_FILE, drop CM_* env and sed renames of sources
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 高兴成 <1358493914@qq.com>
---
 .../test/train_full_8p.sh                     | 30 +++++--------------
 1 file changed, 8 insertions(+), 22 deletions(-)

diff --git a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh
index b69de76e2..02a6ba19f 100644
--- a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh
+++ b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh
@@ -5,9 +5,9 @@ cur_path=`pwd`
 
 export LANG=en_US.UTF-8
 
-export RANK_SIZES=8
+export RANK_SIZE=8
 export JOB_ID=10087
-#export RANK_TABLE_FILE=$cur_path/../scripts/8p.json
+export RANK_TABLE_FILE=$cur_path/../scripts/8p.json
 RANK_ID_START=0
 
 
@@ -71,19 +71,9 @@ do
         cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/
     elif [[ $para == --data_path* ]];then
         data_path=`echo ${para#*=}`
-    elif [[ $para == --one_node_ip* ]];then
-        one_node_ip=`echo ${para#*=}`
     fi
 done
 
-#8p训练必须参数（本机IP）
-one_node_ip=$one_node_ip
-#新增适配集群环境变量
-export CM_CHIEF_IP=${one_node_ip}   #主节点ip，所有服务器一致
-export CM_CHIEF_PORT=29688          #通信端口，所有服务器一致
-export CM_CHIEF_DEVICE=0            #配置为0，配置主卡，类似于主节点，所有服务器一致
-export CM_WORKER_SIZE=8             #卡数，单机为8，所有服务器一致
-export CM_WORKER_IP=${one_node_ip}  #当前服务器ip，不同环境ip不同
 
 #data_path='../'
 #校验是否传入data_path,不需要修改
@@ -91,23 +81,18 @@ if [[ $data_path == "" ]];then
     echo "[Error] para \"data_path\" must be confing"
     exit 1
 fi
-cd $cur_path/../
-sed -i 's/RANK_SIZE/RANK_SIZES/g' model/model_fn.py pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py
-sed -i 's/RANK_ID/RANK_IDS/g' pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py
+
 cd $cur_path/../scripts
 
 #训练开始时间，不需要修改
 start_time=$(date +%s)
 
 bash run_accuracy_8p.sh ${data_path} all
-wait
 
 #训练结束时间，不需要修改
 end_time=$(date +%s)
 e2e_time=$(( $end_time - $start_time ))
-cd $cur_path/../
-sed -i 's/RANK_SIZES/RANK_SIZE/g' model/model_fn.py pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py
-sed -i 's/RANK_IDS/RANK_ID/g' pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py
+
 
 sleep 30
 train_accuracy=`grep -r "whole" $cur_path/output/0/train_0.log | awk '{print $6}'`
@@ -120,6 +105,7 @@ echo "E2E Training Duration sec : $e2e_time"
 echo "------------------ Final result ------------------"
 #输出性能FPS，需要模型审视修改
 fps=`grep throughput_train $cur_path/output/0/train_0.log|awk -F 'throughput_train' '{print $2}'|awk -F ':' '{print $2}'|awk '{print $1}'`
+#FPS=`awk 'BEGIN{printf "%.2f\n",'${RANK_SIZE}'*'${fps}'}'`
 FPS=1.5
 #打印，不需要修改
 echo "Final Performance images/sec : $FPS"
@@ -130,13 +116,13 @@ echo "E2E Training Duration sec : $e2e_time"
 #训练用例信息，不需要修改
 BatchSize=${batch_size}
 DeviceType=`uname -m`
-CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'acc'
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
 
 #获取性能数据，不需要修改
 #吞吐量
 ActualFPS=${FPS}
 #单迭代训练时长
-TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'`
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'`
 
 
 #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中，需要根据模型审视
@@ -147,7 +133,7 @@ ActualLoss=`awk 'END {print}' $cur_path/output/0/train_${CaseName}_loss.txt`
 
 #关键信息打印到${CaseName}.log中，不需要修改
 echo "Network = ${Network}" > $cur_path/output/0/${CaseName}.log
-echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/0/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/0/${CaseName}.log
 echo "BatchSize = ${BatchSize}" >> $cur_path/output/0/${CaseName}.log
 echo "DeviceType = ${DeviceType}" >> $cur_path/output/0/${CaseName}.log
 echo "CaseName = ${CaseName}" >> $cur_path/output/0/${CaseName}.log
-- 
Gitee


From db7ee2b48c12cdbb93268525eee86ed564c116ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Wed, 22 Mar 2023 07:30:08 +0000
Subject: [PATCH 6/9] update
 image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh:
 use RANK_SIZE/RANK_ID and enable RANK_TABLE_FILE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 高兴成 <1358493914@qq.com>
---
 .../scripts/run_accuracy_8p.sh                           | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh
index f2d9c2f99..771f5fb92 100644
--- a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh
+++ b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh
@@ -9,8 +9,8 @@ mkdir -p ${upDir}/test/output/0
 
 # user env
 export JOB_ID=NPU20210126
-export RANK_SIZES=8
-#export RANK_TABLE_FILE=${currentDir}/8p.json
+export RANK_SIZE=8
+export RANK_TABLE_FILE=${currentDir}/8p.json
 
 data_dir=$1
 fold=$2
@@ -21,7 +21,7 @@ if [ x"${fold}" = x"all" ] ;
 then
     for device_index in ${device_group}
     do
-        RANK_IDS=${device_index} ASCEND_DEVICE_ID=${device_index} ${currentDir}/train_accuracy_8p.sh ${data_dir} 0 &
+        RANK_ID=${device_index} ASCEND_DEVICE_ID=${device_index} ${currentDir}/train_accuracy_8p.sh ${data_dir} 0 &
     done
 
     wait
@@ -32,9 +32,10 @@ else
     echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] fold$fold train start"
     for device_index in ${device_group}
     do
-        RANK_IDS=${device_index} ASCEND_DEVICE_ID=${device_index} ${currentDir}/train_accuracy_8p.sh ${data_dir} ${fold} &
+        RANK_ID=${device_index} ASCEND_DEVICE_ID=${device_index} ${currentDir}/train_accuracy_8p.sh ${data_dir} ${fold} &
     done
 
     wait
     echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] fold$fold train end"
 fi
+
-- 
Gitee


From 0d44b19b4aac44c2087f84197bb5160e2eceb98e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Wed, 22 Mar 2023 07:34:23 +0000
Subject: [PATCH 7/9] update
 recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh:
 use RANK_SIZE/RANK_ID and RANK_TABLE_FILE, drop CM_* env and sed renames
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 高兴成 <1358493914@qq.com>
---
 .../test/train_full_8p.sh                     | 42 ++++++++-----------
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh
index 7250053b3..e58e09555 100644
--- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh
+++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh
@@ -4,11 +4,13 @@
 cur_path=`pwd`
 
 #集合通信参数,不需要修改
+
+
 export HCCL_CONNECT_TIMEOUT=1200
 
 #集合通信参数,不需要修改
-export RANK_SIZES=8
-#export RANK_TABLE_FILE=$cur_path/8p.json
+export RANK_SIZE=8
+export RANK_TABLE_FILE=$cur_path/8p.json
 export JOB_ID=10087
 RANK_ID_START=0
 ASCEND_DEVICE_ID_START=0
@@ -55,19 +57,14 @@ elif [[ $para == --over_dump* ]];then
         mkdir -p ${over_dump_path}
     elif [[ $para == --data_path* ]];then
         data_path=`echo ${para#*=}`
-    elif [[ $para == --one_node_ip* ]];then
-        one_node_ip=`echo ${para#*=}`
     fi
 done
 
-#8p训练必须参数（本机IP）
-one_node_ip=$one_node_ip
-#新增适配集群环境变量
-export CM_CHIEF_IP=${one_node_ip}   #主节点ip，所有服务器一致
-export CM_CHIEF_PORT=29688          #通信端口，所有服务器一致
-export CM_CHIEF_DEVICE=0            #配置为0，配置主卡，类似于主节点，所有服务器一致
-export CM_WORKER_SIZE=8             #卡数，单机为8，所有服务器一致
-export CM_WORKER_IP=${one_node_ip}  #当前服务器ip，不同环境ip不同
+#校验是否传入data_path,不需要修改
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be confing"
+    exit 1
+fi
 
 #校验是否传入data_path,不需要修改
 if [[ $data_path == "" ]];then
@@ -105,8 +102,6 @@ sed -i "s%/npu/traindata/ID2940_CarPeting_TF_WideDeep_TF%${data_path}%p" configs
 sed -i "s%./model%$cur_path/output/$ASCEND_DEVICE_ID/ckpt%p" configs/config.py
 sed -i "s%display_step = 100%display_step = $display_step%p" configs/config.py
 sed -i "s%n_epoches = 2%n_epoches = $n_epoches%p" configs/config.py
-sed -i 's/RANK_SIZE/RANK_SIZES/g' widedeep/WideDeep_fp16_huifeng.py
-sed -i 's/RANK_SIZE/RANK_SIZES/g' train.py
 #echo `cat configs/config.py |uniq > configs/config.py; cp -f configs/config.py configs/config.py.run`
 cp configs/config.py configs/config.py.run
 
@@ -114,13 +109,13 @@ cp configs/config.py configs/config.py.run
 cd $cur_path/../
 
 start=$(date +%s)
-for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++));
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
 do
     #设置环境变量，不需要修改
-    echo "Device ID: $RANK_IDS"
-    export RANK_IDS=$RANK_IDS
-    export ASCEND_DEVICE_ID=$RANK_IDS
-    ASCEND_DEVICE_ID=$RANK_IDS
+    echo "Device ID: $RANK_ID"
+    export RANK_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$RANK_ID
+    ASCEND_DEVICE_ID=$RANK_ID
   if [   -d $cur_path/output/${ASCEND_DEVICE_ID} ];then
      rm -rf $cur_path/output/${ASCEND_DEVICE_ID}
      mkdir -p $cur_path/output/${ASCEND_DEVICE_ID}
@@ -136,8 +131,7 @@ done
 wait
 end=$(date +%s)
 e2e_time=$(( $end - $start ))
-sed -i 's/RANK_SIZES/RANK_SIZE/g' train.py
-sed -i 's/RANK_SIZES/RANK_SIZE/g' widedeep/WideDeep_fp16_huifeng.py
+
 #配置文件恢复
 mv -f configs/config.py.bak configs/config.py
 
@@ -161,9 +155,9 @@ echo "E2E Training Duration sec : $e2e_time"
 BatchSize=${batch_size}
 DeviceType=`uname -m`
 if [[ $precision_mode == "must_keep_origin_dtype" ]];then
-    CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'fp32'_'acc'
+    CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'fp32'_'acc'
 else
-    CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'acc'
+    CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
 fi
 
 ##获取性能数据
@@ -182,7 +176,7 @@ ActualLoss=`cat $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt |
 
 #关键信息打印到${CaseName}.log中，不需要修改
 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
 echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
 echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
 echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
-- 
Gitee


From dfb4d3a2cb1d18c3c409a29dd37be0921b671e9e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Wed, 22 Mar 2023 07:35:47 +0000
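Subject: [PATCH 8/9] update
 /image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh:
 use RANK_SIZE/RANK_ID and RANK_TABLE_FILE, drop CM_* env and sed rename;
 report e2e_time as ActualFPS/TrainingTime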
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 高兴成 <1358493914@qq.com>
---
 .../test/train_full_8p.sh                     | 47 +++++++++----------
 1 file changed, 21 insertions(+), 26 deletions(-)

diff --git a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh
index cc5390018..254cad2ad 100644
--- a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh
+++ b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh
@@ -5,8 +5,8 @@ cur_path=`pwd`
 
 #集合通信参数,不需要修改
 #保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下
-export RANK_SIZES=8
-#export RANK_TABLE_FILE=${cur_path}/../configs/8p.json
+export RANK_SIZE=8
+export RANK_TABLE_FILE=${cur_path}/../configs/8p.json
 export JOB_ID=10087
 RANK_ID_START=0
 
@@ -21,7 +21,7 @@ Network="ResNet50_ID0058_for_TensorFlow"
 export HCCL_CONNECT_TIMEOUT=600
 corenum=`cat /proc/cpuinfo |grep "processor"|wc -l`
 export RANK_INDEX=0
-export RANK_IDS=0
+export RANK_ID=0
 
 config_file=res50_256bs_8p_eval
 iterations_per_loop=100
@@ -86,20 +86,9 @@ do
     elif [[ $para == --bind_core* ]]; then
         bind_core=`echo ${para#*=}`
         name_bind="_bindcore"
-    elif [[ $para == --one_node_ip* ]];then
-        one_node_ip=`echo ${para#*=}`
     fi
 done
 
-#8p训练必须参数（本机IP）
-one_node_ip=$one_node_ip
-#新增适配集群环境变量
-export CM_CHIEF_IP=${one_node_ip}   #主节点ip，所有服务器一致
-export CM_CHIEF_PORT=29688          #通信端口，所有服务器一致
-export CM_CHIEF_DEVICE=0            #配置为0，配置主卡，类似于主节点，所有服务器一致
-export CM_WORKER_SIZE=8             #卡数，单机为8，所有服务器一致
-export CM_WORKER_IP=${one_node_ip}  #当前服务器ip，不同环境ip不同
-
 #校验是否传入data_path,不需要修改
 if [[ $data_path == "" ]];then
     echo "[Error] para \"data_path\" must be confing"
@@ -115,16 +104,16 @@ fi
 
 #训练开始时间，不需要修改
 start_time=$(date +%s)
-sed -i 's/RANK_SIZE/RANK_SIZES/g' ../src/data_loader/resnet50/data_loader.py
+
 #进入训练脚本目录，需要模型审视修改
 cd $cur_path/..
-for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++));
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
 do
     #设置环境变量，不需要修改
-    echo "Device ID: $RANK_IDS"
-    # export RANK_IDS=$RANK_IDS
-    export ASCEND_DEVICE_ID=$RANK_IDS
-    ASCEND_DEVICE_ID=$RANK_IDS
+    echo "Device ID: $RANK_ID"
+    # export RANK_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$RANK_ID
+    ASCEND_DEVICE_ID=$RANK_ID
     export DEVICE_ID=$ASCEND_DEVICE_ID
 	DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
 	export DEVICE_INDEX=$DEVICE_INDEX
@@ -139,8 +128,8 @@ do
 
     #执行训练脚本，需要模型审视修改
     corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l`
-    let a=RANK_IDS*${corenum}/8
-    let b=RANK_IDS+1
+    let a=RANK_ID*${corenum}/8
+    let b=RANK_ID+1
     let c=b*${corenum}/8-1
     if [ "x${bind_core}" != x ];then
         bind_core="taskset -c $a-$c"
@@ -169,7 +158,6 @@ wait
 #训练结束时间，不需要修改
 end_time=$(date +%s)
 e2e_time=$(( $end_time - $start_time ))
-sed -i 's/RANK_SIZES/RANK_SIZE/g' src/data_loader/resnet50/data_loader.py
 
 echo "------------------ Final result ------------------"
 #单step时长，需要从train_$ASCEND_DEVICE_ID.log里，通过关键字获取。需要模型审视修改
@@ -186,13 +174,20 @@ echo "E2E Training Duration sec : $e2e_time"
 #训练用例信息，不需要修改
 BatchSize=${batch_size}
 DeviceType=`uname -m`
-CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p'_'acc'
+CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
 
 ##获取性能数据
 #吞吐量，不需要修改
 ActualFPS=${FPS}
 #单迭代训练时长，不需要修改
-TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'`
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'`
+
+
+##获取性能数据，不需要修改
+#吞吐量
+ActualFPS=${e2e_time}
+#单迭代训练时长
+TrainingTime=${e2e_time}
 
 ##获取Loss，通过train_*.log中关键字，需要根据模型审视
 grep "total_loss:" $cur_path/output/0/train_0.log|awk '{print $9}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
@@ -202,7 +197,7 @@ ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseNam
 
 #关键信息打印到${CaseName}.log中，不需要修改
 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
 echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
 echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
 echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
-- 
Gitee


From 3b1f28697e02c2162378997e691d92429241342e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Wed, 22 Mar 2023 07:44:05 +0000
Subject: [PATCH 9/9] update
 /image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_full_8p.sh:
 use RANK_SIZE/RANK_ID and RANK_TABLE_FILE, drop CM_* env and sed renames
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 高兴成 <1358493914@qq.com>
---
 .../test/train_full_8p.sh                     | 41 +++++++------------
 1 file changed, 14 insertions(+), 27 deletions(-)

diff --git a/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_full_8p.sh
index 74ea8a80f..013de4755 100644
--- a/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_full_8p.sh
+++ b/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_full_8p.sh
@@ -5,8 +5,8 @@ cur_path=`pwd`
 
 #集合通信参数,不需要修改
 #保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下
-export RANK_SIZES=8
-#export RANK_TABLE_FILE=${cur_path}/../8p.json
+export RANK_SIZE=8
+export RANK_TABLE_FILE=${cur_path}/../8p.json
 export JOB_ID=10087
 RANK_ID_START=0
 
@@ -89,19 +89,9 @@ do
     elif [[ $para == --bind_core* ]]; then
         bind_core=`echo ${para#*=}`
         name_bind="_bindcore"
-    elif [[ $para == --one_node_ip* ]];then
-        one_node_ip=`echo ${para#*=}`
     fi
 done
 
-#8p训练必须参数（本机IP）
-one_node_ip=$one_node_ip
-#新增适配集群环境变量
-export CM_CHIEF_IP=${one_node_ip}   #主节点ip，所有服务器一致
-export CM_CHIEF_PORT=29688          #通信端口，所有服务器一致
-export CM_CHIEF_DEVICE=0            #配置为0，配置主卡，类似于主节点，所有服务器一致
-export CM_WORKER_SIZE=8             #卡数，单机为8，所有服务器一致
-export CM_WORKER_IP=${one_node_ip}  #当前服务器ip，不同环境ip不同
 #校验是否传入data_path,不需要修改
 if [[ $data_path == "" ]];then
     echo "[Error] para \"data_path\" must be confing"
@@ -118,17 +108,15 @@ fi
 #训练开始时间，不需要修改
 start_time=$(date +%s)
 
-sed -i 's/RANK_SIZE/RANK_SIZES/g' ../modelarts/start.py ../efficientnet/main_npu.py
-sed -i 's/RANK_ID/RANK_IDS/g' ../modelarts/start.py
 #进入训练脚本目录，需要模型审视修改
 cd $cur_path/../
-for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++));
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
 do
     #设置环境变量，不需要修改
-    echo "Device ID: $RANK_IDS"
-    export RANK_IDS=$RANK_IDS
-    export ASCEND_DEVICE_ID=$RANK_IDS
-    ASCEND_DEVICE_ID=$RANK_IDS
+    echo "Device ID: $RANK_ID"
+    export RANK_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$RANK_ID
+    ASCEND_DEVICE_ID=$RANK_ID
     
     #创建DeviceID输出目录，不需要修改
     if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
@@ -140,9 +128,9 @@ do
     
      # 绑核，不需要的绑核的模型删除，需要模型审视修改
     corenum=`cat /proc/cpuinfo |grep "processor"|wc -l`
-    let a=RANK_IDS*${corenum}/${RANK_SIZES}
-    let b=RANK_IDS+1
-    let c=b*${corenum}/${RANK_SIZES}-1
+    let a=RANK_ID*${corenum}/${RANK_SIZE}
+    let b=RANK_ID+1
+    let c=b*${corenum}/${RANK_SIZE}-1
 
     #执行训练脚本，以下传参不需要修改，其他需要模型审视修改
     #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path，--data_dump_flag，--data_dump_step，--data_dump_path，--profiling，--profiling_dump_path
@@ -165,8 +153,7 @@ wait
 #训练结束时间，不需要修改
 end_time=$(date +%s)
 e2e_time=$(( $end_time - $start_time ))
-sed -i 's/RANK_SIZES/RANK_SIZE/g' modelarts/start.py efficientnet/main_npu.py
-sed -i 's/RANK_IDS/RANK_ID/g' modelarts/start.py
+
 #结果打印，不需要修改
 echo "------------------ Final result ------------------"
 #输出性能FPS，需要模型审视修改
@@ -184,13 +171,13 @@ echo "E2E Training Duration sec : $e2e_time"
 #训练用例信息，不需要修改
 BatchSize=${batch_size}
 DeviceType=`uname -m`
-CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p'_'acc'
+CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
 
 ##获取性能数据
 #吞吐量，不需要修改
 ActualFPS=${FPS}
 #单迭代训练时长，不需要修改
-TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'`
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'`
 
 #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中，需要根据模型审视
 grep 'logger.py:54' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $8}' |awk -F ":" '{print $2}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
@@ -200,7 +187,7 @@ ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseNam
 
 #关键信息打印到${CaseName}.log中，不需要修改
 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
 echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
 echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
 echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
-- 
Gitee