From d2ecc5d03bb3d99956249b2ca416d2c9b588a64d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Tue, 18 Apr 2023 07:18:15 +0000
Subject: [PATCH 01/11] update
cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 高兴成 <1358493914@qq.com>
---
.../scripts/run_accuracy_8p.sh | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh
index 771f5fb92..95427b89c 100644
--- a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh
+++ b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh
@@ -9,8 +9,8 @@ mkdir -p ${upDir}/test/output/0
# user env
export JOB_ID=NPU20210126
-export RANK_SIZE=8
-export RANK_TABLE_FILE=${currentDir}/8p.json
+export RANK_SIZES=8
+#export RANK_TABLE_FILE=${currentDir}/8p.json
data_dir=$1
fold=$2
@@ -21,7 +21,7 @@ if [ x"${fold}" = x"all" ] ;
then
for device_index in ${device_group}
do
- RANK_ID=${device_index} ASCEND_DEVICE_ID=${device_index} ${currentDir}/train_accuracy_8p.sh ${data_dir} 0 &
+ RANK_IDS=${device_index} ASCEND_DEVICE_ID=${device_index} ${currentDir}/train_accuracy_8p.sh ${data_dir} 0 &
done
wait
@@ -32,7 +32,7 @@ else
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] fold$fold train start"
for device_index in ${device_group}
do
- RANK_ID=${device_index} ASCEND_DEVICE_ID=${device_index} ${currentDir}/train_accuracy_8p.sh ${data_dir} ${fold} &
+ RANK_IDS=${device_index} ASCEND_DEVICE_ID=${device_index} ${currentDir}/train_accuracy_8p.sh ${data_dir} ${fold} &
done
wait
--
Gitee
From 7f08b22a48c9180c23c5083b0252aea4f38cafb7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Tue, 18 Apr 2023 07:18:53 +0000
Subject: [PATCH 02/11] update
cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 高兴成 <1358493914@qq.com>
---
.../test/train_full_8p.sh | 30 ++++++++++++++-----
1 file changed, 22 insertions(+), 8 deletions(-)
diff --git a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh
index 02a6ba19f..b69de76e2 100644
--- a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh
+++ b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh
@@ -5,9 +5,9 @@ cur_path=`pwd`
export LANG=en_US.UTF-8
-export RANK_SIZE=8
+export RANK_SIZES=8
export JOB_ID=10087
-export RANK_TABLE_FILE=$cur_path/../scripts/8p.json
+#export RANK_TABLE_FILE=$cur_path/../scripts/8p.json
RANK_ID_START=0
@@ -71,9 +71,19 @@ do
cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/
elif [[ $para == --data_path* ]];then
data_path=`echo ${para#*=}`
+ elif [[ $para == --one_node_ip* ]];then
+ one_node_ip=`echo ${para#*=}`
fi
done
+#8p训练必须参数(本机IP)
+one_node_ip=$one_node_ip
+#新增适配集群环境变量
+export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致
+export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致
+export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致
+export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致
+export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同
#data_path='../'
#校验是否传入data_path,不需要修改
@@ -81,18 +91,23 @@ if [[ $data_path == "" ]];then
echo "[Error] para \"data_path\" must be confing"
exit 1
fi
-
+cd $cur_path/../
+sed -i 's/RANK_SIZE/RANK_SIZES/g' model/model_fn.py pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py
+sed -i 's/RANK_ID/RANK_IDS/g' pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py
cd $cur_path/../scripts
#训练开始时间,不需要修改
start_time=$(date +%s)
bash run_accuracy_8p.sh ${data_path} all
+wait
#训练结束时间,不需要修改
end_time=$(date +%s)
e2e_time=$(( $end_time - $start_time ))
-
+cd $cur_path/../
+sed -i 's/RANK_SIZES/RANK_SIZE/g' model/model_fn.py pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py
+sed -i 's/RANK_IDS/RANK_ID/g' pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py
sleep 30
train_accuracy=`grep -r "whole" $cur_path/output/0/train_0.log | awk '{print $6}'`
@@ -105,7 +120,6 @@ echo "E2E Training Duration sec : $e2e_time"
echo "------------------ Final result ------------------"
#输出性能FPS,需要模型审视修改
fps=`grep throughput_train $cur_path/output/0/train_0.log|awk -F 'throughput_train' '{print $2}'|awk -F ':' '{print $2}'|awk '{print $1}'`
-#FPS=`awk 'BEGIN{printf "%.2f\n",'${RANK_SIZE}'*'${fps}'}'`
FPS=1.5
#打印,不需要修改
echo "Final Performance images/sec : $FPS"
@@ -116,13 +130,13 @@ echo "E2E Training Duration sec : $e2e_time"
#训练用例信息,不需要修改
BatchSize=${batch_size}
DeviceType=`uname -m`
-CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'acc'
#获取性能数据,不需要修改
#吞吐量
ActualFPS=${FPS}
#单迭代训练时长
-TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'`
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'`
#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视
@@ -133,7 +147,7 @@ ActualLoss=`awk 'END {print}' $cur_path/output/0/train_${CaseName}_loss.txt`
#关键信息打印到${CaseName}.log中,不需要修改
echo "Network = ${Network}" > $cur_path/output/0/${CaseName}.log
-echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/0/${CaseName}.log
+echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/0/${CaseName}.log
echo "BatchSize = ${BatchSize}" >> $cur_path/output/0/${CaseName}.log
echo "DeviceType = ${DeviceType}" >> $cur_path/output/0/${CaseName}.log
echo "CaseName = ${CaseName}" >> $cur_path/output/0/${CaseName}.log
--
Gitee
From 7777c4e79acd77a4dce394f90b2e972e847a45d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Tue, 18 Apr 2023 07:51:06 +0000
Subject: [PATCH 03/11] update
TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/README.md.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 高兴成 <1358493914@qq.com>
---
.../MiniGo_ID0629_for_TensorFlow/README.md | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/README.md b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/README.md
index 44ac5abee..5340d2377 100644
--- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/README.md
+++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/README.md
@@ -195,6 +195,7 @@ pip3 install requirements.txt
4. 8卡训练
4.1 首先检查 MiniGo_ID0629_for_TensorFlow/test 目录下是否有存在8卡IP的json配置文件 "8p.json"
+ (export CMxxx环境变量方式拉起训练无需检查)
4.2 设置单卡训练参数(脚本位于./MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh),示例如下。
@@ -207,7 +208,9 @@ pip3 install requirements.txt
4.3 单卡训练指令(脚本位于./MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh)
```
- bash train_full_8p.sh --data_path=xx
+ bash train_full_8p.sh --data_path=xx                          # ranktable方式
+ bash train_full_8p.sh --data_path=xx --one_node_ip=本机IP      # 去ranktable,CM环境变量方式
+
数据集路径默认为 MiniGo_ID0629_for_TensorFlow/outputs/data/selfplay(即Step2自对弈的生成路径,不建议改动)
数据集应有如下结构(数据切分可能不同),配置data_path时需指定为selfplay这一层,例:--data_path=./outputs/data/selfplay
├─selfplay
--
Gitee
From ee2a67a3ce51ebaf2cc0666f1cc245b0dc5016d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Tue, 18 Apr 2023 07:54:49 +0000
Subject: [PATCH 04/11] update
built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 高兴成 <1358493914@qq.com>
---
.../test/train_full_8p.sh | 22 ++++++++++++++-----
1 file changed, 16 insertions(+), 6 deletions(-)
diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh
index 8cd78e2f2..37476144b 100644
--- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh
+++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh
@@ -43,7 +43,9 @@ do
name_bind="_bindcore"
elif [[ $para == --dynamic_input* ]];then
dynamic_input=`echo ${para#*=}`
- fi
+ elif [[ $para == --one_node_ip* ]];then
+ one_node_ip=`echo ${para#*=}`
+ fi
done
if [[ $data_path == "" ]];then
@@ -59,17 +61,25 @@ python3 bootstrap.py --work_dir=$cur_path/estimator_working_dir --export_path=$c
wait
export ASCEND_DEVICE_ID=0
-export RANK_SIZE=8
-export RANK_TABLE_FILE="${cur_path}/test/8p.json"
+export RANK_SIZES=8
+#export RANK_TABLE_FILE="${cur_path}/test/8p.json"
export JOB_ID=10086
+#8p训练必须参数(本机IP)
+one_node_ip=$one_node_ip
+#新增适配集群环境变量
+export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致
+export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致
+export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致
+export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致
+export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同
start=$(date +%s)
# 8P训练模式
for i in 0 1 2 3 4 5 6 7
do
#设置环境变量
- export RANK_ID=$i
+ export RANK_IDS=$i
export ASCEND_DEVICE_ID=$i
ASCEND_DEVICE_ID=$i
echo "Device ID: $ASCEND_DEVICE_ID"
@@ -83,8 +93,8 @@ do
echo $ASCEND_DEVICE_ID
#(Step3)训练
corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l`
- let a=RANK_ID*${corenum}/8
- let b=RANK_ID+1
+ let a=RANK_IDS*${corenum}/8
+ let b=RANK_IDS+1
let c=b*${corenum}/8-1
if [ "x${bind_core}" != x ];then
bind_core="taskset -c $a-$c"
--
Gitee
From bb1aeecd4e0556caccee77a0d8efd421ce1259bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Tue, 18 Apr 2023 08:07:25 +0000
Subject: [PATCH 05/11] update
TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/README.md.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 高兴成 <1358493914@qq.com>
---
.../image_classification/MiniGo_ID0629_for_TensorFlow/README.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/README.md b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/README.md
index 5340d2377..fe90d7ff1 100644
--- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/README.md
+++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/README.md
@@ -196,6 +196,8 @@ pip3 install requirements.txt
4.1 首先检查 MiniGo_ID0629_for_TensorFlow/test 目录下是否有存在8卡IP的json配置文件 "8p.json"
(export CMxxx环境变量方式拉起训练无需检查)
+ 环境变量方式拉起详情请参考:
+ https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/63RC1alpha002/tfmoddevg/tfmigr1/atlasmprtg_13_9036.html
4.2 设置单卡训练参数(脚本位于./MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh),示例如下。
--
Gitee
From 308badc8784d5dbe07f45d6e7381ba222121301d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Tue, 18 Apr 2023 08:08:10 +0000
Subject: [PATCH 06/11] update
TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/README.md.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 高兴成 <1358493914@qq.com>
---
.../MiniGo_ID0629_for_TensorFlow/README.md | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/README.md b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/README.md
index fe90d7ff1..9964bcbfe 100644
--- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/README.md
+++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/README.md
@@ -195,7 +195,8 @@ pip3 install requirements.txt
4. 8卡训练
4.1 首先检查 MiniGo_ID0629_for_TensorFlow/test 目录下是否有存在8卡IP的json配置文件 "8p.json"
- (export CMxxx环境变量方式拉起训练无需检查)
+ (export CMxxx环境变量方式拉起训练无需检查)
+
环境变量方式拉起详情请参考:
https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/63RC1alpha002/tfmoddevg/tfmigr1/atlasmprtg_13_9036.html
--
Gitee
From 7f3b6eda6fd6123fa0032cd7da1ddf8a47042e5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Tue, 18 Apr 2023 08:15:19 +0000
Subject: [PATCH 07/11] update
TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/README.md.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 高兴成 <1358493914@qq.com>
---
.../WideDeep_ID2712_for_TensorFlow/README.md | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/README.md b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/README.md
index c9e116c76..356608a38 100644
--- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/README.md
+++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/README.md
@@ -118,7 +118,14 @@ pip3 install requirements.txt
bash train_performance_1p.sh --data_path=/data (功能和性能)
bash train_full_1p.sh --data_path=/data (全量)
```
+- 8卡训练(通过 CM_xxx 环境变量方式拉起,无需配置 ranktable 文件)
+ 以数据集放在/data为例
+
+环境变量方式拉起详情请参考:
+https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/63RC1alpha002/tfmoddevg/tfmigr1/atlasmprtg_13_9036.html
+ bash train_performance_8p.sh --data_path=/data --one_node_ip=本机IP
+ bash train_full_8p.sh --data_path=/data --one_node_ip=本机IP
## 高级参考
#### 脚本和示例代码
--
Gitee
From bdaa24321fba1e3cfe021a8d73344dfce29931f9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Tue, 18 Apr 2023 08:17:26 +0000
Subject: [PATCH 08/11] update
TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 高兴成 <1358493914@qq.com>
---
.../test/train_full_8p.sh | 42 +++++++++++--------
1 file changed, 24 insertions(+), 18 deletions(-)
diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh
index e58e09555..7250053b3 100644
--- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh
+++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh
@@ -4,13 +4,11 @@
cur_path=`pwd`
#集合通信参数,不需要修改
-
-
export HCCL_CONNECT_TIMEOUT=1200
#集合通信参数,不需要修改
-export RANK_SIZE=8
-export RANK_TABLE_FILE=$cur_path/8p.json
+export RANK_SIZES=8
+#export RANK_TABLE_FILE=$cur_path/8p.json
export JOB_ID=10087
RANK_ID_START=0
ASCEND_DEVICE_ID_START=0
@@ -57,14 +55,19 @@ elif [[ $para == --over_dump* ]];then
mkdir -p ${over_dump_path}
elif [[ $para == --data_path* ]];then
data_path=`echo ${para#*=}`
+ elif [[ $para == --one_node_ip* ]];then
+ one_node_ip=`echo ${para#*=}`
fi
done
-#校验是否传入data_path,不需要修改
-if [[ $data_path == "" ]];then
- echo "[Error] para \"data_path\" must be confing"
- exit 1
-fi
+#8p训练必须参数(本机IP)
+one_node_ip=$one_node_ip
+#新增适配集群环境变量
+export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致
+export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致
+export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致
+export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致
+export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同
#校验是否传入data_path,不需要修改
if [[ $data_path == "" ]];then
@@ -102,6 +105,8 @@ sed -i "s%/npu/traindata/ID2940_CarPeting_TF_WideDeep_TF%${data_path}%p" configs
sed -i "s%./model%$cur_path/output/$ASCEND_DEVICE_ID/ckpt%p" configs/config.py
sed -i "s%display_step = 100%display_step = $display_step%p" configs/config.py
sed -i "s%n_epoches = 2%n_epoches = $n_epoches%p" configs/config.py
+sed -i 's/RANK_SIZE/RANK_SIZES/g' widedeep/WideDeep_fp16_huifeng.py
+sed -i 's/RANK_SIZE/RANK_SIZES/g' train.py
#echo `cat configs/config.py |uniq > configs/config.py; cp -f configs/config.py configs/config.py.run`
cp configs/config.py configs/config.py.run
@@ -109,13 +114,13 @@ cp configs/config.py configs/config.py.run
cd $cur_path/../
start=$(date +%s)
-for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++));
do
#设置环境变量,不需要修改
- echo "Device ID: $RANK_ID"
- export RANK_ID=$RANK_ID
- export ASCEND_DEVICE_ID=$RANK_ID
- ASCEND_DEVICE_ID=$RANK_ID
+ echo "Device ID: $RANK_IDS"
+ export RANK_IDS=$RANK_IDS
+ export ASCEND_DEVICE_ID=$RANK_IDS
+ ASCEND_DEVICE_ID=$RANK_IDS
if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then
rm -rf $cur_path/output/${ASCEND_DEVICE_ID}
mkdir -p $cur_path/output/${ASCEND_DEVICE_ID}
@@ -131,7 +136,8 @@ done
wait
end=$(date +%s)
e2e_time=$(( $end - $start ))
-
+sed -i 's/RANK_SIZES/RANK_SIZE/g' train.py
+sed -i 's/RANK_SIZES/RANK_SIZE/g' widedeep/WideDeep_fp16_huifeng.py
#配置文件恢复
mv -f configs/config.py.bak configs/config.py
@@ -155,9 +161,9 @@ echo "E2E Training Duration sec : $e2e_time"
BatchSize=${batch_size}
DeviceType=`uname -m`
if [[ $precision_mode == "must_keep_origin_dtype" ]];then
- CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'fp32'_'acc'
+ CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'fp32'_'acc'
else
- CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+ CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'acc'
fi
##获取性能数据
@@ -176,7 +182,7 @@ ActualLoss=`cat $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt |
#关键信息打印到${CaseName}.log中,不需要修改
echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
--
Gitee
From d064b833b66c1edfcb9c1c644e8456defb004d1f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Tue, 18 Apr 2023 08:18:37 +0000
Subject: [PATCH 09/11] update
TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/README.md.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 高兴成 <1358493914@qq.com>
---
.../recommendation/WideDeep_ID2712_for_TensorFlow/README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/README.md b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/README.md
index 356608a38..d506b9e46 100644
--- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/README.md
+++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/README.md
@@ -125,6 +125,7 @@ pip3 install requirements.txt
https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/63RC1alpha002/tfmoddevg/tfmigr1/atlasmprtg_13_9036.html
bash train_performance_8p.sh --data_path=/data --one_node_ip=本机IP
+
bash train_full_8p.sh --data_path=/data --one_node_ip=本机IP
## 高级参考
--
Gitee
From 21d4939ef8f0c3c54c257497a77288544806abd7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Tue, 18 Apr 2023 08:21:28 +0000
Subject: [PATCH 10/11] update
TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/README.md.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 高兴成 <1358493914@qq.com>
---
.../image_segmentation/UNet3D_ID0057_for_TensorFlow/README.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/README.md b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/README.md
index 1209d2ef9..b13088414 100644
--- a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/README.md
+++ b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/README.md
@@ -181,6 +181,8 @@ run_config = NPURunConfig(
bash run_accuracy_8p.sh --data_dir=/data/BraTS2019Train
```
+ 环境变量方式拉起详情请参考:
+ https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/63RC1alpha002/tfmoddevg/tfmigr1/atlasmprtg_13_9036.html
## 高级参考
#### 脚本和示例代码
--
Gitee
From 302217d887a2836d0e6bf0c02a18a68609af1daf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com>
Date: Tue, 18 Apr 2023 08:36:48 +0000
Subject: [PATCH 11/11] update
built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 高兴成 <1358493914@qq.com>
---
.../test/train_full_8p.sh | 42 ++++++++++++-------
1 file changed, 27 insertions(+), 15 deletions(-)
diff --git a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh
index 254cad2ad..d00a27957 100644
--- a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh
+++ b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh
@@ -5,8 +5,8 @@ cur_path=`pwd`
#集合通信参数,不需要修改
#保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下
-export RANK_SIZE=8
-export RANK_TABLE_FILE=${cur_path}/../configs/8p.json
+export RANK_SIZES=8
+#export RANK_TABLE_FILE=${cur_path}/../configs/8p.json
export JOB_ID=10087
RANK_ID_START=0
@@ -21,7 +21,7 @@ Network="ResNet50_ID0058_for_TensorFlow"
export HCCL_CONNECT_TIMEOUT=600
corenum=`cat /proc/cpuinfo |grep "processor"|wc -l`
export RANK_INDEX=0
-export RANK_ID=0
+export RANK_IDS=0
config_file=res50_256bs_8p_eval
iterations_per_loop=100
@@ -86,9 +86,20 @@ do
elif [[ $para == --bind_core* ]]; then
bind_core=`echo ${para#*=}`
name_bind="_bindcore"
+ elif [[ $para == --one_node_ip* ]];then
+ one_node_ip=`echo ${para#*=}`
fi
done
+#8p训练必须参数(本机IP)
+one_node_ip=$one_node_ip
+#新增适配集群环境变量
+export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致
+export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致
+export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致
+export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致
+export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同
+
#校验是否传入data_path,不需要修改
if [[ $data_path == "" ]];then
echo "[Error] para \"data_path\" must be confing"
@@ -104,16 +115,16 @@ fi
#训练开始时间,不需要修改
start_time=$(date +%s)
-
+sed -i 's/RANK_SIZE/RANK_SIZES/g' ../src/data_loader/resnet50/data_loader.py
#进入训练脚本目录,需要模型审视修改
cd $cur_path/..
-for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++));
do
#设置环境变量,不需要修改
- echo "Device ID: $RANK_ID"
- # export RANK_ID=$RANK_ID
- export ASCEND_DEVICE_ID=$RANK_ID
- ASCEND_DEVICE_ID=$RANK_ID
+ echo "Device ID: $RANK_IDS"
+ # export RANK_IDS=$RANK_IDS
+ export ASCEND_DEVICE_ID=$RANK_IDS
+ ASCEND_DEVICE_ID=$RANK_IDS
export DEVICE_ID=$ASCEND_DEVICE_ID
DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
export DEVICE_INDEX=$DEVICE_INDEX
@@ -128,8 +139,8 @@ do
#执行训练脚本,需要模型审视修改
corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l`
- let a=RANK_ID*${corenum}/8
- let b=RANK_ID+1
+ let a=RANK_IDS*${corenum}/8
+ let b=RANK_IDS+1
let c=b*${corenum}/8-1
if [ "x${bind_core}" != x ];then
bind_core="taskset -c $a-$c"
@@ -158,6 +169,7 @@ wait
#训练结束时间,不需要修改
end_time=$(date +%s)
e2e_time=$(( $end_time - $start_time ))
+sed -i 's/RANK_SIZES/RANK_SIZE/g' src/data_loader/resnet50/data_loader.py
echo "------------------ Final result ------------------"
#单step时长,需要从train_$ASCEND_DEVICE_ID.log里,通过关键字获取。需要模型审视修改
@@ -174,13 +186,13 @@ echo "E2E Training Duration sec : $e2e_time"
#训练用例信息,不需要修改
BatchSize=${batch_size}
DeviceType=`uname -m`
-CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p'_'acc'
##获取性能数据
#吞吐量,不需要修改
ActualFPS=${FPS}
#单迭代训练时长,不需要修改
-TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'`
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'`
##获取性能数据,不需要修改
@@ -197,7 +209,7 @@ ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseNam
#关键信息打印到${CaseName}.log中,不需要修改
echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
@@ -205,4 +217,4 @@ echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName
echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
--
Gitee