From 6b8641e36555a692db5884b853940d7fb5fc72e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:29:33 +0000 Subject: [PATCH 01/22] update image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_RT2_performance_8p.sh | 40 ++++++++++--------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh index ae8079f3c..e17f87b2d 100644 --- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh @@ -4,7 +4,6 @@ cur_path=`pwd`/../ rm -f $cur_path/outputs/models/* rm -f $cur_path/estimator_working_dir/* -export ENABLE_RUNTIME_V2=1 #基础参数,需要模型审视修改 #Batch Size batch_size=128 @@ -15,7 +14,7 @@ RankSize=8 #训练epoch,可选 train_epochs= #训练step -train_steps=500 +train_steps=80000 #学习率 learning_rate= #动态输入模式,不需要修改 @@ -42,11 +41,22 @@ do elif [[ $para == --bind_core* ]]; then bind_core=`echo ${para#*=}` name_bind="_bindcore" - elif [[ $para == --dynamic_input* ]];then + elif [[ $para == --dynamic_input* ]];then dynamic_input=`echo ${para#*=}` - fi + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` + fi done +#8p训练必须参数(本机IP) +one_node_ip=$one_node_ip +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 +export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 + if [[ $data_path == "" ]];then echo "[Error] para 
\"data_path\" must be config" exit 1 @@ -60,8 +70,8 @@ python3 bootstrap.py --work_dir=$cur_path/estimator_working_dir --export_path=$c wait export ASCEND_DEVICE_ID=0 -export RANK_SIZE=8 -export RANK_TABLE_FILE="${cur_path}/test/8p.json" +export RANK_SIZES=8 +#export RANK_TABLE_FILE="${cur_path}/test/8p.json" export JOB_ID=10086 start=$(date +%s) @@ -70,7 +80,7 @@ start=$(date +%s) for i in 0 1 2 3 4 5 6 7 do #设置环境变量 - export RANK_ID=$i + export RANK_IDS=$i export ASCEND_DEVICE_ID=$i ASCEND_DEVICE_ID=$i echo "Device ID: $ASCEND_DEVICE_ID" @@ -84,21 +94,13 @@ do echo $ASCEND_DEVICE_ID #(Step3)训练 corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l` - let a=RANK_ID*${corenum}/8 - let b=RANK_ID+1 + let a=RANK_IDS*${corenum}/8 + let b=RANK_IDS+1 let c=b*${corenum}/8-1 if [ "x${bind_core}" != x ];then bind_core="taskset -c $a-$c" fi - #${bind_core} python3 train.py --training_data_path=$data_path --steps_to_train=$train_steps --train_batch_size=$batch_size --work_dir=$cur_path/estimator_working_dir --export_path=$cur_path/outputs/models/000001-first_generation > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & - ${bind_core} python3 train.py \ - --training_data_path=$data_path \ - --steps_to_train=$train_steps \ - --train_batch_size=$batch_size \ - --work_dir=$cur_path/estimator_working_dir \ - --export_path=$cur_path/outputs/models/000001-first_generation \ - --dynamic_input=${dynamic_input} \ - --jit_compile=False > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & + ${bind_core} python3 train.py --training_data_path=$data_path --steps_to_train=$train_steps --train_batch_size=$batch_size --work_dir=$cur_path/estimator_working_dir --export_path=$cur_path/outputs/models/000001-first_generation --dynamic_input=${dynamic_input}> $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & done wait @@ -114,7 +116,7 @@ BatchSize=${batch_size} #设备类型,自动获取 DeviceType=`uname -m` #用例名称,自动获取 
-CaseName=${Network}_bs${BatchSize}_${RankSize}'p_RT2_perf' +CaseName=${Network}_bs${BatchSize}_${RankSize}'p'_'acc' #获取性能 TrainingTime=`grep "tensorflow:global_step/sec" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` -- Gitee From df5992ef98fc82fab4aff8f079215d48c38ba33e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:30:09 +0000 Subject: [PATCH 02/22] update image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_full_8p.sh | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh index 8cd78e2f2..e17f87b2d 100644 --- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh @@ -43,9 +43,20 @@ do name_bind="_bindcore" elif [[ $para == --dynamic_input* ]];then dynamic_input=`echo ${para#*=}` - fi + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` + fi done +#8p训练必须参数(本机IP) +one_node_ip=$one_node_ip +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 +export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 + if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be config" exit 1 @@ -59,8 +70,8 @@ python3 bootstrap.py --work_dir=$cur_path/estimator_working_dir --export_path=$c wait export ASCEND_DEVICE_ID=0 -export RANK_SIZE=8 -export 
RANK_TABLE_FILE="${cur_path}/test/8p.json" +export RANK_SIZES=8 +#export RANK_TABLE_FILE="${cur_path}/test/8p.json" export JOB_ID=10086 start=$(date +%s) @@ -69,7 +80,7 @@ start=$(date +%s) for i in 0 1 2 3 4 5 6 7 do #设置环境变量 - export RANK_ID=$i + export RANK_IDS=$i export ASCEND_DEVICE_ID=$i ASCEND_DEVICE_ID=$i echo "Device ID: $ASCEND_DEVICE_ID" @@ -83,8 +94,8 @@ do echo $ASCEND_DEVICE_ID #(Step3)训练 corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l` - let a=RANK_ID*${corenum}/8 - let b=RANK_ID+1 + let a=RANK_IDS*${corenum}/8 + let b=RANK_IDS+1 let c=b*${corenum}/8-1 if [ "x${bind_core}" != x ];then bind_core="taskset -c $a-$c" -- Gitee From 6fe9c5940fd3466286881ef6ba1c1ef45f86e040 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:30:36 +0000 Subject: [PATCH 03/22] update /image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_RT2_performance_8p.sh | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh index e17f87b2d..bf5829ca4 100644 --- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh @@ -4,6 +4,7 @@ cur_path=`pwd`/../ rm -f $cur_path/outputs/models/* rm -f $cur_path/estimator_working_dir/* +export ENABLE_RUNTIME_V2=1 #基础参数,需要模型审视修改 #Batch Size batch_size=128 @@ -14,7 +15,7 @@ RankSize=8 #训练epoch,可选 train_epochs= #训练step -train_steps=80000 +train_steps=500 #学习率 learning_rate= #动态输入模式,不需要修改 @@ -41,9 +42,9 @@ do 
elif [[ $para == --bind_core* ]]; then bind_core=`echo ${para#*=}` name_bind="_bindcore" - elif [[ $para == --dynamic_input* ]];then + elif [[ $para == --dynamic_input* ]];then dynamic_input=`echo ${para#*=}` - elif [[ $para == --one_node_ip* ]];then + elif [[ $para == --one_node_ip* ]];then one_node_ip=`echo ${para#*=}` fi done @@ -100,7 +101,15 @@ do if [ "x${bind_core}" != x ];then bind_core="taskset -c $a-$c" fi - ${bind_core} python3 train.py --training_data_path=$data_path --steps_to_train=$train_steps --train_batch_size=$batch_size --work_dir=$cur_path/estimator_working_dir --export_path=$cur_path/outputs/models/000001-first_generation --dynamic_input=${dynamic_input}> $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & + #${bind_core} python3 train.py --training_data_path=$data_path --steps_to_train=$train_steps --train_batch_size=$batch_size --work_dir=$cur_path/estimator_working_dir --export_path=$cur_path/outputs/models/000001-first_generation > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & + ${bind_core} python3 train.py \ + --training_data_path=$data_path \ + --steps_to_train=$train_steps \ + --train_batch_size=$batch_size \ + --work_dir=$cur_path/estimator_working_dir \ + --export_path=$cur_path/outputs/models/000001-first_generation \ + --dynamic_input=${dynamic_input} \ + --jit_compile=False > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & done wait @@ -116,7 +125,7 @@ BatchSize=${batch_size} #设备类型,自动获取 DeviceType=`uname -m` #用例名称,自动获取 -CaseName=${Network}_bs${BatchSize}_${RankSize}'p'_'acc' +CaseName=${Network}_bs${BatchSize}_${RankSize}'p_RT2_perf' #获取性能 TrainingTime=`grep "tensorflow:global_step/sec" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` -- Gitee From 5c140fe6e7b7ab2c1d1de04a919742d23443730e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:30:59 
+0000 Subject: [PATCH 04/22] update image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_16p.sh. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_performance_16p.sh | 50 +++++++++++++------ 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_16p.sh b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_16p.sh index d84d55ac3..f997f2418 100644 --- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_16p.sh +++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_16p.sh @@ -49,20 +49,40 @@ do conf_path=`echo ${para#*=}` elif [[ $para == --devices_num* ]];then devices_num=`echo ${para#*=}` - fi + elif [[ $para == --servers_num* ]];then + servers_num=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` + fi done -one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` -linux_num=`find $conf_path -name "server_*.info" |wc -l` + +linux_num=$servers_num + +if [[ $conf_path == "" ]];then + fix_node_ip=$fix_node_ip + one_node_ip=$one_node_ip +else + one_node_ip=`find $conf_path -name "server_*_0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +fi + +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=16 #卡数,单机为8,多机为8n,所有服务器一致 +export CM_WORKER_IP=${fix_node_ip} #当前服务器ip,不同环境ip不同 + if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be config" exit 1 fi - -export RANK_SIZE=`awk 'BEGIN{printf 
"%.0f\n",'${devices_num}'*'${linux_num}'}'` +export RANK_SIZES=`awk 'BEGIN{printf "%.0f\n",'${devices_num}'*'${linux_num}'}'` rank_size=8 -nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path -wait -export RANK_TABLE_FILE=$cur_path/test/rank_table.json +#nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path +#wait +#export RANK_TABLE_FILE=$cur_path/test/rank_table.json export JOB_ID=10087 export DEVICE_INDEX=0 @@ -77,13 +97,13 @@ wait start=$(date +%s) # 8P训练模式 -for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++)); +for((RANK_IDS=$((rank_size*server_index));RANK_IDS<$((((server_index+1))*rank_size));RANK_IDS++)); do #设置环境变量 - export ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` - ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` - export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` + ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` + export RANK_IDS=$RANK_IDS echo "Device ID: $ASCEND_DEVICE_ID" if [ -d $cur_path/test/output/$ASCEND_DEVICE_ID ];then @@ -95,8 +115,8 @@ do echo $ASCEND_DEVICE_ID #(Step3)训练 corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l` - let a=RANK_ID*${corenum}/8 - let b=RANK_ID+1 + let a=RANK_IDS*${corenum}/8 + let b=RANK_IDS+1 let c=b*${corenum}/8-1 if [ "x${bind_core}" != x ];then bind_core="taskset -c $a-$c" @@ -146,4 +166,4 @@ echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${Cas echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainAccuracy = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "E2ETrainingTime = ${e2etime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DynamicInput = ${dynamic_input}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DynamicInput = ${dynamic_input}" >> 
$cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From 7283df6abc8e1ea362893ddf50a215bda0a4f385 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:31:21 +0000 Subject: [PATCH 05/22] update /image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_8p.sh. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_performance_8p.sh | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_8p.sh b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_8p.sh index d52fd0d57..f280a3168 100644 --- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_8p.sh @@ -43,9 +43,20 @@ do name_bind="_bindcore" elif [[ $para == --dynamic_input* ]];then dynamic_input=`echo ${para#*=}` - fi + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` + fi done +#8p训练必须参数(本机IP) +one_node_ip=$one_node_ip +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 +export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 + if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be config" exit 1 @@ -59,8 +70,8 @@ python3 bootstrap.py --work_dir=$cur_path/estimator_working_dir --export_path=$c wait export ASCEND_DEVICE_ID=0 -export RANK_SIZE=8 -export RANK_TABLE_FILE="${cur_path}/test/8p.json" +export RANK_SIZES=8 +#export RANK_TABLE_FILE="${cur_path}/test/8p.json" export JOB_ID=10086 start=$(date +%s) @@ 
-69,7 +80,7 @@ start=$(date +%s) for i in 0 1 2 3 4 5 6 7 do #设置环境变量 - export RANK_ID=$i + export RANK_IDS=$i export ASCEND_DEVICE_ID=$i ASCEND_DEVICE_ID=$i echo "Device ID: $ASCEND_DEVICE_ID" @@ -83,8 +94,8 @@ do echo $ASCEND_DEVICE_ID #(Step3)训练 corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l` - let a=RANK_ID*${corenum}/8 - let b=RANK_ID+1 + let a=RANK_IDS*${corenum}/8 + let b=RANK_IDS+1 let c=b*${corenum}/8-1 if [ "x${bind_core}" != x ];then bind_core="taskset -c $a-$c" -- Gitee From b063b2a07178520f4db0bbb776fe062cf000d59d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:32:29 +0000 Subject: [PATCH 06/22] update TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_full_8p.sh | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh index 02a6ba19f..b69de76e2 100644 --- a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh @@ -5,9 +5,9 @@ cur_path=`pwd` export LANG=en_US.UTF-8 -export RANK_SIZE=8 +export RANK_SIZES=8 export JOB_ID=10087 -export RANK_TABLE_FILE=$cur_path/../scripts/8p.json +#export RANK_TABLE_FILE=$cur_path/../scripts/8p.json RANK_ID_START=0 @@ -71,9 +71,19 @@ do cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ elif [[ $para == --data_path* ]];then data_path=`echo ${para#*=}` + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +#8p训练必须参数(本机IP) +one_node_ip=$one_node_ip 
+#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 +export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 #data_path='../' #校验是否传入data_path,不需要修改 @@ -81,18 +91,23 @@ if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" exit 1 fi - +cd $cur_path/../ +sed -i 's/RANK_SIZE/RANK_SIZES/g' model/model_fn.py pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py +sed -i 's/RANK_ID/RANK_IDS/g' pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py cd $cur_path/../scripts #训练开始时间,不需要修改 start_time=$(date +%s) bash run_accuracy_8p.sh ${data_path} all +wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) - +cd $cur_path/../ +sed -i 's/RANK_SIZES/RANK_SIZE/g' model/model_fn.py pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py +sed -i 's/RANK_IDS/RANK_ID/g' pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py sleep 30 train_accuracy=`grep -r "whole" $cur_path/output/0/train_0.log | awk '{print $6}'` @@ -105,7 +120,6 @@ echo "E2E Training Duration sec : $e2e_time" echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 fps=`grep throughput_train $cur_path/output/0/train_0.log|awk -F 'throughput_train' '{print $2}'|awk -F ':' '{print $2}'|awk '{print $1}'` -#FPS=`awk 'BEGIN{printf "%.2f\n",'${RANK_SIZE}'*'${fps}'}'` FPS=1.5 #打印,不需要修改 echo "Final Performance images/sec : $FPS" @@ -116,13 +130,13 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' +CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'acc' #获取性能数据,不需要修改 #吞吐量 ActualFPS=${FPS} #单迭代训练时长 
-TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 @@ -133,7 +147,7 @@ ActualLoss=`awk 'END {print}' $cur_path/output/0/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/0/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/0/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/0/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/0/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/0/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/0/${CaseName}.log -- Gitee From 0f7c600f20592304972206a6dd7f65de33883dd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:32:50 +0000 Subject: [PATCH 07/22] update built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_performance_16np.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_performance_16np.sh | 81 +++++++++++-------- 1 file changed, 48 insertions(+), 33 deletions(-) diff --git a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_performance_16np.sh b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_performance_16np.sh index 9c97ecaf6..2e4325619 100644 --- a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_performance_16np.sh +++ b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_performance_16np.sh @@ -59,26 +59,42 @@ do devices_num=`echo ${para#*=}` elif [[ $para == --servers_num* ]];then servers_num=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done - linux_num=$servers_num +if [[ $conf_path == "" ]];then + fix_node_ip=$fix_node_ip + one_node_ip=$one_node_ip +else + one_node_ip=`find $conf_path -name "server_*_0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +fi + +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=`awk 'BEGIN{printf "%.0f\n",'8'*'${linux_num}'}'` #卡数,单机为8,多机为8n,所有服务器一致 +export CM_WORKER_IP=${fix_node_ip} #当前服务器ip,不同环境ip不同 #export ASCEND_SLOG_PRINT_TO_STDOUT=1 -export RANK_SIZE=`awk 'BEGIN{printf "%.0f\n",'${devices_num}'*'${linux_num}'}'` +export RANK_SIZES=`awk 'BEGIN{printf "%.0f\n",'${devices_num}'*'${linux_num}'}'` rank_size=8 -if [[ $conf_path != "" ]];then - nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path - -fi - -wait -export RANK_TABLE_FILE=$cur_path/rank_table.json +#if [[ $conf_path != "" ]];then +# nohup python3 set_ranktable.py 
--npu_nums=$linux_num --conf_path=$conf_path +# +#fi +# +#wait +#export RANK_TABLE_FILE=$cur_path/rank_table.json export HCCL_CONNECT_TIMEOUT=600 RANK_ID_START=0 #设置默认日志级别,不需要修改 -#export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP=1 +#export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP_ETP=1 #基础参数,需要模型审视修改 #网络名称,同目录名称 @@ -106,26 +122,27 @@ if [[ $data_path == "" ]];then fi cd $cur_path/../ - +sed -i 's/RANK_SIZE/RANK_SIZES/g' model/model_fn.py pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py +sed -i 's/RANK_ID/RANK_IDS/g' pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py #训练开始时间,不需要修改 start_time=$(date +%s) bind_core=1 exec_mode='train' #进入训练脚本目录,需要模型审视修改 #for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); -for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++)); +for((RANK_IDS=$((rank_size*server_index));RANK_IDS<$((((server_index+1))*rank_size));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` - ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + echo "Device ID: $RANK_IDS" + export RANK_IDS=$RANK_IDS + export ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` + ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` # export DEVICE_ID=${ASCEND_DEVICE_ID} # echo 'DEVICE_ID: '$ASCEND_DEVICE_ID - RANK_ID_core=$RANK_ID + RANK_ID_core=$RANK_IDS - export DEVICE_ID=$RANK_ID - DEVICE_INDEX=$RANK_ID + export DEVICE_ID=$RANK_IDS + DEVICE_INDEX=$RANK_IDS export DEVICE_INDEX=${DEVICE_INDEX} # #创建DeviceID输出目录,不需要修改 @@ -136,11 +153,11 @@ do # mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt # fi - if [ -d ${cur_path}/output/${RANK_ID} ];then - rm -rf ${cur_path}/output/${RANK_ID} - mkdir -p ${cur_path}/output/${RANK_ID}/ckpt + if [ -d ${cur_path}/output/${RANK_IDS} ];then + rm 
-rf ${cur_path}/output/${RANK_IDS} + mkdir -p ${cur_path}/output/${RANK_IDS}/ckpt else - mkdir -p ${cur_path}/output/${RANK_ID}/ckpt + mkdir -p ${cur_path}/output/${RANK_IDS}/ckpt fi # if [ ${RANK_ID_core} -gt 7 ];then @@ -162,14 +179,14 @@ do #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune nohup python3 main_npu.py --data_dir=$data_path \ - --model_dir=$cur_path/output/${RANK_ID} \ + --model_dir=$cur_path/output/${RANK_IDS} \ --exec_mode=${exec_mode} \ --npu_loss_scale=1048576 \ --max_steps=$train_steps \ --benchmark \ --fold=0 \ --batch_size=$batch_size \ - --augment > ${cur_path}/output/${RANK_ID}/train_${RANK_ID}.log 2>&1 & + --augment > ${cur_path}/output/${RANK_IDS}/train_${RANK_IDS}.log 2>&1 & done wait @@ -177,8 +194,8 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) - - +sed -i 's/RANK_SIZES/RANK_SIZE/g' model/model_fn.py pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py +sed -i 's/RANK_IDS/RANK_ID/g' pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py #结果打印,不需要修改 echo "------------------ Final result ------------------" @@ -194,13 +211,13 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'perf' #获取性能数据,不需要修改 #吞吐量 ActualFPS=${FPS} #单迭代训练时长 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 @@ -212,7 +229,7 @@ ActualLoss=None #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > 
$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log @@ -226,7 +243,7 @@ log_path=$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log if [ ! -f ${log_path} ];then ASCEND_DEVICE_ID=15 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log - echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log @@ -235,5 +252,3 @@ if [ ! -f ${log_path} ];then echo "TrainingTime = 197.41" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "E2ETrainingTime = 386" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log fi - - -- Gitee From 8e8d12837d838403492fa1262d51e8635248dc86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:33:13 +0000 Subject: [PATCH 08/22] update /built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_performance_16p.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_performance_16p.sh | 76 +++++++++++-------- 1 file changed, 46 insertions(+), 30 deletions(-) diff --git a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_performance_16p.sh b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_performance_16p.sh index 84268dea3..ed0b2b15e 100644 --- a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_performance_16p.sh +++ b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_performance_16p.sh @@ -2,7 +2,7 @@ #当前路径,不需要修改 cur_path=`pwd` #集合通信参数,不需要修改 -source /usr/local/Ascend/CANN-1.81/bin/setenv.bash +#source /usr/local/Ascend/CANN-1.81/bin/setenv.bash # 数据集路径,保持为空,不需要修改 data_path="" @@ -55,26 +55,42 @@ do server_index=`echo ${para#*=}` elif [[ $para == --conf_path* ]];then conf_path=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +if [[ $conf_path == "" ]];then + fix_node_ip=$fix_node_ip + one_node_ip=$one_node_ip +else + one_node_ip=`find $conf_path -name "server_*_0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +fi + +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=16 #卡数,单机为8,多机为8n,所有服务器一致 +export CM_WORKER_IP=${fix_node_ip} #当前服务器ip,不同环境ip不同 #export ASCEND_SLOG_PRINT_TO_STDOUT=1 -export RANK_SIZE=16 +export RANK_SIZES=16 export JOB_ID=10087 rank_size=8 -if [[ $conf_path != "" ]];then - nohup python3 $cur_path/set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path -fi - -export RANK_TABLE_FILE=$cur_path/rank_table.json +#if [[ $conf_path != "" ]];then +# nohup python3 
$cur_path/set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path +#fi +# +#export RANK_TABLE_FILE=$cur_path/rank_table.json export HCCL_CONNECT_TIMEOUT=600 RANK_ID_START=0 -RANK_SIZE=16 #设置默认日志级别,不需要修改 -#export ASCEND_GLOBAL_LOG_LEVEL_ETP=1 +#export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP=1 #基础参数,需要模型审视修改 #网络名称,同目录名称 @@ -102,26 +118,27 @@ if [[ $data_path == "" ]];then fi cd $cur_path/../ - +sed -i 's/RANK_SIZE/RANK_SIZES/g' model/model_fn.py pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py +sed -i 's/RANK_ID/RANK_IDS/g' pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py #训练开始时间,不需要修改 start_time=$(date +%s) bind_core=1 exec_mode='train' #进入训练脚本目录,需要模型审视修改 #for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); -for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++)); +for((RANK_IDS=$((rank_size*server_index));RANK_IDS<$((((server_index+1))*rank_size));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` - ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + echo "Device ID: $RANK_IDS" + export RANK_IDS=$RANK_IDS + export ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` + ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` # export DEVICE_ID=${ASCEND_DEVICE_ID} # echo 'DEVICE_ID: '$ASCEND_DEVICE_ID - RANK_ID_core=$RANK_ID + RANK_ID_core=$RANK_IDS - export DEVICE_ID=$RANK_ID - DEVICE_INDEX=$RANK_ID + export DEVICE_ID=$RANK_IDS + DEVICE_INDEX=$RANK_IDS export DEVICE_INDEX=${DEVICE_INDEX} # #创建DeviceID输出目录,不需要修改 @@ -132,11 +149,11 @@ do # mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt # fi - if [ -d ${cur_path}/output/${RANK_ID} ];then - rm -rf ${cur_path}/output/${RANK_ID} - mkdir -p ${cur_path}/output/${RANK_ID}/ckpt + if [ -d 
${cur_path}/output/${RANK_IDS} ];then + rm -rf ${cur_path}/output/${RANK_IDS} + mkdir -p ${cur_path}/output/${RANK_IDS}/ckpt else - mkdir -p ${cur_path}/output/${RANK_ID}/ckpt + mkdir -p ${cur_path}/output/${RANK_IDS}/ckpt fi # if [ ${RANK_ID_core} -gt 7 ];then @@ -158,14 +175,14 @@ do #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune nohup python3 main_npu.py --data_dir=$data_path \ - --model_dir=$cur_path/output/${RANK_ID} \ + --model_dir=$cur_path/output/${RANK_IDS} \ --exec_mode=${exec_mode} \ --npu_loss_scale=1048576 \ --max_steps=$train_steps \ --benchmark \ --fold=0 \ --batch_size=$batch_size \ - --augment > ${cur_path}/output/${RANK_ID}/train_${RANK_ID}.log 2>&1 & + --augment > ${cur_path}/output/${RANK_IDS}/train_${RANK_IDS}.log 2>&1 & done wait @@ -173,14 +190,14 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' model/model_fn.py pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py +sed -i 's/RANK_IDS/RANK_ID/g' pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 FPS=`grep throughput_train $cur_path/output/0/train_0.log|awk -F 'throughput_train' '{print $2}'|awk -F ':' '{print $2}'|awk '{print $1}'` -#FPS=`awk 'BEGIN{printf "%.2f\n",'${RANK_SIZE}'*'${fps}'}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" echo "E2E Training Duration sec : $e2e_time" @@ -190,13 +207,13 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'perf' #获取性能数据,不需要修改 #吞吐量 ActualFPS=${FPS} #单迭代训练时长 
-TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 @@ -208,7 +225,7 @@ ActualLoss=None #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log @@ -231,4 +248,3 @@ if [ ! -f ${log_path} ];then echo "TrainingTime = 197.41" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "E2ETrainingTime = 386" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log fi - -- Gitee From d63a33f7092168347ab544a9343d52aff1fa9ca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:33:39 +0000 Subject: [PATCH 09/22] update /built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_performance_8p.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_performance_8p.sh | 49 ++++++++++++------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_performance_8p.sh b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_performance_8p.sh index 0f3ab2764..991fbee7a 100644 --- a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_performance_8p.sh +++ b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_performance_8p.sh @@ -2,20 +2,19 @@ #当前路径,不需要修改 cur_path=`pwd` #集合通信参数,不需要修改 -source /usr/local/Ascend/CANN-1.81/bin/setenv.bash +#source /usr/local/Ascend/CANN-1.81/bin/setenv.bash #export ASCEND_SLOG_PRINT_TO_STDOUT=1 -export RANK_SIZE=8 +export RANK_SIZES=8 export JOB_ID=10087 -export RANK_TABLE_FILE=$cur_path/../scripts/8p.json +#export RANK_TABLE_FILE=$cur_path/../scripts/8p.json RANK_ID_START=0 -RANK_SIZE=8 # 数据集路径,保持为空,不需要修改 data_path="" #设置默认日志级别,不需要修改 -#export ASCEND_GLOBAL_LOG_LEVEL_ETP=1 +#export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP=1 #基础参数,需要模型审视修改 #网络名称,同目录名称 @@ -74,9 +73,19 @@ do cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ elif [[ $para == --data_path* ]];then data_path=`echo ${para#*=}` + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +#8p训练必须参数(本机IP) +one_node_ip=$one_node_ip +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 +export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 #data_path='../' #校验是否传入data_path,不需要修改 @@ -86,25 +95,27 @@ if [[ $data_path == "" ]];then fi cd $cur_path/../ +sed -i 's/RANK_SIZE/RANK_SIZES/g' model/model_fn.py 
pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py +sed -i 's/RANK_ID/RANK_IDS/g' pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py #训练开始时间,不需要修改 start_time=$(date +%s) bind_core=1 exec_mode='train' #进入训练脚本目录,需要模型审视修改 -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=`expr ${RANK_ID} - ${RANK_ID_START}` - ASCEND_DEVICE_ID=`expr ${RANK_ID} - ${RANK_ID_START}` + echo "Device ID: $RANK_IDS" + export RANK_IDS=$RANK_IDS + export ASCEND_DEVICE_ID=`expr ${RANK_IDS} - ${RANK_ID_START}` + ASCEND_DEVICE_ID=`expr ${RANK_IDS} - ${RANK_ID_START}` export DEVICE_ID=${ASCEND_DEVICE_ID} echo 'DEVICE_ID: '$ASCEND_DEVICE_ID - RANK_ID_core=$RANK_ID + RANK_ID_core=$RANK_IDS - export DEVICE_ID=$RANK_ID - DEVICE_INDEX=$RANK_ID + export DEVICE_ID=$RANK_IDS + DEVICE_INDEX=$RANK_IDS export DEVICE_INDEX=${DEVICE_INDEX} #创建DeviceID输出目录,不需要修改 @@ -149,14 +160,14 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' model/model_fn.py pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py +sed -i 's/RANK_IDS/RANK_ID/g' pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 FPS=`grep throughput_train $cur_path/output/0/train_0.log|awk -F 'throughput_train' '{print $2}'|awk -F ':' '{print $2}'|awk '{print $1}'` -#FPS=`awk 'BEGIN{printf "%.2f\n",'${RANK_SIZE}'*'${fps}'}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" echo "E2E Training Duration sec : $e2e_time" @@ -166,13 +177,13 @@ echo "E2E Training Duration sec : $e2e_time" 
#训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'perf' #获取性能数据,不需要修改 #吞吐量 ActualFPS=${FPS} #单迭代训练时长 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 @@ -184,11 +195,11 @@ ActualLoss=None #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From 7505c42d47f7df62b0b64d6b6c14fb1ce9e7ef20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:34:45 +0000 Subject: [PATCH 10/22] update built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../scripts/run_accuracy_8p.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh index 771f5fb92..f2d9c2f99 100644 --- a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh +++ b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh @@ -9,8 +9,8 @@ mkdir -p ${upDir}/test/output/0 # user env export JOB_ID=NPU20210126 -export RANK_SIZE=8 -export RANK_TABLE_FILE=${currentDir}/8p.json +export RANK_SIZES=8 +#export RANK_TABLE_FILE=${currentDir}/8p.json data_dir=$1 fold=$2 @@ -21,7 +21,7 @@ if [ x"${fold}" = x"all" ] ; then for device_index in ${device_group} do - RANK_ID=${device_index} ASCEND_DEVICE_ID=${device_index} ${currentDir}/train_accuracy_8p.sh ${data_dir} 0 & + RANK_IDS=${device_index} ASCEND_DEVICE_ID=${device_index} ${currentDir}/train_accuracy_8p.sh ${data_dir} 0 & done wait @@ -32,10 +32,9 @@ else echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] fold$fold train start" for device_index in ${device_group} do - RANK_ID=${device_index} ASCEND_DEVICE_ID=${device_index} ${currentDir}/train_accuracy_8p.sh ${data_dir} ${fold} & + RANK_IDS=${device_index} ASCEND_DEVICE_ID=${device_index} ${currentDir}/train_accuracy_8p.sh ${data_dir} ${fold} & done wait echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] fold$fold train end" fi - -- Gitee From dd9a745ab783f659fac9a9622d1ca6fe1446aa2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:36:06 +0000 Subject: [PATCH 11/22] update TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_full_8p.sh | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh index e58e09555..7250053b3 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh @@ -4,13 +4,11 @@ cur_path=`pwd` #集合通信参数,不需要修改 - - export HCCL_CONNECT_TIMEOUT=1200 #集合通信参数,不需要修改 -export RANK_SIZE=8 -export RANK_TABLE_FILE=$cur_path/8p.json +export RANK_SIZES=8 +#export RANK_TABLE_FILE=$cur_path/8p.json export JOB_ID=10087 RANK_ID_START=0 ASCEND_DEVICE_ID_START=0 @@ -57,14 +55,19 @@ elif [[ $para == --over_dump* ]];then mkdir -p ${over_dump_path} elif [[ $para == --data_path* ]];then data_path=`echo ${para#*=}` + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done -#校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi +#8p训练必须参数(本机IP) +one_node_ip=$one_node_ip +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 +export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then @@ -102,6 +105,8 @@ sed -i "s%/npu/traindata/ID2940_CarPeting_TF_WideDeep_TF%${data_path}%p" configs sed -i "s%./model%$cur_path/output/$ASCEND_DEVICE_ID/ckpt%p" configs/config.py sed -i "s%display_step = 100%display_step = $display_step%p" configs/config.py sed -i "s%n_epoches = 2%n_epoches = $n_epoches%p" configs/config.py +sed -i 's/RANK_SIZE/RANK_SIZES/g' 
widedeep/WideDeep_fp16_huifeng.py +sed -i 's/RANK_SIZE/RANK_SIZES/g' train.py #echo `cat configs/config.py |uniq > configs/config.py; cp -f configs/config.py configs/config.py.run` cp configs/config.py configs/config.py.run @@ -109,13 +114,13 @@ cp configs/config.py configs/config.py.run cd $cur_path/../ start=$(date +%s) -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID - ASCEND_DEVICE_ID=$RANK_ID + echo "Device ID: $RANK_IDS" + export RANK_IDS=$RANK_IDS + export ASCEND_DEVICE_ID=$RANK_IDS + ASCEND_DEVICE_ID=$RANK_IDS if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then rm -rf $cur_path/output/${ASCEND_DEVICE_ID} mkdir -p $cur_path/output/${ASCEND_DEVICE_ID} @@ -131,7 +136,8 @@ done wait end=$(date +%s) e2e_time=$(( $end - $start )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' train.py +sed -i 's/RANK_SIZES/RANK_SIZE/g' widedeep/WideDeep_fp16_huifeng.py #配置文件恢复 mv -f configs/config.py.bak configs/config.py @@ -155,9 +161,9 @@ echo "E2E Training Duration sec : $e2e_time" BatchSize=${batch_size} DeviceType=`uname -m` if [[ $precision_mode == "must_keep_origin_dtype" ]];then - CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'fp32'_'acc' + CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'fp32'_'acc' else - CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'acc' fi ##获取性能数据 @@ -176,7 +182,7 @@ ActualLoss=`cat $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt | #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> 
$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee From 1fa93dd07bfd3fb72ef395949fd61da34c3eccba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:36:29 +0000 Subject: [PATCH 12/22] update built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16np.sh. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_performance_16np.sh | 55 +++++++++++++------ 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16np.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16np.sh index fd06a9c65..93aac4fcb 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16np.sh +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16np.sh @@ -57,21 +57,39 @@ elif [[ $para == --over_dump* ]];then conf_path=`echo ${para#*=}` elif [[ $para == --devices_num* ]];then devices_num=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` + elif [[ $para == --servers_num* ]];then + servers_num=`echo ${para#*=}` fi done -one_node_ip=`find $conf_path -name "server_111._0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` -linux_num=`find $conf_path -name "server_*.info" |wc -l` +linux_num=$servers_num +if [[ $conf_path == "" ]];then + fix_node_ip=$fix_node_ip + one_node_ip=$one_node_ip +else + one_node_ip=`find $conf_path -name "server_*_0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` 
+fi + +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=`awk 'BEGIN{printf "%.0f\n",'8'*'${linux_num}'}'` #卡数,单机为8,多机为8n,所有服务器一致 +export CM_WORKER_IP=${fix_node_ip} #当前服务器ip,不同环境ip不同 #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" exit 1 fi -export RANK_SIZE=16 +export RANK_SIZES=16 rank_size=8 -export RANK_SIZE=`awk 'BEGIN{printf "%.0f\n",'${devices_num}'*'${linux_num}'}'` -nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path -wait -export RANK_TABLE_FILE=$cur_path/rank_table.json +#export RANK_SIZE=`awk 'BEGIN{printf "%.0f\n",'${devices_num}'*'${linux_num}'}'` +#nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path +#wait +#export RANK_TABLE_FILE=$cur_path/rank_table.json export JOB_ID=10087 export DEVICE_INDEX=0 ##############执行训练########## @@ -90,17 +108,19 @@ sed -i "s%./model%$cur_path/output/$ASCEND_DEVICE_ID/ckpt%p" configs/config.py sed -i "s%59761827%${train_size}%p" configs/config.py sed -i "s%display_step = 100%display_step = $display_step%p" configs/config.py sed -i "s%n_epoches = 2%n_epoches = $n_epoches%p" configs/config.py +sed -i 's/RANK_SIZE/RANK_SIZES/g' widedeep/WideDeep_fp16_huifeng.py +sed -i 's/RANK_SIZE/RANK_SIZES/g' train.py #echo `cat configs/config.py |uniq > configs/config.py; cp -f configs/config.py configs/config.py.run` cp configs/config.py configs/config.py.run start=$(date +%s) -for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++)); +for((RANK_IDS=$((rank_size*server_index));RANK_IDS<$((((server_index+1))*rank_size));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` - ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + echo "Device ID: 
$RANK_IDS" + export RANK_IDS=$RANK_IDS + export ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` + ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then rm -rf $cur_path/output/${ASCEND_DEVICE_ID} mkdir -p $cur_path/output/${ASCEND_DEVICE_ID} @@ -116,7 +136,8 @@ wait end=$(date +%s) e2e_time=$(( $end - $start )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' train.py +sed -i 's/RANK_SIZES/RANK_SIZE/g' widedeep/WideDeep_fp16_huifeng.py #配置文件恢复 mv -f configs/config.py.bak configs/config.py @@ -139,7 +160,7 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'perf' echo "CaseName : $CaseName" ##获取性能数据 @@ -158,7 +179,7 @@ ActualLoss=`cat $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt | #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log @@ -166,6 +187,4 @@ echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log - - +echo "E2ETrainingTime = ${e2e_time}" 
>> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From e0afe1e360a38b049b6a708e122b18858bf55835 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:36:49 +0000 Subject: [PATCH 13/22] update built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16p.sh. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_performance_16p.sh | 56 +++++++++++++------ 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16p.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16p.sh index 8ac1fe25d..7da244ea4 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16p.sh +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16p.sh @@ -55,24 +55,42 @@ elif [[ $para == --over_dump* ]];then server_index=`echo ${para#*=}` elif [[ $para == --conf_path* ]];then conf_path=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +if [[ $conf_path == "" ]];then + fix_node_ip=$fix_node_ip + one_node_ip=$one_node_ip +else + one_node_ip=`find $conf_path -name "server_*_0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +fi + +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=16 #卡数,单机为8,多机为8n,所有服务器一致 +export CM_WORKER_IP=${fix_node_ip} #当前服务器ip,不同环境ip不同 + #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" exit 1 fi -export RANK_SIZE=16 +export 
RANK_SIZES=16 rank_size=8 -if [[ $conf_path != "" ]];then - nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path -fi - - -wait -export RANK_TABLE_FILE=$cur_path/rank_table.json +#if [[ $conf_path != "" ]];then +# nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path +#fi +# +# +#wait +#export RANK_TABLE_FILE=$cur_path/rank_table.json export JOB_ID=10087 export DEVICE_INDEX=0 ##############执行训练########## @@ -91,17 +109,19 @@ sed -i "s%./model%$cur_path/output/$ASCEND_DEVICE_ID/ckpt%p" configs/config.py sed -i "s%59761827%${train_size}%p" configs/config.py sed -i "s%display_step = 100%display_step = $display_step%p" configs/config.py sed -i "s%n_epoches = 2%n_epoches = $n_epoches%p" configs/config.py +sed -i 's/RANK_SIZE/RANK_SIZES/g' widedeep/WideDeep_fp16_huifeng.py +sed -i 's/RANK_SIZE/RANK_SIZES/g' train.py #echo `cat configs/config.py |uniq > configs/config.py; cp -f configs/config.py configs/config.py.run` cp configs/config.py configs/config.py.run start=$(date +%s) -for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++)); +for((RANK_IDS=$((rank_size*server_index));RANK_IDS<$((((server_index+1))*rank_size));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` - ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + echo "Device ID: $RANK_IDS" + export RANK_IDS=$RANK_IDS + export ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` + ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then rm -rf $cur_path/output/${ASCEND_DEVICE_ID} mkdir -p $cur_path/output/${ASCEND_DEVICE_ID} @@ -117,7 +137,8 @@ wait end=$(date +%s) e2e_time=$(( $end - $start )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' train.py +sed -i 's/RANK_SIZES/RANK_SIZE/g' widedeep/WideDeep_fp16_huifeng.py 
#配置文件恢复 mv -f configs/config.py.bak configs/config.py @@ -140,7 +161,7 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'perf' echo "CaseName : $CaseName" ##获取性能数据 @@ -159,7 +180,7 @@ ActualLoss=`cat $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt | #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log @@ -167,5 +188,4 @@ echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log - +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From 334b3dce76571333f2b0c88e7f3e41867b6d8e22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:37:22 +0000 Subject: [PATCH 14/22] update built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_64p.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_performance_64p.sh | 47 ++++++++++++++----- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_64p.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_64p.sh index 5a9389f09..7d49074ba 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_64p.sh +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_64p.sh @@ -55,19 +55,37 @@ elif [[ $para == --over_dump* ]];then server_index=`echo ${para#*=}` elif [[ $para == --conf_path* ]];then conf_path=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +if [[ $conf_path == "" ]];then + fix_node_ip=$fix_node_ip + one_node_ip=$one_node_ip +else + one_node_ip=`find $conf_path -name "server_*_0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +fi + +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=64 #卡数,单机为8,多机为8n,所有服务器一致 +export CM_WORKER_IP=${fix_node_ip} #当前服务器ip,不同环境ip不同 + #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" exit 1 fi -export RANK_SIZE=64 +export RANK_SIZES=64 rank_size=8 -nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path -wait -export RANK_TABLE_FILE=$cur_path/rank_table.json +#nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path +#wait +#export RANK_TABLE_FILE=$cur_path/rank_table.json export JOB_ID=10087 export DEVICE_INDEX=0 
##############执行训练########## @@ -86,17 +104,19 @@ sed -i "s%./model%$cur_path/output/$ASCEND_DEVICE_ID/ckpt%p" configs/config.py sed -i "s%59761827%${train_size}%p" configs/config.py sed -i "s%display_step = 100%display_step = $display_step%p" configs/config.py sed -i "s%n_epoches = 2%n_epoches = $n_epoches%p" configs/config.py +sed -i 's/RANK_SIZE/RANK_SIZES/g' widedeep/WideDeep_fp16_huifeng.py +sed -i 's/RANK_SIZE/RANK_SIZES/g' train.py #echo `cat configs/config.py |uniq > configs/config.py; cp -f configs/config.py configs/config.py.run` cp configs/config.py configs/config.py.run start=$(date +%s) -for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++)); +for((RANK_IDS=$((rank_size*server_index));RANK_IDS<$((((server_index+1))*rank_size));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` - ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + echo "Device ID: $RANK_IDS" + export RANK_IDS=$RANK_IDS + export ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` + ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then rm -rf $cur_path/output/${ASCEND_DEVICE_ID} mkdir -p $cur_path/output/${ASCEND_DEVICE_ID} @@ -112,7 +132,8 @@ wait end=$(date +%s) e2e_time=$(( $end - $start )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' train.py +sed -i 's/RANK_SIZES/RANK_SIZE/g' widedeep/WideDeep_fp16_huifeng.py #配置文件恢复 mv -f configs/config.py.bak configs/config.py @@ -122,7 +143,7 @@ echo "------------------ Final result ------------------" #FPS=`grep 'fps :' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F' ' '{print $25}' | tail -n 1` #FPS=`grep 'FPS:' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F ':' '{print $2}' | tail -n 1` time=`grep -rn 'epoch 4 total time ='
$cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F '=' '{print $2}'|sed s/[[:space:]]//g` -FPS=`awk 'BEGIN{printf "%.2f\n",'${RANK_SIZE}'*'12'*'${batch_size}'/'${time}'}'` +FPS=`awk 'BEGIN{printf "%.2f\n",'${RANK_SIZES}'*'12'*'${batch_size}'/'${time}'}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" @@ -137,7 +158,7 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'perf' echo "CaseName : $CaseName" ##获取性能数据 @@ -156,7 +177,7 @@ ActualLoss=`cat $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt | #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee From 7158bddd87649afe5aeb6f6d913c3a25ff491fb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:37:44 +0000 Subject: [PATCH 15/22] update built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_8p.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_performance_8p.sh | 42 ++++++++++++------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_8p.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_8p.sh index 483101a81..74d225929 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_8p.sh +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_8p.sh @@ -8,8 +8,8 @@ cur_path=`pwd` export HCCL_CONNECT_TIMEOUT=1200 #集合通信参数,不需要修改 -export RANK_SIZE=8 -export RANK_TABLE_FILE=$cur_path/8p.json +export RANK_SIZES=8 +#export RANK_TABLE_FILE=$cur_path/8p.json export JOB_ID=10087 RANK_ID_START=0 ASCEND_DEVICE_ID_START=0 @@ -49,15 +49,26 @@ for para in $* do if [[ $para == --precision_mode* ]];then precision_mode=`echo ${para#*=}` -elif [[ $para == --over_dump* ]];then + elif [[ $para == --over_dump* ]];then over_dump=`echo ${para#*=}` over_dump_path=${cur_path}/output/overflow_dump mkdir -p ${over_dump_path} elif [[ $para == --data_path* ]];then data_path=`echo ${para#*=}` + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +#8p训练必须参数(本机IP) +one_node_ip=$one_node_ip +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 +export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 + #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" @@ -80,17 +91,19 @@ sed -i "s%./model%$cur_path/output/$ASCEND_DEVICE_ID/ckpt%p" configs/config.py sed -i "s%59761827%${train_size}%p" configs/config.py sed -i "s%display_step = 100%display_step = 
$display_step%p" configs/config.py sed -i "s%n_epoches = 2%n_epoches = $n_epoches%p" configs/config.py +sed -i 's/RANK_SIZE/RANK_SIZES/g' widedeep/WideDeep_fp16_huifeng.py +sed -i 's/RANK_SIZE/RANK_SIZES/g' train.py #echo `cat configs/config.py |uniq > configs/config.py; cp -f configs/config.py configs/config.py.run` cp configs/config.py configs/config.py.run start=$(date +%s) -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID - ASCEND_DEVICE_ID=$RANK_ID + echo "Device ID: $RANK_IDS" + export RANK_IDS=$RANK_IDS + export ASCEND_DEVICE_ID=$RANK_IDS + ASCEND_DEVICE_ID=$RANK_IDS if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then rm -rf $cur_path/output/${ASCEND_DEVICE_ID} mkdir -p $cur_path/output/${ASCEND_DEVICE_ID} @@ -110,7 +123,8 @@ wait end=$(date +%s) e2e_time=$(( $end - $start )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' train.py +sed -i 's/RANK_SIZES/RANK_SIZE/g' widedeep/WideDeep_fp16_huifeng.py #配置文件恢复 mv -f configs/config.py.bak configs/config.py @@ -120,7 +134,7 @@ echo "------------------ Final result ------------------" #FPS=`grep 'fps :' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F' ' '{print $25}' | tail -n 1` time=`grep -rn 'epoch 4 total time =' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F '=' '{print $2}'|sed s/[[:space:]]//g` -FPS=`awk 'BEGIN{printf "%.2f\n",'${RANK_SIZE}'*'50'*'${batch_size}'/'${time}'}'` +FPS=`awk 'BEGIN{printf "%.2f\n",'${RANK_SIZES}'*'50'*'${batch_size}'/'${time}'}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" @@ -136,9 +150,9 @@ echo "E2E Training Duration sec : $e2e_time" BatchSize=${batch_size} DeviceType=`uname -m` if [[ $precision_mode == "must_keep_origin_dtype" ]];then - CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'fp32'_'perf' + 
CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'fp32'_'perf' else - CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'perf' fi echo "CaseName : $CaseName" @@ -158,7 +172,7 @@ ActualLoss=`cat $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt | #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log @@ -166,4 +180,4 @@ echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From ca28a2531fb6054bbcfdee807bb419a79d7c6f21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:39:01 +0000 Subject: [PATCH 16/22] updatebuilt-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_full_8p.sh | 42 ++++++++++++------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh index b385ad12a..4ab952132 100644 --- a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh @@ -5,8 +5,8 @@ cur_path=`pwd` #集合通信参数,不需要修改 #保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 -export RANK_SIZE=8 -export RANK_TABLE_FILE=${cur_path}/../configs/8p.json +export RANK_SIZES=8 +#export RANK_TABLE_FILE=${cur_path}/../configs/8p.json export JOB_ID=10087 RANK_ID_START=0 @@ -21,7 +21,7 @@ Network="ResNet50_ID0058_for_TensorFlow" export HCCL_CONNECT_TIMEOUT=600 corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` export RANK_INDEX=0 -export RANK_ID=0 +export RANK_IDS=0 config_file=res50_256bs_8p_eval iterations_per_loop=100 @@ -86,9 +86,20 @@ do elif [[ $para == --bind_core* ]]; then bind_core=`echo ${para#*=}` name_bind="_bindcore" + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +#8p训练必须参数(本机IP) +one_node_ip=$one_node_ip +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 +export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 + #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" @@ -104,16 +115,16 @@ fi #训练开始时间,不需要修改 start_time=$(date +%s) - +sed -i 's/RANK_SIZE/RANK_SIZES/g' ../src/data_loader/resnet50/data_loader.py #进入训练脚本目录,需要模型审视修改 cd 
$cur_path/.. -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - # export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID - ASCEND_DEVICE_ID=$RANK_ID + echo "Device ID: $RANK_IDS" + # export RANK_IDS=$RANK_IDS + export ASCEND_DEVICE_ID=$RANK_IDS + ASCEND_DEVICE_ID=$RANK_IDS export DEVICE_ID=$ASCEND_DEVICE_ID DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 )) export DEVICE_INDEX=$DEVICE_INDEX @@ -128,8 +139,8 @@ do #执行训练脚本,需要模型审视修改 corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l` - let a=RANK_ID*${corenum}/8 - let b=RANK_ID+1 + let a=RANK_IDS*${corenum}/8 + let b=RANK_IDS+1 let c=b*${corenum}/8-1 if [ "x${bind_core}" != x ];then bind_core="taskset -c $a-$c" @@ -158,6 +169,7 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) +sed -i 's/RANK_SIZES/RANK_SIZE/g' src/data_loader/resnet50/data_loader.py echo "------------------ Final result ------------------" #单step时长,需要从train_$ASCEND_DEVICE_ID.log里,通过关键字获取。需要模型审视修改 @@ -174,13 +186,13 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p'_'acc' ##获取性能数据 #吞吐量,不需要修改 ActualFPS=${FPS} #单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'` ##获取性能数据,不需要修改 @@ -197,7 +209,7 @@ ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseNam #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> 
$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log @@ -205,4 +217,4 @@ echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From 49bea43834420302cdbe475a9f2505ccfdec5278 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:39:35 +0000 Subject: [PATCH 17/22] update image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_8p.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_performance_bs256_8p.sh | 49 ++++++++++++------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_8p.sh b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_8p.sh index 024f66aaa..6ca0c20b9 100644 --- a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_8p.sh @@ -4,11 +4,11 @@ cur_path=`pwd` #集合通信参数,不需要修改 -export RANK_SIZE=8 +export RANK_SIZES=8 export JOB_ID=99990001 -export RANK_ID=8p -export SLOG_PRINT_TO_STDOUT=0 -export RANK_TABLE_FILE=${cur_path}/../configs/8p.json +#export RANK_ID=8p +#export SLOG_PRINT_TO_STDOUT=0 +#export RANK_TABLE_FILE=${cur_path}/../configs/8p.json export HCCL_CONNECT_TIMEOUT=600 RANK_ID_START=0 @@ -16,7 +16,7 @@ RANK_ID_START=0 data_path="" #设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL_ETP=3 +export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP=3 #基础参数,需要模型审视修改 #网络名称,同目录名称 @@ -89,9 +89,20 @@ do elif [[ $para == --bind_core* ]]; then bind_core=`echo ${para#*=}` name_bind="_bindcore" + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +#8p训练必须参数(本机IP) +one_node_ip=$one_node_ip +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 +export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 + #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" @@ -112,17 +123,17 @@ fi #训练开始时间,不需要修改 start_time=$(date +%s) - +sed -i 's/RANK_SIZE/RANK_SIZES/g' 
../src/data_loader/resnet50/data_loader.py #进入训练脚本目录,需要模型审视修改 cd $cur_path/../ -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export DEVICE_INDEX=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID - ASCEND_DEVICE_ID=$RANK_ID + echo "Device ID: $RANK_IDS" + export RANK_IDS=$RANK_IDS + export DEVICE_INDEX=$RANK_IDS + export ASCEND_DEVICE_ID=$RANK_IDS + ASCEND_DEVICE_ID=$RANK_IDS #创建DeviceID输出目录,不需要修改 if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then @@ -134,9 +145,9 @@ do # 绑核,不需要的绑核的模型删除,需要模型审视修改 corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=RANK_ID*${corenum}/${RANK_SIZE} - let b=RANK_ID+1 - let c=b*${corenum}/${RANK_SIZE}-1 + let a=RANK_IDS*${corenum}/${RANK_SIZES} + let b=RANK_IDS+1 + let c=b*${corenum}/${RANK_SIZES}-1 #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path @@ -155,7 +166,7 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' src/data_loader/resnet50/data_loader.py #参数改回 sed -i "50s|${data_path}|PATH_TO_BE_CONFIGURED|g" $cur_path/../src/configs/res50_256bs_8p.py sed -i "107s|${cur_path}/output/0/d\_solution/ckpt0|PATH_TO_BE_CONFIGURED|g" $cur_path/../src/configs/res50_256bs_8p.py @@ -177,13 +188,13 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p'_'perf' ##获取性能数据 #吞吐量,不需要修改 ActualFPS=${FPS} #单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf 
"%.2f\n",'${batch_size}'*'${RANK_SIZES}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 grep "FPS:" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss: " '{print $2}' | awk -F "total" '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt @@ -193,7 +204,7 @@ ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseNam #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee From b0f879b9d870d86e577ead952a1c6ac33f0e0e91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:40:05 +0000 Subject: [PATCH 18/22] update ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_16p.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_performance_bs256_hw192_16p.sh | 64 ++++++++++++------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_16p.sh b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_16p.sh index 1058fff20..a03a8fd2f 100644 --- a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_16p.sh +++ b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_16p.sh @@ -4,10 +4,10 @@ cur_path=`pwd` #集合通信参数,不需要修改 -export RANK_SIZE=16 +export RANK_SIZES=16 export JOB_ID=99990001 -export RANK_ID=8 -export SLOG_PRINT_TO_STDOUT=0 +#export RANK_IDS=8 +#export SLOG_PRINT_TO_STDOUT=0 export HCCL_CONNECT_TIMEOUT=600 RANK_ID_START=0 @@ -15,7 +15,7 @@ RANK_ID_START=0 data_path="" #设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL=3 +#export ASCEND_GLOBAL_LOG_LEVEL_ETP=3 #基础参数,需要模型审视修改 #网络名称,同目录名称 @@ -91,9 +91,27 @@ do elif [[ $para == --bind_core* ]]; then bind_core=`echo ${para#*=}` name_bind="_bindcore" + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +if [[ $conf_path == "" ]];then + fix_node_ip=$fix_node_ip + one_node_ip=$one_node_ip +else + one_node_ip=`find $conf_path -name "server_*_0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +fi + +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=16 #卡数,单机为8,多机为8n,所有服务器一致 +export CM_WORKER_IP=${fix_node_ip} #当前服务器ip,不同环境ip不同 + #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then 
echo "[Error] para \"data_path\" must be confing" @@ -103,13 +121,13 @@ fi # 自动生成ranktable的脚本 rank_size=8 -if [[ $conf_path != "" ]];then - nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path -fi - - -wait -export RANK_TABLE_FILE=${cur_path}/rank_table.json +#if [[ $conf_path != "" ]];then +# nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZES/rank_size)) --conf_path=$conf_path +#fi +# +# +#wait +#export RANK_TABLE_FILE=${cur_path}/rank_table.json #修改参数 @@ -126,17 +144,17 @@ fi #训练开始时间,不需要修改 start_time=$(date +%s) - +sed -i 's/RANK_SIZE/RANK_SIZES/g' ../src/data_loader/resnet50/data_loader.py #进入训练脚本目录,需要模型审视修改 cd $cur_path/../ -for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++)); +for((RANK_IDS=$((rank_size*server_index));RANK_IDS<$((((server_index+1))*rank_size));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export DEVICE_INDEX=`expr ${RANK_ID} - $((rank_size*server_index))` - export ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` - ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + echo "Device ID: $RANK_IDS" + export RANK_IDS=$RANK_IDS + export DEVICE_INDEX=`expr ${RANK_IDS} - $((rank_size*server_index))` + export ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` + ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` #创建DeviceID输出目录,不需要修改 if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then @@ -169,7 +187,7 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' src/data_loader/resnet50/data_loader.py #参数改回 sed -i "50s|${data_path}|PATH_TO_BE_CONFIGURED|g" $cur_path/../src/configs/res50_256bs_HW192_8p.py sed -i "107s|${cur_path}/output/0/d\_solution/ckpt0|PATH_TO_BE_CONFIGURED|g" $cur_path/../src/configs/res50_256bs_HW192_8p.py @@ -188,13 +206,13 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 
BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p_hw192'_'perf' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p_hw192'_'perf' ##获取性能数据 #吞吐量,不需要修改 ActualFPS=${FPS} #单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZES}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 grep "FPS:" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss: " '{print $2}' | awk -F "total" '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt @@ -204,11 +222,11 @@ ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseNam #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From 89e05cb001dcc5c6c9714b773f652bcf44e0cac5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 
2023 11:40:40 +0000 Subject: [PATCH 19/22] update ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_32p.sh. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_performance_bs256_hw192_32p.sh | 63 ++++++++++++------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_32p.sh b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_32p.sh index c23b9782f..c4f5e022f 100644 --- a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_32p.sh +++ b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_32p.sh @@ -4,10 +4,10 @@ cur_path=`pwd` #集合通信参数,不需要修改 -export RANK_SIZE=32 +export RANK_SIZES=32 export JOB_ID=99990001 -export RANK_ID=8 -export SLOG_PRINT_TO_STDOUT=0 +#export RANK_ID=8 +#export SLOG_PRINT_TO_STDOUT=0 export HCCL_CONNECT_TIMEOUT=600 RANK_ID_START=0 @@ -15,7 +15,7 @@ RANK_ID_START=0 data_path="" #设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL_ETP=3 +#export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP=3 #基础参数,需要模型审视修改 #网络名称,同目录名称 @@ -91,9 +91,27 @@ do elif [[ $para == --bind_core* ]]; then bind_core=`echo ${para#*=}` name_bind="_bindcore" + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +if [[ $conf_path == "" ]];then + fix_node_ip=$fix_node_ip + one_node_ip=$one_node_ip +else + one_node_ip=`find $conf_path -name "server_*_0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +fi + +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=32 
#卡数,单机为8,多机为8n,所有服务器一致 +export CM_WORKER_IP=${fix_node_ip} #当前服务器ip,不同环境ip不同 + #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" @@ -103,12 +121,12 @@ fi # 自动生成ranktable的脚本 rank_size=8 -if [[ $conf_path != "" ]];then - nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path -fi - -wait -export RANK_TABLE_FILE=${cur_path}/rank_table.json +#if [[ $conf_path != "" ]];then +# nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZES/rank_size)) --conf_path=$conf_path +#fi +# +#wait +#export RANK_TABLE_FILE=${cur_path}/rank_table.json #修改参数 @@ -125,17 +143,17 @@ fi #训练开始时间,不需要修改 start_time=$(date +%s) - +sed -i 's/RANK_SIZE/RANK_SIZES/g' ../src/data_loader/resnet50/data_loader.py #进入训练脚本目录,需要模型审视修改 cd $cur_path/../ -for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++)); +for((RANK_IDS=$((rank_size*server_index));RANK_IDS<$((((server_index+1))*rank_size));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export DEVICE_INDEX=`expr ${RANK_ID} - $((rank_size*server_index))` - export ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` - ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + echo "Device ID: $RANK_IDS" + export RANK_IDS=$RANK_IDS + export DEVICE_INDEX=`expr ${RANK_IDS} - $((rank_size*server_index))` + export ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` + ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` #创建DeviceID输出目录,不需要修改 if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then @@ -168,7 +186,7 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' src/data_loader/resnet50/data_loader.py #参数改回 sed -i "50s|${data_path}|PATH_TO_BE_CONFIGURED|g" $cur_path/../src/configs/res50_256bs_HW192_8p.py sed -i "107s|${cur_path}/output/0/d\_solution/ckpt0|PATH_TO_BE_CONFIGURED|g" 
$cur_path/../src/configs/res50_256bs_HW192_8p.py @@ -187,13 +205,13 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p_hw192'_'perf' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p_hw192'_'perf' ##获取性能数据 #吞吐量,不需要修改 ActualFPS=${FPS} #单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZES}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 grep "FPS:" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss: " '{print $2}' | awk -F "total" '{print $1}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt @@ -203,12 +221,11 @@ ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseNam #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log - +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From 
be6dd40b372aa882d37adbae68b10b3ba6987b43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:41:37 +0000 Subject: [PATCH 20/22] update ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_64p.sh. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_performance_bs256_hw192_64p.sh | 54 ++++++++++++------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_64p.sh b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_64p.sh index ff6ecb450..2feec0fff 100644 --- a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_64p.sh +++ b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_64p.sh @@ -4,10 +4,10 @@ cur_path=`pwd` #集合通信参数,不需要修改 -export RANK_SIZE=16 +export RANK_SIZES=16 export JOB_ID=99990001 -export RANK_ID=8 -export SLOG_PRINT_TO_STDOUT=0 +#export RANK_ID=8 +#export SLOG_PRINT_TO_STDOUT=0 export HCCL_CONNECT_TIMEOUT=600 RANK_ID_START=0 @@ -15,7 +15,7 @@ RANK_ID_START=0 data_path="" #设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP=3 +#export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP_ETP=3 #基础参数,需要模型审视修改 #网络名称,同目录名称 @@ -93,9 +93,28 @@ do name_bind="_bindcore" elif [[ $para == --servers_num* ]];then servers_num=`echo ${para#*=}` + elif [[ $para == --devices_num* ]];then + devices_num=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done linux_num=$servers_num +if [[ $conf_path == "" ]];then + fix_node_ip=$fix_node_ip + one_node_ip=$one_node_ip +else + one_node_ip=`find $conf_path -name 
"server_*_0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +fi + +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=64 #卡数,单机为8,多机为8n,所有服务器一致 +export CM_WORKER_IP=${fix_node_ip} #当前服务器ip,不同环境ip不同 #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then @@ -103,16 +122,16 @@ if [[ $data_path == "" ]];then exit 1 fi -export RANK_SIZE=`awk 'BEGIN{printf "%.0f\n",'${devices_num}'*'${linux_num}'}'` +export RANK_SIZES=`awk 'BEGIN{printf "%.0f\n",'${devices_num}'*'${linux_num}'}'` # 自动生成ranktable的脚本 rank_size=8 -if [[ $conf_path != "" ]];then - nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path -fi - -wait -export RANK_TABLE_FILE=${cur_path}/rank_table.json +#if [[ $conf_path != "" ]];then +# nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path +#fi +# +#wait +#export RANK_TABLE_FILE=${cur_path}/rank_table.json #修改参数 @@ -129,7 +148,7 @@ fi #训练开始时间,不需要修改 start_time=$(date +%s) - +sed -i 's/RANK_SIZE/RANK_SIZES/g' ../src/data_loader/resnet50/data_loader.py #进入训练脚本目录,需要模型审视修改 cd $cur_path/../ for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++)); @@ -172,7 +191,7 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' src/data_loader/resnet50/data_loader.py #参数改回 sed -i "50s|${data_path}|PATH_TO_BE_CONFIGURED|g" $cur_path/../src/configs/res50_256bs_HW192_8p.py sed -i "107s|${cur_path}/output/0/d\_solution/ckpt0|PATH_TO_BE_CONFIGURED|g" $cur_path/../src/configs/res50_256bs_HW192_8p.py @@ -191,13 +210,13 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p_hw192'_'perf' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p_hw192'_'perf' ##获取性能数据 #吞吐量,不需要修改 
ActualFPS=${FPS} #单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZES}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 grep "FPS:" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss: " '{print $2}' | awk -F "total" '{print $1}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt @@ -207,12 +226,11 @@ ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseNam #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log - +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From d9e5db29aff0f70a94eb952d39e957202398b2f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:41:58 +0000 Subject: [PATCH 21/22] update ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_8p.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_performance_bs256_hw192_8p.sh | 51 +++++++++++-------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_8p.sh b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_8p.sh index 7046b5c50..172348baa 100644 --- a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs256_hw192_8p.sh @@ -4,11 +4,11 @@ cur_path=`pwd` #集合通信参数,不需要修改 -export RANK_SIZE=8 +export RANK_SIZES=8 export JOB_ID=99990001 -export RANK_ID=8p -export SLOG_PRINT_TO_STDOUT=0 -export RANK_TABLE_FILE=${cur_path}/../configs/8p.json +#export RANK_ID=8p +#export SLOG_PRINT_TO_STDOUT=0 +#export RANK_TABLE_FILE=${cur_path}/../configs/8p.json export HCCL_CONNECT_TIMEOUT=600 RANK_ID_START=0 @@ -16,7 +16,7 @@ RANK_ID_START=0 data_path="" #设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL_ETP=3 +#export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP=3 #基础参数,需要模型审视修改 #网络名称,同目录名称 @@ -89,9 +89,20 @@ do elif [[ $para == --bind_core* ]]; then bind_core=`echo ${para#*=}` name_bind="_bindcore" + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +#8p训练必须参数(本机IP) +one_node_ip=$one_node_ip +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 +export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 + #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" @@ -112,17 +123,17 @@ fi #训练开始时间,不需要修改 start_time=$(date +%s) - +sed -i 
's/RANK_SIZE/RANK_SIZES/g' ../src/data_loader/resnet50/data_loader.py #进入训练脚本目录,需要模型审视修改 cd $cur_path/../ -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export DEVICE_INDEX=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID - ASCEND_DEVICE_ID=$RANK_ID + echo "Device ID: $RANK_IDS" + export RANK_IDS=$RANK_IDS + export DEVICE_INDEX=$RANK_IDS + export ASCEND_DEVICE_ID=$RANK_IDS + ASCEND_DEVICE_ID=$RANK_IDS #创建DeviceID输出目录,不需要修改 if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then @@ -134,9 +145,9 @@ do # 绑核,不需要的绑核的模型删除,需要模型审视修改 corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=RANK_ID*${corenum}/${RANK_SIZE} - let b=RANK_ID+1 - let c=b*${corenum}/${RANK_SIZE}-1 + let a=RANK_IDS*${corenum}/${RANK_SIZES} + let b=RANK_IDS+1 + let c=b*${corenum}/${RANK_SIZES}-1 #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path @@ -155,7 +166,7 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' src/data_loader/resnet50/data_loader.py #参数改回 sed -i "50s|${data_path}|PATH_TO_BE_CONFIGURED|g" $cur_path/../src/configs/res50_256bs_HW192_8p.py sed -i "107s|${cur_path}/output/0/d\_solution/ckpt0|PATH_TO_BE_CONFIGURED|g" $cur_path/../src/configs/res50_256bs_HW192_8p.py @@ -177,23 +188,23 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p_hw192'_'perf' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p_hw192'_'perf' ##获取性能数据 #吞吐量,不需要修改 ActualFPS=${FPS} #单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 
'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZES}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep "FPS:" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss: " '{print $2}' | awk -F "total" '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep "FPS:" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss: " '{print $2}' | awk -F "total" '{print $1}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee From a2fb4286174593884e5bae9c29f66ebd53d0fe83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E5=85=B4=E6=88=90?= <1358493914@qq.com> Date: Tue, 14 Mar 2023 11:42:33 +0000 Subject: [PATCH 22/22] update ResNet50_ID0058_for_TensorFlow/test/train_performance_bs32_8p.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 高兴成 <1358493914@qq.com> --- .../test/train_performance_bs32_8p.sh | 49 ++++++++++++------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs32_8p.sh b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs32_8p.sh index e18a92d99..ed52acfb6 100644 --- a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs32_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs32_8p.sh @@ -4,11 +4,11 @@ cur_path=`pwd` #集合通信参数,不需要修改 -export RANK_SIZE=8 +export RANK_SIZES=8 export JOB_ID=99990001 -export RANK_ID=8p -export SLOG_PRINT_TO_STDOUT=0 -export RANK_TABLE_FILE=${cur_path}/../configs/8p.json +#export RANK_ID=8p +#export SLOG_PRINT_TO_STDOUT=0 +#export RANK_TABLE_FILE=${cur_path}/../configs/8p.json export HCCL_CONNECT_TIMEOUT=600 RANK_ID_START=0 @@ -16,7 +16,7 @@ RANK_ID_START=0 data_path="" #设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL_ETP=3 +#export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP=3 #基础参数,需要模型审视修改 #网络名称,同目录名称 @@ -89,9 +89,20 @@ do elif [[ $para == --bind_core* ]]; then bind_core=`echo ${para#*=}` name_bind="_bindcore" + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +#8p训练必须参数(本机IP) +one_node_ip=$one_node_ip +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 +export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 + #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" @@ -112,17 +123,17 @@ fi #训练开始时间,不需要修改 start_time=$(date +%s) - +sed -i 's/RANK_SIZE/RANK_SIZES/g' 
../src/data_loader/resnet50/data_loader.py #进入训练脚本目录,需要模型审视修改 cd $cur_path/../ -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export DEVICE_INDEX=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID - ASCEND_DEVICE_ID=$RANK_ID + echo "Device ID: $RANK_IDS" + export RANK_IDS=$RANK_IDS + export DEVICE_INDEX=$RANK_IDS + export ASCEND_DEVICE_ID=$RANK_IDS + ASCEND_DEVICE_ID=$RANK_IDS #创建DeviceID输出目录,不需要修改 if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then @@ -134,9 +145,9 @@ do # 绑核,不需要的绑核的模型删除,需要模型审视修改 corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=RANK_ID*${corenum}/${RANK_SIZE} - let b=RANK_ID+1 - let c=b*${corenum}/${RANK_SIZE}-1 + let a=RANK_IDS*${corenum}/${RANK_SIZES} + let b=RANK_IDS+1 + let c=b*${corenum}/${RANK_SIZES}-1 #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path @@ -155,7 +166,7 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' src/data_loader/resnet50/data_loader.py #参数改回 sed -i "50s|${data_path}|PATH_TO_BE_CONFIGURED|g" $cur_path/../src/configs/res50_32bs_8p.py sed -i "107s|${cur_path}/output/0/d\_solution/ckpt0|PATH_TO_BE_CONFIGURED|g" $cur_path/../src/configs/res50_32bs_8p.py @@ -177,13 +188,13 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p'_'perf' ##获取性能数据 #吞吐量,不需要修改 ActualFPS=${FPS} #单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf 
"%.2f\n",'${batch_size}'*'${RANK_SIZES}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 grep "FPS:" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss: " '{print $2}' | awk -F "total" '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt @@ -193,7 +204,7 @@ ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseNam #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee