diff --git a/TensorFlow/built-in/cv/image_segmentation/DeeplabV3_ID0047_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_segmentation/DeeplabV3_ID0047_for_TensorFlow/test/train_full_8p.sh index 5e467e9320c66dc8e08826f47fdea911622767df..bb325a045e1810ffdbcf3aa972628293bbead83c 100644 --- a/TensorFlow/built-in/cv/image_segmentation/DeeplabV3_ID0047_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/image_segmentation/DeeplabV3_ID0047_for_TensorFlow/test/train_full_8p.sh @@ -39,13 +39,13 @@ if [[ $1 == --help || $1 == -h ]];then echo " " echo "parameter explain: --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is 0 - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is 0 + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False --autotune whether to enable autotune, default is False - --data_path source data of training - -h/--help show help message + --data_path source data of training + -h/--help show help message " exit 1 fi @@ -112,7 +112,7 @@ do export ASCEND_DEVICE_ID=$RANK_ID export DEVICE_ID=$ASCEND_DEVICE_ID ASCEND_DEVICE_ID=$RANK_ID - + #创建DeviceID输出目录,不需要修改 if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} @@ -125,8 +125,8 @@ do mkdir -p ${cur_path}/output/s8/r1/${ASCEND_DEVICE_ID}/ mkdir -p ${cur_path}/output/s8/r2/${ASCEND_DEVICE_ID}/ fi - - + + # 绑核,不需要的绑核的模型删除,需要模型审视修改 #corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` #let a=RANK_ID*${corenum}/${RANK_SIZE} @@ -180,7 +180,7 @@ do export ASCEND_DEVICE_ID=$RANK_ID export DEVICE_ID=$ASCEND_DEVICE_ID ASCEND_DEVICE_ID=$RANK_ID - + #创建DeviceID输出目录,不需要修改 if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} @@ -188,7 +188,7 @@ do else mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt fi - + # 绑核,不需要的绑核的模型删除,需要模型审视修改 #corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` #let a=RANK_ID*${corenum}/${RANK_SIZE} @@ -213,7 +213,7 @@ do --train_batch_size=${batch_size} \ --training_number_of_steps=10000 \ --fine_tune_batch_norm=true \ - --tf_initial_checkpoint=${cur_path}/output/s8/r1/0/resnet_101/model.ckpt-15000 \ + --tf_initial_checkpoint=${cur_path}/output/s8/r1/${ASCEND_DEVICE_ID}/resnet_101/model.ckpt-15000 \ --log_steps=100 \ --weight_decay=0.00002 \ --last_layer_gradient_multiplier=1 \ @@ -233,32 +233,48 @@ do cd - done wait +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + export DEVICE_ID=$ASCEND_DEVICE_ID + ASCEND_DEVICE_ID=$RANK_ID -cd ${cur_path}/output/0/ -python3.7 $cur_path/../train_npu.py \ - --mode=evaluate \ - --eval_split="val" \ - --model_variant="resnet_v1_101_beta" \ - --iterations_per_loop=1 \ - --atrous_rates=6 \ - --atrous_rates=12 \ - --atrous_rates=18 \ - --output_stride=8 \ - --multi_grid=1 \ - --multi_grid=2 \ - --multi_grid=4 \ - --eval_scales=0.5 \ - --eval_scales=0.75 \ - --eval_scales=1.0 \ - --eval_scales=1.25 \ - --eval_scales=1.5 \ - --eval_scales=1.75 \ - --eval_crop_size="513,513" \ - --aspp_with_separable_conv=False \ - --checkpoint_dir=${cur_path}/output/s8/r2/0/resnet_101 \ - --dataset_dir=${data_path}/tfrecord \ - --max_number_of_evaluations=1 > ${cur_path}/output/0/train_0.log 2>&1 -cd - + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + fi + if [ $ASCEND_DEVICE_ID -eq 7 ];then + python3.7 $cur_path/../train_npu.py \ + --mode=evaluate \ + --eval_split="val" \ + --model_variant="resnet_v1_101_beta" \ + --iterations_per_loop=1 \ + --atrous_rates=6 \ + --atrous_rates=12 \ + --atrous_rates=18 \ + --output_stride=8 \ + --multi_grid=1 \ + --multi_grid=2 \ + --multi_grid=4 \ + --eval_scales=0.5 \ + --eval_scales=0.75 \ + --eval_scales=1.0 \ + --eval_scales=1.25 \ + --eval_scales=1.5 \ + --eval_scales=1.75 \ + --eval_crop_size="513,513" \ + --aspp_with_separable_conv=False \ + --checkpoint_dir=${cur_path}/output/s8/r2/${ASCEND_DEVICE_ID}/resnet_101 \ + --dataset_dir=${data_path}/tfrecord \ + --max_number_of_evaluations=1 > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 + fi +done wait @@ -269,12 +285,12 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -FPS=`grep 'FPS:' ${cur_path}/output/s8/r2/0/train_0.log| awk -F "=" '{print $1}' | awk 'END {print $6}'` +FPS=`grep 'FPS:' ${cur_path}/output/s8/r2/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F "=" '{print $1}' | awk 'END {print $6}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -train_accuracy=`grep 'class_20' ${cur_path}/output/0/train_0.log| awk -F " " '{print $3}'` +train_accuracy=`grep 'class_20' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F " " '{print $3}'` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" @@ -292,19 +308,19 @@ ActualFPS=${FPS} TrainingTime=`echo "scale=2;${batch_size} * ${RANK_SIZE} * 1000 / ${FPS}"|bc` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep 'loss:' ${cur_path}/output/s8/r2/0/train_0.log|awk '{print $7}' | awk -F "loss:" '{print $2}' >> $cur_path/output/0/train_${CaseName}_loss.txt +grep 'loss:' ${cur_path}/output/s8/r2/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $7}' | awk -F "loss:" '{print $2}' >> $cur_path/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/0/train_${CaseName}_loss.txt` +ActualLoss=`awk 'END {print}' $cur_path/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/0/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/0/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/0/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/0/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/0/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/0/${CaseName}.log -echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/0/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/0/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/0/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/0/${CaseName}.log \ No newline at end of file +echo "Network = ${Network}" > $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log \ No newline at end of file