From 9b35bee6a371e8fd01ce5ce8ddcb7c0282babdd5 Mon Sep 17 00:00:00 2001 From: liuyihang <1905527319@qq.com> Date: Thu, 1 Dec 2022 11:32:02 +0000 Subject: [PATCH] =?UTF-8?q?update=20UNet=5FIndustrial=5FID0007=5Ffor=5FTen?= =?UTF-8?q?sorFlow/test/train=5Ffull=5F1p.sh.=201p=E7=B2=BE=E5=BA=A6?= =?UTF-8?q?=E8=84=9A=E6=9C=AC=E4=B8=8D=E5=AF=B9=EF=BC=8C=E6=A0=B9=E6=8D=AE?= =?UTF-8?q?8p=E4=BF=AE=E6=94=B9=EF=BC=8C=E5=B7=B2=E8=87=AA=E9=AA=8C?= =?UTF-8?q?=EF=BC=8Cdebug01112607?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: liuyihang <1905527319@qq.com> --- .../test/train_full_1p.sh | 82 ++++++++++++------- 1 file changed, 53 insertions(+), 29 deletions(-) diff --git a/TensorFlow/built-in/cv/image_segmentation/UNet_Industrial_ID0007_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/built-in/cv/image_segmentation/UNet_Industrial_ID0007_for_TensorFlow/test/train_full_1p.sh index 57809d68f..d2db57d9a 100644 --- a/TensorFlow/built-in/cv/image_segmentation/UNet_Industrial_ID0007_for_TensorFlow/test/train_full_1p.sh +++ b/TensorFlow/built-in/cv/image_segmentation/UNet_Industrial_ID0007_for_TensorFlow/test/train_full_1p.sh @@ -3,32 +3,30 @@ cur_path=`pwd` #集合通信参数,不需要修改 - +#保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 export RANK_SIZE=1 export JOB_ID=10087 RANK_ID_START=0 - # 数据集路径,保持为空,不需要修改 data_path="" #设置默认日志级别,不需要修改 # export ASCEND_GLOBAL_LOG_LEVEL=3 -#基础参数,需要模型审视修改 +#基础参数 需要模型审视修改 #网络名称,同目录名称 Network="UNet_Industrial_ID0007_for_TensorFlow" -#训练epoch -train_epochs=1 + #训练batch_size batch_size=16 -#训练step -train_steps=2500 -#学习率 -learning_rate= + + +#TF2.X独有,不需要修改 +#export NPU_LOOP_SIZE=${train_steps} #维测参数,precision_mode需要模型审视修改 -precision_mode="allow_mix_precision" +#precision_mode="allow_mix_precision" #维持参数,以下不需要修改 over_dump=False data_dump_flag=False @@ -38,17 +36,17 @@ autotune=False # 帮助信息,不需要修改 if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_full_1p.sh " + echo"usage:./train_full_8p.sh " echo " " echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is False - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --autotune whether to enable autotune, default is False - --data_path source data of training - -h/--help show help message + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is 0 + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --autotune whether to enable autotune, default is False + --data_path source data of training + -h/--help show help message " exit 1 fi @@ -83,12 +81,15 @@ do cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ elif [[ $para == --data_path* ]];then data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]]; then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" elif [[ $para == --conda_name* ]];then conda_name=`echo ${para#*=}` source set_conda.sh source activate $conda_name cd ${cur_path}/../ - pip3 install dllogger/ + pip3 install dllogger/ fi done @@ -98,6 +99,13 @@ if [[ $data_path == "" ]];then exit 1 fi +#autotune时,先开启autotune执行单P训练,不需要修改 +if [[ $autotune == True ]]; then + train_full_1p.sh --autotune=$autotune --data_path=$data_path + wait + autotune=False +fi + cd $cur_path/ cp -r /usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe/config/ascend910/aic-ascend910-ops-info.json ${cur_path}/old/ @@ -107,12 +115,17 @@ python3.7 aic_change.py start_time=$(date +%s) #进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do #设置环境变量,不需要修改 echo "Device ID: $ASCEND_DEVICE_ID" export RANK_ID=$RANK_ID + DEVICE_ID=$ASCEND_DEVICE_ID + export DEVICE_ID=$ASCEND_DEVICE_ID + + #创建DeviceID输出目录,不需要修改 if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} @@ -120,18 +133,27 @@ do else mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt fi + + # 绑核,不需要的绑核的模型删除,需要模型审视修改 + #corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` + #let a=RANK_ID*${corenum}/${RANK_SIZE} + #let b=RANK_ID+1 + #let c=b*${corenum}/${RANK_SIZE}-1 #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 - #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune + #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path + #if [ "x${bind_core}" != x ];then + # bind_core="taskset -c $a-$c" + #fi nohup python3.7 $cur_path/../main.py \ --unet_variant='tinyUNet' \ --activation_fn='relu' \ --exec_mode='train_and_evaluate' \ --iter_unit='batch' \ --num_iter=2500 \ - --batch_size=16 \ + --batch_size=2 \ --warmup_step=10 \ - --results_dir=${cur_path}/../Result \ + --results_dir=${cur_path}/../Result_8p \ --data_dir=${data_path} \ --dataset_name='DAGM2007' \ --dataset_classID="1" \ @@ -151,7 +173,6 @@ do --display_every=250 \ --debug_verbosity=0 \ > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & - done wait @@ -161,30 +182,32 @@ e2e_time=$(( $end_time - $start_time )) cp -r ${cur_path}/old/aic-ascend910-ops-info.json /usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe/config/ascend910/ - #结果打印,不需要修改 echo "------------------ Final result ------------------" + #输出性能FPS,需要模型审视修改 -FPS=`grep 'fps:' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| awk '{print $6}' | grep -Po "\d+\.\d+" | awk '{sum+=$1} END {print sum/NR}'` +temp_FPS=`grep 'fps:' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| awk '{print $6}' | grep -Po "\d+\.\d+" | awk '{sum+=$1} END {print sum/NR}'` +FPS=`awk 'BEGIN{printf "%.2f\n",'${temp_FPS}'*8}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -#train_accuracy=`grep -A 1 top1 $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $3}'` +train_accuracy=`grep TNR $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $6}'|cut -c 3-8` #打印,不需要修改 -#echo "Final Train Accuracy : ${train_accuracy}" +echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" #稳定性精度看护结果汇总 #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' ##获取性能数据 #吞吐量,不需要修改 ActualFPS=${FPS} #单迭代训练时长,不需要修改 +#TrainingTime=`echo "scale=2;${batch_size} * ${RANK_SIZE} * 1000 / ${FPS}"|bc` TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*1000/'${ActualFPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 @@ -200,6 +223,7 @@ echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee