From 0efd8e5ce0acc4878d1a50c08e7928f1ffdf6a6d Mon Sep 17 00:00:00 2001 From: xuzongqi <1450345865@qq.com> Date: Fri, 1 Apr 2022 08:58:58 +0000 Subject: [PATCH] update train_full_8p.sh. --- .../CRNN_for_TensorFlow/test/train_full_8p.sh | 131 +++++++++--------- 1 file changed, 62 insertions(+), 69 deletions(-) diff --git a/TensorFlow/built-in/cv/detection/CRNN_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/detection/CRNN_for_TensorFlow/test/train_full_8p.sh index 4697ac92b..3ad21ab23 100644 --- a/TensorFlow/built-in/cv/detection/CRNN_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/detection/CRNN_for_TensorFlow/test/train_full_8p.sh @@ -4,32 +4,26 @@ cur_path=`pwd` #集合通信参数,不需要修改 -#保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 -export JOB_ID=9999001 export RANK_SIZE=8 -export RANK_ID=npu8p -export SLOG_PRINT_TO_STDOUT=0 +export JOB_ID=99990001 +export RANK_ID=8p +export RANK_TABLE_FILE=${cur_path}/../config/rank_table_8p.json export HCCL_CONNECT_TIMEOUT=600 -export RANK_TABLE_FILE=${cur_path}/../configs/rank_table_8p.json RANK_ID_START=0 # 数据集路径,保持为空,不需要修改 data_path="" -#设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL=3 - -#基础参数 需要模型审视修改 +#基础参数,需要模型审视修改 #网络名称,同目录名称 -Network="InceptionV4_for_TensorFlow" -#迭代 -ITERATIONS=240000 +Network="CRNN_for_TensorFlow" + +#训练batch_size +batch_size=64 -#TF2.X独有,不需要修改 -#export NPU_LOOP_SIZE=${train_steps} #维测参数,precision_mode需要模型审视修改 -precision_mode="allow_mix_precision" +#precision_mode="allow_mix_precision" #维持参数,以下不需要修改 over_dump=False data_dump_flag=False @@ -39,17 +33,17 @@ autotune=False # 帮助信息,不需要修改 if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_full_8p.sh " + echo"usage:./train_full_1p.sh " echo " " echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is 0 - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --autotune whether to enable autotune, default is False - --data_path source data of training - -h/--help show help message + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --autotune whether to enable autotune, default is False + --data_path source data of training + -h/--help show help message " exit 1 fi @@ -75,7 +69,6 @@ do mkdir -p ${profiling_dump_path} elif [[ $para == --autotune* ]];then autotune=`echo ${para#*=}` - export autotune=$autotune mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak autotune_dump_path=${cur_path}/output/autotune_dump @@ -102,7 +95,6 @@ if [[ $autotune == True ]]; then train_full_1p.sh --autotune=$autotune --data_path=$data_path wait autotune=False - export autotune=$autotune fi #训练开始时间,不需要修改 @@ -110,19 +102,14 @@ start_time=$(date +%s) #进入训练脚本目录,需要模型审视修改 cd $cur_path/../ -for((RANK_ID_n=$RANK_ID_START;RANK_ID_n<$((RANK_SIZE+RANK_ID_START));RANK_ID_n++)); +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID_n" - #export RANK_ID_n=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID_n - ASCEND_DEVICE_ID=$RANK_ID_n - - # 自行添加环境变量 - - export DEVICE_ID=$RANK_ID_n - DEVICE_INDEX=$DEVICE_ID - export DEVICE_INDEX=${DEVICE_INDEX} + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export DEVICE_INDEX=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID #创建DeviceID输出目录,不需要修改 if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then @@ -132,38 +119,43 @@ do mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt fi - + # 绑核,不需要的绑核的模型删除,需要模型审视修改 + #corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` + #let a=RANK_ID*${corenum}/${RANK_SIZE} + #let b=RANK_ID+1 + #let c=b*${corenum}/${RANK_SIZE}-1 #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path - corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l` - let a=RANK_ID*${corenum}/8 - let b=RANK_ID+1 - let c=b*${corenum}/8-1 - if [ "x${bind_core}" != x ];then - bind_core="taskset -c $a-$c" - fi - ${bind_core} python3 tools/train_npu.py --dataset_dir=$data_path \ - --char_dict_path=$data_path/char_dict/char_dict.json \ - --ord_map_dict_path=$data_path/char_dict/ord_map.json \ - --save_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt/ \ - --momentum=0.95 \ - --lr=0.08 \ - --use_nesterov=True \ - --warmup_step=8000 \ - --num_iters=${ITERATIONS} \ - --over_dump=${over_dump} \ - --over_dump_path=${over_dump_path} \ - > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & - #--data_dump_flag=${data_dump_flag} \ - #--data_dump_step=${data_dump_step} \ - #--data_dump_path=${data_dump_path} \ - #--profiling=${profiling} \ - #--profiling_dump_path=${profiling_dump_path} \ - #--autotune=${autotune} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + #if [ "x${bind_core}" != x ];then + # bind_core="taskset -c $a-$c" + #fi + python3.7 ${cur_path}/../tools/train_npu.py \ + --dataset_dir=${data_path} \ + --char_dict_path=${data_path}/char_dict/char_dict.json \ + --ord_map_dict_path=${data_path}/char_dict/ord_map.json \ + --save_dir=./ \ + --momentum=0.95 \ + --lr=0.08 \ + --use_nesterov=True \ + --warmup_step=8000 \ + --num_iters=240000 > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done wait +unset RANK_TABLE_FILE +python3 ${cur_path}/../tools/eval_ckpt.py --weights_path=${cur_path}/../ \ + --device_id=0 \ + --scripts=${cur_path}/../tools/other_dataset_evaluate_shadownet.py \ + --dataset_dir=${data_path}/test/svt1/processed/ \ + --root_dir=${cur_path}/../ \ + --char_dict_path=${data_path}/char_dict/char_dict.json \ + --ord_map_dict_path=${data_path}/char_dict/ord_map.json \ + --annotation_file=${data_path}/test/svt1/annotation.txt >> ${cur_path}/output/0/train_0.log 2>&1 & +wait + + + #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) @@ -171,12 +163,13 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -FPS=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $6}'` +ASCEND_DEVICE_ID=0 +FPS=`cat ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep "Epoch_Train:" | grep -v "train_npu.py" | awk -F "FPS: " '{print $2}' | awk -F "," '{print $1}' | tail -n +2 | awk '{sum+=$1} END {print sum/NR}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -train_accuracy=`grep train_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $8}'|cut -c 1-5` +train_accuracy=`grep "accuracy" ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v "other_dataset_evaluate_shadownet.py"|grep -v "full"|awk 'END {print $7}'` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" @@ -191,10 +184,10 @@ CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' #吞吐量,不需要修改 ActualFPS=${FPS} #单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZE}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep train_loss $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v BatchTimestamp|awk '{print $10}'|sed 's/,//g'|sed '/^$/d' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep "Epoch_Train:" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | grep -v "train_npu.py" | awk -F "cost= " '{print $2}' | awk -F "," '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` @@ -206,7 +199,7 @@ echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee