diff --git a/TensorFlow/contrib/cv/YOLOV5_ID0378_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/contrib/cv/YOLOV5_ID0378_for_TensorFlow/test/train_performance_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e80a3a0c48d3650374c72259dd03fb45ed3d8d97
--- /dev/null
+++ b/TensorFlow/contrib/cv/YOLOV5_ID0378_for_TensorFlow/test/train_performance_1p.sh
@@ -0,0 +1,184 @@
+#!/bin/bash
+
+##########################################################
+######### Lines 3 to 100: please do NOT modify ###########
+######### Lines 3 to 100: please do NOT modify ###########
+######### Lines 3 to 100: please do NOT modify ###########
+##########################################################
+# Directory that contains this shell script
+cur_path=`echo $(cd $(dirname $0);pwd)`
+
+# 1 when the script name contains "performance", 0 otherwise
+perf_flag=`echo $0 | grep performance | wc -l`
+
+# Network name: second-to-last component of the script directory path
+Network=`echo $(cd $(dirname $0);pwd) | awk -F"/" '{print $(NF-1)}'`
+
+export RANK_SIZE=1
+export RANK_ID=0
+export JOB_ID=10087
+
+# Initialise the path parameters
+data_path=""
+output_path=""
+
+# Help message (fixed: "echo" needs a space before its argument)
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_performance_1P.sh "
+    echo " "
+    echo "parameter explain:
+    --data_path # dataset of training
+    --output_path # output of training
+    --train_steps # max_step for training
+    --train_epochs # max_epoch for training
+    --batch_size # batch size
+    -h/--help show help message
+    "
+    exit 1
+fi
+
+# Parse --key=value arguments
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --output_path* ]];then
+        output_path=`echo ${para#*=}`
+    elif [[ $para == --train_steps* ]];then
+        train_steps=`echo ${para#*=}`
+    elif [[ $para == --train_epochs* ]];then
+        train_epochs=`echo ${para#*=}`
+    elif [[ $para == --batch_size* ]];then
+        batch_size=`echo ${para#*=}`
+    fi
+done
+
+# data_path is mandatory
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+# Fall back to the per-device output directory when output_path is not given
+if [[ $output_path == "" ]];then
+    output_path="./test/output/${ASCEND_DEVICE_ID}"
+fi
+
+# Console log file; keep the variable name ${print_log}
+print_log="./test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log"
+modelarts_flag=${MODELARTS_MODEL_PATH}
+if [ x"${modelarts_flag}" != x ];
+then
+    echo "running without etp..."
+    print_log_name=`ls /home/ma-user/modelarts/log/ | grep proc-rank`
+    print_log="/home/ma-user/modelarts/log/${print_log_name}"
+fi
+echo "### get your log here : ${print_log}"
+
+CaseName=""
+function get_casename()
+{
+    if [ x"${perf_flag}" = x1 ];
+    then
+        CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'perf'
+    else
+        CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'acc'
+    fi
+}
+
+# Change to the code directory (parent of test/) and reset the output directory
+cd ${cur_path}/../
+rm -rf ./test/output/${ASCEND_DEVICE_ID}
+mkdir -p ./test/output/${ASCEND_DEVICE_ID}
+
+# Record the training start time
+start_time=$(date +%s)
+##########################################################
+######### Lines 3 to 100: please do NOT modify ###########
+######### Lines 3 to 100: please do NOT modify ###########
+######### Lines 3 to 100: please do NOT modify ###########
+##########################################################
+
+#=========================================================
+#=========================================================
+#======== Training command: adapt to your network ========
+#=========================================================
+#=========================================================
+# Basic parameters; review them for your model.
+# The training dataset is under ${data_path}; use that variable directly.
+# The training output directory is ${output_path}; use that variable directly.
+# Other parameters may be added, but keep batch_size and set it to the real value.
+batch_size=8
+
+sed -i "s#/home/dingwei/yolov5#${data_path}#g" ./2007_train.txt
+sed -i "s#/home/dingwei/yolov5#${data_path}#g" ./2007_val.txt
+
+if [ x"${modelarts_flag}" != x ];
+then
+    python3.7 ./train.py --epochs="${train_epochs:-10}" --steps="${train_steps:-48}" --freeze_flag=False
+else
+    python3.7 ./train.py --epochs="${train_epochs:-10}" --steps="${train_steps:-48}" --freeze_flag=False 1>"${print_log}" 2>&1
+fi
+
+# Performance: mean of the last 10 "ms/step" readings, converted to sec/step; empty when the log has none
+StepTime=`grep "ms/step" "${print_log}" | tail -n 10 | awk -F"ms" '{print $1}' | awk '{print $NF}' | awk '{sum+=$1} END {if (NR) print sum/NR/1000}'`
+FPS=`awk -v bs="${batch_size}" -v st="${StepTime}" 'BEGIN{if (st > 0) printf "%.2f\n", bs/st}'`
+
+# Collect every printed loss value
+grep "loss:" ${print_log} | awk '{print $NF}' > ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt
+
+
+###########################################################
+######### Please do NOT modify anything below #############
+######### Please do NOT modify anything below #############
+######### Please do NOT modify anything below #############
+###########################################################
+
+# Check whether this run actually used the Ascend NPU
+use_npu_flag=`grep "The model has been compiled on the Ascend AI processor" ${print_log} | wc -l`
+if [ x"${use_npu_flag}" == x0 ];
+then
+    echo "------------------ ERROR NOTICE START ------------------"
+    echo "ERROR, your task haven't used Ascend NPU, please check your npu Migration."
+    echo "------------------ ERROR NOTICE END------------------"
+else
+    echo "------------------ INFO NOTICE START------------------"
+    echo "INFO, your task have used Ascend NPU, please check your result."
+    echo "------------------ INFO NOTICE END------------------"
+fi
+
+# Build the final case name; keep the variable name ${CaseName}
+get_casename
+
+# Rename the loss file after the case name
+if [ -f ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ];
+then
+    mv ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ./test/output/${ASCEND_DEVICE_ID}/${CaseName}_loss.txt
+fi
+
+# End-to-end training time
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+echo "------------------ Final result ------------------"
+# Print FPS / time per step / end-to-end time
+echo "Final Performance images/sec : $FPS"
+echo "Final Performance sec/step : $StepTime"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Print training accuracy (train_accuracy is not set by this performance script)
+echo "Final Train Accuracy : ${train_accuracy}"
+
+# Loss value of the last iteration
+ActualLoss=(`awk 'END {print $NF}' $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt`)
+
+# Write the key results to ${CaseName}.log
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = `uname -m`" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${FPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${StepTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/TensorFlow/contrib/cv/YOLOV5_ID0378_for_TensorFlow/train.py b/TensorFlow/contrib/cv/YOLOV5_ID0378_for_TensorFlow/train.py
index 9d42f872a418cfd2a87cc6c86518d2ef7c9aa4d2..865dd1a2843523b2f221381b26a924d6512c7689 100644
--- a/TensorFlow/contrib/cv/YOLOV5_ID0378_for_TensorFlow/train.py
+++ b/TensorFlow/contrib/cv/YOLOV5_ID0378_for_TensorFlow/train.py
@@ -15,8 +15,18 @@ from utils.dataloader import YoloDatasets
 from utils.utils import get_anchors, get_classes
 from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
 from npu_bridge.npu_init import *
-
+
+import argparse
+
 if __name__ == "__main__":
+
+    # Parse CLI overrides; freeze_flag becomes a real bool (the plain string "False" would be truthy)
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--epochs", type=int, default=120)
+    parser.add_argument("--steps", type=int, default=-1)
+    parser.add_argument("--freeze_flag", type=lambda s: s.strip().lower() in ("true", "1", "yes"), default=True)
+    config = parser.parse_args()
+
     #---------------------------------------------------------------------#
     # classes_path 指向model_data下的txt,与自己训练的数据集相关
     # 训练前一定要修改classes_path,使其对应自己的数据集
@@ -107,14 +117,14 @@ if __name__ == "__main__":
     # UnFreeze_Epoch 模型总共训练的epoch
     # Unfreeze_batch_size 模型在解冻后的batch_size
     #------------------------------------------------------------------#
-    UnFreeze_Epoch = 120
+    UnFreeze_Epoch = config.epochs
     Unfreeze_batch_size = 8
     #------------------------------------------------------------------#
     # Freeze_Train 是否进行冻结训练
     # 默认先冻结主干训练后解冻训练。
     # 如果设置Freeze_Train=False,建议使用优化器为sgd
     #------------------------------------------------------------------#
-    Freeze_Train = True
+    Freeze_Train = config.freeze_flag
 
     #------------------------------------------------------------------#
     # 其它训练参数:学习率、优化器、学习率下降有关
@@ -145,7 +155,7 @@ if __name__ == "__main__":
     #------------------------------------------------------------------#
     # save_period 多少个epoch保存一次权值,默认每个世代都保存
     #------------------------------------------------------------------#
-    save_period = 1
+    save_period = 20
     #------------------------------------------------------------------#
     # save_dir 权值与日志文件保存的文件夹
     #------------------------------------------------------------------#
@@ -259,6 +269,10 @@ if __name__ == "__main__":
         epoch_step = num_train // batch_size
         epoch_step_val = num_val // batch_size
 
+        if config.steps != -1:
+            epoch_step = config.steps
+            epoch_step_val = config.steps
+
         if epoch_step == 0 or epoch_step_val == 0:
             raise ValueError('数据集过小,无法进行训练,请扩充数据集。')
 