From 7fcdd408cfcb7df010657c3b387447347edade70 Mon Sep 17 00:00:00 2001 From: jieliang cai <975092674@qq.com> Date: Sun, 9 Oct 2022 03:34:23 +0000 Subject: [PATCH 1/2] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Tens?= =?UTF-8?q?orFlow/built-in/cv/detection/YoloV3=5FID0076=5Ffor=5FTensorFlow?= =?UTF-8?q?/test/train=5Ffull=5F1p.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test/train_full_1p.sh | 189 ------------------ 1 file changed, 189 deletions(-) delete mode 100644 TensorFlow/built-in/cv/detection/YoloV3_ID0076_for_TensorFlow/test/train_full_1p.sh diff --git a/TensorFlow/built-in/cv/detection/YoloV3_ID0076_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/built-in/cv/detection/YoloV3_ID0076_for_TensorFlow/test/train_full_1p.sh deleted file mode 100644 index ffe599e5b..000000000 --- a/TensorFlow/built-in/cv/detection/YoloV3_ID0076_for_TensorFlow/test/train_full_1p.sh +++ /dev/null @@ -1,189 +0,0 @@ -#!/bin/bash - -#当前路径,不需要修改 -cur_path=`pwd` - -#集合通信参数,不需要修改 -#保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 -RANK_SIZE=1 -RANK_TABLE_FILE=${cur_path}/../configs/rank_table_8p.json -RANK_ID_START=0 - -# 数据集路径,保持为空,不需要修改 -data_path="" - -#设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL=3 - -#基础参数 需要模型审视修改 -#网络名称,同目录名称 -Network="InceptionV4_for_TensorFlow" -#训练epoch -train_epochs=100 -#训练batch_size -batch_size=64 - -#TF2.X独有,不需要修改 -#export NPU_LOOP_SIZE=${train_steps} - -#维测参数,precision_mode需要模型审视修改 -precision_mode="allow_mix_precision" -#维持参数,以下不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False -autotune=False - -# 帮助信息,不需要修改 -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_full_8p.sh " - echo " " - echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is 0 - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --autotune whether to enable autotune, default is False - --data_path source data of training - -h/--help show help message - " - exit 1 -fi - -#参数校验,不需要修改 -for para in $* -do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --autotune* ]];then - autotune=`echo ${para#*=}` - export autotune=$autotune - mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak - mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak - autotune_dump_path=${cur_path}/output/autotune_dump - mkdir -p ${autotune_dump_path}/GA - mkdir -p ${autotune_dump_path}/rl - cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ - cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - fi -done - -#校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi -#训练开始时间,不需要修改 -start_time=$(date +%s) - -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/../ -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); -do - #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID - ASCEND_DEVICE_ID=$RANK_ID - - # 自行添加环境变量 - export RANK_SIZE=8 - export DEVICE_ID=$RANK_ID - DEVICE_INDEX=$DEVICE_ID - export DEVICE_INDEX=${DEVICE_INDEX} - export RANK_TABLE_FILE=${cur_path}/../configs/rank_table_8p.json - export JOB_ID=123678 - export FUSION_TENSOR_SIZE=1000000000 - - #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - fi - - cp -r $data_path/darknet53* ${cur_path}/../data/darknet_weights/ - - #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 - #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path - python3 train.py \ - --mode multi \ - --data_url $data_path \ - --train_url ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \ - --over_dump ${over_dump} \ - --over_dump_path ${over_dump_path} \ - > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & - #--data_dump_flag=${data_dump_flag} \ - #--data_dump_step=${data_dump_step} \ - #--data_dump_path=${data_dump_path} \ - #--profiling=${profiling} \ - #--profiling_dump_path=${profiling_dump_path} \ - #--autotune=${autotune} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & -done -wait - -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -FPS=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $6}'` -#打印,不需要修改 -echo "Final Performance images/sec : $FPS" - -#输出训练精度,需要模型审视修改 -train_accuracy=`grep train_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $8}'|cut -c 1-5` -#打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" -echo "E2E Training Duration sec : $e2e_time" - -#稳定性精度看护结果汇总 -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' - -##获取性能数据 -#吞吐量,不需要修改 -ActualFPS=${FPS} -#单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` - -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep train_loss $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v BatchTimestamp|awk '{print $10}'|sed 's/,//g'|sed '/^$/d' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt - -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee From f863267c20329f17ca2f8ffaa6c6556de58a9e1e Mon Sep 17 00:00:00 2001 From: jieliang cai <975092674@qq.com> Date: Sun, 9 Oct 2022 06:47:30 +0000 Subject: [PATCH 2/2] update TensorFlow/built-in/cv/detection/YoloV3_ID0076_for_TensorFlow/README.md. Signed-off-by: jieliang cai <975092674@qq.com> --- .../cv/detection/YoloV3_ID0076_for_TensorFlow/README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/TensorFlow/built-in/cv/detection/YoloV3_ID0076_for_TensorFlow/README.md b/TensorFlow/built-in/cv/detection/YoloV3_ID0076_for_TensorFlow/README.md index 8c8099652..75eec49b4 100644 --- a/TensorFlow/built-in/cv/detection/YoloV3_ID0076_for_TensorFlow/README.md +++ b/TensorFlow/built-in/cv/detection/YoloV3_ID0076_for_TensorFlow/README.md @@ -152,7 +152,7 @@ config_proto = tf.ConfigProto(allow_soft_placement=True) 2. 单卡训练 - 2.1 设置单卡训练参数(脚本位于YoloV3_ID0076_for_TensorFlow/test/train_full_1p.sh),示例如下。 + 2.1 设置单卡训练参数(脚本位于YoloV3_ID0076_for_TensorFlow/test/train_performance_1p.sh),示例如下。 ``` @@ -168,7 +168,7 @@ config_proto = tf.ConfigProto(allow_soft_placement=True) 2.2 单卡训练指令(脚本位于YoloV3_ID0076_for_TensorFlow/test) ``` - bash train_full_1p.sh --data_path=xx + bash train_performance_1p.sh --data_path=xx 数据集应有如下目录结构,指定data_path时应当指定为dataset这一层, 例如--data_path=/home/dataset ├──dataset @@ -215,7 +215,6 @@ config_proto = tf.ConfigProto(allow_soft_placement=True) ├── requirements.txt //依赖 ├── train.py //训练入口脚本 ├── test - |—— train_full_1p.sh //单卡训练脚本 |—— train_performance_1p.sh //单卡训练脚本 |—— train_full_8p.sh //多卡训练脚本 |—— train_performance_8p.sh //多卡训练脚本 @@ -232,4 +231,4 @@ train_epochs 总训练epoch数 ## 训练过程 通过“模型训练”中的训练指令启动单卡训练。 -将训练脚本(train_full_1p.sh)中的data_path设置为训练数据集的路径。具体的流程参见“模型训练”的示例。 \ No newline at end of file +将训练脚本(train_performance_1p.sh)中的data_path设置为训练数据集的路径。具体的流程参见“模型训练”的示例。 \ No newline at end of file -- Gitee