From 04b0a4f2c1754338a8b6ed458b86e4e251481727 Mon Sep 17 00:00:00 2001 From: liuyihang <1905527319@qq.com> Date: Fri, 2 Dec 2022 03:29:57 +0000 Subject: [PATCH] =?UTF-8?q?update=20AlexNet=5FID0072=5Ffor=5FTensorFlow/te?= =?UTF-8?q?st/train=5Ffull=5F1p.sh.=201p=E7=B2=BE=E5=BA=A6=E8=84=9A?= =?UTF-8?q?=E6=9C=AC=E8=B6=85=E5=8F=82=E4=B8=8D=E5=AF=B9=EF=BC=8C=E6=A0=B9?= =?UTF-8?q?=E6=8D=AE8p=E7=B2=BE=E5=BA=A6=E8=84=9A=E6=9C=AC=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=EF=BC=8C=E4=B8=AA=E4=BA=BA=E4=BB=93=E9=AA=8C=E8=AF=81?= =?UTF-8?q?=EF=BC=9Adebug01112347?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: liuyihang <1905527319@qq.com> --- .../test/train_full_1p.sh | 227 ++++++++++-------- 1 file changed, 133 insertions(+), 94 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/AlexNet_ID0072_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/built-in/cv/image_classification/AlexNet_ID0072_for_TensorFlow/test/train_full_1p.sh index 5e0051b4e..3d3b93d1a 100644 --- a/TensorFlow/built-in/cv/image_classification/AlexNet_ID0072_for_TensorFlow/test/train_full_1p.sh +++ b/TensorFlow/built-in/cv/image_classification/AlexNet_ID0072_for_TensorFlow/test/train_full_1p.sh @@ -5,9 +5,8 @@ cur_path=`pwd` #集合通信参数,不需要修改 #保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 +export JOB_ID=10087 export RANK_SIZE=1 -unset RANK_TABLE_FILE -#export RANK_TABLE_FILE=${cur_path}/../configs/rank_table_8p.json export JOB_ID=10087 RANK_ID_START=0 @@ -15,55 +14,21 @@ RANK_ID_START=0 data_path="" #设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL=3 +#export ASCEND_GLOBAL_LOG_LEVEL=3 +#基础参数 需要模型审视修改 #网络名称,同目录名称 -Network="AlexNet_for_TensorFlow" +Network="AlexNet_ID0072_for_TensorFlow" + #训练batch_size -batch_size=256 -#学习率 -learning_rate=0.015 +batch_size=128 + #维持参数,以下不需要修改 over_dump=False data_dump_flag=False data_dump_step="10" profiling=False #参数校验,不需要修改 -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_full_8p.sh " - echo " " - echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is 0 - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --autotune whether to enable autotune, default is False - --data_path source data of training - -h/--help show help message - " - exit 1 -fi -#help info - -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_full_8p.sh " - echo " " - echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is 0 - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --autotune whether to enable autotune, default is False - --data_path source data of training - -h/--help show help message - " - exit 1 -fi - -#参数校验,不需要修改 - for para in $* do if [[ $para == --precision_mode* ]];then @@ -84,13 +49,6 @@ do mkdir -p ${profiling_dump_path} elif [[ $para == --autotune* ]];then autotune=`echo ${para#*=}` - autotune=True -#开autotune特有环境变量 - export autotune=True - export REPEAT_TUNE=True - export ASCEND_DEVICE_ID=0 - export ENABLE_TUNE_BANK=True - export TE_PARALLEL_COMPILER=32 mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak autotune_dump_path=${cur_path}/output/autotune_dump @@ -98,75 +56,156 @@ do mkdir -p ${autotune_dump_path}/rl cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ - - elif [[ $para == --data_path* ]];then data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]]; then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" fi done #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" + + + exit 1 +fi +#autotune时,先开启autotune执行单P训练,不需要修改 +if [[ $autotune == True ]]; then + sh -x train_full_1p.sh --autotune=$autotune --data_path=$data_path + wait + autotune=False + export autotune=False + + export RANK_SIZE=1 + export JOB_ID=10087 + RANK_ID_START=0 + unset TE_PARALLEL_COMPILER + +fi - # sed -i 's/n_epoches = 1/n_epoches = 20/g' ../configs/config.py - - # sed -i 's/iteration_per_loop = 1/iteration_per_loop = 10/g' ../configs/config.py - +#训练开始时间,不需要修改 +start_time=$(date +%s) - exit 1 - -fi +#进入训练脚本目录,需要模型审视修改 for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" + echo "Device ID: $ASCEND_DEVICE_ID" export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID - ASCEND_DEVICE_ID=$RANK_ID - - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + + export DEVICE_ID=$ASCEND_DEVICE_ID + DEVICE_INDEX=$ASCEND_DEVICE_ID + export DEVICE_INDEX=${DEVICE_INDEX} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + else mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt fi -EXEC_DIR=$(pwd) -RESULTS=results/1p - -mkdir -p ${EXEC_DIR}/${RESULTS}/${ASCEND_DEVICE_ID} - -rm -rf ${EXEC_DIR}/${RESULTS}/${ASCEND_DEVICE_ID}/* - -cd ${EXEC_DIR}/${RESULTS}/${ASCEND_DEVICE_ID} - -env > ${EXEC_DIR}/${RESULTS}/env_${ASCEND_DEVICE_ID}.log - - -python3.7 ${EXEC_DIR}/../train.py --rank_size=1 \ - --iterations_per_loop=100 \ - --batch_size=${batch_size} \ - --data_dir=${data_path} \ - --mode=train \ - --checkpoint_dir=${EXEC_DIR}/${RESULTS}/${ASCEND_DEVICE_ID}/model_1p/ \ - --lr=0.015 \ - --log_dir=./model_1p > ./train_${ASCEND_DEVICE_ID}.log 2>&1 + # 绑核,不需要的绑核的模型删除,需要模型审视修改 + corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` + let a=RANK_ID*${corenum}/${RANK_SIZE} + let b=RANK_ID+1 + let c=b*${corenum}/${RANK_SIZE}-1 -if [ $? -eq 0 ] ; -then - echo "turing train success" >> ${EXEC_DIR}/${RESULTS}/train_${ASCEND_DEVICE_ID}.log -else - echo "turing train fail" >> ${EXEC_DIR}/${RESULTS}/train_${ASCEND_DEVICE_ID}.log -fi + #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path + if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" + fi +python3.7 ${cur_path}/../train.py --rank_size=1 \ + --epochs_between_evals=1 \ + --mode=train \ + --max_epochs=150 \ + --iterations_per_loop=100 \ + --batch_size=${batch_size} \ + --data_dir=${data_path} \ + --lr=0.0075 \ + --checkpoint_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \ + --log_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + done +wait +#设置环境变量,不需要修改 +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + export DEVICE_ID=$ASCEND_DEVICE_ID + DEVICE_INDEX=$ASCEND_DEVICE_ID + export DEVICE_INDEX=${DEVICE_INDEX} + python3 ${cur_path}/../train.py --rank_size=1 \ + --epochs_between_evals=1 \ + --mode=evaluate \ + --max_epochs=150 \ + --iterations_per_loop=100 \ + --batch_size=${batch_size} \ + --data_dir=${data_path} \ + --lr=0.075 \ + --checkpoint_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \ + --log_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & - - - +done +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能ms/step,需要模型审视修改 +step_sec=`grep FPS ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $5}'|awk -F ":" 'END {print $2}'|awk -F "," 'END {print $1}'|awk -F "." '{print $1}'` +#打印,不需要修改 +echo "Final Performance ms/step : $step_sec" + + +#打印,不需要修改 +echo "Final Training Duration sec : $e2e_sec" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -B 1 "Finished" ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|head -1|awk 'END {print $3}'` +#打印,不需要修改 +echo "Final train_accuracy is ${train_accuracy}" +echo "E2E training Duration sec: $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${step_sec} +#单迭代训练时长,需要模型审视修改 +TrainingTime=`expr ${batch_size} \* 1000 / ${step_sec}` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +`grep total_loss ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $9}'|tr -d , >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +TrainAccuracy=$train_accuracy +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${TrainAccuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee