From 04b0a4f2c1754338a8b6ed458b86e4e251481727 Mon Sep 17 00:00:00 2001
From: liuyihang <1905527319@qq.com>
Date: Fri, 2 Dec 2022 03:29:57 +0000
Subject: [PATCH] =?UTF-8?q?update=20AlexNet=5FID0072=5Ffor=5FTensorFlow/te?=
 =?UTF-8?q?st/train=5Ffull=5F1p.sh.=201p=E7=B2=BE=E5=BA=A6=E8=84=9A?=
 =?UTF-8?q?=E6=9C=AC=E8=B6=85=E5=8F=82=E4=B8=8D=E5=AF=B9=EF=BC=8C=E6=A0=B9?=
 =?UTF-8?q?=E6=8D=AE8p=E7=B2=BE=E5=BA=A6=E8=84=9A=E6=9C=AC=E4=BF=AE?=
 =?UTF-8?q?=E6=94=B9=EF=BC=8C=E4=B8=AA=E4=BA=BA=E4=BB=93=E9=AA=8C=E8=AF=81?=
 =?UTF-8?q?=EF=BC=9Adebug01112347?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: liuyihang <1905527319@qq.com>
---
 .../test/train_full_1p.sh                     | 227 ++++++++++--------
 1 file changed, 133 insertions(+), 94 deletions(-)
diff --git a/TensorFlow/built-in/cv/image_classification/AlexNet_ID0072_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/built-in/cv/image_classification/AlexNet_ID0072_for_TensorFlow/test/train_full_1p.sh
index 5e0051b4e..3d3b93d1a 100644
--- a/TensorFlow/built-in/cv/image_classification/AlexNet_ID0072_for_TensorFlow/test/train_full_1p.sh
+++ b/TensorFlow/built-in/cv/image_classification/AlexNet_ID0072_for_TensorFlow/test/train_full_1p.sh
@@ -5,9 +5,8 @@ cur_path=`pwd`
 
 #集合通信参数,不需要修改
 #保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下
+export JOB_ID=10087
 export RANK_SIZE=1
-unset RANK_TABLE_FILE
-#export RANK_TABLE_FILE=${cur_path}/../configs/rank_table_8p.json
 export JOB_ID=10087
 RANK_ID_START=0
 
@@ -15,55 +14,21 @@ RANK_ID_START=0
 data_path=""
 
 #设置默认日志级别,不需要修改
-export ASCEND_GLOBAL_LOG_LEVEL=3
+#export ASCEND_GLOBAL_LOG_LEVEL=3
 
+#基础参数 需要模型审视修改
 #网络名称，同目录名称
-Network="AlexNet_for_TensorFlow"
+Network="AlexNet_ID0072_for_TensorFlow"
+
 #训练batch_size
-batch_size=256
-#学习率
-learning_rate=0.015
+batch_size=128
+
 #维持参数，以下不需要修改
 over_dump=False
 data_dump_flag=False
 data_dump_step="10"
 profiling=False
 #参数校验，不需要修改
-if [[ $1 == --help || $1 == -h ]];then
-    echo"usage:./train_full_8p.sh <args>"
-    echo " "
-    echo "parameter explain:
-    --precision_mode           precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
-    --over_dump		           if or not over detection, default is False
-    --data_dump_flag		   data dump flag, default is 0
-    --data_dump_step		   data dump step, default is 10
-    --profiling		           if or not profiling for performance debug, default is False
-    --autotune                 whether to enable autotune, default is False
-    --data_path		           source data of training
-    -h/--help		           show help message
-    "
-    exit 1
-fi
-#help info
-
-if [[ $1 == --help || $1 == -h ]];then
-    echo"usage:./train_full_8p.sh <args>"
-    echo " "
-    echo "parameter explain:
-    --precision_mode           precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
-    --over_dump		           if or not over detection, default is False
-    --data_dump_flag		   data dump flag, default is 0
-    --data_dump_step		   data dump step, default is 10
-    --profiling		           if or not profiling for performance debug, default is False
-    --autotune                 whether to enable autotune, default is False
-    --data_path		           source data of training
-    -h/--help		           show help message
-    "
-    exit 1
-fi
-
-#参数校验，不需要修改
-
 for para in $*
 do
     if [[ $para == --precision_mode* ]];then
@@ -84,13 +49,6 @@ do
         mkdir -p ${profiling_dump_path}
     elif [[ $para == --autotune* ]];then
         autotune=`echo ${para#*=}`
-		autotune=True
-#开autotune特有环境变量
-		export autotune=True
-		export REPEAT_TUNE=True
-		export ASCEND_DEVICE_ID=0
-		export ENABLE_TUNE_BANK=True
-		export TE_PARALLEL_COMPILER=32
         mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak
         mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak
         autotune_dump_path=${cur_path}/output/autotune_dump
@@ -98,75 +56,156 @@ do
         mkdir -p ${autotune_dump_path}/rl
         cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/
         cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/
-
-	
     elif [[ $para == --data_path* ]];then
         data_path=`echo ${para#*=}`
+    elif [[ $para == --bind_core* ]]; then
+        bind_core=`echo ${para#*=}`
+        name_bind="_bindcore"
     fi
 done
 
 #校验是否传入data_path,不需要修改
 if [[ $data_path == "" ]];then
     echo "[Error] para \"data_path\" must be confing"
+	
+	
+    exit 1
+fi
 
 
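+#autotune: first run a single-device training pass with autotune enabled (template section, normally left unchanged). NOTE(review): this re-invokes train_full_1p.sh itself with --autotune=True, so the child reaches this same branch again - confirm the recursion terminates.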
+if [[ $autotune == True ]]; then
+    sh -x train_full_1p.sh --autotune=$autotune --data_path=$data_path
+    wait
+    autotune=False
+	 export autotune=False
+	 
+	export RANK_SIZE=1
+    export JOB_ID=10087
+    RANK_ID_START=0
+	unset TE_PARALLEL_COMPILER
+	 
+fi
 
-         # sed -i 's/n_epoches = 1/n_epoches = 20/g' ../configs/config.py 
-
-        #  sed -i 's/iteration_per_loop = 1/iteration_per_loop = 10/g' ../configs/config.py
-		  
+#训练开始时间，不需要修改
+start_time=$(date +%s)
 
-    exit 1
-	
-fi
+#进入训练脚本目录，需要模型审视修改
 
 for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
 do
     #设置环境变量，不需要修改
-    echo "Device ID: $RANK_ID"
+    echo "Device ID: $ASCEND_DEVICE_ID"
     export RANK_ID=$RANK_ID
-    export ASCEND_DEVICE_ID=$RANK_ID
-    ASCEND_DEVICE_ID=$RANK_ID
-	
-	   if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+
+    export DEVICE_ID=$ASCEND_DEVICE_ID
+	DEVICE_INDEX=$ASCEND_DEVICE_ID
+    export DEVICE_INDEX=${DEVICE_INDEX}
+
+    #创建DeviceID输出目录，不需要修改
+    if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
         rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
-      mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
-   else
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    else
         mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
     fi
 
-EXEC_DIR=$(pwd)
-RESULTS=results/1p
-
-mkdir -p ${EXEC_DIR}/${RESULTS}/${ASCEND_DEVICE_ID}	
-	
-rm -rf ${EXEC_DIR}/${RESULTS}/${ASCEND_DEVICE_ID}/*
-
-cd ${EXEC_DIR}/${RESULTS}/${ASCEND_DEVICE_ID}
-
-env > ${EXEC_DIR}/${RESULTS}/env_${ASCEND_DEVICE_ID}.log
-
-
-python3.7 ${EXEC_DIR}/../train.py --rank_size=1 \
-	--iterations_per_loop=100 \
-	--batch_size=${batch_size} \
-	--data_dir=${data_path} \
-	--mode=train \
-	--checkpoint_dir=${EXEC_DIR}/${RESULTS}/${ASCEND_DEVICE_ID}/model_1p/ \
-	--lr=0.015 \
-	--log_dir=./model_1p > ./train_${ASCEND_DEVICE_ID}.log 2>&1 
+	# 绑核，不需要的绑核的模型删除，需要模型审视修改
+    corenum=`cat /proc/cpuinfo |grep "processor"|wc -l`
+    let a=RANK_ID*${corenum}/${RANK_SIZE}
+    let b=RANK_ID+1
+    let c=b*${corenum}/${RANK_SIZE}-1
 
-if [ $? -eq 0 ] ;
-then
-    echo "turing train success" >> ${EXEC_DIR}/${RESULTS}/train_${ASCEND_DEVICE_ID}.log
-else
-    echo "turing train fail" >> ${EXEC_DIR}/${RESULTS}/train_${ASCEND_DEVICE_ID}.log
-fi
+    # Run training in the background. ${bind_core} is deliberately unquoted: empty by default, it expands to the taskset prefix when --bind_core was given. Args listed on the next line are template-owned; the rest need per-model review.
+    # --data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path, --data_dump_flag, --data_dump_step, --data_dump_path, --profiling, --profiling_dump_path
+    if [ "x${bind_core}" != x ];then
+        bind_core="taskset -c $a-$c"
+    fi
+    ${bind_core} python3.7 ${cur_path}/../train.py --rank_size=1 \
+        --epochs_between_evals=1 \
+        --mode=train \
+        --max_epochs=150 \
+        --iterations_per_loop=100 \
+        --batch_size=${batch_size} \
+        --data_dir=${data_path} \
+        --lr=0.0075 \
+        --checkpoint_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \
+        --log_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+						
 
 
 done
+wait
+#设置环境变量，不需要修改
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    #设置环境变量，不需要修改
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
 
+    export DEVICE_ID=$ASCEND_DEVICE_ID
+        DEVICE_INDEX=$ASCEND_DEVICE_ID
+    export DEVICE_INDEX=${DEVICE_INDEX}
+    python3 ${cur_path}/../train.py --rank_size=1 \
+                      --epochs_between_evals=1 \
+                      --mode=evaluate \
+                            --max_epochs=150 \
+                      --iterations_per_loop=100 \
+                            --batch_size=${batch_size} \
+                            --data_dir=${data_path} \
+                            --lr=0.075 \
+                      --checkpoint_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \
+                            --log_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
 
-
-
-
+done
+wait
+
+# Training end time (template section, no changes needed)
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results (template section, no changes needed)
+echo "------------------ Final result ------------------"
+# Performance value parsed from the last "FPS" line of the training log (needs per-model review)
+step_sec=$(grep FPS  ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $5}'|awk -F ":" 'END {print $2}'|awk -F "," 'END {print $1}'|awk -F "." '{print $1}')
+# Print (no changes needed)
+echo "Final Performance ms/step : $step_sec"
+
+
+# Print the end-to-end duration from e2e_time computed above; the previously referenced e2e_sec is not assigned anywhere visible
+echo "Final Training Duration sec : $e2e_time"
+# Training accuracy: third field of the line preceding the first "Finished" match (needs per-model review)
+train_accuracy=$(grep -B 1 "Finished" ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|head -1|awk 'END {print $3}')
+# Print (no changes needed)
+echo "Final train_accuracy is ${train_accuracy}"
+echo "E2E training Duration sec: $e2e_time"
+
+#稳定性精度看护结果汇总
+#训练用例信息，不需要修改
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+##获取性能数据
+#吞吐量，不需要修改
+ActualFPS=${step_sec}
+# Per-iteration training time (needs per-model review). Stays empty when step_sec is not a positive integer, rather than letting expr fail on an empty or zero divisor.
+TrainingTime=""
+[[ "${step_sec}" =~ ^[1-9][0-9]*$ ]] && TrainingTime=$(( batch_size * 1000 / step_sec ))
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt (needs per-model review); run directly, not inside a command substitution
+grep total_loss ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $9}'|tr -d , >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#最后一个迭代loss值，不需要修改
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+TrainAccuracy=$train_accuracy
+#关键信息打印到${CaseName}.log中，不需要修改
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${TrainAccuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
-- 
Gitee