From 14b545cbe4ebac3b7125379256e8df6f8db5cbd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=8E=E6=9C=A8=E6=9E=97?= <762129126@qq.com> Date: Fri, 30 Sep 2022 15:33:11 +0800 Subject: [PATCH 1/6] =?UTF-8?q?[UPDATE]=20=E6=9B=B4=E6=94=B9performance?= =?UTF-8?q?=E4=B8=AD=E7=9A=84train=5Fepoch=EF=BC=8C=E4=BB=8E2=E6=94=B9?= =?UTF-8?q?=E6=88=905?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test/train_performance_1p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow2/built-in/keras_sample/cv/Oct-ResNet_ID2890_for_TensorFlow2.X/test/train_performance_1p.sh b/TensorFlow2/built-in/keras_sample/cv/Oct-ResNet_ID2890_for_TensorFlow2.X/test/train_performance_1p.sh index c9fdc5ae1..aa175ca94 100644 --- a/TensorFlow2/built-in/keras_sample/cv/Oct-ResNet_ID2890_for_TensorFlow2.X/test/train_performance_1p.sh +++ b/TensorFlow2/built-in/keras_sample/cv/Oct-ResNet_ID2890_for_TensorFlow2.X/test/train_performance_1p.sh @@ -19,7 +19,7 @@ data_path="" #网络名称,同目录名称 Network="Oct-ResNet_ID2890_for_TensorFlow2.X" #训练epoch -train_epochs=2 +train_epochs=5 #训练step #train_steps=1000 #训练batch_size -- Gitee From 43994bc8e79ef0adc6aa394b55c2ef09f863d396 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=8E=E6=9C=A8=E6=9E=97?= <762129126@qq.com> Date: Sat, 8 Oct 2022 11:41:27 +0800 Subject: [PATCH 2/6] =?UTF-8?q?[ADD]=20=E6=B7=BB=E5=8A=A0=E7=BB=91?= =?UTF-8?q?=E6=A0=B8=E6=93=8D=E4=BD=9C=E8=BF=9B=E8=A1=8C=E6=80=A7=E8=83=BD?= =?UTF-8?q?=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test/train_performance_1p.sh | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/TensorFlow2/built-in/keras_sample/cv/Oct-ResNet_ID2890_for_TensorFlow2.X/test/train_performance_1p.sh b/TensorFlow2/built-in/keras_sample/cv/Oct-ResNet_ID2890_for_TensorFlow2.X/test/train_performance_1p.sh index aa175ca94..999508f6b 100644 --- a/TensorFlow2/built-in/keras_sample/cv/Oct-ResNet_ID2890_for_TensorFlow2.X/test/train_performance_1p.sh +++ b/TensorFlow2/built-in/keras_sample/cv/Oct-ResNet_ID2890_for_TensorFlow2.X/test/train_performance_1p.sh @@ -132,12 +132,20 @@ else fi #############执行训练######################### +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +#--data_path, --model_dir, --precision_mode, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune +#绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改 +RANK_ID=0 +cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'` +cpustep=`expr $cpucount / 8` +echo "taskset c steps:" $cpustep +let a=RANK_ID*$cpustep +let b=RANK_ID+1 +let c=b*$cpustep-1 #训练开始时间,不需要修改 start_time=$(date +%s) -#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 -#--data_path, --model_dir, --precision_mode, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune -nohup python3 train.py \ +nohup taskset -c $a-$c python3 train.py \ --data_dir=${data_path} \ --batch_size=${batch_size} \ --train_epochs=${train_epochs} \ -- Gitee From d7a9c1acde1e693ca2a94284b2e22bb7c7c5a192 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=8E=E6=9C=A8=E6=9E=97?= <762129126@qq.com> Date: Sat, 8 Oct 2022 13:56:12 +0800 Subject: [PATCH 3/6] =?UTF-8?q?[ADD]=20=E6=B7=BB=E5=8A=A0=E6=89=93?= =?UTF-8?q?=E5=8D=B0=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test/train_performance_1p.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/TensorFlow2/built-in/cv/image_classification/TCC_ID0714_for_TensorFlow2.X/test/train_performance_1p.sh b/TensorFlow2/built-in/cv/image_classification/TCC_ID0714_for_TensorFlow2.X/test/train_performance_1p.sh index 699dbe9a9..b17e71f2b 100644 --- a/TensorFlow2/built-in/cv/image_classification/TCC_ID0714_for_TensorFlow2.X/test/train_performance_1p.sh +++ b/TensorFlow2/built-in/cv/image_classification/TCC_ID0714_for_TensorFlow2.X/test/train_performance_1p.sh @@ -163,6 +163,7 @@ do sleep 60 num=`grep 'E19999' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | wc -l` + echo "$num****************************************************************************" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log while [ ${num} -eq 0 ] do num=`grep 'E19999' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | wc -l` @@ -173,6 +174,8 @@ do done wait +echo "****************************************************************************" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log + cd ../tcc sed -i "s|${data_path}/tmp|tmp|g" train.py sed -i "s|${data_path}/tmp|tmp|g" config.py @@ -195,6 +198,8 @@ FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${step_per_s}'}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" +echo "$FPS****************************************************************************" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log + #输出训练精度,需要模型审视修改 #train_accuracy=`grep 'accuracy =' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $9}'|awk -F "," '{print $1}'` #打印,不需要修改 @@ -219,6 +224,8 @@ grep 'Loss:' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk #最后一个迭代loss值,不需要修改 ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +echo "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log****************************************************************************" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log + #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee From f5b7933fe562aa5a10b8e7e38f046ec61c10b094 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=8E=E6=9C=A8=E6=9E=97?= <762129126@qq.com> Date: Sat, 8 Oct 2022 14:02:42 +0800 Subject: [PATCH 4/6] =?UTF-8?q?[UPDATE]=20=E7=BB=91=E6=A0=B8=E6=B5=8B?= =?UTF-8?q?=E8=AF=95=E6=80=A7=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test/train_performance_1p.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TensorFlow2/built-in/keras_sample/cv/Oct-ResNet_ID2890_for_TensorFlow2.X/test/train_performance_1p.sh b/TensorFlow2/built-in/keras_sample/cv/Oct-ResNet_ID2890_for_TensorFlow2.X/test/train_performance_1p.sh index 999508f6b..b96f7e8ba 100644 --- a/TensorFlow2/built-in/keras_sample/cv/Oct-ResNet_ID2890_for_TensorFlow2.X/test/train_performance_1p.sh +++ b/TensorFlow2/built-in/keras_sample/cv/Oct-ResNet_ID2890_for_TensorFlow2.X/test/train_performance_1p.sh @@ -135,12 +135,12 @@ fi #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_path, --model_dir, --precision_mode, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune #绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改 -RANK_ID=0 +#RANK_ID=0 cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'` cpustep=`expr $cpucount / 8` echo "taskset c steps:" $cpustep -let a=RANK_ID*$cpustep -let b=RANK_ID+1 +let a=${ASCEND_DEVICE_ID}*$cpustep +let b=${ASCEND_DEVICE_ID}+1 let c=b*$cpustep-1 #训练开始时间,不需要修改 start_time=$(date +%s) -- Gitee From cb387d64bd4399260b64bad0a74462e35c9ccf23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=8E=E6=9C=A8=E6=9E=97?= <762129126@qq.com> Date: Sat, 8 Oct 2022 14:29:56 +0800 Subject: [PATCH 5/6] =?UTF-8?q?[BACK]=20=E5=88=A0=E9=99=A4=E6=B5=8B?= =?UTF-8?q?=E8=AF=95=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test/train_performance_1p.sh | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/TensorFlow2/built-in/keras_sample/cv/Oct-ResNet_ID2890_for_TensorFlow2.X/test/train_performance_1p.sh b/TensorFlow2/built-in/keras_sample/cv/Oct-ResNet_ID2890_for_TensorFlow2.X/test/train_performance_1p.sh index b96f7e8ba..aa175ca94 100644 --- a/TensorFlow2/built-in/keras_sample/cv/Oct-ResNet_ID2890_for_TensorFlow2.X/test/train_performance_1p.sh +++ b/TensorFlow2/built-in/keras_sample/cv/Oct-ResNet_ID2890_for_TensorFlow2.X/test/train_performance_1p.sh @@ -132,20 +132,12 @@ else fi #############执行训练######################### -#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 -#--data_path, --model_dir, --precision_mode, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune -#绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改 -#RANK_ID=0 -cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'` -cpustep=`expr $cpucount / 8` -echo "taskset c steps:" $cpustep -let a=${ASCEND_DEVICE_ID}*$cpustep -let b=${ASCEND_DEVICE_ID}+1 -let c=b*$cpustep-1 #训练开始时间,不需要修改 start_time=$(date +%s) -nohup taskset -c $a-$c python3 train.py \ +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +#--data_path, --model_dir, --precision_mode, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune +nohup python3 train.py \ --data_dir=${data_path} \ --batch_size=${batch_size} \ --train_epochs=${train_epochs} \ -- Gitee From e9508ce9fbde4242854c10bcb669f658f8e5cd88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=8E=E6=9C=A8=E6=9E=97?= <762129126@qq.com> Date: Sat, 8 Oct 2022 14:31:12 +0800 Subject: [PATCH 6/6] =?UTF-8?q?[DEL]=20=E5=88=A0=E9=99=A4=E6=B5=8B?= =?UTF-8?q?=E8=AF=95=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test/train_performance_1p.sh | 7 ------- 1 file changed, 7 deletions(-) diff --git a/TensorFlow2/built-in/cv/image_classification/TCC_ID0714_for_TensorFlow2.X/test/train_performance_1p.sh b/TensorFlow2/built-in/cv/image_classification/TCC_ID0714_for_TensorFlow2.X/test/train_performance_1p.sh index b17e71f2b..699dbe9a9 100644 --- a/TensorFlow2/built-in/cv/image_classification/TCC_ID0714_for_TensorFlow2.X/test/train_performance_1p.sh +++ b/TensorFlow2/built-in/cv/image_classification/TCC_ID0714_for_TensorFlow2.X/test/train_performance_1p.sh @@ -163,7 +163,6 @@ do sleep 60 num=`grep 'E19999' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | wc -l` - echo "$num****************************************************************************" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log while [ ${num} -eq 0 ] do num=`grep 'E19999' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | wc -l` @@ -174,8 +173,6 @@ do done wait -echo "****************************************************************************" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log - cd ../tcc sed -i "s|${data_path}/tmp|tmp|g" train.py sed -i "s|${data_path}/tmp|tmp|g" config.py @@ -198,8 +195,6 @@ FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${step_per_s}'}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" -echo "$FPS****************************************************************************" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log - #输出训练精度,需要模型审视修改 #train_accuracy=`grep 'accuracy =' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $9}'|awk -F "," '{print $1}'` #打印,不需要修改 @@ -224,8 +219,6 @@ grep 'Loss:' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk #最后一个迭代loss值,不需要修改 ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` -echo "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log****************************************************************************" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log - #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee