From e61aafcd91188ed3abeaeed0848998ac4145f6e7 Mon Sep 17 00:00:00 2001 From: chenhao388 Date: Sat, 10 Aug 2024 16:38:49 +0800 Subject: [PATCH 1/4] =?UTF-8?q?[contrib][Pytorch][Resnet50=5FID4149]=20?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=80=A7=E8=83=BD=E7=BB=9F=E8=AE=A1=E6=96=B9?= =?UTF-8?q?=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test/train_performance_16p.sh | 2 +- .../test/train_performance_1p.sh | 71 +++++++++---------- .../test/train_performance_8p.sh | 4 +- 3 files changed, 37 insertions(+), 40 deletions(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_16p.sh b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_16p.sh index 7cf2b07c32..e78001b025 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_16p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_16p.sh @@ -130,7 +130,7 @@ e2e_time=$(( $end_time - $start_time )) # 结果打印,不需要修改 echo "------------------ Final result ------------------" # 输出性能FPS,需要模型审视修改 -step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $1}' | tail -n 20 | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` +step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $2}' |awk -F " " '{print $3}' | awk -F ")" '{print $1}' | tail -n 1` FPS=`echo "${batch_size} / ${step_time}"|bc` # 打印,不需要修改 echo "Final Performance images/sec : $FPS" diff --git a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh index ecbdb30c86..04dc3e04cf 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh @@ -1,22 +1,22 @@ #!/bin/bash -################òҪģ޸################## -# ѡֶ(ڴ˴IJ): Network batch_size RANK_SIZE -# ƣͬĿ¼ +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 Network="ResNet50_ID4149_for_PyTorch" -# ѵbatch_size +# 训练batch_size batch_size=512 -# ѵʹõnpu +# 训练使用的npu卡数 export RANK_SIZE=1 -# ݼ·,Ϊ,Ҫ޸ +# 数据集路径,保持为空,不需要修改 data_path="" -# ѵepoch 90 +# 训练epoch 1 train_epochs=1 -# ݽ +# 加载数据进程数 workers=64 device_id=0 -# У飬data_pathΪشɾģ˴ж岢ֵ +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 for para in $* do if [[ $para == --data_path* ]];then @@ -34,14 +34,14 @@ do fi done -# УǷdata_path,Ҫ޸ +# 校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" exit 1 fi -###############ָѵűִ·############### -# cdtestļͬ㼶Ŀ¼ִнű߼ԣtest_path_dirΪtestļе· +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 cur_path=`pwd` cur_path_last_dirname=${cur_path##*/} if [ x"${cur_path_last_dirname}" == x"test" ];then @@ -52,10 +52,10 @@ else test_path_dir=${cur_path}/test fi -# УǷָdevice_id,ֶ̬device_idֶָdevice_id,˴Ҫ޸ +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 if [ $ASCEND_DEVICE_ID ];then echo "device id is ${ASCEND_DEVICE_ID}" - # ƽ̨ݼ· + # 平台运行软链数据集路径 elif [ ${device_id} ];then export ASCEND_DEVICE_ID=${device_id} echo "device id is ${ASCEND_DEVICE_ID}" @@ -64,7 +64,7 @@ else exit 1 fi -#################־Ŀ¼Ҫ޸################# +#################创建日志输出目录,不需要修改################# if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID @@ -73,10 +73,10 @@ else fi -#################ѵű################# -# ѵʼʱ䣬Ҫ޸ +#################启动训练脚本################# +# 训练开始时间,不需要修改 start_time=$(date +%s) -# ƽ̨ʱsource +# 非平台场景时source 环境变量 check_etp_flag=`env | grep etp_running_flag` etp_flag=`echo ${check_etp_flag#*=}` if [ x"${etp_flag}" != x"true" ];then @@ -97,43 +97,41 @@ nohup python3 main.py \ wait -##################ȡѵ################ -# ѵʱ䣬Ҫ޸ +##################获取训练数据################ +# 训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) -# ӡҪ޸ +# 结果打印,不需要修改 echo "------------------ Final result ------------------" -# FPSҪģ޸ -step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $1}' | tail -n 100 | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` +# 输出性能FPS,需要模型审视修改 +step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $2}' |awk -F " " '{print $3}' | awk -F ")" '{print $1}' | tail -n 1` FPS=`echo "${batch_size} / ${step_time}"|bc` -# ӡҪ޸ +# 打印,不需要修改 echo "Final Performance images/sec : $FPS" -CompileTime=`grep step_time ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| head -2 |awk -F "step_time = " '{print $2}' | awk '{sum+=$1} END {print"",sum}' |sed s/[[:space:]]//g` - -# ѵ,Ҫģ޸ +# 输出训练精度,需要模型审视修改 train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1 " '{print $NF}'|awk -F " " '{print $1}'` -# ӡҪ޸ +# 打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" -# ѵϢҪ޸ +# 训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' -# +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' +# 吞吐量 ActualFPS=${FPS} -# ѵʱ +# 单迭代训练时长 TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` -# train_$ASCEND_DEVICE_ID.logȡLosstrain_${CaseName}_loss.txtУҪģ +# 从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss " '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt -# һlossֵҪ޸ +# 最后一个迭代loss值,不需要修改 ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` -# ؼϢӡ${CaseName}.logУҪ޸ +# 关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log @@ -143,5 +141,4 @@ echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${ echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CompileTime = ${CompileTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_8p.sh index 5244b07e10..8c81e7fa18 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_8p.sh @@ -1,4 +1,4 @@ -#!/bin/bash + #!/bin/bash ################基础配置参数,需要模型审视修改################## # 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE @@ -130,7 +130,7 @@ e2e_time=$(( $end_time - $start_time )) # 结果打印,不需要修改 echo "------------------ Final result ------------------" # 输出性能FPS,需要模型审视修改 -step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $1}' | tail -n 20 | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` +step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $2}' |awk -F " " '{print $3}' | awk -F ")" '{print $1}' | tail -n 1` FPS=`echo "${batch_size} / ${step_time}"|bc` # 打印,不需要修改 echo "Final Performance images/sec : $FPS" -- Gitee From 7895fae41bd728183c3b2b3468cfbc27439a36a9 Mon Sep 17 00:00:00 2001 From: chenhao388 Date: Sat, 10 Aug 2024 16:49:17 +0800 Subject: [PATCH 2/4] =?UTF-8?q?[contrib][Pytorch][Resnet50=5FID4149]=20?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=80=A7=E8=83=BD=E7=BB=9F=E8=AE=A1=E6=96=B9?= =?UTF-8?q?=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh index 04dc3e04cf..a5136d0047 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh @@ -106,6 +106,7 @@ e2e_time=$(( $end_time - $start_time )) echo "------------------ Final result ------------------" # 输出性能FPS,需要模型审视修改 step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $2}' |awk -F " " '{print $3}' | awk -F ")" '{print $1}' | tail -n 1` + FPS=`echo "${batch_size} / ${step_time}"|bc` # 打印,不需要修改 echo "Final Performance images/sec : $FPS" -- Gitee From 2d4ec98d45def4b304e09b34371c8e5795150638 Mon Sep 17 00:00:00 2001 From: chenhao388 Date: Sat, 10 Aug 2024 16:52:04 +0800 Subject: [PATCH 3/4] =?UTF-8?q?[contrib][Pytorch][Resnet50=5FID4149]=20?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=80=A7=E8=83=BD=E7=BB=9F=E8=AE=A1=E6=96=B9?= =?UTF-8?q?=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh index a5136d0047..197c5daef7 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh @@ -142,4 +142,4 @@ echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${ echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee From e7f2f80a8ad684528da9990d4f3102d47ebab303 Mon Sep 17 00:00:00 2001 From: chenhao388 Date: Sat, 10 Aug 2024 16:53:47 +0800 Subject: [PATCH 4/4] =?UTF-8?q?[contrib][Pytorch][Resnet50=5FID4149]=20?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=80=A7=E8=83=BD=E7=BB=9F=E8=AE=A1=E6=96=B9?= =?UTF-8?q?=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh index 197c5daef7..236e3f5593 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh @@ -106,7 +106,6 @@ e2e_time=$(( $end_time - $start_time )) echo "------------------ Final result ------------------" # 输出性能FPS,需要模型审视修改 step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $2}' |awk -F " " '{print $3}' | awk -F ")" '{print $1}' | tail -n 1` - FPS=`echo "${batch_size} / ${step_time}"|bc` # 打印,不需要修改 echo "Final Performance images/sec : $FPS" -- Gitee