diff --git a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_16p.sh b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_16p.sh index 7cf2b07c3235f8216094dd362441b46eea734aa1..e78001b025701f36d760552bc1e7fd54868c94f8 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_16p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_16p.sh @@ -130,7 +130,7 @@ e2e_time=$(( $end_time - $start_time )) # 结果打印,不需要修改 echo "------------------ Final result ------------------" # 输出性能FPS,需要模型审视修改 -step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $1}' | tail -n 20 | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` +step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $2}' |awk -F " " '{print $3}' | awk -F ")" '{print $1}' | tail -n 1` FPS=`echo "${batch_size} / ${step_time}"|bc` # 打印,不需要修改 echo "Final Performance images/sec : $FPS" diff --git a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh index ecbdb30c869f37de9ccf5bd8963057c07afa799e..236e3f55930cde45141c23b2317e18708030258f 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh @@ -1,22 +1,22 @@ #!/bin/bash -################òҪģ޸################## -# ѡֶ(ڴ˴IJ): Network batch_size RANK_SIZE -# ƣͬĿ¼ +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 Network="ResNet50_ID4149_for_PyTorch" -# ѵbatch_size +# 训练batch_size batch_size=512 -# ѵʹõnpu +# 训练使用的npu卡数 export RANK_SIZE=1 -# ݼ·,Ϊ,Ҫ޸ +# 数据集路径,保持为空,不需要修改 data_path="" -# ѵepoch 90 +# 训练epoch 1 train_epochs=1 -# ݽ +# 加载数据进程数 workers=64 device_id=0 -# У飬data_pathΪشɾģ˴ж岢ֵ +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 for para in $* do if [[ $para == --data_path* ]];then @@ -34,14 +34,14 @@ do fi done -# УǷdata_path,Ҫ޸ +# 校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" exit 1 fi -###############ָѵűִ·############### -# cdtestļͬ㼶Ŀ¼ִнű߼ԣtest_path_dirΪtestļе· +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 cur_path=`pwd` cur_path_last_dirname=${cur_path##*/} if [ x"${cur_path_last_dirname}" == x"test" ];then @@ -52,10 +52,10 @@ else test_path_dir=${cur_path}/test fi -# УǷָdevice_id,ֶ̬device_idֶָdevice_id,˴Ҫ޸ +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 if [ $ASCEND_DEVICE_ID ];then echo "device id is ${ASCEND_DEVICE_ID}" - # ƽ̨ݼ· + # 平台运行软链数据集路径 elif [ ${device_id} ];then export ASCEND_DEVICE_ID=${device_id} echo "device id is ${ASCEND_DEVICE_ID}" @@ -64,7 +64,7 @@ else exit 1 fi -#################־Ŀ¼Ҫ޸################# +#################创建日志输出目录,不需要修改################# if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID @@ -73,10 +73,10 @@ else fi -#################ѵű################# -# ѵʼʱ䣬Ҫ޸ +#################启动训练脚本################# +# 训练开始时间,不需要修改 start_time=$(date +%s) -# ƽ̨ʱsource +# 非平台场景时source 环境变量 check_etp_flag=`env | grep etp_running_flag` etp_flag=`echo ${check_etp_flag#*=}` if [ x"${etp_flag}" != x"true" ];then @@ -97,43 +97,41 @@ nohup python3 main.py \ wait -##################ȡѵ################ -# ѵʱ䣬Ҫ޸ +##################获取训练数据################ +# 训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) -# ӡҪ޸ +# 结果打印,不需要修改 echo "------------------ Final result ------------------" -# FPSҪģ޸ -step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $1}' | tail -n 100 | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` +# 输出性能FPS,需要模型审视修改 +step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $2}' |awk -F " " '{print $3}' | awk -F ")" '{print $1}' | tail -n 1` FPS=`echo "${batch_size} / ${step_time}"|bc` -# ӡҪ޸ +# 打印,不需要修改 echo "Final Performance images/sec : $FPS" -CompileTime=`grep step_time ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| head -2 |awk -F "step_time = " '{print $2}' | awk '{sum+=$1} END {print"",sum}' |sed s/[[:space:]]//g` - -# ѵ,Ҫģ޸ +# 输出训练精度,需要模型审视修改 train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1 " '{print $NF}'|awk -F " " '{print $1}'` -# ӡҪ޸ +# 打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" -# ѵϢҪ޸ +# 训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' -# +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' +# 吞吐量 ActualFPS=${FPS} -# ѵʱ +# 单迭代训练时长 TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` -# train_$ASCEND_DEVICE_ID.logȡLosstrain_${CaseName}_loss.txtУҪģ +# 从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss " '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt -# һlossֵҪ޸ +# 最后一个迭代loss值,不需要修改 ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` -# ؼϢӡ${CaseName}.logУҪ޸ +# 关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log @@ -144,4 +142,3 @@ echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CompileTime = ${CompileTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_8p.sh index 5244b07e1047c9c54494653ef55c006771ad6b70..8c81e7fa18b34c98a09db99e373498231233f88d 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_8p.sh @@ -1,4 +1,4 @@ -#!/bin/bash + #!/bin/bash ################基础配置参数,需要模型审视修改################## # 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE @@ -130,7 +130,7 @@ e2e_time=$(( $end_time - $start_time )) # 结果打印,不需要修改 echo "------------------ Final result ------------------" # 输出性能FPS,需要模型审视修改 -step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $1}' | tail -n 20 | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` +step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $2}' |awk -F " " '{print $3}' | awk -F ")" '{print $1}' | tail -n 1` FPS=`echo "${batch_size} / ${step_time}"|bc` # 打印,不需要修改 echo "Final Performance images/sec : $FPS"