From 07c1e78bd06acb776d2dcd6fe2219635e8649650 Mon Sep 17 00:00:00 2001 From: Zn Date: Tue, 10 May 2022 15:03:27 +0800 Subject: [PATCH 1/2] =?UTF-8?q?[=E8=87=AA=E7=A0=94][PyTorch]EfficientNet-B?= =?UTF-8?q?1=5FID1713=E6=A8=A1=E5=9E=8B=E8=AE=AD=E7=BB=83=E5=90=AF?= =?UTF-8?q?=E5=8A=A8=E8=84=9A=E6=9C=AC=E5=8F=AA=E7=95=99test=E4=B8=8Bshell?= =?UTF-8?q?=E8=84=9A=E6=9C=AC=EF=BC=8C=20=E5=85=B6=E4=BD=99=E5=88=A0?= =?UTF-8?q?=E9=99=A4=EF=BC=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Zn --- .../README.md | 11 +-- .../run_1p.sh | 25 ----- .../run_8p.sh | 31 ------ .../test/train_full_1p.sh | 95 ++++++++++--------- .../test/train_full_8p.sh | 11 ++- .../test/train_performance_1p.sh | 70 +++++++++----- .../test/train_performance_8p.sh | 12 ++- 7 files changed, 118 insertions(+), 137 deletions(-) delete mode 100644 PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/run_1p.sh delete mode 100644 PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/run_8p.sh diff --git a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/README.md b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/README.md index c96358198e..19fbe569a6 100644 --- a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/README.md +++ b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/README.md @@ -4,9 +4,9 @@ ``` 1.安装环境 - 2.修改run_1p.sh字段"data"为当前磁盘的数据集路径 + 2.修改train_performance_1p.sh字段"data"为当前磁盘的数据集路径 3.修改字段device_id(单卡训练所使用的device id),为训练配置device_id,比如device_id=0 - 4.cd到run_1p.sh文件的目录,执行bash run_1p.sh单卡脚本, 进行单卡训练 + 4.cd到train_performance_1p.sh文件的目录(也可直接在模型目录下),执行bash train_performance_1p.sh单卡脚本, 进行单卡训练 ``` @@ -15,8 +15,8 @@ ``` 1.安装环境 2.修改多P脚本中字段"data"为当前磁盘的数据集路径 - 3.修改run_8p.sh字段"addr"为当前主机ip地址 - 4.cd到run_8p.sh文件的目录,执行bash run_8p.sh等多卡脚本, 进行多卡训练 + 3.修改train_performance_8p.sh字段"addr"为当前主机ip地址 + 4.cd到train_performance_8p.sh文件的目录(也可直接在模型目录下),执行bash train_performance_8p.sh等多卡脚本, 进行多卡训练 ``` @@ -37,8 +37,7 @@ 三、测试结果 训练日志路径:在训练脚本的同目录下result文件夹里,如: - - /home/Efficientnet/result/training_8p_job_20201121023601 + /home/Efficientnet/test/output/0/train_0.log diff --git a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/run_1p.sh b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/run_1p.sh deleted file mode 100644 index a7bec33616..0000000000 --- a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/run_1p.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash -source env_npu.sh - -/usr/local/Ascend/driver/tools/msnpureport -d 0 -g error - -currentDir=$(cd "$(dirname "$0")";pwd) -currtime=`date +%Y%m%d%H%M%S` -train_log_dir=${currentDir}/result/training_1p_job_${currtime} -mkdir -p ${train_log_dir} -cd ${train_log_dir} -echo "train log path is ${train_log_dir}" - -python3.7 ${currentDir}/examples/imagenet/main.py \ - --data=/data/imagenet \ - --arch=efficientnet-b0 \ - --batch-size=512 \ - --lr=0.2 \ - --momentum=0.9 \ - --epochs=100 \ - --autoaug \ - --amp \ - --pm=O1 \ - --loss_scale=32 \ - --val_feq=10 \ - --npu=0 > ${train_log_dir}/train_1p.log 2>&1 & \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/run_8p.sh b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/run_8p.sh deleted file mode 100644 index 63e06c3410..0000000000 --- a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/run_8p.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash -source env_npu.sh - -/usr/local/Ascend/driver/tools/msnpureport -d 0 -g error -/usr/local/Ascend/driver/tools/msnpureport -d 4 -g error - -currentDir=$(cd "$(dirname "$0")";pwd) -currtime=`date +%Y%m%d%H%M%S` -train_log_dir=${currentDir}/result/training_8p_job_${currtime} -mkdir -p ${train_log_dir} -cd ${train_log_dir} -echo "train log path is ${train_log_dir}" - -python3.7 ${currentDir}/examples/imagenet/main.py \ - --data=/data/imagenet \ - --arch=efficientnet-b0 \ - --batch-size=4096 \ - --lr=1.6 \ - --momentum=0.9 \ - --epochs=100 \ - --autoaug \ - --amp \ - --pm=O1 \ - --loss_scale=32 \ - --val_feq=10 \ - --addr=$(hostname -I |awk '{print $1}') \ - --dist-backend=hccl \ - --multiprocessing-distributed \ - --world-size 1 \ - --rank 0 \ - --device_list '0,1,2,3,4,5,6,7' > ${train_log_dir}/train_8p.log 2>&1 & \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_full_1p.sh b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_full_1p.sh index dedbfdfdd5..a6b3f11d1c 100644 --- a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_full_1p.sh +++ b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_full_1p.sh @@ -2,9 +2,10 @@ #当前路径,不需要修改 cur_path=`pwd` +# 指定训练所使用的npu device卡id +device_id=0 #集合通信参数,不需要修改 - export RANK_SIZE=1 export JOB_ID=10087 RANK_ID_START=0 @@ -55,41 +56,42 @@ if [[ $1 == --help || $1 == -h ]];then exit 1 fi -#参数校验,不需要修改 -for para in $* -do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - fi -done - #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" exit 1 fi +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + +#进入训练脚本目录,需要模型审视修改 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi #训练开始时间,不需要修改 start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/.. for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do #设置环境变量,不需要修改 @@ -99,16 +101,16 @@ do #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt fi #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path - python3 ${cur_path}/../examples/imagenet/main.py \ + nohup python3.7 ${cur_path}/examples/imagenet/main.py \ --data=${data_path} \ --arch=efficientnet-b1 \ --batch-size=${batch_size} \ @@ -120,7 +122,7 @@ do --pm=O1 \ --loss_scale=32 \ --val_feq=10 \ - --npu=$ASCEND_DEVICE_ID > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + --npu=$ASCEND_DEVICE_ID > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done wait @@ -131,12 +133,12 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -FPS=`grep -a 'FPS' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|awk 'END {print}'` +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|awk 'END {print}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -train_accuracy=`grep -a '* Acc@1' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" @@ -154,19 +156,18 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep Epoch: $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainAcuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_full_8p.sh index 78ca564ede..f4701d634b 100644 --- a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_full_8p.sh @@ -98,9 +98,14 @@ fi ##################启动训练脚本################## #训练开始时间,不需要修改 start_time=$(date +%s) -# source 环境变量 -#source ${test_path_dir}/env.sh -python3 ${test_path_dir}/../examples/imagenet/main.py \ +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +nohup python3.7 ${test_path_dir}/examples/imagenet/main.py \ --data=${data_path} \ --arch=efficientnet-b1 \ --batch-size=${batch_size} \ diff --git a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_performance_1p.sh index 3257adb89e..9934573e46 100644 --- a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_performance_1p.sh @@ -2,9 +2,10 @@ #source env_npu.sh #当前路径,不需要修改 cur_path=`pwd` +# 指定训练所使用的npu device卡id +device_id=0 #集合通信参数,不需要修改 - export RANK_SIZE=1 export JOB_ID=10087 RANK_ID_START=0 @@ -84,12 +85,37 @@ if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" exit 1 fi +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + +#进入训练脚本目录,需要模型审视修改 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi #训练开始时间,不需要修改 start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/.. for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do #设置环境变量,不需要修改 @@ -99,16 +125,16 @@ do #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt fi #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path - python3 ${cur_path}/../examples/imagenet/main.py \ + nohup python3.7 ${cur_path}/examples/imagenet/main.py \ --data=${data_path} \ --arch=efficientnet-b1 \ --batch-size=${batch_size} \ @@ -121,7 +147,7 @@ do --loss_scale=32 \ --val_feq=10 \ --stop-step-num=128 \ - --npu=$ASCEND_DEVICE_ID > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + --npu=$ASCEND_DEVICE_ID > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done wait @@ -132,12 +158,12 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -FPS=`grep -a 'FPS' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|awk 'END {print}'` +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|awk 'END {print}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -train_accuracy=`grep -a '* Acc@1' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" @@ -155,18 +181,18 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep Epoch: $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_performance_8p.sh index 84cdda3c2d..b3ec488f67 100644 --- a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_performance_8p.sh +++ b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_performance_8p.sh @@ -106,9 +106,14 @@ fi ##################启动训练脚本################## #训练开始时间,不需要修改 start_time=$(date +%s) -# source 环境变量 -# source ${test_path_dir}/env.sh -python3 ${test_path_dir}/../examples/imagenet/main.py \ +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +nohup python3.7 ${test_path_dir}/examples/imagenet/main.py \ --data=${data_path} \ --arch=efficientnet-b1 \ --batch-size=${batch_size} \ @@ -175,6 +180,7 @@ echo "BatchSize = ${BatchSize}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${Cas echo "DeviceType = ${DeviceType}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualFPS = ${ActualFPS}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainingTime = ${TrainingTime}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "E2ETrainingTime = ${e2e_time}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee From 9bb993bac48d9f992031974dd0629fa9254a6826 Mon Sep 17 00:00:00 2001 From: Zn Date: Thu, 12 May 2022 14:36:54 +0800 Subject: [PATCH 2/2] =?UTF-8?q?[=E8=87=AA=E7=A0=94][PyTorch]ResNet50=5Ffor?= =?UTF-8?q?=5FPyTorch=E6=A8=A1=E5=9E=8B=E8=AE=AD=E7=BB=83=E5=90=AF?= =?UTF-8?q?=E5=8A=A8=E8=84=9A=E6=9C=AC=E5=8F=AA=E7=95=99test=E4=B8=8Bshell?= =?UTF-8?q?=E8=84=9A=E6=9C=AC=EF=BC=8C=20=E5=85=B6=E4=BD=99=E5=88=A0?= =?UTF-8?q?=E9=99=A4=EF=BC=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Zn --- .../ResNet50_for_PyTorch/README.md | 10 ++--- .../ResNet50_for_PyTorch/run_1p.sh | 31 ------------- .../ResNet50_for_PyTorch/run_2p.sh | 44 ------------------- .../ResNet50_for_PyTorch/run_4p.sh | 43 ------------------ .../ResNet50_for_PyTorch/run_8p.sh | 41 ----------------- .../train_ID3071_ResNet50_performance_8p.sh | 2 +- .../test/train_ID3071_performance_1p.sh | 2 +- .../test/train_eval_1p.sh | 2 +- .../test/train_full_16p.sh | 2 +- .../test/train_full_1p.sh | 2 +- .../test/train_full_8p.sh | 2 +- .../test/train_performance_16p.sh | 2 +- .../test/train_performance_1p.sh | 2 +- .../test/train_performance_8p.sh | 2 +- 14 files changed, 14 insertions(+), 173 deletions(-) delete mode 100644 PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_1p.sh delete mode 100644 PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_2p.sh delete mode 100644 PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_4p.sh delete mode 100644 PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_8p.sh diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/README.md b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/README.md index 291e2e2c31..1de5116b9a 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/README.md +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/README.md @@ -13,9 +13,9 @@ ``` 1.安装环境 - 2.修改run_1p.sh字段"data"为当前磁盘的数据集路径 - 3.修改字段device_id(单卡训练所使用的device id),为训练配置device_id,比如device_id=0 - 4.cd到run_1p.sh文件的目录,执行bash run_1p.sh单卡脚本, 进行单卡训练 + 2.修改train_performance_1p.sh字段"data"为当前磁盘的数据集路径; + 3.修改字段device_id(单卡训练所使用的device id),为训练配置device_id,比如device_id=0; + 4.执行bash train_performance_1p.sh单卡脚本, 进行单卡训练; ``` @@ -25,7 +25,7 @@ 1.安装环境 2.修改多P脚本中字段"data"为当前磁盘的数据集路径 3.修改字段device_id_list(多卡训练所使用的device id列表),为训练配置device_id,比如4p,device_id_list=0,1,2,3;8P默认使用0,1,2,3,4,5,6,7卡不用配置 - 4.cd到run_8p.sh文件的目录,执行bash run_8p.sh等多卡脚本, 进行多卡训练 + 4.执行bash train_performance_8p.sh等多卡脚本, 进行多卡训练; ``` @@ -48,6 +48,6 @@ 训练日志路径:在训练脚本的同目录下result文件夹里,如: - /home/ResNet50/result/training_8p_job_20201121023601 + /home/ResNet50/test/output/device_id/training_8p_job_20201121023601 diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_1p.sh deleted file mode 100644 index e542152b60..0000000000 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_1p.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash -source env_npu.sh -export WHICH_OP=GEOP -export NEW_GE_FE_ID=1 -export GE_AICPU_FLAG=1 - -device_id=0 - -/usr/local/Ascend/driver/tools/msnpureport -d 0 -g error - -currentDir=$(cd "$(dirname "$0")";pwd) -currtime=`date +%Y%m%d%H%M%S` -train_log_dir=${currentDir}/result/training_1p_job_${currtime} -mkdir -p ${train_log_dir} -cd ${train_log_dir} -echo "train log path is ${train_log_dir}" - -python3.7 ${currentDir}/pytorch_resnet50_apex.py \ - --data /data/imagenet \ - --npu ${device_id} \ - -j64 \ - -b512 \ - --lr 0.2 \ - --warmup 5 \ - --label-smoothing=0.1 \ - --epochs 90 \ - --num_classes=1000 \ - --optimizer-batch-size 512 > ./resnet50_1p.log 2>&1 & - - - diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_2p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_2p.sh deleted file mode 100644 index 047849d5f8..0000000000 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_2p.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env bash -source env_npu.sh -export WHICH_OP=GEOP -export NEW_GE_FE_ID=1 -export GE_AICPU_FLAG=1 - -ip=$(hostname -I |awk '{print $1}') -device_id_list=0,1 - -/usr/local/Ascend/driver/tools/msnpureport -d 0 -g error - -currentDir=$(cd "$(dirname "$0")";pwd) -currtime=`date +%Y%m%d%H%M%S` -train_log_dir=${currentDir}/result/training_2p_job_${currtime} -mkdir -p ${train_log_dir} -cd ${train_log_dir} -echo "train log path is ${train_log_dir}" - -python3.7 ${currentDir}/DistributedResnet50/main_apex_d76_npu.py \ - --data /data/imagenet \ - --addr=$(hostname -I |awk '{print $1}') \ - --seed=49 \ - --workers=128 \ - --learning-rate=0.4 \ - --warmup=8 \ - --label-smoothing=0.1 \ - --mom=0.9 \ - --weight-decay=1.0e-04 \ - --static-loss-scale=128 \ - --print-freq=1 \ - --dist-url='tcp://127.0.0.1:50000' \ - --dist-backend='hccl' \ - --multiprocessing-distributed \ - --world-size=1 \ - --rank=0 \ - --device-list=${device_id_list} \ - --benchmark=0 \ - --device='npu' \ - --epochs=90 \ - --num_classes=1000 \ - --batch-size=1024 > ./resnet50_2p.log 2>&1 & - - - diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_4p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_4p.sh deleted file mode 100644 index 2b29adfe64..0000000000 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_4p.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env bash -source env_npu.sh -export WHICH_OP=GEOP -export NEW_GE_FE_ID=1 -export GE_AICPU_FLAG=1 - -ip=$(hostname -I |awk '{print $1}') -device_id_list=0,1,2,3 - -/usr/local/Ascend/driver/tools/msnpureport -d 0 -g error - -currentDir=$(cd "$(dirname "$0")";pwd) -currtime=`date +%Y%m%d%H%M%S` -train_log_dir=${currentDir}/result/training_4p_job_${currtime} -mkdir -p ${train_log_dir} -cd ${train_log_dir} -echo "train log path is ${train_log_dir}" - -python3.7 ${currentDir}/DistributedResnet50/main_apex_d76_npu.py \ - --data /data/imagenet \ - --addr=$(hostname -I |awk '{print $1}') \ - --seed=49 \ - --workers=128 \ - --learning-rate=0.8 \ - --warmup=8 \ - --label-smoothing=0.1 \ - --mom=0.9 \ - --weight-decay=1.0e-04 \ - --static-loss-scale=128 \ - --print-freq=1 \ - --dist-url='tcp://127.0.0.1:50000' \ - --dist-backend='hccl' \ - --multiprocessing-distributed \ - --world-size=1 \ - --rank=0 \ - --device-list=${device_id_list} \ - --benchmark=0 \ - --device='npu' \ - --epochs=90 \ - --num_classes=1000 \ - --batch-size=2048 > ./resnet50_4p.log 2>&1 & - - diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_8p.sh deleted file mode 100644 index e3b0a5b523..0000000000 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_8p.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash -source env_npu.sh -export WHICH_OP=GEOP -export NEW_GE_FE_ID=1 -export GE_AICPU_FLAG=1 - -/usr/local/Ascend/driver/tools/msnpureport -d 0 -g error -/usr/local/Ascend/driver/tools/msnpureport -d 4 -g error - -ip=$(hostname -I |awk '{print $1}') -currentDir=$(cd "$(dirname "$0")";pwd) -currtime=`date +%Y%m%d%H%M%S` -train_log_dir=${currentDir}/result/training_8p_job_${currtime} -mkdir -p ${train_log_dir} -cd ${train_log_dir} -echo "train log path is ${train_log_dir}" - -python3.7 ${currentDir}/DistributedResnet50/main_apex_d76_npu.py \ - --data /data/imagenet \ - --addr=$(hostname -I |awk '{print $1}') \ - --seed=49 \ - --workers=128 \ - --learning-rate=1.6 \ - --warmup=8 \ - --label-smoothing=0.1 \ - --mom=0.9 \ - --weight-decay=1.0e-04 \ - --static-loss-scale=128 \ - --print-freq=1 \ - --dist-url='tcp://127.0.0.1:50000' \ - --dist-backend='hccl' \ - --multiprocessing-distributed \ - --world-size=1 \ - --rank=0 \ - --benchmark=0 \ - --device='npu' \ - --epochs=90 \ - --num_classes=1000 \ - --batch-size=4096 > ./resnet50_8p.log 2>&1 & - - diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_ResNet50_performance_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_ResNet50_performance_8p.sh index 0013d69590..f8d134d5bd 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_ResNet50_performance_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_ResNet50_performance_8p.sh @@ -64,7 +64,7 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ +nohup python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ --data ${data_path} \ --addr=$(hostname -I |awk '{print $1}') \ --seed=49 \ diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_performance_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_performance_1p.sh index f2f584cd46..d116b996ad 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_performance_1p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_performance_1p.sh @@ -81,7 +81,7 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -python3.7 ./pytorch_resnet50_apex.py \ +nohup python3.7 ./pytorch_resnet50_apex.py \ --data ${data_path} \ --npu ${ASCEND_DEVICE_ID} \ -j ${workers} \ diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_eval_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_eval_1p.sh index e89e5332b6..ea11306d36 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_eval_1p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_eval_1p.sh @@ -79,7 +79,7 @@ etp_flag=`echo ${check_etp_flag#*=}` if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -python3.7 ./pytorch_resnet50_apex.py \ +nohup python3.7 ./pytorch_resnet50_apex.py \ --data ${data_path} \ --npu ${ASCEND_DEVICE_ID} \ -j ${workers} \ diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_16p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_16p.sh index dccd7239c2..e355a2471f 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_16p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_16p.sh @@ -96,7 +96,7 @@ fi export NODE_RANK=${server_index} export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'` -python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ +nohup python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ --data ${data_path} \ --addr=$one_node_ip \ --seed=49 \ diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_1p.sh index 37fd0fd4b8..e754979b9f 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_1p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_1p.sh @@ -77,7 +77,7 @@ etp_flag=`echo ${check_etp_flag#*=}` if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -python3.7 ./pytorch_resnet50_apex.py \ +nohup python3.7 ./pytorch_resnet50_apex.py \ --data ${data_path} \ --npu ${ASCEND_DEVICE_ID} \ -j ${workers} \ diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh index 1cc50890bd..490324f5e5 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh @@ -79,7 +79,7 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ +nohup python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ --data ${data_path} \ --addr=$(hostname -I |awk '{print $1}') \ --seed=49 \ diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh index cea3033577..d047e2847e 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh @@ -81,7 +81,7 @@ fi export NODE_RANK=${server_index} export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'` -python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ +nohup python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ --data ${data_path} \ --addr=$one_node_ip \ --seed=49 \ diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_1p.sh index 96226ecf32..7b20d0af05 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_1p.sh @@ -79,7 +79,7 @@ etp_flag=`echo ${check_etp_flag#*=}` if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -python3.7 ./pytorch_resnet50_apex.py \ +nohup python3.7 ./pytorch_resnet50_apex.py \ --data ${data_path} \ --npu ${ASCEND_DEVICE_ID} \ -j ${workers} \ diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh index 84c999e576..d7969e07f3 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh @@ -65,7 +65,7 @@ fi export NODE_RANK=0 -python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ +nohup python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ --data ${data_path} \ --addr=$(hostname -I |awk '{print $1}') \ --seed=49 \ -- Gitee