diff --git a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/README.md b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/README.md index c96358198e1117e7218032573a99c8a922069ee1..19fbe569a65484a360596e97e20ab0af5c3b9ba3 100644 --- a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/README.md +++ b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/README.md @@ -4,9 +4,9 @@ ``` 1.安装环境 - 2.修改run_1p.sh字段"data"为当前磁盘的数据集路径 + 2.修改train_performance_1p.sh字段"data"为当前磁盘的数据集路径 3.修改字段device_id(单卡训练所使用的device id),为训练配置device_id,比如device_id=0 - 4.cd到run_1p.sh文件的目录,执行bash run_1p.sh单卡脚本, 进行单卡训练 + 4.cd到train_performance_1p.sh文件的目录(也可直接在模型目录下),执行bash train_performance_1p.sh单卡脚本, 进行单卡训练 ``` @@ -15,8 +15,8 @@ ``` 1.安装环境 2.修改多P脚本中字段"data"为当前磁盘的数据集路径 - 3.修改run_8p.sh字段"addr"为当前主机ip地址 - 4.cd到run_8p.sh文件的目录,执行bash run_8p.sh等多卡脚本, 进行多卡训练 + 3.修改train_performance_8p.sh字段"addr"为当前主机ip地址 + 4.cd到train_performance_8p.sh文件的目录(也可直接在模型目录下),执行bash train_performance_8p.sh等多卡脚本, 进行多卡训练 ``` @@ -37,8 +37,7 @@ 三、测试结果 训练日志路径:在训练脚本的同目录下result文件夹里,如: - - /home/Efficientnet/result/training_8p_job_20201121023601 + /home/Efficientnet/test/output/0/train_0.log diff --git a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/run_1p.sh b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/run_1p.sh deleted file mode 100644 index a7bec336168786b24f40fc44e91facf1ee8a81c1..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/run_1p.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash -source env_npu.sh - -/usr/local/Ascend/driver/tools/msnpureport -d 0 -g error - -currentDir=$(cd "$(dirname "$0")";pwd) -currtime=`date +%Y%m%d%H%M%S` -train_log_dir=${currentDir}/result/training_1p_job_${currtime} -mkdir -p ${train_log_dir} -cd ${train_log_dir} -echo "train log path is ${train_log_dir}" - -python3.7 
${currentDir}/examples/imagenet/main.py \ - --data=/data/imagenet \ - --arch=efficientnet-b0 \ - --batch-size=512 \ - --lr=0.2 \ - --momentum=0.9 \ - --epochs=100 \ - --autoaug \ - --amp \ - --pm=O1 \ - --loss_scale=32 \ - --val_feq=10 \ - --npu=0 > ${train_log_dir}/train_1p.log 2>&1 & \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/run_8p.sh b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/run_8p.sh deleted file mode 100644 index 63e06c3410e42314e4899f5f62ff7e2361440993..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/run_8p.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash -source env_npu.sh - -/usr/local/Ascend/driver/tools/msnpureport -d 0 -g error -/usr/local/Ascend/driver/tools/msnpureport -d 4 -g error - -currentDir=$(cd "$(dirname "$0")";pwd) -currtime=`date +%Y%m%d%H%M%S` -train_log_dir=${currentDir}/result/training_8p_job_${currtime} -mkdir -p ${train_log_dir} -cd ${train_log_dir} -echo "train log path is ${train_log_dir}" - -python3.7 ${currentDir}/examples/imagenet/main.py \ - --data=/data/imagenet \ - --arch=efficientnet-b0 \ - --batch-size=4096 \ - --lr=1.6 \ - --momentum=0.9 \ - --epochs=100 \ - --autoaug \ - --amp \ - --pm=O1 \ - --loss_scale=32 \ - --val_feq=10 \ - --addr=$(hostname -I |awk '{print $1}') \ - --dist-backend=hccl \ - --multiprocessing-distributed \ - --world-size 1 \ - --rank 0 \ - --device_list '0,1,2,3,4,5,6,7' > ${train_log_dir}/train_8p.log 2>&1 & \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_full_1p.sh b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_full_1p.sh index dedbfdfdd51df59c0c1b37b67b62744c6e9e2e1a..a6b3f11d1cb35db6267f9fd031fcdf02ef78b1c7 100644 --- a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_full_1p.sh +++ 
b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_full_1p.sh @@ -2,9 +2,10 @@ #当前路径,不需要修改 cur_path=`pwd` +# 指定训练所使用的npu device卡id +device_id=0 #集合通信参数,不需要修改 - export RANK_SIZE=1 export JOB_ID=10087 RANK_ID_START=0 @@ -55,41 +56,42 @@ if [[ $1 == --help || $1 == -h ]];then exit 1 fi -#参数校验,不需要修改 -for para in $* -do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - fi -done - #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" exit 1 fi +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + echo "[Error] device id must be config" + exit 1 +fi + +#进入训练脚本目录,需要模型审视修改 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi #训练开始时间,不需要修改 start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/.. 
for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do #设置环境变量,不需要修改 @@ -99,16 +101,16 @@ do #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt fi #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path - python3 ${cur_path}/../examples/imagenet/main.py \ + nohup python3.7 ${cur_path}/examples/imagenet/main.py \ --data=${data_path} \ --arch=efficientnet-b1 \ --batch-size=${batch_size} \ @@ -120,7 +122,7 @@ do --pm=O1 \ --loss_scale=32 \ --val_feq=10 \ - --npu=$ASCEND_DEVICE_ID > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + --npu=$ASCEND_DEVICE_ID > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done wait @@ -131,12 +133,12 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -FPS=`grep -a 'FPS' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|awk 'END {print}'` +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|awk 'END {print}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -train_accuracy=`grep -a '* Acc@1' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +train_accuracy=`grep -a '* Acc@1' 
${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" @@ -154,19 +156,18 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep Epoch: $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainAcuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "Network = ${Network}" > 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_full_8p.sh index 78ca564ede1e925d64e8685a5a44e9788f07929e..f4701d634b72b825ac2653f9edd15df77d47134d 100644 --- a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_full_8p.sh @@ -98,9 +98,14 @@ fi ##################启动训练脚本################## #训练开始时间,不需要修改 start_time=$(date +%s) -# source 环境变量 -#source ${test_path_dir}/env.sh -python3 ${test_path_dir}/../examples/imagenet/main.py \ +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +nohup python3.7 ${test_path_dir}/../examples/imagenet/main.py \ --data=${data_path} \ --arch=efficientnet-b1 \ --batch-size=${batch_size} \ diff --git a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_performance_1p.sh 
b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_performance_1p.sh index 3257adb89edc6031833920a62d535d78095a9a7a..9934573e464b8185c8e4f95d55fa57743450a07d 100644 --- a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_performance_1p.sh @@ -2,9 +2,10 @@ #source env_npu.sh #当前路径,不需要修改 cur_path=`pwd` +# 指定训练所使用的npu device卡id +device_id=0 #集合通信参数,不需要修改 - export RANK_SIZE=1 export JOB_ID=10087 RANK_ID_START=0 @@ -84,12 +85,37 @@ if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" exit 1 fi +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + echo "[Error] device id must be config" + exit 1 +fi + +#进入训练脚本目录,需要模型审视修改 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi #训练开始时间,不需要修改 start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/.. 
for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do #设置环境变量,不需要修改 @@ -99,16 +125,16 @@ do #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt fi #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path - python3 ${cur_path}/../examples/imagenet/main.py \ + nohup python3.7 ${cur_path}/examples/imagenet/main.py \ --data=${data_path} \ --arch=efficientnet-b1 \ --batch-size=${batch_size} \ @@ -121,7 +147,7 @@ do --loss_scale=32 \ --val_feq=10 \ --stop-step-num=128 \ - --npu=$ASCEND_DEVICE_ID > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + --npu=$ASCEND_DEVICE_ID > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done wait @@ -132,12 +158,12 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -FPS=`grep -a 'FPS' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|awk 'END {print}'` +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|awk 'END {print}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -train_accuracy=`grep -a '* Acc@1' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +train_accuracy=`grep -a '* Acc@1' 
${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" @@ -155,18 +181,18 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep Epoch: $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_performance_8p.sh index 84cdda3c2dcf8db15ce40f15e8db258d21ca9412..b3ec488f679245914d4ea1fc81025e213ced601a 100644 --- a/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_performance_8p.sh +++ b/PyTorch/built-in/cv/classification/EfficientNet-B1_ID1713_for_PyTorch/test/train_performance_8p.sh @@ -106,9 +106,14 @@ fi ##################启动训练脚本################## #训练开始时间,不需要修改 start_time=$(date +%s) -# source 环境变量 -# source ${test_path_dir}/env.sh -python3 ${test_path_dir}/../examples/imagenet/main.py \ +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +nohup python3.7 ${test_path_dir}/../examples/imagenet/main.py \ --data=${data_path} \ --arch=efficientnet-b1 \ --batch-size=${batch_size} \ @@ -175,6 +180,7 @@ echo "BatchSize = ${BatchSize}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${Cas echo "DeviceType = ${DeviceType}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = 
${CaseName}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualFPS = ${ActualFPS}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainingTime = ${TrainingTime}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "E2ETrainingTime = ${e2e_time}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/README.md b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/README.md index 291e2e2c31a0f5103ac0427d3785efe047185af6..1de5116b9a4a6f26341c803f81393baa06b93d15 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/README.md +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/README.md @@ -13,9 +13,9 @@ ``` 1.安装环境 - 2.修改run_1p.sh字段"data"为当前磁盘的数据集路径 - 3.修改字段device_id(单卡训练所使用的device id),为训练配置device_id,比如device_id=0 - 4.cd到run_1p.sh文件的目录,执行bash run_1p.sh单卡脚本, 进行单卡训练 + 2.修改train_performance_1p.sh字段"data"为当前磁盘的数据集路径; + 3.修改字段device_id(单卡训练所使用的device id),为训练配置device_id,比如device_id=0; + 4.执行bash train_performance_1p.sh单卡脚本, 进行单卡训练; ``` @@ -25,7 +25,7 @@ 1.安装环境 2.修改多P脚本中字段"data"为当前磁盘的数据集路径 3.修改字段device_id_list(多卡训练所使用的device id列表),为训练配置device_id,比如4p,device_id_list=0,1,2,3;8P默认使用0,1,2,3,4,5,6,7卡不用配置 - 4.cd到run_8p.sh文件的目录,执行bash run_8p.sh等多卡脚本, 进行多卡训练 + 4.执行bash train_performance_8p.sh等多卡脚本, 进行多卡训练; ``` @@ -48,6 +48,6 @@ 训练日志路径:在训练脚本的同目录下result文件夹里,如: - /home/ResNet50/result/training_8p_job_20201121023601 + /home/ResNet50/test/output/device_id/training_8p_job_20201121023601 diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_1p.sh deleted file mode 100644 index e542152b60bd22d2866bf227dd9d9bd56fc0051f..0000000000000000000000000000000000000000 --- 
a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_1p.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash -source env_npu.sh -export WHICH_OP=GEOP -export NEW_GE_FE_ID=1 -export GE_AICPU_FLAG=1 - -device_id=0 - -/usr/local/Ascend/driver/tools/msnpureport -d 0 -g error - -currentDir=$(cd "$(dirname "$0")";pwd) -currtime=`date +%Y%m%d%H%M%S` -train_log_dir=${currentDir}/result/training_1p_job_${currtime} -mkdir -p ${train_log_dir} -cd ${train_log_dir} -echo "train log path is ${train_log_dir}" - -python3.7 ${currentDir}/pytorch_resnet50_apex.py \ - --data /data/imagenet \ - --npu ${device_id} \ - -j64 \ - -b512 \ - --lr 0.2 \ - --warmup 5 \ - --label-smoothing=0.1 \ - --epochs 90 \ - --num_classes=1000 \ - --optimizer-batch-size 512 > ./resnet50_1p.log 2>&1 & - - - diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_2p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_2p.sh deleted file mode 100644 index 047849d5f8bae663a472f792fbfab0529146c647..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_2p.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env bash -source env_npu.sh -export WHICH_OP=GEOP -export NEW_GE_FE_ID=1 -export GE_AICPU_FLAG=1 - -ip=$(hostname -I |awk '{print $1}') -device_id_list=0,1 - -/usr/local/Ascend/driver/tools/msnpureport -d 0 -g error - -currentDir=$(cd "$(dirname "$0")";pwd) -currtime=`date +%Y%m%d%H%M%S` -train_log_dir=${currentDir}/result/training_2p_job_${currtime} -mkdir -p ${train_log_dir} -cd ${train_log_dir} -echo "train log path is ${train_log_dir}" - -python3.7 ${currentDir}/DistributedResnet50/main_apex_d76_npu.py \ - --data /data/imagenet \ - --addr=$(hostname -I |awk '{print $1}') \ - --seed=49 \ - --workers=128 \ - --learning-rate=0.4 \ - --warmup=8 \ - --label-smoothing=0.1 \ - --mom=0.9 \ - --weight-decay=1.0e-04 \ - --static-loss-scale=128 \ - --print-freq=1 \ - --dist-url='tcp://127.0.0.1:50000' \ - --dist-backend='hccl' \ - 
--multiprocessing-distributed \ - --world-size=1 \ - --rank=0 \ - --device-list=${device_id_list} \ - --benchmark=0 \ - --device='npu' \ - --epochs=90 \ - --num_classes=1000 \ - --batch-size=1024 > ./resnet50_2p.log 2>&1 & - - - diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_4p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_4p.sh deleted file mode 100644 index 2b29adfe64f837bdef5d2eb30268331429559496..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_4p.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env bash -source env_npu.sh -export WHICH_OP=GEOP -export NEW_GE_FE_ID=1 -export GE_AICPU_FLAG=1 - -ip=$(hostname -I |awk '{print $1}') -device_id_list=0,1,2,3 - -/usr/local/Ascend/driver/tools/msnpureport -d 0 -g error - -currentDir=$(cd "$(dirname "$0")";pwd) -currtime=`date +%Y%m%d%H%M%S` -train_log_dir=${currentDir}/result/training_4p_job_${currtime} -mkdir -p ${train_log_dir} -cd ${train_log_dir} -echo "train log path is ${train_log_dir}" - -python3.7 ${currentDir}/DistributedResnet50/main_apex_d76_npu.py \ - --data /data/imagenet \ - --addr=$(hostname -I |awk '{print $1}') \ - --seed=49 \ - --workers=128 \ - --learning-rate=0.8 \ - --warmup=8 \ - --label-smoothing=0.1 \ - --mom=0.9 \ - --weight-decay=1.0e-04 \ - --static-loss-scale=128 \ - --print-freq=1 \ - --dist-url='tcp://127.0.0.1:50000' \ - --dist-backend='hccl' \ - --multiprocessing-distributed \ - --world-size=1 \ - --rank=0 \ - --device-list=${device_id_list} \ - --benchmark=0 \ - --device='npu' \ - --epochs=90 \ - --num_classes=1000 \ - --batch-size=2048 > ./resnet50_4p.log 2>&1 & - - diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_8p.sh deleted file mode 100644 index e3b0a5b523d702b2cca1c209c9c9f3a370edfabe..0000000000000000000000000000000000000000 --- 
a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/run_8p.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash -source env_npu.sh -export WHICH_OP=GEOP -export NEW_GE_FE_ID=1 -export GE_AICPU_FLAG=1 - -/usr/local/Ascend/driver/tools/msnpureport -d 0 -g error -/usr/local/Ascend/driver/tools/msnpureport -d 4 -g error - -ip=$(hostname -I |awk '{print $1}') -currentDir=$(cd "$(dirname "$0")";pwd) -currtime=`date +%Y%m%d%H%M%S` -train_log_dir=${currentDir}/result/training_8p_job_${currtime} -mkdir -p ${train_log_dir} -cd ${train_log_dir} -echo "train log path is ${train_log_dir}" - -python3.7 ${currentDir}/DistributedResnet50/main_apex_d76_npu.py \ - --data /data/imagenet \ - --addr=$(hostname -I |awk '{print $1}') \ - --seed=49 \ - --workers=128 \ - --learning-rate=1.6 \ - --warmup=8 \ - --label-smoothing=0.1 \ - --mom=0.9 \ - --weight-decay=1.0e-04 \ - --static-loss-scale=128 \ - --print-freq=1 \ - --dist-url='tcp://127.0.0.1:50000' \ - --dist-backend='hccl' \ - --multiprocessing-distributed \ - --world-size=1 \ - --rank=0 \ - --benchmark=0 \ - --device='npu' \ - --epochs=90 \ - --num_classes=1000 \ - --batch-size=4096 > ./resnet50_8p.log 2>&1 & - - diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_ResNet50_performance_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_ResNet50_performance_8p.sh index 0013d695905b43a64bd7a73c8d9c5f9e341b726f..f8d134d5bd661605471cd18e201da81946310f68 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_ResNet50_performance_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_ResNet50_performance_8p.sh @@ -64,7 +64,7 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ +nohup python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ --data ${data_path} \ --addr=$(hostname -I |awk '{print $1}') \ --seed=49 \ diff --git 
a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_performance_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_performance_1p.sh index f2f584cd464fb0c93a5c9026a6bbd8b082b811f6..d116b996ad3eaf05c2b79509cd598e4bde2249b9 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_performance_1p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_performance_1p.sh @@ -81,7 +81,7 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -python3.7 ./pytorch_resnet50_apex.py \ +nohup python3.7 ./pytorch_resnet50_apex.py \ --data ${data_path} \ --npu ${ASCEND_DEVICE_ID} \ -j ${workers} \ diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_eval_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_eval_1p.sh index e89e5332b6319b738260062cfad5b593b3f39baa..ea11306d364cc5df6e5ad7b6aa5cd497192c84d7 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_eval_1p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_eval_1p.sh @@ -79,7 +79,7 @@ etp_flag=`echo ${check_etp_flag#*=}` if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -python3.7 ./pytorch_resnet50_apex.py \ +nohup python3.7 ./pytorch_resnet50_apex.py \ --data ${data_path} \ --npu ${ASCEND_DEVICE_ID} \ -j ${workers} \ diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_16p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_16p.sh index dccd7239c221c6db39b53b7bf477014255ac8cb7..e355a2471f863f1efefaa4d44655a9c703bef532 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_16p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_16p.sh @@ -96,7 +96,7 @@ fi export NODE_RANK=${server_index} export NPU_WORLD_SIZE=`awk 'BEGIN{printf 
"%.0f\n",8*'${linux_num}'}'` -python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ +nohup python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ --data ${data_path} \ --addr=$one_node_ip \ --seed=49 \ diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_1p.sh index 37fd0fd4b8c3f01b3406578561c597fde64c190b..e754979b9f338598f1081b9a222fa32ab9c20f35 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_1p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_1p.sh @@ -77,7 +77,7 @@ etp_flag=`echo ${check_etp_flag#*=}` if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -python3.7 ./pytorch_resnet50_apex.py \ +nohup python3.7 ./pytorch_resnet50_apex.py \ --data ${data_path} \ --npu ${ASCEND_DEVICE_ID} \ -j ${workers} \ diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh index 1cc50890bdcaae00df4ea9639d7918e79052203d..490324f5e56ebbc9a39bd465a06a91b6392c66d1 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh @@ -79,7 +79,7 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ +nohup python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ --data ${data_path} \ --addr=$(hostname -I |awk '{print $1}') \ --seed=49 \ diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh index cea303357750e5a27b69159a76004a1e38830883..d047e2847ebbd21127963122a1130fcd0d430ff9 100644 --- 
a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh @@ -81,7 +81,7 @@ fi export NODE_RANK=${server_index} export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'` -python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ +nohup python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ --data ${data_path} \ --addr=$one_node_ip \ --seed=49 \ diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_1p.sh index 96226ecf321f592c0a6be9f4a02b38046c9eb37f..7b20d0af0572ef7e419c2d83de340f9e58c85b1f 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_1p.sh @@ -79,7 +79,7 @@ etp_flag=`echo ${check_etp_flag#*=}` if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -python3.7 ./pytorch_resnet50_apex.py \ +nohup python3.7 ./pytorch_resnet50_apex.py \ --data ${data_path} \ --npu ${ASCEND_DEVICE_ID} \ -j ${workers} \ diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh index 84c999e576af2628baaab29af6e759d7cedf555a..d7969e07f3f022ba0df2aa141018767dfcedc53b 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh @@ -65,7 +65,7 @@ fi export NODE_RANK=0 -python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ +nohup python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ --data ${data_path} \ --addr=$(hostname -I |awk '{print $1}') \ --seed=49 \