diff --git a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/main.py b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/main.py index c7bdf20029af5e256afcd2230ba5b28d9e587086..301ce9d5d9b901304eee1ee9f698cd9fe16864f5 100644 --- a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/main.py +++ b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/main.py @@ -15,6 +15,8 @@ import os import json import torch +if torch.__version__.startswith('1.8'): + import torch_npu from torch import nn from torch import optim from torch.optim import lr_scheduler @@ -112,11 +114,12 @@ if __name__ == '__main__': if opt.use_apex == 1: from apex import amp - if opt.loss_scale < 0: - model, optimizer = amp.initialize(model, optimizer, opt_level=opt.opt_level, loss_scale=None) - else: - # model, optimizer = amp.initialize(model, optimizer, opt_level=opt.opt_level, loss_scale=opt.loss_scale, combine_grad=True) - model, optimizer = amp.initialize(model, optimizer, opt_level=opt.opt_level, loss_scale=opt.loss_scale) + #if opt.loss_scale < 0: + # model, optimizer = amp.initialize(model, optimizer, opt_level=opt.opt_level, loss_scale=None) + #else: + # # model, optimizer = amp.initialize(model, optimizer, opt_level=opt.opt_level, loss_scale=opt.loss_scale, combine_grad=True) + # model, optimizer = amp.initialize(model, optimizer, opt_level=opt.opt_level, loss_scale=opt.loss_scale) + model,optimizer = amp.initialize(model, optimizer, opt_level=opt.opt_level,loss_scale=opt.loss_scale) # resume model best_pre1 = 0 diff --git a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/requirements.txt b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/requirements.txt index 39b92c7dba6bef1d4fcd9ac9db9d29ceb07a0ebe..105f194e33136c1327f2b32520e907ead279fbf5 100644 --- a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/requirements.txt +++ b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/requirements.txt @@ -1,4 +1,3 @@ -Python==3.7.0 -torch==1.5.0 +numpy>=1.17.3 torchvision==0.6.0 opencv-python==4.5.3.56 diff --git a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/getmodel.py b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/getmodel.py index 2e6b18ed23a9a1e855193f0f677cbac63791d18e..6a4db1bd282ad538a9cb75ef4e4cc48a166f377e 100644 --- a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/getmodel.py +++ b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/getmodel.py @@ -15,6 +15,9 @@ import os import json import torch +if torch.__version__.startswith('1.8'): + import torch_npu + from torch import nn from torch import optim from torch.optim import lr_scheduler diff --git a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/opts.py b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/opts.py index e4583c32e613ba471d873a4d9971d3e5455f19c6..ceb6e6d09d71160b65ce30358611c7d69ec17041 100644 --- a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/opts.py +++ b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/opts.py @@ -41,7 +41,7 @@ def parse_opts(): parser.add_argument('--use_prof', default=1, type=int, help='use_prof') parser.add_argument('--use_apex', default=0, type=int, help='use_apex') parser.add_argument('--opt_level', default='O2', type=str, help='Initial opt_level') - parser.add_argument('--loss_scale', default=128, type=float, help='Initial loss_scale') + parser.add_argument('--loss_scale', default="dynamic", help='Initial loss_scale') parser.add_argument('--batch_size', default=8, type=int, help='Batch Size') parser.add_argument('--n_epochs', default=2, type=int, help='Number of total epochs to run') diff --git a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/spatial_transforms.py b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/spatial_transforms.py index bc532e2093be2256191bdd0f0c4f44e744b24f01..5bb23484f41da97b38581a60b443df2d55f18f30 100644 --- a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/spatial_transforms.py +++ b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/spatial_transforms.py @@ -18,6 +18,9 @@ import numbers import collections import numpy as np import torch +if torch.__version__.startswith('1.8'): + import torch_npu + import cv2 import scipy.ndimage from PIL import Image, ImageOps diff --git a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/test.py b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/test.py index 4a1abb3f5dc4ca7ed0f7e0c24e89213a6afca50c..b780dd1557c77607d4a9c8b05ab37476b42d15e6 100644 --- a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/test.py +++ b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/test.py @@ -19,6 +19,8 @@ import time import os import sys import json +if torch.__version__.startswith('1.8'): + import torch_npu from run.utils import AverageMeter diff --git a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/train.py b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/train.py index 312ff580551ef382c2e41618d2d0153b2eba4405..14225a0476736fc1b013e370bf4f422e3966e746 100644 --- a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/train.py +++ b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/train.py @@ -17,6 +17,8 @@ from torch.autograd import Variable import time import os import sys +if torch.__version__.startswith('1.8'): + import torch_npu from run.utils import AverageMeter, calculate_accuracy @@ -126,7 +128,7 @@ def train_epoch(epoch, data_loader, model, criterion, optimizer, opt, if i >= 2: tot_time.update(current_batch_time) - if device_ids == 0: # distributed master or 1p + if (device_ids == 0) or (opt.device_num == 1): # distributed master or 1p batch_logger.log({ 'date': time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())), 'epoch': epoch, @@ -159,7 +161,7 @@ def train_epoch(epoch, data_loader, model, criterion, optimizer, opt, epoch_fps = opt.batch_size * opt.device_num / tot_time.avg - if device_ids == 0: # distributed master or 1p + if (device_ids == 0)or (opt.device_num == 1): # distributed master or 1p epochlog = { 'date': time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())), 'epoch': epoch, diff --git a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/utils.py b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/utils.py index 70357d2254782432d3a9d3c66068c79aa3f85442..40c1216f99eae37b70cf4cceeae3ae556eaf378f 100644 --- a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/utils.py +++ b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/utils.py @@ -18,6 +18,9 @@ import shutil import numpy as np import json import os +if torch.__version__.startswith('1.8'): + import torch_npu + from run.mean import get_mean, get_std class AverageMeter(object): diff --git a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/validation.py b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/validation.py index 61827e741592b277f61c32a2b470f61d67d3a41e..795e555bc17be55fdcdd5ae288efa1033c6eef22 100644 --- a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/validation.py +++ b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/run/validation.py @@ -13,6 +13,9 @@ # limitations under the License. # ============================================================================ import torch +if torch.__version__.startswith('1.8'): + import torch_npu + from torch.autograd import Variable import time import sys diff --git a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/test/npu_train_full_1p.sh b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/test/npu_train_full_1p.sh index 17b06a389225a6b971d4768dd692b2a5063ff232..8e898916bf1d4f6671b1cb38e6915230b9933b5d 100644 --- a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/test/npu_train_full_1p.sh +++ b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/test/npu_train_full_1p.sh @@ -1,6 +1,31 @@ #!/usr/bin/env bash + +source env_npu.sh + +# 网络名称,同目录名称 +Network="Efficient-3DCNNs" +# 训练batch_size +batch_size=80 + root_path=$1 +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${root_path}/results/${ASCEND_DEVICE_ID} ];then + rm -rf ${root_path}/results/${ASCEND_DEVICE_ID} + mkdir -p ${root_path}/results/$ASCEND_DEVICE_ID +else + mkdir -p ${root_path}/results/$ASCEND_DEVICE_ID +fi + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) + + +KERNEL_NUM=$(($(nproc)/8)) +PID_START=$((KERNEL_NUM * RANK_ID)) +PID_END=$((PID_START + KERNEL_NUM - 1)) nohup python3.7 ../main.py \ --root_path ${root_path} \ --gpu_or_npu npu \ @@ -16,7 +41,64 @@ nohup python3.7 ../main.py \ --batch_size 80 \ --n_threads 16 \ --ft_portion complete \ - > ${root_path}/results/npu_train_full_1p.log 2>&1 & + > ${root_path}/results/${ASCEND_DEVICE_ID}/npu_train_full_1p.log 2>&1 & + + + + +wait + +##################获取训练数据################ +# 训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +# 结果打印,不需要修改 +echo "------------------ Final result ------------------" +# 输出性能FPS,需要模型审视修改 +FPS=`grep 'fps' ${root_path}/results/${ASCEND_DEVICE_ID}/npu_train_full_1p.log| tail -n 1 |awk '{print $6}'|awk 'END {print}'` +FPS=${FPS%,*} +# 打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +# 输出训练精度,需要模型审视修改 +top1_acc=`grep 'test top1 acc' ${root_path}/results/${ASCEND_DEVICE_ID}/npu_train_full_1p.log|awk '{print $4}'|awk 'END {print}'` +top1_acc=`echo ${top1_acc%,*}` +# 打印,不需要修改 +echo "Final Train Accuracy : ${top1_acc}" +echo "E2E Training Duration sec : $e2e_time" + +# 性能看护结果汇总 +# 训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +# 获取性能数据,不需要修改 +# 吞吐量 +ActualFPS=${FPS} +# 单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +# 从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep 'date' ${root_path}/results/${ASCEND_DEVICE_ID}/npu_train_full_1p.log|awk '{print $8}' >> ${root_path}/results/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt +# 删除loss值后逗号 +sed -i 's/,//g' ${root_path}/results/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt + +# 最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${root_path}/results/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` + +# 关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${top1_acc}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/test/npu_train_full_8p.sh b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/test/npu_train_full_8p.sh index cc344cf943cfce8c6a0a58ae25fbc5ec581eee8c..c658476d0cc182790e193206a3f4710207896131 100644 --- a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/test/npu_train_full_8p.sh +++ b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/test/npu_train_full_8p.sh @@ -1,8 +1,30 @@ #!/usr/bin/env bash -RANK_ID_START=0 + +source env_npu.sh + +# 网络名称,同目录名称 +Network="Efficient-3DCNNs" +# 训练batch_size +batch_size=640 + RANK_SIZE=8 root_path=$1 +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${root_path}/results/${ASCEND_DEVICE_ID} ];then + rm -rf ${root_path}/results/${ASCEND_DEVICE_ID} + mkdir -p ${root_path}/results/$ASCEND_DEVICE_ID +else + mkdir -p ${root_path}/results/$ASCEND_DEVICE_ID +fi + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) + +RANK_ID_START=0 + for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do @@ -25,5 +47,61 @@ nohup taskset -c $PID_START-$PID_END python3.7 ../main.py --local_rank $RANK_ID --batch_size 640 \ --n_threads 64 \ --ft_portion complete \ - > ${root_path}/results/npu_train_full_8p.log 2>&1 & -done \ No newline at end of file + > ${root_path}/results/${ASCEND_DEVICE_ID}/npu_train_full_8p.log 2>&1 & +done + + + +wait + +##################获取训练数据################ +# 训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +# 结果打印,不需要修改 +echo "------------------ Final result ------------------" +# 输出性能FPS,需要模型审视修改 +FPS=`grep 'fps' ${root_path}/results/${ASCEND_DEVICE_ID}/npu_train_full_8p.log| tail -n 1 |awk '{print $6}'|awk 'END {print}'` +FPS=${FPS%,*} +# 打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +# 输出训练精度,需要模型审视修改 +top1_acc=`grep 'test top1 acc' ${root_path}/results/${ASCEND_DEVICE_ID}/npu_train_full_8p.log|awk '{print $4}'|awk 'END {print}'` +top1_acc=`echo ${top1_acc%,*}` +# 打印,不需要修改 +echo "Final Train Accuracy : ${top1_acc}" +echo "E2E Training Duration sec : $e2e_time" + +# 性能看护结果汇总 +# 训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +# 获取性能数据,不需要修改 +# 吞吐量 +ActualFPS=${FPS} +# 单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +# 从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep 'date' ${root_path}/results/${ASCEND_DEVICE_ID}/npu_train_full_8p.log|awk '{print $8}' >> ${root_path}/results/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt +# 删除loss值后逗号 +sed -i 's/,//g' ${root_path}/results/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt + +# 最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${root_path}/results/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` + +# 关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${top1_acc}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/test/npu_train_performance_1p.sh b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/test/npu_train_performance_1p.sh index 49773c1153c233ad3bbbf5839abadb11d62a7bc1..881ce6da08c8ada7c00da95c028c27060fd8bd30 100644 --- a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/test/npu_train_performance_1p.sh +++ b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/test/npu_train_performance_1p.sh @@ -1,6 +1,31 @@ #!/usr/bin/env bash + +source env_npu.sh + +# 网络名称,同目录名称 +Network="Efficient-3DCNNs" +# 训练batch_size +batch_size=80 + + root_path=$1 +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${root_path}/results/${ASCEND_DEVICE_ID} ];then + rm -rf ${root_path}/results/${ASCEND_DEVICE_ID} + mkdir -p ${root_path}/results/$ASCEND_DEVICE_ID +else + mkdir -p ${root_path}/results/$ASCEND_DEVICE_ID +fi + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) + + + + nohup python3.7 ../main.py \ --root_path ${root_path} \ --gpu_or_npu npu \ @@ -16,7 +41,59 @@ nohup python3.7 ../main.py \ --batch_size 80 \ --n_threads 16 \ --ft_portion complete \ - > ${root_path}/results/npu_train_performance_1p.log 2>&1 & + > ${root_path}/results/${ASCEND_DEVICE_ID}/npu_train_performance_1p.log 2>&1 & + + +wait + +##################获取训练数据################ +# 训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +# 结果打印,不需要修改 +echo "------------------ Final result ------------------" +# 输出性能FPS,需要模型审视修改 +FPS=`grep 'fps' ${root_path}/results/${ASCEND_DEVICE_ID}/npu_train_performance_1p.log| tail -n 1 |awk '{print $6}'|awk 'END {print}'` +FPS=${FPS%,*} +# 打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +# 输出训练精度,需要模型审视修改 +top1_acc=`grep 'test top1 acc' ${root_path}/results/${ASCEND_DEVICE_ID}/npu_train_performance_1p.log|awk '{print $4}'|awk 'END {print}'` +top1_acc=`echo ${top1_acc%,*}` +# 打印,不需要修改 +echo "Final Train Accuracy : ${top1_acc}" +echo "E2E Training Duration sec : $e2e_time" + +# 性能看护结果汇总 +# 训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +# 获取性能数据,不需要修改 +# 吞吐量 +ActualFPS=${FPS} +# 单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` +# 从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep 'date' ${root_path}/results/${ASCEND_DEVICE_ID}/npu_train_performance_1p.log|awk '{print $8}' >> ${root_path}/results/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt +# 删除loss值后逗号 +sed -i 's/,//g' ${root_path}/results/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt +# 最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${root_path}/results/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` +# 关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${top1_acc}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/test/npu_train_performance_8p.sh b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/test/npu_train_performance_8p.sh index ad27280e186819eedf1dd9a58383e62a9ba1062c..5df7b95cdee07150895b30c9601309f7d5937653 100644 --- a/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/test/npu_train_performance_8p.sh +++ b/PyTorch/contrib/cv/classification/Efficient-3DCNNs_ID1230_for_PyTorch/test/npu_train_performance_8p.sh @@ -1,8 +1,30 @@ #!/usr/bin/env bash -RANK_ID_START=0 + +source env_npu.sh + +# 网络名称,同目录名称 +Network="Efficient-3DCNNs" +# 训练batch_size +batch_size=640 + RANK_SIZE=8 root_path=$1 +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${root_path}/results/${ASCEND_DEVICE_ID} ];then + rm -rf ${root_path}/results/${ASCEND_DEVICE_ID} + mkdir -p ${root_path}/results/$ASCEND_DEVICE_ID +else + mkdir -p ${root_path}/results/$ASCEND_DEVICE_ID +fi + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) + +RANK_ID_START=0 + for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do @@ -22,8 +44,62 @@ nohup taskset -c $PID_START-$PID_END python3.7 ../main.py --local_rank $RANK_ID --learning_rate 0.04 \ --droupout_rate 0.2 \ --n_epochs 2 \ - --batch_size 640 \ + --batch_size ${batch_size} \ --n_threads 64 \ --ft_portion complete \ - > ${root_path}/results/npu_train_performance_8p.log 2>&1 & -done \ No newline at end of file + > ${root_path}/results/${ASCEND_DEVICE_ID}/npu_train_performance_8p.log 2>&1 & +done + +wait + +##################获取训练数据################ +# 训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +# 结果打印,不需要修改 +echo "------------------ Final result ------------------" +# 输出性能FPS,需要模型审视修改 +FPS=`grep 'fps' ${root_path}/results/${ASCEND_DEVICE_ID}/npu_train_performance_8p.log| tail -n 1 |awk '{print $6}'|awk 'END {print}'` +FPS=${FPS%,*} +# 打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +# 输出训练精度,需要模型审视修改 +top1_acc=`grep 'test top1 acc' ${root_path}/results/${ASCEND_DEVICE_ID}/npu_train_performance_8p.log|awk '{print $4}'|awk 'END {print}'` +top1_acc=`echo ${top1_acc%,*}` +# 打印,不需要修改 +echo "Final Train Accuracy : ${top1_acc}" +echo "E2E Training Duration sec : $e2e_time" + +# 性能看护结果汇总 +# 训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +# 获取性能数据,不需要修改 +# 吞吐量 +ActualFPS=${FPS} +# 单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +# 从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep 'date' ${root_path}/results/${ASCEND_DEVICE_ID}/npu_train_performance_8p.log|awk '{print $8}' >> ${root_path}/results/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt +# 删除loss值后逗号 +sed -i 's/,//g' ${root_path}/results/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt + +# 最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${root_path}/results/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` + +# 关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${top1_acc}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${root_path}/results/$ASCEND_DEVICE_ID/${CaseName}.log