From 860166bb0497d1337c58cb90218164a723e0b333 Mon Sep 17 00:00:00 2001 From: FangManLin <2387517593@qq.com> Date: Sat, 7 Jan 2023 09:56:24 +0800 Subject: [PATCH] add online eval --- .../ShuffleNetV1_ID1625_for_PyTorch/README.md | 6 +- .../test/eval.sh | 102 ++++++++++++++++++ .../ShuffleNetV1_ID1625_for_PyTorch/train.py | 24 ++++- 3 files changed, 126 insertions(+), 6 deletions(-) create mode 100644 PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/test/eval.sh diff --git a/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/README.md b/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/README.md index 255bacb62d..70f3c483f2 100644 --- a/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/README.md +++ b/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/README.md @@ -164,7 +164,11 @@ ShuffleNet V1是一个计算效率极高的图像分类网络,它是专门为 | NPU_1.5 | 66.45 | 3956| 8 | 240 | O2 | | NPU_1.8 | 66.30 | 14510| 8 | 240 | O2 | - +# 在线评估 + bash ./test/eval.sh --data_path=./imagenet2012/ + + [npu id: 0 ] [AVG-ACC] * Acc@1 66.136, Acc@5 86.759 + 备注:在线评估使用的是val数据集和训练评估的数据集不同,故评估精度略有差异 # 版本说明 ## 变更 diff --git a/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/test/eval.sh b/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/test/eval.sh new file mode 100644 index 0000000000..ae31ec0b39 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/test/eval.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="ShuffleNetV1_ID1625_for_PyTorch" +# 训练batch_size +batch_size=1024 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 学习率 +learning_rate=1 +# 加载数据进程数 +workers=0 +device_num=1 + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + +#################启动训练脚本################# +# 训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +nohup python3.7 -u train.py \ + --model-size 1.0x \ + --evaluate \ + --resume 8p.pth.tar \ + --batch-size ${batch_size} \ + --opt-level O2 \ + --workers ${workers} \ + --local-rank ${ASCEND_DEVICE_ID} \ + --world-size 1 \ + --device-num ${device_num} \ + --data ${data_path} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log 2>&1 & + + +wait + + +##################获取训练数据################ +# 训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +# 结果打印,不需要修改 +echo "------------------ Final result ------------------" +# 输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $7}'|awk 'END {print}'` +FPS=${FPS%,*} +# 打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +# 输出训练精度,需要模型审视修改 +eval_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk '{print $1}' | awk -F "," '{print $1}'` +# 打印,不需要修改 +echo "Final eval Accuracy : ${eval_accuracy}" +echo "E2E evaling Duration sec : $e2e_time" \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/train.py b/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/train.py index 5438f0fe85..929da98db9 100644 --- a/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/train.py +++ b/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/train.py @@ -33,6 +33,7 @@ import time import warnings import random import torch.distributed as dist +from collections import OrderedDict from network import ShuffleNetV1 from utils import accuracy, AvgrageMeter, CrossEntropyLabelSmooth, save_checkpoint, get_lastest_model, get_parameters from utils import get_pytorch_train_loader, get_pytorch_val_loader, adjust_learning_rate @@ -100,10 +101,22 @@ def get_args(): args = parser.parse_args() return args +def proc_nodes_module(checkpoint, AttrName): + new_state_dict = OrderedDict() + for k, v in checkpoint[AttrName].items(): + if (k[0:7] == "module."): + name = k[7:] + else: + name = k[0:] + new_state_dict[name] = v + return new_state_dict + + def main(): global best_acc1 args = get_args() + print("args",args) random.seed(args.seed) torch.manual_seed(args.seed) warnings.warn('You have chosen to seed training. ' @@ -143,12 +156,12 @@ def main(): print('load data successfully') # create model if args.pretrained: - print("[npu id:", args.local_rank, "]", "=> creating model") + print("[npu id:", args.local_rank, "]", "=> creating model") model = ShuffleNetV1(group=args.group, model_size=args.model_size) pretrained_dict = \ torch.load(args.pretrain_pth_path, map_location="cpu")["state_dict"] model.load_state_dict({k.replace('module.',''): v for k, v in pretrained_dict.items()}) - model.load_state_dict(pretrained_dict, strict=False) + model.load_state_dict(pretrained_dict, strict=False) else: print("[npu id:", args.local_rank, "]", "=> creating model") model = ShuffleNetV1(group=args.group, model_size=args.model_size) @@ -173,10 +186,12 @@ def main(): checkpoint = torch.load(args.resume, map_location=device) args.start_epoch = checkpoint['epoch'] best_acc1 = checkpoint['best_acc1'] + checkpoint['state_dict'] = proc_nodes_module(checkpoint, 'state_dict') model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) if args.amp: amp.load_state_dict(checkpoint['amp']) + print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch'])) model_loaded = True else: @@ -185,7 +200,7 @@ def main(): if not model_loaded: print('Error: please load the model you want to evaluate with --resume !') return - validate(model, device, args, epoch=epoch) + validate(model, device, args, epoch=args.start_epoch) return for epoch in range(args.start_epoch, args.epochs): @@ -316,5 +331,4 @@ def load_checkpoint(net, checkpoint): net.load_state_dict(temp, strict=True) if __name__ == "__main__": - main() - + main() \ No newline at end of file -- Gitee