From 860166bb0497d1337c58cb90218164a723e0b333 Mon Sep 17 00:00:00 2001
From: FangManLin <2387517593@qq.com>
Date: Sat, 7 Jan 2023 09:56:24 +0800
Subject: [PATCH] add online eval

---
 .../ShuffleNetV1_ID1625_for_PyTorch/README.md |   6 +-
 .../test/eval.sh                              | 102 ++++++++++++++++++
 .../ShuffleNetV1_ID1625_for_PyTorch/train.py  |  24 ++++-
 3 files changed, 126 insertions(+), 6 deletions(-)
 create mode 100644 PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/test/eval.sh

diff --git a/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/README.md b/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/README.md
index 255bacb62d..70f3c483f2 100644
--- a/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/README.md
+++ b/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/README.md
@@ -164,7 +164,11 @@ ShuffleNet V1是一个计算效率极高的图像分类网络，它是专门为
 | NPU_1.5 | 66.45 |  3956|        8 |    240 |       O2 |
 | NPU_1.8 | 66.30 | 14510|        8 |    240 |       O2 |
 
-
+# 在线评估
+    bash ./test/eval.sh  --data_path=./imagenet2012/
+    
+    [npu id: 0 ] [AVG-ACC] * Acc@1 66.136, Acc@5 86.759
+    备注：在线评估使用的是val数据集，与训练评估使用的数据集不同，故评估精度略有差异
 # 版本说明
 
 ## 变更
diff --git a/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/test/eval.sh b/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/test/eval.sh
new file mode 100644
index 0000000000..ae31ec0b39
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/test/eval.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+################ Basic configuration; review and adjust per model ##################
+# Required fields (must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="ShuffleNetV1_ID1625_for_PyTorch"
+# Batch size passed to train.py via --batch-size
+batch_size=1024
+# Number of NPU cards (exported as RANK_SIZE)
+export RANK_SIZE=1
+# Dataset path; leave empty here, it is supplied on the command line via --data_path
+data_path=""
+
+# Learning rate (defined for template compatibility; not passed to train.py below)
+learning_rate=1
+# Number of data-loading worker processes
+workers=0
+device_num=1
+
+# Argument parsing: data_path is mandatory; "$@" keeps arguments containing spaces intact
+for para in "$@"
+do
+    if [[ $para == --workers* ]];then
+        workers=${para#*=}
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    fi
+done
+
+# Abort early when data_path was not supplied
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+
+############### Resolve the directory the script runs from ###############
+# Run from the directory that contains test/; test_path_dir is the path of the test directory
+cur_path=$(pwd)
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=$(pwd)
+else
+    test_path_dir=${cur_path}/test
+fi
+
+
+################# Create the log output directory #################
+ASCEND_DEVICE_ID=0
+# Start from a clean per-device output directory (rm -rf is a no-op when it does not exist)
+output_dir="${test_path_dir}/output/${ASCEND_DEVICE_ID}"
+rm -rf "${output_dir}"
+mkdir -p "${output_dir}"
+# stdout and stderr of the evaluation run are collected in this file
+log_file="${output_dir}/eval_${ASCEND_DEVICE_ID}.log"
+
+################# Launch the evaluation #################
+# Start time, used for the end-to-end duration
+start_time=$(date +%s)
+# Source the NPU environment script unless etp_running_flag is set to true
+check_etp_flag=$(env | grep etp_running_flag)
+etp_flag=${check_etp_flag#*=}
+if [ x"${etp_flag}" != x"true" ];then
+    source "${test_path_dir}/env_npu.sh"
+fi
+
+nohup python3.7 -u train.py \
+    --model-size 1.0x \
+    --evaluate \
+    --resume 8p.pth.tar \
+    --batch-size "${batch_size}" \
+    --opt-level O2 \
+    --workers "${workers}" \
+    --local-rank "${ASCEND_DEVICE_ID}" \
+    --world-size 1 \
+    --device-num "${device_num}" \
+    --data "${data_path}" > "${log_file}" 2>&1 &
+
+
+wait
+
+
+################## Collect results ################
+# End time and end-to-end duration in seconds
+end_time=$(date +%s)
+e2e_time=$(( end_time - start_time ))
+
+# Print the result summary
+echo "------------------ Final result ------------------"
+# Throughput: 7th whitespace-separated field of the last log line containing 'FPS'
+FPS=$(grep -a 'FPS' "${log_file}" | awk -F " " '{print $7}' | awk 'END {print}')
+FPS=${FPS%,*}
+# Print throughput
+echo "Final Performance images/sec : $FPS"
+
+# Accuracy: first value after "Acc@1" on the last '* Acc@1' log line, trailing comma removed
+eval_accuracy=$(grep -a '* Acc@1' "${log_file}" | awk 'END {print}' | awk -F "Acc@1" '{print $NF}' | awk '{print $1}' | awk -F "," '{print $1}')
+# Print accuracy and duration
+echo "Final eval Accuracy : ${eval_accuracy}"
+echo "E2E evaling Duration sec : $e2e_time"
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/train.py b/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/train.py
index 5438f0fe85..929da98db9 100644
--- a/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/train.py
+++ b/PyTorch/contrib/cv/classification/ShuffleNetV1_ID1625_for_PyTorch/train.py
@@ -33,6 +33,7 @@ import time
 import warnings
 import random
 import torch.distributed as dist
+from collections import OrderedDict
 from network import ShuffleNetV1
 from utils import accuracy, AvgrageMeter, CrossEntropyLabelSmooth, save_checkpoint, get_lastest_model, get_parameters
 from utils import get_pytorch_train_loader, get_pytorch_val_loader, adjust_learning_rate
@@ -100,10 +101,22 @@ def get_args():
     args = parser.parse_args()
     return args
 
+def proc_nodes_module(checkpoint, AttrName):
+    """Return checkpoint[AttrName] as an OrderedDict with a leading "module." removed from keys."""
+    prefix = "module."
+    cleaned = OrderedDict()
+    for key, value in checkpoint[AttrName].items():
+        # Keys that do not start with the prefix are kept as they are.
+        target = key[len(prefix):] if key.startswith(prefix) else key
+        cleaned[target] = value
+    return cleaned
+
+
 
 def main():
     global best_acc1
     args = get_args()
+    print("args",args)
     random.seed(args.seed)
     torch.manual_seed(args.seed)
     warnings.warn('You have chosen to seed training. '
@@ -143,12 +156,12 @@ def main():
     print('load data successfully')
     # create model
     if args.pretrained:
-        print("[npu id:", args.local_rank, "]", "=> creating model")       
+        print("[npu id:", args.local_rank, "]", "=> creating model")
         model = ShuffleNetV1(group=args.group, model_size=args.model_size)
         pretrained_dict = \
         torch.load(args.pretrain_pth_path, map_location="cpu")["state_dict"]
         model.load_state_dict({k.replace('module.',''): v for k, v in pretrained_dict.items()})
-        model.load_state_dict(pretrained_dict, strict=False)        
+        model.load_state_dict(pretrained_dict, strict=False)
     else:
         print("[npu id:", args.local_rank, "]", "=> creating model")
         model = ShuffleNetV1(group=args.group, model_size=args.model_size)
@@ -173,10 +186,12 @@ def main():
             checkpoint = torch.load(args.resume, map_location=device)
             args.start_epoch = checkpoint['epoch']
             best_acc1 = checkpoint['best_acc1']
+            checkpoint['state_dict'] = proc_nodes_module(checkpoint, 'state_dict')
             model.load_state_dict(checkpoint['state_dict'])
             optimizer.load_state_dict(checkpoint['optimizer'])
             if args.amp:
                 amp.load_state_dict(checkpoint['amp'])
+
             print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
             model_loaded = True
         else:
@@ -185,7 +200,7 @@ def main():
         if not model_loaded:
             print('Error: please load the model you want to evaluate with --resume !')
             return
-        validate(model, device, args, epoch=epoch)
+        validate(model, device, args, epoch=args.start_epoch)
         return
 
     for epoch in range(args.start_epoch, args.epochs):
@@ -316,5 +331,4 @@ def load_checkpoint(net, checkpoint):
     net.load_state_dict(temp, strict=True)
 
 if __name__ == "__main__":
-    main()
-
+    main()
\ No newline at end of file
-- 
Gitee