From 2d02f946509f4a5825d5ae780382b4a4f37f3f08 Mon Sep 17 00:00:00 2001
From: xuhan-cn <xuhan-cn@qq.com>
Date: Thu, 25 Aug 2022 18:39:56 +0800
Subject: [PATCH 1/2] =?UTF-8?q?LetNet=E5=88=9D=E6=AC=A1=E6=8F=90=E4=BA=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../contrib/cv/classification/LeNet/README.md | 157 ++++
 .../contrib/cv/classification/LeNet/main.py   | 680 ++++++++++++++++++
 .../classification/LeNet/models/__init__.py   |   0
 .../cv/classification/LeNet/models/lenet_5.py |  36 +
 .../classification/LeNet/modelzoo_level.txt   |   3 +
 .../cv/classification/LeNet/requirements.txt  |   4 +
 .../cv/classification/LeNet/test/env_npu.sh   |  76 ++
 .../LeNet/test/train_eval_8p.sh               | 132 ++++
 .../LeNet/test/train_finetune_1p.sh           | 155 ++++
 .../LeNet/test/train_full_1p.sh               | 143 ++++
 .../LeNet/test/train_full_8p.sh               | 134 ++++
 .../LeNet/test/train_performance_1p.sh        | 143 ++++
 .../LeNet/test/train_performance_8p.sh        | 136 ++++
 13 files changed, 1799 insertions(+)
 create mode 100644 PyTorch/contrib/cv/classification/LeNet/README.md
 create mode 100644 PyTorch/contrib/cv/classification/LeNet/main.py
 create mode 100644 PyTorch/contrib/cv/classification/LeNet/models/__init__.py
 create mode 100644 PyTorch/contrib/cv/classification/LeNet/models/lenet_5.py
 create mode 100644 PyTorch/contrib/cv/classification/LeNet/modelzoo_level.txt
 create mode 100644 PyTorch/contrib/cv/classification/LeNet/requirements.txt
 create mode 100644 PyTorch/contrib/cv/classification/LeNet/test/env_npu.sh
 create mode 100644 PyTorch/contrib/cv/classification/LeNet/test/train_eval_8p.sh
 create mode 100644 PyTorch/contrib/cv/classification/LeNet/test/train_finetune_1p.sh
 create mode 100644 PyTorch/contrib/cv/classification/LeNet/test/train_full_1p.sh
 create mode 100644 PyTorch/contrib/cv/classification/LeNet/test/train_full_8p.sh
 create mode 100644 PyTorch/contrib/cv/classification/LeNet/test/train_performance_1p.sh
 create mode 100644 PyTorch/contrib/cv/classification/LeNet/test/train_performance_8p.sh

diff --git a/PyTorch/contrib/cv/classification/LeNet/README.md b/PyTorch/contrib/cv/classification/LeNet/README.md
new file mode 100644
index 0000000000..fa2ee7871e
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LeNet/README.md
@@ -0,0 +1,157 @@
+# LeNet for PyTorch
+
+-   [概述](#概述)
+-   [准备训练环境](#准备训练环境)
+-   [开始训练](#开始训练)
+-   [训练结果展示](#训练结果展示)
+-   [版本说明](#版本说明)
+
+
+
+# 概述
+
+## 简述
+
+LeNet-5是一个7层的神经网络，包含3个卷积层、2个池化层和2个全连接层。所有卷积层的卷积核均为5x5，步长stride=1；池化层均为2x2平均池化，激活函数为ReLU。
+
+
+- 参考实现：
+
+  ```
+  url=https://github.com/allegrofb/LeNet.git
+  ```
+
+- 适配昇腾 AI 处理器的实现：
+
+  ```
+  url=https://gitee.com/ascend/ModelZoo-PyTorch.git
+  code_path=PyTorch/contrib/cv/classification
+  ```
+  
+- 通过Git获取代码方法如下：
+
+  ```
+  git clone https://gitee.com/xuhan-cn/ModelZoo-PyTorch.git       # 克隆仓库的代码
+  cd ./ModelZoo-PyTorch/PyTorch/contrib/cv/classification        # 切换到模型代码所在路径，若仓库下只有该模型，则无需切换
+  ```
+  
+- 通过单击“立即下载”，下载源码包。
+
+# 准备训练环境
+
+## 准备环境
+
+- 当前模型支持的固件与驱动、 CANN 以及 PyTorch 如下表所示。
+
+  **表 1**  版本配套表
+
+  | 配套       | 版本                                                         |
+  | ---------- | ------------------------------------------------------------ |
+  | 固件与驱动 | [1.0.15](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) |
+  | CANN       | [5.1.RC1](https://www.hiascend.com/software/cann/commercial?version=5.1.RC1) |
+  | PyTorch    | [1.5.0](https://gitee.com/ascend/pytorch/tree/v1.5.0/) |
+
+- 环境准备指导。
+
+  请参考《[Pytorch框架训练环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)》。
+  
+- 安装依赖（根据模型需求，按需添加所需依赖）。
+
+  ```
+  pip install -r requirements.txt
+  ```
+
+
+## 数据集
+
+LeNet_5模型选用的数据集为MNIST手写数据集，数据集将在模型训练过程中自动下载。
+
+ 
+
+## 获取预训练模型
+
+预训练模型存放在'./checkpoint'文件夹下，训练过程中每5个epoch保存一次模型。
+
+# 开始训练
+
+## 训练模型
+
+1. 进入解压后的源码包根目录。
+
+   ```
+   cd ./LeNet
+   ```
+
+2. 运行训练脚本。
+
+   该模型支持单机单卡训练和单机8卡训练。
+
+   - 单机单卡训练
+
+     启动单卡训练。
+
+     ```
+     # training 1p accuracy
+     bash ./test/train_full_1p.sh --data_path
+
+     # training 1p performance
+     bash ./test/train_performance_1p.sh --data_path
+
+     # finetuning 1p 
+     bash test/train_finetune_1p.sh --data_path --pth_path=real_pre_train_model_path
+
+     ```
+
+   - 单机8卡训练
+
+     启动8卡训练。
+
+     ```
+     # training 8p accuracy
+     bash ./test/train_full_8p.sh --data_path
+
+     # training 8p performance
+     bash ./test/train_performance_8p.sh --data_path  
+
+     #test 8p accuracy
+     bash test/train_eval_8p.sh --data_path --pth_path=real_pre_train_model_path
+     ```
+
+   - --data\_path参数后无需填写数据集路径，因为MNIST数据集是自动下载的。
+   - --pth\_path参数填写预训练模型路径。
+
+   模型训练脚本参数说明如下。
+
+   ```
+   公共参数：
+   --data_path                           //数据集路径
+   --addr                              //主机地址     
+   --epochs                            //重复训练次数，默认：80
+   --batch-size                        //训练批次大小，默认：2048
+   --lr                                //初始学习率，默认：0.012
+   --momentum                          //动量，默认：0.9
+   --weight-decay                      //权重衰减，默认：0.0001
+   --resume                            //中断重新开始模型参数路径
+   --start-epoch                       //开始训练epoch，默认：0
+   --pretrained                        //是否加载预训练模型（权重路径由 --pth_path 指定）
+   --amp                               //是否使用混合精度
+   --loss-scale                        //混合精度lossscale大小
+   --opt-level                         //混合精度类型
+   多卡训练参数：
+   --multiprocessing-distributed       //是否使用多卡训练
+   --device_list '0,1,2,3,4,5,6,7'     //多卡训练指定训练用卡
+   ```
+   
+   训练完成后，权重文件保存在'./checkpoint'路径下，并输出模型训练精度和性能信息。
+
+# 训练结果展示
+
+**表 2**  训练结果展示表
+
+
+| NAME    | Acc@1 |  FPS | Epochs | AMP_Type |
+| ------- | -----   | ----: | ------ | -------: |
+| 1p-竞品 | 99.36   |38000   | 80      |        - |
+| 1p-NPU  | 99.44   |21000   | 80      |       O2 |
+| 8p-竞品 | 99.07   |82000  | 80      |        - |
+| 8p-NPU  | 99.10   |41000   | 80      |       O2 |
diff --git a/PyTorch/contrib/cv/classification/LeNet/main.py b/PyTorch/contrib/cv/classification/LeNet/main.py
new file mode 100644
index 0000000000..22ef71ec76
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LeNet/main.py
@@ -0,0 +1,680 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import random
+import shutil
+import time
+import warnings
+import torch
+import numpy as np
+import apex
+from apex import amp
+import torch.nn as nn
+import torch.nn.parallel
+import torch.npu
+import torch.backends.cudnn as cudnn
+import torch.distributed as dist
+import torch.optim
+import torch.multiprocessing as mp
+import torch.utils.data
+import torch.utils.data.distributed
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+import torch.nn.functional as F
+from models.lenet_5 import LeNet5
+
+
+# Command-line options. Parsing happens at import time (see parse_args below).
+parser = argparse.ArgumentParser(description='PyTorch LeNet-5 MNIST Training')
+parser.add_argument('data', metavar='DIR',
+                    help='path to dataset')
+parser.add_argument('--workers', default=8, type=int, metavar='N',
+                    help='number of data loading workers (default: 8)')
+parser.add_argument('--epochs', default=80, type=int, metavar='N',
+                    help='number of total epochs to run')
+parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
+                    help='manual epoch number (useful on restarts)')
+parser.add_argument('-b', '--batch-size', default=2048, type=int, metavar='N',
+                    help='mini-batch size (default: 2048), this is the total '
+                         'batch size of all GPUs on the current node when '
+                         'using Data Parallel or Distributed Data Parallel')
+parser.add_argument('--lr', '--learning-rate', default=0.012, type=float,
+                    metavar='LR', help='initial learning rate', dest='lr')
+parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
+                    help='momentum')
+parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
+                    metavar='W', help='weight decay (default: 1e-4)',
+                    dest='weight_decay')
+parser.add_argument('-p', '--print-freq', default=5, type=int,
+                    metavar='N', help='print frequency (default: 5)')
+parser.add_argument('--resume', default='', type=str, metavar='PATH',
+                    help='path to latest checkpoint (default: none)')
+parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
+                    help='evaluate model on validation set')
+parser.add_argument('--pretrained', dest='pretrained', action='store_true',
+                    help='use pre-trained model')
+parser.add_argument('--pth_path', default='', type=str, metavar='PATH',
+                    help='path to pretrained checkpoint (default: none)')
+parser.add_argument('--world-size', default=1, type=int,
+                    help='number of nodes for distributed training')
+parser.add_argument('--rank', default=0, type=int,
+                    help='node rank for distributed training')
+parser.add_argument('--dist-url', default='tcp://127.0.0.1:30000', type=str,
+                    help='url used to set up distributed training')
+parser.add_argument('--dist-backend', default='hccl', type=str,
+                    help='distributed backend')
+parser.add_argument('--seed', default=None, type=int,
+                    help='seed for initializing training. ')
+parser.add_argument('--gpu', default=None, type=int,
+                    help='GPU id to use.')
+parser.add_argument('--multiprocessing-distributed', action='store_true',
+                    help='Use multi-processing distributed training to launch '
+                         'N processes per node, which has N GPUs. This is the '
+                         'fastest way to use PyTorch for either single node or '
+                         'multi node data parallel training')
+parser.add_argument('--num_classes', default=10, type=int,
+                    help='The number of classes.')
+# Ascend 910 (NPU) specific options
+parser.add_argument('--device', default='npu', type=str, help='npu or gpu')
+parser.add_argument('--addr', default='127.0.0.1',
+                    type=str, help='master addr')
+parser.add_argument('--device_list', default='0,1,2,3,4,5,6,7',
+                    type=str, help='device id list')
+parser.add_argument('--warm_up_epochs', default=0, type=int,
+                    help='warm up')
+parser.add_argument('--amp', default=False, action='store_true',
+                    help='use amp to train the model')
+parser.add_argument('--loss-scale', default=128., type=float,
+                    help='static loss scale passed to apex amp (default: 128)')
+parser.add_argument('--opt-level', default='O2', type=str,
+                    help='apex amp optimization level (default: O2)')
+parser.add_argument('--prof', default=False, action='store_true',
+                    help='use profiling to evaluate the performance of model')
+args = parser.parse_args()
+best_acc1 = 0  # best top-1 validation accuracy seen so far (updated in main_worker)
+
+def device_id_to_process_device_map(device_list):
+    """Build a ``{process index: device id}`` mapping.
+
+    ``device_list`` is a comma-separated string of integer device ids
+    (e.g. ``'0,1,2,3'``). The ids are sorted in ascending order and the
+    position of each id in that order becomes its process index.
+    """
+    ordered_ids = sorted(int(token) for token in device_list.split(","))
+    # enumerate() pairs each sorted position with its device id.
+    return dict(enumerate(ordered_ids))
+
+
+def main():
+    """Prepare the run from the parsed ``args`` and launch the worker(s).
+
+    Exports the master address/port, optionally seeds the RNGs, derives the
+    number of devices per node, then either spawns one ``main_worker`` per
+    device or calls ``main_worker`` directly in this process.
+    """
+    print(args.device_list)
+
+    os.environ.update({'MASTER_ADDR': args.addr, 'MASTER_PORT': '29688'})
+
+    if args.seed is not None:
+        random.seed(args.seed)
+        torch.manual_seed(args.seed)
+        cudnn.deterministic = True
+        warnings.warn('You have chosen to seed training. '
+                      'This will turn on the CUDNN deterministic setting, '
+                      'which can slow down your training considerably! '
+                      'You may see unexpected behavior when restarting '
+                      'from checkpoints.')
+
+    if args.gpu is not None:
+        warnings.warn('You have chosen a specific GPU. This will completely '
+                      'disable data parallelism.')
+
+    if args.dist_url == "env://" and args.world_size == -1:
+        args.world_size = int(os.environ["WORLD_SIZE"])
+
+    args.distributed = args.multiprocessing_distributed or args.world_size > 1
+    args.process_device_map = device_id_to_process_device_map(args.device_list)
+
+    # NPU runs take the device count from --device_list; otherwise ask CUDA.
+    if args.device == 'npu':
+        ngpus_per_node = len(args.process_device_map)
+    elif args.distributed:
+        ngpus_per_node = torch.cuda.device_count()
+    else:
+        ngpus_per_node = 1
+    print('ngpus_per_node:', ngpus_per_node)
+
+    if not args.multiprocessing_distributed:
+        main_worker(args.gpu, ngpus_per_node, args)
+        return
+
+    # One process per device, so scale the node count up to a process count.
+    args.world_size = ngpus_per_node * args.world_size
+    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
+
+# Resize to 32x32 and convert to tensors. Kept under its own name so the
+# imported ``transforms`` module stays usable after this statement.
+mnist_transform = transforms.Compose([transforms.Resize((32, 32)),
+                                      transforms.ToTensor()])
+# Data loading code
+if args.data:
+    train_dataset = datasets.MNIST(root=args.data,
+                                   train=True,
+                                   transform=mnist_transform,
+                                   download=False)
+    val_dataset = datasets.MNIST(root=args.data, train=False,
+                                 transform=mnist_transform)
+else:
+    train_dataset = datasets.MNIST(root='mnist_data',
+                                   train=True,
+                                   transform=mnist_transform,
+                                   download=True)
+    val_dataset = datasets.MNIST(root='mnist_data', train=False,
+                                 transform=mnist_transform)
+
+def main_worker(gpu, ngpus_per_node, args):
+    """Per-process entry point: build LeNet5, set up device/AMP/DDP, then train or evaluate."""
+    global best_acc1
+    args.gpu = args.process_device_map[gpu]  # translate the process index into a device id
+
+    if args.gpu is not None:
+        print("Use GPU: {} for training".format(args.gpu))
+
+    if args.distributed:
+        if args.dist_url == "env://" and args.rank == -1:
+            args.rank = int(os.environ["RANK"])
+        if args.multiprocessing_distributed:
+            # For multiprocessing distributed training, rank needs to be the
+            # global rank among all the processes
+            args.rank = args.rank * ngpus_per_node + gpu
+
+        if args.device == 'npu':
+            dist.init_process_group(backend=args.dist_backend,  # init_method=args.dist_url,
+                                    world_size=args.world_size, rank=args.rank)
+        else:
+            dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank)
+    # create model
+    if args.pretrained:
+        print("=> using pre-trained model LenNet_5")
+        model = LeNet5(args.num_classes)
+        print("loading model of yours...")
+        if args.pth_path:
+            print("load pth you give")
+            pretrained_dict = torch.load(args.pth_path, map_location="cpu")["state_dict"]
+        else:
+            pretrained_dict = torch.load("./model_best.pth.tar", map_location="cpu")["state_dict"]
+
+        if "fc.weight" in pretrained_dict:
+            print("pop fc layer weight")
+            pretrained_dict.pop('fc.weight')
+            pretrained_dict.pop('fc.bias')
+        model.load_state_dict(pretrained_dict, strict=False)
+    else:
+        print("=> creating model LeNet_5")
+        model = LeNet5(args.num_classes)
+
+    if args.distributed:
+        # For multiprocessing distributed, DistributedDataParallel constructor
+        # should always set the single device scope, otherwise,
+        # DistributedDataParallel will use all available devices.
+        if args.gpu is not None:
+            if args.device == 'npu':
+                loc = 'npu:{}'.format(args.gpu)
+                torch.npu.set_device(loc)
+                model = model.to(loc)
+            else:
+                torch.cuda.set_device(args.gpu)
+                model.cuda(args.gpu)
+
+            # When using a single GPU per process and per
+            # DistributedDataParallel, we need to divide the batch size
+            # ourselves based on the total number of GPUs we have
+            args.batch_size = int(args.batch_size / args.world_size)
+            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
+        else:
+            if args.device == 'npu':
+                loc = 'npu:{}'.format(args.gpu)
+                model = model.to(loc)
+            else:
+                model.cuda()
+            # DistributedDataParallel will divide and allocate batch_size to all
+            # available GPUs if device_ids are not set
+            print("[gpu id:", args.gpu, "]",
+                  "============================test   args.gpu is not None   else==========================")
+    elif args.gpu is not None:
+        print("[gpu id:", args.gpu, "]",
+              "============================test   elif args.gpu is not None:==========================")
+        if args.device == 'npu':
+            loc = 'npu:{}'.format(args.gpu)
+            torch.npu.set_device(args.gpu)
+            model = model.to(loc)
+        else:
+            torch.cuda.set_device(args.gpu)
+            model = model.cuda(args.gpu)
+
+    else:
+        # DataParallel will divide and allocate batch_size to all available GPUs
+        print("[gpu id:", args.gpu, "]", "============================test   1==========================")
+        print("[gpu id:", args.gpu, "]", "============================test   3==========================")
+        if args.device == 'npu':
+            loc = 'npu:{}'.format(args.gpu)
+        else:
+            print("before : model = torch.nn.DataParallel(model).cuda()")
+
+    # define loss function (criterion) and optimizer
+    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
+    # Adam only consumes --lr; --momentum/--weight-decay were used by the
+    # NpuFusedSGD optimizer that is currently disabled:
+    #   apex.optimizers.NpuFusedSGD(params, args.lr, momentum=..., weight_decay=...)
+
+    if args.amp:
+        model, optimizer = amp.initialize(
+            model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale, combine_grad=True)
+
+    if args.distributed:
+        # For multiprocessing distributed, DistributedDataParallel constructor
+        # should always set the single device scope, otherwise,
+        # DistributedDataParallel will use all available devices.
+        if args.gpu is not None:
+            # When using a single GPU per process and per
+            # DistributedDataParallel, we need to divide the batch size
+            # ourselves based on the total number of GPUs we have
+            if args.pretrained:
+                model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False,
+                                                                  find_unused_parameters=True)
+            else:
+                model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False)
+        else:
+            print("[gpu id:", args.gpu, "]",
+                  "============================test   args.gpu is not None   else==========================")
+            model = torch.nn.parallel.DistributedDataParallel(model)
+    elif args.gpu is not None:
+        print("[gpu id:", args.gpu, "]",
+              "============================test   elif args.gpu is not None:==========================")
+    else:
+        # DataParallel will divide and allocate batch_size to all available GPUs
+        print("[gpu id:", args.gpu, "]", "============================test   1==========================")
+        print("[gpu id:", args.gpu, "]", "============================test   3==========================")
+        if args.device == 'npu':
+            loc = 'npu:{}'.format(args.gpu)
+            model = torch.nn.DataParallel(model).to(loc)
+        else:
+            model = torch.nn.DataParallel(model).cuda()
+
+    if args.device == 'npu':
+        loc = 'npu:{}'.format(args.gpu)
+        criterion = nn.CrossEntropyLoss().to(loc)
+    else:
+        criterion = nn.CrossEntropyLoss().cuda(args.gpu)
+
+    # optionally resume from a checkpoint
+    if args.resume:
+        if os.path.isfile(args.resume):
+            print("=> loading checkpoint '{}'".format(args.resume))
+            if args.gpu is None:
+                checkpoint = torch.load(args.resume)
+            else:
+                # Map model to be loaded to specified single gpu.
+                if args.device == 'npu':
+                    loc = 'npu:{}'.format(args.gpu)
+                else:
+                    loc = 'cuda:{}'.format(args.gpu)
+                checkpoint = torch.load(args.resume, map_location=loc)
+            args.start_epoch = checkpoint['epoch']
+            best_acc1 = checkpoint['best_acc1']
+            if args.gpu is not None:
+                # best_acc1 may be from a checkpoint from a different GPU
+                best_acc1 = best_acc1.to(args.gpu)
+            model.load_state_dict(checkpoint['state_dict'])
+            optimizer.load_state_dict(checkpoint['optimizer'])
+            if args.amp:
+                amp.load_state_dict(checkpoint['amp'])
+            print("=> loaded checkpoint '{}' (epoch {})"
+                  .format(args.resume, checkpoint['epoch']))
+        else:
+            print("=> no checkpoint found at '{}'".format(args.resume))
+
+    cudnn.benchmark = True
+
+    if args.distributed:
+        train_sampler = torch.utils.data.distributed.DistributedSampler(
+            train_dataset)
+    else:
+        train_sampler = None
+
+    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                              batch_size=args.batch_size,
+                              shuffle=(train_sampler is None),
+                              num_workers=args.workers,
+                              pin_memory=False,
+                              sampler=train_sampler,
+                              drop_last=True)
+
+
+    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
+                              batch_size=args.batch_size,
+                              shuffle=False)
+
+    if args.evaluate:
+        validate(val_loader, model, criterion, args, ngpus_per_node)
+        return
+
+    if args.prof:
+        profiling(train_loader, model, criterion, optimizer, args)
+        return
+
+    start_time = time.time()
+    for epoch in range(args.start_epoch, args.epochs):
+        if args.distributed:
+            train_sampler.set_epoch(epoch)
+
+        adjust_learning_rate(optimizer, epoch, args)
+
+        # train for one epoch
+        train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node)
+
+        # evaluate on validation set
+        acc1 = validate(val_loader, model, criterion, args, ngpus_per_node)
+
+        # remember best acc@1 and save checkpoint
+        is_best = acc1 > best_acc1
+        best_acc1 = max(acc1, best_acc1)
+        if args.device == 'npu' and args.gpu == 0 and epoch == args.epochs - 1:
+            print("Complete {} epoch training, take time:{}h".format(args.epochs, round((time.time() - start_time) / 3600.0, 2)))
+
+        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
+                                                    and args.rank % ngpus_per_node == 0):
+
+            ############## npu modify begin #############
+            if args.amp:
+                save_checkpoint({
+                    'epoch': epoch + 1,
+                    'arch': 'LeNet5',
+                    'state_dict': model.state_dict(),
+                    'best_acc1': best_acc1,
+                    'optimizer': optimizer.state_dict(),
+                    'amp': amp.state_dict(),
+                }, is_best)
+            else:
+                save_checkpoint({
+                    'epoch': epoch + 1,
+                    'arch': 'LeNet',
+                    'state_dict': model.state_dict(),
+                    'best_acc1': best_acc1,
+                    'optimizer': optimizer.state_dict(),
+                }, is_best)
+        ############## npu modify end #############
+
+
+def profiling(data_loader, model, criterion, optimizer, args):
+    """Run 5 warm-up training steps, profile the next one and export it to ``output.prof``."""
+    model.train()  # switch to train mode
+
+    def update(model, images, target, optimizer):
+        optimizer.zero_grad()  # must precede backward(); zeroing afterwards discards the new gradients
+        output, _ = model(images)
+        loss = criterion(output, target)
+        if args.amp:
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+        else:
+            loss.backward()
+        optimizer.step()
+
+    for step, (images, target) in enumerate(data_loader):
+        if args.device == 'npu':
+            loc = 'npu:{}'.format(args.gpu)
+            images = images.to(loc, non_blocking=True).to(torch.float)
+            target = target.to(torch.int32).to(loc, non_blocking=True)
+        else:
+            images = images.cuda(args.gpu, non_blocking=True)
+            target = target.cuda(args.gpu, non_blocking=True)
+
+        if step < 5:
+            update(model, images, target, optimizer)
+        else:
+            if args.device == 'npu':
+                with torch.autograd.profiler.profile(use_npu=True) as prof:
+                    update(model, images, target, optimizer)
+            else:
+                with torch.autograd.profiler.profile(use_cuda=True) as prof:
+                    update(model, images, target, optimizer)
+            break
+
+    prof.export_chrome_trace("output.prof")
+
+
+def train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node):
+    """Train ``model`` for one epoch over ``train_loader``.
+
+    Progress is displayed every ``args.print_freq`` batches and an FPS
+    summary is printed at the end, both on the reporting process only.
+    """
+    batch_time = AverageMeter('Time', ':6.3f')
+    data_time = AverageMeter('Data', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(len(train_loader),
+                             [batch_time, data_time, losses, top1, top5],
+                             prefix="Epoch: [{}]".format(epoch))
+    on_npu = args.device == 'npu'
+    # Without multiprocessing every process reports; otherwise one per node.
+    is_reporter = not args.multiprocessing_distributed or args.rank % ngpus_per_node == 0
+
+    model.train()
+    tick = time.time()
+    for step, (images, target) in enumerate(train_loader):
+        data_time.update(time.time() - tick)  # time spent waiting for the batch
+
+        if on_npu:
+            loc = 'npu:{}'.format(args.gpu)
+            images = images.to(loc, non_blocking=True).to(torch.float)
+            target = target.to(torch.int32).to(loc, non_blocking=True)
+        else:
+            images = images.cuda(args.gpu, non_blocking=True)
+            target = target.cuda(args.gpu, non_blocking=True)
+
+        # The model returns two values; only the first feeds the loss.
+        output, _ = model(images)
+        loss = criterion(output, target)
+
+        # Record loss and top-1/top-5 accuracy, weighted by the batch size.
+        n_samples = images.size(0)
+        acc1, acc5 = accuracy(output, target, topk=(1, 5))
+        losses.update(loss.item(), n_samples)
+        top1.update(acc1[0], n_samples)
+        top5.update(acc5[0], n_samples)
+
+        # Backward pass and parameter update.
+        optimizer.zero_grad()
+        if args.amp:
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+        else:
+            loss.backward()
+        optimizer.step()
+        if on_npu:
+            torch.npu.synchronize()
+
+        batch_time.update(time.time() - tick)
+        tick = time.time()
+
+        if step % args.print_freq == 0 and is_reporter:
+            progress.display(step)
+
+    if is_reporter and batch_time.avg:
+        total_batch = args.world_size * args.batch_size
+        print("[npu id:", args.gpu, "]", "batch_size:", total_batch,
+              'Time: {:.3f}'.format(batch_time.avg),
+              '* FPS@all {:.3f}'.format(total_batch / batch_time.avg))
+
+
+def validate(val_loader, model, criterion, args, ngpus_per_node):
+    """Evaluate ``model`` on ``val_loader`` and return ``top1.avg``.
+
+    The reporting process shows progress every ``args.print_freq`` batches
+    and always prints the final ``[AVG-ACC]`` line; that line must not depend
+    on the batch count being a multiple of ``args.print_freq``.
+    """
+    batch_time = AverageMeter('Time', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(len(val_loader),
+                             [batch_time, losses, top1, top5],
+                             prefix='Test: ')
+    # Without multiprocessing every process reports; otherwise one per node.
+    is_reporter = not args.multiprocessing_distributed or args.rank % ngpus_per_node == 0
+
+    # switch to evaluate mode
+    model.eval()
+
+    with torch.no_grad():
+        end = time.time()
+        for i, (X, y_true) in enumerate(val_loader):
+            if args.gpu is not None:
+                if args.device == 'npu':
+                    loc = 'npu:{}'.format(args.gpu)
+                    X = X.to(loc).to(torch.float)
+                else:
+                    X = X.cuda(args.gpu, non_blocking=True)
+            if args.device == 'npu':
+                loc = 'npu:{}'.format(args.gpu)
+                y_true = y_true.to(torch.int32).to(loc, non_blocking=True)
+            else:
+                y_true = y_true.cuda(args.gpu, non_blocking=True)
+
+            # compute output
+            output, _ = model(X)
+            loss = criterion(output, y_true)
+
+            # measure accuracy and record loss
+            acc1, acc5 = accuracy(output, y_true, topk=(1, 5))
+            losses.update(loss.item(), X.size(0))
+            top1.update(acc1[0], X.size(0))
+            top5.update(acc5[0], X.size(0))
+
+            # measure elapsed time
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+            if is_reporter and i % args.print_freq == 0:
+                progress.display(i)
+
+    if is_reporter:
+        print("[gpu id:", args.gpu, "]", '[AVG-ACC] * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
+              .format(top1=top1, top5=top5))
+
+    return top1.avg
+
+
+def save_checkpoint(state, is_best, filename='checkpoint/checkpoint.pth.tar'):
+    os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)  # torch.save needs an existing directory
+    torch.save(state, filename)
+    if is_best:  # keep a separate copy of the best-scoring checkpoint
+        shutil.copyfile(filename, 'checkpoint/model_best.pth.tar')
+
+class AverageMeter(object):
+    """Track the latest value and a running average that skips warm-up updates.
+
+    Samples are only accumulated once more than ``start_count_index * N``
+    items have been counted, where ``N`` is the ``n`` of the first update.
+    """
+
+    def __init__(self, name, fmt=':f', start_count_index=2):
+        self.name, self.fmt = name, fmt
+        self.reset()
+        self.start_count_index = start_count_index
+
+    def reset(self):
+        self.val = self.avg = self.sum = self.count = 0
+
+    def update(self, val, n=1):
+        if not self.count:
+            self.N = n  # size of the first update defines the warm-up unit
+        self.val = val
+        self.count += n
+        skipped = self.start_count_index * self.N
+        if self.count > skipped:
+            self.sum += val * n
+            self.avg = self.sum / (self.count - skipped)
+
+    def __str__(self):
+        template = ''.join(['{name} {val', self.fmt, '} ({avg', self.fmt, '})'])
+        return template.format(**self.__dict__)
+
+
+class ProgressMeter(object):
+    """Print a ``[batch/total]`` prefix followed by every meter, tab-separated."""
+
+    def __init__(self, num_batches, meters, prefix=""):
+        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
+        self.meters = meters
+        self.prefix = prefix
+
+    def display(self, batch):
+        fields = [self.prefix + self.batch_fmtstr.format(batch)] + [str(m) for m in self.meters]
+        print('\t'.join(fields))
+
+    def _get_batch_fmtstr(self, num_batches):
+        width = len(str(num_batches // 1))
+        slot = '{:%dd}' % width  # right-align the batch index to the total's width
+        return '[{}/{}]'.format(slot, slot.format(num_batches))
+
+
+def adjust_learning_rate(optimizer, epoch, args):
+    """Set the lr of every param group: linear warm-up, then cosine decay.
+
+    For ``epoch < args.warm_up_epochs`` the rate ramps up linearly towards
+    ``args.lr``; afterwards it follows half a cosine period that ends at
+    ``args.epochs``. The chosen rate is printed and written to each group
+    in ``optimizer.param_groups``.
+    """
+    warm_up = args.warm_up_epochs
+    if warm_up > 0 and epoch < warm_up:
+        scale = (epoch + 1) / (warm_up + 1)
+    else:
+        # Half-cosine from 1 down towards 0 over the post-warm-up epochs.
+        scale = 0.5 * (1 + np.cos(np.pi * (epoch - warm_up) / (args.epochs - warm_up)))
+    lr = args.lr * scale
+
+    print("=> Epoch[%d] Setting lr: %.4f" % (epoch, lr))
+    # Apply the same rate to every parameter group.
+    for group in optimizer.param_groups:
+        group['lr'] = lr
+
+
+def accuracy(output, target, topk=(1,)):
+    """Return the top-k accuracy (in percent) for every k in ``topk``.
+
+    ``output`` holds one row of class scores per sample and ``target`` the
+    true class indices; each result is a one-element tensor.
+    """
+    with torch.no_grad():
+        deepest_k = max(topk)
+        percent_per_hit = 100.0 / target.size(0)
+        # Row r of ``ranked`` holds each sample's (r+1)-th highest-scoring class.
+        ranked = output.topk(deepest_k, 1, True, True)[1].t()
+        hits = ranked.eq(target.view(1, -1).expand_as(ranked))
+        # The first k rows flag the samples whose label is among their top k.
+        return [hits[:k].reshape(-1).float().sum(0, keepdim=True).mul_(percent_per_hit)
+                for k in topk]
+
+
+if __name__ == '__main__':
+    main()
diff --git a/PyTorch/contrib/cv/classification/LeNet/models/__init__.py b/PyTorch/contrib/cv/classification/LeNet/models/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/PyTorch/contrib/cv/classification/LeNet/models/lenet_5.py b/PyTorch/contrib/cv/classification/LeNet/models/lenet_5.py
new file mode 100644
index 0000000000..df92fdc6ea
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LeNet/models/lenet_5.py
@@ -0,0 +1,36 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class LeNet5(nn.Module):
+    """LeNet-5 style CNN: three 5x5 convolution stages and a two-layer MLP head.
+
+    ReLU follows each convolution and the first linear layer; 2x2 average
+    pooling follows the first two convolutions. The head takes exactly 120
+    features, so the last convolution must yield 1x1 maps (e.g. 1x32x32 inputs).
+    """
+
+    def __init__(self, n_classes):
+        """Build the layers; ``n_classes`` is the width of the output layer."""
+        super().__init__()
+        # Creation order below fixes both the state_dict indices and the
+        # order in which parameters are initialised.
+        conv_stages = [
+            nn.Conv2d(1, 6, kernel_size=5, stride=1),
+            nn.ReLU(),
+            nn.AvgPool2d(kernel_size=2),
+            nn.Conv2d(6, 16, kernel_size=5, stride=1),
+            nn.ReLU(),
+            nn.AvgPool2d(kernel_size=2),
+            nn.Conv2d(16, 120, kernel_size=5, stride=1),
+            nn.ReLU(),
+        ]
+        head = [nn.Linear(120, 84), nn.ReLU(), nn.Linear(84, n_classes)]
+        self.feature_extractor = nn.Sequential(*conv_stages)
+        self.classifier = nn.Sequential(*head)
+
+    def forward(self, x):
+        """Return ``(logits, probs)`` with ``probs`` the softmax of logits over dim 1."""
+        features = torch.flatten(self.feature_extractor(x), 1)
+        logits = self.classifier(features)
+        return logits, F.softmax(logits, dim=1)
diff --git a/PyTorch/contrib/cv/classification/LeNet/modelzoo_level.txt b/PyTorch/contrib/cv/classification/LeNet/modelzoo_level.txt
new file mode 100644
index 0000000000..31529da2e6
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LeNet/modelzoo_level.txt
@@ -0,0 +1,3 @@
+FuncStatus:OK
+PerfStatus:OK
+PrecisionStatus:OK
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/LeNet/requirements.txt b/PyTorch/contrib/cv/classification/LeNet/requirements.txt
new file mode 100644
index 0000000000..00352a8956
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LeNet/requirements.txt
@@ -0,0 +1,4 @@
+torch==1.5.0
+apex
+torchvision==0.6.0  # 0.6.0 is the torchvision release paired with torch 1.5.0 (0.5.0 pairs with torch 1.4.0)
+numpy
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/LeNet/test/env_npu.sh b/PyTorch/contrib/cv/classification/LeNet/test/env_npu.sh
new file mode 100644
index 0000000000..4740fafdcc
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LeNet/test/env_npu.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+export install_path=/usr/local/Ascend
+
+if [ -d ${install_path}/toolkit ]; then  # layout 1: ${install_path}/toolkit exists. NOTE: ${path_lib} used below is only computed at the end of this file
+    export LD_LIBRARY_PATH=${install_path}/fwkacllib/lib64/:/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH}
+    export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH
+    export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH
+    export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH
+    export ASCEND_OPP_PATH=${install_path}/opp
+else
+    if [ -d ${install_path}/nnae/latest ];then  # layout 2: ${install_path}/nnae/latest exists
+        export LD_LIBRARY_PATH=${install_path}/nnae/latest/fwkacllib/lib64/:/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH
+        export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/
+        export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/
+        export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
+        export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
+        export ASCEND_AICPU_PATH=${install_path}/nnae/latest
+    else  # layout 3: fall back to ${install_path}/ascend-toolkit/latest
+        export LD_LIBRARY_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH
+        export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/
+        export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/
+        export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
+        export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
+        export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest
+    fi
+fi
+
+# Run "msnpureport -g error" once for each of the eight device ids 0-7, in
+# ascending order (presumably this sets each device's log level to "error" —
+# confirm against the tool's own help output).
+for npu_id in 0 1 2 3 4 5 6 7
+do
+    ${install_path}/driver/tools/msnpureport -g error -d ${npu_id}
+done
+unset npu_id
+
+# Print host logs to the serial port, 0-off/1-on
+export ASCEND_SLOG_PRINT_TO_STDOUT=0
+# Default log level, 0-debug/1-info/2-warning/3-error
+export ASCEND_GLOBAL_LOG_LEVEL=3
+# Event log switch, 0-off/1-on
+export ASCEND_GLOBAL_EVENT_ENABLE=0
+# Whether to enable taskque, 0-off/1-on
+export TASK_QUEUE_ENABLE=1
+# Whether to enable PTCopy, 0-off/1-on
+export PTCOPY_ENABLE=1
+# Whether to enable the combined flag, 0-off/1-on
+export COMBINED_ENABLE=0
+# Whether special scenarios need recompilation; no need to modify
+export DYNAMIC_OP="ADD#MUL"
+# HCCL whitelist switch, 1-off/0-on
+export HCCL_WHITELIST_DISABLE=1
+export HCCL_IF_IP=$(hostname -I |awk '{print $1}')
+
+ulimit -SHn 512000
+
+# Collect, for every sys.path entry containing "-packages", its prefix up to the
+# first "lib" (when present) plus its torch/lib subdirectory, each followed by ':'.
+path_lib=$(python3.7 -c "
+import sys
+
+parts = []
+for entry in sys.path:
+    if '-packages' not in entry:
+        continue
+    lib_at = entry.find('lib')
+    if lib_at != -1:
+        # keep everything up to and including the first 'lib'
+        parts.append(entry[:lib_at + 3] + ':')
+    parts.append(entry + '/torch/lib:')
+print(''.join(parts))
+")
+
+echo ${path_lib}
+
+export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH
diff --git a/PyTorch/contrib/cv/classification/LeNet/test/train_eval_8p.sh b/PyTorch/contrib/cv/classification/LeNet/test/train_eval_8p.sh
new file mode 100644
index 0000000000..78313a496f
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LeNet/test/train_eval_8p.sh
@@ -0,0 +1,132 @@
+#!/bin/bash
+
+################ Basic configuration; review and adjust per model ##################
+# Required fields (must be defined here): Network batch_size resume RANK_SIZE
+# Network name, same as the directory name
+Network="LeNet_5_for_PyTorch"
+# Training batch_size
+batch_size=2048
+# Number of NPU cards used for training
+export RANK_SIZE=8
+# Dataset path; keep empty, no need to modify
+data_path=""
+# Checkpoint file path; use the actual path
+pth_path=""
+# Training epochs
+train_epochs=80
+# Learning rate
+learning_rate=0.012
+
+
+# Argument parsing: data_path is mandatory, other arguments are model-specific; "$@" keeps each argument intact (no word splitting or globbing)
+for para in "$@"
+do
+    if [[ $para == --workers* ]];then
+        workers=${para#*=}  # NOTE: parsed but not referenced again in this script
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --pth_path* ]];then
+        pth_path=${para#*=}
+    fi
+done
+
+# Check that data_path was passed; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be confing"
+    exit 1
+fi
+
+# Check that pth_path was passed; the evaluation script requires it
+if [[ $pth_path == "" ]];then
+    echo "[Error] para \"pth_path\" must be confing"
+    exit 1
+fi
+
+############### Resolve the directory the training script runs from ###############
+# cd to the directory that holds the test folder before running, for compatibility; test_path_dir is the path that includes the test folder
+cur_path=$(pwd)
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=$(pwd)
+else
+    test_path_dir=${cur_path}/test
+fi
+
+
+################# Create the log output directory; no need to modify #################
+ASCEND_DEVICE_ID=0
+# Start from an empty directory; paths are quoted so that spaces or glob
+# characters in the working directory cannot redirect "rm -rf" elsewhere.
+if [ -d "${test_path_dir}/output/${ASCEND_DEVICE_ID}" ];then
+    rm -rf "${test_path_dir}/output/${ASCEND_DEVICE_ID}"
+fi
+mkdir -p "${test_path_dir}/output/${ASCEND_DEVICE_ID}"
+
+
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source the environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+fi
+echo ${data_path}  # NOTE(review): data_path is only echoed; main.py below receives "" as its first argument — confirm this is intended
+nohup python3.7 ${cur_path}/main.py \
+    "" \
+    --evaluate \
+    --resume ${pth_path} \
+    --seed=49 \
+    --learning-rate=${learning_rate} \
+    --mom=0.9 \
+    --weight-decay=0.0005  \
+    --print-freq=1 \
+    --dist-url='tcp://127.0.0.1:50023' \
+    --dist-backend 'hccl' \
+    --multiprocessing-distributed \
+    --world-size=1 \
+    --rank=0 \
+    --device='npu' \
+    --epochs=${train_epochs} \
+    --loss-scale=32 \
+    --amp \
+    --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+
+# Extract training accuracy; review and adjust per model
+train_accuracy=`grep -a '* Acc@1'  ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+
+# Loss value of the last iteration (8th field of the last log line containing "Test"); no need to modify
+ActualLoss=`grep Test ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${ASCEND_DEVICE_ID}.log | awk '{print $8}' | awk 'END {print}'`
+
+# Write key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" >  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/LeNet/test/train_finetune_1p.sh b/PyTorch/contrib/cv/classification/LeNet/test/train_finetune_1p.sh
new file mode 100644
index 0000000000..2668bfa127
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LeNet/test/train_finetune_1p.sh
@@ -0,0 +1,155 @@
+#!/bin/bash
+
+################ Basic configuration; review and adjust per model ##################
+# Required fields (must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="LeNet_5_for_PyTorch"
+# Training batch_size
+batch_size=256
+# Number of NPU cards used for training
+export RANK_SIZE=1
+# Dataset path; keep empty, no need to modify
+data_path=""
+# Checkpoint file path; use the actual path
+pth_path=""
+# Training epochs
+train_epochs=80
+# ID of the NPU device card used for training
+device_id=0
+
+# Argument parsing: data_path is mandatory, other arguments are model-specific; "$@" keeps each argument intact (no word splitting or globbing)
+for para in "$@"
+do
+    if [[ $para == --device_id* ]];then
+        device_id=${para#*=}
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --pth_path* ]];then
+        pth_path=${para#*=}
+    fi
+done
+
+# Check that data_path was passed; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be confing"
+    exit 1
+fi
+
+# Check that pth_path was passed; this script requires it
+if [[ $pth_path == "" ]];then
+    echo "[Error] para \"pth_path\" must be confing"
+    exit 1
+fi
+
+# Check that a device id is set: a pre-set ASCEND_DEVICE_ID wins, otherwise device_id is exported as ASCEND_DEVICE_ID; no need to modify
+if [ $ASCEND_DEVICE_ID ];then
+    echo "device id is ${ASCEND_DEVICE_ID}"
+elif [ ${device_id} ];then
+    export ASCEND_DEVICE_ID=${device_id}
+    echo "device id is ${ASCEND_DEVICE_ID}"
+else
+    echo "[Error] device id must be config"
+    exit 1
+fi
+
+
+
+############### Resolve the directory the training script runs from ###############
+# cd to the directory that holds the test folder before running, for compatibility; test_path_dir is the path that includes the test folder
+cur_path=$(pwd)
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=$(pwd)
+else
+    test_path_dir=${cur_path}/test
+fi
+
+
+################# Create the log output directory; no need to modify #################
+# Start from an empty directory; paths are quoted so that spaces or glob
+# characters in the working directory cannot redirect "rm -rf" elsewhere.
+if [ -d "${test_path_dir}/output/${ASCEND_DEVICE_ID}" ];then
+    rm -rf "${test_path_dir}/output/${ASCEND_DEVICE_ID}"
+fi
+mkdir -p "${test_path_dir}/output/${ASCEND_DEVICE_ID}"
+
+
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source the environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+fi
+echo ${pth_path}  # NOTE(review): main.py below receives "" as its first argument, not ${data_path} — confirm this is intended
+nohup python3.7 ${cur_path}/main.py \
+    "" \
+    --learning-rate=0.012 \
+    --mom=0.9 \
+    --world-size=1 \
+    --weight-decay=0.0005 \
+    --print-freq=1 \
+    --device='npu' \
+    --gpu=${ASCEND_DEVICE_ID} \
+    --dist-backend 'hccl' \
+    --epochs=${train_epochs} \
+    --loss-scale=32 \
+    --amp \
+    --pretrained \
+    --pth_path=${pth_path} \
+    --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Extract performance FPS; review and adjust per model
+FPS=`grep -a 'FPS'  ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Extract training accuracy; review and adjust per model
+train_accuracy=`grep -a '* Acc@1'  ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance monitoring summary
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data; no need to modify
+# Throughput
+ActualFPS=${FPS}
+# Training time per iteration in ms; stays empty, without an awk error, when FPS is missing or not positive
+TrainingTime=$(awk -v bs="${batch_size}" -v fps="${FPS}" 'BEGIN{ if (fps + 0 > 0) printf "%.2f\n", bs * 1000 / fps }')
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}'  ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Write key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" >  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/LeNet/test/train_full_1p.sh b/PyTorch/contrib/cv/classification/LeNet/test/train_full_1p.sh
new file mode 100644
index 0000000000..1a36df7d69
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LeNet/test/train_full_1p.sh
@@ -0,0 +1,143 @@
+#!/bin/bash
+
+################ Basic configuration; review and adjust per model ##################
+# Required fields (must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="LeNet_5_for_PyTorch"
+# Training batch_size
+batch_size=256
+# Number of NPU cards used for training
+export RANK_SIZE=1
+# Dataset path; keep empty, no need to modify
+data_path=""
+
+# Training epochs
+train_epochs=80
+# ID of the NPU device card used for training
+device_id=0
+
+# Argument parsing: data_path is mandatory, other arguments are model-specific; "$@" keeps each argument intact (no word splitting or globbing)
+for para in "$@"
+do
+    if [[ $para == --device_id* ]];then
+        device_id=${para#*=}
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    fi
+done
+
+# Check that data_path was passed; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be confing"
+    exit 1
+fi
+# Check that a device id is set: a pre-set ASCEND_DEVICE_ID wins, otherwise device_id is exported as ASCEND_DEVICE_ID; no need to modify
+if [ $ASCEND_DEVICE_ID ];then
+    echo "device id is ${ASCEND_DEVICE_ID}"
+elif [ ${device_id} ];then
+    export ASCEND_DEVICE_ID=${device_id}
+    echo "device id is ${ASCEND_DEVICE_ID}"
+else
+    echo "[Error] device id must be config"
+    exit 1
+fi
+
+
+
+############### Resolve the directory the training script runs from ###############
+# cd to the directory that holds the test folder before running, for compatibility; test_path_dir is the path that includes the test folder
+cur_path=$(pwd)
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=$(pwd)
+else
+    test_path_dir=${cur_path}/test
+fi
+
+
+################# Create the log output directory; no need to modify #################
+# Start from an empty directory; paths are quoted so that spaces or glob
+# characters in the working directory cannot redirect "rm -rf" elsewhere.
+if [ -d "${test_path_dir}/output/${ASCEND_DEVICE_ID}" ];then
+    rm -rf "${test_path_dir}/output/${ASCEND_DEVICE_ID}"
+fi
+mkdir -p "${test_path_dir}/output/${ASCEND_DEVICE_ID}"
+
+
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source the environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+fi
+# NOTE(review): main.py below receives "" as its first argument, not ${data_path} — confirm this is intended
+nohup python3.7 ${cur_path}/main.py \
+    "" \
+    --learning-rate=0.012 \
+    --momentum=0.9 \
+    --world-size=1 \
+    --weight-decay=0.0005 \
+    --print-freq=1 \
+    --device='npu' \
+    --gpu=${ASCEND_DEVICE_ID} \
+    --dist-backend 'hccl' \
+    --epochs=${train_epochs} \
+    --loss-scale=32 \
+    --amp \
+    --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Extract performance FPS; review and adjust per model
+FPS=`grep -a 'FPS'  ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Extract training accuracy; review and adjust per model
+train_accuracy=`grep -a '* Acc@1'  ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance monitoring summary
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data; no need to modify
+# Throughput
+ActualFPS=${FPS}
+# Training time per iteration in ms; stays empty, without an awk error, when FPS is missing or not positive
+TrainingTime=$(awk -v bs="${batch_size}" -v fps="${FPS}" 'BEGIN{ if (fps + 0 > 0) printf "%.2f\n", bs * 1000 / fps }')
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}'  ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Write key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" >  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/PyTorch/contrib/cv/classification/LeNet/test/train_full_8p.sh b/PyTorch/contrib/cv/classification/LeNet/test/train_full_8p.sh
new file mode 100644
index 0000000000..413ac99fe4
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LeNet/test/train_full_8p.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+
+################ Basic configuration; review and adjust per model ##################
+# Required fields (must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="LeNet_5_for_PyTorch"
+# Training batch_size
+batch_size=2048
+# Number of NPU cards used for training
+export RANK_SIZE=8
+# Dataset path; keep empty, no need to modify
+data_path=""
+
+# Training epochs
+train_epochs=80
+# Learning rate
+learning_rate=0.012
+
+
+# Argument parsing: data_path is mandatory, other arguments are model-specific; "$@" keeps each argument intact (no word splitting or globbing)
+for para in "$@"
+do
+    if [[ $para == --workers* ]];then
+        workers=${para#*=}  # NOTE: parsed but not referenced again in this script
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    fi
+done
+
+# Check that data_path was passed; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be confing"
+    exit 1
+fi
+
+
+############### Resolve the directory the training script runs from ###############
+# cd to the directory that holds the test folder before running, for compatibility; test_path_dir is the path that includes the test folder
+cur_path=$(pwd)
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=$(pwd)
+else
+    test_path_dir=${cur_path}/test
+fi
+
+
+################# Create the log output directory; no need to modify #################
+ASCEND_DEVICE_ID=0
+# Start from an empty directory; paths are quoted so that spaces or glob
+# characters in the working directory cannot redirect "rm -rf" elsewhere.
+if [ -d "${test_path_dir}/output/${ASCEND_DEVICE_ID}" ];then
+    rm -rf "${test_path_dir}/output/${ASCEND_DEVICE_ID}"
+fi
+mkdir -p "${test_path_dir}/output/${ASCEND_DEVICE_ID}"
+
+
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source the environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+fi
+# NOTE(review): main.py below receives "" as its first argument, not ${data_path} — confirm this is intended
+nohup python3.7 ${cur_path}/main.py \
+    "" \
+    --learning-rate=${learning_rate} \
+    --mom=0.9 \
+    --weight-decay=0.0005  \
+    --print-freq=1 \
+    --dist-backend 'hccl' \
+    --multiprocessing-distributed \
+    --world-size=1 \
+    --device='npu' \
+    --epochs=${train_epochs} \
+    --loss-scale=32 \
+    --amp \
+    --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Extract performance FPS; review and adjust per model
+FPS=`grep -a 'FPS'  ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Extract training accuracy; review and adjust per model
+train_accuracy=`grep -a '* Acc@1'  ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance monitoring summary
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data; no need to modify
+# Throughput
+ActualFPS=${FPS}
+# Training time per iteration in ms; stays empty, without an awk error, when FPS is missing or not positive
+TrainingTime=$(awk -v bs="${batch_size}" -v fps="${FPS}" 'BEGIN{ if (fps + 0 > 0) printf "%.2f\n", bs * 1000 / fps }')
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}'  ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Write key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" >  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/LeNet/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/LeNet/test/train_performance_1p.sh
new file mode 100644
index 0000000000..17b80a5222
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LeNet/test/train_performance_1p.sh
@@ -0,0 +1,143 @@
+#!/bin/bash
+
+################ Basic configuration; review and adjust per model ##################
+# Required fields (must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="LeNet_5_for_PyTorch"
+# Training batch_size
+batch_size=256
+# Number of NPU cards used for training
+export RANK_SIZE=1
+# Dataset path; keep empty, no need to modify
+data_path=""
+
+# Training epochs
+train_epochs=2
+# ID of the NPU device card used for training
+device_id=0
+
+
+# Argument parsing: data_path is mandatory, other arguments are model-specific; "$@" keeps each argument intact (no word splitting or globbing)
+for para in "$@"
+do
+    if [[ $para == --device_id* ]];then
+        device_id=${para#*=}
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    fi
+done
+
+# Check that data_path was passed; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be confing"
+    exit 1
+fi
+# Check that a device id is set: a pre-set ASCEND_DEVICE_ID wins, otherwise device_id is exported as ASCEND_DEVICE_ID; no need to modify
+if [ $ASCEND_DEVICE_ID ];then
+    echo "device id is ${ASCEND_DEVICE_ID}"
+elif [ ${device_id} ];then
+    export ASCEND_DEVICE_ID=${device_id}
+    echo "device id is ${ASCEND_DEVICE_ID}"
+else
+    echo "[Error] device id must be config"
+    exit 1
+fi
+
+
+
+############### Resolve the directory the training script runs from ###############
+# cd to the directory that holds the test folder before running, for compatibility; test_path_dir is the path that includes the test folder
+cur_path=$(pwd)
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=$(pwd)
+else
+    test_path_dir=${cur_path}/test
+fi
+
+
+################# Create the log output directory; no need to modify #################
+# Start from an empty directory; paths are quoted so that spaces or glob
+# characters in the working directory cannot redirect "rm -rf" elsewhere.
+if [ -d "${test_path_dir}/output/${ASCEND_DEVICE_ID}" ];then
+    rm -rf "${test_path_dir}/output/${ASCEND_DEVICE_ID}"
+fi
+mkdir -p "${test_path_dir}/output/${ASCEND_DEVICE_ID}"
+
+
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source the environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+fi
+# NOTE(review): main.py below receives "" as its first argument, not ${data_path} — confirm this is intended
+nohup python3.7 ${cur_path}/main.py \
+    "" \
+    --learning-rate=0.012 \
+    --momentum=0.9 \
+    --world-size=1 \
+    --weight-decay=0.0005 \
+    --print-freq=1 \
+    --device='npu' \
+    --gpu=${ASCEND_DEVICE_ID} \
+    --dist-backend 'hccl' \
+    --epochs=${train_epochs} \
+    --loss-scale=32 \
+    --amp \
+    --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Extract performance FPS; review and adjust per model
+FPS=`grep -a 'FPS'  ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Extract training accuracy; review and adjust per model
+train_accuracy=`grep -a '* Acc@1'  ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance monitoring summary
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+## Collect performance data; no need to modify
+# Throughput
+ActualFPS=${FPS}
+# Training time per iteration in ms; stays empty, without an awk error, when FPS is missing or not positive
+TrainingTime=$(awk -v bs="${batch_size}" -v fps="${FPS}" 'BEGIN{ if (fps + 0 > 0) printf "%.2f\n", bs * 1000 / fps }')
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}'  ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Write key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" >  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/LeNet/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/LeNet/test/train_performance_8p.sh
new file mode 100644
index 0000000000..34f1f91c26
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LeNet/test/train_performance_8p.sh
@@ -0,0 +1,136 @@
+#!/bin/bash
+
+################ Basic configuration; review and adjust per model ##################
+# Required fields (must be defined here): Network batch_size RANK_SIZE
+# Network name (the template expects it to match the directory name)
+Network="LeNet_5_for_PyTorch"
+# Training batch_size
+batch_size=2048
+# Number of NPU cards used for training
+export RANK_SIZE=8
+# Dataset path; leave empty here, it is filled from --data_path (do not modify)
+data_path=""
+
+# Number of training epochs
+train_epochs=2
+# Learning rate
+learning_rate=0.012
+
+
+# Argument parsing: data_path is mandatory, other options are model-specific and must have a default defined above. "$@" keeps each argument intact. NOTE: --workers is accepted but not forwarded to main.py below
+for para in "$@"
+do
+    if [[ $para == --workers* ]];then
+        workers=${para#*=}
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    fi
+done
+
+# Abort when data_path was not supplied (do not modify)
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be confing"
+    exit 1
+fi
+
+
+############### Select the directory the training script runs from ###############
+# Run from the directory that holds the test folder so the script can be started from either place; test_path_dir is the path of the test folder itself
+cur_path=$(pwd)
+cur_path_last_diename=${cur_path##*/}
+if [[ "${cur_path_last_diename}" != "test" ]]; then
+    test_path_dir=${cur_path}/test
+else
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=$(pwd)
+fi
+
+
+#################创建日志输出目录，不需要修改#################
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+fi
+
+
+################# Launch the training script #################
+# Training start time in epoch seconds (do not modify)
+start_time=$(date +%s)
+# Outside the platform (etp_running_flag is not "true"), source the NPU environment variables
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+fi
+python3.7 ./main.py \
+    "" \
+    --seed=49 \
+    --learning-rate=${learning_rate} \
+    --momentum=0.9 \
+    --weight-decay=0.0005  \
+    --print-freq=1 \
+    --dist-url='tcp://127.0.0.1:50000' \
+    --dist-backend 'hccl' \
+    --multiprocessing-distributed \
+    --world-size=1 \
+    --rank=0 \
+    --device='npu' \
+    --epochs=${train_epochs} \
+    --loss-scale=32 \
+    --amp \
+    --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+
+##################获取训练数据################
+#训练结束时间，不需要修改
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#结果打印，不需要修改
+echo "------------------ Final result ------------------"
+#输出性能FPS，需要模型审视修改
+FPS=`grep -a 'FPS'  ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'`
+#打印，不需要修改
+echo "Final Performance images/sec : $FPS"
+
+#输出训练精度,需要模型审视修改
+train_accuracy=`grep -a '* Acc@1'  ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'`
+#打印，不需要修改
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+#性能看护结果汇总
+#训练用例信息，不需要修改
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+##获取性能数据，不需要修改
+#吞吐量
+ActualFPS=${FPS}
+#单迭代训练时长
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中，需要根据模型审视
+grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#最后一个迭代loss值，不需要修改
+ActualLoss=`awk 'END {print}'  ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+#关键信息打印到${CaseName}.log中，不需要修改
+echo "Network = ${Network}" >  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >>  ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
-- 
Gitee


From a258bebac27786ca714ed0b60ea010ec89414e92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9F=A9=E6=97=AD?= <xuhan-cn@qq.com>
Date: Thu, 27 Oct 2022 08:23:52 +0000
Subject: [PATCH 2/2] update PyTorch/contrib/cv/classification/LeNet/README.md.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 韩旭 <xuhan-cn@qq.com>
---
 PyTorch/contrib/cv/classification/LeNet/README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/PyTorch/contrib/cv/classification/LeNet/README.md b/PyTorch/contrib/cv/classification/LeNet/README.md
index fa2ee7871e..e9c64b307a 100644
--- a/PyTorch/contrib/cv/classification/LeNet/README.md
+++ b/PyTorch/contrib/cv/classification/LeNet/README.md
@@ -85,6 +85,12 @@ LeNet_5模型选用的数据集为MNIST手写数据集，数据集将在模型
 2. 运行训练脚本。
 
    该模型支持单机单卡训练和单机8卡训练。
+   - 建立储存模型的文件夹
+
+   ```
+   mkdir checkpoint
+
+   ```
 
    - 单机单卡训练
 
-- 
Gitee