From fa00871daec5928344e06897836c4245d6bfd744 Mon Sep 17 00:00:00 2001 From: sysulyccc Date: Thu, 29 Sep 2022 14:57:35 +0800 Subject: [PATCH 1/2] =?UTF-8?q?[=E4=B8=AD=E5=B1=B1=E5=A4=A7=E5=AD=A6][?= =?UTF-8?q?=E9=AB=98=E6=A0=A1=E8=B4=A1=E7=8C=AE][Pytorch][Inception=5Fv2?= =?UTF-8?q?=5F231]--=E7=AC=AC=E4=B8=80=E6=AC=A1=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit update --- .../Inception_v2_231_for_Pytorch/Dockerfile | 5 + .../InceptionV2.py | 227 +++++++ .../Inception_v2_231_for_Pytorch/LICENSE | 16 + .../Inception_v2_231_for_Pytorch/README.md | 123 ++++ .../Inception_v2_231_for_Pytorch/demo.py | 141 ++++ .../docker_start.sh | 25 + .../Inception_v2_231_for_Pytorch/main-8p.py | 631 ++++++++++++++++++ .../Inception_v2_231_for_Pytorch/main.py | 588 ++++++++++++++++ .../modelzoo_level.txt | 5 + .../pthtar2onnx.py | 49 ++ .../requirements.txt | 1 + .../test/env_npu.sh | 67 ++ .../test/train_full_8p.sh | 167 +++++ .../test/train_performance_1p.sh | 171 +++++ .../test/train_performance_8p.sh | 169 +++++ 15 files changed, 2385 insertions(+) create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/Dockerfile create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/InceptionV2.py create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/LICENSE create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/demo.py create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/docker_start.sh create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/main-8p.py create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/main.py create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/modelzoo_level.txt create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/pthtar2onnx.py create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/requirements.txt create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/env_npu.sh create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_full_8p.sh create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_performance_1p.sh create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_performance_8p.sh diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/Dockerfile b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/Dockerfile new file mode 100644 index 0000000000..30a31af558 --- /dev/null +++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/Dockerfile @@ -0,0 +1,5 @@ +ARG FROM_IMAGE_NAME +FROM $FROM_IMAGE_NAME + +COPY requirements.txt . +RUN pip3.7 install -r requirements.txt \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/InceptionV2.py b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/InceptionV2.py new file mode 100644 index 0000000000..a5e069ff5c --- /dev/null +++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/InceptionV2.py @@ -0,0 +1,227 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +import torch +import torch.nn as nn +import torchvision + +def ConvBNReLU(in_channels,out_channels,kernel_size,stride=1,padding=0): + return nn.Sequential( + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride,padding=padding), + nn.BatchNorm2d(out_channels), + nn.ReLU6(inplace=True), + ) + +def ConvBNReLUFactorization(in_channels,out_channels,kernel_sizes,paddings): + return nn.Sequential( + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_sizes, stride=1,padding=paddings), + nn.BatchNorm2d(out_channels), + nn.ReLU6(inplace=True) + ) + +class InceptionV2ModuleA(nn.Module): + def __init__(self, in_channels,out_channels1,out_channels2reduce, out_channels2, out_channels3reduce, out_channels3, out_channels4): + super(InceptionV2ModuleA, self).__init__() + + self.branch1 = ConvBNReLU(in_channels=in_channels,out_channels=out_channels1,kernel_size=1) + + self.branch2 = nn.Sequential( + ConvBNReLU(in_channels=in_channels, out_channels=out_channels2reduce, kernel_size=1), + ConvBNReLU(in_channels=out_channels2reduce, out_channels=out_channels2, kernel_size=3, padding=1), + ) + + self.branch3 = nn.Sequential( + ConvBNReLU(in_channels=in_channels,out_channels=out_channels3reduce,kernel_size=1), + ConvBNReLU(in_channels=out_channels3reduce, out_channels=out_channels3, kernel_size=3, padding=1), + ConvBNReLU(in_channels=out_channels3, out_channels=out_channels3, kernel_size=3, padding=1), + ) + + self.branch4 = nn.Sequential( + nn.MaxPool2d(kernel_size=3, stride=1, padding=1), + ConvBNReLU(in_channels=in_channels, out_channels=out_channels4, kernel_size=1), + ) + + def forward(self, x): + out1 = self.branch1(x) + out2 = self.branch2(x) + out3 = self.branch3(x) + out4 = self.branch4(x) + out = torch.cat([out1, out2, out3, out4], dim=1) + return out + +class InceptionV2ModuleB(nn.Module): + def __init__(self, in_channels,out_channels1,out_channels2reduce, out_channels2, out_channels3reduce, out_channels3, out_channels4): + super(InceptionV2ModuleB, self).__init__() + + self.branch1 = ConvBNReLU(in_channels=in_channels,out_channels=out_channels1,kernel_size=1) + + self.branch2 = nn.Sequential( + ConvBNReLU(in_channels=in_channels, out_channels=out_channels2reduce, kernel_size=1), + ConvBNReLUFactorization(in_channels=out_channels2reduce, out_channels=out_channels2reduce, kernel_sizes=[1,3],paddings=[0,1]), + ConvBNReLUFactorization(in_channels=out_channels2reduce, out_channels=out_channels2, kernel_sizes=[3,1],paddings=[1, 0]), + ) + + self.branch3 = nn.Sequential( + ConvBNReLU(in_channels=in_channels,out_channels=out_channels3reduce,kernel_size=1), + ConvBNReLUFactorization(in_channels=out_channels3reduce, out_channels=out_channels3reduce,kernel_sizes=[1, 3], paddings=[0, 1]), + ConvBNReLUFactorization(in_channels=out_channels3reduce, 
out_channels=out_channels3reduce,kernel_sizes=[3, 1], paddings=[1, 0]), + ConvBNReLUFactorization(in_channels=out_channels3reduce, out_channels=out_channels3reduce, kernel_sizes=[1, 3], paddings=[0, 1]), + ConvBNReLUFactorization(in_channels=out_channels3reduce, out_channels=out_channels3,kernel_sizes=[3, 1], paddings=[1, 0]), + ) + + self.branch4 = nn.Sequential( + nn.MaxPool2d(kernel_size=3, stride=1, padding=1), + ConvBNReLU(in_channels=in_channels, out_channels=out_channels4, kernel_size=1), + ) + + def forward(self, x): + out1 = self.branch1(x) + out2 = self.branch2(x) + out3 = self.branch3(x) + out4 = self.branch4(x) + out = torch.cat([out1, out2, out3, out4], dim=1) + return out + +class InceptionV2ModuleC(nn.Module): + def __init__(self, in_channels,out_channels1,out_channels2reduce, out_channels2, out_channels3reduce, out_channels3, out_channels4): + super(InceptionV2ModuleC, self).__init__() + + self.branch1 = ConvBNReLU(in_channels=in_channels,out_channels=out_channels1,kernel_size=1) + + self.branch2_conv1 = ConvBNReLU(in_channels=in_channels, out_channels=out_channels2reduce, kernel_size=1) + self.branch2_conv2a = ConvBNReLUFactorization(in_channels=out_channels2reduce, out_channels=out_channels2, kernel_sizes=[1,3],paddings=[0,1]) + self.branch2_conv2b = ConvBNReLUFactorization(in_channels=out_channels2reduce, out_channels=out_channels2, kernel_sizes=[3,1],paddings=[1,0]) + + self.branch3_conv1 = ConvBNReLU(in_channels=in_channels,out_channels=out_channels3reduce,kernel_size=1) + self.branch3_conv2 = ConvBNReLU(in_channels=out_channels3reduce, out_channels=out_channels3, kernel_size=3,stride=1,padding=1) + self.branch3_conv3a = ConvBNReLUFactorization(in_channels=out_channels3, out_channels=out_channels3, kernel_sizes=[3, 1],paddings=[1, 0]) + self.branch3_conv3b = ConvBNReLUFactorization(in_channels=out_channels3, out_channels=out_channels3, kernel_sizes=[1, 3],paddings=[0, 1]) + + self.branch4 = nn.Sequential( + nn.MaxPool2d(kernel_size=3, stride=1, padding=1), + ConvBNReLU(in_channels=in_channels, out_channels=out_channels4, kernel_size=1), + ) + + def forward(self, x): + out1 = self.branch1(x) + x2 = self.branch2_conv1(x) + out2 = torch.cat([self.branch2_conv2a(x2), self.branch2_conv2b(x2)],dim=1) + x3 = self.branch3_conv2(self.branch3_conv1(x)) + out3 = torch.cat([self.branch3_conv3a(x3), self.branch3_conv3b(x3)], dim=1) + out4 = self.branch4(x) + out = torch.cat([out1, out2, out3, out4], dim=1) + return out + +class InceptionV3ModuleD(nn.Module): + def __init__(self, in_channels,out_channels1reduce,out_channels1,out_channels2reduce, out_channels2): + super(InceptionV3ModuleD, self).__init__() + + self.branch1 = nn.Sequential( + ConvBNReLU(in_channels=in_channels, out_channels=out_channels1reduce, kernel_size=1), + ConvBNReLU(in_channels=out_channels1reduce, out_channels=out_channels1, kernel_size=3,stride=2,padding=1) + ) + + self.branch2 = nn.Sequential( + ConvBNReLU(in_channels=in_channels, out_channels=out_channels2reduce, kernel_size=1), + ConvBNReLU(in_channels=out_channels2reduce, out_channels=out_channels2, kernel_size=3, stride=1, padding=1), + ConvBNReLU(in_channels=out_channels2, out_channels=out_channels2, kernel_size=3, stride=2,padding=1), + ) + + self.branch3 = nn.MaxPool2d(kernel_size=3,stride=2,padding=1) + + def forward(self, x): + out1 = self.branch1(x) + out2 = self.branch2(x) + out3 = self.branch3(x) + out = torch.cat([out1, out2, out3], dim=1) + return out + +class InceptionAux(nn.Module): + def __init__(self, in_channels,out_channels): + 
super(InceptionAux, self).__init__() + + self.auxiliary_avgpool = nn.AvgPool2d(kernel_size=5, stride=3) + self.auxiliary_conv1 = ConvBNReLU(in_channels=in_channels, out_channels=128, kernel_size=1) + self.auxiliary_conv2 = nn.Conv2d(in_channels=128, out_channels=768, kernel_size=5,stride=1) + self.auxiliary_dropout = nn.Dropout(p=0.7) + self.auxiliary_linear1 = nn.Linear(in_features=768, out_features=out_channels) + + def forward(self, x): + x = self.auxiliary_conv1(self.auxiliary_avgpool(x)) + x = self.auxiliary_conv2(x) + x = x.view(x.size(0), -1) + out = self.auxiliary_linear1(self.auxiliary_dropout(x)) + return out + +class InceptionV2(nn.Module): + def __init__(self, num_classes=1000, stage='train'): + super(InceptionV2, self).__init__() + self.stage = stage + + self.block1 = nn.Sequential( + ConvBNReLU(in_channels=3, out_channels=64, kernel_size=7,stride=2,padding=3), + nn.MaxPool2d(kernel_size=3,stride=2,padding=1), + ) + + self.block2 = nn.Sequential( + ConvBNReLU(in_channels=64, out_channels=192, kernel_size=3, stride=1, padding=1), + nn.MaxPool2d(kernel_size=3, stride=2,padding=1), + ) + + self.block3 = nn.Sequential( + InceptionV2ModuleA(in_channels=192,out_channels1=64,out_channels2reduce=64, out_channels2=64, out_channels3reduce=64, out_channels3=96, out_channels4=32), + InceptionV2ModuleA(in_channels=256, out_channels1=64, out_channels2reduce=64, out_channels2=96,out_channels3reduce=64, out_channels3=96, out_channels4=64), + InceptionV3ModuleD(in_channels=320, out_channels1reduce=128, out_channels1=160, out_channels2reduce=64,out_channels2=96), + ) + + self.block4 = nn.Sequential( + InceptionV2ModuleB(in_channels=576, out_channels1=224, out_channels2reduce=64, out_channels2=96,out_channels3reduce=96, out_channels3=128, out_channels4=128), + InceptionV2ModuleB(in_channels=576, out_channels1=192, out_channels2reduce=96, out_channels2=128,out_channels3reduce=96, out_channels3=128, out_channels4=128), + InceptionV2ModuleB(in_channels=576, out_channels1=160, out_channels2reduce=128, out_channels2=160,out_channels3reduce=128, out_channels3=128, out_channels4=128), + InceptionV2ModuleB(in_channels=576, out_channels1=96, out_channels2reduce=128, out_channels2=192,out_channels3reduce=160, out_channels3=160, out_channels4=128), + InceptionV3ModuleD(in_channels=576, out_channels1reduce=128, out_channels1=192, out_channels2reduce=192,out_channels2=256), + ) + + self.block5 = nn.Sequential( + InceptionV2ModuleC(in_channels=1024, out_channels1=352, out_channels2reduce=192, out_channels2=160,out_channels3reduce=160, out_channels3=112, out_channels4=128), + InceptionV2ModuleC(in_channels=1024, out_channels1=352, out_channels2reduce=192, out_channels2=160, + out_channels3reduce=192, out_channels3=112, out_channels4=128) + ) + + self.max_pool = nn.MaxPool2d(kernel_size=7, stride=1) + self.dropout = nn.Dropout(p=0.5) + self.linear = nn.Linear(1024, num_classes) + + def forward(self, x): + x = self.block1(x) + x = self.block2(x) + x = self.block3(x) + x = self.block4(x) + x = self.block5(x) + x = self.max_pool(x) + x = self.dropout(x) + x = x.view(x.size(0), -1) + out = self.linear(x) + return out + +if __name__=='__main__': + model = InceptionV2() + print(model) + + input = torch.randn(1, 3, 224, 224) + out = model(input) + print(out.shape) \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/LICENSE b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/LICENSE new file mode 100644 index 0000000000..82adefb928 --- /dev/null +++ 
b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/LICENSE
@@ -0,0 +1,16 @@
+# Copyright (c) Soumith Chintala 2016,
+# All rights reserved
+#
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://spdx.org/licenses/BSD-3-Clause.html
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md
new file mode 100644
index 0000000000..ebe02e8775
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md
@@ -0,0 +1,123 @@
+# Inception_v2_231
+- [Overview](#overview)
+- [Preparing the Training Environment](#preparing-the-training-environment)
+- [Starting Training](#starting-training)
+- [Training Results](#training-results)
+
+# Overview
+
+## Summary
+
+InceptionV2 builds on GoogLeNet by adding batch normalization layers and, borrowing the VGG idea, replaces a large convolution kernel with two small ones, which keeps the same receptive field while reducing the parameter count and strengthening the non-linear representation.
+
+- Reference implementation:
+
+  ```
+  https://github.com/shanglianlm0525/PyTorch-Networks/blob/master/ClassicNetwork/InceptionV2.py
+  ```
+
+- Implementation adapted to Ascend AI processors:
+
+  ```
+  url=https://gitee.com/ascend/ModelZoo-PyTorch.git
+  code_path=PyTorch/contrib/cv/classification
+  ```
+
+- Obtain the code through Git as follows:
+
+  ```
+  git clone {url}        # clone the repository
+  cd {code_path}         # switch to the model code path; skip this if the repository contains only this model
+  ```
+
+- Or click "Download Now" to download the source package.
+
+# Preparing the Training Environment
+
+## Environment Setup
+
+- The firmware/driver, CANN, and PyTorch versions supported by this model are listed in the table below.
+
+  **Table 1** Version compatibility
+
+  | Component         | Version                                                      |
+  | ----------------- | ------------------------------------------------------------ |
+  | Firmware & driver | [1.0.9](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) |
+  | CANN              | [3.2.1](https://www.hiascend.com/software/cann/commercial?version=3.2.1) |
+  | PyTorch           | [1.5.0](https://gitee.com/ascend/pytorch/tree/v1.5.0/)       |
+
+- Environment setup guide.
+
+  See "[Preparing a PyTorch Training Environment](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)".
+
+- Install the dependencies.
+
+  ```
+  pip install -r requirements.txt
+  ```
+
+
+## Preparing the Dataset
+
+1. Obtain the dataset.
+
+   Obtain the raw dataset yourself; usable open-source datasets include ImageNet2012 and CIFAR-10. Upload the dataset to any path on the server and extract it.
+
+   Taking ImageNet2012 as an example, the dataset directory is structured as follows:
+
+   ```
+   ├── ImageNet2012
+      ├── train
+           ├── class1
+                │── image1
+                │── image2
+                │   ...
+           ├── class2
+                │── image1
+                │── image2
+                │   ...
+           ├── ...
+      ├── val
+           ├── class1
+                │── image1
+                │── image2
+                │   ...
+           ├── class2
+                │── image1
+                │── image2
+                │   ...
+   ```
+
+   > **Note:**
+   > The dataset path is whatever path you defined when uploading it.
+
+# Starting Training
+
+## Training the Model
+```bash
+# performance training 1p
+bash ./test/train_performance_1p.sh --data_path=/opt/npu/dataset/imagenet
+
+# performance training 8p
+bash ./test/train_performance_8p.sh --data_path=/opt/npu/dataset/imagenet
+
+# full training 1p
+bash ./test/train_full_1p.sh --data_path=/opt/npu/dataset/imagenet
+
+# full training 8p
+bash ./test/train_full_8p.sh --data_path=/opt/npu/dataset/imagenet
+
+# eval
+bash ./test/eval_8p.sh --data_path=/opt/npu/dataset/imagenet
+```
+
+# Training Results
+
+**Table 2** Training results
+
+| NAME | LOSS | FPS | Epochs | AMP_Type | ACC@1 |
+| :----: | :----: | :-----: | :----: | :------: | :----: |
+| 1p-GPU | 7.5401 | 793.85 | 1 | - | - |
+| 1p-NPU | 7.6044 | 942.15 | 1 | O1 | - |
+| 8p-GPU | 2.3929 | 5752.80 | 100 | - | 68.654 |
+| 8p-NPU | 2.1992 | 6395.45 | 100 | O1 | 68.663 |
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/demo.py b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/demo.py
new file mode 100644
index 0000000000..bd832f9ace
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/demo.py
@@ -0,0 +1,141 @@
+# Copyright (c) Soumith Chintala 2016,
+# All rights reserved
+#
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://spdx.org/licenses/BSD-3-Clause.html
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
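The README's factorization claim above — two stacked 3×3 convolutions cover the same 5×5 receptive field with fewer parameters — is easy to verify numerically. A minimal sketch in plain PyTorch (the 64-channel width is an arbitrary illustration, not a value taken from the model):

```python
import torch.nn as nn

# A single 5x5 convolution versus two stacked 3x3 convolutions
# at the same channel width.
five_by_five = nn.Conv2d(64, 64, kernel_size=5, padding=2)
stacked_3x3 = nn.Sequential(
    nn.Conv2d(64, 64, kernel_size=3, padding=1),
    nn.Conv2d(64, 64, kernel_size=3, padding=1),
)

def n_params(m):
    return sum(p.numel() for p in m.parameters())

print(n_params(five_by_five))  # 102464
print(n_params(stacked_3x3))   # 73856 -- same receptive field, ~28% fewer
```

The extra BN/ReLU stage between the two small kernels is what the README means by strengthened non-linear representation.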
+# -*- coding: utf-8 -*-
+
+
+import os
+
+import torch
+import numpy as np
+from InceptionV2 import InceptionV2
+import argparse
+from apex import amp
+import apex
+import torch.distributed as dist
+parser = argparse.ArgumentParser(description='InceptionV2 demo')
+parser.add_argument('--device', default='npu', type=str,
+                    help='npu or gpu')
+
+parser.add_argument('--device-list', default='0,1,2,3,4,5,6,7', type=str, help='device id list')
+parser.add_argument('--dist-backend', default='hccl', type=str,
+                    help='distributed backend')
+parser.add_argument('--addr', default='', type=str,
+                    help='master addr')
+
+'''
+print("=> loading checkpoint '{}'".format(args.resume))
+checkpoint = torch.load(args.resume, map_location=loc)
+args.start_epoch = checkpoint['epoch']
+best_acc1 = checkpoint['best_acc1']
+model.load_state_dict(checkpoint['state_dict'])
+optimizer.load_state_dict(checkpoint['optimizer'])
+if args.amp:
+    amp.load_state_dict(checkpoint['amp'])
+print("=> loaded checkpoint '{}' (epoch {})"
+      .format(args.resume, checkpoint['epoch']))
+'''
+def device_id_to_process_device_map(device_list):
+    devices = device_list.split(",")
+    devices = [int(x) for x in devices]
+    devices.sort()
+
+    process_device_map = dict()
+    for process_id, device_id in enumerate(devices):
+        process_device_map[process_id] = device_id
+
+    return process_device_map
+
+
+def build_model():
+    global loc
+    # Define your own model here and load the pretrained weights
+    args = parser.parse_args()
+    args.process_device_map = device_id_to_process_device_map(args.device_list)
+    os.environ['MASTER_ADDR'] = args.addr
+    os.environ['MASTER_PORT'] = '29688'
+    ngpus_per_node = len(args.process_device_map)
+
+    dist.init_process_group(backend=args.dist_backend,  # init_method=args.dist_url,
+                            world_size=1, rank=0)
+
+    args.gpu = args.process_device_map[0]
+    loc = 'npu:{}'.format(args.gpu)
+    torch.npu.set_device(loc)
+
+    model = InceptionV2().to(loc)
+    optimizer = apex.optimizers.NpuFusedSGD(model.parameters(), 0.8,
+                                            momentum=0.9,
+                                            weight_decay=1.0e-04)
+    model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=1024)
+    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False)
+    checkpoint = torch.load('./checkpoint.pth.tar')
+    model.load_state_dict(checkpoint['state_dict'])
+    model.eval()  # remember to switch to eval mode
+    return model
+
+
+def get_raw_data():
+    # Fetch data in whatever way suits you; do not upload raw data to the code repository
+    from PIL import Image
+    from urllib.request import urlretrieve
+    IMAGE_URL = 'https://bbs-img.huaweicloud.com/blogs/img/thumb/1591951315139_8989_1363.png'
+    urlretrieve(IMAGE_URL, 'tmp.jpg')
+    img = Image.open("tmp.jpg")
+    img = img.convert('RGB')
+    return img
+
+
+def pre_process(raw_data):
+    # Define your own preprocessing here; Resize(342)/CenterCrop(224) matches
+    # the validation transforms this model uses in main-8p.py
+    from torchvision import transforms
+    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                     std=[0.229, 0.224, 0.225])
+    transforms_list = transforms.Compose([
+        transforms.Resize(342),
+        transforms.CenterCrop(224),
+        transforms.ToTensor(),
+        normalize
+    ])
+    input_data = transforms_list(raw_data)
+    return input_data.unsqueeze(0)
+
+
+def post_process(output_tensor):
+    # Define your own post-processing here
+    print(output_tensor)
+    return torch.argmax(output_tensor, 1)
+
+
+if __name__ == '__main__':
+    # 1. Fetch the raw data
+    raw_data = get_raw_data()
+
+    # 2. Build the model
+    model = build_model()
+
+    # 3. Preprocess
+    input_tensor = pre_process(raw_data)
+
+    # 4. Run the forward pass
+    output_tensor = model(input_tensor.to(loc))
+
+    # 5. Post-process
+    result = post_process(output_tensor)
+
+    # 6. Print the result
+    print(result)
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/docker_start.sh b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/docker_start.sh
new file mode 100644
index 0000000000..944bca3cda
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/docker_start.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+docker_image=$1
+data_dir=$2
+model_dir=$3
+
+docker run -it --ipc=host \
+        --device=/dev/davinci0 \
+        --device=/dev/davinci1 \
+        --device=/dev/davinci2 \
+        --device=/dev/davinci3 \
+        --device=/dev/davinci4 \
+        --device=/dev/davinci5 \
+        --device=/dev/davinci6 \
+        --device=/dev/davinci7 \
+        --device=/dev/davinci_manager \
+        --device=/dev/devmm_svm --device=/dev/hisi_hdc \
+        -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
+        -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
+        -v ${model_dir}:${model_dir} \
+        -v ${data_dir}:${data_dir} \
+        -v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
+        -v /var/log/npu/slog/:/var/log/npu/slog -v /var/log/npu/profiling/:/var/log/npu/profiling \
+        -v /var/log/npu/dump/:/var/log/npu/dump -v /var/log/npu/:/usr/slog ${docker_image} \
+        /bin/bash
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/main-8p.py b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/main-8p.py
new file mode 100644
index 0000000000..22ba96dd09
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/main-8p.py
@@ -0,0 +1,631 @@
+# BSD 3-Clause License
+
+# Copyright (c) Soumith Chintala 2016,
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
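Both demo.py above and the training scripts below translate the `--device-list` string into a rank-to-device mapping before spawning one process per NPU. A self-contained sketch of that mapping logic, with the same behavior as the `device_id_to_process_device_map` helpers in this patch:

```python
def device_id_to_process_device_map(device_list):
    # "0,3,5" -> {0: 0, 1: 3, 2: 5}: spawned process rank i is pinned to
    # the i-th device id after sorting, so ranks stay dense even when the
    # device ids themselves are not.
    devices = sorted(int(x) for x in device_list.split(","))
    return {rank: device for rank, device in enumerate(devices)}

print(device_id_to_process_device_map("0,3,5"))  # {0: 0, 1: 3, 2: 5}
```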
+ +import argparse +import os +import random +import shutil +import time +import warnings + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.optim +import torch.multiprocessing as mp +import torch.utils.data +import torch.utils.data.distributed +import torchvision.transforms as transforms +import torchvision.datasets as datasets +import torchvision.models as models +from InceptionV2 import InceptionV2 + +from apex import amp +import apex +import numpy as np +from apex.optimizers import NpuFusedSGD + +if torch.__version__ >= '1.8': + import torch_npu + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('--data', metavar='DIR', default='/opt/npu/dataset/imagenet', + help='path to dataset') +parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50') +parser.add_argument('-j', '--workers', default=32, type=int, metavar='N', + help='number of data loading workers (default: 4)') +parser.add_argument('--epochs', default=90, type=int, metavar='N', + help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=512, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, + metavar='LR', help='initial learning rate', dest='lr') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') +parser.add_argument('--workspace', type=str, default='./', metavar='DIR', + help='path to directory where checkpoints will be stored') +parser.add_argument('-p', '--print-freq', default=10, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('-ef', '--eval-freq', default=5, type=int, + metavar='N', help='evaluate frequency (default: 5)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', + help='evaluate model on validation set') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--fine-tuning', action='store_true', + help='use fine-tuning model') +parser.add_argument('--world-size', default=-1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=-1, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-backend', default='nccl', type=str, + help='distributed backend') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. ') +parser.add_argument('--npu', default=None, type=int, + help='npu id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. 
This is the '
+                         'fastest way to use PyTorch for either single node or '
+                         'multi node data parallel training')
+parser.add_argument('-bm', '--benchmark', default=0, type=int,
+                    metavar='N', help='set benchmark status (default: 0)')
+parser.add_argument('--device', default='npu', type=str,
+                    help='npu or gpu')
+parser.add_argument('--addr', default='', type=str,
+                    help='master addr')
+parser.add_argument('--checkpoint-nameprefix', default='checkpoint', type=str,
+                    help='checkpoint-nameprefix')
+parser.add_argument('--checkpoint-freq', default=0, type=int,
+                    metavar='N', help='checkpoint frequency (default: 0); '
+                                      '0: overwrite a single checkpoint file each epoch; '
+                                      'n: save a separate file every n epochs; '
+                                      '-1: no checkpoint (not supported)')
+parser.add_argument('--device-list', default='0,1,2,3,4,5,6,7', type=str, help='device id list')
+# apex
+parser.add_argument('--amp', default=False, action='store_true',
+                    help='use amp to train the model')
+parser.add_argument('--loss-scale', default=1024., type=float,
+                    help='loss scale used in amp, default -1 means dynamic')
+parser.add_argument('--opt-level', default='O1', type=str,
+                    help='opt level used in amp (O1 or O2)')
+
+parser.add_argument('--label-smoothing',
+                    default=0.0,
+                    type=float,
+                    metavar='S',
+                    help='label smoothing')
+parser.add_argument('--warm_up_epochs', default=0, type=int,
+                    help='warm up')
+
+
+warnings.filterwarnings('ignore')
+best_acc1 = 0
+
+
+def device_id_to_process_device_map(device_list):
+    devices = device_list.split(",")
+    devices = [int(x) for x in devices]
+    devices.sort()
+
+    process_device_map = dict()
+    for process_id, device_id in enumerate(devices):
+        process_device_map[process_id] = device_id
+
+    return process_device_map
+
+
+def main():
+    args = parser.parse_args()
+    print("===============main()=================")
+    print(args)
+    print("===============main()=================")
+
+    if args.seed is not None:
+        random.seed(args.seed)
+        torch.manual_seed(args.seed)
+        cudnn.deterministic = True
+        warnings.warn('You have chosen to seed training. '
+                      'This will turn on the CUDNN deterministic setting, '
+                      'which can slow down your training considerably! '
+                      'You may see unexpected behavior when restarting '
+                      'from checkpoints.')
+
+    os.environ['MASTER_ADDR'] = args.addr
+    os.environ['MASTER_PORT'] = '29688'
+
+    if args.npu is not None:
+        warnings.warn('You have chosen a specific GPU.
This will completely ' + 'disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + args.process_device_map = device_id_to_process_device_map(args.device_list) + + if args.device == 'npu': + ngpus_per_node = len(args.process_device_map) + else: + ngpus_per_node = torch.cuda.device_count() + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + # The child process uses the environment variables of the parent process, + # we have to set KERNEL_NAME_ID for every proc + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + + else: + # Simply call main_worker function + main_worker(args.npu, ngpus_per_node, args) + + +def main_worker(npu, ngpus_per_node, args): + global best_acc1 + args.npu = args.process_device_map[npu] + + if args.npu is not None: + print("[npu id:", args.npu, "]", "Use npu: {} for training".format(args.npu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + npu + + if args.device == 'npu': + dist.init_process_group(backend=args.dist_backend, # init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + else: + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + + loc = 'npu:{}'.format(args.npu) + torch.npu.set_device(loc) + + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + #model = InceptionV2(pretrained=True) + model = InceptionV2() + print("Load my train models...") + pretrained_dict = \ + torch.load("/home/Inception/model_best.pthtar", map_location="cpu")["state_dict"] + model.load_state_dict(pretrained_dict, strict=False) + else: + print("=> creating model '{}'".format(args.arch)) + model = InceptionV2() + + if args.fine_tuning: + print("=> transfer-learning mode + fine-tuning (train only the last FC layer)") + for param in model.parameters(): + param.requires_grad = False + if args.arch == 'inception_v2': + model.classifier = nn.Linear(1024, 1000) + parameters = model.classifier.parameters() + else: + print("Error:Fine-tuning is not supported on this architecture") + exit(-1) + else: + parameters = model.parameters() + + args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + + # Data loading code + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + else: + train_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, 
shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=False, sampler=train_sampler, drop_last=True) + + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder(valdir, transforms.Compose([ + transforms.Resize(342), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])), + batch_size=args.batch_size, shuffle=True, + num_workers=args.workers, pin_memory=False, drop_last=True) + + # create model + model = model.to(loc) + + # define loss function (criterion) and optimizer + + loss = nn.CrossEntropyLoss().to(loc) + if args.label_smoothing > 0.0: + loss = lambda: LabelSmoothing(loc, args.label_smoothing) + criterion = loss().to(loc) + + #optimizer = torch.optim.SGD(model.parameters(), args.lr, + #momentum=args.momentum, + #weight_decay=args.weight_decay) + optimizer = NpuFusedSGD( + model.parameters(), + lr=args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay, + ) + if args.amp: + model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale, combine_grad=True) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.npu], broadcast_buffers=False) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + if args.amp: + amp.load_state_dict(checkpoint['amp']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + + if args.evaluate: + validate(val_loader, model, criterion, args, ngpus_per_node) + return + + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + train_sampler.set_epoch(epoch) + lr = adjust_learning_rate(optimizer, epoch, args) + + steps_per_epoch = len(train_loader) + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + print("=> Epoch[%d] Setting lr: %.4f" % (epoch, lr)) + + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node) + # evaluate on validation set + acc1 = validate(val_loader, model, criterion, args, ngpus_per_node) + + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + if (epoch <= 80 and epoch % 25 == 0 ) : + if args.amp: + save_checkpoint_v1({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer': optimizer.state_dict(), + 'amp': amp.state_dict(), + },is_best) + else: + save_checkpoint_v1({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer': optimizer.state_dict(), + },is_best) + elif (epoch > 80 and epoch <= 100): + if args.amp: + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer': optimizer.state_dict(), + 'amp': amp.state_dict(), + }, is_best) + else: + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 
'best_acc1': best_acc1, + 'optimizer': optimizer.state_dict(), + }, is_best) + else: + print("Modify the number of epoches so that the total number of saved models does not exceed 20!") + + +def train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node): + batch_time = AverageMeter('Time', ':6.3f') + data_time = AverageMeter('Data', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch)) + + # switch to train mode + model.train() + end = time.time() + loc = 'npu:{}'.format(args.npu) + + # steps_per_epoch = len(train_loader) + steps_per_epoch = len(train_loader) + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + print('==========step per epoch======================', steps_per_epoch) + + for i, (images, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + target = target.to(torch.int32) + images, target = images.to(loc, non_blocking=False), target.to(loc, non_blocking=False) + + # compute output + loss,output = get_loss(model, target, images, criterion) + stream = torch.npu.current_stream() + stream.synchronize() + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + stream = torch.npu.current_stream() + stream.synchronize() + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + + if args.amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + stream = torch.npu.current_stream() + stream.synchronize() + + optimizer.step() + stream = torch.npu.current_stream() + stream.synchronize() + + optimizer.zero_grad() + stream = torch.npu.current_stream() + stream.synchronize() + + if i % args.print_freq == 0: + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + progress.display(i) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + print("[npu id:", args.npu, "]", '* FPS@all {:.3f}'.format(ngpus_per_node * args.batch_size / batch_time.avg)) + + +def validate(val_loader, model, criterion, args, ngpus_per_node): + batch_time = AverageMeter('Time', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(val_loader), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + loc = 'npu:{}'.format(args.npu) + end = time.time() + for i, (images, target) in enumerate(val_loader): + target = target.to(torch.int32) + images, target = images.to(loc, non_blocking=False), target.to(loc, non_blocking=False) + + # compute output + output = model(images) + loss = criterion(output, target) + stream = torch.npu.current_stream() + stream.synchronize() + + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + stream = torch.npu.current_stream() + stream.synchronize() + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], 
images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + progress.display(i) + + # TODO: this should also be done with the ProgressMeter + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + print("[npu id:", args.npu, "]", '[AVG-ACC] * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' + .format(top1=top1, top5=top5)) + + return top1.avg + +def get_loss(model, target, images, criterion): + output = model(images) + loss = criterion(output, target) + #loss2 = criterion(aux1, target) + # According to the paper BN auxiliary classifier + #loss = loss1 + 0.4*loss2 + return loss, output + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best_acc%.4f_epoch%d.pth.tar' % (state['best_acc1'], state['epoch'])) + +def save_checkpoint_v1(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + shutil.copyfile(filename, 'model_best_acc%.4f_epoch%d.pth.tar' % (state['best_acc1'], state['epoch'])) + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self, name, fmt=':f'): + self.name = name + self.fmt = fmt + self.reset() + self.start_count_index = 10 + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.count += n + if self.count > (self.start_count_index * n): + self.sum += val * n + self.avg = self.sum / (self.count - self.start_count_index * n) + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print("[npu id:", '0', "]", '\t'.join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + + +def adjust_learning_rate(optimizer, epoch, args): + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + if args.warm_up_epochs > 0 and epoch < args.warm_up_epochs: + lr = args.lr * ((epoch + 1) / (args.warm_up_epochs + 1)) + else: + alpha = 0 + cosine_decay = 0.5 * ( + 1 + np.cos(np.pi * (epoch - args.warm_up_epochs) / (args.epochs - args.warm_up_epochs))) + decayed = (1 - alpha) * cosine_decay + alpha + lr = args.lr * decayed + + for param_group in optimizer.param_groups: + param_group['lr'] = lr + return lr + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + +class LabelSmoothing(nn.Module): + """ + NLL loss with 
label smoothing. + """ + def __init__(self, loc, smoothing=0.0): + """ + Constructor for the LabelSmoothing module. + + :param smoothing: label smoothing factor + """ + super(LabelSmoothing, self).__init__() + self.confidence = 1.0 - smoothing + self.smoothing = smoothing + self.device = loc + + def forward(self, x, target): + target = target.to(torch.int64) + + logprobs = torch.nn.functional.log_softmax(x, dim=-1) + nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1).to(torch.int64)) + nll_loss = nll_loss.squeeze(1) + smooth_loss = -logprobs.mean(dim=-1) + loss = self.confidence * nll_loss + self.smoothing * smooth_loss + return loss.mean() + +if __name__ == '__main__': + main() diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/main.py b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/main.py new file mode 100644 index 0000000000..8726453f24 --- /dev/null +++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/main.py @@ -0,0 +1,588 @@ +# BSD 3-Clause License + +# Copyright (c) Soumith Chintala 2016, +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import yaml +import os +import random +import shutil +import time +import warnings + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.optim +import torch.multiprocessing as mp +import torch.utils.data +import torch.utils.data.distributed +import torchvision.transforms as transforms +import torchvision.datasets as datasets +import torch.npu +from InceptionV2 import InceptionV2 +import apex +from apex import amp +warnings.filterwarnings('ignore') +CALCULATE_DEVICE = "npu:0" + +if torch.__version__ >= '1.8': + import torch_npu + + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('--data', metavar='DIR', + help='path to dataset') +parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', + help='model architecture') +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 4)') +parser.add_argument('--epochs', default=90, type=int, metavar='N', + help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, + metavar='LR', help='initial learning rate', dest='lr') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + 
dest='weight_decay') +parser.add_argument('-p', '--print-freq', default=10, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', + help='evaluate model on validation set') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--world-size', default=-1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=-1, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-backend', default='nccl', type=str, + help='distributed backend') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. ') +parser.add_argument('--npu', default=None, type=int, + help='NPU id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') + +parser.add_argument('--optimizer-batch-size', + default=-1, + type=int, + metavar='N', + help= + 'size of a total batch size, for simulating bigger batches using gradient accumulation') + + +parser.add_argument('--gpu', + default=None, + type=int, + help='GPU id to use.') + +parser.add_argument('--warmup', + default=0, + type=int, + metavar='E', + help='number of warmup epochs') +parser.add_argument('--label-smoothing', + default=0.0, + type=float, + metavar='S', + help='label smoothing') + +parser.add_argument('--static-loss-scale', + type=float, + default=1, + help= + 'Static loss scale, positive power of 2 values can improve fp16 convergence.') +parser.add_argument('-t', + '--fine-tuning', + action='store_true', + help='transfer learning + fine tuning - train only the last FC layer.') + +parser.add_argument('--amp', action='store_true', help='use apex') +parser.add_argument('--pm', '--precision-mode', default='O1', type=str, + help='precision mode to use for mix precision, only support O1, O2') +parser.add_argument('--loss_scale', default=1024, type=int, help='loss_scale for amp') + +best_acc1 = 0 + + +def main(): + args = parser.parse_args() + print("=======================") + print(args) + print("=======================") + if args.npu is None: + args.npu = 0 + global CALCULATE_DEVICE + CALCULATE_DEVICE = "npu:{}".format(args.npu) + torch.npu.set_device(CALCULATE_DEVICE) + print("use ", CALCULATE_DEVICE) + + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! ' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.npu is not None: + warnings.warn('You have chosen a specific NPU. 
This will completely ' + 'disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + ngpus_per_node = torch.npu.device_count() + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(args.npu, ngpus_per_node, args) + + +def main_worker(npu, ngpus_per_node, args): + global best_acc1 + args.npu = npu + + if args.npu is not None: + print("Use NPU: {} for training".format(args.npu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + npu + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + # create model + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + #model = models.__dict__[args.arch](pretrained=True) + model = InceptionV2() + print("Load my train models...") + pretrained_dict = \ + torch.load("/home/Inception/model_best.pth.tar", map_location="cpu")["state_dict"] + model.load_state_dict(pretrained_dict, strict=False) + else: + print("=> creating model '{}'".format(args.arch)) + model = InceptionV2() + #model = models.__dict__[args.arch]() + + if args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. 
+ if args.npu is not None: + loc = 'npu:{}'.format(args.npu) + torch.npu.set_device(loc) + model.to(loc) + # When using a single NPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of NPUs we have + args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.npu]) + else: + model.cuda() + # DistributedDataParallel will divide and allocate batch_size to all + # available GPUs if device_ids are not set + model = torch.nn.parallel.DistributedDataParallel(model) + elif args.npu is not None: + loc = 'npu:{}'.format(args.npu) + torch.npu.set_device(loc) + model = model.to(loc) + else: + # DataParallel will divide and allocate batch_size to all available GPUs + if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + model.features = torch.nn.DataParallel(model.features) + model.cuda() + else: + model = model.to(CALCULATE_DEVICE) + #model = torch.nn.DataParallel(model).cuda() + + # define loss function (criterion) and optimizer + loss = nn.CrossEntropyLoss() + if args.label_smoothing > 0.0: + loss = lambda: LabelSmoothing(args.label_smoothing) + criterion = loss().to(CALCULATE_DEVICE) + optimizer = apex.optimizers.NpuFusedSGD(model.parameters(), args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + if args.amp: + print("=> use amp...") + if args.pm not in ['O1', 'O2']: + print('=>unsupported precision mode!') + exit() + opt_level = args.pm + model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level, loss_scale=args.loss_scale, combine_grad=True) + + ''' + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + if args.amp: + amp.load_state_dict(checkpoint['amp']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + ''' + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + if args.npu is None: + checkpoint = torch.load(args.resume) + else: + # Map model to be loaded to specified single npu. 
+    # optionally resume from a checkpoint
+    if args.resume:
+        if os.path.isfile(args.resume):
+            print("=> loading checkpoint '{}'".format(args.resume))
+            if args.npu is None:
+                checkpoint = torch.load(args.resume)
+            else:
+                # Map the checkpoint to the specified single NPU.
+                loc = 'npu:{}'.format(args.npu)
+                checkpoint = torch.load(args.resume, map_location=loc)
+            args.start_epoch = checkpoint['epoch']
+            best_acc1 = checkpoint['best_acc1']
+            if args.npu is not None:
+                # best_acc1 may be from a checkpoint saved on a different NPU
+                best_acc1 = best_acc1.to(loc)
+            if args.amp:
+                amp.load_state_dict(checkpoint['amp'])
+            model.load_state_dict(checkpoint['state_dict'])
+            optimizer.load_state_dict(checkpoint['optimizer'])
+            print("=> loaded checkpoint '{}' (epoch {})"
+                  .format(args.resume, checkpoint['epoch']))
+        else:
+            print("=> no checkpoint found at '{}'".format(args.resume))
+
+    cudnn.benchmark = True
+
+    # Data loading code
+    traindir = os.path.join(args.data, 'train')
+    valdir = os.path.join(args.data, 'val')
+    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
+                                     std=[0.5, 0.5, 0.5])
+
+    train_dataset = datasets.ImageFolder(
+        traindir,
+        transforms.Compose([
+            transforms.RandomResizedCrop(224),
+            transforms.RandomHorizontalFlip(),
+            transforms.ToTensor(),
+            normalize,
+        ]))
+
+    if args.distributed:
+        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+    else:
+        train_sampler = None
+
+    train_loader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
+        num_workers=args.workers, pin_memory=False, sampler=train_sampler)
+
+    val_loader = torch.utils.data.DataLoader(
+        datasets.ImageFolder(valdir, transforms.Compose([
+            transforms.Resize(341),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            normalize,
+        ])),
+        batch_size=args.batch_size, shuffle=False,
+        num_workers=args.workers, pin_memory=False)
+
+    if args.evaluate:
+        validate(val_loader, model, criterion, args)
+        return
+
+    for epoch in range(args.start_epoch, args.epochs):
+        if args.distributed:
+            train_sampler.set_epoch(epoch)
+        adjust_learning_rate(optimizer, epoch, args)
+
+        # train for one epoch
+        train(train_loader, model, criterion, optimizer, epoch, args)
+
+        # evaluate on validation set
+        acc1 = validate(val_loader, model, criterion, args)
+
+        # remember best acc@1 and save checkpoint
+        is_best = acc1 > best_acc1
+        best_acc1 = max(acc1, best_acc1)
+
+        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
+                and args.rank % ngpus_per_node == 0):
+            state = {
+                'epoch': epoch + 1,
+                'arch': args.arch,
+                'state_dict': model.state_dict(),
+                'best_acc1': best_acc1,
+                'optimizer': optimizer.state_dict(),
+            }
+            if args.amp:
+                state['amp'] = amp.state_dict()
+            save_checkpoint(state, is_best)
+
+
+def train(train_loader, model, criterion, optimizer, epoch, args):
+    batch_time = AverageMeter('Time', ':6.3f')
+    data_time = AverageMeter('Data', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(
+        len(train_loader),
+        [batch_time, data_time, losses, top1, top5],
+        prefix="Epoch: [{}]".format(epoch))
+
+    # switch to train mode
+    model.train()
+
+    # NOTE: the no-op statement guarded by "if n == 201" is intentional; the
+    # test/train_performance_*.sh scripts rewrite it via sed into an early
+    # loop exit so that performance runs stop after roughly 200 steps.
+    n = 0
+    end = time.time()
+    loc = 'npu:{}'.format(args.npu)
+    for i, (images, target) in enumerate(train_loader):
+        if n == 201:
+            pass
+        n = n + 1
+        # measure data loading time
+        data_time.update(time.time() - end)
+
+        images = images.to(loc, non_blocking=True)
+        target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True)
+
+        # compute output
+        loss, output = get_loss(model, target, images, criterion)
+        # measure accuracy and record loss
+        acc1, acc5 = accuracy(output, target, topk=(1, 5))
+        losses.update(loss.item(), images.size(0))
+        top1.update(acc1[0], images.size(0))
+        top5.update(acc5[0], images.size(0))
+
+        # compute gradient and do SGD step
+        if args.amp:
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+        else:
+            loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
+
+        if i % args.print_freq == 0:
+            progress.display(i)
+            print("batch_size:", args.batch_size, 'Time: {:.3f}'.format(batch_time.avg),
+                  '* FPS@all {:.3f}'.format(args.batch_size / batch_time.avg))
+
+
+def validate(val_loader, model, criterion, args):
+    batch_time = AverageMeter('Time', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(
+        len(val_loader),
+        [batch_time, losses, top1, top5],
+        prefix='Test: ')
+
+    # switch to evaluate mode
+    model.eval()
+    with torch.no_grad():
+        end = time.time()
+        for i, (images, target) in enumerate(val_loader):
+            images = images.to(CALCULATE_DEVICE, non_blocking=False)
+            target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=False)
+            # compute output
+            output = model(images)
+            loss = criterion(output, target)
+            # measure accuracy and record loss
+            acc1, acc5 = accuracy(output, target, topk=(1, 5))
+            losses.update(loss.item(), images.size(0))
+            top1.update(acc1[0], images.size(0))
+            top5.update(acc5[0], images.size(0))
+
+            # measure elapsed time
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+            if i % args.print_freq == 0:
+                progress.display(i)
+
+        # TODO: this should also be done with the ProgressMeter
+        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
+              .format(top1=top1, top5=top5))
+
+    return top1.avg
+
+
+def get_loss(model, target, images, criterion):
+    output = model(images)
+    loss = criterion(output, target)
+    # The BN-auxiliary classifier loss from the paper (loss1 + 0.4 * loss2)
+    # is not used in this implementation.
+    return loss, output
+
+
+def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
+    torch.save(state, filename)
+    if is_best:
+        shutil.copyfile(filename, 'model_best.pth.tar')
+
+
+class AverageMeter(object):
+    """Computes and stores the average and current value"""
+    def __init__(self, name, fmt=':f'):
+        self.name = name
+        self.fmt = fmt
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+    def __str__(self):
+        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
+        return fmtstr.format(**self.__dict__)
+
+
+class ProgressMeter(object):
+    def __init__(self, num_batches, meters, prefix=""):
+        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
+        self.meters = meters
+        self.prefix = prefix
+
+    def display(self, batch):
+        entries = [self.prefix + self.batch_fmtstr.format(batch)]
+        entries += [str(meter) for meter in self.meters]
+        print('\t'.join(entries))
+
+    def _get_batch_fmtstr(self, num_batches):
+        num_digits = len(str(num_batches))
+        fmt = '{:' + str(num_digits) + 'd}'
+        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
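+
+
+# Worked example of the step schedule below: with the 8p default
+# --learning-rate 0.72, epochs 0-29 run at lr=0.72, epochs 30-59 at 0.072,
+# epochs 60-89 at 0.0072, and epochs 90+ at 0.00072
+# (lr = base_lr * 0.1 ** (epoch // 30)).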
+def adjust_learning_rate(optimizer, epoch, args):
+    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
+    lr = args.lr * (0.1 ** (epoch // 30))
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
+
+
+def accuracy(output, target, topk=(1,)):
+    """Computes the accuracy over the k top predictions for the specified values of k"""
+    with torch.no_grad():
+        maxk = max(topk)
+        batch_size = target.size(0)
+
+        _, pred = output.topk(maxk, 1, True, True)
+        pred = pred.t()
+        correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+        res = []
+        for k in topk:
+            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
+            res.append(correct_k.mul_(100.0 / batch_size))
+        return res
+
+
+class LabelSmoothing(nn.Module):
+    """
+    NLL loss with label smoothing.
+    """
+    def __init__(self, smoothing=0.0):
+        """
+        Constructor for the LabelSmoothing module.
+
+        :param smoothing: label smoothing factor
+        """
+        super(LabelSmoothing, self).__init__()
+        self.confidence = 1.0 - smoothing
+        self.smoothing = smoothing
+
+    def forward(self, x, target):
+        target = target.to(torch.int64)
+
+        logprobs = torch.nn.functional.log_softmax(x, dim=-1)
+        index = target.unsqueeze(1)
+
+        nll_loss = -logprobs.gather(dim=-1, index=index)
+        nll_loss = nll_loss.squeeze(1)
+        smooth_loss = -logprobs.mean(dim=-1)
+        loss = self.confidence * nll_loss + self.smoothing * smooth_loss
+
+        return loss.mean()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/modelzoo_level.txt b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/modelzoo_level.txt
new file mode 100644
index 0000000000..41d561e947
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/modelzoo_level.txt
@@ -0,0 +1,5 @@
+GPUStatus:OK
+NPUMigrationStatus:OK
+FuncStatus:OK
+PerfStatus:OK
+PrecisionStatus:OK
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/pthtar2onnx.py b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/pthtar2onnx.py
new file mode 100644
index 0000000000..df26ace55e
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/pthtar2onnx.py
@@ -0,0 +1,49 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import torch
+import torch.onnx
+
+from collections import OrderedDict
+
+# NOTE: the original script imported a torchvision-style inception_v3, which
+# is not shipped in this repo; we assume the InceptionV2 class defined in the
+# local InceptionV2.py matches the trained checkpoint.
+from InceptionV2 import InceptionV2
+
+
+def proc_node_module(checkpoint, AttrName):
+    # strip the "module." prefix that DistributedDataParallel adds to keys
+    new_state_dict = OrderedDict()
+    for k, v in checkpoint[AttrName].items():
+        if k.startswith("module."):
+            name = k[7:]
+        else:
+            name = k
+        new_state_dict[name] = v
+    return new_state_dict
+
+
+def convert():
+    checkpoint = torch.load("./model_best.pth.tar", map_location='cpu')
+    checkpoint['state_dict'] = proc_node_module(checkpoint, 'state_dict')
+    model = InceptionV2()
+    model.load_state_dict(checkpoint['state_dict'])
+    model.eval()
+
+    input_names = ["actual_input_1"]
+    output_names = ["output1"]
+    # 224x224 matches the training/eval crop used in main.py
+    dummy_input = torch.randn(16, 3, 224, 224)
+    torch.onnx.export(model, dummy_input, "inception_npu_16.onnx", input_names=input_names,
+                      output_names=output_names, opset_version=11)
+
+
+if __name__ == "__main__":
+    convert()
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/requirements.txt b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/requirements.txt
new file mode 100644
index 0000000000..d93a42f0f3
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/requirements.txt
@@ -0,0 +1 @@
+torchvision==0.6.0
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/env_npu.sh b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/env_npu.sh
new file mode 100644
index 0000000000..4cde526129
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/env_npu.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info'
+
+if [ -f $CANN_INSTALL_PATH_CONF ]; then
+    CANN_INSTALL_PATH=$(cat $CANN_INSTALL_PATH_CONF | grep Install_Path | cut -d "=" -f 2)
+else
+    CANN_INSTALL_PATH="/usr/local/Ascend"
+fi
+
+if [ -d ${CANN_INSTALL_PATH}/ascend-toolkit/latest ]; then
+    source ${CANN_INSTALL_PATH}/ascend-toolkit/set_env.sh
+else
+    source ${CANN_INSTALL_PATH}/nnae/set_env.sh
+fi
+
+#设置device侧日志等级为error
+msnpureport -g error -d 0
+msnpureport -g error -d 1
+msnpureport -g error -d 2
+msnpureport -g error -d 3
+msnpureport -g error -d 4
+msnpureport -g error -d 5
+msnpureport -g error -d 6
+msnpureport -g error -d 7
+#关闭Device侧Event日志
+msnpureport -e disable
+
+
+#将Host日志输出到串口,0-关闭/1-开启
+export ASCEND_SLOG_PRINT_TO_STDOUT=0
+#设置默认日志级别,0-debug/1-info/2-warning/3-error
+export ASCEND_GLOBAL_LOG_LEVEL=3
+#设置Event日志开启标志,0-关闭/1-开启
+export ASCEND_GLOBAL_EVENT_ENABLE=0
+#设置是否开启taskque,0-关闭/1-开启
+export TASK_QUEUE_ENABLE=1
+#设置是否开启PTCopy,0-关闭/1-开启
+export PTCOPY_ENABLE=1
+#设置是否开启combined标志,0-关闭/1-开启
+export COMBINED_ENABLE=1
+#设置特殊场景是否需要重新编译,不需要修改
+export DYNAMIC_OP="ADD#MUL"
+#HCCL白名单开关,1-关闭/0-开启
+export HCCL_WHITELIST_DISABLE=1
+export HCCL_IF_IP=$(hostname -I |awk '{print $1}')
+
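+#说明:下面内嵌的Python片段会遍历sys.path,收集site-packages对应的lib目录
+#及其torch/lib子目录,用':'拼接后加入LD_LIBRARY_PATH,供NPU运行时加载依赖库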
+path_lib=$(python3.7 -c """
+import sys
+import re
+result=''
+for index in range(len(sys.path)):
+    match_sit = re.search('-packages', sys.path[index])
+    if match_sit is not None:
+        match_lib = re.search('lib', sys.path[index])
+
+        if match_lib is not None:
+            end=match_lib.span()[1]
+            result += sys.path[index][0:end] + ':'
+
+        result+=sys.path[index] + '/torch/lib:'
+print(result)"""
+)
+
+echo ${path_lib}
+
+export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_full_8p.sh b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_full_8p.sh
new file mode 100644
index 0000000000..45eeeb1cb2
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_full_8p.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+
+#当前路径,不需要修改
+cur_path=`pwd`
+
+#集合通信参数,不需要修改
+
+export RANK_SIZE=8
+export JOB_ID=10087
+RANK_ID_START=0
+# source env.sh
+RANK_SIZE=8
+# 数据集路径,保持为空,不需要修改
+data_path=""
+
+#设置默认日志级别,不需要修改
+# export ASCEND_GLOBAL_LOG_LEVEL=3
+
+#基础参数,需要模型审视修改
+#网络名称,同目录名称
+Network="Inception_v2_231"
+#训练epoch
+train_epochs=100
+#训练batch_size
+batch_size=2048
+#训练step
+train_steps=`expr 1281167 / ${batch_size}`
+#学习率
+learning_rate=0.72
+
+
+
+#维测参数,precision_mode需要模型审视修改
+precision_mode="amp"
+#维持参数,以下不需要修改
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+if [[ $1 == --help || $1 == --h ]];then
+    echo "usage:./train_full_8p.sh --data_path=data_dir --batch_size=2048 --learning_rate=0.72"
+    exit 1
+fi
+
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --batch_size* ]];then
+        batch_size=`echo ${para#*=}`
+    elif [[ $para == --learning_rate* ]];then
+        learning_rate=`echo ${para#*=}`
+    elif [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    fi
+done
+
+PREC=""
+if [[ $precision_mode == "amp" ]];then
+    PREC="--amp"
+fi
+
+#校验是否传入data_path,不需要修改
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be specified"
+    exit 1
+fi
+
+cd $cur_path
+
+#设置环境变量,不需要修改
+echo "Device ID: $ASCEND_DEVICE_ID"
+export RANK_ID=$RANK_ID
+
+if [ -d $cur_path/output_8p ];then
+    rm -rf $cur_path/output_8p/*
+    mkdir -p $cur_path/output_8p/$ASCEND_DEVICE_ID
+else
+    mkdir -p $cur_path/output_8p/$ASCEND_DEVICE_ID
+fi
+wait
+
+#训练开始时间,不需要修改
+start_time=$(date +%s)
+# 非平台场景时 source 环境变量
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ]; then
+    source ${cur_path}/test/env_npu.sh
+fi
+
+nohup python3.7 main-8p.py \
+    -a inception_v2 \
+    ${PREC} \
+    --loss-scale 128 \
+    --data ${data_path} \
+    --addr=$(hostname -I |awk '{print $1}') \
+    --seed=49 \
+    --workers=184 \
+    --learning-rate=${learning_rate} \
+    --mom=0.9 \
+    --weight-decay=1.0e-04 \
+    --print-freq=30 \
+    --dist-url='tcp://127.0.0.1:50000' \
+    --dist-backend='hccl' \
+    --multiprocessing-distributed \
+    --world-size=1 \
+    --rank=0 \
+    --device='npu' \
+    --epochs=$train_epochs \
+    --label-smoothing=0.1 \
+    --batch-size=${batch_size} > $cur_path/output_8p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+wait
+
+#训练结束时间,不需要修改
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#结果打印,不需要修改
+echo "------------------ Final result ------------------"
+#输出性能FPS,需要模型审视修改
+Timeavg=`grep "Epoch:" $cur_path/output_8p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F " " '{print $8}' |tail -n +2|awk '{sum+=$1} END {print sum/NR}'|sed s/[[:space:]]//g `
+Timeavg=`awk 'BEGIN{printf "%.2f\n",'$Timeavg'}'`
+FPS=`awk 'BEGIN{printf "%.2f\n",'$batch_size'/'$Timeavg'}'`
+#打印,不需要修改
+echo "Final Performance images/sec : $FPS"
+
+#输出训练精度,需要模型审视修改
+train_accuracy=`grep -a '* Acc@1' $cur_path/output_8p/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'`
+
+#打印,不需要修改
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+#性能看护结果汇总
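+#换算说明(示例,假设batch_size=2048):FPS = batch_size / 平均单步耗时(秒),
+#如平均单步0.32秒时FPS约为6400 images/sec;下方TrainingTime = batch_size*1000/FPS,
+#即平均单步耗时的毫秒数,本例约为320ms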
+#训练用例信息,不需要修改
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+##获取性能数据,不需要修改
+#吞吐量
+ActualFPS=${FPS}
+#单迭代训练时长
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视
+grep Epoch: $cur_path/output_8p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output_8p/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#最后一个迭代loss值,不需要修改
+ActualLoss=`awk 'END {print}' $cur_path/output_8p/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+#关键信息打印到${CaseName}.log中,不需要修改
+echo "Network = ${Network}" > $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_performance_1p.sh
new file mode 100644
index 0000000000..de9bbe468a
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_performance_1p.sh
@@ -0,0 +1,171 @@
+#!/bin/bash
+
+#当前路径,不需要修改
+cur_path=`pwd`
+
+#集合通信参数,不需要修改
+
+export RANK_SIZE=1
+export JOB_ID=10087
+export ASCEND_DEVICE_ID=2
+RANK_ID_START=0
+# source env.sh
+RANK_SIZE=1
+# 数据集路径,保持为空,不需要修改
+data_path=""
+
+#设置默认日志级别,不需要修改
+# export ASCEND_GLOBAL_LOG_LEVEL=3
+
+#基础参数,需要模型审视修改
+#网络名称,同目录名称
+Network="Inception_v2_231"
+#训练epoch
+train_epochs=1
+#训练batch_size
+batch_size=256
+#训练step
+train_steps=`expr 1281167 / ${batch_size}`
+#学习率
+learning_rate=0.045
+
+
+
+#维测参数,precision_mode需要模型审视修改
+precision_mode="amp"
+#维持参数,以下不需要修改
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+
+if [[ $1 == --help || $1 == --h ]];then
+    echo "usage:./train_performance_1p.sh --data_path=data_dir --batch_size=256 --learning_rate=0.045"
+    exit 1
+fi
+
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --batch_size* ]];then
+        batch_size=`echo ${para#*=}`
+    elif [[ $para == --learning_rate* ]];then
+        learning_rate=`echo ${para#*=}`
+    elif [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    fi
+done
+
+PREC=""
+if [[ $precision_mode == "amp" ]];then
+    PREC="--amp"
+fi
+
+#校验是否传入data_path,不需要修改
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be specified"
+    exit 1
+fi
+
+cd $cur_path
+
+#将main.py训练循环中的占位语句临时改写为break,使性能测试在约200个step后提前退出,训练结束后还原
+sed -i "s|pass|break|g" main.py
+
+#设置环境变量,不需要修改
+echo "Device ID: $ASCEND_DEVICE_ID"
+export RANK_ID=$RANK_ID
+
+if [ -d $cur_path/output_1p ];then
+    rm -rf $cur_path/output_1p/*
+    mkdir -p $cur_path/output_1p/$ASCEND_DEVICE_ID
+else
+    mkdir -p $cur_path/output_1p/$ASCEND_DEVICE_ID
+fi
+wait
+
+#训练开始时间,不需要修改
+start_time=$(date +%s)
+# 非平台场景时 source 环境变量
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ]; then
+    source ${cur_path}/test/env_npu.sh
+fi
+
+nohup python3.7 main.py \
+    --data ${data_path} \
+    --npu ${ASCEND_DEVICE_ID} \
+    -a inception_v2 \
+    -b ${batch_size} \
+    --lr ${learning_rate} \
+    --epochs $train_epochs \
+    -j 128 \
+    -p 10 \
+    ${PREC} \
+    --label-smoothing 0.1 \
+    --wd 0.0002 > $cur_path/output_1p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+wait
+
+sed -i "s|break|pass|g" main.py
+
+#训练结束时间,不需要修改
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#结果打印,不需要修改
+echo "------------------ Final result ------------------"
+#输出性能FPS,需要模型审视修改
+Timeavg=`grep "Epoch:" $cur_path/output_1p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F " " '{print $5}' |tail -n +2|awk '{sum+=$1} END {print sum/NR}'|sed s/[[:space:]]//g `
+Timeavg=`awk 'BEGIN{printf "%.2f\n",'$Timeavg'}'`
+FPS=`awk 'BEGIN{printf "%.2f\n",'$batch_size'/'$Timeavg'}'`
+#打印,不需要修改
+echo "Final Performance images/sec : $FPS"
+
+#输出训练精度,需要模型审视修改
+train_accuracy=`grep -a '* Acc@1' $cur_path/output_1p/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'`
+
+#打印,不需要修改
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+#性能看护结果汇总
+#训练用例信息,不需要修改
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+##获取性能数据,不需要修改
+#吞吐量
+ActualFPS=${FPS}
+#单迭代训练时长
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视
+grep Epoch: $cur_path/output_1p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "Loss" '{print$2}' | awk -F "e" '{print$1}' > $cur_path/output_1p/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#最后一个迭代loss值,不需要修改
+ActualLoss=`awk 'END {print}' $cur_path/output_1p/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+#关键信息打印到${CaseName}.log中,不需要修改
+echo "Network = ${Network}" > $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_performance_8p.sh
new file mode 100644
index 0000000000..19291390e6
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_performance_8p.sh
@@ -0,0 +1,169 @@
+#!/bin/bash
+
+#当前路径,不需要修改
+cur_path=`pwd`
+
+#集合通信参数,不需要修改
+
+export RANK_SIZE=8
+export JOB_ID=10087
+RANK_ID_START=0
+# source env.sh
+RANK_SIZE=8
+# 数据集路径,保持为空,不需要修改
+data_path=""
+
+#设置默认日志级别,不需要修改
+# export ASCEND_GLOBAL_LOG_LEVEL=3
+
+#基础参数,需要模型审视修改
+#网络名称,同目录名称
+Network="Inception_v2_231"
+#训练epoch
+train_epochs=1
+#训练batch_size
+batch_size=2048
+#训练step
+train_steps=`expr 1281167 / ${batch_size}`
+#学习率
+learning_rate=0.72
+
+
+
+#维测参数,precision_mode需要模型审视修改
+precision_mode="amp"
+#维持参数,以下不需要修改
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+
+if [[ $1 == --help || $1 == --h ]];then
+    echo "usage:./train_performance_8p.sh --data_path=data_dir --batch_size=2048 --learning_rate=0.72"
+    exit 1
+fi
+
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --batch_size* ]];then
+        batch_size=`echo ${para#*=}`
+    elif [[ $para == --learning_rate* ]];then
+        learning_rate=`echo ${para#*=}`
+    elif [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    fi
+done
+PREC=""
+if [[ $precision_mode == "amp" ]];then
+    PREC="--amp"
+fi
+
+
+#校验是否传入data_path,不需要修改
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be specified"
+    exit 1
+fi
+
+cd $cur_path
+
+#将main.py训练循环中的占位语句临时改写为break,使性能测试在约200个step后提前退出,训练结束后还原
+sed -i "s|pass|break|g" main.py
+
+#设置环境变量,不需要修改
+echo "Device ID: $ASCEND_DEVICE_ID"
+export RANK_ID=$RANK_ID
+
+if [ -d $cur_path/output_8p ];then
+    rm -rf $cur_path/output_8p/*
+    mkdir -p $cur_path/output_8p/$ASCEND_DEVICE_ID
+else
+    mkdir -p $cur_path/output_8p/$ASCEND_DEVICE_ID
+fi
+wait
+
+#训练开始时间,不需要修改
+start_time=$(date +%s)
+# 非平台场景时 source 环境变量
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ]; then
+    source ${cur_path}/test/env_npu.sh
+fi
+
+nohup python3.7 main-8p.py \
+    -a inception_v2 \
+    ${PREC} \
+    --loss-scale 128 \
+    --data ${data_path} \
+    --addr=$(hostname -I |awk '{print $1}') \
+    --seed=49 \
+    --workers=184 \
+    --learning-rate=${learning_rate} \
+    --mom=0.9 \
+    --weight-decay=1.0e-04 \
+    --print-freq=30 \
+    --dist-url='tcp://127.0.0.1:50000' \
+    --dist-backend='hccl' \
+    --multiprocessing-distributed \
+    --world-size=1 \
+    --rank=0 \
+    --device='npu' \
+    --epochs=$train_epochs \
+    --label-smoothing=0.1 \
+    --batch-size=${batch_size} > $cur_path/output_8p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+wait
+
+sed -i "s|break|pass|g" main.py
+
+#训练结束时间,不需要修改
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#结果打印,不需要修改
+echo "------------------ Final result ------------------"
+#输出性能FPS,需要模型审视修改
+Timeavg=`grep "Epoch:" $cur_path/output_8p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "Time" '{print $2}'|awk -F "(" '{print $1}' |tail -n +2|awk '{sum+=$1} END {print sum/NR}'|sed s/[[:space:]]//g `
+Timeavg=`awk 'BEGIN{printf "%.2f\n",'$Timeavg'}'`
+FPS=`awk 'BEGIN{printf "%.2f\n",'$batch_size'/'$Timeavg'}'`
+#打印,不需要修改
+echo "Final Performance images/sec : $FPS"
+
+#输出训练精度,需要模型审视修改
+train_accuracy=`grep -a '* Acc@1' 
$cur_path/output_8p/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` + +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Epoch: $cur_path/output_8p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output_8p/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output_8p/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From d264a76714d4af35b46eb96aa95dfe9eb26ae4e0 Mon Sep 17 00:00:00 2001 From: sysulyccc Date: Thu, 22 Dec 2022 08:39:32 +0000 Subject: [PATCH 2/2] update PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md. 
Signed-off-by: sysulyccc --- .../Inception_v2_231_for_Pytorch/README.md | 96 +++++++++++++------ 1 file changed, 69 insertions(+), 27 deletions(-) diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md index ebe02e8775..4cb7d8623e 100644 --- a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md +++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md @@ -1,4 +1,4 @@ -# Inception_v2_231 +# Inception_v2_231 for PyTorch - [概述](#概述) - [准备训练环境](#准备训练环境) - [开始训练](#开始训练) @@ -8,12 +8,13 @@ ## 简述 -InceptionV2主要是在GoogLeNet的基础上添加了 BN 层,并且采用 VGG 的思想,利用两个小卷积核代替大卷积核,在保持相同感受野的同时减少参数,并提高非线性表示能力。 +InceptionV2主要是在GoogLeNet的基础上添加了 BN 层,并且采用 VGG 的思想,利用两个小卷积核代替大卷积核,在保持相同感受野的同时减少参数,并提高非线性表示能力 - 参考实现: ``` - https://github.com/shanglianlm0525/PyTorch-Networks/blob/master/ClassicNetwork/InceptionV2.py + url=https://github.com/shanglianlm0525/PyTorch-Networks/blob/master/ClassicNetwork/InceptionV2.py + commit_id=298bc76761d34e472fdc73615f50a5a9afd7e8b9 ``` - 适配昇腾 AI 处理器的实现: @@ -36,15 +37,16 @@ InceptionV2主要是在GoogLeNet的基础上添加了 BN 层,并且采用 VGG ## 准备环境 -- 当前模型支持的固件与驱动、 CANN 以及 PyTorch 如下表所示 +- 当前模型支持的硬件、NPU固件驱动、CANN 和 PyTorch 如下表所示 **表 1** 版本配套表 - | 配套 | 版本 | - | ---------- | ------------------------------------------------------------ | - | 固件与驱动 | [1.0.9](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) | - | CANN | [3.2.1](https://www.hiascend.com/software/cann/commercial?version=3.2.1) | - | PyTorch | [1.5.0](https://gitee.com/ascend/pytorch/tree/v1.5.0/) | + | 配套 | 版本 | + | ------------- | ------------------------------------------------------------ | + | 硬件 | [1.0.17](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) | + | NPU固件与驱动 | [6.0.RC1](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) | + | CANN | [6.0.RC1](https://www.hiascend.com/software/cann/commercial?version=6.0.RC1) | + | PyTorch | [1.8.1](https://gitee.com/ascend/pytorch/tree/master/) | - 环境准备指导。 @@ -89,35 +91,75 @@ InceptionV2主要是在GoogLeNet的基础上添加了 BN 层,并且采用 VGG ``` > **说明:** - >数据集路径以用户自行定义的路径为准 + > 数据集路径以用户自行定义的路径为准 # 开始训练 ## 训练模型 -```bash -# prefomance training 1p -bash ./test/train_performance_1p.sh --data_path=/opt/npu/dataset/imagenet -# prefomance training 8p -bash ./test/train_performance_8p.sh --data_path=/opt/npu/dataset/imagenet +1. 进入解压后的源码包根目录 -# full training 1p -bash ./test/train_full_1p.sh --data_path=/opt/npu/dataset/imagenet + ```bash + cd /${模型文件夹名称} + ``` -# full training 8p -bash ./test/train_full_8p.sh --data_path=/opt/npu/dataset/imagenet +2. 
运行训练脚本。
+
+   该模型支持单机单卡性能测试,以及单机8卡训练与性能测试。
+
+   * 单机单卡性能
+
+     ```bash
+     # performance 1p, --data_path填写数据集路径, 输出日志在./output_1p
+     bash ./test/train_performance_1p.sh --data_path=real_data_path
+     ```
+
+   * 单机8卡
+
+     ```bash
+     # full 8p, --data_path填写数据集路径, 输出日志在./output_8p
+     bash ./test/train_full_8p.sh --data_path=real_data_path
+
+     # performance 8p, --data_path填写数据集路径, 输出日志在./output_8p
+     bash ./test/train_performance_8p.sh --data_path=real_data_path
+     ```
+
+模型训练脚本参数说明如下:
+
+```
+--data_path                      //数据集路径
+--addr                           //主机地址
+--workers                        //加载数据进程数
+--learning-rate                  //初始学习率
+--mom                            //动量
+--weight-decay                   //权重衰减
+--multiprocessing-distributed    //是否使用多卡训练
+--epochs                         //训练轮数
+--batch-size                     //训练批次大小
+--device                         //训练设备类型
+--amp                            //是否使用混合精度
+--opt-level                      //混合精度类型
+```
+
+训练完成后,权重文件保存在当前路径下,精度和性能日志保存在output_1p或output_8p目录下。
+
 # 训练结果展示
 
 **表 2** 训练结果展示表
 
-| NAME | LOSS | FPS | Epochs | AMP_Type | ACC@1 |
-| :----: | :----: | :-----: | :----: | :------: | :----: |
-| 1p-GPU | 7.5401 | 793.85 | 1 | - | - |
-| 1p-NPU | 7.6044 | 942.15 | 1 | O1 | - |
-| 8p-GPU | 2.3929 | 5752.80 | 100 | - | 68.654 |
-| 8p-NPU | 2.1992 | 6395.45 | 100 | O1 | 68.663 |
+| NAME   | LOSS   | FPS   | Epochs | AMP_Type | ACC@1  |
+| :----: | :----: | :---: | :----: | :------: | :----: |
+| 1p-GPU | 7.5401 | 652   | 1      | -        | -      |
+| 1p-NPU | 7.6044 | 1280  | 1      | O1       | -      |
+| 8p-GPU | 2.3929 | 5752  | 100    | -        | 68.654 |
+| 8p-NPU | 2.1992 | 6400  | 100    | O1       | 68.663 |
+
+## 版本说明
+
+### 变更
+
+2022.9.29:首次发布
+
+### 已知问题
+
+无
\ No newline at end of file
-- 
Gitee