From fa00871daec5928344e06897836c4245d6bfd744 Mon Sep 17 00:00:00 2001 From: sysulyccc Date: Thu, 29 Sep 2022 14:57:35 +0800 Subject: [PATCH 1/2] =?UTF-8?q?[=E4=B8=AD=E5=B1=B1=E5=A4=A7=E5=AD=A6][?= =?UTF-8?q?=E9=AB=98=E6=A0=A1=E8=B4=A1=E7=8C=AE][Pytorch][Inception=5Fv2?= =?UTF-8?q?=5F231]--=E7=AC=AC=E4=B8=80=E6=AC=A1=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit update --- .../Inception_v2_231_for_Pytorch/Dockerfile | 5 + .../InceptionV2.py | 227 +++++++ .../Inception_v2_231_for_Pytorch/LICENSE | 16 + .../Inception_v2_231_for_Pytorch/README.md | 123 ++++ .../Inception_v2_231_for_Pytorch/demo.py | 141 ++++ .../docker_start.sh | 25 + .../Inception_v2_231_for_Pytorch/main-8p.py | 631 ++++++++++++++++++ .../Inception_v2_231_for_Pytorch/main.py | 588 ++++++++++++++++ .../modelzoo_level.txt | 5 + .../pthtar2onnx.py | 49 ++ .../requirements.txt | 1 + .../test/env_npu.sh | 67 ++ .../test/train_full_8p.sh | 167 +++++ .../test/train_performance_1p.sh | 171 +++++ .../test/train_performance_8p.sh | 169 +++++ 15 files changed, 2385 insertions(+) create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/Dockerfile create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/InceptionV2.py create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/LICENSE create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/demo.py create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/docker_start.sh create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/main-8p.py create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/main.py create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/modelzoo_level.txt create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/pthtar2onnx.py create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/requirements.txt create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/env_npu.sh create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_full_8p.sh create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_performance_1p.sh create mode 100644 PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_performance_8p.sh diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/Dockerfile b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/Dockerfile new file mode 100644 index 0000000000..30a31af558 --- /dev/null +++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/Dockerfile @@ -0,0 +1,5 @@ +ARG FROM_IMAGE_NAME +FROM $FROM_IMAGE_NAME + +COPY requirements.txt . +RUN pip3.7 install -r requirements.txt \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/InceptionV2.py b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/InceptionV2.py new file mode 100644 index 0000000000..a5e069ff5c --- /dev/null +++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/InceptionV2.py @@ -0,0 +1,227 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +import torch +import torch.nn as nn +import torchvision + +def ConvBNReLU(in_channels,out_channels,kernel_size,stride=1,padding=0): + return nn.Sequential( + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride,padding=padding), + nn.BatchNorm2d(out_channels), + nn.ReLU6(inplace=True), + ) + +def ConvBNReLUFactorization(in_channels,out_channels,kernel_sizes,paddings): + return nn.Sequential( + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_sizes, stride=1,padding=paddings), + nn.BatchNorm2d(out_channels), + nn.ReLU6(inplace=True) + ) + +class InceptionV2ModuleA(nn.Module): + def __init__(self, in_channels,out_channels1,out_channels2reduce, out_channels2, out_channels3reduce, out_channels3, out_channels4): + super(InceptionV2ModuleA, self).__init__() + + self.branch1 = ConvBNReLU(in_channels=in_channels,out_channels=out_channels1,kernel_size=1) + + self.branch2 = nn.Sequential( + ConvBNReLU(in_channels=in_channels, out_channels=out_channels2reduce, kernel_size=1), + ConvBNReLU(in_channels=out_channels2reduce, out_channels=out_channels2, kernel_size=3, padding=1), + ) + + self.branch3 = nn.Sequential( + ConvBNReLU(in_channels=in_channels,out_channels=out_channels3reduce,kernel_size=1), + ConvBNReLU(in_channels=out_channels3reduce, out_channels=out_channels3, kernel_size=3, padding=1), + ConvBNReLU(in_channels=out_channels3, out_channels=out_channels3, kernel_size=3, padding=1), + ) + + self.branch4 = nn.Sequential( + nn.MaxPool2d(kernel_size=3, stride=1, padding=1), + ConvBNReLU(in_channels=in_channels, out_channels=out_channels4, kernel_size=1), + ) + + def forward(self, x): + out1 = self.branch1(x) + out2 = self.branch2(x) + out3 = self.branch3(x) + out4 = self.branch4(x) + out = torch.cat([out1, out2, out3, out4], dim=1) + return out + +class InceptionV2ModuleB(nn.Module): + def __init__(self, in_channels,out_channels1,out_channels2reduce, out_channels2, out_channels3reduce, out_channels3, out_channels4): + super(InceptionV2ModuleB, self).__init__() + + self.branch1 = ConvBNReLU(in_channels=in_channels,out_channels=out_channels1,kernel_size=1) + + self.branch2 = nn.Sequential( + ConvBNReLU(in_channels=in_channels, out_channels=out_channels2reduce, kernel_size=1), + ConvBNReLUFactorization(in_channels=out_channels2reduce, out_channels=out_channels2reduce, kernel_sizes=[1,3],paddings=[0,1]), + ConvBNReLUFactorization(in_channels=out_channels2reduce, out_channels=out_channels2, kernel_sizes=[3,1],paddings=[1, 0]), + ) + + self.branch3 = nn.Sequential( + ConvBNReLU(in_channels=in_channels,out_channels=out_channels3reduce,kernel_size=1), + ConvBNReLUFactorization(in_channels=out_channels3reduce, out_channels=out_channels3reduce,kernel_sizes=[1, 3], paddings=[0, 1]), + ConvBNReLUFactorization(in_channels=out_channels3reduce, 
out_channels=out_channels3reduce,kernel_sizes=[3, 1], paddings=[1, 0]), + ConvBNReLUFactorization(in_channels=out_channels3reduce, out_channels=out_channels3reduce, kernel_sizes=[1, 3], paddings=[0, 1]), + ConvBNReLUFactorization(in_channels=out_channels3reduce, out_channels=out_channels3,kernel_sizes=[3, 1], paddings=[1, 0]), + ) + + self.branch4 = nn.Sequential( + nn.MaxPool2d(kernel_size=3, stride=1, padding=1), + ConvBNReLU(in_channels=in_channels, out_channels=out_channels4, kernel_size=1), + ) + + def forward(self, x): + out1 = self.branch1(x) + out2 = self.branch2(x) + out3 = self.branch3(x) + out4 = self.branch4(x) + out = torch.cat([out1, out2, out3, out4], dim=1) + return out + +class InceptionV2ModuleC(nn.Module): + def __init__(self, in_channels,out_channels1,out_channels2reduce, out_channels2, out_channels3reduce, out_channels3, out_channels4): + super(InceptionV2ModuleC, self).__init__() + + self.branch1 = ConvBNReLU(in_channels=in_channels,out_channels=out_channels1,kernel_size=1) + + self.branch2_conv1 = ConvBNReLU(in_channels=in_channels, out_channels=out_channels2reduce, kernel_size=1) + self.branch2_conv2a = ConvBNReLUFactorization(in_channels=out_channels2reduce, out_channels=out_channels2, kernel_sizes=[1,3],paddings=[0,1]) + self.branch2_conv2b = ConvBNReLUFactorization(in_channels=out_channels2reduce, out_channels=out_channels2, kernel_sizes=[3,1],paddings=[1,0]) + + self.branch3_conv1 = ConvBNReLU(in_channels=in_channels,out_channels=out_channels3reduce,kernel_size=1) + self.branch3_conv2 = ConvBNReLU(in_channels=out_channels3reduce, out_channels=out_channels3, kernel_size=3,stride=1,padding=1) + self.branch3_conv3a = ConvBNReLUFactorization(in_channels=out_channels3, out_channels=out_channels3, kernel_sizes=[3, 1],paddings=[1, 0]) + self.branch3_conv3b = ConvBNReLUFactorization(in_channels=out_channels3, out_channels=out_channels3, kernel_sizes=[1, 3],paddings=[0, 1]) + + self.branch4 = nn.Sequential( + nn.MaxPool2d(kernel_size=3, stride=1, padding=1), + ConvBNReLU(in_channels=in_channels, out_channels=out_channels4, kernel_size=1), + ) + + def forward(self, x): + out1 = self.branch1(x) + x2 = self.branch2_conv1(x) + out2 = torch.cat([self.branch2_conv2a(x2), self.branch2_conv2b(x2)],dim=1) + x3 = self.branch3_conv2(self.branch3_conv1(x)) + out3 = torch.cat([self.branch3_conv3a(x3), self.branch3_conv3b(x3)], dim=1) + out4 = self.branch4(x) + out = torch.cat([out1, out2, out3, out4], dim=1) + return out + +class InceptionV3ModuleD(nn.Module): + def __init__(self, in_channels,out_channels1reduce,out_channels1,out_channels2reduce, out_channels2): + super(InceptionV3ModuleD, self).__init__() + + self.branch1 = nn.Sequential( + ConvBNReLU(in_channels=in_channels, out_channels=out_channels1reduce, kernel_size=1), + ConvBNReLU(in_channels=out_channels1reduce, out_channels=out_channels1, kernel_size=3,stride=2,padding=1) + ) + + self.branch2 = nn.Sequential( + ConvBNReLU(in_channels=in_channels, out_channels=out_channels2reduce, kernel_size=1), + ConvBNReLU(in_channels=out_channels2reduce, out_channels=out_channels2, kernel_size=3, stride=1, padding=1), + ConvBNReLU(in_channels=out_channels2, out_channels=out_channels2, kernel_size=3, stride=2,padding=1), + ) + + self.branch3 = nn.MaxPool2d(kernel_size=3,stride=2,padding=1) + + def forward(self, x): + out1 = self.branch1(x) + out2 = self.branch2(x) + out3 = self.branch3(x) + out = torch.cat([out1, out2, out3], dim=1) + return out + +class InceptionAux(nn.Module): + def __init__(self, in_channels,out_channels): + 
super(InceptionAux, self).__init__() + + self.auxiliary_avgpool = nn.AvgPool2d(kernel_size=5, stride=3) + self.auxiliary_conv1 = ConvBNReLU(in_channels=in_channels, out_channels=128, kernel_size=1) + self.auxiliary_conv2 = nn.Conv2d(in_channels=128, out_channels=768, kernel_size=5,stride=1) + self.auxiliary_dropout = nn.Dropout(p=0.7) + self.auxiliary_linear1 = nn.Linear(in_features=768, out_features=out_channels) + + def forward(self, x): + x = self.auxiliary_conv1(self.auxiliary_avgpool(x)) + x = self.auxiliary_conv2(x) + x = x.view(x.size(0), -1) + out = self.auxiliary_linear1(self.auxiliary_dropout(x)) + return out + +class InceptionV2(nn.Module): + def __init__(self, num_classes=1000, stage='train'): + super(InceptionV2, self).__init__() + self.stage = stage + + self.block1 = nn.Sequential( + ConvBNReLU(in_channels=3, out_channels=64, kernel_size=7,stride=2,padding=3), + nn.MaxPool2d(kernel_size=3,stride=2,padding=1), + ) + + self.block2 = nn.Sequential( + ConvBNReLU(in_channels=64, out_channels=192, kernel_size=3, stride=1, padding=1), + nn.MaxPool2d(kernel_size=3, stride=2,padding=1), + ) + + self.block3 = nn.Sequential( + InceptionV2ModuleA(in_channels=192,out_channels1=64,out_channels2reduce=64, out_channels2=64, out_channels3reduce=64, out_channels3=96, out_channels4=32), + InceptionV2ModuleA(in_channels=256, out_channels1=64, out_channels2reduce=64, out_channels2=96,out_channels3reduce=64, out_channels3=96, out_channels4=64), + InceptionV3ModuleD(in_channels=320, out_channels1reduce=128, out_channels1=160, out_channels2reduce=64,out_channels2=96), + ) + + self.block4 = nn.Sequential( + InceptionV2ModuleB(in_channels=576, out_channels1=224, out_channels2reduce=64, out_channels2=96,out_channels3reduce=96, out_channels3=128, out_channels4=128), + InceptionV2ModuleB(in_channels=576, out_channels1=192, out_channels2reduce=96, out_channels2=128,out_channels3reduce=96, out_channels3=128, out_channels4=128), + InceptionV2ModuleB(in_channels=576, out_channels1=160, out_channels2reduce=128, out_channels2=160,out_channels3reduce=128, out_channels3=128, out_channels4=128), + InceptionV2ModuleB(in_channels=576, out_channels1=96, out_channels2reduce=128, out_channels2=192,out_channels3reduce=160, out_channels3=160, out_channels4=128), + InceptionV3ModuleD(in_channels=576, out_channels1reduce=128, out_channels1=192, out_channels2reduce=192,out_channels2=256), + ) + + self.block5 = nn.Sequential( + InceptionV2ModuleC(in_channels=1024, out_channels1=352, out_channels2reduce=192, out_channels2=160,out_channels3reduce=160, out_channels3=112, out_channels4=128), + InceptionV2ModuleC(in_channels=1024, out_channels1=352, out_channels2reduce=192, out_channels2=160, + out_channels3reduce=192, out_channels3=112, out_channels4=128) + ) + + self.max_pool = nn.MaxPool2d(kernel_size=7, stride=1) + self.dropout = nn.Dropout(p=0.5) + self.linear = nn.Linear(1024, num_classes) + + def forward(self, x): + x = self.block1(x) + x = self.block2(x) + x = self.block3(x) + x = self.block4(x) + x = self.block5(x) + x = self.max_pool(x) + x = self.dropout(x) + x = x.view(x.size(0), -1) + out = self.linear(x) + return out + +if __name__=='__main__': + model = InceptionV2() + print(model) + + input = torch.randn(1, 3, 224, 224) + out = model(input) + print(out.shape) \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/LICENSE b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/LICENSE new file mode 100644 index 0000000000..82adefb928 --- /dev/null +++ 
b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/LICENSE
@@ -0,0 +1,16 @@
+# Copyright (c) Soumith Chintala 2016,
+# All rights reserved
+#
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://spdx.org/licenses/BSD-3-Clause.html
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md
new file mode 100644
index 0000000000..ebe02e8775
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md
@@ -0,0 +1,123 @@
+# Inception_v2_231
+- [Overview](#overview)
+- [Preparing the Training Environment](#preparing-the-training-environment)
+- [Starting Training](#starting-training)
+- [Training Results](#training-results)
+
+# Overview
+
+## Summary
+
+InceptionV2 builds on GoogLeNet by adding batch normalization layers and, borrowing the VGG idea, replaces a large convolution kernel with two small ones, which keeps the same receptive field while reducing the parameter count and strengthening the non-linear representation.
+
+- Reference implementation:
+
+  ```
+  https://github.com/shanglianlm0525/PyTorch-Networks/blob/master/ClassicNetwork/InceptionV2.py
+  ```
+
+- Implementation adapted to Ascend AI processors:
+
+  ```
+  url=https://gitee.com/ascend/ModelZoo-PyTorch.git
+  code_path=PyTorch/contrib/cv/classification
+  ```
+
+- Obtain the code through Git as follows:
+
+  ```
+  git clone {url}        # clone the repository
+  cd {code_path}         # switch to the model code path; skip this if the repository contains only this model
+  ```
+
+- Or click "Download Now" to download the source package.
+
+# Preparing the Training Environment
+
+## Environment Setup
+
+- The firmware/driver, CANN, and PyTorch versions supported by this model are listed in the table below.
+
+  **Table 1** Version compatibility
+
+  | Component         | Version                                                      |
+  | ----------------- | ------------------------------------------------------------ |
+  | Firmware & driver | [1.0.9](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) |
+  | CANN              | [3.2.1](https://www.hiascend.com/software/cann/commercial?version=3.2.1) |
+  | PyTorch           | [1.5.0](https://gitee.com/ascend/pytorch/tree/v1.5.0/)       |
+
+- Environment setup guide.
+
+  See "[Preparing a PyTorch Training Environment](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)".
+
+- Install the dependencies.
+
+  ```
+  pip install -r requirements.txt
+  ```
+
+
+## Preparing the Dataset
+
+1. Obtain the dataset.
+
+   Obtain the raw dataset yourself; usable open-source datasets include ImageNet2012 and CIFAR-10. Upload the dataset to any path on the server and extract it.
+
+   Taking ImageNet2012 as an example, the dataset directory is structured as follows:
+
+   ```
+   ├── ImageNet2012
+      ├── train
+           ├── class1
+                │── image1
+                │── image2
+                │   ...
+           ├── class2
+                │── image1
+                │── image2
+                │   ...
+           ├── ...
+      ├── val
+           ├── class1
+                │── image1
+                │── image2
+                │   ...
+           ├── class2
+                │── image1
+                │── image2
+                │   ...
+   ```
+
+   > **Note:**
+   > The dataset path is whatever path you defined when uploading it.
+
+# Starting Training
+
+## Training the Model
+```bash
+# performance training 1p
+bash ./test/train_performance_1p.sh --data_path=/opt/npu/dataset/imagenet
+
+# performance training 8p
+bash ./test/train_performance_8p.sh --data_path=/opt/npu/dataset/imagenet
+
+# full training 1p
+bash ./test/train_full_1p.sh --data_path=/opt/npu/dataset/imagenet
+
+# full training 8p
+bash ./test/train_full_8p.sh --data_path=/opt/npu/dataset/imagenet
+
+# eval
+bash ./test/eval_8p.sh --data_path=/opt/npu/dataset/imagenet
+```
+
+# Training Results
+
+**Table 2** Training results
+
+| NAME | LOSS | FPS | Epochs | AMP_Type | ACC@1 |
+| :----: | :----: | :-----: | :----: | :------: | :----: |
+| 1p-GPU | 7.5401 | 793.85 | 1 | - | - |
+| 1p-NPU | 7.6044 | 942.15 | 1 | O1 | - |
+| 8p-GPU | 2.3929 | 5752.80 | 100 | - | 68.654 |
+| 8p-NPU | 2.1992 | 6395.45 | 100 | O1 | 68.663 |
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/demo.py b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/demo.py
new file mode 100644
index 0000000000..bd832f9ace
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/demo.py
@@ -0,0 +1,141 @@
+# Copyright (c) Soumith Chintala 2016,
+# All rights reserved
+#
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://spdx.org/licenses/BSD-3-Clause.html
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
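The README's factorization claim above — two stacked 3×3 convolutions cover the same 5×5 receptive field with fewer parameters — is easy to verify numerically. A minimal sketch in plain PyTorch (the 64-channel width is an arbitrary illustration, not a value taken from the model):

```python
import torch.nn as nn

# A single 5x5 convolution versus two stacked 3x3 convolutions
# at the same channel width.
five_by_five = nn.Conv2d(64, 64, kernel_size=5, padding=2)
stacked_3x3 = nn.Sequential(
    nn.Conv2d(64, 64, kernel_size=3, padding=1),
    nn.Conv2d(64, 64, kernel_size=3, padding=1),
)

def n_params(m):
    return sum(p.numel() for p in m.parameters())

print(n_params(five_by_five))  # 102464
print(n_params(stacked_3x3))   # 73856 -- same receptive field, ~28% fewer
```

The extra BN/ReLU stage between the two small kernels is what the README means by strengthened non-linear representation.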
+# -*- coding: utf-8 -*-
+
+
+import os
+
+import torch
+import numpy as np
+from InceptionV2 import InceptionV2
+import argparse
+from apex import amp
+import apex
+import torch.distributed as dist
+parser = argparse.ArgumentParser(description='InceptionV2 demo')
+parser.add_argument('--device', default='npu', type=str,
+                    help='npu or gpu')
+
+parser.add_argument('--device-list', default='0,1,2,3,4,5,6,7', type=str, help='device id list')
+parser.add_argument('--dist-backend', default='hccl', type=str,
+                    help='distributed backend')
+parser.add_argument('--addr', default='', type=str,
+                    help='master addr')
+
+'''
+print("=> loading checkpoint '{}'".format(args.resume))
+checkpoint = torch.load(args.resume, map_location=loc)
+args.start_epoch = checkpoint['epoch']
+best_acc1 = checkpoint['best_acc1']
+model.load_state_dict(checkpoint['state_dict'])
+optimizer.load_state_dict(checkpoint['optimizer'])
+if args.amp:
+    amp.load_state_dict(checkpoint['amp'])
+print("=> loaded checkpoint '{}' (epoch {})"
+      .format(args.resume, checkpoint['epoch']))
+'''
+def device_id_to_process_device_map(device_list):
+    devices = device_list.split(",")
+    devices = [int(x) for x in devices]
+    devices.sort()
+
+    process_device_map = dict()
+    for process_id, device_id in enumerate(devices):
+        process_device_map[process_id] = device_id
+
+    return process_device_map
+
+
+def build_model():
+    global loc
+    # Define your own model here and load the pretrained weights
+    args = parser.parse_args()
+    args.process_device_map = device_id_to_process_device_map(args.device_list)
+    os.environ['MASTER_ADDR'] = args.addr
+    os.environ['MASTER_PORT'] = '29688'
+    ngpus_per_node = len(args.process_device_map)
+
+    dist.init_process_group(backend=args.dist_backend,  # init_method=args.dist_url,
+                            world_size=1, rank=0)
+
+    args.gpu = args.process_device_map[0]
+    loc = 'npu:{}'.format(args.gpu)
+    torch.npu.set_device(loc)
+
+    model = InceptionV2().to(loc)
+    optimizer = apex.optimizers.NpuFusedSGD(model.parameters(), 0.8,
+                                            momentum=0.9,
+                                            weight_decay=1.0e-04)
+    model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=1024)
+    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False)
+    checkpoint = torch.load('./checkpoint.pth.tar')
+    model.load_state_dict(checkpoint['state_dict'])
+    model.eval()  # remember to switch to eval mode
+    return model
+
+
+def get_raw_data():
+    # Fetch data in whatever way suits you; do not upload raw data to the code repository
+    from PIL import Image
+    from urllib.request import urlretrieve
+    IMAGE_URL = 'https://bbs-img.huaweicloud.com/blogs/img/thumb/1591951315139_8989_1363.png'
+    urlretrieve(IMAGE_URL, 'tmp.jpg')
+    img = Image.open("tmp.jpg")
+    img = img.convert('RGB')
+    return img
+
+
+def pre_process(raw_data):
+    # Define your own preprocessing here; Resize(342)/CenterCrop(224) matches
+    # the validation transforms this model uses in main-8p.py
+    from torchvision import transforms
+    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                     std=[0.229, 0.224, 0.225])
+    transforms_list = transforms.Compose([
+        transforms.Resize(342),
+        transforms.CenterCrop(224),
+        transforms.ToTensor(),
+        normalize
+    ])
+    input_data = transforms_list(raw_data)
+    return input_data.unsqueeze(0)
+
+
+def post_process(output_tensor):
+    # Define your own post-processing here
+    print(output_tensor)
+    return torch.argmax(output_tensor, 1)
+
+
+if __name__ == '__main__':
+    # 1. Fetch the raw data
+    raw_data = get_raw_data()
+
+    # 2. Build the model
+    model = build_model()
+
+    # 3. Preprocess
+    input_tensor = pre_process(raw_data)
+
+    # 4. Run the forward pass
+    output_tensor = model(input_tensor.to(loc))
+
+    # 5. Post-process
+    result = post_process(output_tensor)
+
+    # 6. Print the result
+    print(result)
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/docker_start.sh b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/docker_start.sh
new file mode 100644
index 0000000000..944bca3cda
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/docker_start.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+docker_image=$1
+data_dir=$2
+model_dir=$3
+
+docker run -it --ipc=host \
+        --device=/dev/davinci0 \
+        --device=/dev/davinci1 \
+        --device=/dev/davinci2 \
+        --device=/dev/davinci3 \
+        --device=/dev/davinci4 \
+        --device=/dev/davinci5 \
+        --device=/dev/davinci6 \
+        --device=/dev/davinci7 \
+        --device=/dev/davinci_manager \
+        --device=/dev/devmm_svm --device=/dev/hisi_hdc \
+        -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
+        -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
+        -v ${model_dir}:${model_dir} \
+        -v ${data_dir}:${data_dir} \
+        -v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
+        -v /var/log/npu/slog/:/var/log/npu/slog -v /var/log/npu/profiling/:/var/log/npu/profiling \
+        -v /var/log/npu/dump/:/var/log/npu/dump -v /var/log/npu/:/usr/slog ${docker_image} \
+        /bin/bash
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/main-8p.py b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/main-8p.py
new file mode 100644
index 0000000000..22ba96dd09
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/main-8p.py
@@ -0,0 +1,631 @@
+# BSD 3-Clause License
+
+# Copyright (c) Soumith Chintala 2016,
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
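Both demo.py above and the training scripts below translate the `--device-list` string into a rank-to-device mapping before spawning one process per NPU. A self-contained sketch of that mapping logic, with the same behavior as the `device_id_to_process_device_map` helpers in this patch:

```python
def device_id_to_process_device_map(device_list):
    # "0,3,5" -> {0: 0, 1: 3, 2: 5}: spawned process rank i is pinned to
    # the i-th device id after sorting, so ranks stay dense even when the
    # device ids themselves are not.
    devices = sorted(int(x) for x in device_list.split(","))
    return {rank: device for rank, device in enumerate(devices)}

print(device_id_to_process_device_map("0,3,5"))  # {0: 0, 1: 3, 2: 5}
```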
+ +import argparse +import os +import random +import shutil +import time +import warnings + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.optim +import torch.multiprocessing as mp +import torch.utils.data +import torch.utils.data.distributed +import torchvision.transforms as transforms +import torchvision.datasets as datasets +import torchvision.models as models +from InceptionV2 import InceptionV2 + +from apex import amp +import apex +import numpy as np +from apex.optimizers import NpuFusedSGD + +if torch.__version__ >= '1.8': + import torch_npu + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('--data', metavar='DIR', default='/opt/npu/dataset/imagenet', + help='path to dataset') +parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50') +parser.add_argument('-j', '--workers', default=32, type=int, metavar='N', + help='number of data loading workers (default: 4)') +parser.add_argument('--epochs', default=90, type=int, metavar='N', + help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=512, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, + metavar='LR', help='initial learning rate', dest='lr') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') +parser.add_argument('--workspace', type=str, default='./', metavar='DIR', + help='path to directory where checkpoints will be stored') +parser.add_argument('-p', '--print-freq', default=10, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('-ef', '--eval-freq', default=5, type=int, + metavar='N', help='evaluate frequency (default: 5)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', + help='evaluate model on validation set') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--fine-tuning', action='store_true', + help='use fine-tuning model') +parser.add_argument('--world-size', default=-1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=-1, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-backend', default='nccl', type=str, + help='distributed backend') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. ') +parser.add_argument('--npu', default=None, type=int, + help='npu id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. 
This is the '
+                         'fastest way to use PyTorch for either single node or '
+                         'multi node data parallel training')
+parser.add_argument('-bm', '--benchmark', default=0, type=int,
+                    metavar='N', help='set benchmark status (default: 0)')
+parser.add_argument('--device', default='npu', type=str,
+                    help='npu or gpu')
+parser.add_argument('--addr', default='', type=str,
+                    help='master addr')
+parser.add_argument('--checkpoint-nameprefix', default='checkpoint', type=str,
+                    help='checkpoint-nameprefix')
+parser.add_argument('--checkpoint-freq', default=0, type=int,
+                    metavar='N', help='checkpoint frequency (default: 0); '
+                                      '0: overwrite a single checkpoint file each epoch; '
+                                      'n: save a separate file every n epochs; '
+                                      '-1: no checkpoint (not supported)')
+parser.add_argument('--device-list', default='0,1,2,3,4,5,6,7', type=str, help='device id list')
+# apex
+parser.add_argument('--amp', default=False, action='store_true',
+                    help='use amp to train the model')
+parser.add_argument('--loss-scale', default=1024., type=float,
+                    help='loss scale used in amp, default -1 means dynamic')
+parser.add_argument('--opt-level', default='O1', type=str,
+                    help='opt level used in amp (O1 or O2)')
+
+parser.add_argument('--label-smoothing',
+                    default=0.0,
+                    type=float,
+                    metavar='S',
+                    help='label smoothing')
+parser.add_argument('--warm_up_epochs', default=0, type=int,
+                    help='warm up')
+
+
+warnings.filterwarnings('ignore')
+best_acc1 = 0
+
+
+def device_id_to_process_device_map(device_list):
+    devices = device_list.split(",")
+    devices = [int(x) for x in devices]
+    devices.sort()
+
+    process_device_map = dict()
+    for process_id, device_id in enumerate(devices):
+        process_device_map[process_id] = device_id
+
+    return process_device_map
+
+
+def main():
+    args = parser.parse_args()
+    print("===============main()=================")
+    print(args)
+    print("===============main()=================")
+
+    if args.seed is not None:
+        random.seed(args.seed)
+        torch.manual_seed(args.seed)
+        cudnn.deterministic = True
+        warnings.warn('You have chosen to seed training. '
+                      'This will turn on the CUDNN deterministic setting, '
+                      'which can slow down your training considerably! '
+                      'You may see unexpected behavior when restarting '
+                      'from checkpoints.')
+
+    os.environ['MASTER_ADDR'] = args.addr
+    os.environ['MASTER_PORT'] = '29688'
+
+    if args.npu is not None:
+        warnings.warn('You have chosen a specific GPU.
This will completely ' + 'disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + args.process_device_map = device_id_to_process_device_map(args.device_list) + + if args.device == 'npu': + ngpus_per_node = len(args.process_device_map) + else: + ngpus_per_node = torch.cuda.device_count() + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + # The child process uses the environment variables of the parent process, + # we have to set KERNEL_NAME_ID for every proc + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + + else: + # Simply call main_worker function + main_worker(args.npu, ngpus_per_node, args) + + +def main_worker(npu, ngpus_per_node, args): + global best_acc1 + args.npu = args.process_device_map[npu] + + if args.npu is not None: + print("[npu id:", args.npu, "]", "Use npu: {} for training".format(args.npu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + npu + + if args.device == 'npu': + dist.init_process_group(backend=args.dist_backend, # init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + else: + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + + loc = 'npu:{}'.format(args.npu) + torch.npu.set_device(loc) + + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + #model = InceptionV2(pretrained=True) + model = InceptionV2() + print("Load my train models...") + pretrained_dict = \ + torch.load("/home/Inception/model_best.pthtar", map_location="cpu")["state_dict"] + model.load_state_dict(pretrained_dict, strict=False) + else: + print("=> creating model '{}'".format(args.arch)) + model = InceptionV2() + + if args.fine_tuning: + print("=> transfer-learning mode + fine-tuning (train only the last FC layer)") + for param in model.parameters(): + param.requires_grad = False + if args.arch == 'inception_v2': + model.classifier = nn.Linear(1024, 1000) + parameters = model.classifier.parameters() + else: + print("Error:Fine-tuning is not supported on this architecture") + exit(-1) + else: + parameters = model.parameters() + + args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + + # Data loading code + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + else: + train_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, 
shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=False, sampler=train_sampler, drop_last=True) + + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder(valdir, transforms.Compose([ + transforms.Resize(342), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])), + batch_size=args.batch_size, shuffle=True, + num_workers=args.workers, pin_memory=False, drop_last=True) + + # create model + model = model.to(loc) + + # define loss function (criterion) and optimizer + + loss = nn.CrossEntropyLoss().to(loc) + if args.label_smoothing > 0.0: + loss = lambda: LabelSmoothing(loc, args.label_smoothing) + criterion = loss().to(loc) + + #optimizer = torch.optim.SGD(model.parameters(), args.lr, + #momentum=args.momentum, + #weight_decay=args.weight_decay) + optimizer = NpuFusedSGD( + model.parameters(), + lr=args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay, + ) + if args.amp: + model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale, combine_grad=True) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.npu], broadcast_buffers=False) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + if args.amp: + amp.load_state_dict(checkpoint['amp']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + + if args.evaluate: + validate(val_loader, model, criterion, args, ngpus_per_node) + return + + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + train_sampler.set_epoch(epoch) + lr = adjust_learning_rate(optimizer, epoch, args) + + steps_per_epoch = len(train_loader) + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + print("=> Epoch[%d] Setting lr: %.4f" % (epoch, lr)) + + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node) + # evaluate on validation set + acc1 = validate(val_loader, model, criterion, args, ngpus_per_node) + + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + if (epoch <= 80 and epoch % 25 == 0 ) : + if args.amp: + save_checkpoint_v1({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer': optimizer.state_dict(), + 'amp': amp.state_dict(), + },is_best) + else: + save_checkpoint_v1({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer': optimizer.state_dict(), + },is_best) + elif (epoch > 80 and epoch <= 100): + if args.amp: + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer': optimizer.state_dict(), + 'amp': amp.state_dict(), + }, is_best) + else: + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 
'best_acc1': best_acc1, + 'optimizer': optimizer.state_dict(), + }, is_best) + else: + print("Modify the number of epoches so that the total number of saved models does not exceed 20!") + + +def train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node): + batch_time = AverageMeter('Time', ':6.3f') + data_time = AverageMeter('Data', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch)) + + # switch to train mode + model.train() + end = time.time() + loc = 'npu:{}'.format(args.npu) + + # steps_per_epoch = len(train_loader) + steps_per_epoch = len(train_loader) + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + print('==========step per epoch======================', steps_per_epoch) + + for i, (images, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + target = target.to(torch.int32) + images, target = images.to(loc, non_blocking=False), target.to(loc, non_blocking=False) + + # compute output + loss,output = get_loss(model, target, images, criterion) + stream = torch.npu.current_stream() + stream.synchronize() + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + stream = torch.npu.current_stream() + stream.synchronize() + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + + if args.amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + stream = torch.npu.current_stream() + stream.synchronize() + + optimizer.step() + stream = torch.npu.current_stream() + stream.synchronize() + + optimizer.zero_grad() + stream = torch.npu.current_stream() + stream.synchronize() + + if i % args.print_freq == 0: + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + progress.display(i) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + print("[npu id:", args.npu, "]", '* FPS@all {:.3f}'.format(ngpus_per_node * args.batch_size / batch_time.avg)) + + +def validate(val_loader, model, criterion, args, ngpus_per_node): + batch_time = AverageMeter('Time', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(val_loader), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + loc = 'npu:{}'.format(args.npu) + end = time.time() + for i, (images, target) in enumerate(val_loader): + target = target.to(torch.int32) + images, target = images.to(loc, non_blocking=False), target.to(loc, non_blocking=False) + + # compute output + output = model(images) + loss = criterion(output, target) + stream = torch.npu.current_stream() + stream.synchronize() + + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + stream = torch.npu.current_stream() + stream.synchronize() + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], 
images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + progress.display(i) + + # TODO: this should also be done with the ProgressMeter + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + print("[npu id:", args.npu, "]", '[AVG-ACC] * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' + .format(top1=top1, top5=top5)) + + return top1.avg + +def get_loss(model, target, images, criterion): + output = model(images) + loss = criterion(output, target) + #loss2 = criterion(aux1, target) + # According to the paper BN auxiliary classifier + #loss = loss1 + 0.4*loss2 + return loss, output + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best_acc%.4f_epoch%d.pth.tar' % (state['best_acc1'], state['epoch'])) + +def save_checkpoint_v1(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + shutil.copyfile(filename, 'model_best_acc%.4f_epoch%d.pth.tar' % (state['best_acc1'], state['epoch'])) + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self, name, fmt=':f'): + self.name = name + self.fmt = fmt + self.reset() + self.start_count_index = 10 + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.count += n + if self.count > (self.start_count_index * n): + self.sum += val * n + self.avg = self.sum / (self.count - self.start_count_index * n) + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print("[npu id:", '0', "]", '\t'.join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + + +def adjust_learning_rate(optimizer, epoch, args): + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + if args.warm_up_epochs > 0 and epoch < args.warm_up_epochs: + lr = args.lr * ((epoch + 1) / (args.warm_up_epochs + 1)) + else: + alpha = 0 + cosine_decay = 0.5 * ( + 1 + np.cos(np.pi * (epoch - args.warm_up_epochs) / (args.epochs - args.warm_up_epochs))) + decayed = (1 - alpha) * cosine_decay + alpha + lr = args.lr * decayed + + for param_group in optimizer.param_groups: + param_group['lr'] = lr + return lr + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + +class LabelSmoothing(nn.Module): + """ + NLL loss with 
label smoothing. + """ + def __init__(self, loc, smoothing=0.0): + """ + Constructor for the LabelSmoothing module. + + :param smoothing: label smoothing factor + """ + super(LabelSmoothing, self).__init__() + self.confidence = 1.0 - smoothing + self.smoothing = smoothing + self.device = loc + + def forward(self, x, target): + target = target.to(torch.int64) + + logprobs = torch.nn.functional.log_softmax(x, dim=-1) + nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1).to(torch.int64)) + nll_loss = nll_loss.squeeze(1) + smooth_loss = -logprobs.mean(dim=-1) + loss = self.confidence * nll_loss + self.smoothing * smooth_loss + return loss.mean() + +if __name__ == '__main__': + main() diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/main.py b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/main.py new file mode 100644 index 0000000000..8726453f24 --- /dev/null +++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/main.py @@ -0,0 +1,588 @@ +# BSD 3-Clause License + +# Copyright (c) Soumith Chintala 2016, +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import yaml +import os +import random +import shutil +import time +import warnings + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.optim +import torch.multiprocessing as mp +import torch.utils.data +import torch.utils.data.distributed +import torchvision.transforms as transforms +import torchvision.datasets as datasets +import torch.npu +from InceptionV2 import InceptionV2 +import apex +from apex import amp +warnings.filterwarnings('ignore') +CALCULATE_DEVICE = "npu:0" + +if torch.__version__ >= '1.8': + import torch_npu + + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('--data', metavar='DIR', + help='path to dataset') +parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', + help='model architecture') +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 4)') +parser.add_argument('--epochs', default=90, type=int, metavar='N', + help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, + metavar='LR', help='initial learning rate', dest='lr') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + 
dest='weight_decay') +parser.add_argument('-p', '--print-freq', default=10, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', + help='evaluate model on validation set') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--world-size', default=-1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=-1, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-backend', default='nccl', type=str, + help='distributed backend') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. ') +parser.add_argument('--npu', default=None, type=int, + help='NPU id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') + +parser.add_argument('--optimizer-batch-size', + default=-1, + type=int, + metavar='N', + help= + 'size of a total batch size, for simulating bigger batches using gradient accumulation') + + +parser.add_argument('--gpu', + default=None, + type=int, + help='GPU id to use.') + +parser.add_argument('--warmup', + default=0, + type=int, + metavar='E', + help='number of warmup epochs') +parser.add_argument('--label-smoothing', + default=0.0, + type=float, + metavar='S', + help='label smoothing') + +parser.add_argument('--static-loss-scale', + type=float, + default=1, + help= + 'Static loss scale, positive power of 2 values can improve fp16 convergence.') +parser.add_argument('-t', + '--fine-tuning', + action='store_true', + help='transfer learning + fine tuning - train only the last FC layer.') + +parser.add_argument('--amp', action='store_true', help='use apex') +parser.add_argument('--pm', '--precision-mode', default='O1', type=str, + help='precision mode to use for mix precision, only support O1, O2') +parser.add_argument('--loss_scale', default=1024, type=int, help='loss_scale for amp') + +best_acc1 = 0 + + +def main(): + args = parser.parse_args() + print("=======================") + print(args) + print("=======================") + if args.npu is None: + args.npu = 0 + global CALCULATE_DEVICE + CALCULATE_DEVICE = "npu:{}".format(args.npu) + torch.npu.set_device(CALCULATE_DEVICE) + print("use ", CALCULATE_DEVICE) + + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! ' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.npu is not None: + warnings.warn('You have chosen a specific NPU. 
This will completely ' + 'disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + ngpus_per_node = torch.npu.device_count() + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(args.npu, ngpus_per_node, args) + + +def main_worker(npu, ngpus_per_node, args): + global best_acc1 + args.npu = npu + + if args.npu is not None: + print("Use NPU: {} for training".format(args.npu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + npu + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + # create model + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + #model = models.__dict__[args.arch](pretrained=True) + model = InceptionV2() + print("Load my train models...") + pretrained_dict = \ + torch.load("/home/Inception/model_best.pth.tar", map_location="cpu")["state_dict"] + model.load_state_dict(pretrained_dict, strict=False) + else: + print("=> creating model '{}'".format(args.arch)) + model = InceptionV2() + #model = models.__dict__[args.arch]() + + if args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. 
+ if args.npu is not None: + loc = 'npu:{}'.format(args.npu) + torch.npu.set_device(loc) + model.to(loc) + # When using a single NPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of NPUs we have + args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.npu]) + else: + model.cuda() + # DistributedDataParallel will divide and allocate batch_size to all + # available GPUs if device_ids are not set + model = torch.nn.parallel.DistributedDataParallel(model) + elif args.npu is not None: + loc = 'npu:{}'.format(args.npu) + torch.npu.set_device(loc) + model = model.to(loc) + else: + # DataParallel will divide and allocate batch_size to all available GPUs + if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + model.features = torch.nn.DataParallel(model.features) + model.cuda() + else: + model = model.to(CALCULATE_DEVICE) + #model = torch.nn.DataParallel(model).cuda() + + # define loss function (criterion) and optimizer + loss = nn.CrossEntropyLoss() + if args.label_smoothing > 0.0: + loss = lambda: LabelSmoothing(args.label_smoothing) + criterion = loss().to(CALCULATE_DEVICE) + optimizer = apex.optimizers.NpuFusedSGD(model.parameters(), args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + if args.amp: + print("=> use amp...") + if args.pm not in ['O1', 'O2']: + print('=>unsupported precision mode!') + exit() + opt_level = args.pm + model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level, loss_scale=args.loss_scale, combine_grad=True) + + ''' + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + if args.amp: + amp.load_state_dict(checkpoint['amp']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + ''' + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + if args.npu is None: + checkpoint = torch.load(args.resume) + else: + # Map model to be loaded to specified single npu. 
+    # optionally resume from a checkpoint
+    if args.resume:
+        if os.path.isfile(args.resume):
+            print("=> loading checkpoint '{}'".format(args.resume))
+            if args.npu is None:
+                checkpoint = torch.load(args.resume)
+            else:
+                # Map the checkpoint to the specified single NPU.
+                loc = 'npu:{}'.format(args.npu)
+                checkpoint = torch.load(args.resume, map_location=loc)
+            args.start_epoch = checkpoint['epoch']
+            best_acc1 = checkpoint['best_acc1']
+            if args.npu is not None:
+                # best_acc1 may be from a checkpoint saved on a different NPU
+                best_acc1 = best_acc1.to(loc)
+            if args.amp:
+                amp.load_state_dict(checkpoint['amp'])
+            model.load_state_dict(checkpoint['state_dict'])
+            optimizer.load_state_dict(checkpoint['optimizer'])
+            print("=> loaded checkpoint '{}' (epoch {})"
+                  .format(args.resume, checkpoint['epoch']))
+        else:
+            print("=> no checkpoint found at '{}'".format(args.resume))
+
+    cudnn.benchmark = True
+
+    # Data loading code
+    traindir = os.path.join(args.data, 'train')
+    valdir = os.path.join(args.data, 'val')
+    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
+                                     std=[0.5, 0.5, 0.5])
+
+    train_dataset = datasets.ImageFolder(
+        traindir,
+        transforms.Compose([
+            transforms.RandomResizedCrop(224),
+            transforms.RandomHorizontalFlip(),
+            transforms.ToTensor(),
+            normalize,
+        ]))
+
+    if args.distributed:
+        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+    else:
+        train_sampler = None
+
+    train_loader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
+        num_workers=args.workers, pin_memory=False, sampler=train_sampler)
+
+    val_loader = torch.utils.data.DataLoader(
+        datasets.ImageFolder(valdir, transforms.Compose([
+            transforms.Resize(341),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            normalize,
+        ])),
+        batch_size=args.batch_size, shuffle=False,
+        num_workers=args.workers, pin_memory=False)
+
+    if args.evaluate:
+        validate(val_loader, model, criterion, args)
+        return
+
+    for epoch in range(args.start_epoch, args.epochs):
+        if args.distributed:
+            train_sampler.set_epoch(epoch)
+        adjust_learning_rate(optimizer, epoch, args)
+
+        # train for one epoch
+        train(train_loader, model, criterion, optimizer, epoch, args)
+
+        # evaluate on validation set
+        acc1 = validate(val_loader, model, criterion, args)
+
+        # remember best acc@1 and save checkpoint
+        is_best = acc1 > best_acc1
+        best_acc1 = max(acc1, best_acc1)
+
+        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
+                and args.rank % ngpus_per_node == 0):
+            state = {
+                'epoch': epoch + 1,
+                'arch': args.arch,
+                'state_dict': model.state_dict(),
+                'best_acc1': best_acc1,
+                'optimizer': optimizer.state_dict(),
+            }
+            if args.amp:
+                state['amp'] = amp.state_dict()
+            save_checkpoint(state, is_best)
+
+
+def train(train_loader, model, criterion, optimizer, epoch, args):
+    batch_time = AverageMeter('Time', ':6.3f')
+    data_time = AverageMeter('Data', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(
+        len(train_loader),
+        [batch_time, data_time, losses, top1, top5],
+        prefix="Epoch: [{}]".format(epoch))
+
+    # switch to train mode
+    model.train()
+
+    # NOTE: the no-op statement guarded by "if n == 201" is intentional; the
+    # test/train_performance_*.sh scripts rewrite it via sed into an early
+    # loop exit so that performance runs stop after roughly 200 steps.
+    n = 0
+    end = time.time()
+    loc = 'npu:{}'.format(args.npu)
+    for i, (images, target) in enumerate(train_loader):
+        if n == 201:
+            pass
+        n = n + 1
+        # measure data loading time
+        data_time.update(time.time() - end)
+
+        images = images.to(loc, non_blocking=True)
+        target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True)
+
+        # compute output
+        loss, output = get_loss(model, target, images, criterion)
+        # measure accuracy and record loss
+        acc1, acc5 = accuracy(output, target, topk=(1, 5))
+        losses.update(loss.item(), images.size(0))
+        top1.update(acc1[0], images.size(0))
+        top5.update(acc5[0], images.size(0))
+
+        # compute gradient and do SGD step
+        if args.amp:
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+        else:
+            loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
+
+        if i % args.print_freq == 0:
+            progress.display(i)
+            print("batch_size:", args.batch_size, 'Time: {:.3f}'.format(batch_time.avg),
+                  '* FPS@all {:.3f}'.format(args.batch_size / batch_time.avg))
+
+
+def validate(val_loader, model, criterion, args):
+    batch_time = AverageMeter('Time', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(
+        len(val_loader),
+        [batch_time, losses, top1, top5],
+        prefix='Test: ')
+
+    # switch to evaluate mode
+    model.eval()
+    with torch.no_grad():
+        end = time.time()
+        for i, (images, target) in enumerate(val_loader):
+            images = images.to(CALCULATE_DEVICE, non_blocking=False)
+            target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=False)
+            # compute output
+            output = model(images)
+            loss = criterion(output, target)
+            # measure accuracy and record loss
+            acc1, acc5 = accuracy(output, target, topk=(1, 5))
+            losses.update(loss.item(), images.size(0))
+            top1.update(acc1[0], images.size(0))
+            top5.update(acc5[0], images.size(0))
+
+            # measure elapsed time
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+            if i % args.print_freq == 0:
+                progress.display(i)
+
+        # TODO: this should also be done with the ProgressMeter
+        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
+              .format(top1=top1, top5=top5))
+
+    return top1.avg
+
+
+def get_loss(model, target, images, criterion):
+    output = model(images)
+    loss = criterion(output, target)
+    # The BN-auxiliary classifier loss from the paper (loss1 + 0.4 * loss2)
+    # is not used in this implementation.
+    return loss, output
+
+
+def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
+    torch.save(state, filename)
+    if is_best:
+        shutil.copyfile(filename, 'model_best.pth.tar')
+
+
+class AverageMeter(object):
+    """Computes and stores the average and current value"""
+    def __init__(self, name, fmt=':f'):
+        self.name = name
+        self.fmt = fmt
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+    def __str__(self):
+        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
+        return fmtstr.format(**self.__dict__)
+
+
+class ProgressMeter(object):
+    def __init__(self, num_batches, meters, prefix=""):
+        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
+        self.meters = meters
+        self.prefix = prefix
+
+    def display(self, batch):
+        entries = [self.prefix + self.batch_fmtstr.format(batch)]
+        entries += [str(meter) for meter in self.meters]
+        print('\t'.join(entries))
+
+    def _get_batch_fmtstr(self, num_batches):
+        num_digits = len(str(num_batches))
+        fmt = '{:' + str(num_digits) + 'd}'
+        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
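+
+
+# Worked example of the step schedule below: with the 8p default
+# --learning-rate 0.72, epochs 0-29 run at lr=0.72, epochs 30-59 at 0.072,
+# epochs 60-89 at 0.0072, and epochs 90+ at 0.00072
+# (lr = base_lr * 0.1 ** (epoch // 30)).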
+def adjust_learning_rate(optimizer, epoch, args):
+    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
+    lr = args.lr * (0.1 ** (epoch // 30))
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
+
+
+def accuracy(output, target, topk=(1,)):
+    """Computes the accuracy over the k top predictions for the specified values of k"""
+    with torch.no_grad():
+        maxk = max(topk)
+        batch_size = target.size(0)
+
+        _, pred = output.topk(maxk, 1, True, True)
+        pred = pred.t()
+        correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+        res = []
+        for k in topk:
+            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
+            res.append(correct_k.mul_(100.0 / batch_size))
+        return res
+
+
+class LabelSmoothing(nn.Module):
+    """
+    NLL loss with label smoothing.
+    """
+    def __init__(self, smoothing=0.0):
+        """
+        Constructor for the LabelSmoothing module.
+
+        :param smoothing: label smoothing factor
+        """
+        super(LabelSmoothing, self).__init__()
+        self.confidence = 1.0 - smoothing
+        self.smoothing = smoothing
+
+    def forward(self, x, target):
+        target = target.to(torch.int64)
+
+        logprobs = torch.nn.functional.log_softmax(x, dim=-1)
+        index = target.unsqueeze(1)
+
+        nll_loss = -logprobs.gather(dim=-1, index=index)
+        nll_loss = nll_loss.squeeze(1)
+        smooth_loss = -logprobs.mean(dim=-1)
+        loss = self.confidence * nll_loss + self.smoothing * smooth_loss
+
+        return loss.mean()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/modelzoo_level.txt b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/modelzoo_level.txt
new file mode 100644
index 0000000000..41d561e947
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/modelzoo_level.txt
@@ -0,0 +1,5 @@
+GPUStatus:OK
+NPUMigrationStatus:OK
+FuncStatus:OK
+PerfStatus:OK
+PrecisionStatus:OK
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/pthtar2onnx.py b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/pthtar2onnx.py
new file mode 100644
index 0000000000..df26ace55e
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/pthtar2onnx.py
@@ -0,0 +1,49 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import torch
+import torch.onnx
+
+from collections import OrderedDict
+
+# NOTE: the original script imported a torchvision-style inception_v3, which
+# is not shipped in this repo; we assume the InceptionV2 class defined in the
+# local InceptionV2.py matches the trained checkpoint.
+from InceptionV2 import InceptionV2
+
+
+def proc_node_module(checkpoint, AttrName):
+    # strip the "module." prefix that DistributedDataParallel adds to keys
+    new_state_dict = OrderedDict()
+    for k, v in checkpoint[AttrName].items():
+        if k.startswith("module."):
+            name = k[7:]
+        else:
+            name = k
+        new_state_dict[name] = v
+    return new_state_dict
+
+
+def convert():
+    checkpoint = torch.load("./model_best.pth.tar", map_location='cpu')
+    checkpoint['state_dict'] = proc_node_module(checkpoint, 'state_dict')
+    model = InceptionV2()
+    model.load_state_dict(checkpoint['state_dict'])
+    model.eval()
+
+    input_names = ["actual_input_1"]
+    output_names = ["output1"]
+    # 224x224 matches the training/eval crop used in main.py
+    dummy_input = torch.randn(16, 3, 224, 224)
+    torch.onnx.export(model, dummy_input, "inception_npu_16.onnx", input_names=input_names,
+                      output_names=output_names, opset_version=11)
+
+
+if __name__ == "__main__":
+    convert()
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/requirements.txt b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/requirements.txt
new file mode 100644
index 0000000000..d93a42f0f3
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/requirements.txt
@@ -0,0 +1 @@
+torchvision==0.6.0
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/env_npu.sh b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/env_npu.sh
new file mode 100644
index 0000000000..4cde526129
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/env_npu.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info'
+
+if [ -f $CANN_INSTALL_PATH_CONF ]; then
+    CANN_INSTALL_PATH=$(cat $CANN_INSTALL_PATH_CONF | grep Install_Path | cut -d "=" -f 2)
+else
+    CANN_INSTALL_PATH="/usr/local/Ascend"
+fi
+
+if [ -d ${CANN_INSTALL_PATH}/ascend-toolkit/latest ]; then
+    source ${CANN_INSTALL_PATH}/ascend-toolkit/set_env.sh
+else
+    source ${CANN_INSTALL_PATH}/nnae/set_env.sh
+fi
+
+#设置device侧日志等级为error
+msnpureport -g error -d 0
+msnpureport -g error -d 1
+msnpureport -g error -d 2
+msnpureport -g error -d 3
+msnpureport -g error -d 4
+msnpureport -g error -d 5
+msnpureport -g error -d 6
+msnpureport -g error -d 7
+#关闭Device侧Event日志
+msnpureport -e disable
+
+
+#将Host日志输出到串口,0-关闭/1-开启
+export ASCEND_SLOG_PRINT_TO_STDOUT=0
+#设置默认日志级别,0-debug/1-info/2-warning/3-error
+export ASCEND_GLOBAL_LOG_LEVEL=3
+#设置Event日志开启标志,0-关闭/1-开启
+export ASCEND_GLOBAL_EVENT_ENABLE=0
+#设置是否开启taskque,0-关闭/1-开启
+export TASK_QUEUE_ENABLE=1
+#设置是否开启PTCopy,0-关闭/1-开启
+export PTCOPY_ENABLE=1
+#设置是否开启combined标志,0-关闭/1-开启
+export COMBINED_ENABLE=1
+#设置特殊场景是否需要重新编译,不需要修改
+export DYNAMIC_OP="ADD#MUL"
+#HCCL白名单开关,1-关闭/0-开启
+export HCCL_WHITELIST_DISABLE=1
+export HCCL_IF_IP=$(hostname -I |awk '{print $1}')
+
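+#说明:下面内嵌的Python片段会遍历sys.path,收集site-packages对应的lib目录
+#及其torch/lib子目录,用':'拼接后加入LD_LIBRARY_PATH,供NPU运行时加载依赖库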
+path_lib=$(python3.7 -c """
+import sys
+import re
+result=''
+for index in range(len(sys.path)):
+    match_sit = re.search('-packages', sys.path[index])
+    if match_sit is not None:
+        match_lib = re.search('lib', sys.path[index])
+
+        if match_lib is not None:
+            end=match_lib.span()[1]
+            result += sys.path[index][0:end] + ':'
+
+        result+=sys.path[index] + '/torch/lib:'
+print(result)"""
+)
+
+echo ${path_lib}
+
+export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_full_8p.sh b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_full_8p.sh
new file mode 100644
index 0000000000..45eeeb1cb2
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_full_8p.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+
+#当前路径,不需要修改
+cur_path=`pwd`
+
+#集合通信参数,不需要修改
+
+export RANK_SIZE=8
+export JOB_ID=10087
+RANK_ID_START=0
+# source env.sh
+RANK_SIZE=8
+# 数据集路径,保持为空,不需要修改
+data_path=""
+
+#设置默认日志级别,不需要修改
+# export ASCEND_GLOBAL_LOG_LEVEL=3
+
+#基础参数,需要模型审视修改
+#网络名称,同目录名称
+Network="Inception_v2_231"
+#训练epoch
+train_epochs=100
+#训练batch_size
+batch_size=2048
+#训练step
+train_steps=`expr 1281167 / ${batch_size}`
+#学习率
+learning_rate=0.72
+
+
+
+#维测参数,precision_mode需要模型审视修改
+precision_mode="amp"
+#维持参数,以下不需要修改
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+if [[ $1 == --help || $1 == --h ]];then
+    echo "usage:./train_full_8p.sh --data_path=data_dir --batch_size=2048 --learning_rate=0.72"
+    exit 1
+fi
+
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --batch_size* ]];then
+        batch_size=`echo ${para#*=}`
+    elif [[ $para == --learning_rate* ]];then
+        learning_rate=`echo ${para#*=}`
+    elif [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    fi
+done
+
+PREC=""
+if [[ $precision_mode == "amp" ]];then
+    PREC="--amp"
+fi
+
+#校验是否传入data_path,不需要修改
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be specified"
+    exit 1
+fi
+
+cd $cur_path
+
+#设置环境变量,不需要修改
+echo "Device ID: $ASCEND_DEVICE_ID"
+export RANK_ID=$RANK_ID
+
+if [ -d $cur_path/output_8p ];then
+    rm -rf $cur_path/output_8p/*
+    mkdir -p $cur_path/output_8p/$ASCEND_DEVICE_ID
+else
+    mkdir -p $cur_path/output_8p/$ASCEND_DEVICE_ID
+fi
+wait
+
+#训练开始时间,不需要修改
+start_time=$(date +%s)
+# 非平台场景时 source 环境变量
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ]; then
+    source ${cur_path}/test/env_npu.sh
+fi
+
+nohup python3.7 main-8p.py \
+    -a inception_v2 \
+    ${PREC} \
+    --loss-scale 128 \
+    --data ${data_path} \
+    --addr=$(hostname -I |awk '{print $1}') \
+    --seed=49 \
+    --workers=184 \
+    --learning-rate=${learning_rate} \
+    --mom=0.9 \
+    --weight-decay=1.0e-04 \
+    --print-freq=30 \
+    --dist-url='tcp://127.0.0.1:50000' \
+    --dist-backend='hccl' \
+    --multiprocessing-distributed \
+    --world-size=1 \
+    --rank=0 \
+    --device='npu' \
+    --epochs=$train_epochs \
+    --label-smoothing=0.1 \
+    --batch-size=${batch_size} > $cur_path/output_8p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+wait
+
+#训练结束时间,不需要修改
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#结果打印,不需要修改
+echo "------------------ Final result ------------------"
+#输出性能FPS,需要模型审视修改
+Timeavg=`grep "Epoch:" $cur_path/output_8p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F " " '{print $8}' |tail -n +2|awk '{sum+=$1} END {print sum/NR}'|sed s/[[:space:]]//g `
+Timeavg=`awk 'BEGIN{printf "%.2f\n",'$Timeavg'}'`
+FPS=`awk 'BEGIN{printf "%.2f\n",'$batch_size'/'$Timeavg'}'`
+#打印,不需要修改
+echo "Final Performance images/sec : $FPS"
+
+#输出训练精度,需要模型审视修改
+train_accuracy=`grep -a '* Acc@1' $cur_path/output_8p/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'`
+
+#打印,不需要修改
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+#性能看护结果汇总
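+#换算说明(示例,假设batch_size=2048):FPS = batch_size / 平均单步耗时(秒),
+#如平均单步0.32秒时FPS约为6400 images/sec;下方TrainingTime = batch_size*1000/FPS,
+#即平均单步耗时的毫秒数,本例约为320ms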
+#训练用例信息,不需要修改
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+##获取性能数据,不需要修改
+#吞吐量
+ActualFPS=${FPS}
+#单迭代训练时长
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视
+grep Epoch: $cur_path/output_8p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output_8p/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#最后一个迭代loss值,不需要修改
+ActualLoss=`awk 'END {print}' $cur_path/output_8p/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+#关键信息打印到${CaseName}.log中,不需要修改
+echo "Network = ${Network}" > $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_performance_1p.sh
new file mode 100644
index 0000000000..de9bbe468a
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_performance_1p.sh
@@ -0,0 +1,171 @@
+#!/bin/bash
+
+#当前路径,不需要修改
+cur_path=`pwd`
+
+#集合通信参数,不需要修改
+
+export RANK_SIZE=1
+export JOB_ID=10087
+export ASCEND_DEVICE_ID=2
+RANK_ID_START=0
+# source env.sh
+RANK_SIZE=1
+# 数据集路径,保持为空,不需要修改
+data_path=""
+
+#设置默认日志级别,不需要修改
+# export ASCEND_GLOBAL_LOG_LEVEL=3
+
+#基础参数,需要模型审视修改
+#网络名称,同目录名称
+Network="Inception_v2_231"
+#训练epoch
+train_epochs=1
+#训练batch_size
+batch_size=256
+#训练step
+train_steps=`expr 1281167 / ${batch_size}`
+#学习率
+learning_rate=0.045
+
+
+
+#维测参数,precision_mode需要模型审视修改
+precision_mode="amp"
+#维持参数,以下不需要修改
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+
+if [[ $1 == --help || $1 == --h ]];then
+    echo "usage:./train_performance_1p.sh --data_path=data_dir --batch_size=256 --learning_rate=0.045"
+    exit 1
+fi
+
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --batch_size* ]];then
+        batch_size=`echo ${para#*=}`
+    elif [[ $para == --learning_rate* ]];then
+        learning_rate=`echo ${para#*=}`
+    elif [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    fi
+done
+
+PREC=""
+if [[ $precision_mode == "amp" ]];then
+    PREC="--amp"
+fi
+
+#校验是否传入data_path,不需要修改
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be specified"
+    exit 1
+fi
+
+cd $cur_path
+
+#将main.py训练循环中的占位语句临时改写为break,使性能测试在约200个step后提前退出,训练结束后还原
+sed -i "s|pass|break|g" main.py
+
+#设置环境变量,不需要修改
+echo "Device ID: $ASCEND_DEVICE_ID"
+export RANK_ID=$RANK_ID
+
+if [ -d $cur_path/output_1p ];then
+    rm -rf $cur_path/output_1p/*
+    mkdir -p $cur_path/output_1p/$ASCEND_DEVICE_ID
+else
+    mkdir -p $cur_path/output_1p/$ASCEND_DEVICE_ID
+fi
+wait
+
+#训练开始时间,不需要修改
+start_time=$(date +%s)
+# 非平台场景时 source 环境变量
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ]; then
+    source ${cur_path}/test/env_npu.sh
+fi
+
+nohup python3.7 main.py \
+    --data ${data_path} \
+    --npu ${ASCEND_DEVICE_ID} \
+    -a inception_v2 \
+    -b ${batch_size} \
+    --lr ${learning_rate} \
+    --epochs $train_epochs \
+    -j 128 \
+    -p 10 \
+    ${PREC} \
+    --label-smoothing 0.1 \
+    --wd 0.0002 > $cur_path/output_1p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+wait
+
+sed -i "s|break|pass|g" main.py
+
+#训练结束时间,不需要修改
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#结果打印,不需要修改
+echo "------------------ Final result ------------------"
+#输出性能FPS,需要模型审视修改
+Timeavg=`grep "Epoch:" $cur_path/output_1p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F " " '{print $5}' |tail -n +2|awk '{sum+=$1} END {print sum/NR}'|sed s/[[:space:]]//g `
+Timeavg=`awk 'BEGIN{printf "%.2f\n",'$Timeavg'}'`
+FPS=`awk 'BEGIN{printf "%.2f\n",'$batch_size'/'$Timeavg'}'`
+#打印,不需要修改
+echo "Final Performance images/sec : $FPS"
+
+#输出训练精度,需要模型审视修改
+train_accuracy=`grep -a '* Acc@1' $cur_path/output_1p/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'`
+
+#打印,不需要修改
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+#性能看护结果汇总
+#训练用例信息,不需要修改
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+##获取性能数据,不需要修改
+#吞吐量
+ActualFPS=${FPS}
+#单迭代训练时长
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视
+grep Epoch: $cur_path/output_1p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "Loss" '{print$2}' | awk -F "e" '{print$1}' > $cur_path/output_1p/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#最后一个迭代loss值,不需要修改
+ActualLoss=`awk 'END {print}' $cur_path/output_1p/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+#关键信息打印到${CaseName}.log中,不需要修改
+echo "Network = ${Network}" > $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output_1p/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_performance_8p.sh
new file mode 100644
index 0000000000..19291390e6
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/test/train_performance_8p.sh
@@ -0,0 +1,169 @@
+#!/bin/bash
+
+#当前路径,不需要修改
+cur_path=`pwd`
+
+#集合通信参数,不需要修改
+
+export RANK_SIZE=8
+export JOB_ID=10087
+RANK_ID_START=0
+# source env.sh
+RANK_SIZE=8
+# 数据集路径,保持为空,不需要修改
+data_path=""
+
+#设置默认日志级别,不需要修改
+# export ASCEND_GLOBAL_LOG_LEVEL=3
+
+#基础参数,需要模型审视修改
+#网络名称,同目录名称
+Network="Inception_v2_231"
+#训练epoch
+train_epochs=1
+#训练batch_size
+batch_size=2048
+#训练step
+train_steps=`expr 1281167 / ${batch_size}`
+#学习率
+learning_rate=0.72
+
+
+
+#维测参数,precision_mode需要模型审视修改
+precision_mode="amp"
+#维持参数,以下不需要修改
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+
+if [[ $1 == --help || $1 == --h ]];then
+    echo "usage:./train_performance_8p.sh --data_path=data_dir --batch_size=2048 --learning_rate=0.72"
+    exit 1
+fi
+
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --batch_size* ]];then
+        batch_size=`echo ${para#*=}`
+    elif [[ $para == --learning_rate* ]];then
+        learning_rate=`echo ${para#*=}`
+    elif [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    fi
+done
+PREC=""
+if [[ $precision_mode == "amp" ]];then
+    PREC="--amp"
+fi
+
+
+#校验是否传入data_path,不需要修改
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be specified"
+    exit 1
+fi
+
+cd $cur_path
+
+#将main.py训练循环中的占位语句临时改写为break,使性能测试在约200个step后提前退出,训练结束后还原
+sed -i "s|pass|break|g" main.py
+
+#设置环境变量,不需要修改
+echo "Device ID: $ASCEND_DEVICE_ID"
+export RANK_ID=$RANK_ID
+
+if [ -d $cur_path/output_8p ];then
+    rm -rf $cur_path/output_8p/*
+    mkdir -p $cur_path/output_8p/$ASCEND_DEVICE_ID
+else
+    mkdir -p $cur_path/output_8p/$ASCEND_DEVICE_ID
+fi
+wait
+
+#训练开始时间,不需要修改
+start_time=$(date +%s)
+# 非平台场景时 source 环境变量
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ]; then
+    source ${cur_path}/test/env_npu.sh
+fi
+
+nohup python3.7 main-8p.py \
+    -a inception_v2 \
+    ${PREC} \
+    --loss-scale 128 \
+    --data ${data_path} \
+    --addr=$(hostname -I |awk '{print $1}') \
+    --seed=49 \
+    --workers=184 \
+    --learning-rate=${learning_rate} \
+    --mom=0.9 \
+    --weight-decay=1.0e-04 \
+    --print-freq=30 \
+    --dist-url='tcp://127.0.0.1:50000' \
+    --dist-backend='hccl' \
+    --multiprocessing-distributed \
+    --world-size=1 \
+    --rank=0 \
+    --device='npu' \
+    --epochs=$train_epochs \
+    --label-smoothing=0.1 \
+    --batch-size=${batch_size} > $cur_path/output_8p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+wait
+
+sed -i "s|break|pass|g" main.py
+
+#训练结束时间,不需要修改
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#结果打印,不需要修改
+echo "------------------ Final result ------------------"
+#输出性能FPS,需要模型审视修改
+Timeavg=`grep "Epoch:" $cur_path/output_8p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "Time" '{print $2}'|awk -F "(" '{print $1}' |tail -n +2|awk '{sum+=$1} END {print sum/NR}'|sed s/[[:space:]]//g `
+Timeavg=`awk 'BEGIN{printf "%.2f\n",'$Timeavg'}'`
+FPS=`awk 'BEGIN{printf "%.2f\n",'$batch_size'/'$Timeavg'}'`
+#打印,不需要修改
+echo "Final Performance images/sec : $FPS"
+
+#输出训练精度,需要模型审视修改
+train_accuracy=`grep -a '* Acc@1' 
$cur_path/output_8p/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` + +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Epoch: $cur_path/output_8p/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output_8p/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output_8p/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output_8p/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From d264a76714d4af35b46eb96aa95dfe9eb26ae4e0 Mon Sep 17 00:00:00 2001 From: sysulyccc Date: Thu, 22 Dec 2022 08:39:32 +0000 Subject: [PATCH 2/2] update PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md. 
Signed-off-by: sysulyccc --- .../Inception_v2_231_for_Pytorch/README.md | 96 +++++++++++++------ 1 file changed, 69 insertions(+), 27 deletions(-) diff --git a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md index ebe02e8775..4cb7d8623e 100644 --- a/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md +++ b/PyTorch/contrib/cv/classification/Inception_v2_231_for_Pytorch/README.md @@ -1,4 +1,4 @@ -# Inception_v2_231 +# Inception_v2_231 for PyTorch - [概述](#概述) - [准备训练环境](#准备训练环境) - [开始训练](#开始训练) @@ -8,12 +8,13 @@ ## 简述 -InceptionV2主要是在GoogLeNet的基础上添加了 BN 层,并且采用 VGG 的思想,利用两个小卷积核代替大卷积核,在保持相同感受野的同时减少参数,并提高非线性表示能力。 +InceptionV2主要是在GoogLeNet的基础上添加了 BN 层,并且采用 VGG 的思想,利用两个小卷积核代替大卷积核,在保持相同感受野的同时减少参数,并提高非线性表示能力 - 参考实现: ``` - https://github.com/shanglianlm0525/PyTorch-Networks/blob/master/ClassicNetwork/InceptionV2.py + url=https://github.com/shanglianlm0525/PyTorch-Networks/blob/master/ClassicNetwork/InceptionV2.py + commit_id=298bc76761d34e472fdc73615f50a5a9afd7e8b9 ``` - 适配昇腾 AI 处理器的实现: @@ -36,15 +37,16 @@ InceptionV2主要是在GoogLeNet的基础上添加了 BN 层,并且采用 VGG ## 准备环境 -- 当前模型支持的固件与驱动、 CANN 以及 PyTorch 如下表所示 +- 当前模型支持的硬件、NPU固件驱动、CANN 和 PyTorch 如下表所示 **表 1** 版本配套表 - | 配套 | 版本 | - | ---------- | ------------------------------------------------------------ | - | 固件与驱动 | [1.0.9](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) | - | CANN | [3.2.1](https://www.hiascend.com/software/cann/commercial?version=3.2.1) | - | PyTorch | [1.5.0](https://gitee.com/ascend/pytorch/tree/v1.5.0/) | + | 配套 | 版本 | + | ------------- | ------------------------------------------------------------ | + | 硬件 | [1.0.17](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) | + | NPU固件与驱动 | [6.0.RC1](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) | + | CANN | [6.0.RC1](https://www.hiascend.com/software/cann/commercial?version=6.0.RC1) | + | PyTorch | [1.8.1](https://gitee.com/ascend/pytorch/tree/master/) | - 环境准备指导。 @@ -89,35 +91,75 @@ InceptionV2主要是在GoogLeNet的基础上添加了 BN 层,并且采用 VGG ``` > **说明:** - >数据集路径以用户自行定义的路径为准 + > 数据集路径以用户自行定义的路径为准 # 开始训练 ## 训练模型 -```bash -# prefomance training 1p -bash ./test/train_performance_1p.sh --data_path=/opt/npu/dataset/imagenet -# prefomance training 8p -bash ./test/train_performance_8p.sh --data_path=/opt/npu/dataset/imagenet +1. 进入解压后的源码包根目录 -# full training 1p -bash ./test/train_full_1p.sh --data_path=/opt/npu/dataset/imagenet + ```bash + cd /${模型文件夹名称} + ``` -# full training 8p -bash ./test/train_full_8p.sh --data_path=/opt/npu/dataset/imagenet +2. 
运行训练脚本。
+
+   该模型支持单机单卡性能测试,以及单机8卡训练与性能测试。
+
+   * 单机单卡性能
+
+     ```bash
+     # performance 1p, --data_path填写数据集路径, 输出日志在./output_1p
+     bash ./test/train_performance_1p.sh --data_path=real_data_path
+     ```
+
+   * 单机8卡
+
+     ```bash
+     # full 8p, --data_path填写数据集路径, 输出日志在./output_8p
+     bash ./test/train_full_8p.sh --data_path=real_data_path
+
+     # performance 8p, --data_path填写数据集路径, 输出日志在./output_8p
+     bash ./test/train_performance_8p.sh --data_path=real_data_path
+     ```
+
+模型训练脚本参数说明如下:
+
+```
+--data_path                      //数据集路径
+--addr                           //主机地址
+--workers                        //加载数据进程数
+--learning-rate                  //初始学习率
+--mom                            //动量
+--weight-decay                   //权重衰减
+--multiprocessing-distributed    //是否使用多卡训练
+--epochs                         //训练轮数
+--batch-size                     //训练批次大小
+--device                         //训练设备类型
+--amp                            //是否使用混合精度
+--opt-level                      //混合精度类型
+```
+
+训练完成后,权重文件保存在当前路径下,精度和性能日志保存在output_1p或output_8p目录下。
+
 # 训练结果展示
 
 **表 2** 训练结果展示表
 
-| NAME | LOSS | FPS | Epochs | AMP_Type | ACC@1 |
-| :----: | :----: | :-----: | :----: | :------: | :----: |
-| 1p-GPU | 7.5401 | 793.85 | 1 | - | - |
-| 1p-NPU | 7.6044 | 942.15 | 1 | O1 | - |
-| 8p-GPU | 2.3929 | 5752.80 | 100 | - | 68.654 |
-| 8p-NPU | 2.1992 | 6395.45 | 100 | O1 | 68.663 |
+| NAME   | LOSS   | FPS   | Epochs | AMP_Type | ACC@1  |
+| :----: | :----: | :---: | :----: | :------: | :----: |
+| 1p-GPU | 7.5401 | 652   | 1      | -        | -      |
+| 1p-NPU | 7.6044 | 1280  | 1      | O1       | -      |
+| 8p-GPU | 2.3929 | 5752  | 100    | -        | 68.654 |
+| 8p-NPU | 2.1992 | 6400  | 100    | O1       | 68.663 |
+
+## 版本说明
+
+### 变更
+
+2022.9.29:首次发布
+
+### 已知问题
+
+无
\ No newline at end of file
-- 
Gitee