From 81b1a7a1397d865ee52ccebb65e48839f54816ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=A9=AC=E6=98=95=E9=AA=85?= <10760252+mxhua@user.noreply.gitee.com> Date: Mon, 9 May 2022 06:20:04 +0000 Subject: [PATCH] =?UTF-8?q?[=E4=B8=9C=E5=8C=97=E5=A4=A7=E5=AD=A6][?= =?UTF-8?q?=E9=AB=98=E6=A0=A1=E8=B4=A1=E7=8C=AE][PyTorch=E7=A6=BB=E7=BA=BF?= =?UTF-8?q?=E6=8E=A8=E7=90=86][DnCNN]-=E5=88=9D=E6=AC=A1=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DnCNN/DnCNN_postprocess.py | 103 +++++++++++ DnCNN/DnCNN_preprocess.py | 79 +++++++++ DnCNN/DnCNN_pth2onnx.py | 80 +++++++++ DnCNN/READEME.md | 350 +++++++++++++++++++++++++++++++++++++ DnCNN/env.sh | 8 + DnCNN/get_info.py | 60 +++++++ DnCNN/modelzoo_level.txt | 14 ++ DnCNN/requirements.txt | 6 + DnCNN/test/perf_bs1.sh | 6 + DnCNN/test/perf_bs16.sh | 6 + DnCNN/test/perf_g.sh | 1 + DnCNN/test/pth2om_bs1.sh | 3 + DnCNN/test/pth2om_bs16.sh | 3 + 13 files changed, 719 insertions(+) create mode 100644 DnCNN/DnCNN_postprocess.py create mode 100644 DnCNN/DnCNN_preprocess.py create mode 100644 DnCNN/DnCNN_pth2onnx.py create mode 100644 DnCNN/READEME.md create mode 100644 DnCNN/env.sh create mode 100644 DnCNN/get_info.py create mode 100644 DnCNN/modelzoo_level.txt create mode 100644 DnCNN/requirements.txt create mode 100644 DnCNN/test/perf_bs1.sh create mode 100644 DnCNN/test/perf_bs16.sh create mode 100644 DnCNN/test/perf_g.sh create mode 100644 DnCNN/test/pth2om_bs1.sh create mode 100644 DnCNN/test/pth2om_bs16.sh diff --git a/DnCNN/DnCNN_postprocess.py b/DnCNN/DnCNN_postprocess.py new file mode 100644 index 0000000000..82ce433eb3 --- /dev/null +++ b/DnCNN/DnCNN_postprocess.py @@ -0,0 +1,103 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def bin2npy(filepath):
    """Load a raw float32 ``.bin`` dump as a ``(1, 1, 481, 481)`` array.

    Args:
        filepath: Path of a binary file holding native-endian 32-bit floats.

    Returns:
        A float64 ``numpy.ndarray`` of shape ``(1, 1, 481, 481)``.

    Raises:
        ValueError: If the file does not hold exactly 481 * 481 values.
    """
    # One bulk read replaces the per-value read()/struct.unpack loop; the file
    # is opened and closed inside np.fromfile, so no handle can leak on error.
    values = np.fromfile(filepath, dtype=np.float32)
    # The element-wise loader built the array from Python floats, i.e. float64.
    # Keep that dtype so downstream code sees the same array type as before.
    return values.astype(np.float64).reshape(1, 1, 481, 481)
if __name__ == "__main__":
    # Usage: python DnCNN_postprocess.py <result_dir>
    # <result_dir> is the directory holding the inference output *.bin files.
    try:
        Result_path = sys.argv[1]
    except IndexError:
        print("Stopped!")
        sys.exit(1)

    if not os.path.exists(Result_path):
        print("Result path doesn't exist.")
        # Abort here: previously execution fell through to main() even though
        # the directory was known to be missing.
        sys.exit(1)

    main(Result_path)
def proprecess(data_path, ISource_bin, INoisy_bin):
    """Convert the PNG test images into clean and noisy float32 ``.bin`` files.

    Every ``*.png`` under ``<data_path>/<infer_data>`` is read, its first
    channel is scaled with ``normalize`` and the result is zero-padded
    (centred) onto a 481x481 canvas.  Gaussian noise with standard deviation
    ``infer_noiseL / 255`` is added to build the noisy counterpart.  Both
    arrays are written with shape ``(1, 1, 481, 481)``.

    Args:
        data_path: Directory that contains the ``infer_data`` sub-directory.
        ISource_bin: Output directory for the clean images.
        INoisy_bin: Output directory for the noisy images.

    Raises:
        IOError: If an image cannot be read by OpenCV.
        ValueError: If an image is larger than 481 pixels in either dimension.
    """
    target = 481

    # load data info
    print('Loading data info ...\n')
    files = sorted(glob.glob(os.path.join(data_path, infer_data, '*.png')))

    for path in files:
        # splitext only strips the extension, so a name such as "a.b.png"
        # maps to "a.b.bin" instead of being truncated to "a.bin".
        stem = os.path.splitext(os.path.basename(path))[0]

        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise IOError("Failed to read image: %s" % path)
        img = normalize(np.float32(img[:, :, 0]))

        height, width = img.shape
        if height > target or width > target:
            raise ValueError(
                "Image %s is %dx%d, larger than the %dx%d model input"
                % (path, width, height, target, target))

        # Centre the image on a zero-filled canvas of the model input size.
        top = (target - height) // 2
        left = (target - width) // 2
        img_padded = np.zeros([target, target], dtype=np.float32)
        img_padded[top:top + height, left:left + width] = img

        # Add the leading batch and channel axes: (481, 481) -> (1, 1, 481, 481).
        img = np.expand_dims(img_padded, 0)
        img = np.expand_dims(img, 1)

        ISource = torch.Tensor(img)
        # noise
        noise = torch.FloatTensor(ISource.size()).normal_(mean=0, std=infer_noiseL / 255.)
        # noisy image
        INoisy = ISource + noise

        # save ISource_bin
        ISource = ISource.numpy()
        print("ISource shape is", ISource.shape)
        ISource.tofile(os.path.join(ISource_bin, stem + '.bin'))

        # save INoisy_bin
        INoisy = INoisy.numpy()
        print("INoisy shape is", INoisy.shape)
        INoisy.tofile(os.path.join(INoisy_bin, stem + '.bin'))
class DnCNN(nn.Module):
    """Sequential stack of bias-free 3x3 convolutions with 64 feature maps.

    Layout: Conv+ReLU, then ``num_of_layers - 2`` repetitions of
    Conv+BatchNorm+ReLU, then a final Conv that maps back to ``channels``.

    Args:
        channels: Number of channels of both the input and the output.
        num_of_layers: Total number of convolution layers.
    """

    def __init__(self, channels, num_of_layers=17):
        super(DnCNN, self).__init__()
        features = 64

        def conv(n_in, n_out):
            # Every convolution shares the same geometry: 3x3, pad 1, no bias.
            return nn.Conv2d(in_channels=n_in, out_channels=n_out,
                             kernel_size=3, padding=1, bias=False)

        modules = [conv(channels, features), nn.ReLU(inplace=True)]
        for _ in range(num_of_layers - 2):
            modules += [
                conv(features, features),
                nn.BatchNorm2d(features),
                nn.ReLU(inplace=True),
            ]
        modules.append(conv(features, channels))

        # The attribute name fixes the state-dict prefix ("dncnn.<idx>.*").
        self.dncnn = nn.Sequential(*modules)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        return self.dncnn(x)
1, 481, 481) + #torch.onnx.export(model, dummy_input, onnx_file, input_names = input_names, opset_version=11, verbose=True) + dynamic_axes = {'actual_input_1': {0: '-1'}} + torch.onnx.export(model, dummy_input, onnx_file, dynamic_axes=dynamic_axes, \ + input_names=input_names, opset_version=11) + +if __name__ == "__main__": + + pth_file = sys.argv[1] + onnx_file = sys.argv[2] + + convert(pth_file, onnx_file) diff --git a/DnCNN/READEME.md b/DnCNN/READEME.md new file mode 100644 index 0000000000..65429d8a5c --- /dev/null +++ b/DnCNN/READEME.md @@ -0,0 +1,350 @@ +# DnCNN Onnx模型端到端推理指导 +- [1 模型概述](#1-模型概述) + - [1.1 论文地址](#11-论文地址) + - [1.2 代码地址](#12-代码地址) +- [2 环境说明](#2-环境说明) + - [2.1 深度学习框架](#21-深度学习框架) + - [2.2 python第三方库](#22-python第三方库) +- [3 模型转换](#3-模型转换) + - [3.1 pth转onnx模型](#31-pth转onnx模型) + - [3.2 onnx转om模型](#32-onnx转om模型) +- [4 数据集预处理](#4-数据集预处理) + - [4.1 数据集获取](#41-数据集获取) + - [4.2 数据集预处理](#42-数据集预处理) + - [4.3 生成数据集信息文件](#43-生成数据集信息文件) +- [5 离线推理](#5-离线推理) + - [5.1 benchmark工具概述](#51-benchmark工具概述) + - [5.2 离线推理](#52-离线推理) +- [6 精度对比](#6-精度对比) + - [6.1 离线推理TopN精度统计](#61-离线推理TopN精度统计) + - [6.2 开源TopN精度](#62-开源TopN精度) + - [6.3 精度对比](#63-精度对比) +- [7 性能对比](#7-性能对比) + - [7.1 npu性能数据](#71-npu性能数据) + - [7.2 T4性能数据](#72-T4性能数据) + - [7.3 性能对比](#73-性能对比) + + + +## 1 模型概述 + +- **[论文地址](#11-论文地址)** + +- **[代码地址](#12-代码地址)** + +### 1.1 论文地址 +[DnCNN论文](https://ieeexplore.ieee.org/document/7839189) + +### 1.2 代码地址 + +brach:master + +commit_id: 6b0804951484eadb7f1ea24e8e5c9ede9bea485b + +备注:commitid指的是值模型基于此版本代码做的推理 + +[DnCNN代码](https://github.com/SaoYan/DnCNN-PyTorch) + +## 2 环境说明 + +- **[深度学习框架](#21-深度学习框架)** + +- **[python第三方库](#22-python第三方库)** + +### 2.1 深度学习框架 +``` +CANN 5.0.1 +torch==1.8.0 +torchvision==0.9.0 +onnx==1.9.0 +``` + +### 2.2 python第三方库 + +``` +numpy==1.20.2 +opencv-python==4.5.2.52 +scikit-image==0.16.2 +``` + +**说明:** +> X86架构:pytorch,torchvision和onnx可以通过官方下载whl包安装,其它可以通过pip3.7 install 包名 安装 +> +> Arm架构:pytorch,torchvision和onnx可以通过源码编译安装,其它可以通过pip3.7 
install 包名 安装 + +## 3 模型转换 + +- **[pth转onnx模型](#31-pth转onnx模型)** + +- **[onnx转om模型](#32-onnx转om模型)** + +### 3.1 pth转onnx模型 + +1.DnCNN模型代码下载 +``` +git clone https://github.com/SaoYan/DnCNN-PyTorch +cd DnCNN-PyTorch +``` +2.获取源码pth权重文件 +wget https://ascend-model-file.obs.cn-north-4.myhuaweicloud.com/%E4%BA%A4%E4%BB%98%E4%BB%B6/cv/image_classification/DnCnn/net.pth +文件的MD5sum值是: 5703a29b082cc03401fa9d9fee12cb71 + +3.获取NPU训练pth文件,将net.pth文件移动到DnCNN目录下 + +4.编写pth2onnx脚本DnCNN_pth2onnx.py + + **说明:** +>注意目前ATC支持的onnx算子版本为11 + +5.执行pth2onnx脚本,生成onnx模型文件 +``` +python3.7 DnCNN_pth2onnx.py net.pth DnCNN-S-15.onnx +``` + + **模型转换要点:** +>此模型转换为onnx不需要修改开源代码仓代码,故不需要特殊说明 + +### 3.2 onnx转om模型 + +1.设置环境变量 +``` +source env.sh +``` +2.使用atc将onnx模型转换为om模型文件 +``` +atc --framework=5 --model=./DnCNN-S-15.onnx --input_format=NCHW --input_shape="actual_input_1:1,1,481,481" --output=DnCNN-S-15_bs1 --log=debug --soc_version=Ascend310 +``` + +## 4 数据集预处理 + +- **[数据集获取](#41-数据集获取)** + +- **[数据集预处理](#42-数据集预处理)** + +- **[生成数据集信息文件](#43-生成数据集信息文件)** + +### 4.1 推理数据集获取 +存放路径为 https://github.com/SaoYan/DnCNN-PyTorch 的data目录 + +### 4.2 数据集预处理 +1.预处理脚本data_preprocess.py + +2.执行预处理脚本,生成数据集预处理后的bin文件 + +``` +python3.7 data_preprocess.py data ISource INoisy +``` +### 4.3 生成数据集信息文件 +1.生成数据集信息文件脚本get_info.py + +2.执行生成数据集信息脚本,生成数据集信息文件 +``` +python3.7 get_info.py bin INoisy DnCNN_bin.info 481 481 +``` +第一个参数为模型输入的类型,第二个参数为生成的bin文件路径,第三个为输出的info文件,后面为宽高信息 +## 5 离线推理 + +- **[benchmark工具概述](#51-benchmark工具概述)** + +- **[离线推理](#52-离线推理)** + +### 5.1 benchmark工具概述 + +benchmark工具为华为自研的模型推理工具,支持多种模型的离线推理,能够迅速统计出模型在Ascend310上的性能,支持真实数据和纯推理两种模式,配合后处理脚本,可以实现诸多模型的端到端过程 +### 5.2 离线推理 +1.设置环境变量 +``` +source env.sh +``` +2.执行离线推理 +``` +./benchmark.x86_64 -model_type=vision -om_path=DnCNN-S-15.om -device_id=0 -batch_size=1 -input_text_path=DnCNN_bin.info -input_width=481 -input_height=481 -useDvpp=false -output_binary=true +``` +输出结果默认保存在当前目录result/dumpOutput_deviceX(X为对应的device_id),每个输入对应的输出对应一个_X.bin文件。 + +## 6 精度对比 + 
+- **[离线推理TopN精度](#61-离线推理TopN精度)** +- **[开源TopN精度](#62-开源TopN精度)** +- **[精度对比](#63-精度对比)** + +### 6.1 离线推理TopN精度统计 + +后处理统计TopN精度 + +调用postprocess.py脚本推理结果进行PSRN计算,结果会打印在屏幕上 +``` +python3.7 postprocess.py result/dumpOutput_device0/ +``` +第一个参数为benchmark输出目录 +查看输出结果: +``` +ISource/test064.bin PSNR 29.799832 +infering... +ISource/test065.bin PSNR 31.486418 +infering... +ISource/test066.bin PSNR 35.676752 +infering... +ISource/test067.bin PSNR 28.577475 +infering... +ISource/test068.bin PSNR 29.709767 + +PSNR on test data 31.526892 +``` +经过对bs1与bs16的om测试,本模型batch1的精度与batch16的精度没有差别,精度数据均如上 + +### 6.2 开源PSNR精度 +``` +| Noise Level | DnCNN-S | DnCNN-B | DnCNN-S-PyTorch | DnCNN-B-PyTorch | +|:-----------:|:-------:|:-------:|:---------------:|:---------------:| +| 15 | 31.73 | 31.61 | 31.71 | 31.60 | +| 25 | 29.23 | 29.16 | 29.21 | 29.15 | +| 50 | 26.23 | 26.23 | 26.22 | 26.20 | +``` +### 6.3 精度对比 +将得到的om离线模型推理PSNR值与该模型github代码仓上公布的精度对比,精度下降在1%范围之内,故精度达标。 + **精度调试:** + +>没有遇到精度不达标的问题,故不需要进行精度调试 + +## 7 性能对比 + +- **[npu性能数据](#71-npu性能数据)** +- **[T4性能数据](#72-T4性能数据)** +- **[性能对比](#73-性能对比)** + +### 7.1 npu性能数据 +benchmark工具在整个数据集上推理时也会统计性能数据,但是推理整个数据集较慢,如果这么测性能那么整个推理期间需要确保独占device。为快速获取性能数据,也可以使用benchmark纯推理功能测得性能数据,但是由于随机数不能模拟数据分布,纯推理功能测的有些模型性能数据可能不太准。这里给出两种方式,benchmark纯推理功能测性能仅为快速获取大概的性能数据以便调试优化使用,模型的性能以使用benchmark工具在整个数据集上推理得到bs1与bs16的性能数据为准,对于使用benchmark工具测试的batch4,8,32的性能数据在README.md中如下作记录即可。 +1.benchmark工具在整个数据集上推理获得性能数据 +batch1的性能,benchmark工具在整个数据集上推理后生成result/perf_vision_batchsize_1_device_0.txt: + +``` +[e2e] throughputRate: 15.0465, latency: 4519.32 +[data read] throughputRate: 966.417, moduleLatency: 1.03475 +[preprocess] throughputRate: 525.539, moduleLatency: 1.90281 +[infer] throughputRate: 22.6328, Interface throughputRate: 23.7919, moduleLatency: 43.8903 +[post] throughputRate: 22.615, moduleLatency: 44.2185 +``` +Interface throughputRate: 23.7919,23.7919x4=95.176既是batch1 310单卡吞吐率 + 
+batch16的性能,benchmark工具在整个数据集上推理后生成result/perf_vision_batchsize_16_device_1.txt: +``` +[e2e] throughputRate: 15.3818, latency: 4420.81 +[data read] throughputRate: 1484.65, moduleLatency: 0.673559 +[preprocess] throughputRate: 316.273, moduleLatency: 3.16182 +[infer] throughputRate: 21.4529, Interface throughputRate: 22.2853, moduleLatency: 45.6179 +[post] throughputRate: 1.56798, moduleLatency: 637.764 +``` +Interface throughputRate: 22.2853,22.2853x4=89.1412既是batch16 310单卡吞吐率 + +batch4性能: +``` +[e2e] throughputRate: 15.5641, latency: 4369.02 +[data read] throughputRate: 1898.17, moduleLatency: 0.526824 +[preprocess] throughputRate: 523.883, moduleLatency: 1.90882 +[infer] throughputRate: 22.091, Interface throughputRate: 23.9045, moduleLatency: 44.5192 +[post] throughputRate: 5.50981, moduleLatency: 181.495 +``` +batch4 310单卡吞吐率 23.9045x4=95.618 + +batch8性能: +``` +[e2e] throughputRate: 15.5035, latency: 4386.1 +[data read] throughputRate: 1863.93, moduleLatency: 0.5365 +[preprocess] throughputRate: 461.471, moduleLatency: 2.16699 +[infer] throughputRate: 20.7804, Interface throughputRate: 22.2652, moduleLatency: 47.2831 +[post] throughputRate: 2.74035, moduleLatency: 364.917 +``` +batch8 310单卡吞吐率 22.2652x4=89.0608 + +batch32性能: +``` +[e2e] throughputRate: 12.4075, latency: 5480.54 +[data read] throughputRate: 1770.65, moduleLatency: 0.564765 +[preprocess] throughputRate: 242.944, moduleLatency: 4.11618 +[infer] throughputRate: 15.641, Interface throughputRate: 13.2648, moduleLatency: 62.7386 +[post] throughputRate: 0.68503, moduleLatency: 1459.79 +``` +batch32 310单卡吞吐率 13.2648x4=53.0592 + +### 7.2 T4性能数据 +在装有T4卡的服务器上测试gpu性能,TensorRT版本:7.2.3.4,cuda版本:11.0,cudnn版本:8.2 +batch1性能: +``` +trtexec --onnx=DnCNN-S-15.onnx --fp16 --shapes=actual_input_1:1x1x484x481 --threads +``` +gpu T4是4个device并行执行的结果,mean是时延(tensorrt的时延是batch个数据的推理时间),即吞吐率的倒数乘以batch +``` +[06/05/2021-06:28:42] [I] GPU Compute +[06/05/2021-06:28:42] [I] min: 12.5439 ms +[06/05/2021-06:28:42] [I] max: 
19.0195 ms +[06/05/2021-06:28:42] [I] mean: 13.1826 ms +[06/05/2021-06:28:42] [I] median: 12.9761 ms +[06/05/2021-06:28:42] [I] percentile: 17.7111 ms at 99% +[06/05/2021-06:28:42] [I] total compute time: 3.01882 s +``` +batch1 t4单卡吞吐率:1000x1/(13.1826/1)=75.858fps + +batch16性能: +``` +trtexec --onnx=DnCNN-S-15.onnx --fp16 --shapes=actual_input_1:16x1x484x481 --threads +``` +``` +[06/05/2021-06:31:53] [I] GPU Compute +[06/05/2021-06:31:53] [I] min: 198.604 ms +[06/05/2021-06:31:53] [I] max: 218.884 ms +[06/05/2021-06:31:53] [I] mean: 201.968 ms +[06/05/2021-06:31:53] [I] median: 200.267 ms +[06/05/2021-06:31:53] [I] percentile: 218.884 ms at 99% +[06/05/2021-06:31:53] [I] total compute time: 3.23149 s +``` +batch16 t4单卡吞吐率:1000x1/(201.968/16)=79.220fps + +batch4性能 +``` +[06/05/2021-13:48:52] [I] GPU Compute +[06/05/2021-13:48:52] [I] min: 48.9983 ms +[06/05/2021-13:48:52] [I] max: 67.3423 ms +[06/05/2021-13:48:52] [I] mean: 50.6542 ms +[06/05/2021-13:48:52] [I] median: 50.0736 ms +[06/05/2021-13:48:52] [I] percentile: 67.3423 ms at 99% +[06/05/2021-13:48:52] [I] total compute time: 3.08991 s +``` +batch4 t4单卡吞吐率:1000x1/(50.6542/4)=78.957fps + +batch8性能: +``` +[06/05/2021-13:50:31] [I] GPU Compute +[06/05/2021-13:50:31] [I] min: 101.378 ms +[06/05/2021-13:50:31] [I] max: 128.73 ms +[06/05/2021-13:50:31] [I] mean: 104.424 ms +[06/05/2021-13:50:31] [I] median: 102.267 ms +[06/05/2021-13:50:31] [I] percentile: 128.73 ms at 99% +[06/05/2021-13:50:31] [I] total compute time: 3.13273 s +``` +batch8 t4单卡吞吐率:1000x1/(104.424/8)=76.610fps + +batch32性能: +trtexec --onnx=DnCNN-S-15.onnx --fp16 --shapes=actual_input_1:32x1x484x481 --threads +``` +[06/05/2021-13:57:44] [I] GPU Compute +[06/05/2021-13:57:44] [I] min: 399.587 ms +[06/05/2021-13:57:44] [I] max: 426.525 ms +[06/05/2021-13:57:44] [I] mean: 409.475 ms +[06/05/2021-13:57:44] [I] median: 407.555 ms +[06/05/2021-13:57:44] [I] percentile: 426.525 ms at 99% +[06/05/2021-13:57:44] [I] total compute time: 4.09475 s +``` +batch32 
t4单卡吞吐率:1000x1/(409.475/32)=78.149fps + + + +### 7.3 性能对比 +batch1:23.7919x4 > 1000x1/(13.1826/1) +batch16:22.2853x4 > 1000x1/(201.968/16) +310单个device的吞吐率乘4即单卡吞吐率,所得数据中单batch优于T4,多batch略高于T4 +对于batch1与batch16,310性能均高于T4性能1.2倍,但是batch32 310全量数据集上推理性能低于T4性能,所以该模型放在Research/cv/classification目录下。 +**性能优化:** + +>单batch性能优于T4,多batch的性能略高于T4,无需优化。 +>batch32 310全量数据集上推理性能低于T4性能,但是batch32纯推理性能94.3228fps,高于T4性能。 \ No newline at end of file diff --git a/DnCNN/env.sh b/DnCNN/env.sh new file mode 100644 index 0000000000..49be8f16a0 --- /dev/null +++ b/DnCNN/env.sh @@ -0,0 +1,8 @@ +#! /bin/bash + +export install_path=/usr/local/Ascend/ascend-toolkit/latest +export PATH=/usr/local/python3.7.5/bin:${install_path}/atc/ccec_compiler/bin:${install_path}/atc/bin:$PATH +export PYTHONPATH=${install_path}/atc/python/site-packages:$PYTHONPATH +export LD_LIBRARY_PATH=${install_path}/atc/lib64:${install_path}/acllib/lib64:$LD_LIBRARY_PATH +export ASCEND_OPP_PATH=${install_path}/opp +export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest diff --git a/DnCNN/get_info.py b/DnCNN/get_info.py new file mode 100644 index 0000000000..4d05f7c4bd --- /dev/null +++ b/DnCNN/get_info.py @@ -0,0 +1,60 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def get_bin_info(file_path, info_name, width, height):
    """Write an info file listing every ``*.bin`` file in a directory.

    Each output line has the form ``<index> <path> <width> <height>``.

    Args:
        file_path: Directory that is searched for ``*.bin`` files.
        info_name: Path of the info file to create (overwritten if present).
        width: Width recorded on every line; converted with ``str()``.
        height: Height recorded on every line; converted with ``str()``.
    """
    # glob returns entries in arbitrary filesystem order; sorting makes the
    # index assigned to each file reproducible between runs and machines.
    bin_images = sorted(glob(os.path.join(file_path, '*.bin')))
    with open(info_name, 'w') as file:
        for index, img in enumerate(bin_images):
            # str() lets callers pass integers as well as command-line strings.
            content = ' '.join([str(index), img, str(width), str(height)])
            file.write(content)
            file.write('\n')
# End-to-end batch-size-1 run: preprocess, build info file, infer, score.
# The relative paths assume the script is launched from the directory that
# holds the DnCNN_*.py scripts, env.sh and the benchmark binary.

# Preprocess the dataset into ISource/ and INoisy/ .bin files.
# (The script added by this patch is DnCNN_preprocess.py, not data_preprocess.py.)
python DnCNN_preprocess.py data ISource INoisy
# Generate the dataset info file for the noisy inputs.
python get_info.py bin INoisy DnCNN_bin.info 481 481
# Set the environment variables.
source env.sh
# Make the benchmark binary executable.
chmod u+x benchmark.x86_64
# Offline inference with the benchmark tool.
./benchmark.x86_64 -model_type=vision -om_path=DnCNN-S-15_bs1.om -device_id=0 -batch_size=1 -input_text_path=DnCNN_bin.info -input_width=481 -input_height=481 -useDvpp=false -output_binary=true
# Compute PSNR on the inference results.
# (The script added by this patch is DnCNN_postprocess.py, not postprocess.py.)
python DnCNN_postprocess.py result/dumpOutput_device0/
b/DnCNN/test/pth2om_bs1.sh new file mode 100644 index 0000000000..e9bbae809c --- /dev/null +++ b/DnCNN/test/pth2om_bs1.sh @@ -0,0 +1,3 @@ +python DnCNN_pth2onnx.py net.pth DnCNN-S-15.onnx #执行pth2onnx脚本,生成onnx模型文件 +source env.sh #设置环境变量 +atc --framework=5 --model=./DnCNN-S-15.onnx --input_format=NCHW --input_shape="actual_input_1:1,1,481,481" --output=DnCNN-S-15_bs1 --log=debug --soc_version=Ascend710 \ No newline at end of file diff --git a/DnCNN/test/pth2om_bs16.sh b/DnCNN/test/pth2om_bs16.sh new file mode 100644 index 0000000000..5791dc46e9 --- /dev/null +++ b/DnCNN/test/pth2om_bs16.sh @@ -0,0 +1,3 @@ +python DnCNN_pth2onnx.py net.pth DnCNN-S-15.onnx #执行pth2onnx脚本,生成onnx模型文件 +source env.sh #设置环境变量 +atc --framework=5 --model=./DnCNN-S-15.onnx --input_format=NCHW --input_shape="actual_input_1:16,1,481,481" --output=DnCNN-S-15_bs16 --log=debug --soc_version=Ascend710 \ No newline at end of file -- Gitee