diff --git a/models/audio/speech_recognition/conformer/ixrt/README.md b/models/audio/speech_recognition/conformer/ixrt/README.md index 765a4b28c03b194c0f702d8c55e3f374c362e567..d73a68deb6cee05e3875b37866e3850c3bd914e9 100644 --- a/models/audio/speech_recognition/conformer/ixrt/README.md +++ b/models/audio/speech_recognition/conformer/ixrt/README.md @@ -20,7 +20,8 @@ Dataset: to download the Aishell dataset. ```bash # Download and put model in conformer_checkpoints -ln -s /home/deepspark/datasets/INFER/conformer/20210601_u2++_conformer_exp_aishell ./conformer_checkpoints +wget http://files.deepspark.org.cn:880/deepspark/conformer_checkpoints.tar.gz +tar xf conformer_checkpoints.tar.gz # Prepare AISHELL Data DATA_DIR=/PATH/to/aishell_test_data diff --git a/models/audio/speech_recognition/conformer/ixrt/build_engine.py b/models/audio/speech_recognition/conformer/ixrt/build_engine.py index aa20ee59f6ecd23d8a8cb9272ece0087ed65ab89..d87af65175436d7db6629501fb0aecfddc608f09 100644 --- a/models/audio/speech_recognition/conformer/ixrt/build_engine.py +++ b/models/audio/speech_recognition/conformer/ixrt/build_engine.py @@ -1,117 +1,58 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -""" -Build Engine From FusionPlugin Onnx. -""" - import os -import ctypes import json import onnx import logging import argparse import tensorrt -import tensorrt as trt from tensorrt import Dims - -TRT_LOGGER = trt.Logger(trt.Logger.WARNING) -def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""): - if not dynamic_path: - dynamic_path = os.path.join(os.path.dirname(trt.__file__), "lib", "libixrt_plugin.so") - if not os.path.exists(dynamic_path): - raise FileNotFoundError( - f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!" 
-        )
-    ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL)
-    trt.init_libnvinfer_plugins(logger, namespace)
-    print(f"Loaded plugin from {dynamic_path}")
-
+from load_ixrt_plugin import load_ixrt_plugin
 load_ixrt_plugin()
-
 def parse_args():
-    parser = argparse.ArgumentParser(description="build tensorrt engine of conformer.", usage="")
-    parser.add_argument(
-        "--model_name",
-        type=str,
-        required=True,
-        help="conformer",
-    )
-    parser.add_argument(
-        "--onnx_path",
-        type=str,
-        required=True,
-        help="onnx_path path to save",
-    )
-    parser.add_argument(
-        "--engine_path",
-        type=str,
-        required=True,
-        help="engine path to save",
-    )
-    parser.add_argument(
-        "--max_batch_size",
-        type=int,
-        required=True,
-    )
-    parser.add_argument(
-        "--max_seq_len",
-        type=int,
-        required=True,
-    )
+    parser = argparse.ArgumentParser(description="Build TensorRT engine of conformer")
+    parser.add_argument("--onnx_model", type=str, required=True, help="The onnx path")
+    parser.add_argument("--bsz", type=int, default=1, help="batch size")
+    parser.add_argument("--input_size", type=int, nargs=2, default=(-1, 80), help="inference size")
+    parser.add_argument("--engine_path", type=str, required=True, help="engine path to save")
+    parser.add_argument("--device", type=int, default=0, help="cuda device, i.e. 0 or 0,1,2,3,4")
+
     args = parser.parse_args()
     return args
-args = parse_args()
-MaxBSZ = args.max_batch_size
-MaxSeqLen = args.max_seq_len
-
 def build_engine_trtapi_dynamicshape(args):
-    onnx_model = args.onnx_path
+    onnx_model = args.onnx_model
     assert os.path.isfile(onnx_model), f"The onnx model{onnx_model} must be existed!"
     IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
     builder = tensorrt.Builder(IXRT_LOGGER)
     EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
     network = builder.create_network(EXPLICIT_BATCH)
     build_config = builder.create_builder_config()
-    profile = builder.create_optimization_profile()
-    profile.set_shape("input", Dims([MaxBSZ, 100, 80]), Dims([MaxBSZ, 1000, 80]), Dims([MaxBSZ, 1500, 80]))
-    profile.set_shape("mask", Dims([MaxBSZ, 1, 25]), Dims([MaxBSZ, 1, 250]), Dims([MaxBSZ, 1, 374]))
-    profile.set_shape("pos_emb", Dims([1, 25, 256]), Dims([1, 250, 256]), Dims([1, 374, 256]))
+    profile = builder.create_optimization_profile()
+    profile.set_shape(
+        "input", Dims([1, 1, 80]), Dims([16, 800, 80]), Dims([128, 1500, 80])
+    )
+    profile.set_shape(
+        "seq_lengths", Dims([1]), Dims([16]), Dims([128])
+    )
+
     build_config.add_optimization_profile(profile)
     parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+
     parser.parse_from_file(onnx_model)
     build_config.set_flag(tensorrt.BuilderFlag.FP16)
 
     # set dynamic
-    # input
     input_tensor = network.get_input(0)
-    input_tensor.shape = Dims([MaxBSZ, -1, 80])
-    # mask
-    mask_tensor = network.get_input(1)
-    mask_tensor.shape = Dims([MaxBSZ, 1, -1])
-    # pos_emb
-    pos_emb_tensor = network.get_input(2)
-    pos_emb_tensor.shape = Dims([1, -1, 256])
+    input_tensor.shape = Dims([-1, -1, 80])
+
+    seq_lengths_tensor = network.get_input(1)
+    seq_lengths_tensor.shape = Dims([-1])
 
     plan = builder.build_serialized_network(network, build_config)
     with open(args.engine_path, "wb") as f:
@@ -120,26 +61,7 @@ def build_engine_trtapi_dynamicshape(args):
         f.write(plan)
 
     print("Build dynamic shape engine done!")
 
 
-def build_engine_trtapi_staticshape(args):
-    onnx_model = args.onnx_path
-    assert os.path.isfile(onnx_model), f"The onnx model{onnx_model} must be existed!"
- IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) - builder = tensorrt.Builder(IXRT_LOGGER) - EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) - network = builder.create_network(EXPLICIT_BATCH) - build_config = builder.create_builder_config() - parser = tensorrt.OnnxParser(network, IXRT_LOGGER) - - parser.parse_from_file(onnx_model) - build_config.set_flag(tensorrt.BuilderFlag.FP16) - - plan = builder.build_serialized_network(network, build_config) - with open(args.engine_path, "wb") as f: - f.write(plan) - - print("Build static shape engine done!") - - if __name__ == "__main__": + args = parse_args() build_engine_trtapi_dynamicshape(args) - # build_engine_trtapi_staticshape(args) + diff --git a/models/audio/speech_recognition/conformer/ixrt/ci/prepare.sh b/models/audio/speech_recognition/conformer/ixrt/ci/prepare.sh index 2ee5de0b5856d5a3e511be55609a6a137400d47e..7944a1fc2c0053e967917904cf94f2f5200a90c3 100644 --- a/models/audio/speech_recognition/conformer/ixrt/ci/prepare.sh +++ b/models/audio/speech_recognition/conformer/ixrt/ci/prepare.sh @@ -27,7 +27,8 @@ fi pip3 install -r requirements.txt -ln -s /root/data/checkpoints/20210601_u2++_conformer_exp_aishell ./conformer_checkpoints - -ln -s /root/data/datasets/AISHELL/data_aishell ./aishell_test_data +ln -s /root/data/checkpoints/conformer_checkpoints.tar.gz ./ +tar xf conformer_checkpoints.tar.gz +cp /root/data/datasets/aishell_test_data.tar ./ +tar xf aishell_test_data.tar bash scripts/aishell_data_prepare.sh ./aishell_test_data ./tools \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/ixrt/common.py b/models/audio/speech_recognition/conformer/ixrt/common.py index 89023300ddc7ca3e4f0f992f4b124d8a8c131ae5..6081f807c3a709e8d73f1c1a6bc62185ddcdfc09 100644 --- a/models/audio/speech_recognition/conformer/ixrt/common.py +++ b/models/audio/speech_recognition/conformer/ixrt/common.py @@ -1,43 +1,11 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import os -import ctypes import cv2 import glob import torch import tensorrt -import tensorrt as trt import numpy as np -import pycuda.driver as cuda - -from tensorrt.hook.utils import copy_ixrt_io_tensors_as_np - - -TRT_LOGGER = trt.Logger(trt.Logger.WARNING) -def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""): - if not dynamic_path: - dynamic_path = os.path.join(os.path.dirname(trt.__file__), "lib", "libixrt_plugin.so") - if not os.path.exists(dynamic_path): - raise FileNotFoundError( - f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!" 
- ) - ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL) - trt.init_libnvinfer_plugins(logger, namespace) - print(f"Loaded plugin from {dynamic_path}") -load_ixrt_plugin() +import cuda.cuda as cuda +import cuda.cudart as cudart def trtapi(engine_file): @@ -66,7 +34,6 @@ def create_engine_context(engine_path, logger): return engine, context - def get_io_bindings(engine): # Setup I/O bindings inputs = [] @@ -85,13 +52,15 @@ def get_io_bindings(engine): size = np.dtype(tensorrt.nptype(dtype)).itemsize for s in shape: size *= s - allocation = cuda.mem_alloc(size) + err, allocation = cudart.cudaMalloc(size) + assert(err == cuda.CUresult.CUDA_SUCCESS) binding = { "index": i, "name": name, "dtype": np.dtype(tensorrt.nptype(dtype)), "shape": list(shape), "allocation": allocation, + "nbytes": size, } print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}") allocations.append(allocation) @@ -120,17 +89,19 @@ def setup_io_bindings(engine, context): size = np.dtype(tensorrt.nptype(dtype)).itemsize for s in shape: size *= s - allocation = cuda.mem_alloc(size) + err, allocation = cudart.cudaMalloc(size) + assert(err == cuda.CUresult.CUDA_SUCCESS) binding = { "index": i, "name": name, "dtype": np.dtype(tensorrt.nptype(dtype)), "shape": list(shape), "allocation": allocation, + "nbytes": size, } allocations.append(allocation) if engine.binding_is_input(i): inputs.append(binding) else: outputs.append(binding) - return inputs, outputs, allocations + return inputs, outputs, allocations \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/ixrt/convert2onnx.py b/models/audio/speech_recognition/conformer/ixrt/convert2onnx.py deleted file mode 100644 index 823ae3215f58d18a636e868668199ed3f388ee20..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/ixrt/convert2onnx.py +++ /dev/null @@ -1,529 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -""" -Build Compute Graph(Fusion Plugin Onnx) From Checkpoints. 
-""" - -import os -import json -import torch -import argparse -import numpy as np -from collections import OrderedDict - -from tensorrt.deploy.api import GraphTransform, create_source, create_target -from tensorrt.deploy.ir.data_type import DataType -from tensorrt.deploy.ir.variable import Variable, VariableOptions -from tensorrt.deploy.ir.graph import Graph - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Build Compute Graph From Checkpoints.", usage="" - ) - parser.add_argument( - "--model_name", - type=str, - required=True, - help="conformer", - ) - parser.add_argument( - "--model_path", - type=str, - required=True, - help="checkpont of conformer", - ) - parser.add_argument( - "--onnx_path", - type=str, - required=True, - help="raw onnx path to save", - ) - parser.add_argument( - "--batch_size", - type=int, - required=True, - help="the batch size for test.", - ) - args = parser.parse_args() - return args - - -def add_global_cmvn_op(graph, state_dict, args): - t = graph - - sub_inputs = [t.make_variable("input", dtype=DataType.FLOAT, shape=(128, 1500, 80))] - key = "encoder.global_cmvn.mean" - sub_inputs.append(t.make_variable(name=key, value=state_dict[key])) - sub_outputs = [t.make_variable("Sub_output_0", dtype=DataType.FLOAT, shape=(128, 1500, 80))] - t.make_operator( - "Sub", - inputs=sub_inputs, - outputs=sub_outputs, - ) - - mul_inputs = sub_outputs - key = "encoder.global_cmvn.istd" - mul_inputs.append(t.make_variable(name=key, value=state_dict[key])) - mul_outputs = [t.make_variable("Mul_output_0", dtype=DataType.FLOAT, shape=(128, 1500, 80))] - t.make_operator( - "Mul", - inputs=mul_inputs, - outputs=mul_outputs, - ) - - unsqueeze_inputs = mul_outputs - unsqueeze_inputs.append(t.make_variable("axes", value=np.array([1], dtype=np.int64))) - unsqueeze_outputs = [t.make_variable("Unsqueeze_output_0", dtype=DataType.FLOAT, shape=(128, 1, 1500, 80))] - t.make_operator( - "Unsqueeze", - inputs=unsqueeze_inputs, - outputs=unsqueeze_outputs, - ) - - -def add_first_submodule_op(graph, state_dict, args): - """ - The firt submodule part contains follows: - 1.Conv2d+ReLU; - 2.Conv2d+ReLU; - 3.Transpose+Reshape; - 4.MatMul+Add+Mul; - """ - - t = graph - conv2d0_weight_keys = [ - "encoder.embed.conv.0.weight", - "encoder.embed.conv.0.bias", - ] - conv2d0_attributes = { - "dilations": [1, 1], - "group": 1, - "kernel_shape": [3, 3], - "pads": [0, 0, 0, 0], - "strides": [2, 2], - } - conv2d0_inputs = [t.get_variable("Unsqueeze_output_0")] - conv2d0_outputs = [t.make_variable("Conv_output_0", dtype=DataType.FLOAT)] - - for key in conv2d0_weight_keys: - conv2d0_inputs.append(t.make_variable(name=key, value=state_dict[key])) - t.make_operator( - "Conv", - inputs=conv2d0_inputs, - outputs=conv2d0_outputs, - **conv2d0_attributes - ) - - relu0_inputs = conv2d0_outputs - relu0_outputs = [t.make_variable("Relu_output_0", dtype=DataType.FLOAT)] - t.make_operator( - "Relu", - inputs=relu0_inputs, - outputs=relu0_outputs - ) - - conv2d1_weight_keys = [ - "encoder.embed.conv.2.weight", - "encoder.embed.conv.2.bias", - ] - conv2d1_attributes = { - "dilations": [1, 1], - "group": 1, - "kernel_shape": [3, 3], - "pads": [0, 0, 0, 0], - "strides": [2, 2], - } - conv2d1_inputs = relu0_outputs - conv2d1_outputs = [t.make_variable("Conv_output_1", dtype=DataType.FLOAT)] - - for key in conv2d1_weight_keys: - conv2d1_inputs.append(t.make_variable(name=key, value=state_dict[key])) - t.make_operator( - "Conv", - inputs=conv2d1_inputs, - outputs=conv2d1_outputs, - **conv2d1_attributes - ) - - 
relu1_inputs = conv2d1_outputs - relu1_outputs = [t.make_variable("Relu_output_1", dtype=DataType.FLOAT)] - t.make_operator( - "Relu", - inputs=relu1_inputs, - outputs=relu1_outputs - ) - - tran_inputs = relu1_outputs - tran_outputs = [t.make_variable("Transpose_output_0", dtype=DataType.FLOAT)] - tran_attributes = {"perm": [0, 2, 1, 3]} - t.make_operator( - "Transpose", - inputs=tran_inputs, - outputs=tran_outputs, - **tran_attributes - ) - - reshape_inputs = tran_outputs - reshape_inputs.append(t.make_variable(name="constant_0", value=np.array([args.batch_size, -1, 4864]), dtype=DataType.INT64)) - reshape_outputs = [t.make_variable("Reshape_output_0", dtype=DataType.FLOAT)] - t.make_operator( - "Reshape", - inputs=reshape_inputs, - outputs=reshape_outputs, - ) - - matmul_inputs = reshape_outputs - matmul_inputs.append(t.make_variable(name="embed.out.0.weight", value=state_dict["encoder.embed.out.0.weight"].transpose(1, 0))) # (256,4864)--->(4864,256) - matmul_outputs = [t.make_variable("MatMul_output_0", dtype=DataType.FLOAT)] - t.make_operator( - "MatMul", - inputs=matmul_inputs, - outputs=matmul_outputs, - ) - - add_inputs = matmul_outputs - add_inputs.append(t.make_variable(name="embed.out.0.bias", value=state_dict["encoder.embed.out.0.bias"])) - add_outputs = [t.make_variable("Add_output_0", dtype=DataType.FLOAT)] - t.make_operator( - "Add", - inputs=add_inputs, - outputs=add_outputs, - ) - - mul_inputs = add_outputs - mul_inputs.append(t.make_variable(name="constant_1", value=np.array([16.], dtype=np.float32), dtype=DataType.FLOAT)) - mul_outputs = [t.make_variable("Mul_output_1", dtype=DataType.FLOAT)] - t.make_operator( - "Mul", - inputs=mul_inputs, - outputs=mul_outputs, - ) - - -def add_encoder_ff_macaron_op(graph, state_dict, args, index): - - t = graph - ff_macaron_keys = [ - "encoder.encoders.{}.norm_ff_macaron.weight", - "encoder.encoders.{}.norm_ff_macaron.bias", - "encoder.encoders.{}.feed_forward_macaron.w_1.weight", - "encoder.encoders.{}.feed_forward_macaron.w_1.bias", - "encoder.encoders.{}.feed_forward_macaron.w_2.weight", - "encoder.encoders.{}.feed_forward_macaron.w_2.bias", - ] - - attributes = { - "in_feature": 256, - "hidden_size": 2048, - "act_type": 12, - "ff_scale": 0.5, - } - - if index == 0: - inputs = [graph.get_variable("Mul_output_1")] - else: - inputs = [graph.get_variable("norm_final_{}_output".format(index-1))] - - outputs = [t.make_variable("ff_macaron_{}_output".format(index), dtype=DataType.FLOAT)] - - for key in ff_macaron_keys: - key = key.format(index) - inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16)) - - t.make_operator( - "PositionWiseFFNPluginDynamic_IxRT", - inputs=inputs, - outputs=outputs, - **attributes - ) - - -def add_encoder_mhsa_op(graph, state_dict, args, index): - - t = graph - mhsa_keys = [ - "encoder.encoders.{}.norm_mha.weight", - "encoder.encoders.{}.norm_mha.bias", - "encoder.encoders.{}.self_attn.linear_q.weight", - "encoder.encoders.{}.self_attn.linear_q.bias", - "encoder.encoders.{}.self_attn.linear_k.weight", - "encoder.encoders.{}.self_attn.linear_k.bias", - "encoder.encoders.{}.self_attn.linear_v.weight", - "encoder.encoders.{}.self_attn.linear_v.bias", - "encoder.encoders.{}.self_attn.linear_pos.weight", - "encoder.encoders.{}.self_attn.pos_bias_u", - "encoder.encoders.{}.self_attn.pos_bias_v", - "encoder.encoders.{}.self_attn.linear_out.weight", - "encoder.encoders.{}.self_attn.linear_out.bias", - ] - - attributes = { - "bs": 128, - "seq_len": 374, - "n_head": 4, - "n_feat": 
256, - } - - if index == 0: - inputs = [ - graph.get_variable("ff_macaron_{}_output".format(index)), - t.make_variable("mask", dtype=DataType.INT32, shape=(128, 1, 374)), - t.make_variable("pos_emb", dtype=DataType.FLOAT, shape=(1, 374, 256)), - ] - else: - inputs = [ - graph.get_variable("ff_macaron_{}_output".format(index)), - graph.get_variable("mask"), - graph.get_variable("pos_emb"), - ] - - outputs = [t.make_variable("mhsa_{}_output".format(index), dtype=DataType.FLOAT)] - - for key in mhsa_keys: - key = key.format(index) - inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16)) - - t.make_operator( - "ConformerMultiHeadSelfAttentionPlugin_IxRT", - inputs=inputs, - outputs=outputs, - **attributes - ) - - -def add_encoder_conv_module_op(graph, state_dict, args, index): - - t = graph - conv_module_keys = [ - "encoder.encoders.{}.norm_conv.weight", - "encoder.encoders.{}.norm_conv.bias", - "encoder.encoders.{}.conv_module.pointwise_conv1.weight", - "encoder.encoders.{}.conv_module.pointwise_conv1.bias", - "encoder.encoders.{}.conv_module.depthwise_conv.weight", - "encoder.encoders.{}.conv_module.depthwise_conv.bias", - "encoder.encoders.{}.conv_module.norm.weight", - "encoder.encoders.{}.conv_module.norm.bias", - "encoder.encoders.{}.conv_module.pointwise_conv2.weight", - "encoder.encoders.{}.conv_module.pointwise_conv2.bias", - ] - - attributes = { - "kernel_size_1": 1, - "stride_1": 1, - "odim_1": 512, - "kernel_size_2": 8, - "stride_2": 1, - "odim_2": 256, - "kernel_size_3": 1, - "stride_3": 1, - "odim_3": 256, - } - - inputs = [ - graph.get_variable("mhsa_{}_output".format(index)), - graph.get_variable("mask"), - ] - outputs = [t.make_variable("conv_module_{}_output".format(index), dtype=DataType.FLOAT)] - - for key in conv_module_keys: - key = key.format(index) - - if "conv_module.depthwise_conv.weight" in key: - inputs.append(t.make_variable(name=key, value=state_dict[key].permute(1, 2, 0).half(), dtype=DataType.FLOAT16)) - elif "bias" in key and "norm" not in key: - inputs.append(t.make_variable(name=key, value=state_dict[key], dtype=DataType.FLOAT)) - else: - inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16)) - - t.make_operator( - "ConformerConvModulePlugin_IxRT", - inputs=inputs, - outputs=outputs, - **attributes - ) - - -def add_encoder_positionwise_ff_op(graph, state_dict, args, index): - - t = graph - positionwise_ff_keys = [ - "encoder.encoders.{}.norm_ff.weight", - "encoder.encoders.{}.norm_ff.bias", - "encoder.encoders.{}.feed_forward.w_1.weight", - "encoder.encoders.{}.feed_forward.w_1.bias", - "encoder.encoders.{}.feed_forward.w_2.weight", - "encoder.encoders.{}.feed_forward.w_2.bias", - ] - - attributes = { - "in_feature": 256, - "hidden_size": 2048, - "act_type": 12, - "ff_scale": 0.5, - } - - inputs = [graph.get_variable('conv_module_{}_output'.format(index))] - outputs = [t.make_variable("positionwise_ff_{}_output".format(index), dtype=DataType.FLOAT)] - - for key in positionwise_ff_keys: - key = key.format(index) - inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16)) - - t.make_operator( - "PositionWiseFFNPluginDynamic_IxRT", - inputs=inputs, - outputs=outputs, - **attributes - ) - - -def add_encoder_ln_op(graph, state_dict, args, index): - - t = graph - ln_keys = [ - "encoder.encoders.{}.norm_final.weight", - "encoder.encoders.{}.norm_final.bias", - ] - - attributes = { - "axis": -1, - "epsilon": 0.000009999999747378752, - "stash_type": 1, 
- } - - inputs = [graph.get_variable("positionwise_ff_{}_output".format(index))] - outputs = [t.make_variable("norm_final_{}_output".format(index), dtype=DataType.FLOAT)] - - for key in ln_keys: - key = key.format(index) - inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16)) - - t.make_operator( - "LayerNormalization", - inputs=inputs, - outputs=outputs, - **attributes - ) - - -def add_final_ln_op(graph, state_dict, args): - - t = graph - ln_keys = [ - "encoder.after_norm.weight", - "encoder.after_norm.bias", - ] - - attributes = { - "axis": -1, - "epsilon": 0.000009999999747378752, - "stash_type": 1, - } - - inputs = [graph.get_variable("norm_final_11_output")] - outputs = [t.make_variable("norm_final_output", dtype=DataType.FLOAT)] - - for key in ln_keys: - inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16)) - - t.make_operator( - "LayerNormalization", - inputs=inputs, - outputs=outputs, - **attributes - ) - - -def add_ctc_op(graph, state_dict, args): - t = graph - # matmul - matmul_inputs = [graph.get_variable("norm_final_output")] - matmul_inputs.append(t.make_variable(name="ctc.ctc_lo.weight", value=state_dict["ctc.ctc_lo.weight"].transpose(1, 0))) # (4233,256)--->(256,4233) - matmul_outputs = [t.make_variable("MatMul_output_1", dtype=DataType.FLOAT)] - t.make_operator( - "MatMul", - inputs=matmul_inputs, - outputs=matmul_outputs, - ) - - add_inputs = matmul_outputs - add_inputs.append(t.make_variable(name="ctc.ctc_lo.bias", value=state_dict["ctc.ctc_lo.bias"])) - add_outputs = [t.make_variable("Add_output_1", dtype=DataType.FLOAT)] - t.make_operator( - "Add", - inputs=add_inputs, - outputs=add_outputs, - ) - - logsoftmax_inputs = add_outputs - logsoftmax_outputs = [t.make_variable("output", dtype=DataType.FLOAT)] - attributes = { - "axis": 2 - } - t.make_operator( - "LogSoftmax", - inputs=logsoftmax_inputs, - outputs=logsoftmax_outputs, - **attributes - ) - - -def main(args): - graph = Graph() - transform = GraphTransform(graph) - state_dict = torch.load(args.model_path) - - # 0. Global CMVN: sub+mul+unsqueeze - add_global_cmvn_op(transform, state_dict, args) - - # 1. First Submodule: Conv2d+Relu+Transpose+MatMul - add_first_submodule_op(transform, state_dict, args) - - # 2. Second Submodule: ConformerEncoderLayer: 12 layers - for i in range(args.num_layers): - add_encoder_ff_macaron_op(transform, state_dict, args, i) - add_encoder_mhsa_op(transform, state_dict, args, i) - add_encoder_conv_module_op(transform, state_dict, args, i) - add_encoder_positionwise_ff_op(transform, state_dict, args, i) - add_encoder_ln_op(transform, state_dict, args, i) - - # 3. Third Submodule: FinalNorm - add_final_ln_op(transform, state_dict, args) - - # 4.Forth Submodule: CTC+LogSoftmax - add_ctc_op(transform, state_dict, args) - - # 5. set input and output - graph.add_input(graph.get_variable("input")) - graph.add_input(graph.get_variable("mask")) - graph.add_input(graph.get_variable("pos_emb")) - graph.add_output(graph.get_variable("output")) - # 5. 
export onnx file - create_target(saved_path=args.onnx_path).export(graph) - print("save onnx: ", args.onnx_path) - - -if __name__ == "__main__": - args = parse_args() - model_name = args.model_name.lower() - args.num_layers = 12 - args.hidden_size = 2048 - args.head_num = 4 - args.head_dim = 64 - args.pad_id = 0 - args.inner_size = 3072 - main(args) diff --git a/models/audio/speech_recognition/conformer/ixrt/ixrt_inference_accuracy.py b/models/audio/speech_recognition/conformer/ixrt/ixrt_inference_accuracy.py index 702221902669351bb1949453c3ee566477ed5692..8e58a24279cc1f4f3d2cdba572701ad35344ed0c 100644 --- a/models/audio/speech_recognition/conformer/ixrt/ixrt_inference_accuracy.py +++ b/models/audio/speech_recognition/conformer/ixrt/ixrt_inference_accuracy.py @@ -1,17 +1,16 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) # -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import os import sys @@ -21,7 +20,6 @@ sys.path.append(os.path.dirname(os.path.dirname(__file__))) import argparse import yaml import copy -import torch import numpy as np from tqdm.contrib import tqdm @@ -34,14 +32,11 @@ from tensorrt import Dims from common import create_engine_context, get_io_bindings,trtapi,setup_io_bindings import pickle -import pycuda.autoinit -import pycuda.driver as cuda +import cuda.cuda as cuda +import cuda.cudart as cudart -from utils import make_pad_mask, RelPositionalEncoding -from postprocess import ctc_greedy_search - - -rel_positional_encoding = RelPositionalEncoding(256, 0.1) +from load_ixrt_plugin import load_ixrt_plugin +load_ixrt_plugin() def get_args(): @@ -62,24 +57,36 @@ def get_args(): return args -def tensorrt_infer(engine, context, all_inputs): - input_names = ["input", "mask", "pos_emb"] - output_names = ["output"] +def ixrt_infer(module, input, seq_lengths): + module.set_input(key="input", value=input) + module.set_input(key="seq_lengths", value=seq_lengths) + module.run() + out = module.get_output() + return out[0] - for input_name, input_data in zip(input_names, all_inputs): - input_idx = engine.get_binding_index(input_name) - input_shape = input_data.shape - context.set_binding_shape(input_idx, Dims(input_shape)) +def tensorrt_infer(engine,context, features, lengths): + + input_names=["input","seq_lengths"] + output_names=["output"] + input_idx = engine.get_binding_index(input_names[0]) + input_shape = features.shape + context.set_binding_shape(input_idx, Dims(input_shape)) + + seq_lengths_idx = engine.get_binding_index(input_names[1]) + seq_lengths_shape = lengths.shape + context.set_binding_shape(seq_lengths_idx, Dims(seq_lengths_shape)) + inputs, outputs, allocations = setup_io_bindings(engine, context) pred_output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) - - for i, input_data in enumerate(all_inputs): - cuda.memcpy_htod(inputs[i]["allocation"], input_data) - + err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], features, features.nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoD(inputs[1]["allocation"], lengths, lengths.nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) context.execute_v2(allocations) - cuda.memcpy_dtoh(pred_output, outputs[0]["allocation"]) + err, = cuda.cuMemcpyDtoH(pred_output, outputs[0]["allocation"], outputs[0]["nbytes"]) + assert(err == cuda.CUresult.CUDA_SUCCESS) return pred_output @@ -87,8 +94,9 @@ def engine_init(engine): host_mem = tensorrt.IHostMemory logger = tensorrt.Logger(tensorrt.Logger.ERROR) engine, context = create_engine_context(engine, logger) - + return engine,context + def calculate_cer(data, reference_data): @@ -144,7 +152,7 @@ def main(): args = get_args() # 读取配置文件 - config_fn = os.path.join(args.model_dir, "train.yaml") + config_fn = os.path.join(args.model_dir, "config.yaml") with open(config_fn, "r") as fin: configs = yaml.load(fin, Loader=yaml.FullLoader) @@ -164,7 +172,7 @@ def main(): dataset_conf["batch_conf"]["batch_size"] = args.batch_size # Load dict - dict_fn = os.path.join(args.model_dir, "units.txt") + dict_fn = os.path.join(args.model_dir, "words.txt") char_dict = {} with open(dict_fn, "r", encoding="utf8") as fin: for line in fin: @@ -199,15 +207,15 @@ def main(): feats_lengths.cpu().numpy().astype(np.int32), ] ) - with open(data_path_pkl, "wb") as f: - pickle.dump( - [ - eval_samples, - max_batch_size, - max_feature_length - ], - f, - ) + with open(data_path_pkl, "wb") as f: + pickle.dump( + [ + eval_samples, + max_batch_size, + 
max_feature_length + ], + f, + ) else: print(f"load data from tmp: {data_path_pkl}") with open(data_path_pkl, "rb") as f: @@ -221,44 +229,22 @@ def main(): ) print("*** 2. Load engine ***") - engine_path = os.path.join(args.model_dir, f"conformer_encoder_fusion.engine") + engine_path = os.path.join(args.model_dir, f"conformer_{args.infer_type}_trt.engine") engine, context = engine_init(engine_path) - + print("*** 3. Warm up ***") if args.warm_up > 0: for i in range(args.warm_up): - feats_tmp = np.ones((args.batch_size, 1500, 80)).astype(np.float32) - feats_lengths_tmp = np.ones((args.batch_size)).astype(np.int32) * 1500 - mask_tmp = make_pad_mask(feats_lengths_tmp, 1500) - mask_len_tmp = mask_tmp.shape[-1] - pos_emb_tmp = rel_positional_encoding(mask_len_tmp).numpy() - all_inputs = [feats_tmp, mask_tmp, pos_emb_tmp] - tensorrt_infer(engine, context, all_inputs) + feats_tmp = np.ones((args.batch_size,800,80)).astype(np.float16) + feats_lengths_tmp = np.ones((args.batch_size)).astype(np.int32) + tensorrt_infer(engine,context, feats_tmp, feats_lengths_tmp) results = [] for keys, feats, feats_lengths in tqdm(eval_samples): - b, seq_len, feat = feats.shape - - inputs = feats.astype(np.float32) - mask = make_pad_mask(feats_lengths, seq_len) - mask_len = mask.shape[-1] - pos_emb = rel_positional_encoding(mask_len).numpy() - - all_inputs = [inputs, mask, pos_emb] - hyps = tensorrt_infer( - engine, - context, - all_inputs - ) - - ctc_probs = torch.from_numpy(hyps) - ctc_lens = torch.from_numpy(feats_lengths) - hyps = ctc_greedy_search(ctc_probs, ctc_lens) - + hyps = tensorrt_infer(engine,context, feats, feats_lengths) for i, key in enumerate(keys): line = f"{key} " for w in hyps[i]: - w = w - 1 if w == eos: break line += char_dict[w] @@ -271,6 +257,7 @@ def main(): reference_data.append(line) cer, corr = calculate_cer(results, reference_data) + target_cer = float(os.environ["Accuracy"]) metricResult = {"metricResult": {}} metricResult["metricResult"]["CER"] = round(cer, 3) diff --git a/models/audio/speech_recognition/conformer/ixrt/ixrt_inference_performance.py b/models/audio/speech_recognition/conformer/ixrt/ixrt_inference_performance.py index fcfcb55952d5b4616c140e9ae1f076f90b57ad58..ffc01462a154b564c2ddf879f17901c7039d39db 100644 --- a/models/audio/speech_recognition/conformer/ixrt/ixrt_inference_performance.py +++ b/models/audio/speech_recognition/conformer/ixrt/ixrt_inference_performance.py @@ -1,48 +1,45 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) # -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
+# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import os import sys -import time sys.path.append(os.path.dirname(os.path.dirname(__file__))) -import argparse import yaml +import time import copy -import torch +import argparse +import pickle import numpy as np from tqdm.contrib import tqdm from torch.utils.data import DataLoader + from wenet.file_utils import read_symbol_table from wenet.dataset import Dataset -from tools.compute_cer import Calculator, characterize, normalize, default_cluster + import tensorrt from tensorrt import Dims from common import create_engine_context, get_io_bindings,trtapi,setup_io_bindings import pickle -import pycuda.autoinit -import pycuda.driver as cuda - -from utils import make_pad_mask, RelPositionalEncoding -from postprocess import ctc_greedy_search - +import cuda.cuda as cuda +import cuda.cudart as cudart -rel_positional_encoding = RelPositionalEncoding(256, 0.1) +from load_ixrt_plugin import load_ixrt_plugin +load_ixrt_plugin() def get_args(): @@ -62,90 +59,43 @@ def get_args(): args = parser.parse_args() return args - -def tensorrt_infer(engine, context, all_inputs): - - input_names = ["input", "mask", "pos_emb"] - output_names = ["output"] - - for input_name, input_data in zip(input_names, all_inputs): - input_idx = engine.get_binding_index(input_name) - input_shape = input_data.shape - context.set_binding_shape(input_idx, Dims(input_shape)) - - inputs, outputs, allocations = setup_io_bindings(engine, context) - pred_output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) - - for i, input_data in enumerate(all_inputs): - cuda.memcpy_htod(inputs[i]["allocation"], input_data) - - context.execute_v2(allocations) - cuda.memcpy_dtoh(pred_output, outputs[0]["allocation"]) - return pred_output - - def engine_init(engine): host_mem = tensorrt.IHostMemory logger = tensorrt.Logger(tensorrt.Logger.ERROR) engine, context = create_engine_context(engine, logger) - + return engine,context +def tensorrt_infer(engine,context, features, lengths): + + input_names=["input","seq_lengths"] + output_names=["output"] + input_idx = engine.get_binding_index(input_names[0]) + input_shape = features.shape + context.set_binding_shape(input_idx, Dims(input_shape)) + + seq_lengths_idx = engine.get_binding_index(input_names[1]) + seq_lengths_shape = lengths.shape + context.set_binding_shape(seq_lengths_idx, Dims(seq_lengths_shape)) + + inputs, outputs, allocations = setup_io_bindings(engine, context) + pred_output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) + err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], features, features.nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoD(inputs[1]["allocation"], lengths, lengths.nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + context.execute_v2(allocations) + err, = cuda.cuMemcpyDtoH(pred_output, outputs[0]["allocation"], outputs[0]["nbytes"]) + assert(err == cuda.CUresult.CUDA_SUCCESS) + return pred_output -def calculate_cer(data, reference_data): - calculator = Calculator() - tochar = True - split = None - case_sensitive = False - ignore_words = set() - rec_set = {} - for line in data: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] 
= normalize(array[1:], ignore_words, case_sensitive, split) - - default_clusters = {} - default_words = {} - for line in reference_data: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - - for word in rec + lab: - if word not in default_words: - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters: - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name]: - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - result = calculator.calculate(lab, rec) - - result = calculator.overall() - cer = float(result["ins"] + result["sub"] + result["del"]) / result["all"] - corr = result["cor"] / result["all"] - - return cer, corr def main(): args = get_args() # 读取配置文件 - config_fn = os.path.join(args.model_dir, "train.yaml") + config_fn = os.path.join(args.model_dir, "config.yaml") with open(config_fn, "r") as fin: configs = yaml.load(fin, Loader=yaml.FullLoader) @@ -165,14 +115,13 @@ def main(): dataset_conf["batch_conf"]["batch_size"] = args.batch_size # Load dict - dict_fn = os.path.join(args.model_dir, "units.txt") + dict_fn = os.path.join(args.model_dir, "words.txt") char_dict = {} with open(dict_fn, "r", encoding="utf8") as fin: for line in fin: arr = line.strip().split() assert len(arr) == 2 char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 data_type = "raw" test_data_fn = os.path.join(args.data_dir, "data.list") @@ -200,15 +149,15 @@ def main(): feats_lengths.cpu().numpy().astype(np.int32), ] ) - with open(data_path_pkl, "wb") as f: - pickle.dump( - [ - eval_samples, - max_batch_size, - max_feature_length - ], - f, - ) + with open(data_path_pkl, "wb") as f: + pickle.dump( + [ + eval_samples, + max_batch_size, + max_feature_length + ], + f, + ) else: print(f"load data from tmp: {data_path_pkl}") with open(data_path_pkl, "rb") as f: @@ -221,40 +170,24 @@ def main(): f"dataset max shape: batch_size: {max_batch_size}, feat_length: {max_feature_length}" ) - print("*** 2. Load engine ***") - engine_path = os.path.join(args.model_dir, f"conformer_encoder_fusion.engine") + print("*** 2. Load IxRT engine ***") + engine_path = os.path.join(args.model_dir, f"conformer_{args.infer_type}_trt.engine") engine, context = engine_init(engine_path) - print("*** 3. Warm up ***") if args.warm_up > 0: for i in range(args.warm_up): - feats_tmp = np.ones((args.batch_size, 1500, 80)).astype(np.float32) - feats_lengths_tmp = np.ones((args.batch_size)).astype(np.int32) * 1500 - mask_tmp = make_pad_mask(feats_lengths_tmp, 1500) - mask_len_tmp = mask_tmp.shape[-1] - pos_emb_tmp = rel_positional_encoding(mask_len_tmp).numpy() - all_inputs = [feats_tmp, mask_tmp, pos_emb_tmp] - tensorrt_infer(engine, context, all_inputs) + feats_tmp = np.ones((args.batch_size,1200,80)).astype(np.float16) + feats_lengths_tmp = np.ones((args.batch_size)).astype(np.int32) + tensorrt_infer(engine,context, feats_tmp, feats_lengths_tmp) print("*** 4. 
Inference ***") start_time = time.time() num_samples = 0 results = [] for keys, feats, feats_lengths in tqdm(eval_samples): - b, seq_len, feat = feats.shape - num_samples += b - inputs = feats.astype(np.float32) - mask = make_pad_mask(feats_lengths, seq_len) - mask_len = mask.shape[-1] - pos_emb = rel_positional_encoding(mask_len).numpy() - - all_inputs = [inputs, mask, pos_emb] - hyps = tensorrt_infer( - engine, - context, - all_inputs - ) - + num_samples += feats.shape[0] + hyps = tensorrt_infer(engine,context, feats, feats_lengths) + results.append([hyps, keys]) eval_time = time.time() - start_time QPS = num_samples / eval_time @@ -270,7 +203,7 @@ def main(): exit() else: print("failed!") - exit(1) + exit(10) if __name__ == "__main__": diff --git a/models/audio/speech_recognition/conformer/ixrt/load_ixrt_plugin.py b/models/audio/speech_recognition/conformer/ixrt/load_ixrt_plugin.py new file mode 100644 index 0000000000000000000000000000000000000000..932efbdfd1a4e91d8ddfd363adf6bce989df1709 --- /dev/null +++ b/models/audio/speech_recognition/conformer/ixrt/load_ixrt_plugin.py @@ -0,0 +1,12 @@ +import ctypes +import tensorrt +from os.path import join, dirname, exists +def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""): + if not dynamic_path: + dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so") + if not exists(dynamic_path): + raise FileNotFoundError( + f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!") + ctypes.CDLL(dynamic_path) + tensorrt.init_libnvinfer_plugins(logger, namespace) + print(f"Loaded plugin from {dynamic_path}") \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/ixrt/postprocess/__init__.py b/models/audio/speech_recognition/conformer/ixrt/postprocess/__init__.py deleted file mode 100644 index 33f8b0465aee011298fa9933086fbdc1c8dbd4d4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/ixrt/postprocess/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .search import ctc_greedy_search diff --git a/models/audio/speech_recognition/conformer/ixrt/postprocess/search.py b/models/audio/speech_recognition/conformer/ixrt/postprocess/search.py deleted file mode 100644 index d2ae55650539b9d0be352e78a64999606ac12fbb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/ixrt/postprocess/search.py +++ /dev/null @@ -1,103 +0,0 @@ -import math -from collections import defaultdict -from typing import List, Dict - -import torch -from torch.nn.utils.rnn import pad_sequence - - -def remove_duplicates_and_blank(hyp: List[int], - blank_id: int = 0) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != blank_id: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. 
- - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - - mask = mask[:, 2::2][:, 2::2] - return mask - - -class DecodeResult: - - def __init__(self, - tokens: List[int], - score: float = 0.0, - confidence: float = 0.0, - tokens_confidence: List[float] = None, - times: List[int] = None, - nbest: List[List[int]] = None, - nbest_scores: List[float] = None, - nbest_times: List[List[int]] = None): - """ - Args: - tokens: decode token list - score: the total decode score of this result - confidence: the total confidence of this result, it's in 0~1 - tokens_confidence: confidence of each token - times: timestamp of each token, list of (start, end) - nbest: nbest result - nbest_scores: score of each nbest - nbest_times: - """ - self.tokens = tokens - self.score = score - self.confidence = confidence - self.tokens_confidence = tokens_confidence - self.times = times - self.nbest = nbest - self.nbest_scores = nbest_scores - self.nbest_times = nbest_times - - -def ctc_greedy_search(ctc_probs: torch.Tensor, - ctc_lens: torch.Tensor, - blank_id: int = 0) -> List[DecodeResult]: - - batch_size = ctc_probs.shape[0] - maxlen = ctc_probs.size(1) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - - mask_ctc_lens = ctc_lens[0].item() - mask = make_pad_mask(ctc_lens, mask_ctc_lens) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, blank_id) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - results = [] - for hyp in hyps: - results.append(remove_duplicates_and_blank(hyp, blank_id)) - return results - diff --git a/models/audio/speech_recognition/conformer/ixrt/requirements.txt b/models/audio/speech_recognition/conformer/ixrt/requirements.txt index dd4788cf7291642c165bfd61f31399f2e24213e9..3dcea1ccc8337478e16d50942acc6175d270b9b5 100644 --- a/models/audio/speech_recognition/conformer/ixrt/requirements.txt +++ b/models/audio/speech_recognition/conformer/ixrt/requirements.txt @@ -1,5 +1,5 @@ tqdm onnx -typeguard==2.13.3 onnxsim -pycuda \ No newline at end of file +librosa +soundfile \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_accuracy.sh b/models/audio/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_accuracy.sh index f1af4bb4e03a0c9c6084ae7a122f66f765c27c86..435cf5f35e064d239d7b7a6de361af3a40760238 100644 --- a/models/audio/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_accuracy.sh +++ b/models/audio/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_accuracy.sh @@ -12,38 +12,35 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. - set -euo pipefail +EXIT_STATUS=0 +check_status() +{ + ret_code=${PIPESTATUS[0]} + if [ ${ret_code} != 0 ]; then + echo "fails" + [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1 + fi +} + current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) PROJECT_DIR=${current_path}/.. 
DATA_DIR=${current_path}/../aishell_test_data/test
MODEL_DIR=${current_path}/../conformer_checkpoints
 
-export Accuracy=${Accuracy:=0.052}
+export Accuracy=${Accuracy:=0.05}
 
 cd ${PROJECT_DIR}
 
-echo "Step1.Export Onnx From Checkpoints!"
-python3 convert2onnx.py \
-    --model_name "Conformer" \
-    --model_path=${MODEL_DIR}/final.pt \
-    --onnx_path=${MODEL_DIR}/conformer_encoder_fusion.onnx \
-    --batch_size=8
-
-echo "Step2.Build Engine!"
 python3 build_engine.py \
-    --model_name "Conformer" \
-    --onnx_path=${MODEL_DIR}/conformer_encoder_fusion.onnx \
-    --engine_path=${MODEL_DIR}/conformer_encoder_fusion.engine \
-    --max_batch_size=8 \
-    --max_seq_len=1500
+    --onnx_model ${MODEL_DIR}/conformer_fp16_trt.onnx \
+    --engine_path ${MODEL_DIR}/conformer_fp16_trt.engine "$@"; check_status
 
-echo "Step3.Inference(Test ACC)!"
 python3 ixrt_inference_accuracy.py \
     --infer_type fp16 \
-    --warm_up 3 \
-    --batch_size ${BATCH_SIZE:=8} \
+    --batch_size ${BATCH_SIZE:=24} \
     --data_dir ${DATA_DIR} \
-    --model_dir ${MODEL_DIR}
+    --model_dir ${MODEL_DIR} "$@"; check_status
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/audio/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_performance.sh b/models/audio/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_performance.sh
index dc02673c03fb21a4301b757a18885af81cbad31d..1ad506119f1ab87b08254f1f861efb9020139e6f 100644
--- a/models/audio/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_performance.sh
+++ b/models/audio/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_performance.sh
@@ -12,16 +12,15 @@
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 # License for the specific language governing permissions and limitations
 # under the License.
-
 set -euo pipefail
-
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
         echo "fails"
-        EXIT_STATUS=1
+        [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
@@ -31,29 +30,17 @@ PROJECT_DIR=${current_path}/..
 DATA_DIR=${current_path}/../aishell_test_data/test
 MODEL_DIR=${current_path}/../conformer_checkpoints
 
-export Accuracy=${Accuracy:=350}
+export Accuracy=${Accuracy:=529}
 
 cd ${PROJECT_DIR}
-
-echo "Step1.Export Onnx From Checkpoints!"
-python3 convert2onnx.py \
-    --model_name "Conformer" \
-    --model_path=${MODEL_DIR}/final.pt \
-    --onnx_path=${MODEL_DIR}/conformer_encoder_fusion.onnx \
-    --batch_size=24
-
-echo "Step2.Build Engine!"
 python3 build_engine.py \
-    --model_name "Conformer" \
-    --onnx_path=${MODEL_DIR}/conformer_encoder_fusion.onnx \
-    --engine_path=${MODEL_DIR}/conformer_encoder_fusion.engine \
-    --max_batch_size=24 \
-    --max_seq_len=1500
+    --onnx_model ${MODEL_DIR}/conformer_fp16_trt.onnx \
+    --engine_path ${MODEL_DIR}/conformer_fp16_trt.engine "$@"; check_status
 
-echo "Step3.Inference(Test QPS)!"
python3 ixrt_inference_performance.py \ --infer_type fp16 \ --batch_size ${BATCH_SIZE:=24} \ --data_dir ${DATA_DIR} \ - --model_dir ${MODEL_DIR} + --model_dir ${MODEL_DIR} "$@"; check_status +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/ixrt/utils/__init__.py b/models/audio/speech_recognition/conformer/ixrt/utils/__init__.py deleted file mode 100644 index c57435c110fc12f39d79c1b02f4b2e83dfe1a3e3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/ixrt/utils/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -import os -import torch -import numpy as np - -from .embedding import RelPositionalEncoding - - -rel_positional_encoding = RelPositionalEncoding(256, 0.1) - - -def make_pad_mask(lengths: np.ndarray, max_len: int = 0) -> np.ndarray : - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (numpy.ndarray): Batch of lengths (B,). - Returns: - numpy.ndarray: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - - batch_size = lengths.shape[0] - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = np.arange(0, max_len, dtype=np.int64) - seq_range_expand = np.tile(seq_range, batch_size).reshape(batch_size, max_len) - seq_length_expand = lengths[..., None] - mask = seq_range_expand >= seq_length_expand - mask = np.expand_dims(mask, axis=1) - mask = ~mask - mask = mask[:, :, 2::2][:, :, 2::2] - mask = mask.astype(np.int32) - return mask diff --git a/models/audio/speech_recognition/conformer/ixrt/utils/embedding.py b/models/audio/speech_recognition/conformer/ixrt/utils/embedding.py deleted file mode 100644 index 0fd65c4cdfc3fec244c88d2c47cf94b33b9088f3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/ixrt/utils/embedding.py +++ /dev/null @@ -1,133 +0,0 @@ -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F -import numpy as np - - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. - - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - pe[:, 0::2] = torch.sin(position * div_term) - pe[:, 1::2] = torch.cos(position * div_term) - pe = pe.unsqueeze(0) - self.register_buffer("pe", pe) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) 
- torch.Tensor: for compatibility to RelPositionalEncoding - """ - - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, - offset: Union[int, torch.Tensor], - size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - # import ipdb;ipdb.set_trace() - if isinstance(offset, int): - assert offset + size <= self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size <= self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size <= self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - seq_len: int, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). - Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - pos_emb = self.position_encoding(offset, seq_len, False) - # return self.dropout(pos_emb) - return pos_emb - diff --git a/models/audio/speech_recognition/conformer/ixrt/wenet/__init__.py b/models/audio/speech_recognition/conformer/ixrt/wenet/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
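
For reference, the sketch below shows how the rebuilt engine is driven after this patch: the old input/mask/pos_emb triple and the Torch-side CTC post-processing are gone, and the engine consumes "input" plus "seq_lengths" directly, mirroring tensorrt_infer() in the updated ixrt_inference_*.py scripts. It only uses helpers that exist in this tree (load_ixrt_plugin, common.create_engine_context, common.setup_io_bindings); the engine path, batch size, frame count, and FP16 feature dtype are illustrative assumptions, not values fixed by the patch.

# inference_sketch.py -- minimal, illustrative sketch; paths/shapes/dtypes assumed.
import numpy as np
import tensorrt
from tensorrt import Dims
import cuda.cuda as cuda

from common import create_engine_context, setup_io_bindings
from load_ixrt_plugin import load_ixrt_plugin

load_ixrt_plugin()  # registers libixrt_plugin.so so the fused ops deserialize

logger = tensorrt.Logger(tensorrt.Logger.ERROR)
engine, context = create_engine_context(
    "conformer_checkpoints/conformer_fp16_trt.engine", logger  # assumed path
)

# One batch of padded fbank features and the valid length of each utterance.
feats = np.ones((1, 800, 80), dtype=np.float16)   # (batch, frames, mel bins)
lengths = np.array([800], dtype=np.int32)

# Bind this batch's dynamic shapes before allocating the I/O buffers.
context.set_binding_shape(engine.get_binding_index("input"), Dims(feats.shape))
context.set_binding_shape(engine.get_binding_index("seq_lengths"), Dims(lengths.shape))
inputs, outputs, allocations = setup_io_bindings(engine, context)
pred = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])

# Host -> device copies, execution, device -> host copy; each cuda-python
# call returns a (err, ...) tuple that must be checked.
err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], feats, feats.nbytes)
assert err == cuda.CUresult.CUDA_SUCCESS
err, = cuda.cuMemcpyHtoD(inputs[1]["allocation"], lengths, lengths.nbytes)
assert err == cuda.CUresult.CUDA_SUCCESS
context.execute_v2(allocations)
err, = cuda.cuMemcpyDtoH(pred, outputs[0]["allocation"], outputs[0]["nbytes"])
assert err == cuda.CUresult.CUDA_SUCCESS

print("output shape:", pred.shape)  # decoded ids per utterance, cut at eos by the caller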