From fc46caca2ae7878419bffce8c0742b74a92582a5 Mon Sep 17 00:00:00 2001
From: Lanxi
Date: Fri, 8 Mar 2024 18:55:31 +0800
Subject: [PATCH 01/11] First commit of conformer adaptation

---
 .../audio/Conformer/decoder_compile.py        |  24 +
 .../audio/Conformer/encoder_compile.py        |  28 +
 .../audio/Conformer/export_torchscript.py     | 505 ++++++++++++++++++
 .../built-in/audio/Conformer/perf_test_aie.py | 131 +++++
 4 files changed, 688 insertions(+)
 create mode 100644 AscendIE/TorchAIE/built-in/audio/Conformer/decoder_compile.py
 create mode 100644 AscendIE/TorchAIE/built-in/audio/Conformer/encoder_compile.py
 create mode 100644 AscendIE/TorchAIE/built-in/audio/Conformer/export_torchscript.py
 create mode 100644 AscendIE/TorchAIE/built-in/audio/Conformer/perf_test_aie.py

diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/decoder_compile.py b/AscendIE/TorchAIE/built-in/audio/Conformer/decoder_compile.py
new file mode 100644
index 0000000000..bbb5eb6c50
--- /dev/null
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/decoder_compile.py
@@ -0,0 +1,24 @@
+import sys
+import torch
+import torch_aie
+
+DECODER_Y_SHAPE = (1, 2)
+
+inputs = [torch_aie.Input(DECODER_Y_SHAPE, dtype=torch.float32)]
+
+decoder_ts_model = torch.jit.load('./exp/exported_decoder-epoch-99-avg-1.ts')
+decoder_ts_model.eval()
+
+try:
+    compiled_decoder = torch_aie.compile(
+        decoder_ts_model,
+        inputs=inputs,
+        precision_policy=torch_aie.PrecisionPolicy.FP32,
+        truncate_long_and_double=True,
+        soc_version="Ascend310P3",
+    )
+    # torch.jit.save()
+    compiled_decoder.save("./compiled_decoder.ts")
+except Exception as e:
+    print("During the compilation of decoder model, an error has occurred.")
+    sys.exit(1)
\ No newline at end of file
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/encoder_compile.py b/AscendIE/TorchAIE/built-in/audio/Conformer/encoder_compile.py
new file mode 100644
index 0000000000..fa6af3fd0f
--- /dev/null
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/encoder_compile.py
@@ -0,0 +1,28 @@
+import torch
+import mindietorch as torch_aie
+
+ENCODER_X_SHAPE = (1, 100, 80)
+ENCODER_X_LENS_SHAPE = (1, )
+
+inputs = [torch_aie.Input(ENCODER_X_SHAPE, dtype=torch.float32), torch_aie.Input(ENCODER_X_LENS_SHAPE, dtype=torch.int64)]
+
+encoder_ts_model = torch.jit.load('./exp/exported_encoder-epoch-99-avg-1.ts')
+encoder_ts_model.eval()
+
+try:
+    compiled_encoder_model = torch_aie.compile(
+        encoder_ts_model,
+        inputs=inputs,
+        precision_policy=torch_aie.PrecisionPolicy.FP32,
+        truncate_long_and_double=True,
+        soc_version="Ascend310P3",
+        optimization_level=1
+    )
+    # torch.jit.save(compiled_encoder_model, "./compiled_encoder.ts")
+    compiled_encoder_model.save("./compiled_encoder.ts")
+    # torch.jit.load("./compiled_encoder.ts")
+except Exception as e:
+    print("an error has occurred.")
+    print(e)
+    import sys
+    sys.exit(1)
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/export_torchscript.py b/AscendIE/TorchAIE/built-in/audio/Conformer/export_torchscript.py
new file mode 100644
index 0000000000..ebb2ca4ac7
--- /dev/null
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/export_torchscript.py
@@ -0,0 +1,505 @@
+#!/usr/bin/env python3
+#
+# Copyright 2023 Xiaomi Corporation (Author: Fangjun Kuang)
+
+"""
+This script exports a transducer model from PyTorch to ONNX.
+
+We use the pre-trained model from
+https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless5_offline/
+as an example to show how to use this file.
+
+1. 
Download the pre-trained model + +cd egs/wenetspeech/ASR + +repo_url=https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless5_offline/ +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +repo=$(basename $repo_url) + +pushd $repo +git lfs pull --include "data/lang_char/Linv.pt" +git lfs pull --include "exp/pretrained_epoch_9_avg_1.pt" + +cd exp +ln -s pretrained_epoch_9_avg_1.pt epoch-99.pt +popd + +2. Export the model to ONNX + +./pruned_transducer_stateless5/export-onnx.py \ + --tokens $repo/data/lang_char/tokens.txt \ + --epoch 99 \ + --avg 1 \ + --use-averaged-model 0 \ + --exp-dir $repo/exp \ + --num-encoder-layers 24 \ + --dim-feedforward 1536 \ + --nhead 8 \ + --encoder-dim 384 \ + --decoder-dim 512 \ + --joiner-dim 512 + +It will generate the following 3 files inside $repo/exp: + + - encoder-epoch-99-avg-1.onnx + - decoder-epoch-99-avg-1.onnx + - joiner-epoch-99-avg-1.onnx + +See ./onnx_pretrained.py and ./onnx_check.py for how to +use the exported ONNX models. +""" + +import argparse +import logging +from pathlib import Path +from typing import Dict, Tuple + +import k2 +import onnx +import torch +import torch.nn as nn +from conformer import Conformer +from decoder import Decoder +from onnxruntime.quantization import QuantType, quantize_dynamic +from scaling_converter import convert_scaled_to_non_scaled +from train import add_model_arguments, get_params, get_transducer_model + +from icefall.checkpoint import ( + average_checkpoints, + average_checkpoints_with_averaged_model, + find_checkpoints, + load_checkpoint, +) +from icefall.utils import num_tokens, setup_logger, str2bool + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--epoch", + type=int, + default=28, + help="""It specifies the checkpoint to use for averaging. + Note: Epoch counts from 0. + You can specify --avg to use more checkpoints for model averaging.""", + ) + + parser.add_argument( + "--iter", + type=int, + default=0, + help="""If positive, --epoch is ignored and it + will use the checkpoint exp_dir/checkpoint-iter.pt. + You can specify --avg to use more checkpoints for model averaging. + """, + ) + + parser.add_argument( + "--avg", + type=int, + default=15, + help="Number of checkpoints to average. Automatically select " + "consecutive checkpoints before the checkpoint specified by " + "'--epoch' and '--iter'", + ) + + parser.add_argument( + "--use-averaged-model", + type=str2bool, + default=True, + help="Whether to load averaged model. Currently it only supports " + "using --epoch. If True, it would decode with the averaged model " + "over the epoch range from `epoch-avg` (excluded) to `epoch`." + "Actually only the models with epoch number of `epoch-avg` and " + "`epoch` are loaded for averaging. ", + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="pruned_transducer_stateless5/exp", + help="""It specifies the directory where all training related + files, e.g., checkpoints, log, etc, are saved + """, + ) + + parser.add_argument( + "--tokens", + type=str, + default="data/lang_char/tokens.txt", + help="Path to the tokens.txt", + ) + + parser.add_argument( + "--context-size", + type=int, + default=2, + help="The context size in the decoder. 1 means bigram; 2 means tri-gram", + ) + + add_model_arguments(parser) + + return parser + + +# def add_meta_data(filename: str, meta_data: Dict[str, str]): +# """Add meta data to an ONNX model. It is changed in-place. 
+ +# Args: +# filename: +# Filename of the ONNX model to be changed. +# meta_data: +# Key-value pairs. +# """ +# model = onnx.load(filename) +# for key, value in meta_data.items(): +# meta = model.metadata_props.add() +# meta.key = key +# meta.value = value + +# onnx.save(model, filename) + + +class OnnxEncoder(nn.Module): + """A wrapper for Conformer and the encoder_proj from the joiner""" + + def __init__(self, encoder: Conformer, encoder_proj: nn.Linear): + """ + Args: + encoder: + A Conformer encoder. + encoder_proj: + The projection layer for encoder from the joiner. + """ + super().__init__() + self.encoder = encoder + self.encoder_proj = encoder_proj + + def forward( + self, + x: torch.Tensor, + x_lens: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Please see the help information of Conformer.forward + + Args: + x: + A 3-D tensor of shape (N, T, C) + x_lens: + A 1-D tensor of shape (N,). Its dtype is torch.int64 + Returns: + Return a tuple containing: + - encoder_out, A 3-D tensor of shape (N, T', joiner_dim) + - encoder_out_lens, A 1-D tensor of shape (N,) + """ + encoder_out, encoder_out_lens = self.encoder(x, x_lens) + + encoder_out = self.encoder_proj(encoder_out) + # Now encoder_out is of shape (N, T, joiner_dim) + + return encoder_out, encoder_out_lens + + +class OnnxDecoder(nn.Module): + """A wrapper for Decoder and the decoder_proj from the joiner""" + + def __init__(self, decoder: Decoder, decoder_proj: nn.Linear): + super().__init__() + self.decoder = decoder + self.decoder_proj = decoder_proj + + def forward(self, y: torch.Tensor) -> torch.Tensor: + """ + Args: + y: + A 2-D tensor of shape (N, context_size). + Returns + Return a 2-D tensor of shape (N, joiner_dim) + """ + need_pad = False + decoder_output = self.decoder(y, need_pad=need_pad) + decoder_output = decoder_output.squeeze(1) + output = self.decoder_proj(decoder_output) + + return output + + +class OnnxJoiner(nn.Module): + """A wrapper for the joiner""" + + def __init__(self, output_linear: nn.Linear): + super().__init__() + self.output_linear = output_linear + + def forward( + self, + encoder_out: torch.Tensor, + decoder_out: torch.Tensor, + ) -> torch.Tensor: + """ + Args: + encoder_out: + A 2-D tensor of shape (N, joiner_dim) + decoder_out: + A 2-D tensor of shape (N, joiner_dim) + Returns: + Return a 2-D tensor of shape (N, vocab_size) + """ + logit = encoder_out + decoder_out + logit = self.output_linear(torch.tanh(logit)) + return logit + + +def export_encoder_torchscript( + encoder_model: OnnxEncoder, + encoder_filename: str, + +) -> None: + """Export the given encoder model to ONNX format. + The exported model has two inputs: + + - x, a tensor of shape (N, T, C); dtype is torch.float32 + - x_lens, a tensor of shape (N,); dtype is torch.int64 + + and it has two outputs: + + - encoder_out, a tensor of shape (N, T', joiner_dim) + - encoder_out_lens, a tensor of shape (N,) + + Args: + encoder_model: + The input encoder model + encoder_filename: + The filename to save the exported ONNX model. + opset_version: + The opset version to use. + """ + x = torch.rand(1, 100, 80, dtype=torch.float32) + x_lens = torch.tensor([100], dtype=torch.int64) + encoder_ts = torch.jit.trace(encoder_model, (x, x_lens)) + torch.jit.save(encoder_ts, encoder_filename) + +def export_decoder_torchscript( + decoder_model: OnnxDecoder, + decoder_filename: str, +) -> None: + """Export the decoder model to ONNX format. 
+ + The exported model has one input: + + - y: a torch.int64 tensor of shape (N, decoder_model.context_size) + + and has one output: + + - decoder_out: a torch.float32 tensor of shape (N, joiner_dim) + + Args: + decoder_model: + The decoder model to be exported. + decoder_filename: + Filename to save the exported ONNX model. + opset_version: + The opset version to use. + """ + context_size = decoder_model.decoder.context_size + vocab_size = decoder_model.decoder.vocab_size + + y = torch.rand(10, context_size).to(dtype=torch.int64) + decoder_model = torch.jit.trace(decoder_model, y) + torch.jit.save(decoder_model, decoder_filename) + +def export_joiner_torchscript( + joiner_model: nn.Module, + joiner_filename: str, + # opset_version: int = 11, +) -> None: + """Export the joiner model to ONNX format. + The exported joiner model has two inputs: + + - encoder_out: a tensor of shape (N, joiner_dim) + - decoder_out: a tensor of shape (N, joiner_dim) + + and produces one output: + + - logit: a tensor of shape (N, vocab_size) + """ + joiner_dim = joiner_model.output_linear.weight.shape[1] + logging.info(f"joiner dim: {joiner_dim}") + + projected_encoder_out = torch.rand(11, joiner_dim, dtype=torch.float32) + projected_decoder_out = torch.rand(11, joiner_dim, dtype=torch.float32) + joiner_ts = torch.jit.trace(joiner_model, (projected_encoder_out, projected_decoder_out)) + torch.jit.save(joiner_ts, joiner_filename) + + +@torch.no_grad() +def main(): + args = get_parser().parse_args() + args.exp_dir = Path(args.exp_dir) + + params = get_params() + params.update(vars(args)) + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", 0) + + setup_logger(f"{params.exp_dir}/log-export/log-export-onnx") + + logging.info(f"device: {device}") + + token_table = k2.SymbolTable.from_file(params.tokens) + params.blank_id = token_table[""] + params.vocab_size = num_tokens(token_table) + 1 + + logging.info(params) + + logging.info("About to create model") + model = get_transducer_model(params) + + model.to(device) + + if not params.use_averaged_model: + if params.iter > 0: + filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[ + : params.avg + ] + if len(filenames) == 0: + raise ValueError( + f"No checkpoints found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + elif len(filenames) < params.avg: + raise ValueError( + f"Not enough checkpoints ({len(filenames)}) found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + elif params.avg == 1: + load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model) + else: + start = params.epoch - params.avg + 1 + filenames = [] + for i in range(start, params.epoch + 1): + if i >= 1: + filenames.append(f"{params.exp_dir}/epoch-{i}.pt") + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + else: + if params.iter > 0: + filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[ + : params.avg + 1 + ] + if len(filenames) == 0: + raise ValueError( + f"No checkpoints found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + elif len(filenames) < params.avg + 1: + raise ValueError( + f"Not enough checkpoints ({len(filenames)}) found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + filename_start = filenames[-1] + filename_end = filenames[0] + logging.info( + "Calculating the 
averaged model over iteration checkpoints" + f" from {filename_start} (excluded) to {filename_end}" + ) + model.to(device) + model.load_state_dict( + average_checkpoints_with_averaged_model( + filename_start=filename_start, + filename_end=filename_end, + device=device, + ) + ) + else: + assert params.avg > 0, params.avg + start = params.epoch - params.avg + assert start >= 1, start + filename_start = f"{params.exp_dir}/epoch-{start}.pt" + filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt" + logging.info( + f"Calculating the averaged model over epoch range from " + f"{start} (excluded) to {params.epoch}" + ) + model.to(device) + model.load_state_dict( + average_checkpoints_with_averaged_model( + filename_start=filename_start, + filename_end=filename_end, + device=device, + ) + ) + + model.to("cpu") + model.eval() + + convert_scaled_to_non_scaled(model, inplace=True) + + encoder = OnnxEncoder( + encoder=model.encoder, + encoder_proj=model.joiner.encoder_proj, + ) + + decoder = OnnxDecoder( + decoder=model.decoder, + decoder_proj=model.joiner.decoder_proj, + ) + + joiner = OnnxJoiner(output_linear=model.joiner.output_linear) + + encoder_num_param = sum([p.numel() for p in encoder.parameters()]) + decoder_num_param = sum([p.numel() for p in decoder.parameters()]) + joiner_num_param = sum([p.numel() for p in joiner.parameters()]) + total_num_param = encoder_num_param + decoder_num_param + joiner_num_param + logging.info(f"encoder parameters: {encoder_num_param}") + logging.info(f"decoder parameters: {decoder_num_param}") + logging.info(f"joiner parameters: {joiner_num_param}") + logging.info(f"total parameters: {total_num_param}") + + if params.iter > 0: + suffix = f"iter-{params.iter}" + else: + suffix = f"epoch-{params.epoch}" + + suffix += f"-avg-{params.avg}" + + opset_version = 13 + + logging.info("Exporting encoder") + encoder_filename = params.exp_dir / f"exported_encoder-{suffix}.ts" + export_encoder_torchscript( + encoder, + encoder_filename + ) + logging.info(f"Exported encoder to {encoder_filename}") + + logging.info("Exporting decoder") + decoder_filename = params.exp_dir / f"exported_decoder-{suffix}.ts" + export_decoder_torchscript( + decoder, + decoder_filename + ) + logging.info(f"Exported decoder to {decoder_filename}") + + logging.info("Exporting joiner") + joiner_filename = params.exp_dir / f"exported_joiner-{suffix}.ts" + export_joiner_torchscript( + joiner, + joiner_filename + ) + logging.info(f"Exported joiner to {joiner_filename}") + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + + main() diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/perf_test_aie.py b/AscendIE/TorchAIE/built-in/audio/Conformer/perf_test_aie.py new file mode 100644 index 0000000000..9a68e3d977 --- /dev/null +++ b/AscendIE/TorchAIE/built-in/audio/Conformer/perf_test_aie.py @@ -0,0 +1,131 @@ +import time +import argparse +import json + +import numpy as np +import torch +import torch_aie +# import mindietorch as torch_aie +from tqdm import tqdm +# from utils import init_encoder_states, build_encoder_input_output + + +def test_encoder(aie_path, device_id = 0): + batch_size = 1 + device = f'npu:{device_id}' + stream = torch_aie.npu.Stream(device) + print("Start loading ts module...") + ts = torch.jit.load(aie_path) + print("Ts module loaded.") + ts.eval() + x, x_lens = np.ones((1, 100, 80), dtype=np.float32), np.array([100]) + + inputs = (torch.from_numpy(x).to("npu:0"), torch.from_numpy(x_lens).to("npu:0")) + print("Start 
inferring...")
+    # warmup
+    for _ in range(10):
+        with torch_aie.npu.stream(stream):
+            ts(*inputs)
+        stream.synchronize()
+
+    # performance test
+    num_infer = 100
+    time_cost = []
+    for _ in tqdm(range(num_infer)):
+        with torch_aie.npu.stream(stream):
+            start = time.time()
+
+            ts(*inputs)
+            stream.synchronize()
+            end = time.time()
+            time_cost.append(end - start)
+    # print(f"Encoder latency: {(end - start) / num_infer * 1000:.2f} ms")
+    print(f"Encoder throughput: {num_infer * batch_size / (sum(time_cost)):.2f} fps")
+
+
+def test_decoder(aie_path, device_id):
+    batch_size = 1
+    dummy_input = np.ones((batch_size, 2), dtype=np.int64)
+
+    device = f'npu:{device_id}'
+    stream = torch_aie.npu.Stream(device)
+    print("Start loading ts module...")
+    model = torch.jit.load(aie_path)
+    print("Ts module loaded.")
+    model.eval()
+    dummy_input = torch.from_numpy(dummy_input).to(device)
+
+    # warmup
+    for _ in range(10):
+        with torch_aie.npu.stream(stream):
+            model(dummy_input)
+        stream.synchronize()
+
+    # performance test
+    num_infer = 100
+    start = time.time()
+    for _ in tqdm(range(num_infer)):
+        with torch_aie.npu.stream(stream):
+            model(dummy_input)
+        stream.synchronize()
+    end = time.time()
+
+    print(f"Decoder latency: {(end - start) / num_infer * 1000:.2f} ms")
+    print(f"Decoder throughput: {num_infer * batch_size / (end - start):.2f} fps")
+
+
+def test_joiner(aie_path, device_id):
+    batch_size = 1
+    encoder_out = np.ones((batch_size, 512), dtype=np.float32)
+    decoder_out = np.ones((batch_size, 512), dtype=np.float32)
+
+    device = f'npu:{device_id}'
+    stream = torch_aie.npu.Stream(device)
+    model = torch.jit.load(aie_path)
+    model.eval()
+    encoder_out = torch.from_numpy(encoder_out).to(device)
+    decoder_out = torch.from_numpy(decoder_out).to(device)
+
+    # warmup
+    for _ in range(10):
+        with torch_aie.npu.stream(stream):
+            out = model(encoder_out, decoder_out)
+        stream.synchronize()
+
+    # performance test
+    num_infer = 100
+    start = time.time()
+    for _ in range(num_infer):
+        with torch_aie.npu.stream(stream):
+            model(encoder_out, decoder_out)
+        stream.synchronize()
+    end = time.time()
+
+    print(f"Joiner latency: {(end - start) / num_infer * 1000:.2f} ms")
+    print(f"Joiner throughput: {num_infer * batch_size / (end - start):.2f} fps")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--encoder_aie_path", type=str, required=True)
+    parser.add_argument("--decoder_aie_path", type=str, required=True)
+    parser.add_argument("--joiner_aie_path", type=str, required=True)
+    parser.add_argument("--device_id", type=int, help="NPU device id", default=0)
+
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    torch_aie.set_device(0)
+    args = parse_args()
+
+
+    test_encoder(args.encoder_aie_path, args.device_id)
+    test_decoder(args.decoder_aie_path, args.device_id)
+    test_joiner(args.joiner_aie_path, args.device_id)
+
+
+if __name__ == "__main__":
+    main()
-- 
Gitee

From 46fdbd9c0c70f8c587dde2e773a910810fac72fa Mon Sep 17 00:00:00 2001
From: Lanxi
Date: Mon, 18 Mar 2024 10:13:14 +0800
Subject: [PATCH 02/11] Readme updated.

---
 .../audio/Conformer/decoder_compile.py        |   1 -
 .../audio/Conformer/encoder_compile.py        |  14 +--
 .../audio/Conformer/joiner_compile.py         |  24 ++++
 .../built-in/audio/Conformer/perf_test_aie.py |   9 +-
 .../built-in/audio/Conformer/readme.md        | 115 ++++++++++++++++++
 5 files changed, 151 insertions(+), 12 deletions(-)
 create mode 100644 AscendIE/TorchAIE/built-in/audio/Conformer/joiner_compile.py
 create mode 100644 AscendIE/TorchAIE/built-in/audio/Conformer/readme.md

diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/decoder_compile.py b/AscendIE/TorchAIE/built-in/audio/Conformer/decoder_compile.py
index bbb5eb6c50..1fda260002 100644
--- a/AscendIE/TorchAIE/built-in/audio/Conformer/decoder_compile.py
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/decoder_compile.py
@@ -17,7 +17,6 @@ try:
         truncate_long_and_double=True,
         soc_version="Ascend310P3",
     )
-    # torch.jit.save()
     compiled_decoder.save("./compiled_decoder.ts")
 except Exception as e:
     print("During the compilation of decoder model, an error has occurred.")
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/encoder_compile.py b/AscendIE/TorchAIE/built-in/audio/Conformer/encoder_compile.py
index fa6af3fd0f..2fc240be20 100644
--- a/AscendIE/TorchAIE/built-in/audio/Conformer/encoder_compile.py
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/encoder_compile.py
@@ -1,28 +1,28 @@
+import sys
+
 import torch
-import mindietorch as torch_aie
+import mindietorch
+from mindietorch import _enums
 
 ENCODER_X_SHAPE = (1, 100, 80)
 ENCODER_X_LENS_SHAPE = (1, )
 
-inputs = [torch_aie.Input(ENCODER_X_SHAPE, dtype=torch.float32), torch_aie.Input(ENCODER_X_LENS_SHAPE, dtype=torch.int64)]
+inputs = [mindietorch.Input(ENCODER_X_SHAPE, dtype=torch.float32), torch_aie.Input(ENCODER_X_LENS_SHAPE, dtype=torch.int64)]
 
 encoder_ts_model = torch.jit.load('./exp/exported_encoder-epoch-99-avg-1.ts')
 encoder_ts_model.eval()
 
 try:
-    compiled_encoder_model = torch_aie.compile(
+    compiled_encoder_model = mindietorch.compile(
         encoder_ts_model,
         inputs=inputs,
-        precision_policy=torch_aie.PrecisionPolicy.FP32,
+        precision_policy=_enums.PrecisionPolicy.FP32,
         truncate_long_and_double=True,
         soc_version="Ascend310P3",
         optimization_level=1
     )
-    # torch.jit.save(compiled_encoder_model, "./compiled_encoder.ts")
     compiled_encoder_model.save("./compiled_encoder.ts")
-    # torch.jit.load("./compiled_encoder.ts")
 except Exception as e:
     print("an error has occurred.")
     print(e)
-    import sys
     sys.exit(1)
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/joiner_compile.py b/AscendIE/TorchAIE/built-in/audio/Conformer/joiner_compile.py
new file mode 100644
index 0000000000..63c40e3609
--- /dev/null
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/joiner_compile.py
@@ -0,0 +1,24 @@
+import argparse
+import torch
+import torch_aie
+
+if __name__ == "__main__":
+    torch_aie.set_device(0)
+    JOINER_X_SHAPE = (1, 512)
+
+    inputs = [torch_aie.Input(JOINER_X_SHAPE, dtype=torch.float32), torch_aie.Input(JOINER_X_SHAPE, dtype=torch.float32)]
+
+    joiner_ts_model = torch.jit.load('./exp/exported_joiner-epoch-99-avg-1.ts')
+    joiner_ts_model.eval()
+    try:
+        compiled_joiner = torch_aie.compile(
+            joiner_ts_model,
+            inputs=inputs,
+            precision_policy=torch_aie.PrecisionPolicy.FP32,
+            truncate_long_and_double=True,
+            soc_version="Ascend310P3",
+        )
+        compiled_joiner.save("./compiled_joiner.ts")
+
+    except Exception as e:
+        print(f"Failed to compile the joiner model: {e}")
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/perf_test_aie.py b/AscendIE/TorchAIE/built-in/audio/Conformer/perf_test_aie.py
index 9a68e3d977..bc2e4fc56e 100644
--- a/AscendIE/TorchAIE/built-in/audio/Conformer/perf_test_aie.py
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/perf_test_aie.py
@@ -31,15 +31,16 @@ def test_encoder(aie_path, device_id = 0):
     # performance test
     num_infer = 100
     time_cost = []
+
+    start = time.time()
     for _ in tqdm(range(num_infer)):
         with torch_aie.npu.stream(stream):
-            start = time.time()
 
             ts(*inputs)
             stream.synchronize()
-            end = time.time()
-            time_cost.append(end - start)
-    # print(f"Encoder latency: {(end - start) / num_infer * 1000:.2f} ms")
+    end = time.time()
+
+    print(f"Encoder latency: {(end - start) / num_infer * 1000:.2f} ms")
     print(f"Encoder throughput: {num_infer * batch_size / (sum(time_cost)):.2f} fps")
 
 
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md b/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md
new file mode 100644
index 0000000000..361b280997
--- /dev/null
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md
@@ -0,0 +1,115 @@
+# Zipformer Streaming Model - Inference Guide
+
+
+- [Overview](#ZH-CN_TOPIC_0000001172161501)
+
+- [Inference Environment](#ZH-CN_TOPIC_0000001126281702)
+
+- [Quick Start](#ZH-CN_TOPIC_0000001126281700)
+
+- [Inference Performance & Accuracy](#ZH-CN_TOPIC_0000001172201573)
+
+
+# Overview
+
+(From the paper abstract) The Conformer has become the most popular encoder model for automatic speech recognition (ASR). It adds convolution modules to a transformer to learn both local and global dependencies. In this work we describe a faster, more memory-efficient, and better-performing transformer called Zipformer. The modeling changes include: 1) a U-Net-like encoder structure in which the middle stacks run at lower frame rates; 2) a reorganized block structure with more modules, in which attention weights are reused for efficiency; 3) BiasNorm, a modified form of LayerNorm that allows retaining some length information; 4) the new activation functions SwooshR and SwooshL, which work better than Swish. We also propose a new optimizer called ScaledAdam, which scales each update by the current scale of the tensor to keep the relative change roughly constant, and which also explicitly learns the parameter scale; it converges faster and performs better than Adam. Extensive experiments on the LibriSpeech, Aishell-1 and WenetSpeech datasets demonstrate the effectiveness of the proposed Zipformer over other state-of-the-art ASR models.
+
+
+# Inference Environment \[All Versions\]
+
+- The model requires the following dependencies
+
+  **Table 1** Version mapping
+
+  | Dependency | Version |
+  |---------|---------|
+  | CANN | 8.0.T5 |
+  | Python | 3.10.13 |
+  | torch | 2.1.0 |
+  | SoC | Ascend310P3 |
+
+# Quick Start
+
+## Environment setup
+
+1. Install k2
+   1. (NPU) x86 environment
+      ```shell
+      wget https://huggingface.co/csukuangfj/k2/resolve/main/cpu/k2-1.24.4.dev20231220+cpu.torch2.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+      pip install k2-1.24.4.dev20231220+cpu.torch2.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+      ```
+   2. (NPU/GPU) arm environment: k2 has to be built from source.
+      ```shell
+      git clone https://github.com/k2-fsa/k2.git
+      cd k2
+      export K2_MAKE_ARGS="-j6"
+      python3 setup.py install
+      ```
+      If the commands above fail, refer to [this link](https://k2-fsa.github.io/k2/installation/from_source.html).
+   3. (GPU) x86 environment: download the wheel matching your CUDA version from [this link](https://k2-fsa.github.io/k2/cuda.html) and install it with pip.
+   4. Verify that k2 is installed correctly
+      ```shell
+      python3 -m k2.version
+      ```
+2. Install other dependencies
+   ```shell
+   pip install lhotse
+   pip install kaldifeat
+   ```
+3. Install icefall
+   ```shell
+   git clone https://github.com/k2-fsa/icefall.git
+   cd icefall
+   git reset --hard e2fcb42f5f176d9e39eb38506ab99d0a3adaf202
+   pip install -r requirements.txt
+   ```
+4. Add icefall to PYTHONPATH, replacing "/path/to/icefall" with the directory icefall was cloned into.
+   **This step is important; without it you will get an error that icefall cannot be found.**
+   ```shell
+   export PYTHONPATH=/path/to/icefall:$PYTHONPATH
+   ```
+
+## Model conversion
+1. Download the model
+Download /exp/pretrained_epoch_9_avg_1.pt and the whole /data folder from the HuggingFace link below:
+https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless5_offline/tree/main
+
+Enter the icefall directory cloned above, cd to egs/wenetspeech/ASR/, and put pretrained_epoch_9_avg_1.pt under the ASR directory,
+i.e. wenetspeech/asr/--
+  --data/
+    --lang_char/
+      --Linv.pt
+  --exp/
+
+
+2. Export the TorchScript model
+```shell
+# Put perf_test_aie.py under the ASR directory, then run
+python3 ./pruned_transducer_stateless5/export_torchscript.py \
+    --tokens ./data/lang_char/tokens.txt \
+    --epoch 99 \
+    --avg 1 \
+    --use-averaged-model 0 \
+    --exp-dir ./exp \
+    --num-encoder-layers 24 \
+    --dim-feedforward 1536 \
+    --nhead 8 \
+    --encoder-dim 384 \
+    --decoder-dim 512 \
+    --joiner-dim 512
+
+```
+3. Convert to MindIE Torch models
+- Performance test
+```shell
+# Put perf_test_aie.py under the ASR directory, then run
+python perf_test_aie.py \
+--encoder_aie_path ./compiled_encoder.ts \
+--decoder_aie_path ./compiled_decoder.ts \
+--joiner_aie_path ./compiled_joiner.ts \
+--device_id 0
+```
+Performance numbers (in FPS) will be printed on the screen.
+
+Accuracy test
-- 
Gitee

From a6950ba0c68c96fc4674f8fa4eb93f6c280dc529 Mon Sep 17 00:00:00 2001
From: Lanxi
Date: Mon, 18 Mar 2024 10:36:40 +0800
Subject: [PATCH 03/11] Fixed typo in readme.md

---
 AscendIE/TorchAIE/built-in/audio/Conformer/readme.md | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md b/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md
index 361b280997..e8827ca559 100644
--- a/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md
@@ -101,6 +101,14 @@ python3 ./pruned_transducer_stateless5/export_torchscript.py \
 
 ```
 3. Convert to MindIE Torch models
+Put encoder_compile.py, decoder_compile.py and joiner_compile.py under ASR/pruned_transducer_stateless5/, then run each of them:
+```shell
+# Run from the ASR directory
+python ./pruned_transducer_stateless5/encoder_compile.py
+python ./pruned_transducer_stateless5/decoder_compile.py
+python ./pruned_transducer_stateless5/joiner_compile.py
+```
+This generates compiled_encoder.ts, compiled_decoder.ts and compiled_joiner.ts under the ASR directory.
 - Performance test
 ```shell
 # Put perf_test_aie.py under the ASR directory, then run
@@ -111,5 +119,3 @@ python perf_test_aie.py \
 --device_id 0
 ```
 Performance numbers (in FPS) will be printed on the screen.
-
-Accuracy test
-- 
Gitee

From 682d3a71b9944d2527c2d60b968f81ac0a1ec7c6 Mon Sep 17 00:00:00 2001
From: Lanxi
Date: Mon, 18 Mar 2024 16:56:03 +0800
Subject: [PATCH 04/11] fixed joiner_compile.py

---
 .../audio/Conformer/joiner_compile.py         | 39 ++++++++++---------
 .../built-in/audio/Conformer/perf_test_aie.py | 18 ++++-----
 .../built-in/audio/Conformer/readme.md        | 15 ++++---
 3 files changed, 39 insertions(+), 33 deletions(-)

diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/joiner_compile.py b/AscendIE/TorchAIE/built-in/audio/Conformer/joiner_compile.py
index 63c40e3609..f0206fb5db 100644
--- a/AscendIE/TorchAIE/built-in/audio/Conformer/joiner_compile.py
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/joiner_compile.py
@@ -1,24 +1,25 @@
-import argparse
 import torch
-import torch_aie
+import mindietorch
 
-if __name__ == "__main__":
-    torch_aie.set_device(0)
-    JOINER_X_SHAPE = (1, 512)
+JOINER_X_SHAPE = (1, 512)
 
-    inputs = [torch_aie.Input(JOINER_X_SHAPE, dtype=torch.float32), torch_aie.Input(JOINER_X_SHAPE, dtype=torch.float32)]
+inputs = [mindietorch.Input(JOINER_X_SHAPE, dtype=torch.float32), mindietorch.Input(JOINER_X_SHAPE, dtype=torch.float32)]
 
-    joiner_ts_model = torch.jit.load('./exp/exported_joiner-epoch-99-avg-1.ts')
-    joiner_ts_model.eval()
-    try:
-        compiled_joiner = torch_aie.compile(
-            joiner_ts_model,
-            inputs=inputs,
-            precision_policy=torch_aie.PrecisionPolicy.FP32,
-            truncate_long_and_double=True,
-            soc_version="Ascend310P3",
-        )
-        compiled_joiner.save("./compiled_joiner.ts")
+joiner_ts_model = torch.jit.load('./exp/exported_joiner-epoch-99-avg-1.ts')
+joiner_ts_model.eval()
 
-    except Exception as e:
-        print(f"Failed to compile the joiner model: {e}")
+try:
+    compiled_joiner = mindietorch.compile(
+        joiner_ts_model,
+        inputs=inputs,
+        precision_policy=mindietorch.PrecisionPolicy.FP32,
+        truncate_long_and_double=True,
+        soc_version="Ascend310P3",
+    )
+    # torch.jit.save()
+    compiled_joiner.save("./compiled_joiner.ts")
+except Exception as e:
+    print("During the compilation of joiner model, an error has occurred.")
+    # print(e)
+    import sys
+    sys.exit(1)
\ No newline at end of file
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/perf_test_aie.py b/AscendIE/TorchAIE/built-in/audio/Conformer/perf_test_aie.py
index bc2e4fc56e..20f2aa3680 100644
--- a/AscendIE/TorchAIE/built-in/audio/Conformer/perf_test_aie.py
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/perf_test_aie.py
@@ -4,7 +4,7 @@ import json
 
 import numpy as np
 import torch
-import torch_aie
+import mindietorch
 # import mindietorch as torch_aie
 from tqdm import tqdm
 # from utils import init_encoder_states, build_encoder_input_output
@@ -24,17 +24,17 @@ def test_encoder(aie_path, device_id = 0):
     print("Start inferring...")
     # warmup
     for _ in range(10):
-        with torch_aie.npu.stream(stream):
+        with mindietorch.npu.stream(stream):
             ts(*inputs)
         stream.synchronize()
 
     # performance test
     num_infer = 100
     time_cost = []
 
     start = time.time()
     for _ in tqdm(range(num_infer)):
-        with torch_aie.npu.stream(stream):
+        with mindietorch.npu.stream(stream):
 
             ts(*inputs)
             stream.synchronize()
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md b/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md
index e8827ca559..4b38bc7bf1 100644
--- a/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md
@@ -1,4 +1,4 @@
-# Zipformer Streaming Model - Inference Guide
+# Conformer Model - Inference Guide
 
 
 - [Overview](#ZH-CN_TOPIC_0000001172161501)
@@ -23,7 +23,7 @@
 
   | Dependency | Version |
   |---------|---------|
-  | CANN | 8.0.T5 |
+  | CANN | 7.0RC1 |
   | Python | 3.10.13 |
   | torch | 2.1.0 |
   | SoC | Ascend310P3 |
@@ -42,7 +42,7 @@
       ```shell
       git clone https://github.com/k2-fsa/k2.git
       cd k2
-      export K2_MAKE_ARGS="-j6"
+      export K2_MAKE_ARGS="-j"
       python3 setup.py install
       ```
       If the commands above fail, refer to [this link](https://k2-fsa.github.io/k2/installation/from_source.html).
@@ -72,18 +72,22 @@
 
 ## Model conversion
 1. Download the model
+
 Download /exp/pretrained_epoch_9_avg_1.pt and the whole /data folder from the HuggingFace link below:
 https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless5_offline/tree/main
 
 Enter the icefall directory cloned above, cd to egs/wenetspeech/ASR/, and put pretrained_epoch_9_avg_1.pt under the ASR directory,
-i.e. wenetspeech/asr/--
+i.e.
+```
+wenetspeech/asr/
   --data/
     --lang_char/
       --Linv.pt
   --exp/
+```
+1. Export the TorchScript model
+4. 
 
-
-2. Export the TorchScript model
 ```shell
 # Put perf_test_aie.py under the ASR directory, then run
 python3 ./pruned_transducer_stateless5/export_torchscript.py \
-- 
Gitee
From c6ca80b968c235efcddb4bf3cfce2a385c2d6bad Mon Sep 17 00:00:00 2001
From: Lanxi
Date: Mon, 18 Mar 2024 17:04:56 +0800
Subject: [PATCH 05/11] Fix typo2

---
 .../audio/Conformer/conformer_py.patch        | 11 ++++++++++
 .../audio/Conformer/decoder_compile.py        | 20 ++++++++++---------
 .../audio/Conformer/encoder_compile.py        |  8 +++-----
 .../audio/Conformer/export_torchscript.py     | 18 +++++++++++------
 .../built-in/audio/Conformer/perf_test_aie.py | 10 ++++------
 .../built-in/audio/Conformer/readme.md        | 10 ++++++--
 6 files changed, 49 insertions(+), 28 deletions(-)
 create mode 100644 AscendIE/TorchAIE/built-in/audio/Conformer/conformer_py.patch

diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/conformer_py.patch b/AscendIE/TorchAIE/built-in/audio/Conformer/conformer_py.patch
new file mode 100644
index 0000000000..c7b8c2b335
--- /dev/null
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/conformer_py.patch
@@ -0,0 +1,11 @@
+--- ./pruned_transducer_stateless5/conformer.py	2024-03-18 17:36:34.852000000 +0800
++++ ./pruned_transducer_stateless5/new_conformer.py	2024-03-18 17:35:14.136000000 +0800
+@@ -193,7 +193,7 @@
+     ) # (T, N, C)
+ 
+     x = x.permute(1, 0, 2) # (T, N, C) ->(N, T, C)
+-    return x, lengths
++    return x#, lengths
+ 
+     @torch.jit.export
+     def get_init_state(
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/decoder_compile.py b/AscendIE/TorchAIE/built-in/audio/Conformer/decoder_compile.py
index 1fda260002..7fee3116a6 100644
--- a/AscendIE/TorchAIE/built-in/audio/Conformer/decoder_compile.py
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/decoder_compile.py
@@ -1,23 +1,25 @@
-import sys
 import torch
-import torch_aie
+import mindietorch
+from mindietorch import _enums
 
 DECODER_Y_SHAPE = (1, 2)
 
-inputs = [torch_aie.Input(DECODER_Y_SHAPE, dtype=torch.float32)]
+inputs = [mindietorch.Input(DECODER_Y_SHAPE, dtype=torch.int64)]
 
 decoder_ts_model = torch.jit.load('./exp/exported_decoder-epoch-99-avg-1.ts')
 decoder_ts_model.eval()
-
+mindietorch.set_device(0)
 try:
-    compiled_decoder = torch_aie.compile(
+    compiled_decoder = mindietorch.compile(
         decoder_ts_model,
         inputs=inputs,
-        precision_policy=torch_aie.PrecisionPolicy.FP32,
+        precision_policy=_enums.PrecisionPolicy.FP32,
         truncate_long_and_double=True,
         soc_version="Ascend310P3",
     )
-    compiled_decoder.save("./compiled_decoder.ts")
+    compiled_decoder.save("compiled_decoder.ts")
 except Exception as e:
-    print("During the compilation of decoder model, an error has occurred.")
-    sys.exit(1)
\ No newline at end of file
+    print(f"During the compilation of decoder model, an error has occurred: {e}")
+
+    import sys
+    sys.exit(1)
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/encoder_compile.py b/AscendIE/TorchAIE/built-in/audio/Conformer/encoder_compile.py
index 2fc240be20..d8fe2a782c 100644
--- a/AscendIE/TorchAIE/built-in/audio/Conformer/encoder_compile.py
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/encoder_compile.py
@@ -2,12 +2,11 @@ import sys
 
 import torch
 import mindietorch
-from mindietorch import _enums
 
 ENCODER_X_SHAPE = (1, 100, 80)
 ENCODER_X_LENS_SHAPE = (1, )
 
-inputs = [mindietorch.Input(ENCODER_X_SHAPE, dtype=torch.float32), torch_aie.Input(ENCODER_X_LENS_SHAPE, dtype=torch.int64)]
+inputs = [mindietorch.Input(ENCODER_X_SHAPE, dtype=torch.float32), mindietorch.Input(ENCODER_X_LENS_SHAPE, dtype=torch.int64)]
 
 encoder_ts_model = torch.jit.load('./exp/exported_encoder-epoch-99-avg-1.ts')
 encoder_ts_model.eval()
@@ -16,10 +15,9 @@ try:
     compiled_encoder_model = mindietorch.compile(
         encoder_ts_model,
         inputs=inputs,
-        precision_policy=_enums.PrecisionPolicy.FP32,
+        precision_policy=mindietorch.PrecisionPolicy.FP32,
         truncate_long_and_double=True,
-        soc_version="Ascend310P3",
-        optimization_level=1
+        soc_version="Ascend310P3"
     )
     compiled_encoder_model.save("./compiled_encoder.ts")
 except Exception as e:
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/export_torchscript.py b/AscendIE/TorchAIE/built-in/audio/Conformer/export_torchscript.py
index ebb2ca4ac7..ab9ec0ced3 100644
--- a/AscendIE/TorchAIE/built-in/audio/Conformer/export_torchscript.py
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/export_torchscript.py
@@ -196,12 +196,13 @@ class OnnxEncoder(nn.Module):
             - encoder_out, A 3-D tensor of shape (N, T', joiner_dim)
             - encoder_out_lens, A 1-D tensor of shape (N,)
         """
-        encoder_out, encoder_out_lens = self.encoder(x, x_lens)
+        # encoder_out, encoder_out_lens = self.encoder(x, x_lens)
+        encoder_out = self.encoder(x, x_lens)
 
         encoder_out = self.encoder_proj(encoder_out)
         # Now encoder_out is of shape (N, T, joiner_dim)
 
-        return encoder_out, encoder_out_lens
+        return encoder_out#, encoder_out_lens
 
 
 class OnnxDecoder(nn.Module):
@@ -257,7 +258,7 @@ class OnnxJoiner(nn.Module):
 def export_encoder_torchscript(
     encoder_model: OnnxEncoder,
     encoder_filename: str,
-
+    # opset_version: int = 11,
 ) -> None:
     """Export the given encoder model to ONNX format.
     The exported model has two inputs:
@@ -281,11 +282,13 @@ def export_encoder_torchscript(
     x = torch.rand(1, 100, 80, dtype=torch.float32)
     x_lens = torch.tensor([100], dtype=torch.int64)
     encoder_ts = torch.jit.trace(encoder_model, (x, x_lens))
+    # encoder_ts = torch.jit.trace(encoder_model, x)
     torch.jit.save(encoder_ts, encoder_filename)
 
 def export_decoder_torchscript(
     decoder_model: OnnxDecoder,
     decoder_filename: str,
+    # opset_version: int = 11,
 ) -> None:
     """Export the decoder model to ONNX format.
@@ -479,7 +482,8 @@ def main():
     encoder_filename = params.exp_dir / f"exported_encoder-{suffix}.ts"
     export_encoder_torchscript(
         encoder,
-        encoder_filename
+        encoder_filename,
+        # opset_version=opset_version,
     )
     logging.info(f"Exported encoder to {encoder_filename}")
 
@@ -487,7 +491,8 @@ def main():
     decoder_filename = params.exp_dir / f"exported_decoder-{suffix}.ts"
     export_decoder_torchscript(
         decoder,
-        decoder_filename
+        decoder_filename,
+        # opset_version=opset_version,
     )
     logging.info(f"Exported decoder to {decoder_filename}")
 
@@ -495,7 +500,8 @@ def main():
     joiner_filename = params.exp_dir / f"exported_joiner-{suffix}.ts"
     export_joiner_torchscript(
         joiner,
-        joiner_filename
+        joiner_filename,
+        # opset_version=opset_version,
     )
     logging.info(f"Exported joiner to {joiner_filename}")
 
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/perf_test_aie.py b/AscendIE/TorchAIE/built-in/audio/Conformer/perf_test_aie.py
index 20f2aa3680..db384473ca 100644
--- a/AscendIE/TorchAIE/built-in/audio/Conformer/perf_test_aie.py
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/perf_test_aie.py
@@ -5,7 +5,6 @@ import json
 
 import numpy as np
 import torch
 import mindietorch
-# import mindietorch as torch_aie
 from tqdm import tqdm
 # from utils import init_encoder_states, build_encoder_input_output
 
@@ -13,7 +12,7 @@ from tqdm import tqdm
 def test_encoder(aie_path, device_id = 0):
     batch_size = 1
     device = f'npu:{device_id}'
-    stream = torch_aie.npu.Stream(device)
+    stream = mindietorch.npu.Stream(device)
     print("Start loading ts module...")
     ts = torch.jit.load(aie_path)
     print("Ts module loaded.")
@@ -30,7 +29,6 @@ def test_encoder(aie_path, device_id = 0):
 
     # performance test
     num_infer = 100
-    time_cost = []
 
     start = time.time()
     for _ in tqdm(range(num_infer)):
@@ -41,7 +39,7 @@ def test_encoder(aie_path, device_id = 0):
     end = time.time()
 
     print(f"Encoder latency: {(end - start) / num_infer * 1000:.2f} ms")
-    print(f"Encoder throughput: {num_infer * batch_size / (sum(time_cost)):.2f} fps")
+    print(f"Encoder throughput: {num_infer * batch_size / (end - start):.2f} fps")
 
 
 def test_decoder(aie_path, device_id):
@@ -49,7 +47,7 @@ def test_decoder(aie_path, device_id):
     dummy_input = np.ones((batch_size, 2), dtype=np.int64)
 
     device = f'npu:{device_id}'
-    stream = torch_aie.npu.Stream(device)
+    stream = mindietorch.npu.Stream(device)
     print("Start loading ts module...")
     model = torch.jit.load(aie_path)
     print("Ts module loaded.")
@@ -58,7 +56,7 @@ def test_decoder(aie_path, device_id):
 
     # warmup
     for _ in range(10):
-        with torch_aie.npu.stream(stream):
+        with mindietorch.npu.stream(stream):
             model(dummy_input)
         stream.synchronize()
 
@@ -66,7 +64,7 @@ def test_decoder(aie_path, device_id):
     num_infer = 100
     start = time.time()
     for _ in tqdm(range(num_infer)):
-        with torch_aie.npu.stream(stream):
+        with mindietorch.npu.stream(stream):
             model(dummy_input)
         stream.synchronize()
     end = time.time()
@@ -81,7 +79,7 @@ def test_joiner(aie_path, device_id):
     decoder_out = np.ones((batch_size, 512), dtype=np.float32)
 
     device = f'npu:{device_id}'
-    stream = torch_aie.npu.Stream(device)
+    stream = mindietorch.npu.Stream(device)
     model = torch.jit.load(aie_path)
     model.eval()
     encoder_out = torch.from_numpy(encoder_out).to(device)
@@ -89,7 +87,7 @@ def test_joiner(aie_path, device_id):
 
     # warmup
     for _ in range(10):
-        with torch_aie.npu.stream(stream):
+        with mindietorch.npu.stream(stream):
             out = model(encoder_out, decoder_out)
         stream.synchronize()
 
@@ -97,7 +95,7 @@ def test_joiner(aie_path, device_id):
     num_infer = 100
     start = time.time()
     for _ in range(num_infer):
-        with torch_aie.npu.stream(stream):
+        with mindietorch.npu.stream(stream):
             model(encoder_out, decoder_out)
         stream.synchronize()
     end = time.time()
@@ -117,7 +115,7 @@ def parse_args():
 
 
 def main():
-    torch_aie.set_device(0)
+    mindietorch.set_device(0)
     args = parse_args()
 
 
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md b/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md
index 4b38bc7bf1..4e443bc893 100644
--- a/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md
@@ -12,7 +12,7 @@
 
 # Overview
 
-(From the paper abstract) The Conformer has become the most popular encoder model for automatic speech recognition (ASR). It adds convolution modules to a transformer to learn both local and global dependencies. In this work we describe a faster, more memory-efficient, and better-performing transformer called Zipformer. The modeling changes include: 1) a U-Net-like encoder structure in which the middle stacks run at lower frame rates; 2) a reorganized block structure with more modules, in which attention weights are reused for efficiency; 3) BiasNorm, a modified form of LayerNorm that allows retaining some length information; 4) the new activation functions SwooshR and SwooshL, which work better than Swish. We also propose a new optimizer called ScaledAdam, which scales each update by the current scale of the tensor to keep the relative change roughly constant, and which also explicitly learns the parameter scale; it converges faster and performs better than Adam. Extensive experiments on the LibriSpeech, Aishell-1 and WenetSpeech datasets demonstrate the effectiveness of the proposed Zipformer over other state-of-the-art ASR models.
+The Conformer is a hybrid neural-network architecture designed for sequence-to-sequence tasks such as automatic speech recognition (ASR). It combines the strengths of convolutional neural networks (CNNs) and the self-attention mechanism of the Transformer, aiming to capture both the local features and the global dependencies of sequence data. By combining these two approaches in one architecture, the Conformer handles the complexity of time-series data such as speech waveforms effectively and achieves excellent performance on many tasks. In short, by integrating the strong feature-extraction capability of CNNs with the efficient sequence modeling of the Transformer, the Conformer provides a powerful solution for sequence-analysis tasks.
 
 
 # Inference Environment \[All Versions\]
@@ -82,12 +82,18 @@ Download /exp/pretrained_epoch_9_avg_1.pt and the whole /data folder from the HuggingFace link below:
 https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless5_offline/tree/main
 
 ```
 wenetspeech/asr/
   --data/
     --lang_char/
-      --Linv.pt
+      --Linv.pt and other files
   --exp/
+  -- conformer_py.patch
 ```
 
 1. Export the TorchScript model
 
+1.1 Patch the original model
+
+Run the following command
+```shell
+patch ./pruned_transducer_stateless5/conformer.py conformer_py.patch
+```
 ```shell
 # Put perf_test_aie.py under the ASR directory, then run
 python3 ./pruned_transducer_stateless5/export_torchscript.py \
-- 
Gitee

From 063f771c9a358bae22799aab5654cf3ac4bdc5ee Mon Sep 17 00:00:00 2001
From: Lanxi
Date: Sun, 24 Mar 2024 20:27:40 +0800
Subject: [PATCH 06/11] now compiled to fp16; Readme updated.

---
 .../TorchAIE/built-in/audio/Conformer/decoder_compile.py | 2 +-
 .../TorchAIE/built-in/audio/Conformer/encoder_compile.py | 4 +++-
 .../TorchAIE/built-in/audio/Conformer/joiner_compile.py  | 3 ++-
 AscendIE/TorchAIE/built-in/audio/Conformer/readme.md     | 7 +++++++
 4 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/decoder_compile.py b/AscendIE/TorchAIE/built-in/audio/Conformer/decoder_compile.py
index 7fee3116a6..ad68d00270 100644
--- a/AscendIE/TorchAIE/built-in/audio/Conformer/decoder_compile.py
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/decoder_compile.py
@@ -13,7 +13,7 @@ try:
     compiled_decoder = mindietorch.compile(
         decoder_ts_model,
         inputs=inputs,
-        precision_policy=_enums.PrecisionPolicy.FP32,
+        precision_policy=_enums.PrecisionPolicy.FP16,
         truncate_long_and_double=True,
         soc_version="Ascend310P3",
     )
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/encoder_compile.py b/AscendIE/TorchAIE/built-in/audio/Conformer/encoder_compile.py
index d8fe2a782c..b01db8a551 100644
--- a/AscendIE/TorchAIE/built-in/audio/Conformer/encoder_compile.py
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/encoder_compile.py
@@ -11,11 +11,13 @@ inputs = [mindietorch.Input(ENCODER_X_SHAPE, dtype=torch.float32), mindietorch.Input(ENCODER_X_LENS_SHAPE, dtype=torch.int64)]
 
 encoder_ts_model = torch.jit.load('./exp/exported_encoder-epoch-99-avg-1.ts')
 encoder_ts_model.eval()
 
+mindietorch.set_device(0)
+
 try:
     compiled_encoder_model = mindietorch.compile(
         encoder_ts_model,
         inputs=inputs,
-        precision_policy=mindietorch.PrecisionPolicy.FP32,
+        precision_policy=mindietorch.PrecisionPolicy.FP16,
         truncate_long_and_double=True,
         soc_version="Ascend310P3"
     )
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/joiner_compile.py b/AscendIE/TorchAIE/built-in/audio/Conformer/joiner_compile.py
index f0206fb5db..a0ab0b7877 100644
--- a/AscendIE/TorchAIE/built-in/audio/Conformer/joiner_compile.py
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/joiner_compile.py
@@ -7,12 +7,13 @@ inputs = [mindietorch.Input(JOINER_X_SHAPE, dtype=torch.float32), mindietorch.Input(JOINER_X_SHAPE, dtype=torch.float32)]
 
 joiner_ts_model = torch.jit.load('./exp/exported_joiner-epoch-99-avg-1.ts')
 joiner_ts_model.eval()
+mindietorch.set_device(0)
 
 try:
     compiled_joiner = mindietorch.compile(
         joiner_ts_model,
         inputs=inputs,
-        precision_policy=mindietorch.PrecisionPolicy.FP32,
+        precision_policy=mindietorch.PrecisionPolicy.FP16,
         truncate_long_and_double=True,
         soc_version="Ascend310P3",
     )
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md b/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md
index 4e443bc893..35c8f5adc3 100644
--- a/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/readme.md
@@ -130,3 +130,10 @@ python perf_test_aie.py \
 --device_id 0
 ```
 Performance numbers (in FPS) will be printed on the screen.
+
+### Performance (latency / throughput)
+| Model   | MindIE Torch         | T4                   | A10                |
+|---------|----------------------|----------------------|--------------------|
+| encoder | 37.31 ms / 26.81 FPS | 20.53 ms / 48.70 FPS | 16.4 ms / 60.9 FPS |
+| decoder | 0.22 ms / 4470 FPS   | 0.13 ms / 7443 FPS   | 0.12 ms / 8333 FPS |
+| joiner  | 0.20 ms / 4913 FPS   | 0.13 ms / 7612 FPS   | 0.11 ms / 9212 FPS |
\ No newline at end of file
-- 
Gitee
+2.2 转为MindIETorch模型 + 将encoder_compile.py decoder_compile.py joiner_compile.py放在ASR/pruned_transducer_stateless5/目录下,分别执行 ```python #在ASR目录下执行 @@ -128,7 +131,24 @@ python ./pruned_transducer_stateless5/decoder_compile.py python ./pruned_transducer_stateless5/joiner_compile.py ``` 会在ASR目录下生成compiled_encoder.ts compiled_decoder.ts compiled_joiner.ts 三个文件。 -- 性能验证 + +### 精度验证 + +encoder 模型精度验证,屏幕显示Precision test passed 为精度正常。 +```shell + python test_precision.py encoder compiled_encoder.ts +``` +decoder 模型精度验证,屏幕显示Precision test passed 为精度正常。 +```shell +python test_precision.py decoder compiled_decoder.ts +``` + +joiner模型精度验证,屏幕显示Precision test passed 为精度正常。 +```shell +python test_precision.py joiner compiled_joiner.ts +``` + +### 性能验证 ```shell #将perf_test_aie.py放在ASR路径下,然后执行 python perf_test_aie.py \ @@ -139,6 +159,7 @@ python perf_test_aie.py \ ``` 屏幕上会打印性能数据,以FPS记 + ### 性能数据 (时延/吞吐率) |Model| MindIE Torch | T4| A10| |------| ----------------- |------| --------| diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/test_precision.py b/AscendIE/TorchAIE/built-in/audio/Conformer/test_precision.py new file mode 100644 index 0000000000..3a1e5e5503 --- /dev/null +++ b/AscendIE/TorchAIE/built-in/audio/Conformer/test_precision.py @@ -0,0 +1,80 @@ +import sys +from pruned_transducer_stateless5.onnx_pretrained import OnnxModel + +import numpy as np +import torch +import mindietorch + +from torch.nn.functional import cosine_similarity + +# Initialize the ONNX model globally +onnxmodel = OnnxModel("./exp/encoder-epoch-99-avg-1.onnx", "./exp/decoder-epoch-99-avg-1.onnx", "./exp/joiner-epoch-99-avg-1.onnx") + +def is_close_to_ones(x1, atol): + x2 = torch.ones_like(x1) + return torch.allclose(x1, x2, atol) + +def precision_test(ts_output, onnx_output, atol=1e-02): + result = is_close_to_ones(cosine_similarity(ts_output, onnx_output), atol) + print("Precision test" + "passed" if result else "failed") + + +def run_ts_inference(ts_path, dummpy_input, device_id): + batch_size = 1 + device = f'npu:{device_id}' + stream = mindietorch.npu.Stream(device) + model = torch.jit.load(ts_path) + model.eval() + + with mindietorch.npu.stream(stream): + ts_out = model(*dummpy_input) + stream.synchronize() + return ts_out + + +def evaluate_model(mode, ts_path, device_id): + print(f"Evaluating precision of {mode} model") + if mode == 'encoder': + #dummy inputs + x, x_lens = np.random.rand(1, 100, 80).astype(np.float32), np.array([100]) + x_tensor, x_lens_tensor = torch.from_numpy(x), torch.from_numpy(x_lens) + x_npu_tensor, x_lens_npu_tensor = x_tensor.to(f"npu:{device_id}"), x_lens_tensor.to(f"npu:{device_id}") + + #gpu/npu inference + ts_out = run_ts_inference(ts_path, (x_npu_tensor, x_lens_npu_tensor), device_id) + onnx_output, _ = onnxmodel.run_encoder(x_tensor, x_lens_tensor) + + elif mode == 'decoder': + y = np.random.randint(0, 10, size=(1, 2)).astype(np.int64) + y_tensor = torch.from_numpy(y) + y_npu_tensor = y_tensor.to(f'npu:{device_id}') + + ts_out = run_ts_inference(ts_path, (y_npu_tensor, ), device_id) + onnx_output = onnxmodel.run_decoder(y_tensor) + + elif mode == 'joiner': + enc, dec = np.random.rand(1, 512).astype(np.float32), np.random.rand(1, 512).astype(np.float32) + enc_tensor, dec_tensor = torch.from_numpy(enc), torch.from_numpy(dec) + enc_npu_tensor, dec_npu_tensor = enc_tensor.to(f'npu:{device_id}'), dec_tensor.to(f'npu:{device_id}') + + ts_out = run_ts_inference(ts_path, (enc_npu_tensor, dec_npu_tensor), device_id) + onnx_output = onnxmodel.run_joiner(enc_tensor, dec_tensor) + + else: + raise 
diff --git a/AscendIE/TorchAIE/built-in/audio/Conformer/test_precision.py b/AscendIE/TorchAIE/built-in/audio/Conformer/test_precision.py
new file mode 100644
index 0000000000..3a1e5e5503
--- /dev/null
+++ b/AscendIE/TorchAIE/built-in/audio/Conformer/test_precision.py
@@ -0,0 +1,80 @@
+import sys
+from pruned_transducer_stateless5.onnx_pretrained import OnnxModel
+
+import numpy as np
+import torch
+import mindietorch
+
+from torch.nn.functional import cosine_similarity
+
+# Initialize the ONNX reference model globally
+onnxmodel = OnnxModel("./exp/encoder-epoch-99-avg-1.onnx", "./exp/decoder-epoch-99-avg-1.onnx", "./exp/joiner-epoch-99-avg-1.onnx")
+
+def is_close_to_ones(x1, atol):
+    # All cosine similarities should be within atol of 1.0
+    x2 = torch.ones_like(x1)
+    return torch.allclose(x1, x2, atol=atol)
+
+def precision_test(ts_output, onnx_output, atol=1e-02):
+    result = is_close_to_ones(cosine_similarity(ts_output, onnx_output), atol)
+    print("Precision test " + ("passed" if result else "failed"))
+
+
+def run_ts_inference(ts_path, dummy_input, device_id):
+    batch_size = 1
+    device = f'npu:{device_id}'
+    stream = mindietorch.npu.Stream(device)
+    model = torch.jit.load(ts_path)
+    model.eval()
+
+    with mindietorch.npu.stream(stream):
+        ts_out = model(*dummy_input)
+    stream.synchronize()
+    return ts_out
+
+
+def evaluate_model(mode, ts_path, device_id):
+    print(f"Evaluating precision of {mode} model")
+    if mode == 'encoder':
+        # dummy inputs
+        x, x_lens = np.random.rand(1, 100, 80).astype(np.float32), np.array([100])
+        x_tensor, x_lens_tensor = torch.from_numpy(x), torch.from_numpy(x_lens)
+        x_npu_tensor, x_lens_npu_tensor = x_tensor.to(f"npu:{device_id}"), x_lens_tensor.to(f"npu:{device_id}")
+
+        # npu inference vs. onnx reference
+        ts_out = run_ts_inference(ts_path, (x_npu_tensor, x_lens_npu_tensor), device_id)
+        onnx_output, _ = onnxmodel.run_encoder(x_tensor, x_lens_tensor)
+
+    elif mode == 'decoder':
+        y = np.random.randint(0, 10, size=(1, 2)).astype(np.int64)
+        y_tensor = torch.from_numpy(y)
+        y_npu_tensor = y_tensor.to(f'npu:{device_id}')
+
+        ts_out = run_ts_inference(ts_path, (y_npu_tensor, ), device_id)
+        onnx_output = onnxmodel.run_decoder(y_tensor)
+
+    elif mode == 'joiner':
+        enc, dec = np.random.rand(1, 512).astype(np.float32), np.random.rand(1, 512).astype(np.float32)
+        enc_tensor, dec_tensor = torch.from_numpy(enc), torch.from_numpy(dec)
+        enc_npu_tensor, dec_npu_tensor = enc_tensor.to(f'npu:{device_id}'), dec_tensor.to(f'npu:{device_id}')
+
+        ts_out = run_ts_inference(ts_path, (enc_npu_tensor, dec_npu_tensor), device_id)
+        onnx_output = onnxmodel.run_joiner(enc_tensor, dec_tensor)
+
+    else:
+        raise ValueError("Invalid mode")
+
+    ts_out = ts_out.to("cpu")
+
+    precision_test(ts_out, onnx_output, atol=1e-02)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print("Usage: python test_precision.py <encoder|decoder|joiner> <compiled_ts_path>")
+        sys.exit(1)
+    mindietorch.set_device(0)
+    evaluate_model(sys.argv[1], sys.argv[2], device_id=0)