From 085f095a8aace29a0727a55afc3eb3ff71084607 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8D=8A=E5=9F=8E=E7=83=9F=E6=B2=99?= <602123157@qq.com> Date: Mon, 13 Oct 2025 11:38:42 +0800 Subject: [PATCH] SenseVoice --- .../contrib/audio/SenseVoice/README.md | 130 ++++++++++++ .../contrib/audio/SenseVoice/diif.patch | 13 ++ .../contrib/audio/SenseVoice/export_onnx.py | 9 + .../contrib/audio/SenseVoice/om_infer.py | 185 ++++++++++++++++++ .../contrib/audio/SenseVoice/requirements.txt | 6 + 5 files changed, 343 insertions(+) create mode 100644 ACL_PyTorch/contrib/audio/SenseVoice/README.md create mode 100644 ACL_PyTorch/contrib/audio/SenseVoice/diif.patch create mode 100644 ACL_PyTorch/contrib/audio/SenseVoice/export_onnx.py create mode 100644 ACL_PyTorch/contrib/audio/SenseVoice/om_infer.py create mode 100644 ACL_PyTorch/contrib/audio/SenseVoice/requirements.txt diff --git a/ACL_PyTorch/contrib/audio/SenseVoice/README.md b/ACL_PyTorch/contrib/audio/SenseVoice/README.md new file mode 100644 index 0000000000..8280a5ef70 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/SenseVoice/README.md @@ -0,0 +1,130 @@ +# SenseVoice + Vad 推理指导 + +- [概述](#概述) +- [推理环境准备](#推理环境准备) +- [快速上手](#快速上手) + - [获取源码](#获取源码) + - [模型推理](#模型推理) + - [模型转换](#模型转换) + - [开始推理验证](#开始推理验证) + +# 概述 + +本文档参考[SenseVoice(ONNX)-推理指导](../../../built-in/audio/SenseVoice/README_onnx.md),新增vad语音端点检测模型,用于检测音频中有效的语音片段,并支持输出timestamp(每个识别词对应音频中的时间) + +- 版本说明: + + ``` + url=https://github.com/modelscope/FunASR + commit_id=c4ac64fd5d24bb3fc8ccc441d36a07c83c8b9015 + ``` + +# 推理环境准备 + +**表 1** 版本配套表 + +| 配套 | 版本 | 环境准备指导 | +| ----------------------------------------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------- | +| 固件与驱动 | 25.0.RC1 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | +| CANN | 8.2.RC2 | - | +| Python | 3.11.10 | - | +| PyTorch | 2.1.0 | - | +| 说明:Atlas 800I 
A2/Atlas 300I Pro 推理卡请以CANN版本选择实际固件与驱动版本。 | \ | \ | + +# 快速上手 + +## 获取源码 + +1. 获取本仓源码 + +```shell +git clone https://gitee.com/ascend/ModelZoo-PyTorch.git +cd ModelZoo-PyTorch/ACL_PyTorch/contrib/audio/SenseVoice +``` + +2. 安装依赖 + +```shell +pip3 install -r requirements.txt +``` + +3. 获取 `Pytorch`源码 + +```shell +git clone https://github.com/modelscope/FunASR.git +cd FunASR +git reset c4ac64fd5d24bb3fc8ccc441d36a07c83c8b9015 --hard +git apply ../diif.patch +cp ../export_onnx.py ./ +cp ../om_infer.py ./ +``` + +4. 安装aisbench工具 + 参考[aisbench](https://gitee.com/ascend/tools/tree/master/ais-bench_workload/tool/ais_bench)安装aisbench工具 +5. 获取权重 + ++ [SenseVoiceSmall](https://modelscope.cn/models/iic/SenseVoiceSmall/files) ++ [speech_fsmn_vad_zh-cn-16k-common-pytorch](https://modelscope.cn/models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch) + +## 模型推理 + +### 模型转换 + +1. 导出onnx模型 + +```shell +python3 export_onnx.py --model /path/to/SenseVoiceSmall +``` + ++ 参数说明 ++ --model SenseVoiceSmall模型路径 + +脚本运行后会在权重目录下生成model.onnx文件 + +1. 修改onnx模型 + +``` +cp ./../../built-in/audio/SenseVoice/modify_onnx.py . +python3 modify_onnx.py --input_path=./SenseVoiceSmall/model.onnx --save_path=./SenseVoiceSmall/model_md.onnx +``` + +修改原始onnx模型。删除多余的domain,生成新的model_md.onnx模型 + +1.
使用 `ATC`工具将 `ONNX`模型转为 `OM`模型 + +配置环境变量 + +``` +source /usr/local/Ascend/ascend-toolkit/set_env.sh +``` + +执行ATC命令,利用npu-smi info命令获取芯片型号,填入soc_version参数中 + +``` +atc --framework=5 --soc_version=Ascend${soc_version} --model ./SenseVoiceSmall/model_md.onnx --output SenseVoice --input_shape="speech:1,-1,560;speech_lengths:1;language:1;textnorm:1" +``` + +在当前目录下生成动态模型SenseVoice_{arch}.om + +### 开始推理验证 + +执行推理命令 + +``` +python3 om_infer.py \ +--vad_path speech_fsmn_vad_zh-cn-16k-common-pytorch \ +--model_path SenseVoiceSmall \ +--om_path SenseVoice_{arch}.om \ +--device 0 \ +--input "./SenseVoiceSmall/example/zh.mp3" +``` + +- 参数说明 +- vad_path: vad模型权重路径 +- model_path: SenseVoice模型权重路径 +- om_path: om模型路径 +- device: npu芯片id,默认使用0卡 +- input: 输入mp3格式语音文件,这里以权重文件内的样例为例 + +推理执行完成后,会打屏语音文本的输出,和单次推理的耗时 + diff --git a/ACL_PyTorch/contrib/audio/SenseVoice/diif.patch b/ACL_PyTorch/contrib/audio/SenseVoice/diif.patch new file mode 100644 index 0000000000..932d892125 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/SenseVoice/diif.patch @@ -0,0 +1,13 @@ +diff --git a/funasr/models/sense_voice/export_meta.py b/funasr/models/sense_voice/export_meta.py +index 449388ef..19b9faf2 100644 +--- a/funasr/models/sense_voice/export_meta.py ++++ b/funasr/models/sense_voice/export_meta.py +@@ -48,7 +48,7 @@ def export_forward( + + ctc_logits = self.ctc.ctc_lo(encoder_out) + +- return ctc_logits, encoder_out_lens ++ return ctc_logits, encoder_out_lens, encoder_out + + def export_dummy_inputs(self): + speech = torch.randn(2, 30, 560) diff --git a/ACL_PyTorch/contrib/audio/SenseVoice/export_onnx.py b/ACL_PyTorch/contrib/audio/SenseVoice/export_onnx.py new file mode 100644 index 0000000000..9892eb8c18 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/SenseVoice/export_onnx.py @@ -0,0 +1,9 @@ +import argparse +from funasr import AutoModel +parser = argparse.ArgumentParser() +parser.add_argument("--model", type=str, help="model path") +args = parser.parse_args() +model = AutoModel(model=args.model,
device="cpu") + +res = model.export(quantize=False) +print(res) \ No newline at end of file diff --git a/ACL_PyTorch/contrib/audio/SenseVoice/om_infer.py b/ACL_PyTorch/contrib/audio/SenseVoice/om_infer.py new file mode 100644 index 0000000000..d5ff84dca9 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/SenseVoice/om_infer.py @@ -0,0 +1,185 @@ +import argparse +import time +import numpy as np +import torch +import torch_npu +from torch_npu.contrib import transfer_to_npu + +from ais_bench.infer.interface import InferSession +from funasr import AutoModel +from funasr.utils.postprocess_utils import rich_transcription_postprocess +from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank +from funasr.models.ctc.ctc import CTC +from funasr.models.sense_voice.utils.ctc_alignment import ctc_forced_align + + +class SenseVoiceOnnxModel(): + def __init__(self, device_id, om_path, **kwargs): + super().__init__() + vocab_size = kwargs.get("vocab_size", -1) + encoder_output_size = kwargs.get("encoder_conf", {}).get("output_size", 256) + self.blank_id = 0 + self.lid_dict = {"auto": 0, "zh": 3, "en": 4, "yue": 7, "ja": 11, "ko": 12, "nospeech": 13} + self.textnorm_dict = {'withitn': 14, "woitn": 15} + self.om_sess = InferSession(device_id, om_path) + self.ignore_id = -1 + ctc_conf = None + if ctc_conf is None: + ctc_conf = {} + self.ctc = CTC(odim=vocab_size, encoder_output_size=encoder_output_size, **ctc_conf) + self.ctc.ctc_lo = self.ctc.ctc_lo.to(device=f"npu:{device_id}") + + + def post(self, timestamp): + timestamp_new = [] + prev_word = None + for i, t in enumerate(timestamp): + word, start, end = t + start = int(start * 1000) + end = int(end * 1000) + if word == "▁": + continue + if i == 0: + timestamp_new.append([start, end, word]) + elif word.startswith("▁"): + word = word[1:] + timestamp_new.append([start, end, word]) + elif prev_word is not None and prev_word.isalpha() and prev_word.isascii() and word.isalpha() and word.isascii(): + prev_word += word + 
timestamp_new[-1][1] = end + timestamp_new[-1][2] += word + else: + timestamp_new.append([start, end, word]) + prev_word = word + return timestamp_new + + + def infer_onnx( + self, + feed, + vad_res_list, + tokenizer=None, + frontend=None, + output_timestamp=False, + **kwargs + ): + custom_sizes = (feed[0].shape[1] + 4) * 4 * 25056 + ctc_logits, encoder_out_lens, encoder_out = self.om_sess.infer(feed, mode='dymshape', custom_sizes=custom_sizes) + ctc_logits = torch.from_numpy(ctc_logits).to(device=kwargs["device"]) + encoder_out_lens = torch.from_numpy(encoder_out_lens).to(device=kwargs["device"]) + encoder_out = torch.from_numpy(encoder_out).to(device=kwargs["device"]) + x = ctc_logits[0, : encoder_out_lens[0].item(), :] + yseq = x.argmax(dim=-1) + yseq = torch.unique_consecutive(yseq, dim=-1) + mask = yseq != self.blank_id + token_int = yseq[mask].tolist() + text = tokenizer.decode(token_int) + if not output_timestamp: + return {'text': text} + from itertools import groupby + timestamp = [] + tokens = tokenizer.text2tokens(text)[4:] + token_back_to_id = tokenizer.tokens2ids(tokens) + token_ids = [] + for tok_ls in token_back_to_id: + if tok_ls: + token_ids.extend(tok_ls) + else: + token_ids.append(124) + if len(token_ids) == 0: + return {'text': text} + logits_speech = self.ctc.softmax(encoder_out)[0, 4: encoder_out_lens[0].item(), :] + pred = logits_speech.argmax(-1).cpu() + logits_speech[pred == self.blank_id, self.blank_id] = 0 + align = ctc_forced_align( + logits_speech.unsqueeze(0).float().cpu(), + torch.Tensor(token_ids).unsqueeze(0).long(), + (encoder_out_lens[0] - 4).long().cpu(), + torch.tensor(len(token_ids)).unsqueeze(0).long(), + ignore_id=self.ignore_id, + ) + pred = groupby(align[0, : encoder_out_lens[0]]) + _start = (vad_res_list[0] + 30) / 60 + token_id = 0 + ts_max = (vad_res_list[1] + 30) / 60 + for pred_token, pred_frame in pred: + _end = _start + len(list(pred_frame)) + if pred_token != 0: + ts_left = max((_start * 60 - 30) / 1000, 0) + 
ts_right = min((_end * 60 - 30) / 1000, (ts_max * 60 - 30) / 1000) + timestamp.append([tokens[token_id], ts_left, ts_right]) + token_id += 1 + _start = _end + timestamp = self.post(timestamp) + return {'text': text, 'timestamp': timestamp} + + +def infer(vad_model, sense_model, data_in, kwargs, vad_kwargs): + start_time = time.time() + lid_dict = {"auto": 0, "zh": 3, "en": 4, "yue": 7, "ja": 11, "ko": 12, "nospeech": 13} + textnorm_dict = {'withitn': 14, "woitn": 15} + frontend = kwargs.get("frontend", None) + tokenizer = kwargs.get("tokenizer", None) + audio_sample_list = load_audio_text_image_video( + data_in, + fs=frontend.fs, + audio_fs=kwargs.get("fs", 16000), + data_type=kwargs.get("data_type", "sound"), + tokenizer=tokenizer + ) + speech_list = [] + speech_lengths_list = [] + results, meta_data = vad_model.inference([data_in], key=['test'], **vad_kwargs) + vad_res_list = results[0]['value'] + for start, end in vad_res_list: + bed_idx = int(start * 16) + end_idx = min(int(end * 16), len(audio_sample_list)) + sub_audio_sample_list = audio_sample_list[bed_idx:end_idx] + speech, speech_lengths = extract_fbank( + sub_audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend + ) + speech = speech.to(device=kwargs["device"]) + speech_lengths = speech_lengths.to(device=kwargs["device"]) + speech_list.append(speech) + speech_lengths_list.append(speech_lengths) + language = kwargs.get("language", "auto") + language = torch.LongTensor([lid_dict[language] if language in lid_dict else 0]).to(device=kwargs["device"]) + use_itn = kwargs.get('use_itn', True) + textnorm = kwargs.get("text_norm", None) + if textnorm is None: + textnorm = "withitn" if use_itn else "woitn" + textnorm = torch.LongTensor([textnorm_dict.get(textnorm, 0)]).to(device=kwargs["device"]) + results = {"key": data_in, "text": "", "timestamp": []} + for speech, speech_lengths, vad_res in zip(speech_list, speech_lengths_list, vad_res_list): + feed = 
[speech.cpu().detach().numpy().astype(np.float32), + speech_lengths.cpu().detach().numpy().astype(np.int32), + language.cpu().detach().numpy().astype(np.int32), + textnorm.cpu().detach().numpy().astype(np.int32)] + result_i = sense_model.infer_onnx( + feed=feed, + vad_res_list=vad_res, + language="auto", + use_itn=False, + ban_emo_unk=False, + **kwargs) + results['text'] += result_i['text'] + results['timestamp'] += result_i['timestamp'] + print("cost time", time.time() - start_time) + return results +torch_npu.npu.set_compile_mode(jit_compile=False) +parser = argparse.ArgumentParser(description="Sensevoice infer") +parser.add_argument('--vad_path', type=str, help='vad path') +parser.add_argument('--model_path', type=str, help='model path') +parser.add_argument('--om_path', type=str, help='om model') +parser.add_argument('--device', type=int, help='device', default=0) +parser.add_argument('--input', type=str, help='input audio file') +args = parser.parse_args() +_, kwargs = AutoModel.build_model(model=args.model_path, trust_remote_code=True) +sense_model = SenseVoiceOnnxModel(args.device, args.om_path, **kwargs) +vad_model, vad_kwargs = AutoModel.build_model(model=args.vad_path, trust_remote_code=True) +with torch.no_grad(): + res = infer(vad_model, sense_model, args.input, kwargs, vad_kwargs) + print(res) + print(rich_transcription_postprocess(res['text'])) + for _ in range(5): + res = infer(vad_model, sense_model, args.input, kwargs, vad_kwargs) \ No newline at end of file diff --git a/ACL_PyTorch/contrib/audio/SenseVoice/requirements.txt b/ACL_PyTorch/contrib/audio/SenseVoice/requirements.txt new file mode 100644 index 0000000000..7728a69829 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/SenseVoice/requirements.txt @@ -0,0 +1,6 @@ +torch==2.1.0 +torchaudio==2.1.0 +funasr>=1.1.13 +numpy<=1.26.4 +gradio +funasr_onnx \ No newline at end of file -- Gitee