diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/README.md b/AscendIE/TorchAIE/built-in/audio/Whisper/README.md
index 8d46c8717576dc0ce91baac18db17712f9cea3d8..39a4069727cc68fa4fdb3a912f9858f822afe8fb 100644
--- a/AscendIE/TorchAIE/built-in/audio/Whisper/README.md
+++ b/AscendIE/TorchAIE/built-in/audio/Whisper/README.md
@@ -17,17 +17,18 @@
# Inference Environment Setup \[All Versions\]
-- This model requires the following plugins and drivers
+- This model requires the following dependencies
**Table 1** Version compatibility
- | Component | Version |
- |---------| ------- |
- | Firmware and drivers | 24.1.rc1 |
- | CANN | 8.0.rc1 |
- | Python | 3.10.13 |
- | PyTorch | 2.1.0 |
- | Ascend-mindie-rt1.0.RC1 | - |
- | Ascend-mindie-torch-1.0.RC1 | - |
+
+| Component                   | Version     |
+|-----------------------------|-------------|
+| CANN                        | 8.0.RC1     |
+| Python                      | 3.10.13     |
+| torch                       | 2.1.0       |
+| Ascend-mindie-rt_1.0.RC1    | -           |
+| Ascend-mindie-torch-1.0.RC1 | -           |
+| Chip type                   | Ascend310P3 |
# Quick Start
@@ -46,14 +47,14 @@
mkdir /tmp/models
whisper zh.wav --model tiny
```
-    The steps above depend on `ffmpeg`; on Ubuntu it can be installed with `apt-get install ffmpeg`. After completing the steps above, six files (`encoder.ts/onnx`, `decoder_prefill.ts/onnx`, `decoder_decode.ts/onnx`) are generated under `/tmp/models`.
+    This step depends on `ffmpeg` (on Ubuntu it can be installed with `apt-get install ffmpeg`). Completing it generates three files (`encoder.ts`, `decoder_prefill.ts`, `decoder_decode.ts`) under `/tmp/models`.
    Note: to change the model path, manually edit `whisper/decoding.py` and `whisper/model.py` after applying the patch; the model load paths used for inference in later steps must be updated accordingly.
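    For reference, the loading logic that the inference patch (`torch_aie_infer.patch`, applied in a later step) adds to `whisper/model.py` looks roughly like the excerpt below (adapted from the patch); the hard-coded string is the path that would have to be edited:
    ```
    self.device = "npu:0"
    # "/tmp/models" is the default export directory used throughout this README
    self.mindie_encoder_model = torch.jit.load(
        "/tmp/models/encoder_compiled.ts"
    ).eval().to(self.device)
    ```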
3. Model compilation
```
python3 compile.py
```
-    After execution completes, four files (`encoder_compiled.ts`, `language_detection_compiled.ts`, `decoder_prefill_compiled.ts`, `decoder_decode_compiled.ts`) are generated under `/tmp/models`.
+    Error messages printed to the command line during this step can be ignored. After execution completes, four files (`encoder_compiled.ts`, `language_detection_compiled.ts`, `decoder_prefill_compiled.ts`, `decoder_decode_compiled.ts`) are generated under `/tmp/models`.
    Parameter description (an example invocation follows):
    - --model_path: path to the exported TorchScript models; the compiled models are saved to the same path. Defaults to `/tmp/models`.
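    For instance, to compile models that were exported to a custom directory (the path below is purely illustrative), the script could be invoked as follows; the hard-coded load paths mentioned in the note under step 2 would need to match:
    ```
    python3 compile.py --model_path /home/user/whisper_models
    ```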
@@ -65,7 +66,7 @@
```
cd whisper
git reset --hard ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
- patch -p1 < ../mindietorch_infer.patch
+ patch -p1 < ../torch_aie_infer.patch
pip3 install .
cd ..
whisper zh.wav --model tiny
@@ -84,78 +85,4 @@
# Model Inference Performance and Accuracy
-1. Accuracy validation
- ```
- python3 precision_test.py
- ```
-
-    Parameter description:
-    - --sim_threshold: cosine similarity threshold, default 0.99.
-    - --ntokens: number of input tokens in the prefill stage and of cached tokens in the decode stage, default 100.
-
-    After execution, the expected output is as follows:
- ```
- === Compare the outputs of ONNX and AIE ===
- Start comparing encoder...
- Number of outputs to compare: 1
- Number of outputs with cosine similarity > 0.99: 1
- Number of outputs to compare: 3
- Number of outputs with cosine similarity > 0.99: 3
- Number of outputs to compare: 3
- Number of outputs with cosine similarity > 0.99: 3
- ```
-
-2. Performance validation
-
-    a) AIE model performance test
- ```
- python perf_test_aie.py
- ```
-
-    After execution, the expected output is as follows:
- ```
- Encoder latency: 7.75 ms
- Encoder throughput: 128.97 fps
- Decoder prefill latency: 10.14 ms
- Decoder prefill throughput: 98.63 fps
- Decoder decode latency: 2.92 ms
- Decoder decode throughput: 342.55 fps
- ```
-
-    b) ONNX model performance test
-    (Optional) If using a GPU, make sure CUDA and the GPU build of PyTorch are installed, and also install onnxruntime-gpu as shown below:
- ```shell
- pip uninstall onnxruntime
- pip install onnxruntime-gpu
- ```
-    Verify that onnxruntime-gpu is installed correctly:
- ```python
- import onnxruntime
-    print(onnxruntime.get_device())  # if the output is GPU, the installation succeeded
- ```
-    Run the performance test
- ```
- python perf_test_onnx.py --use_gpu
- ```
-
-    Parameter description:
-    - --use_gpu: enable GPU inference; without this option the CPU is used by default.
-
-    After execution, the expected output is as follows:
- ```
- Encoder latency: 59.49 ms
- Encoder throughput: 16.81 fps
- Decoder prefill latency: 141.14 ms
- Decoder prefill throughput: 7.09 fps
- Decoder decode latency: 36.05 ms
- Decoder decode throughput: 27.74 fps
- ```
-
-
-    | Model | PT plugin - 310P performance (latency / throughput) | T4 performance (latency / throughput) | A10 performance (latency / throughput) |
- |---------|--------------------------------|---------------------|--------------------|
- | encoder | 7.75 ms / 128.97 fps | 9.31 ms / 107.47 fps | 4.21 ms / 237.50 fps |
- | prefill | 10.14 ms / 98.63 fps | 72.08 ms / 13.87 fps | 45.15 ms / 22.15 fps |
- | decode | 2.92 ms / 342.55 fps | 10.46 ms / 95.62 fps | 4.91 ms / 203.61 fps |
-
-    Note: in real inference the encoder and prefill are each called once, while decode is called many times (the figures above assume a cached token length of 100). The end-to-end Whisper pipeline also includes post-processing, cache reordering, and other steps, so the numbers above are for reference only.
\ No newline at end of file
+To be added later.
diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/compile.py b/AscendIE/TorchAIE/built-in/audio/Whisper/compile.py
index b739beb0ce050fcb1d745e1ec941cdbc99edff85..75d3f817a5d9c3d7db476fe2362de72e97bf464c 100644
--- a/AscendIE/TorchAIE/built-in/audio/Whisper/compile.py
+++ b/AscendIE/TorchAIE/built-in/audio/Whisper/compile.py
@@ -34,7 +34,6 @@ def parse_args():
return args
def compile_and_save(ts_model, input_info, soc_version, save_path):
- ts_model.eval()
mindie_model = mindietorch.compile(
ts_model,
inputs=input_info,
@@ -44,6 +43,7 @@ def compile_and_save(ts_model, input_info, soc_version, save_path):
soc_version=soc_version,
optimization_level=0
)
+ mindie_model.eval()
mindie_model.save(save_path)
def encoder(args):
@@ -73,10 +73,7 @@ def prefill(args):
min_shape=[args.beam_size, 1],
max_shape=[args.beam_size, _MAX_TOKEN]
)
- input_audio_features_info = mindietorch.Input(
- min_shape=[1, _HALF_FRAMES, _HIDDEN],
- max_shape=[1, _HALF_FRAMES, _HIDDEN]
- )
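+    # fixed-shape input: a single shape replaces the identical min_shape/max_shape pair removed above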
+ input_audio_features_info = mindietorch.Input([1, _HALF_FRAMES, _HIDDEN])
input_pos_embed_info = mindietorch.Input(
min_shape=[1, _HIDDEN],
max_shape=[_MAX_TOKEN, _HIDDEN]
@@ -92,26 +89,14 @@ def prefill(args):
def decode(args):
ts_model = torch.jit.load(f"{args.model_path}/decoder_decode.ts")
- input_tokens_info = mindietorch.Input(
- min_shape=[args.beam_size, 1],
- max_shape=[args.beam_size, 1]
- )
- input_audio_features_info = mindietorch.Input(
- min_shape=[1, _HALF_FRAMES, _HIDDEN],
- max_shape=[1, _HALF_FRAMES, _HIDDEN]
- )
- input_pos_embed_info = mindietorch.Input(
- min_shape=[_HIDDEN],
- max_shape=[_HIDDEN]
- )
+ input_tokens_info = mindietorch.Input([args.beam_size, 1])
+ input_audio_features_info = mindietorch.Input([1, _HALF_FRAMES, _HIDDEN])
+ input_pos_embed_info = mindietorch.Input([_HIDDEN])
input_cache_dyn_info = mindietorch.Input(
min_shape=(args.nblocks, _KV_NUM, args.beam_size, 1, _HIDDEN),
max_shape=(args.nblocks, _KV_NUM, args.beam_size, _MAX_TOKEN, _HIDDEN)
)
- input_cache_sta_info = mindietorch.Input(
- min_shape=[args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN],
- max_shape=[args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN]
- )
+ input_cache_sta_info = mindietorch.Input([args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN])
input_info = [
input_tokens_info,
diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_aie.py b/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_aie.py
deleted file mode 100644
index 79530d717653c2afc8842328703132d4d8a7be1a..0000000000000000000000000000000000000000
--- a/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_aie.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import time
-import torch
-import mindietorch
-
-_N_MEL = 80
-_FRAMES = 3000
-_HALF_FRAMES = 1500
-_HIDDEN = 384
-_MAX_TOKEN = 224
-_KV_NUM = 2
-
-
-def test(inputs, model, stream, meta=""):
- # warmup
- for _ in range(10):
- with mindietorch.npu.stream(stream):
- model(*inputs)
- stream.synchronize()
-
- # performance test
- num_infer = 100
- start = time.time()
- for _ in range(num_infer):
- with mindietorch.npu.stream(stream):
- model(*inputs)
- stream.synchronize()
- end = time.time()
-
- print(f"{meta} latency: {(end - start) / num_infer * 1000:.2f} ms")
- print(f"{meta} throughput: {num_infer / (end - start):.2f} fps")
-
-
-def test_encoder(args):
- device = f'npu:{args.device_id}'
- stream = mindietorch.npu.Stream(device)
- model = torch.jit.load(args.encoder_aie_path)
- model.eval()
-
- inputs = [
- torch.ones((1, _N_MEL, _FRAMES), dtype=torch.float32).to(device)
- ]
-
- test(inputs, model, stream, "Encoder")
-
-
-def test_decoder_prefill(args):
- device = f'npu:{args.device_id}'
- stream = mindietorch.npu.Stream(device)
- model = torch.jit.load(args.decoder_prefill_aie_path)
- model.eval()
-
- assert args.ntokens <= _MAX_TOKEN, f'ntokens can not exceed {_MAX_TOKEN}'
-
- inputs = [
- torch.ones((args.beam_size, args.ntokens), dtype=torch.float32).to(device),
- torch.ones((1, _HALF_FRAMES, _HIDDEN), dtype=torch.float32).to(device),
- torch.ones((args.ntokens, _HIDDEN), dtype=torch.float32).to(device)
- ]
-
- test(inputs, model, stream, "Decoder prefill")
-
-
-def test_decoder_decode(args):
- device = f'npu:{args.device_id}'
- stream = mindietorch.npu.Stream(device)
- model = torch.jit.load(args.decoder_decode_aie_path)
- model.eval()
-
- inputs = [
- torch.ones((args.beam_size, 1), dtype=torch.float32).to(device),
- torch.ones((1, _HALF_FRAMES, _HIDDEN), dtype=torch.float32).to(device),
- torch.ones((_HIDDEN), dtype=torch.float32).to(device),
- torch.ones((args.nblocks, _KV_NUM, args.beam_size, args.ntokens, _HIDDEN), dtype=torch.float32).to(device),
- torch.ones((args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN), dtype=torch.float32).to(device),
- ]
-
- test(inputs, model, stream, "Decoder decode")
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument(
- "--encoder_aie_path",
- type=str, default="/tmp/models/encoder_compiled.ts"
- )
- parser.add_argument(
- "--decoder_prefill_aie_path",
- type=str, default="/tmp/models/decoder_prefill_compiled.ts"
- )
- parser.add_argument(
- "--decoder_decode_aie_path",
- type=str, default="/tmp/models/decoder_decode_compiled.ts"
- )
- parser.add_argument("--beam_size", type=int, default=5)
- parser.add_argument("--ntokens", type=int, default=100)
- parser.add_argument("--nblocks", type=int, default=4)
- parser.add_argument("--device_id", type=int, help="NPU device id", default=0)
-
- args = parser.parse_args()
- return args
-
-
-def main():
- args = parse_args()
-
- mindietorch.set_device(args.device_id)
-
- for func in test_encoder, test_decoder_prefill, test_decoder_decode:
- func(args)
-
-
-if __name__ == "__main__":
- main()
diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_onnx.py b/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_onnx.py
deleted file mode 100644
index 891d5507e7acd63d9d7ae7a9b37479ab36ec92f4..0000000000000000000000000000000000000000
--- a/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_onnx.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import time
-import onnxruntime as ort
-import numpy as np
-
-_N_MEL = 80
-_FRAMES = 3000
-_MAX_TOKEN = 224
-_HALF_FRAMES = 1500
-_HIDDEN = 384
-_KV_NUM = 2
-
-
-def test(encoder_path, provider, output_names, onnx_inputs, meta=""):
- onnx_model = ort.InferenceSession(
- encoder_path,
- providers=[provider]
- )
-
- # warmup
- for _ in range(10):
- onnx_model.run(output_names, onnx_inputs)
- # performance test
- num_infer = 100
- start = time.time()
- for _ in range(num_infer):
- onnx_model.run(output_names, onnx_inputs)
- end = time.time()
-
- print(f"{meta} latency: {(end - start) / num_infer * 1000:.2f} ms")
- print(f"{meta} throughput: {num_infer / (end - start):.2f} fps")
-
-
-def test_encoder(args, provider):
- x = np.ones((1, _N_MEL, _FRAMES), dtype=np.float16 if args.use_gpu else np.float32)
- onnx_inputs = {'mel': ort.OrtValue.ortvalue_from_numpy(x)}
- output_names = ['ret']
-
- test(args.encoder_onnx_path, provider, output_names, onnx_inputs, "Encoder")
-
-
-def test_decoder_prefill(args, provider):
- assert args.ntokens <= _MAX_TOKEN, f'ntokens can not exceed {_MAX_TOKEN}'
- tokens = np.ones((args.beam_size, args.ntokens), dtype=np.int64)
- audio_features = np.ones((1, _HALF_FRAMES, _HIDDEN), dtype=np.float16 if args.use_gpu else np.float32)
- pos_embed = np.ones((args.ntokens, _HIDDEN), dtype=np.float32)
- onnx_inputs = {
- 'tokens': ort.OrtValue.ortvalue_from_numpy(tokens),
- 'audio_features': ort.OrtValue.ortvalue_from_numpy(audio_features),
- 'pos_embed': ort.OrtValue.ortvalue_from_numpy(pos_embed)
- }
- output_names = ["logits", "cache_dyn", "cache_sta"]
-
- test(args.decoder_prefill_onnx_path, provider, output_names, onnx_inputs, "Decoder prefill")
-
-
-def test_decoder_decode(args, provider):
- assert args.ntokens <= _MAX_TOKEN, f'ntokens can not exceed {_MAX_TOKEN}'
- tokens = np.ones((args.beam_size, 1), dtype=np.int64)
- pos_embed = np.ones((_HIDDEN), dtype=np.float32)
- cache_dyn = np.ones(
- (args.nblocks, _KV_NUM, args.beam_size, args.ntokens, _HIDDEN),
- dtype=np.float16 if args.use_gpu else np.float32
- )
- cache_sta = np.ones(
- (args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN),
- dtype=np.float16 if args.use_gpu else np.float32
- )
- onnx_inputs = {
-        'tokens': ort.OrtValue.ortvalue_from_numpy(tokens),  # audio_features is folded away during ONNX export
- 'pos_embed': ort.OrtValue.ortvalue_from_numpy(pos_embed),
- 'cache_dyn': ort.OrtValue.ortvalue_from_numpy(cache_dyn),
- 'cache_sta': ort.OrtValue.ortvalue_from_numpy(cache_sta)
- }
- output_names = ["logits", "new_cache_dyn", "new_cache_sta"]
-
- test(args.decoder_decode_onnx_path, provider, output_names, onnx_inputs, "Decoder decode")
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument('--encoder_onnx_path',type=str, default='/tmp/models/encoder.onnx')
- parser.add_argument('--decoder_prefill_onnx_path',type=str, default='/tmp/models/decoder_prefill.onnx')
- parser.add_argument('--decoder_decode_onnx_path',type=str, default='/tmp/models/decoder_decode.onnx')
- parser.add_argument("--use_gpu", action="store_true")
- parser.add_argument("--beam_size", type=int, default=5)
- parser.add_argument("--ntokens", type=int, default=100)
- parser.add_argument("--nblocks", type=int, default=4)
-
- args = parser.parse_args()
- return args
-
-
-def main():
- args = parse_args()
- if args.use_gpu:
- provider = "CUDAExecutionProvider"
- else:
- provider = "CPUExecutionProvider"
-
- for func in test_encoder, test_decoder_prefill, test_decoder_decode:
- func(args, provider)
-
-
-if __name__ == "__main__":
- main()
diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/precision_test.py b/AscendIE/TorchAIE/built-in/audio/Whisper/precision_test.py
deleted file mode 100644
index 6df0dfc8aac9e84591f70bbf872a8c29f5d0bca9..0000000000000000000000000000000000000000
--- a/AscendIE/TorchAIE/built-in/audio/Whisper/precision_test.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import torch
-import torch.nn.functional as F
-import onnxruntime as ort
-import numpy as np
-import mindietorch
-
-_N_MEL = 80
-_FRAMES = 3000
-_MAX_TOKEN = 224
-_HALF_FRAMES = 1500
-_HIDDEN = 384
-_KV_NUM = 2
-
-def compare_onnx_aie_output(onnx_out, aie_out, sim_threshold=0.99):
- num_sim = 0
- for i, (a, b) in enumerate(zip(onnx_out, aie_out)):
- a = a.reshape(1, -1).astype(np.float32)
- b = b.reshape(1, -1)
- sim = F.cosine_similarity(torch.from_numpy(a), b, dim=1)
- if sim > sim_threshold:
- num_sim += 1
- else:
- print(f'Output {i} similarity: {sim}')
-
- print(f'Number of outputs to compare: {len(onnx_out)}')
- print(f'Number of outputs with cosine similarity > {sim_threshold}: {num_sim}')
-
-
-def compare_encoder(args):
- device = f'npu:{args.device_id}'
-
- onnx_model = ort.InferenceSession(
- args.encoder_onnx_path,
- providers=["CPUExecutionProvider"]
- )
-
- x = np.ones((1, _N_MEL, _FRAMES), dtype=np.float32)
- onnx_inputs = {'mel': ort.OrtValue.ortvalue_from_numpy(x)}
- output_names = ['ret']
- onnx_out = onnx_model.run(output_names, onnx_inputs)
-
- aie_inputs = [x]
- for i in range(len(aie_inputs)):
- aie_inputs[i] = torch.from_numpy(aie_inputs[i]).to(device)
-
- mindietorch.set_device(args.device_id)
- stream = mindietorch.npu.Stream(device)
- model = torch.jit.load(args.encoder_aie_path)
- model.eval().to(device)
-
- with mindietorch.npu.stream(stream):
- aie_out = model(*aie_inputs)
- stream.synchronize()
-
- if isinstance(aie_out, tuple):
- aie_out = (x.cpu() for x in aie_out)
- else:
- aie_out = aie_out.cpu()
- compare_onnx_aie_output(onnx_out, aie_out, args.sim_threshold)
-
-
-def compare_decoder_prefill(args):
- device = f'npu:{args.device_id}'
-
- onnx_model = ort.InferenceSession(
- args.decoder_prefill_onnx_path,
- providers=["CPUExecutionProvider"]
- )
-
- assert args.ntokens <= _MAX_TOKEN, f'ntokens can not exceed {_MAX_TOKEN}'
- tokens = np.ones((args.beam_size, args.ntokens), dtype=np.int64)
- audio_features = np.ones((1, _HALF_FRAMES, _HIDDEN), dtype=np.float32)
- pos_embed = np.ones((args.ntokens, _HIDDEN), dtype=np.float32)
- onnx_inputs = {
- 'tokens': ort.OrtValue.ortvalue_from_numpy(tokens),
- 'audio_features': ort.OrtValue.ortvalue_from_numpy(audio_features),
- 'pos_embed': ort.OrtValue.ortvalue_from_numpy(pos_embed)
- }
- output_names = ["logits", "cache_dyn", "cache_sta"]
- onnx_out = onnx_model.run(output_names, onnx_inputs)
-
- aie_inputs = [tokens.astype(np.float32), audio_features, pos_embed]
- for i in range(len(aie_inputs)):
- aie_inputs[i] = torch.from_numpy(aie_inputs[i]).to(device)
-
- mindietorch.set_device(args.device_id)
- stream = mindietorch.npu.Stream(device)
- model = torch.jit.load(args.decoder_prefill_aie_path)
- model.eval().to(device)
-
- with mindietorch.npu.stream(stream):
- aie_out = model(*aie_inputs)
- stream.synchronize()
- if isinstance(aie_out, tuple):
- aie_out = (x.cpu() for x in aie_out)
- else:
- aie_out = aie_out.cpu()
- compare_onnx_aie_output(onnx_out, aie_out, args.sim_threshold)
-
-
-def compare_decoder_decode(args):
- device = f'npu:{args.device_id}'
-
- onnx_model = ort.InferenceSession(
- args.decoder_decode_onnx_path,
- providers=["CPUExecutionProvider"]
- )
-
- assert args.ntokens <= _MAX_TOKEN, f'ntokens can not exceed {_MAX_TOKEN}'
- tokens = np.ones((args.beam_size, 1), dtype=np.int64)
- audio_features = np.ones((1, _HALF_FRAMES, _HIDDEN), dtype=np.float32)
- pos_embed = np.ones((_HIDDEN), dtype=np.float32)
- cache_dyn = np.ones((args.nblocks, _KV_NUM, args.beam_size, args.ntokens, _HIDDEN), dtype=np.float32)
- cache_sta = np.ones((args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN), dtype=np.float32)
- onnx_inputs = {
-        'tokens': ort.OrtValue.ortvalue_from_numpy(tokens),  # audio_features is folded away during ONNX export
- 'pos_embed': ort.OrtValue.ortvalue_from_numpy(pos_embed),
- 'cache_dyn': ort.OrtValue.ortvalue_from_numpy(cache_dyn),
- 'cache_sta': ort.OrtValue.ortvalue_from_numpy(cache_sta)
- }
-
- output_names = ["logits", "new_cache_dyn", "new_cache_sta"]
- onnx_out = onnx_model.run(output_names, onnx_inputs)
-
- aie_inputs = [tokens.astype(np.float32), audio_features, pos_embed, cache_dyn, cache_sta]
- for i in range(len(aie_inputs)):
- aie_inputs[i] = torch.from_numpy(aie_inputs[i]).to(device)
-
- mindietorch.set_device(args.device_id)
- stream = mindietorch.npu.Stream(device)
- model = torch.jit.load(args.decoder_decode_aie_path)
- model.eval().to(device)
-
- with mindietorch.npu.stream(stream):
- aie_out = model(*aie_inputs)
- stream.synchronize()
- if isinstance(aie_out, tuple):
- aie_out = (x.cpu() for x in aie_out)
- else:
- aie_out = aie_out.cpu()
- compare_onnx_aie_output(onnx_out, aie_out, args.sim_threshold)
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- # encoder
- parser.add_argument('--encoder_onnx_path',type=str, default='/tmp/models/encoder.onnx')
- parser.add_argument('--encoder_aie_path', type=str, default='/tmp/models/encoder_compiled.ts')
- # decoder_prefill
- parser.add_argument('--decoder_prefill_onnx_path',type=str, default='/tmp/models/decoder_prefill.onnx')
- parser.add_argument('--decoder_prefill_aie_path', type=str, default='/tmp/models/decoder_prefill_compiled.ts')
- # decoder_decode
- parser.add_argument('--decoder_decode_onnx_path',type=str, default='/tmp/models/decoder_decode.onnx')
- parser.add_argument('--decoder_decode_aie_path', type=str, default='/tmp/models/decoder_decode_compiled.ts')
- parser.add_argument('--sim_threshold', type=float, default=0.99)
- parser.add_argument('--device_id', type=int, default=0)
- parser.add_argument("--beam_size", type=int, default=5)
- parser.add_argument("--ntokens", type=int, default=100)
- parser.add_argument("--nblocks", type=int, default=4)
- args = parser.parse_args()
- return args
-
-
-def main():
- args = parse_args()
-
- print('=== Compare the outputs of ONNX and AIE ===')
-
- print('Start comparing encoder...')
- funcs = [compare_encoder, compare_decoder_prefill, compare_decoder_decode]
- for func in funcs:
- func(args)
-
-
-if __name__ == "__main__":
- main()
diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/mindietorch_infer.patch b/AscendIE/TorchAIE/built-in/audio/Whisper/torch_aie_infer.patch
similarity index 92%
rename from AscendIE/TorchAIE/built-in/audio/Whisper/mindietorch_infer.patch
rename to AscendIE/TorchAIE/built-in/audio/Whisper/torch_aie_infer.patch
index fc7f771847486b8542d41a2a54876304c481399e..62b432f05f9e241170508b108d5b77f08517229c 100644
--- a/AscendIE/TorchAIE/built-in/audio/Whisper/mindietorch_infer.patch
+++ b/AscendIE/TorchAIE/built-in/audio/Whisper/torch_aie_infer.patch
@@ -1,5 +1,5 @@
diff --git a/whisper/decoding.py b/whisper/decoding.py
-index 49485d0..4dccc86 100644
+index 49485d0..345dea7 100644
--- a/whisper/decoding.py
+++ b/whisper/decoding.py
@@ -6,6 +6,7 @@ import torch
@@ -83,7 +83,7 @@ index 49485d0..4dccc86 100644
class SequenceRanker:
def rank(
diff --git a/whisper/model.py b/whisper/model.py
-index a678283..c94a024 100644
+index a678283..c2c2278 100644
--- a/whisper/model.py
+++ b/whisper/model.py
@@ -1,12 +1,14 @@
@@ -106,7 +106,7 @@ index a678283..c94a024 100644
)
self.ln_post = LayerNorm(n_state)
+ self.device = "npu:0"
-+ self.mindietorch_encoder_model = torch.jit.load(
++ self.mindie_encoder_model = torch.jit.load(
+ "/tmp/models/encoder_compiled.ts"
+ ).eval().to(self.device)
@@ -128,7 +128,7 @@ index a678283..c94a024 100644
- x = self.ln_post(x)
- return x
+ x = x.to(self.device)
-+ x = self.mindietorch_encoder_model(x)
++ x = self.mindie_encoder_model(x)
+ return x.cpu()
@@ -154,13 +154,13 @@ index a678283..c94a024 100644
- for block in self.blocks:
- x = block(x, xa, mask=self.mask, kv_cache=kv_cache)
+ self.device = "npu:0"
-+ self.mindietorch_language_detection_model = torch.jit.load(
++ self.mindie_language_detection_model = torch.jit.load(
+ "/tmp/models/language_detection_compiled.ts"
+ ).eval().to(self.device)
-+ self.mindietorch_prefill_model = torch.jit.load(
++ self.mindie_prefill_model = torch.jit.load(
+ "/tmp/models/decoder_prefill_compiled.ts"
+ ).eval().to(self.device)
-+ self.mindietorch_decode_model = torch.jit.load(
++ self.mindie_decode_model = torch.jit.load(
+ "/tmp/models/decoder_decode_compiled.ts"
+ ).eval().to(self.device)
@@ -183,13 +183,13 @@ index a678283..c94a024 100644
+ audio_features_npu = xa.to(self.device)
+ pos_embed_npu = pos_embed.to(self.device)
+ if x.shape[0] != 1:
-+ logits, cache_dyn, cache_sta = self.mindietorch_prefill_model(
++ logits, cache_dyn, cache_sta = self.mindie_prefill_model(
+ tokens_npu,
+ audio_features_npu,
+ pos_embed_npu
+ )
+ else:
-+ logits, cache_dyn, cache_sta = self.mindietorch_language_detection_model(
++ logits, cache_dyn, cache_sta = self.mindie_language_detection_model(
+ tokens_npu,
+ audio_features_npu,
+ pos_embed_npu
@@ -201,7 +201,7 @@ index a678283..c94a024 100644
+ audio_features_npu = xa.to(self.device)
+ pos_embed_npu = pos_embed.to(self.device)
+ cache_dyn_npu = cache_dyn.to(self.device)
-+ logits, cache_dyn, _ = self.mindietorch_decode_model(
++ logits, cache_dyn, _ = self.mindie_decode_model(
+ tokens_npu,
+ audio_features_npu,
+ pos_embed_npu,
diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/trace_model.patch b/AscendIE/TorchAIE/built-in/audio/Whisper/trace_model.patch
index a35756ff38f412308096baa0caf2da2880b71613..d9c1ff7e29431f8774be6c91dd89f84bbc473221 100644
--- a/AscendIE/TorchAIE/built-in/audio/Whisper/trace_model.patch
+++ b/AscendIE/TorchAIE/built-in/audio/Whisper/trace_model.patch
@@ -1,8 +1,8 @@
diff --git a/whisper/decoding.py b/whisper/decoding.py
-index 49485d0..495fe45 100644
+index 49485d0..4826389 100644
--- a/whisper/decoding.py
+++ b/whisper/decoding.py
-@@ -2,6 +2,7 @@ from dataclasses import dataclass, field, replace
+@@ -2,10 +2,12 @@ from dataclasses import dataclass, field, replace
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Sequence, Tuple, Union
import numpy as np
@@ -10,22 +10,18 @@ index 49485d0..495fe45 100644
import torch
import torch.nn.functional as F
from torch import Tensor
-@@ -49,12 +50,24 @@ def detect_language(
+ from torch.distributions import Categorical
++import mindietorch
+
+ from .audio import CHUNK_LENGTH
+ from .tokenizer import Tokenizer, get_tokenizer
+@@ -49,12 +51,15 @@ def detect_language(
# skip encoder forward pass if already-encoded audio features were given
if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state):
+ encoder_ts_model = torch.jit.trace(model.encoder, mel)
+ encoder_ts_model.save(
+ "/tmp/models/encoder.ts")
-+ torch.onnx.export(
-+ model.encoder,
-+ (mel),
-+ "/tmp/models/encoder.onnx",
-+ opset_version=11,
-+ input_names=["mel"],
-+ output_names=["ret"]
-+ )
-+
mel = model.encoder(mel)
# forward pass using a single token, startoftranscript
@@ -36,7 +32,7 @@ index 49485d0..495fe45 100644
# collect detected languages; suppress all non-language tokens
mask = torch.ones(logits.shape[-1], dtype=torch.bool)
-@@ -145,36 +158,74 @@ class PyTorchInference(Inference):
+@@ -145,36 +150,48 @@ class PyTorchInference(Inference):
def __init__(self, model: "Whisper", initial_token_length: int):
self.model: "Whisper" = model
self.initial_token_length = initial_token_length
@@ -57,44 +53,18 @@ index 49485d0..495fe45 100644
# only need to use the last token except in the first forward pass
tokens = tokens[:, -1:]
+ pos_embed = self.model.decoder.positional_embedding[self.cache_dyn.shape[3]]
-+ torch.onnx.export(
-+ self.model.decoder,
-+ (tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta),
-+ "/tmp/models/decoder_decode.onnx",
-+ opset_version=11,
-+ input_names=["tokens", "audio_features", "pos_embed", "cache_dyn", "cache_sta"],
-+ output_names=["logits", "new_cache_dyn", "new_cache_sta"],
-+ dynamic_axes={
-+ "cache_dyn": {3: "ntokens"},
-+ "new_cache_dyn": {3: "ntokens"}
-+ }
-+ )
+ decoder_decode_ts_model = torch.jit.trace(
+ self.model.decoder,
+ (tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta)
+ )
+ decoder_decode_ts_model.save(
+ "/tmp/models/decoder_decode.ts")
++ os.sys.exit(0)
+ logits, cache_dyn, _ = self.model.decoder(
+ tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta)
-+ os.sys.exit(0)
+ self.cache_dyn = cache_dyn
+ else:
+ pos_embed = self.model.decoder.positional_embedding[:tokens.shape[-1]]
-+ torch.onnx.export(
-+ self.model.decoder,
-+ (tokens, audio_features, pos_embed),
-+ "/tmp/models/decoder_prefill.onnx",
-+ opset_version=11,
-+ input_names=["tokens", "audio_features", "pos_embed"],
-+ output_names=["logits", "cache_dyn", "cache_sta"],
-+ dynamic_axes={
-+ "tokens": {1: "ntokens"},
-+ "pos_embed": {0: "ntokens"},
-+ "logits": {1: "ntokens"},
-+ "cache_dyn": {3: "ntokens"}
-+ }
-+ )
+ decoder_prefill_ts_model = torch.jit.trace(
+ self.model.decoder,
+ (tokens, audio_features, pos_embed)
@@ -131,10 +101,10 @@ index 49485d0..495fe45 100644
class SequenceRanker:
def rank(
diff --git a/whisper/model.py b/whisper/model.py
-index a678283..2a95e28 100644
+index a678283..a20aaad 100644
--- a/whisper/model.py
+++ b/whisper/model.py
-@@ -1,6 +1,7 @@
+@@ -1,12 +1,14 @@
import base64
import gzip
from dataclasses import dataclass
@@ -142,7 +112,14 @@ index a678283..2a95e28 100644
from typing import Dict, Iterable, Optional
import numpy as np
-@@ -68,6 +69,63 @@ class MultiHeadAttention(nn.Module):
+ import torch
+ import torch.nn.functional as F
+ from torch import Tensor, nn
++import mindietorch
+
+ from .decoding import decode as decode_function
+ from .decoding import detect_language as detect_language_function
+@@ -68,6 +70,63 @@ class MultiHeadAttention(nn.Module):
self.value = Linear(n_state, n_state)
self.out = Linear(n_state, n_state)
@@ -206,7 +183,7 @@ index a678283..2a95e28 100644
def forward(
self,
x: Tensor,
-@@ -126,6 +184,39 @@ class ResidualAttentionBlock(nn.Module):
+@@ -126,6 +185,39 @@ class ResidualAttentionBlock(nn.Module):
)
self.mlp_ln = LayerNorm(n_state)
@@ -246,7 +223,7 @@ index a678283..2a95e28 100644
def forward(
self,
x: Tensor,
-@@ -163,11 +254,10 @@ class AudioEncoder(nn.Module):
+@@ -163,11 +255,10 @@ class AudioEncoder(nn.Module):
x = F.gelu(self.conv2(x))
x = x.permute(0, 2, 1)
@@ -259,7 +236,7 @@ index a678283..2a95e28 100644
x = self.ln_post(x)
return x
-@@ -193,29 +283,56 @@ class TextDecoder(nn.Module):
+@@ -193,29 +284,56 @@ class TextDecoder(nn.Module):
mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1)
self.register_buffer("mask", mask, persistent=False)
@@ -331,7 +308,7 @@ index a678283..2a95e28 100644
class Whisper(nn.Module):
-@@ -257,7 +374,8 @@ class Whisper(nn.Module):
+@@ -257,7 +375,8 @@ class Whisper(nn.Module):
return self.encoder(mel)
def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor):