diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/README.md b/AscendIE/TorchAIE/built-in/audio/Whisper/README.md
index 8d46c8717576dc0ce91baac18db17712f9cea3d8..39a4069727cc68fa4fdb3a912f9858f822afe8fb 100644
--- a/AscendIE/TorchAIE/built-in/audio/Whisper/README.md
+++ b/AscendIE/TorchAIE/built-in/audio/Whisper/README.md
@@ -17,17 +17,18 @@
 # 推理环境准备\[所有版本\]
 
-- 该模型需要以下插件与驱动
+- 该模型需要以下依赖
 
   **表 1** 版本配套表
 
- | 配套 | 版本 |
- |---------| ------- |
- | 固件与驱动 | 24.1.rc1 |
- | CANN | 8.0.rc1 |
- | Python | 3.10.13 |
- | PyTorch | 2.1.0 |
- | Ascend-mindie-rt1.0.RC1 | - |
- | Ascend-mindie-torch-1.0.RC1 | - |
+
+| 配套 | 版本 |
+|-----------------------------|-------------|
+| CANN | 8.0.RC1 |
+| Python | 3.10.13 |
+| torch | 2.1.0 |
+| Ascend-mindie-rt_1.0.RC1 | - |
+| Ascend-mindie-torch-1.0.RC1 | - |
+| 芯片类型 | Ascend310P3 |
 
 # 快速上手
@@ -46,14 +47,14 @@
   mkdir /tmp/models
   whisper zh.wav --model tiny
   ```
-   执行上述步骤需要依赖`ffmpeg`,ubuntu下可通过`apt-get install ffmpeg`安装。完成上述步骤将在`/tmp/models`目录下生成`encoder.ts/onnx`, `decoder_prefill.ts/onnx`, `decoder_decode.onnx`6个文件。
+   完成该步骤将在`/tmp/models`目录下生成`encoder.ts`, `decoder_prefill.ts`, `decoder_decode.ts`3个文件。
   注:如需修改模型路径,可在打完补丁后手动修改`whisper/decoding.py`和`whisper/model.py`文件,后续步骤模型推理同样需要修改对应模型的载入路径。
 3. 模型编译
   ```
   python3 compile.py
   ```
-   执行完成后将在`/tmp/models`目录下生成`encoder_compiled.ts`, `language_detection_compiled.ts`, `decoder_prefill_compiled.ts`, `decoder_decode_compiled.ts`四个文件。
+   请忽略命令行的报错信息,执行完成后将在`/tmp/models`目录下生成`encoder_compiled.ts`, `language_detection_compiled.ts`, `decoder_prefill_compiled.ts`, `decoder_decode_compiled.ts`四个文件。
   参数说明:
   - --model_path:导出的Torchscript模型路径,模型编译后保存在同一路径, 默认为`/tmp/models`。
@@ -65,7 +66,7 @@
   ```
   cd whisper
   git reset --hard ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
-  patch -p1 < ../mindietorch_infer.patch
+  patch -p1 < ../torch_aie_infer.patch
   pip3 install .
   cd ..
   whisper zh.wav --model tiny
@@ -84,78 +85,4 @@
 # 模型推理性能精度
 
-1. 精度验证
-   ```
-   python3 precision_test.py
-   ```
-
-   参数说明:
-   - --sim_threshold: 余弦相似度阈值,默认0.99。
-   - --ntokens: prefill阶段输入token数量,decode阶段缓存token数量,默认100。
-
-   执行结束后,期望输出如下:
-   ```
-   === Compare the outputs of ONNX and AIE ===
-   Start comparing encoder...
-   Number of outputs to compare: 1
-   Number of outputs with cosine similarity > 0.99: 1
-   Number of outputs to compare: 3
-   Number of outputs with cosine similarity > 0.99: 3
-   Number of outputs to compare: 3
-   Number of outputs with cosine similarity > 0.99: 3
-   ```
-
-2. 
性能验证 - - a) aie模型性能测试 - ``` - python perf_test_aie.py - ``` - - 执行结束后,期望输出如下: - ``` - Encoder latency: 7.75 ms - Encoder throughput: 128.97 fps - Decoder prefill latency: 10.14 ms - Decoder prefill throughput: 98.63 fps - Decoder decode latency: 2.92 ms - Decoder decode throughput: 342.55 fps - ``` - - b) onnx模型性能测试 - (可选)若使用GPU,请确保已安装CUDA和pytorch-gpu版本,同时需安装onnxruntime-gpu,如下所示: - ```shell - pip uninstall onnxruntime - pip install onnxruntime-gpu - ``` - 验证onnxruntime-gpu是否安装成功: - ```python - import onnxruntime - print(onnxruntime.get_device()) # 若输出为GPU,则说明安装成功 - ``` - 执行性能测试 - ``` - python perf_test_onnx.py --use_gpu - ``` - - 参数说明: - - --use_gpu: 使能gpu推理,不加该选项默认cpu。 - - 执行结束后,期望输出如下: - ``` - Encoder latency: 59.49 ms - Encoder throughput: 16.81 fps - Decoder prefill latency: 141.14 ms - Decoder prefill throughput: 7.09 fps - Decoder decode latency: 36.05 ms - Decoder decode throughput: 27.74 fps - ``` - - - | 模型 | pt插件 - 310P性能(时延/吞吐率) | T4性能(时延/吞吐率) | A10性能(时延/吞吐率)| - |---------|--------------------------------|---------------------|--------------------| - | encoder | 7.75 ms / 128.97 fps | 9.31 ms / 107.47 fps | 4.21 ms / 237.50 fps | - | prefill | 10.14 ms / 98.63 fps | 72.08 ms / 13.87 fps | 45.15 ms / 22.15 fps | - | decode | 2.92 ms / 342.55 fps | 10.46 ms / 95.62 fps | 4.91 ms / 203.61 fps | - - 注:在实际推理中encoder和prefill均调用一次,decode会调用多次(上面数据假设缓存token长度为100)。并且在whisper全流程推理中还包括后处理,cache重新排布等步骤,以上数据仅作参考。 \ No newline at end of file +待后续补充。 diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/compile.py b/AscendIE/TorchAIE/built-in/audio/Whisper/compile.py index b739beb0ce050fcb1d745e1ec941cdbc99edff85..75d3f817a5d9c3d7db476fe2362de72e97bf464c 100644 --- a/AscendIE/TorchAIE/built-in/audio/Whisper/compile.py +++ b/AscendIE/TorchAIE/built-in/audio/Whisper/compile.py @@ -34,7 +34,6 @@ def parse_args(): return args def compile_and_save(ts_model, input_info, soc_version, save_path): - ts_model.eval() mindie_model = mindietorch.compile( ts_model, inputs=input_info, @@ -44,6 +43,7 @@ def compile_and_save(ts_model, input_info, soc_version, save_path): soc_version=soc_version, optimization_level=0 ) + mindie_model.eval() mindie_model.save(save_path) def encoder(args): @@ -73,10 +73,7 @@ def prefill(args): min_shape=[args.beam_size, 1], max_shape=[args.beam_size, _MAX_TOKEN] ) - input_audio_features_info = mindietorch.Input( - min_shape=[1, _HALF_FRAMES, _HIDDEN], - max_shape=[1, _HALF_FRAMES, _HIDDEN] - ) + input_audio_features_info = mindietorch.Input([1, _HALF_FRAMES, _HIDDEN]) input_pos_embed_info = mindietorch.Input( min_shape=[1, _HIDDEN], max_shape=[_MAX_TOKEN, _HIDDEN] @@ -92,26 +89,14 @@ def prefill(args): def decode(args): ts_model = torch.jit.load(f"{args.model_path}/decoder_decode.ts") - input_tokens_info = mindietorch.Input( - min_shape=[args.beam_size, 1], - max_shape=[args.beam_size, 1] - ) - input_audio_features_info = mindietorch.Input( - min_shape=[1, _HALF_FRAMES, _HIDDEN], - max_shape=[1, _HALF_FRAMES, _HIDDEN] - ) - input_pos_embed_info = mindietorch.Input( - min_shape=[_HIDDEN], - max_shape=[_HIDDEN] - ) + input_tokens_info = mindietorch.Input([args.beam_size, 1]) + input_audio_features_info = mindietorch.Input([1, _HALF_FRAMES, _HIDDEN]) + input_pos_embed_info = mindietorch.Input([_HIDDEN]) input_cache_dyn_info = mindietorch.Input( min_shape=(args.nblocks, _KV_NUM, args.beam_size, 1, _HIDDEN), max_shape=(args.nblocks, _KV_NUM, args.beam_size, _MAX_TOKEN, _HIDDEN) ) - input_cache_sta_info = mindietorch.Input( - min_shape=[args.nblocks, _KV_NUM, 1, _HALF_FRAMES, 
_HIDDEN], - max_shape=[args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN] - ) + input_cache_sta_info = mindietorch.Input([args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN]) input_info = [ input_tokens_info, diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_aie.py b/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_aie.py deleted file mode 100644 index 79530d717653c2afc8842328703132d4d8a7be1a..0000000000000000000000000000000000000000 --- a/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_aie.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -import torch -import mindietorch - -_N_MEL = 80 -_FRAMES = 3000 -_HALF_FRAMES = 1500 -_HIDDEN = 384 -_MAX_TOKEN = 224 -_KV_NUM = 2 - - -def test(inputs, model, stream, meta=""): - # warmup - for _ in range(10): - with mindietorch.npu.stream(stream): - model(*inputs) - stream.synchronize() - - # performance test - num_infer = 100 - start = time.time() - for _ in range(num_infer): - with mindietorch.npu.stream(stream): - model(*inputs) - stream.synchronize() - end = time.time() - - print(f"{meta} latency: {(end - start) / num_infer * 1000:.2f} ms") - print(f"{meta} throughput: {num_infer / (end - start):.2f} fps") - - -def test_encoder(args): - device = f'npu:{args.device_id}' - stream = mindietorch.npu.Stream(device) - model = torch.jit.load(args.encoder_aie_path) - model.eval() - - inputs = [ - torch.ones((1, _N_MEL, _FRAMES), dtype=torch.float32).to(device) - ] - - test(inputs, model, stream, "Encoder") - - -def test_decoder_prefill(args): - device = f'npu:{args.device_id}' - stream = mindietorch.npu.Stream(device) - model = torch.jit.load(args.decoder_prefill_aie_path) - model.eval() - - assert args.ntokens <= _MAX_TOKEN, f'ntokens can not exceed {_MAX_TOKEN}' - - inputs = [ - torch.ones((args.beam_size, args.ntokens), dtype=torch.float32).to(device), - torch.ones((1, _HALF_FRAMES, _HIDDEN), dtype=torch.float32).to(device), - torch.ones((args.ntokens, _HIDDEN), dtype=torch.float32).to(device) - ] - - test(inputs, model, stream, "Decoder prefill") - - -def test_decoder_decode(args): - device = f'npu:{args.device_id}' - stream = mindietorch.npu.Stream(device) - model = torch.jit.load(args.decoder_decode_aie_path) - model.eval() - - inputs = [ - torch.ones((args.beam_size, 1), dtype=torch.float32).to(device), - torch.ones((1, _HALF_FRAMES, _HIDDEN), dtype=torch.float32).to(device), - torch.ones((_HIDDEN), dtype=torch.float32).to(device), - torch.ones((args.nblocks, _KV_NUM, args.beam_size, args.ntokens, _HIDDEN), dtype=torch.float32).to(device), - torch.ones((args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN), dtype=torch.float32).to(device), - ] - - test(inputs, model, stream, "Decoder decode") - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--encoder_aie_path", - type=str, default="/tmp/models/encoder_compiled.ts" - ) - parser.add_argument( - "--decoder_prefill_aie_path", - type=str, 
default="/tmp/models/decoder_prefill_compiled.ts" - ) - parser.add_argument( - "--decoder_decode_aie_path", - type=str, default="/tmp/models/decoder_decode_compiled.ts" - ) - parser.add_argument("--beam_size", type=int, default=5) - parser.add_argument("--ntokens", type=int, default=100) - parser.add_argument("--nblocks", type=int, default=4) - parser.add_argument("--device_id", type=int, help="NPU device id", default=0) - - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - - mindietorch.set_device(args.device_id) - - for func in test_encoder, test_decoder_prefill, test_decoder_decode: - func(args) - - -if __name__ == "__main__": - main() diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_onnx.py b/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_onnx.py deleted file mode 100644 index 891d5507e7acd63d9d7ae7a9b37479ab36ec92f4..0000000000000000000000000000000000000000 --- a/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_onnx.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -import onnxruntime as ort -import numpy as np - -_N_MEL = 80 -_FRAMES = 3000 -_MAX_TOKEN = 224 -_HALF_FRAMES = 1500 -_HIDDEN = 384 -_KV_NUM = 2 - - -def test(encoder_path, provider, output_names, onnx_inputs, meta=""): - onnx_model = ort.InferenceSession( - encoder_path, - providers=[provider] - ) - - # warmup - for _ in range(10): - onnx_model.run(output_names, onnx_inputs) - # performance test - num_infer = 100 - start = time.time() - for _ in range(num_infer): - onnx_model.run(output_names, onnx_inputs) - end = time.time() - - print(f"{meta} latency: {(end - start) / num_infer * 1000:.2f} ms") - print(f"{meta} throughput: {num_infer / (end - start):.2f} fps") - - -def test_encoder(args, provider): - x = np.ones((1, _N_MEL, _FRAMES), dtype=np.float16 if args.use_gpu else np.float32) - onnx_inputs = {'mel': ort.OrtValue.ortvalue_from_numpy(x)} - output_names = ['ret'] - - test(args.encoder_onnx_path, provider, output_names, onnx_inputs, "Encoder") - - -def test_decoder_prefill(args, provider): - assert args.ntokens <= _MAX_TOKEN, f'ntokens can not exceed {_MAX_TOKEN}' - tokens = np.ones((args.beam_size, args.ntokens), dtype=np.int64) - audio_features = np.ones((1, _HALF_FRAMES, _HIDDEN), dtype=np.float16 if args.use_gpu else np.float32) - pos_embed = np.ones((args.ntokens, _HIDDEN), dtype=np.float32) - onnx_inputs = { - 'tokens': ort.OrtValue.ortvalue_from_numpy(tokens), - 'audio_features': ort.OrtValue.ortvalue_from_numpy(audio_features), - 'pos_embed': ort.OrtValue.ortvalue_from_numpy(pos_embed) - } - output_names = ["logits", "cache_dyn", "cache_sta"] - - test(args.decoder_prefill_onnx_path, provider, output_names, onnx_inputs, "Decoder prefill") - - -def test_decoder_decode(args, provider): - assert args.ntokens <= _MAX_TOKEN, f'ntokens can not exceed {_MAX_TOKEN}' - tokens = np.ones((args.beam_size, 1), dtype=np.int64) - pos_embed = np.ones((_HIDDEN), 
dtype=np.float32) - cache_dyn = np.ones( - (args.nblocks, _KV_NUM, args.beam_size, args.ntokens, _HIDDEN), - dtype=np.float16 if args.use_gpu else np.float32 - ) - cache_sta = np.ones( - (args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN), - dtype=np.float16 if args.use_gpu else np.float32 - ) - onnx_inputs = { - 'tokens': ort.OrtValue.ortvalue_from_numpy(tokens), # audio_features onnx导出被折叠 - 'pos_embed': ort.OrtValue.ortvalue_from_numpy(pos_embed), - 'cache_dyn': ort.OrtValue.ortvalue_from_numpy(cache_dyn), - 'cache_sta': ort.OrtValue.ortvalue_from_numpy(cache_sta) - } - output_names = ["logits", "new_cache_dyn", "new_cache_sta"] - - test(args.decoder_decode_onnx_path, provider, output_names, onnx_inputs, "Decoder decode") - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--encoder_onnx_path',type=str, default='/tmp/models/encoder.onnx') - parser.add_argument('--decoder_prefill_onnx_path',type=str, default='/tmp/models/decoder_prefill.onnx') - parser.add_argument('--decoder_decode_onnx_path',type=str, default='/tmp/models/decoder_decode.onnx') - parser.add_argument("--use_gpu", action="store_true") - parser.add_argument("--beam_size", type=int, default=5) - parser.add_argument("--ntokens", type=int, default=100) - parser.add_argument("--nblocks", type=int, default=4) - - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - if args.use_gpu: - provider = "CUDAExecutionProvider" - else: - provider = "CPUExecutionProvider" - - for func in test_encoder, test_decoder_prefill, test_decoder_decode: - func(args, provider) - - -if __name__ == "__main__": - main() diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/precision_test.py b/AscendIE/TorchAIE/built-in/audio/Whisper/precision_test.py deleted file mode 100644 index 6df0dfc8aac9e84591f70bbf872a8c29f5d0bca9..0000000000000000000000000000000000000000 --- a/AscendIE/TorchAIE/built-in/audio/Whisper/precision_test.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import torch -import torch.nn.functional as F -import onnxruntime as ort -import numpy as np -import mindietorch - -_N_MEL = 80 -_FRAMES = 3000 -_MAX_TOKEN = 224 -_HALF_FRAMES = 1500 -_HIDDEN = 384 -_KV_NUM = 2 - -def compare_onnx_aie_output(onnx_out, aie_out, sim_threshold=0.99): - num_sim = 0 - for i, (a, b) in enumerate(zip(onnx_out, aie_out)): - a = a.reshape(1, -1).astype(np.float32) - b = b.reshape(1, -1) - sim = F.cosine_similarity(torch.from_numpy(a), b, dim=1) - if sim > sim_threshold: - num_sim += 1 - else: - print(f'Output {i} similarity: {sim}') - - print(f'Number of outputs to compare: {len(onnx_out)}') - print(f'Number of outputs with cosine similarity > {sim_threshold}: {num_sim}') - - -def compare_encoder(args): - device = f'npu:{args.device_id}' - - onnx_model = ort.InferenceSession( - args.encoder_onnx_path, - providers=["CPUExecutionProvider"] - ) - - x = np.ones((1, _N_MEL, _FRAMES), dtype=np.float32) - onnx_inputs = {'mel': ort.OrtValue.ortvalue_from_numpy(x)} - output_names = ['ret'] - onnx_out = onnx_model.run(output_names, onnx_inputs) - - aie_inputs = [x] - for i in range(len(aie_inputs)): - aie_inputs[i] = torch.from_numpy(aie_inputs[i]).to(device) - - mindietorch.set_device(args.device_id) - stream = mindietorch.npu.Stream(device) - model = torch.jit.load(args.encoder_aie_path) - model.eval().to(device) - - with mindietorch.npu.stream(stream): - aie_out = model(*aie_inputs) - stream.synchronize() - - if isinstance(aie_out, tuple): - aie_out = (x.cpu() for x in aie_out) - else: - aie_out = aie_out.cpu() - compare_onnx_aie_output(onnx_out, aie_out, args.sim_threshold) - - -def compare_decoder_prefill(args): - device = f'npu:{args.device_id}' - - onnx_model = ort.InferenceSession( - args.decoder_prefill_onnx_path, - providers=["CPUExecutionProvider"] - ) - - assert args.ntokens <= _MAX_TOKEN, f'ntokens can not exceed {_MAX_TOKEN}' - tokens = np.ones((args.beam_size, args.ntokens), dtype=np.int64) - audio_features = np.ones((1, _HALF_FRAMES, _HIDDEN), dtype=np.float32) - pos_embed = np.ones((args.ntokens, _HIDDEN), dtype=np.float32) - onnx_inputs = { - 'tokens': ort.OrtValue.ortvalue_from_numpy(tokens), - 'audio_features': ort.OrtValue.ortvalue_from_numpy(audio_features), - 'pos_embed': ort.OrtValue.ortvalue_from_numpy(pos_embed) - } - output_names = ["logits", "cache_dyn", "cache_sta"] - onnx_out = onnx_model.run(output_names, onnx_inputs) - - aie_inputs = [tokens.astype(np.float32), audio_features, pos_embed] - for i in range(len(aie_inputs)): - aie_inputs[i] = torch.from_numpy(aie_inputs[i]).to(device) - - mindietorch.set_device(args.device_id) - stream = mindietorch.npu.Stream(device) - model = torch.jit.load(args.decoder_prefill_aie_path) - model.eval().to(device) - - with mindietorch.npu.stream(stream): - aie_out = model(*aie_inputs) - stream.synchronize() - if isinstance(aie_out, tuple): - aie_out = (x.cpu() for x in aie_out) - else: - aie_out = aie_out.cpu() - compare_onnx_aie_output(onnx_out, aie_out, args.sim_threshold) - - -def compare_decoder_decode(args): - device = f'npu:{args.device_id}' - - onnx_model = ort.InferenceSession( - args.decoder_decode_onnx_path, - providers=["CPUExecutionProvider"] - ) - - assert args.ntokens <= _MAX_TOKEN, f'ntokens can not exceed {_MAX_TOKEN}' - tokens = np.ones((args.beam_size, 1), dtype=np.int64) - audio_features = np.ones((1, _HALF_FRAMES, _HIDDEN), dtype=np.float32) - pos_embed = np.ones((_HIDDEN), dtype=np.float32) - cache_dyn = np.ones((args.nblocks, _KV_NUM, args.beam_size, args.ntokens, 
_HIDDEN), dtype=np.float32) - cache_sta = np.ones((args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN), dtype=np.float32) - onnx_inputs = { - 'tokens': ort.OrtValue.ortvalue_from_numpy(tokens), # audio_features onnx导出被折叠 - 'pos_embed': ort.OrtValue.ortvalue_from_numpy(pos_embed), - 'cache_dyn': ort.OrtValue.ortvalue_from_numpy(cache_dyn), - 'cache_sta': ort.OrtValue.ortvalue_from_numpy(cache_sta) - } - - output_names = ["logits", "new_cache_dyn", "new_cache_sta"] - onnx_out = onnx_model.run(output_names, onnx_inputs) - - aie_inputs = [tokens.astype(np.float32), audio_features, pos_embed, cache_dyn, cache_sta] - for i in range(len(aie_inputs)): - aie_inputs[i] = torch.from_numpy(aie_inputs[i]).to(device) - - mindietorch.set_device(args.device_id) - stream = mindietorch.npu.Stream(device) - model = torch.jit.load(args.decoder_decode_aie_path) - model.eval().to(device) - - with mindietorch.npu.stream(stream): - aie_out = model(*aie_inputs) - stream.synchronize() - if isinstance(aie_out, tuple): - aie_out = (x.cpu() for x in aie_out) - else: - aie_out = aie_out.cpu() - compare_onnx_aie_output(onnx_out, aie_out, args.sim_threshold) - - -def parse_args(): - parser = argparse.ArgumentParser() - # encoder - parser.add_argument('--encoder_onnx_path',type=str, default='/tmp/models/encoder.onnx') - parser.add_argument('--encoder_aie_path', type=str, default='/tmp/models/encoder_compiled.ts') - # decoder_prefill - parser.add_argument('--decoder_prefill_onnx_path',type=str, default='/tmp/models/decoder_prefill.onnx') - parser.add_argument('--decoder_prefill_aie_path', type=str, default='/tmp/models/decoder_prefill_compiled.ts') - # decoder_decode - parser.add_argument('--decoder_decode_onnx_path',type=str, default='/tmp/models/decoder_decode.onnx') - parser.add_argument('--decoder_decode_aie_path', type=str, default='/tmp/models/decoder_decode_compiled.ts') - parser.add_argument('--sim_threshold', type=float, default=0.99) - parser.add_argument('--device_id', type=int, default=0) - parser.add_argument("--beam_size", type=int, default=5) - parser.add_argument("--ntokens", type=int, default=100) - parser.add_argument("--nblocks", type=int, default=4) - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - - print('=== Compare the outputs of ONNX and AIE ===') - - print('Start comparing encoder...') - funcs = [compare_encoder, compare_decoder_prefill, compare_decoder_decode] - for func in funcs: - func(args) - - -if __name__ == "__main__": - main() diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/mindietorch_infer.patch b/AscendIE/TorchAIE/built-in/audio/Whisper/torch_aie_infer.patch similarity index 92% rename from AscendIE/TorchAIE/built-in/audio/Whisper/mindietorch_infer.patch rename to AscendIE/TorchAIE/built-in/audio/Whisper/torch_aie_infer.patch index fc7f771847486b8542d41a2a54876304c481399e..62b432f05f9e241170508b108d5b77f08517229c 100644 --- a/AscendIE/TorchAIE/built-in/audio/Whisper/mindietorch_infer.patch +++ b/AscendIE/TorchAIE/built-in/audio/Whisper/torch_aie_infer.patch @@ -1,5 +1,5 @@ diff --git a/whisper/decoding.py b/whisper/decoding.py -index 49485d0..4dccc86 100644 +index 49485d0..345dea7 100644 --- a/whisper/decoding.py +++ b/whisper/decoding.py @@ -6,6 +6,7 @@ import torch @@ -83,7 +83,7 @@ index 49485d0..4dccc86 100644 class SequenceRanker: def rank( diff --git a/whisper/model.py b/whisper/model.py -index a678283..c94a024 100644 +index a678283..c2c2278 100644 --- a/whisper/model.py +++ b/whisper/model.py @@ -1,12 +1,14 @@ @@ -106,7 +106,7 @@ index 
a678283..c94a024 100644 ) self.ln_post = LayerNorm(n_state) + self.device = "npu:0" -+ self.mindietorch_encoder_model = torch.jit.load( ++ self.mindie_encoder_model = torch.jit.load( + "/tmp/models/encoder_compiled.ts" + ).eval().to(self.device) @@ -128,7 +128,7 @@ index a678283..c94a024 100644 - x = self.ln_post(x) - return x + x = x.to(self.device) -+ x = self.mindietorch_encoder_model(x) ++ x = self.mindie_encoder_model(x) + return x.cpu() @@ -154,13 +154,13 @@ index a678283..c94a024 100644 - for block in self.blocks: - x = block(x, xa, mask=self.mask, kv_cache=kv_cache) + self.device = "npu:0" -+ self.mindietorch_language_detection_model = torch.jit.load( ++ self.mindie_language_detection_model = torch.jit.load( + "/tmp/models/language_detection_compiled.ts" + ).eval().to(self.device) -+ self.mindietorch_prefill_model = torch.jit.load( ++ self.mindie_prefill_model = torch.jit.load( + "/tmp/models/decoder_prefill_compiled.ts" + ).eval().to(self.device) -+ self.mindietorch_decode_model = torch.jit.load( ++ self.mindie_decode_model = torch.jit.load( + "/tmp/models/decoder_decode_compiled.ts" + ).eval().to(self.device) @@ -183,13 +183,13 @@ index a678283..c94a024 100644 + audio_features_npu = xa.to(self.device) + pos_embed_npu = pos_embed.to(self.device) + if x.shape[0] != 1: -+ logits, cache_dyn, cache_sta = self.mindietorch_prefill_model( ++ logits, cache_dyn, cache_sta = self.mindie_prefill_model( + tokens_npu, + audio_features_npu, + pos_embed_npu + ) + else: -+ logits, cache_dyn, cache_sta = self.mindietorch_language_detection_model( ++ logits, cache_dyn, cache_sta = self.mindie_language_detection_model( + tokens_npu, + audio_features_npu, + pos_embed_npu @@ -201,7 +201,7 @@ index a678283..c94a024 100644 + audio_features_npu = xa.to(self.device) + pos_embed_npu = pos_embed.to(self.device) + cache_dyn_npu = cache_dyn.to(self.device) -+ logits, cache_dyn, _ = self.mindietorch_decode_model( ++ logits, cache_dyn, _ = self.mindie_decode_model( + tokens_npu, + audio_features_npu, + pos_embed_npu, diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/trace_model.patch b/AscendIE/TorchAIE/built-in/audio/Whisper/trace_model.patch index a35756ff38f412308096baa0caf2da2880b71613..d9c1ff7e29431f8774be6c91dd89f84bbc473221 100644 --- a/AscendIE/TorchAIE/built-in/audio/Whisper/trace_model.patch +++ b/AscendIE/TorchAIE/built-in/audio/Whisper/trace_model.patch @@ -1,8 +1,8 @@ diff --git a/whisper/decoding.py b/whisper/decoding.py -index 49485d0..495fe45 100644 +index 49485d0..4826389 100644 --- a/whisper/decoding.py +++ b/whisper/decoding.py -@@ -2,6 +2,7 @@ from dataclasses import dataclass, field, replace +@@ -2,10 +2,12 @@ from dataclasses import dataclass, field, replace from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Sequence, Tuple, Union import numpy as np @@ -10,22 +10,18 @@ index 49485d0..495fe45 100644 import torch import torch.nn.functional as F from torch import Tensor -@@ -49,12 +50,24 @@ def detect_language( + from torch.distributions import Categorical ++import mindietorch + + from .audio import CHUNK_LENGTH + from .tokenizer import Tokenizer, get_tokenizer +@@ -49,12 +51,15 @@ def detect_language( # skip encoder forward pass if already-encoded audio features were given if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state): + encoder_ts_model = torch.jit.trace(model.encoder, mel) + encoder_ts_model.save( + "/tmp/models/encoder.ts") -+ torch.onnx.export( -+ model.encoder, -+ (mel), -+ "/tmp/models/encoder.onnx", -+ opset_version=11, -+ 
input_names=["mel"], -+ output_names=["ret"] -+ ) -+ mel = model.encoder(mel) # forward pass using a single token, startoftranscript @@ -36,7 +32,7 @@ index 49485d0..495fe45 100644 # collect detected languages; suppress all non-language tokens mask = torch.ones(logits.shape[-1], dtype=torch.bool) -@@ -145,36 +158,74 @@ class PyTorchInference(Inference): +@@ -145,36 +150,48 @@ class PyTorchInference(Inference): def __init__(self, model: "Whisper", initial_token_length: int): self.model: "Whisper" = model self.initial_token_length = initial_token_length @@ -57,44 +53,18 @@ index 49485d0..495fe45 100644 # only need to use the last token except in the first forward pass tokens = tokens[:, -1:] + pos_embed = self.model.decoder.positional_embedding[self.cache_dyn.shape[3]] -+ torch.onnx.export( -+ self.model.decoder, -+ (tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta), -+ "/tmp/models/decoder_decode.onnx", -+ opset_version=11, -+ input_names=["tokens", "audio_features", "pos_embed", "cache_dyn", "cache_sta"], -+ output_names=["logits", "new_cache_dyn", "new_cache_sta"], -+ dynamic_axes={ -+ "cache_dyn": {3: "ntokens"}, -+ "new_cache_dyn": {3: "ntokens"} -+ } -+ ) + decoder_decode_ts_model = torch.jit.trace( + self.model.decoder, + (tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta) + ) + decoder_decode_ts_model.save( + "/tmp/models/decoder_decode.ts") ++ os.sys.exit(0) + logits, cache_dyn, _ = self.model.decoder( + tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta) -+ os.sys.exit(0) + self.cache_dyn = cache_dyn + else: + pos_embed = self.model.decoder.positional_embedding[:tokens.shape[-1]] -+ torch.onnx.export( -+ self.model.decoder, -+ (tokens, audio_features, pos_embed), -+ "/tmp/models/decoder_prefill.onnx", -+ opset_version=11, -+ input_names=["tokens", "audio_features", "pos_embed"], -+ output_names=["logits", "cache_dyn", "cache_sta"], -+ dynamic_axes={ -+ "tokens": {1: "ntokens"}, -+ "pos_embed": {0: "ntokens"}, -+ "logits": {1: "ntokens"}, -+ "cache_dyn": {3: "ntokens"} -+ } -+ ) + decoder_prefill_ts_model = torch.jit.trace( + self.model.decoder, + (tokens, audio_features, pos_embed) @@ -131,10 +101,10 @@ index 49485d0..495fe45 100644 class SequenceRanker: def rank( diff --git a/whisper/model.py b/whisper/model.py -index a678283..2a95e28 100644 +index a678283..a20aaad 100644 --- a/whisper/model.py +++ b/whisper/model.py -@@ -1,6 +1,7 @@ +@@ -1,12 +1,14 @@ import base64 import gzip from dataclasses import dataclass @@ -142,7 +112,14 @@ index a678283..2a95e28 100644 from typing import Dict, Iterable, Optional import numpy as np -@@ -68,6 +69,63 @@ class MultiHeadAttention(nn.Module): + import torch + import torch.nn.functional as F + from torch import Tensor, nn ++import mindietorch + + from .decoding import decode as decode_function + from .decoding import detect_language as detect_language_function +@@ -68,6 +70,63 @@ class MultiHeadAttention(nn.Module): self.value = Linear(n_state, n_state) self.out = Linear(n_state, n_state) @@ -206,7 +183,7 @@ index a678283..2a95e28 100644 def forward( self, x: Tensor, -@@ -126,6 +184,39 @@ class ResidualAttentionBlock(nn.Module): +@@ -126,6 +185,39 @@ class ResidualAttentionBlock(nn.Module): ) self.mlp_ln = LayerNorm(n_state) @@ -246,7 +223,7 @@ index a678283..2a95e28 100644 def forward( self, x: Tensor, -@@ -163,11 +254,10 @@ class AudioEncoder(nn.Module): +@@ -163,11 +255,10 @@ class AudioEncoder(nn.Module): x = F.gelu(self.conv2(x)) x = x.permute(0, 2, 1) @@ -259,7 +236,7 @@ index 
a678283..2a95e28 100644 x = self.ln_post(x) return x -@@ -193,29 +283,56 @@ class TextDecoder(nn.Module): +@@ -193,29 +284,56 @@ class TextDecoder(nn.Module): mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1) self.register_buffer("mask", mask, persistent=False) @@ -331,7 +308,7 @@ index a678283..2a95e28 100644 class Whisper(nn.Module): -@@ -257,7 +374,8 @@ class Whisper(nn.Module): +@@ -257,7 +375,8 @@ class Whisper(nn.Module): return self.encoder(mel) def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor):
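
For orientation, the changes above share one flow: `compile.py` loads each traced TorchScript module, describes its input ranges with `mindietorch.Input`, compiles it for the target SoC, and saves a `*_compiled.ts` file; the patched `whisper` package then loads those compiled files and runs them on an NPU stream. The sketch below strings these steps together for the encoder only. It is illustrative and not part of the patch: the `/tmp/models` paths, the `(1, 80, 3000)` mel shape, and the `Ascend310P3` `soc_version` are assumptions taken from this diff, and compile options other than `inputs`, `soc_version`, and `optimization_level` (e.g. any precision policy) are omitted.

```python
# Minimal sketch of the compile-then-run flow used by compile.py and the
# patched whisper/model.py. Paths, shapes, and soc_version are assumptions.
import torch
import mindietorch

DEVICE_ID = 0
SOC_VERSION = "Ascend310P3"                       # assumed from the README version table
ENCODER_TS = "/tmp/models/encoder.ts"             # traced by trace_model.patch
ENCODER_COMPILED = "/tmp/models/encoder_compiled.ts"


def compile_encoder():
    ts_model = torch.jit.load(ENCODER_TS)
    # min_shape == max_shape pins the input to a static (1, 80, 3000) mel tensor
    input_info = [mindietorch.Input(min_shape=[1, 80, 3000], max_shape=[1, 80, 3000])]
    compiled = mindietorch.compile(
        ts_model,
        inputs=input_info,
        soc_version=SOC_VERSION,
        optimization_level=0,
    )
    compiled.save(ENCODER_COMPILED)


def run_encoder():
    device = f"npu:{DEVICE_ID}"
    mindietorch.set_device(DEVICE_ID)
    stream = mindietorch.npu.Stream(device)
    model = torch.jit.load(ENCODER_COMPILED).eval().to(device)
    mel = torch.ones((1, 80, 3000), dtype=torch.float32).to(device)
    # Launch on an explicit NPU stream and synchronize before reading the result back
    with mindietorch.npu.stream(stream):
        out = model(mel)
    stream.synchronize()
    return out.cpu()


if __name__ == "__main__":
    compile_encoder()
    print(run_encoder().shape)
```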