diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/README.md b/AscendIE/TorchAIE/built-in/audio/Whisper/README.md
index 8d46c8717576dc0ce91baac18db17712f9cea3d8..39a4069727cc68fa4fdb3a912f9858f822afe8fb 100644
--- a/AscendIE/TorchAIE/built-in/audio/Whisper/README.md
+++ b/AscendIE/TorchAIE/built-in/audio/Whisper/README.md
@@ -17,17 +17,18 @@
# Inference Environment Setup \[All Versions\]
-- This model requires the following plugins and drivers
+- This model requires the following dependencies
**Table 1** Version compatibility
- | Component | Version |
- |---------| ------- |
- | Firmware and drivers | 24.1.rc1 |
- | CANN | 8.0.rc1 |
- | Python | 3.10.13 |
- | PyTorch | 2.1.0 |
- | Ascend-mindie-rt1.0.RC1 | - |
- | Ascend-mindie-torch-1.0.RC1 | - |
+
+| Component                   | Version     |
+|-----------------------------|-------------|
+| CANN                        | 8.0.RC1     |
+| Python                      | 3.10.13     |
+| torch                       | 2.1.0       |
+| Ascend-mindie-rt_1.0.RC1    | -           |
+| Ascend-mindie-torch-1.0.RC1 | -           |
+| Chip type                   | Ascend310P3 |
# Quick Start
@@ -46,14 +47,14 @@
mkdir /tmp/models
whisper zh.wav --model tiny
```
-    The steps above depend on `ffmpeg`; on Ubuntu it can be installed with `apt-get install ffmpeg`. After completing the steps above, six files (`encoder.ts/onnx`, `decoder_prefill.ts/onnx`, `decoder_decode.ts/onnx`) are generated under `/tmp/models`.
+    This step depends on `ffmpeg` (on Ubuntu it can be installed with `apt-get install ffmpeg`). Completing it generates three files (`encoder.ts`, `decoder_prefill.ts`, `decoder_decode.ts`) under `/tmp/models`.
    Note: to change the model path, manually edit `whisper/decoding.py` and `whisper/model.py` after applying the patch; the model load paths used for inference in later steps must be updated accordingly.
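    For reference, the loading logic that the inference patch (`torch_aie_infer.patch`, applied in a later step) adds to `whisper/model.py` looks roughly like the excerpt below (adapted from the patch); the hard-coded string is the path that would have to be edited:
    ```
    self.device = "npu:0"
    # "/tmp/models" is the default export directory used throughout this README
    self.mindie_encoder_model = torch.jit.load(
        "/tmp/models/encoder_compiled.ts"
    ).eval().to(self.device)
    ```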
3. Model compilation
```
python3 compile.py
```
-    After execution completes, four files (`encoder_compiled.ts`, `language_detection_compiled.ts`, `decoder_prefill_compiled.ts`, `decoder_decode_compiled.ts`) are generated under `/tmp/models`.
+    Error messages printed to the command line during this step can be ignored. After execution completes, four files (`encoder_compiled.ts`, `language_detection_compiled.ts`, `decoder_prefill_compiled.ts`, `decoder_decode_compiled.ts`) are generated under `/tmp/models`.
    Parameter description (an example invocation follows):
    - --model_path: path to the exported TorchScript models; the compiled models are saved to the same path. Defaults to `/tmp/models`.
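    For instance, to compile models that were exported to a custom directory (the path below is purely illustrative), the script could be invoked as follows; the hard-coded load paths mentioned in the note under step 2 would need to match:
    ```
    python3 compile.py --model_path /home/user/whisper_models
    ```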
@@ -65,7 +66,7 @@
```
cd whisper
git reset --hard ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
- patch -p1 < ../mindietorch_infer.patch
+ patch -p1 < ../torch_aie_infer.patch
pip3 install .
cd ..
whisper zh.wav --model tiny
@@ -84,78 +85,4 @@
# Model Inference Performance and Accuracy
-1. Accuracy validation
- ```
- python3 precision_test.py
- ```
-
-    Parameter description:
-    - --sim_threshold: cosine similarity threshold, default 0.99.
-    - --ntokens: number of input tokens in the prefill stage and of cached tokens in the decode stage, default 100.
-
-    After execution, the expected output is as follows:
- ```
- === Compare the outputs of ONNX and AIE ===
- Start comparing encoder...
- Number of outputs to compare: 1
- Number of outputs with cosine similarity > 0.99: 1
- Number of outputs to compare: 3
- Number of outputs with cosine similarity > 0.99: 3
- Number of outputs to compare: 3
- Number of outputs with cosine similarity > 0.99: 3
- ```
-
-2. Performance validation
-
-    a) AIE model performance test
- ```
- python perf_test_aie.py
- ```
-
-    After execution, the expected output is as follows:
- ```
- Encoder latency: 7.75 ms
- Encoder throughput: 128.97 fps
- Decoder prefill latency: 10.14 ms
- Decoder prefill throughput: 98.63 fps
- Decoder decode latency: 2.92 ms
- Decoder decode throughput: 342.55 fps
- ```
-
-    b) ONNX model performance test
-    (Optional) If using a GPU, make sure CUDA and the GPU build of PyTorch are installed, and also install onnxruntime-gpu as shown below:
- ```shell
- pip uninstall onnxruntime
- pip install onnxruntime-gpu
- ```
-    Verify that onnxruntime-gpu is installed correctly:
- ```python
- import onnxruntime
-    print(onnxruntime.get_device())  # if the output is GPU, the installation succeeded
- ```
-    Run the performance test
- ```
- python perf_test_onnx.py --use_gpu
- ```
-
-    Parameter description:
-    - --use_gpu: enable GPU inference; without this option the CPU is used by default.
-
-    After execution, the expected output is as follows:
- ```
- Encoder latency: 59.49 ms
- Encoder throughput: 16.81 fps
- Decoder prefill latency: 141.14 ms
- Decoder prefill throughput: 7.09 fps
- Decoder decode latency: 36.05 ms
- Decoder decode throughput: 27.74 fps
- ```
-
-
-    | Model | PT plugin - 310P performance (latency / throughput) | T4 performance (latency / throughput) | A10 performance (latency / throughput) |
- |---------|--------------------------------|---------------------|--------------------|
- | encoder | 7.75 ms / 128.97 fps | 9.31 ms / 107.47 fps | 4.21 ms / 237.50 fps |
- | prefill | 10.14 ms / 98.63 fps | 72.08 ms / 13.87 fps | 45.15 ms / 22.15 fps |
- | decode | 2.92 ms / 342.55 fps | 10.46 ms / 95.62 fps | 4.91 ms / 203.61 fps |
-
-    Note: in real inference the encoder and prefill are each called once, while decode is called many times (the figures above assume a cached token length of 100). The end-to-end Whisper pipeline also includes post-processing, cache reordering, and other steps, so the numbers above are for reference only.
\ No newline at end of file
+To be added later.
diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/compile.py b/AscendIE/TorchAIE/built-in/audio/Whisper/compile.py
index b739beb0ce050fcb1d745e1ec941cdbc99edff85..75d3f817a5d9c3d7db476fe2362de72e97bf464c 100644
--- a/AscendIE/TorchAIE/built-in/audio/Whisper/compile.py
+++ b/AscendIE/TorchAIE/built-in/audio/Whisper/compile.py
@@ -34,7 +34,6 @@ def parse_args():
return args
def compile_and_save(ts_model, input_info, soc_version, save_path):
- ts_model.eval()
mindie_model = mindietorch.compile(
ts_model,
inputs=input_info,
@@ -44,6 +43,7 @@ def compile_and_save(ts_model, input_info, soc_version, save_path):
soc_version=soc_version,
optimization_level=0
)
+ mindie_model.eval()
mindie_model.save(save_path)
def encoder(args):
@@ -73,10 +73,7 @@ def prefill(args):
min_shape=[args.beam_size, 1],
max_shape=[args.beam_size, _MAX_TOKEN]
)
- input_audio_features_info = mindietorch.Input(
- min_shape=[1, _HALF_FRAMES, _HIDDEN],
- max_shape=[1, _HALF_FRAMES, _HIDDEN]
- )
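+    # fixed-shape input: a single shape replaces the identical min_shape/max_shape pair removed above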
+ input_audio_features_info = mindietorch.Input([1, _HALF_FRAMES, _HIDDEN])
input_pos_embed_info = mindietorch.Input(
min_shape=[1, _HIDDEN],
max_shape=[_MAX_TOKEN, _HIDDEN]
@@ -92,26 +89,14 @@ def prefill(args):
def decode(args):
ts_model = torch.jit.load(f"{args.model_path}/decoder_decode.ts")
- input_tokens_info = mindietorch.Input(
- min_shape=[args.beam_size, 1],
- max_shape=[args.beam_size, 1]
- )
- input_audio_features_info = mindietorch.Input(
- min_shape=[1, _HALF_FRAMES, _HIDDEN],
- max_shape=[1, _HALF_FRAMES, _HIDDEN]
- )
- input_pos_embed_info = mindietorch.Input(
- min_shape=[_HIDDEN],
- max_shape=[_HIDDEN]
- )
+ input_tokens_info = mindietorch.Input([args.beam_size, 1])
+ input_audio_features_info = mindietorch.Input([1, _HALF_FRAMES, _HIDDEN])
+ input_pos_embed_info = mindietorch.Input([_HIDDEN])
input_cache_dyn_info = mindietorch.Input(
min_shape=(args.nblocks, _KV_NUM, args.beam_size, 1, _HIDDEN),
max_shape=(args.nblocks, _KV_NUM, args.beam_size, _MAX_TOKEN, _HIDDEN)
)
- input_cache_sta_info = mindietorch.Input(
- min_shape=[args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN],
- max_shape=[args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN]
- )
+ input_cache_sta_info = mindietorch.Input([args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN])
input_info = [
input_tokens_info,
diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_aie.py b/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_aie.py
deleted file mode 100644
index 79530d717653c2afc8842328703132d4d8a7be1a..0000000000000000000000000000000000000000
--- a/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_aie.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import time
-import torch
-import mindietorch
-
-_N_MEL = 80
-_FRAMES = 3000
-_HALF_FRAMES = 1500
-_HIDDEN = 384
-_MAX_TOKEN = 224
-_KV_NUM = 2
-
-
-def test(inputs, model, stream, meta=""):
- # warmup
- for _ in range(10):
- with mindietorch.npu.stream(stream):
- model(*inputs)
- stream.synchronize()
-
- # performance test
- num_infer = 100
- start = time.time()
- for _ in range(num_infer):
- with mindietorch.npu.stream(stream):
- model(*inputs)
- stream.synchronize()
- end = time.time()
-
- print(f"{meta} latency: {(end - start) / num_infer * 1000:.2f} ms")
- print(f"{meta} throughput: {num_infer / (end - start):.2f} fps")
-
-
-def test_encoder(args):
- device = f'npu:{args.device_id}'
- stream = mindietorch.npu.Stream(device)
- model = torch.jit.load(args.encoder_aie_path)
- model.eval()
-
- inputs = [
- torch.ones((1, _N_MEL, _FRAMES), dtype=torch.float32).to(device)
- ]
-
- test(inputs, model, stream, "Encoder")
-
-
-def test_decoder_prefill(args):
- device = f'npu:{args.device_id}'
- stream = mindietorch.npu.Stream(device)
- model = torch.jit.load(args.decoder_prefill_aie_path)
- model.eval()
-
- assert args.ntokens <= _MAX_TOKEN, f'ntokens can not exceed {_MAX_TOKEN}'
-
- inputs = [
- torch.ones((args.beam_size, args.ntokens), dtype=torch.float32).to(device),
- torch.ones((1, _HALF_FRAMES, _HIDDEN), dtype=torch.float32).to(device),
- torch.ones((args.ntokens, _HIDDEN), dtype=torch.float32).to(device)
- ]
-
- test(inputs, model, stream, "Decoder prefill")
-
-
-def test_decoder_decode(args):
- device = f'npu:{args.device_id}'
- stream = mindietorch.npu.Stream(device)
- model = torch.jit.load(args.decoder_decode_aie_path)
- model.eval()
-
- inputs = [
- torch.ones((args.beam_size, 1), dtype=torch.float32).to(device),
- torch.ones((1, _HALF_FRAMES, _HIDDEN), dtype=torch.float32).to(device),
- torch.ones((_HIDDEN), dtype=torch.float32).to(device),
- torch.ones((args.nblocks, _KV_NUM, args.beam_size, args.ntokens, _HIDDEN), dtype=torch.float32).to(device),
- torch.ones((args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN), dtype=torch.float32).to(device),
- ]
-
- test(inputs, model, stream, "Decoder decode")
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument(
- "--encoder_aie_path",
- type=str, default="/tmp/models/encoder_compiled.ts"
- )
- parser.add_argument(
- "--decoder_prefill_aie_path",
- type=str, default="/tmp/models/decoder_prefill_compiled.ts"
- )
- parser.add_argument(
- "--decoder_decode_aie_path",
- type=str, default="/tmp/models/decoder_decode_compiled.ts"
- )
- parser.add_argument("--beam_size", type=int, default=5)
- parser.add_argument("--ntokens", type=int, default=100)
- parser.add_argument("--nblocks", type=int, default=4)
- parser.add_argument("--device_id", type=int, help="NPU device id", default=0)
-
- args = parser.parse_args()
- return args
-
-
-def main():
- args = parse_args()
-
- mindietorch.set_device(args.device_id)
-
- for func in test_encoder, test_decoder_prefill, test_decoder_decode:
- func(args)
-
-
-if __name__ == "__main__":
- main()
diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_onnx.py b/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_onnx.py
deleted file mode 100644
index 891d5507e7acd63d9d7ae7a9b37479ab36ec92f4..0000000000000000000000000000000000000000
--- a/AscendIE/TorchAIE/built-in/audio/Whisper/perf_test_onnx.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import time
-import onnxruntime as ort
-import numpy as np
-
-_N_MEL = 80
-_FRAMES = 3000
-_MAX_TOKEN = 224
-_HALF_FRAMES = 1500
-_HIDDEN = 384
-_KV_NUM = 2
-
-
-def test(encoder_path, provider, output_names, onnx_inputs, meta=""):
- onnx_model = ort.InferenceSession(
- encoder_path,
- providers=[provider]
- )
-
- # warmup
- for _ in range(10):
- onnx_model.run(output_names, onnx_inputs)
- # performance test
- num_infer = 100
- start = time.time()
- for _ in range(num_infer):
- onnx_model.run(output_names, onnx_inputs)
- end = time.time()
-
- print(f"{meta} latency: {(end - start) / num_infer * 1000:.2f} ms")
- print(f"{meta} throughput: {num_infer / (end - start):.2f} fps")
-
-
-def test_encoder(args, provider):
- x = np.ones((1, _N_MEL, _FRAMES), dtype=np.float16 if args.use_gpu else np.float32)
- onnx_inputs = {'mel': ort.OrtValue.ortvalue_from_numpy(x)}
- output_names = ['ret']
-
- test(args.encoder_onnx_path, provider, output_names, onnx_inputs, "Encoder")
-
-
-def test_decoder_prefill(args, provider):
- assert args.ntokens <= _MAX_TOKEN, f'ntokens can not exceed {_MAX_TOKEN}'
- tokens = np.ones((args.beam_size, args.ntokens), dtype=np.int64)
- audio_features = np.ones((1, _HALF_FRAMES, _HIDDEN), dtype=np.float16 if args.use_gpu else np.float32)
- pos_embed = np.ones((args.ntokens, _HIDDEN), dtype=np.float32)
- onnx_inputs = {
- 'tokens': ort.OrtValue.ortvalue_from_numpy(tokens),
- 'audio_features': ort.OrtValue.ortvalue_from_numpy(audio_features),
- 'pos_embed': ort.OrtValue.ortvalue_from_numpy(pos_embed)
- }
- output_names = ["logits", "cache_dyn", "cache_sta"]
-
- test(args.decoder_prefill_onnx_path, provider, output_names, onnx_inputs, "Decoder prefill")
-
-
-def test_decoder_decode(args, provider):
- assert args.ntokens <= _MAX_TOKEN, f'ntokens can not exceed {_MAX_TOKEN}'
- tokens = np.ones((args.beam_size, 1), dtype=np.int64)
- pos_embed = np.ones((_HIDDEN), dtype=np.float32)
- cache_dyn = np.ones(
- (args.nblocks, _KV_NUM, args.beam_size, args.ntokens, _HIDDEN),
- dtype=np.float16 if args.use_gpu else np.float32
- )
- cache_sta = np.ones(
- (args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN),
- dtype=np.float16 if args.use_gpu else np.float32
- )
- onnx_inputs = {
-        'tokens': ort.OrtValue.ortvalue_from_numpy(tokens),  # audio_features is folded away during ONNX export
- 'pos_embed': ort.OrtValue.ortvalue_from_numpy(pos_embed),
- 'cache_dyn': ort.OrtValue.ortvalue_from_numpy(cache_dyn),
- 'cache_sta': ort.OrtValue.ortvalue_from_numpy(cache_sta)
- }
- output_names = ["logits", "new_cache_dyn", "new_cache_sta"]
-
- test(args.decoder_decode_onnx_path, provider, output_names, onnx_inputs, "Decoder decode")
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument('--encoder_onnx_path',type=str, default='/tmp/models/encoder.onnx')
- parser.add_argument('--decoder_prefill_onnx_path',type=str, default='/tmp/models/decoder_prefill.onnx')
- parser.add_argument('--decoder_decode_onnx_path',type=str, default='/tmp/models/decoder_decode.onnx')
- parser.add_argument("--use_gpu", action="store_true")
- parser.add_argument("--beam_size", type=int, default=5)
- parser.add_argument("--ntokens", type=int, default=100)
- parser.add_argument("--nblocks", type=int, default=4)
-
- args = parser.parse_args()
- return args
-
-
-def main():
- args = parse_args()
- if args.use_gpu:
- provider = "CUDAExecutionProvider"
- else:
- provider = "CPUExecutionProvider"
-
- for func in test_encoder, test_decoder_prefill, test_decoder_decode:
- func(args, provider)
-
-
-if __name__ == "__main__":
- main()
diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/precision_test.py b/AscendIE/TorchAIE/built-in/audio/Whisper/precision_test.py
deleted file mode 100644
index 6df0dfc8aac9e84591f70bbf872a8c29f5d0bca9..0000000000000000000000000000000000000000
--- a/AscendIE/TorchAIE/built-in/audio/Whisper/precision_test.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import torch
-import torch.nn.functional as F
-import onnxruntime as ort
-import numpy as np
-import mindietorch
-
-_N_MEL = 80
-_FRAMES = 3000
-_MAX_TOKEN = 224
-_HALF_FRAMES = 1500
-_HIDDEN = 384
-_KV_NUM = 2
-
-def compare_onnx_aie_output(onnx_out, aie_out, sim_threshold=0.99):
- num_sim = 0
- for i, (a, b) in enumerate(zip(onnx_out, aie_out)):
- a = a.reshape(1, -1).astype(np.float32)
- b = b.reshape(1, -1)
- sim = F.cosine_similarity(torch.from_numpy(a), b, dim=1)
- if sim > sim_threshold:
- num_sim += 1
- else:
- print(f'Output {i} similarity: {sim}')
-
- print(f'Number of outputs to compare: {len(onnx_out)}')
- print(f'Number of outputs with cosine similarity > {sim_threshold}: {num_sim}')
-
-
-def compare_encoder(args):
- device = f'npu:{args.device_id}'
-
- onnx_model = ort.InferenceSession(
- args.encoder_onnx_path,
- providers=["CPUExecutionProvider"]
- )
-
- x = np.ones((1, _N_MEL, _FRAMES), dtype=np.float32)
- onnx_inputs = {'mel': ort.OrtValue.ortvalue_from_numpy(x)}
- output_names = ['ret']
- onnx_out = onnx_model.run(output_names, onnx_inputs)
-
- aie_inputs = [x]
- for i in range(len(aie_inputs)):
- aie_inputs[i] = torch.from_numpy(aie_inputs[i]).to(device)
-
- mindietorch.set_device(args.device_id)
- stream = mindietorch.npu.Stream(device)
- model = torch.jit.load(args.encoder_aie_path)
- model.eval().to(device)
-
- with mindietorch.npu.stream(stream):
- aie_out = model(*aie_inputs)
- stream.synchronize()
-
- if isinstance(aie_out, tuple):
- aie_out = (x.cpu() for x in aie_out)
- else:
- aie_out = aie_out.cpu()
- compare_onnx_aie_output(onnx_out, aie_out, args.sim_threshold)
-
-
-def compare_decoder_prefill(args):
- device = f'npu:{args.device_id}'
-
- onnx_model = ort.InferenceSession(
- args.decoder_prefill_onnx_path,
- providers=["CPUExecutionProvider"]
- )
-
- assert args.ntokens <= _MAX_TOKEN, f'ntokens can not exceed {_MAX_TOKEN}'
- tokens = np.ones((args.beam_size, args.ntokens), dtype=np.int64)
- audio_features = np.ones((1, _HALF_FRAMES, _HIDDEN), dtype=np.float32)
- pos_embed = np.ones((args.ntokens, _HIDDEN), dtype=np.float32)
- onnx_inputs = {
- 'tokens': ort.OrtValue.ortvalue_from_numpy(tokens),
- 'audio_features': ort.OrtValue.ortvalue_from_numpy(audio_features),
- 'pos_embed': ort.OrtValue.ortvalue_from_numpy(pos_embed)
- }
- output_names = ["logits", "cache_dyn", "cache_sta"]
- onnx_out = onnx_model.run(output_names, onnx_inputs)
-
- aie_inputs = [tokens.astype(np.float32), audio_features, pos_embed]
- for i in range(len(aie_inputs)):
- aie_inputs[i] = torch.from_numpy(aie_inputs[i]).to(device)
-
- mindietorch.set_device(args.device_id)
- stream = mindietorch.npu.Stream(device)
- model = torch.jit.load(args.decoder_prefill_aie_path)
- model.eval().to(device)
-
- with mindietorch.npu.stream(stream):
- aie_out = model(*aie_inputs)
- stream.synchronize()
- if isinstance(aie_out, tuple):
- aie_out = (x.cpu() for x in aie_out)
- else:
- aie_out = aie_out.cpu()
- compare_onnx_aie_output(onnx_out, aie_out, args.sim_threshold)
-
-
-def compare_decoder_decode(args):
- device = f'npu:{args.device_id}'
-
- onnx_model = ort.InferenceSession(
- args.decoder_decode_onnx_path,
- providers=["CPUExecutionProvider"]
- )
-
- assert args.ntokens <= _MAX_TOKEN, f'ntokens can not exceed {_MAX_TOKEN}'
- tokens = np.ones((args.beam_size, 1), dtype=np.int64)
- audio_features = np.ones((1, _HALF_FRAMES, _HIDDEN), dtype=np.float32)
- pos_embed = np.ones((_HIDDEN), dtype=np.float32)
- cache_dyn = np.ones((args.nblocks, _KV_NUM, args.beam_size, args.ntokens, _HIDDEN), dtype=np.float32)
- cache_sta = np.ones((args.nblocks, _KV_NUM, 1, _HALF_FRAMES, _HIDDEN), dtype=np.float32)
- onnx_inputs = {
-        'tokens': ort.OrtValue.ortvalue_from_numpy(tokens),  # audio_features is folded away during ONNX export
- 'pos_embed': ort.OrtValue.ortvalue_from_numpy(pos_embed),
- 'cache_dyn': ort.OrtValue.ortvalue_from_numpy(cache_dyn),
- 'cache_sta': ort.OrtValue.ortvalue_from_numpy(cache_sta)
- }
-
- output_names = ["logits", "new_cache_dyn", "new_cache_sta"]
- onnx_out = onnx_model.run(output_names, onnx_inputs)
-
- aie_inputs = [tokens.astype(np.float32), audio_features, pos_embed, cache_dyn, cache_sta]
- for i in range(len(aie_inputs)):
- aie_inputs[i] = torch.from_numpy(aie_inputs[i]).to(device)
-
- mindietorch.set_device(args.device_id)
- stream = mindietorch.npu.Stream(device)
- model = torch.jit.load(args.decoder_decode_aie_path)
- model.eval().to(device)
-
- with mindietorch.npu.stream(stream):
- aie_out = model(*aie_inputs)
- stream.synchronize()
- if isinstance(aie_out, tuple):
- aie_out = (x.cpu() for x in aie_out)
- else:
- aie_out = aie_out.cpu()
- compare_onnx_aie_output(onnx_out, aie_out, args.sim_threshold)
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- # encoder
- parser.add_argument('--encoder_onnx_path',type=str, default='/tmp/models/encoder.onnx')
- parser.add_argument('--encoder_aie_path', type=str, default='/tmp/models/encoder_compiled.ts')
- # decoder_prefill
- parser.add_argument('--decoder_prefill_onnx_path',type=str, default='/tmp/models/decoder_prefill.onnx')
- parser.add_argument('--decoder_prefill_aie_path', type=str, default='/tmp/models/decoder_prefill_compiled.ts')
- # decoder_decode
- parser.add_argument('--decoder_decode_onnx_path',type=str, default='/tmp/models/decoder_decode.onnx')
- parser.add_argument('--decoder_decode_aie_path', type=str, default='/tmp/models/decoder_decode_compiled.ts')
- parser.add_argument('--sim_threshold', type=float, default=0.99)
- parser.add_argument('--device_id', type=int, default=0)
- parser.add_argument("--beam_size", type=int, default=5)
- parser.add_argument("--ntokens", type=int, default=100)
- parser.add_argument("--nblocks", type=int, default=4)
- args = parser.parse_args()
- return args
-
-
-def main():
- args = parse_args()
-
- print('=== Compare the outputs of ONNX and AIE ===')
-
- print('Start comparing encoder...')
- funcs = [compare_encoder, compare_decoder_prefill, compare_decoder_decode]
- for func in funcs:
- func(args)
-
-
-if __name__ == "__main__":
- main()
diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/mindietorch_infer.patch b/AscendIE/TorchAIE/built-in/audio/Whisper/torch_aie_infer.patch
similarity index 92%
rename from AscendIE/TorchAIE/built-in/audio/Whisper/mindietorch_infer.patch
rename to AscendIE/TorchAIE/built-in/audio/Whisper/torch_aie_infer.patch
index fc7f771847486b8542d41a2a54876304c481399e..62b432f05f9e241170508b108d5b77f08517229c 100644
--- a/AscendIE/TorchAIE/built-in/audio/Whisper/mindietorch_infer.patch
+++ b/AscendIE/TorchAIE/built-in/audio/Whisper/torch_aie_infer.patch
@@ -1,5 +1,5 @@
diff --git a/whisper/decoding.py b/whisper/decoding.py
-index 49485d0..4dccc86 100644
+index 49485d0..345dea7 100644
--- a/whisper/decoding.py
+++ b/whisper/decoding.py
@@ -6,6 +6,7 @@ import torch
@@ -83,7 +83,7 @@ index 49485d0..4dccc86 100644
class SequenceRanker:
def rank(
diff --git a/whisper/model.py b/whisper/model.py
-index a678283..c94a024 100644
+index a678283..c2c2278 100644
--- a/whisper/model.py
+++ b/whisper/model.py
@@ -1,12 +1,14 @@
@@ -106,7 +106,7 @@ index a678283..c94a024 100644
)
self.ln_post = LayerNorm(n_state)
+ self.device = "npu:0"
-+ self.mindietorch_encoder_model = torch.jit.load(
++ self.mindie_encoder_model = torch.jit.load(
+ "/tmp/models/encoder_compiled.ts"
+ ).eval().to(self.device)
@@ -128,7 +128,7 @@ index a678283..c94a024 100644
- x = self.ln_post(x)
- return x
+ x = x.to(self.device)
-+ x = self.mindietorch_encoder_model(x)
++ x = self.mindie_encoder_model(x)
+ return x.cpu()
@@ -154,13 +154,13 @@ index a678283..c94a024 100644
- for block in self.blocks:
- x = block(x, xa, mask=self.mask, kv_cache=kv_cache)
+ self.device = "npu:0"
-+ self.mindietorch_language_detection_model = torch.jit.load(
++ self.mindie_language_detection_model = torch.jit.load(
+ "/tmp/models/language_detection_compiled.ts"
+ ).eval().to(self.device)
-+ self.mindietorch_prefill_model = torch.jit.load(
++ self.mindie_prefill_model = torch.jit.load(
+ "/tmp/models/decoder_prefill_compiled.ts"
+ ).eval().to(self.device)
-+ self.mindietorch_decode_model = torch.jit.load(
++ self.mindie_decode_model = torch.jit.load(
+ "/tmp/models/decoder_decode_compiled.ts"
+ ).eval().to(self.device)
@@ -183,13 +183,13 @@ index a678283..c94a024 100644
+ audio_features_npu = xa.to(self.device)
+ pos_embed_npu = pos_embed.to(self.device)
+ if x.shape[0] != 1:
-+ logits, cache_dyn, cache_sta = self.mindietorch_prefill_model(
++ logits, cache_dyn, cache_sta = self.mindie_prefill_model(
+ tokens_npu,
+ audio_features_npu,
+ pos_embed_npu
+ )
+ else:
-+ logits, cache_dyn, cache_sta = self.mindietorch_language_detection_model(
++ logits, cache_dyn, cache_sta = self.mindie_language_detection_model(
+ tokens_npu,
+ audio_features_npu,
+ pos_embed_npu
@@ -201,7 +201,7 @@ index a678283..c94a024 100644
+ audio_features_npu = xa.to(self.device)
+ pos_embed_npu = pos_embed.to(self.device)
+ cache_dyn_npu = cache_dyn.to(self.device)
-+ logits, cache_dyn, _ = self.mindietorch_decode_model(
++ logits, cache_dyn, _ = self.mindie_decode_model(
+ tokens_npu,
+ audio_features_npu,
+ pos_embed_npu,
diff --git a/AscendIE/TorchAIE/built-in/audio/Whisper/trace_model.patch b/AscendIE/TorchAIE/built-in/audio/Whisper/trace_model.patch
index a35756ff38f412308096baa0caf2da2880b71613..d9c1ff7e29431f8774be6c91dd89f84bbc473221 100644
--- a/AscendIE/TorchAIE/built-in/audio/Whisper/trace_model.patch
+++ b/AscendIE/TorchAIE/built-in/audio/Whisper/trace_model.patch
@@ -1,8 +1,8 @@
diff --git a/whisper/decoding.py b/whisper/decoding.py
-index 49485d0..495fe45 100644
+index 49485d0..4826389 100644
--- a/whisper/decoding.py
+++ b/whisper/decoding.py
-@@ -2,6 +2,7 @@ from dataclasses import dataclass, field, replace
+@@ -2,10 +2,12 @@ from dataclasses import dataclass, field, replace
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Sequence, Tuple, Union
import numpy as np
@@ -10,22 +10,18 @@ index 49485d0..495fe45 100644
import torch
import torch.nn.functional as F
from torch import Tensor
-@@ -49,12 +50,24 @@ def detect_language(
+ from torch.distributions import Categorical
++import mindietorch
+
+ from .audio import CHUNK_LENGTH
+ from .tokenizer import Tokenizer, get_tokenizer
+@@ -49,12 +51,15 @@ def detect_language(
# skip encoder forward pass if already-encoded audio features were given
if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state):
+ encoder_ts_model = torch.jit.trace(model.encoder, mel)
+ encoder_ts_model.save(
+ "/tmp/models/encoder.ts")
-+ torch.onnx.export(
-+ model.encoder,
-+ (mel),
-+ "/tmp/models/encoder.onnx",
-+ opset_version=11,
-+ input_names=["mel"],
-+ output_names=["ret"]
-+ )
-+
mel = model.encoder(mel)
# forward pass using a single token, startoftranscript
@@ -36,7 +32,7 @@ index 49485d0..495fe45 100644
# collect detected languages; suppress all non-language tokens
mask = torch.ones(logits.shape[-1], dtype=torch.bool)
-@@ -145,36 +158,74 @@ class PyTorchInference(Inference):
+@@ -145,36 +150,48 @@ class PyTorchInference(Inference):
def __init__(self, model: "Whisper", initial_token_length: int):
self.model: "Whisper" = model
self.initial_token_length = initial_token_length
@@ -57,44 +53,18 @@ index 49485d0..495fe45 100644
# only need to use the last token except in the first forward pass
tokens = tokens[:, -1:]
+ pos_embed = self.model.decoder.positional_embedding[self.cache_dyn.shape[3]]
-+ torch.onnx.export(
-+ self.model.decoder,
-+ (tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta),
-+ "/tmp/models/decoder_decode.onnx",
-+ opset_version=11,
-+ input_names=["tokens", "audio_features", "pos_embed", "cache_dyn", "cache_sta"],
-+ output_names=["logits", "new_cache_dyn", "new_cache_sta"],
-+ dynamic_axes={
-+ "cache_dyn": {3: "ntokens"},
-+ "new_cache_dyn": {3: "ntokens"}
-+ }
-+ )
+ decoder_decode_ts_model = torch.jit.trace(
+ self.model.decoder,
+ (tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta)
+ )
+ decoder_decode_ts_model.save(
+ "/tmp/models/decoder_decode.ts")
++ os.sys.exit(0)
+ logits, cache_dyn, _ = self.model.decoder(
+ tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta)
-+ os.sys.exit(0)
+ self.cache_dyn = cache_dyn
+ else:
+ pos_embed = self.model.decoder.positional_embedding[:tokens.shape[-1]]
-+ torch.onnx.export(
-+ self.model.decoder,
-+ (tokens, audio_features, pos_embed),
-+ "/tmp/models/decoder_prefill.onnx",
-+ opset_version=11,
-+ input_names=["tokens", "audio_features", "pos_embed"],
-+ output_names=["logits", "cache_dyn", "cache_sta"],
-+ dynamic_axes={
-+ "tokens": {1: "ntokens"},
-+ "pos_embed": {0: "ntokens"},
-+ "logits": {1: "ntokens"},
-+ "cache_dyn": {3: "ntokens"}
-+ }
-+ )
+ decoder_prefill_ts_model = torch.jit.trace(
+ self.model.decoder,
+ (tokens, audio_features, pos_embed)
@@ -131,10 +101,10 @@ index 49485d0..495fe45 100644
class SequenceRanker:
def rank(
diff --git a/whisper/model.py b/whisper/model.py
-index a678283..2a95e28 100644
+index a678283..a20aaad 100644
--- a/whisper/model.py
+++ b/whisper/model.py
-@@ -1,6 +1,7 @@
+@@ -1,12 +1,14 @@
import base64
import gzip
from dataclasses import dataclass
@@ -142,7 +112,14 @@ index a678283..2a95e28 100644
from typing import Dict, Iterable, Optional
import numpy as np
-@@ -68,6 +69,63 @@ class MultiHeadAttention(nn.Module):
+ import torch
+ import torch.nn.functional as F
+ from torch import Tensor, nn
++import mindietorch
+
+ from .decoding import decode as decode_function
+ from .decoding import detect_language as detect_language_function
+@@ -68,6 +70,63 @@ class MultiHeadAttention(nn.Module):
self.value = Linear(n_state, n_state)
self.out = Linear(n_state, n_state)
@@ -206,7 +183,7 @@ index a678283..2a95e28 100644
def forward(
self,
x: Tensor,
-@@ -126,6 +184,39 @@ class ResidualAttentionBlock(nn.Module):
+@@ -126,6 +185,39 @@ class ResidualAttentionBlock(nn.Module):
)
self.mlp_ln = LayerNorm(n_state)
@@ -246,7 +223,7 @@ index a678283..2a95e28 100644
def forward(
self,
x: Tensor,
-@@ -163,11 +254,10 @@ class AudioEncoder(nn.Module):
+@@ -163,11 +255,10 @@ class AudioEncoder(nn.Module):
x = F.gelu(self.conv2(x))
x = x.permute(0, 2, 1)
@@ -259,7 +236,7 @@ index a678283..2a95e28 100644
x = self.ln_post(x)
return x
-@@ -193,29 +283,56 @@ class TextDecoder(nn.Module):
+@@ -193,29 +284,56 @@ class TextDecoder(nn.Module):
mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1)
self.register_buffer("mask", mask, persistent=False)
@@ -331,7 +308,7 @@ index a678283..2a95e28 100644
class Whisper(nn.Module):
-@@ -257,7 +374,8 @@ class Whisper(nn.Module):
+@@ -257,7 +375,8 @@ class Whisper(nn.Module):
return self.encoder(mel)
def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor):