diff --git a/mxRAG/MainRepo/patches/README.md b/mxRAG/MainRepo/patches/README.md
index cf4bd9f082f1287faacbfc90fc165eda52021447..14a98f11ad38f026532ad7a2bbc4668858744493 100644
--- a/mxRAG/MainRepo/patches/README.md
+++ b/mxRAG/MainRepo/patches/README.md
@@ -4,4 +4,4 @@
 |----------|-------------------------------------------------------------------------------------------------|
 | TEI | Adapts Hugging Face's text-embeddings-inference to Ascend torch_npu, making it easy to run a high-performance TEI service on Ascend hardware. |
 | optimize | Deeply optimizes common embedding and reranker models from transformers, including fused operators and model-compute optimizations; brings performance gains whether the models run locally or behind a TEI service |
-| whisper | Adapts OpenAI's whisper to Ascend torch_npu, making it easy to use whisper on Ascend hardware. |
\ No newline at end of file
+|
\ No newline at end of file
diff --git a/mxRAG/MainRepo/patches/whisper/README.md b/mxRAG/MainRepo/patches/whisper/README.md
deleted file mode 100644
index 63758d4e397d7eab49f64b6d7ac56cfa10b01522..0000000000000000000000000000000000000000
--- a/mxRAG/MainRepo/patches/whisper/README.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# openai-whisper patch installation guide
-
-## Environment setup
-
-Reference: https://gitee.com/ascend/ModelZoo-PyTorch/tree/master/MindIE/MindIE-Torch/built-in/audio/Whisper
-
-| Component | Required version |
-|---------|---------|
-| CANN | 8.0.RC2 |
-| MindIE | 1.0.RC2 |
-| Python | 3.10.X |
-| PyTorch | 2.1.0 |
-| ffmpeg | 4.2.7 |
-| onnx | 1.16.1 |
-
-
-1. Before installing MindIE, source the toolkit environment variables, then run the installer directly. Using the default install path /usr/local/Ascend as an example:
-```sh
-source /usr/local/Ascend/ascend-toolkit/set_env.sh
-bash Ascend-mindie_*.run --install
-```
-2. On Ubuntu, ffmpeg can be installed with the apt-get install ffmpeg command.
-
-## Applying the patch
-1. Enter the patches/whisper directory and download the zh.wav audio file.
-```sh
-wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
-```
-2. Apply the patch from the patches/whisper directory.
-```sh
-bash whisper_patch.sh
-```
-Notes:
-1. The reference design defaults to the tiny model, an Atlas 300I Duo inference card, and DEVICE 0. To change the model, device type, or device ID, edit the MODEL_NAME, SOC_VERSION, and DEVICE parameters in whisper_patch.sh before applying the patch.
-
-## Model inference
-1. Command-line usage
-```sh
-whisper zh.wav --model tiny
-```
-Here zh.wav is the audio file to transcribe; supported audio formats include M4A, MP3, MP4, MPEG, MPGA, WAV, and WEBM.
-tiny is the model to use; the tiny, base, small, medium, and large models are supported.
-
-2. Python usage
-
-```python
-from whisper import load_model
-from whisper.transcribe import transcribe
-# Load the model
-model = load_model('tiny')
-# Transcribe the audio
-result = transcribe(model, audio="zh.wav", verbose=False, beam_size=5, temperature=0)
-print(result['text'])
-```
-3. Expected output
-```commandline
-"我認爲跑步最重要的事就是給我帶來了身體健康"
-```
-## Parameter notes
-For whisper interface details, refer to the official openai-whisper API: https://github.com/openai/whisper
-
-Because the model is recompiled for the NPU adaptation, note the following two points:
-
-1. whisper.load_model loads the model; the name argument must be the name of the locally compiled model, and a different model must be recompiled before use. The download_root argument defaults to the export path of the compiled model and does not need to be set.
-
-2. whisper.transcribe.transcribe performs transcription; when temperature is the default value 0, beam_size=5 must be declared, and when temperature is any non-zero value, best_of=5 must be declared.
diff --git a/mxRAG/MainRepo/patches/whisper/compile.py b/mxRAG/MainRepo/patches/whisper/compile.py
deleted file mode 100644
index 6b3471efdff13a1e3064778115a90fec67f6814b..0000000000000000000000000000000000000000
--- a/mxRAG/MainRepo/patches/whisper/compile.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-
-import torch
-import mindietorch
-
-_FRAMES = 3000
-_HALF_FRAMES = 1500
-_MAX_TOKEN = 448
-_KV_NUM = 2
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="mindietorch model compilation")
-    parser.add_argument("--model_path", default="/tmp/models")
-    parser.add_argument("--beam_size", type=int, default=5)
-    parser.add_argument("--nblocks", type=int, default=4)
-    parser.add_argument("--hidden", type=int, default=384)
-    parser.add_argument("--n_mels", type=int, default=80)
-    parser.add_argument("--soc_version", default="Ascend310P3")
-    args = parser.parse_args()
-    return args
-
-def compile_and_save(ts_model, input_info, soc_version, save_path):
-    ts_model.eval()
-    mindie_model = mindietorch.compile(
-        ts_model,
-        inputs=input_info,
-        precision_policy=mindietorch.PrecisionPolicy.FP16,
-        truncate_long_and_double=True,
-        allow_tensor_replace_int=True,
-        soc_version=soc_version,
-        optimization_level=0
-    )
-    mindie_model.save(save_path)
-
-def encoder(args):
-    ts_model = torch.jit.load(f"{args.model_path}/encoder.ts")
-    input_mel_info = mindietorch.Input([1, args.n_mels, _FRAMES])
-    input_info = [input_mel_info]
-    save_path = f"{args.model_path}/encoder_compiled.ts"
-    compile_and_save(ts_model, input_info, args.soc_version, save_path)
-
-def language(args):
-    ts_model = torch.jit.load(f"{args.model_path}/decoder_prefill.ts")
-    input_tokens_info = mindietorch.Input([1, 1])
-    input_audio_features_info = mindietorch.Input([1, _HALF_FRAMES, args.hidden])
-    input_pos_embed_info = mindietorch.Input([1, args.hidden])
-    input_info = [
-        input_tokens_info,
-        input_audio_features_info,
-        input_pos_embed_info,
-    ]
-    save_path = f"{args.model_path}/language_detection_compiled.ts"
-    compile_and_save(ts_model, input_info, args.soc_version, save_path)
-
-def prefill(args):
-    ts_model = torch.jit.load(f"{args.model_path}/decoder_prefill.ts")
-
-    input_tokens_info = mindietorch.Input(
-        min_shape=[args.beam_size, 1],
-        max_shape=[args.beam_size, _MAX_TOKEN]
-    )
-    input_audio_features_info = mindietorch.Input(
-        min_shape=[1, _HALF_FRAMES, args.hidden],
-        max_shape=[1, _HALF_FRAMES, args.hidden]
-    )
-    input_pos_embed_info = mindietorch.Input(
-        min_shape=[1, args.hidden],
-        max_shape=[_MAX_TOKEN, args.hidden]
-    )
-    input_info = [
-        input_tokens_info,
-        input_audio_features_info,
-        input_pos_embed_info,
-    ]
-    save_path = f"{args.model_path}/decoder_prefill_compiled.ts"
-    compile_and_save(ts_model, input_info, args.soc_version, save_path)
-
-def decode(args):
-    ts_model = torch.jit.load(f"{args.model_path}/decoder_decode.ts")
-
-    input_tokens_info = mindietorch.Input(
-        min_shape=[args.beam_size, 1],
-        max_shape=[args.beam_size, 1]
-    )
-    input_audio_features_info = mindietorch.Input(
-        min_shape=[1, _HALF_FRAMES, args.hidden],
-        max_shape=[1, _HALF_FRAMES, args.hidden]
-    )
-    input_pos_embed_info = mindietorch.Input(
-        min_shape=[args.hidden],
-        max_shape=[args.hidden]
-    )
-    input_cache_dyn_info = mindietorch.Input(
-        min_shape=(args.nblocks, _KV_NUM, args.beam_size, 1, args.hidden),
-        max_shape=(args.nblocks, _KV_NUM, args.beam_size, _MAX_TOKEN, args.hidden)
-    )
-    input_cache_sta_info = mindietorch.Input(
-        min_shape=[args.nblocks, _KV_NUM, 1, _HALF_FRAMES, args.hidden],
-        max_shape=[args.nblocks, _KV_NUM, 1, _HALF_FRAMES, args.hidden]
-    )
-
-    input_info = [
-        input_tokens_info,
-        input_audio_features_info,
-        input_pos_embed_info,
-        input_cache_dyn_info,
-        input_cache_sta_info
-    ]
-
-    save_path = f"{args.model_path}/decoder_decode_compiled.ts"
-    compile_and_save(ts_model, input_info, args.soc_version, save_path)
-
-def main():
-    args = parse_args()
-    for func in encoder, language, prefill, decode:
-        func(args)
-
-if __name__ == '__main__':
-    main()
\ No newline at end of file
diff --git a/mxRAG/MainRepo/patches/whisper/mindietorch_infer.patch b/mxRAG/MainRepo/patches/whisper/mindietorch_infer.patch
deleted file mode 100644
index fc7f771847486b8542d41a2a54876304c481399e..0000000000000000000000000000000000000000
--- a/mxRAG/MainRepo/patches/whisper/mindietorch_infer.patch
+++ /dev/null
@@ -1,226 +0,0 @@
-diff --git a/whisper/decoding.py b/whisper/decoding.py
-index 49485d0..4dccc86 100644
---- a/whisper/decoding.py
-+++ b/whisper/decoding.py
-@@ -6,6 +6,7 @@ import torch
- import torch.nn.functional as F
- from torch import Tensor
- from torch.distributions import Categorical
-+import mindietorch
-
- from .audio import CHUNK_LENGTH
- from .tokenizer import Tokenizer, get_tokenizer
-@@ -14,6 +15,7 @@ from .utils import compression_ratio
- if TYPE_CHECKING:
-     from .model import Whisper
-
-+mindietorch.set_device(0)
-
- @torch.no_grad()
- def detect_language(
-@@ -54,7 +56,7 @@ def detect_language(
-     # forward pass using a single token, startoftranscript
-     n_audio = mel.shape[0]
-     x = torch.tensor([[tokenizer.sot]] * n_audio).to(mel.device)  # [n_audio, 1]
--    logits = model.logits(x, mel)[:, 0]
-+    logits = model.logits(x, mel)[0][:, 0]
-
-     # collect detected languages; suppress all non-language tokens
-     mask = torch.ones(logits.shape[-1], dtype=torch.bool)
-@@ -145,36 +147,35 @@ class PyTorchInference(Inference):
-     def __init__(self, model: "Whisper", initial_token_length: int):
-         self.model: "Whisper" = model
-         self.initial_token_length = initial_token_length
--        self.kv_cache = {}
--        self.hooks = []
--
--        key_modules = [block.attn.key for block in self.model.decoder.blocks]
--        value_modules = [block.attn.value for block in self.model.decoder.blocks]
--        self.kv_modules = key_modules + value_modules
-+        self.cache_dyn = None
-+        self.cache_sta = None
-
-     def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
--        if not self.kv_cache:
--            self.kv_cache, self.hooks = self.model.install_kv_cache_hooks()
--
-         if tokens.shape[-1] > self.initial_token_length:
-             # only need to use the last token except in the first forward pass
-             tokens = tokens[:, -1:]
-+            pos_embed = self.model.decoder.positional_embedding[self.cache_dyn.shape[3]]
-+            logits, cache_dyn, _ = self.model.decoder(
-+                tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta)
-+            self.cache_dyn = cache_dyn
-+        else:
-+            pos_embed = self.model.decoder.positional_embedding[:tokens.shape[-1]]
-+            logits, cache_dyn, cache_sta = self.model.decoder(tokens, audio_features, pos_embed)
-+            self.cache_dyn = cache_dyn
-+            self.cache_sta = cache_sta
-
--        return self.model.decoder(tokens, audio_features, kv_cache=self.kv_cache)
-+        return logits
-
-     def cleanup_caching(self):
--        for hook in self.hooks:
--            hook.remove()
--
--        self.kv_cache = {}
--        self.hooks = []
-+        self.cache_dyn = None
-+        self.cache_sta = None
-
-     def rearrange_kv_cache(self, source_indices):
-         if source_indices != list(range(len(source_indices))):
--            for module in self.kv_modules:
--                # update the key/value cache to contain the selected sequences
--                self.kv_cache[module] = self.kv_cache[module][source_indices].detach()
--
-+            blocks = self.cache_dyn.shape[0]
-+            for i in range(blocks):
-+                for j in range(2):  # k and v 2 items
-+                    self.cache_dyn[i][j] = self.cache_dyn[i][j][source_indices]
-
- class SequenceRanker:
-     def rank(
-diff --git a/whisper/model.py b/whisper/model.py
-index a678283..c94a024 100644
---- a/whisper/model.py
-+++ b/whisper/model.py
-@@ -1,12 +1,14 @@
- import base64
- import gzip
- from dataclasses import dataclass
-+import os
- from typing import Dict, Iterable, Optional
-
- import numpy as np
- import torch
- import torch.nn.functional as F
- from torch import Tensor, nn
-+import mindietorch
-
- from .decoding import decode as decode_function
- from .decoding import detect_language as detect_language_function
-@@ -153,24 +155,19 @@ class AudioEncoder(nn.Module):
-             [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
-         )
-         self.ln_post = LayerNorm(n_state)
-+        self.device = "npu:0"
-+        self.mindietorch_encoder_model = torch.jit.load(
-+            "/tmp/models/encoder_compiled.ts"
-+        ).eval().to(self.device)
-
-     def forward(self, x: Tensor):
-         """
-         x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
-             the mel spectrogram of the audio
-         """
--        x = F.gelu(self.conv1(x))
--        x = F.gelu(self.conv2(x))
--        x = x.permute(0, 2, 1)
--
--        assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
--        x = (x + self.positional_embedding).to(x.dtype)
--
--        for block in self.blocks:
--            x = block(x)
--
--        x = self.ln_post(x)
--        return x
-+        x = x.to(self.device)
-+        x = self.mindietorch_encoder_model(x)
-+        return x.cpu()
-
-
- class TextDecoder(nn.Module):
-@@ -193,29 +190,58 @@ class TextDecoder(nn.Module):
-         mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1)
-         self.register_buffer("mask", mask, persistent=False)
-
--    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
--        """
--        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
--            the text tokens
--        xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state)
--            the encoded audio features to be attended on
--        """
--        offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
--        x = (
--            self.token_embedding(x)
--            + self.positional_embedding[offset : offset + x.shape[-1]]
--        )
--        x = x.to(xa.dtype)
--
--        for block in self.blocks:
--            x = block(x, xa, mask=self.mask, kv_cache=kv_cache)
-+        self.device = "npu:0"
-+        self.mindietorch_language_detection_model = torch.jit.load(
-+            "/tmp/models/language_detection_compiled.ts"
-+        ).eval().to(self.device)
-+        self.mindietorch_prefill_model = torch.jit.load(
-+            "/tmp/models/decoder_prefill_compiled.ts"
-+        ).eval().to(self.device)
-+        self.mindietorch_decode_model = torch.jit.load(
-+            "/tmp/models/decoder_decode_compiled.ts"
-+        ).eval().to(self.device)
-
--        x = self.ln(x)
--        logits = (
--            x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)
--        ).float()
--
--        return logits
-+    def forward(
-+        self,
-+        x: Tensor,
-+        xa: Tensor,
-+        pos_embed: Tensor = None,
-+        cache_dyn: Tensor = None,
-+        cache_sta: Tensor = None,
-+    ):
-+        if cache_dyn is None:
-+            tokens_npu = x.float().to(self.device)
-+            audio_features_npu = xa.to(self.device)
-+            pos_embed_npu = pos_embed.to(self.device)
-+            if x.shape[0] != 1:
-+                logits, cache_dyn, cache_sta = self.mindietorch_prefill_model(
-+                    tokens_npu,
-+                    audio_features_npu,
-+                    pos_embed_npu
-+                )
-+            else:
-+                logits, cache_dyn, cache_sta = self.mindietorch_language_detection_model(
-+                    tokens_npu,
-+                    audio_features_npu,
-+                    pos_embed_npu
-+                )
-+            logits = logits.cpu()
-+            cache_dyn = cache_dyn.cpu()
-+        else:
-+            tokens_npu = x.float().to(self.device)
-+            audio_features_npu = xa.to(self.device)
-+            pos_embed_npu = pos_embed.to(self.device)
-+            cache_dyn_npu = cache_dyn.to(self.device)
-+            logits, cache_dyn, _ = self.mindietorch_decode_model(
-+                tokens_npu,
-+                audio_features_npu,
-+                pos_embed_npu,
-+                cache_dyn_npu,
-+                cache_sta
-+            )
-+            logits = logits.cpu()
-+            cache_dyn = cache_dyn.cpu()
-+        return logits, cache_dyn, cache_sta
-
-
- class Whisper(nn.Module):
-@@ -257,7 +283,8 @@ class Whisper(nn.Module):
-         return self.encoder(mel)
-
-     def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor):
--        return self.decoder(tokens, audio_features)
-+        pos_embed = self.decoder.positional_embedding[:tokens.shape[-1]]
-+        return self.decoder(tokens, audio_features, pos_embed)
-
-     def forward(
-         self, mel: torch.Tensor, tokens: torch.Tensor
diff --git a/mxRAG/MainRepo/patches/whisper/trace_model.patch b/mxRAG/MainRepo/patches/whisper/trace_model.patch
deleted file mode 100644
index 903444eddb385a3f8e8ed4e09e0c0eeeeea1f9e0..0000000000000000000000000000000000000000
--- a/mxRAG/MainRepo/patches/whisper/trace_model.patch
+++ /dev/null
@@ -1,343 +0,0 @@
-diff --git a/whisper/decoding.py b/whisper/decoding.py
-index 49485d0..495fe45 100644
---- a/whisper/decoding.py
-+++ b/whisper/decoding.py
-@@ -2,6 +2,7 @@ from dataclasses import dataclass, field, replace
- from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Sequence, Tuple, Union
-
- import numpy as np
-+import os
- import torch
- import torch.nn.functional as F
- from torch import Tensor
-@@ -49,12 +50,24 @@ def detect_language(
-
-     # skip encoder forward pass if already-encoded audio features were given
-     if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state):
-+        encoder_ts_model = torch.jit.trace(model.encoder, mel)
-+        encoder_ts_model.save(
-+            "/tmp/models/encoder.ts")
-+        torch.onnx.export(
-+            model.encoder,
-+            (mel),
-+            "/tmp/models/onnx/encode/encoder.onnx",
-+            opset_version=11,
-+            input_names=["mel"],
-+            output_names=["ret"]
-+        )
-+
-         mel = model.encoder(mel)
-
-     # forward pass using a single token, startoftranscript
-     n_audio = mel.shape[0]
-     x = torch.tensor([[tokenizer.sot]] * n_audio).to(mel.device)  # [n_audio, 1]
--    logits = model.logits(x, mel)[:, 0]
-+    logits = model.logits(x, mel)[0][:, 0]
-
-     # collect detected languages; suppress all non-language tokens
-     mask = torch.ones(logits.shape[-1], dtype=torch.bool)
-@@ -145,36 +158,74 @@ class PyTorchInference(Inference):
-     def __init__(self, model: "Whisper", initial_token_length: int):
-         self.model: "Whisper" = model
-         self.initial_token_length = initial_token_length
--        self.kv_cache = {}
--        self.hooks = []
--
--        key_modules = [block.attn.key for block in self.model.decoder.blocks]
--        value_modules = [block.attn.value for block in self.model.decoder.blocks]
--        self.kv_modules = key_modules + value_modules
-+        self.cache_dyn = None
-+        self.cache_sta = None
-
-     def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
--        if not self.kv_cache:
--            self.kv_cache, self.hooks = self.model.install_kv_cache_hooks()
--
-         if tokens.shape[-1] > self.initial_token_length:
-             # only need to use the last token except in the first forward pass
-             tokens = tokens[:, -1:]
-+            pos_embed = self.model.decoder.positional_embedding[self.cache_dyn.shape[3]]
-+            torch.onnx.export(
-+                self.model.decoder,
-+                (tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta),
-+                "/tmp/models/onnx/decode/decoder_decode.onnx",
-+                opset_version=11,
-+                input_names=["tokens", "audio_features", "pos_embed", "cache_dyn", "cache_sta"],
-+                output_names=["logits", "new_cache_dyn", "new_cache_sta"],
-+                dynamic_axes={
-+                    "cache_dyn": {3: "ntokens"},
-+                    "new_cache_dyn": {3: "ntokens"}
-+                }
-+            )
-+            decoder_decode_ts_model = torch.jit.trace(
-+                self.model.decoder,
-+                (tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta)
-+            )
-+            decoder_decode_ts_model.save(
-+                "/tmp/models/decoder_decode.ts")
-+            logits, cache_dyn, _ = self.model.decoder(
-+                tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta)
-+            os.sys.exit(0)
-+            self.cache_dyn = cache_dyn
-+        else:
-+            pos_embed = self.model.decoder.positional_embedding[:tokens.shape[-1]]
-+            torch.onnx.export(
-+                self.model.decoder,
-+                (tokens, audio_features, pos_embed),
-+                "/tmp/models/onnx/prefill/decoder_prefill.onnx",
-+                opset_version=11,
-+                input_names=["tokens", "audio_features", "pos_embed"],
-+                output_names=["logits", "cache_dyn", "cache_sta"],
-+                dynamic_axes={
-+                    "tokens": {1: "ntokens"},
-+                    "pos_embed": {0: "ntokens"},
-+                    "logits": {1: "ntokens"},
-+                    "cache_dyn": {3: "ntokens"}
-+                }
-+            )
-+            decoder_prefill_ts_model = torch.jit.trace(
-+                self.model.decoder,
-+                (tokens, audio_features, pos_embed)
-+            )
-+            decoder_prefill_ts_model.save(
-+                "/tmp/models/decoder_prefill.ts")
-+            logits, cache_dyn, cache_sta = self.model.decoder(tokens, audio_features, pos_embed)
-+            self.cache_dyn = cache_dyn
-+            self.cache_sta = cache_sta
-
--        return self.model.decoder(tokens, audio_features, kv_cache=self.kv_cache)
-+        return logits
-
-     def cleanup_caching(self):
--        for hook in self.hooks:
--            hook.remove()
--
--        self.kv_cache = {}
--        self.hooks = []
-+        self.cache_dyn = None
-+        self.cache_sta = None
-
-     def rearrange_kv_cache(self, source_indices):
-         if source_indices != list(range(len(source_indices))):
--            for module in self.kv_modules:
--                # update the key/value cache to contain the selected sequences
--                self.kv_cache[module] = self.kv_cache[module][source_indices].detach()
--
-+            blocks = self.cache_dyn.shape[0]
-+            for i in range(blocks):
-+                for j in range(2):  # k and v 2 items
-+                    self.cache_dyn[i][j] = self.cache_dyn[i][j][source_indices]
-
- class SequenceRanker:
-     def rank(
-diff --git a/whisper/model.py b/whisper/model.py
-index a678283..2a95e28 100644
---- a/whisper/model.py
-+++ b/whisper/model.py
-@@ -1,6 +1,7 @@
- import base64
- import gzip
- from dataclasses import dataclass
-+import os
- from typing import Dict, Iterable, Optional
-
- import numpy as np
-@@ -68,6 +69,63 @@ class MultiHeadAttention(nn.Module):
-         self.value = Linear(n_state, n_state)
-         self.out = Linear(n_state, n_state)
-
-+    def encoder_forward(self, x: Tensor):
-+        q = self.query(x)
-+        k = self.key(x)
-+        v = self.value(x)
-+        wv, qk = self.qkv_attention(q, k, v)
-+        return self.out(wv)
-+
-+    def prefill_self_attn_forward(
-+        self,
-+        x: Tensor,
-+        mask: Tensor,
-+    ):
-+        q = self.query(x)
-+        k = self.key(x)
-+        v = self.value(x)
-+        cache_dyn = torch.stack([k, v])
-+        wv, _ = self.qkv_attention(q, k, v, mask)
-+        return self.out(wv), cache_dyn
-+
-+    def prefill_cross_attn_forward(
-+        self,
-+        x: Tensor,
-+        xa: Tensor,
-+    ):
-+        q = self.query(x)
-+        k = self.key(xa)
-+        v = self.value(xa)
-+        cache_sta = torch.stack([k, v])
-+        wv, _ = self.qkv_attention(q, k, v)
-+        return self.out(wv), cache_sta
-+
-+    def decode_self_attn_forward(
-+        self,
-+        x: Tensor,
-+        mask: Tensor,
-+        cache_dyn: Tensor
-+    ):
-+        q = self.query(x)
-+        token_k = self.key(x)
-+        k = torch.cat([cache_dyn[0], token_k], dim=1).detach()
-+        token_v = self.value(x)
-+        v = torch.cat([cache_dyn[1], token_v], dim=1).detach()
-+        new_cache_dyn = torch.stack([k, v])
-+        wv, _ = self.qkv_attention(q, k, v, mask)
-+        return self.out(wv), new_cache_dyn
-+
-+    def decode_cross_attn_forward(
-+        self,
-+        x: Tensor,
-+        cache_sta: Tensor
-+    ):
-+        q = self.query(x)
-+        k = cache_sta[0]
-+        v = cache_sta[1]
-+        wv, _ = self.qkv_attention(q, k, v)
-+        return self.out(wv)
-+
-     def forward(
-         self,
-         x: Tensor,
-@@ -126,6 +184,39 @@ class ResidualAttentionBlock(nn.Module):
-         )
-         self.mlp_ln = LayerNorm(n_state)
-
-+    def encoder_forward(self, x: Tensor):
-+        x = x + self.attn.encoder_forward(self.attn_ln(x))
-+        x = x + self.mlp(self.mlp_ln(x))
-+        return x
-+
-+    def prefill_forward(
-+        self,
-+        x: Tensor,
-+        xa: Tensor,
-+        mask: Tensor,
-+    ):
-+        self_attn_out, new_cache_dyn = self.attn.prefill_self_attn_forward(self.attn_ln(x), mask)
-+        x = x + self_attn_out
-+        cross_attn_out, new_cache_sta = self.cross_attn.prefill_cross_attn_forward(self.cross_attn_ln(x), xa)
-+        x = x + cross_attn_out
-+        x = x + self.mlp(self.mlp_ln(x))
-+        return x, new_cache_dyn, new_cache_sta
-+
-+    def decode_forward(
-+        self,
-+        x: Tensor,
-+        xa: Tensor,
-+        mask: Tensor,
-+        cache_dyn: Tensor,
-+        cache_sta: Tensor
-+    ):
-+        self_attn_out, new_cache_dyn = self.attn.decode_self_attn_forward(self.attn_ln(x), mask, cache_dyn)
-+        x = x + self_attn_out
-+        cross_attn_out = self.cross_attn.decode_cross_attn_forward(self.cross_attn_ln(x), cache_sta)
-+        x = x + cross_attn_out
-+        x = x + self.mlp(self.mlp_ln(x))
-+        return x, new_cache_dyn
-+
-     def forward(
-         self,
-         x: Tensor,
-@@ -163,11 +254,10 @@ class AudioEncoder(nn.Module):
-         x = F.gelu(self.conv2(x))
-         x = x.permute(0, 2, 1)
-
--        assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
-         x = (x + self.positional_embedding).to(x.dtype)
-
-         for block in self.blocks:
--            x = block(x)
-+            x = block.encoder_forward(x)
-
-         x = self.ln_post(x)
-         return x
-@@ -193,29 +283,56 @@ class TextDecoder(nn.Module):
-         mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1)
-         self.register_buffer("mask", mask, persistent=False)
-
--    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
--        """
--        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
--            the text tokens
--        xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state)
--            the encoded audio features to be attended on
--        """
--        offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
--        x = (
--            self.token_embedding(x)
--            + self.positional_embedding[offset : offset + x.shape[-1]]
--        )
--        x = x.to(xa.dtype)
-+    def prefill(self, x: Tensor, xa: Tensor, pos_embed: Tensor):
-+        x = (self.token_embedding(x) + pos_embed).to(xa.dtype)
-
-+        cache_dyn_list = []
-+        cache_sta_list = []
-         for block in self.blocks:
--            x = block(x, xa, mask=self.mask, kv_cache=kv_cache)
-+            x, new_cache_dyn, new_cache_sta = block.prefill_forward(x, xa, self.mask)
-+            cache_dyn_list.append(new_cache_dyn)
-+            cache_sta_list.append(new_cache_sta)
-+
-+        cache_dyn = torch.stack(cache_dyn_list)
-+        cache_sta = torch.stack(cache_sta_list)
-
-         x = self.ln(x)
-         logits = (
-             x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)
-         ).float()
-
--        return logits
-+        return logits, cache_dyn, cache_sta
-+
-+    def decode(self, x: Tensor, xa: Tensor, pos_embed: Tensor, cache_dyn: Tensor, cache_sta: Tensor):
-+        x = (self.token_embedding(x) + pos_embed).to(xa.dtype)
-+
-+        cache_dyn_list = []
-+        for idx, block in enumerate(self.blocks):
-+            x, new_cache_dyn = block.decode_forward(x, xa, self.mask, cache_dyn[idx], cache_sta[idx])
-+            cache_dyn_list.append(new_cache_dyn)
-+
-+        new_cache_dyn = torch.stack(cache_dyn_list)
-+
-+        x = self.ln(x)
-+        logits = (
-+            x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)
-+        ).float()
-+
-+        return logits, new_cache_dyn
-+
-+    def forward(
-+        self,
-+        x: Tensor,
-+        xa: Tensor,
-+        pos_embed: Tensor = None,
-+        cache_dyn: Tensor = None,
-+        cache_sta: Tensor = None,
-+    ):
-+        if cache_dyn is None:
-+            logits, cache_dyn, cache_sta = self.prefill(x, xa, pos_embed)
-+        else:
-+            logits, cache_dyn = self.decode(x, xa, pos_embed, cache_dyn, cache_sta)
-+        return logits, cache_dyn, cache_sta
-
-
- class Whisper(nn.Module):
-@@ -257,7 +374,8 @@ class Whisper(nn.Module):
-         return self.encoder(mel)
-
-     def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor):
--        return self.decoder(tokens, audio_features)
-+        pos_embed = self.decoder.positional_embedding[:tokens.shape[-1]]
-+        return self.decoder(tokens, audio_features, pos_embed)
-
-     def forward(
-         self, mel: torch.Tensor, tokens: torch.Tensor
diff --git a/mxRAG/MainRepo/patches/whisper/whisper_patch.sh b/mxRAG/MainRepo/patches/whisper/whisper_patch.sh
deleted file mode 100644
index 5779dca9fd023297d8d18623c01337d10518bd52..0000000000000000000000000000000000000000
--- a/mxRAG/MainRepo/patches/whisper/whisper_patch.sh
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash
-# Run this one-shot patch script from the patches/whisper directory
-set -e
-
-MODEL_NAME='tiny'
-DEVICE=0
-SOC_VERSION="Ascend310P3"
-
-PATCH_DIR=$(dirname $(readlink -f $0))
-dist_packages=$(python3 -c "import site;print(site.getsitepackages()[0])")
-dist_packages_path=$(echo "$dist_packages" | sed "s/[',\[\]]//g")
-echo "dist-packages path is: $dist_packages_path"
-
-# Initialize the per-model parameters
-declare -A params
-params[tiny]="4 384 80"
-params[base]="6 512 80"
-params[small]="12 768 80"
-params[medium]="24 1024 80"
-params[large]="32 1280 128"
-
-# Extract the parameters for the selected model
-IFS=' ' read -r -a model_params <<< "${params[$MODEL_NAME]}"
-N_BLOCKS=${model_params[0]}
-HIDDEN=${model_params[1]}
-N_MEIS=${model_params[2]}
-
-# Write the DEVICE parameter into the patch
-sed -E -i "s/set_device\([0-9]+\)/set_device($DEVICE)/g" mindietorch_infer.patch
-sed -E -i "s/npu:[0-9]+/npu:$DEVICE/g" mindietorch_infer.patch
-echo "set device is: $DEVICE"
-
-function install_packages(){
-    pip3 install onnx==1.16.1
-    pip3 uninstall -y openai-whisper==20231117
-    pip3 install openai-whisper==20231117
-
-}
-
-function patch_trace_model(){
-    cd $dist_packages_path
-    patch -p1 < $PATCH_DIR/trace_model.patch
-    cd $PATCH_DIR
-    DIRS=("/tmp/models"
-          "/tmp/models/onnx"
-          "/tmp/models/onnx/encode"
-          "/tmp/models/onnx/decode"
-          "/tmp/models/onnx/prefill"
-    )
-    for dir in "${DIRS[@]}"; do
-        if [ ! -d "$dir" ]; then
-            mkdir -p "$dir"
-            echo "Directory $dir created."
-        else
-            echo "Directory $dir already exists."
-        fi
-    done
-    whisper zh.wav --model $MODEL_NAME
-}
-
-function compile_model(){
-    source /usr/local/Ascend/ascend-toolkit/set_env.sh
-    source /usr/local/Ascend/mindie/set_env.sh
-    source /usr/local/Ascend/ascend-toolkit/set_env.sh
-    python3 compile.py --nblocks $N_BLOCKS --hidden $HIDDEN --n_mels $N_MEIS --soc_version $SOC_VERSION
-}
-
-function patch_mindietorch(){
-    pip3 uninstall -y openai-whisper==20231117
-    pip3 install openai-whisper==20231117
-    cd $dist_packages_path
-    patch -p1 < $PATCH_DIR/mindietorch_infer.patch
-}
-
-function main(){
-    install_packages
-    patch_trace_model
-    compile_model
-    patch_mindietorch
-}
-
-main
\ No newline at end of file
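For reference, the deleted README's parameter notes describe two call patterns for the NPU-compiled model but only show one of them in code: `beam_size=5` is required when `temperature` is the default 0, and `best_of=5` is required for any non-zero `temperature`. A minimal sketch of both patterns, assuming the patch has already been applied, the `tiny` model has been compiled locally, and the `zh.wav` sample from the README is present in the working directory:

```python
from whisper import load_model
from whisper.transcribe import transcribe

# Assumes the NPU patch is applied and 'tiny' was compiled locally per the README.
model = load_model('tiny')

# Default temperature of 0: beam_size=5 must be declared.
beam_result = transcribe(model, audio="zh.wav", verbose=False, beam_size=5, temperature=0)

# Any non-zero temperature (0.2 here is an arbitrary example): best_of=5 must be declared instead.
sampled_result = transcribe(model, audio="zh.wav", verbose=False, best_of=5, temperature=0.2)

print(beam_result['text'])
print(sampled_result['text'])
```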