diff --git a/mxRAG/MainRepo/patches/README.md b/mxRAG/MainRepo/patches/README.md
index cf4bd9f082f1287faacbfc90fc165eda52021447..14a98f11ad38f026532ad7a2bbc4668858744493 100644
--- a/mxRAG/MainRepo/patches/README.md
+++ b/mxRAG/MainRepo/patches/README.md
@@ -4,4 +4,4 @@
 |----------|-------------------------------------------------------------------------------------------------|
 | TEI | Adapts Hugging Face's text-embeddings-inference to Ascend torch_npu, making it easy to run a high-performance TEI service on Ascend hardware. |
 | optimize | Deeply optimizes common embedding and reranker models from transformers, including fused operators and model-compute optimizations; brings performance gains whether the models run locally or behind a TEI service |
-| whisper | Adapts OpenAI's whisper to Ascend torch_npu, making it easy to use whisper on Ascend hardware. |
\ No newline at end of file
+|
\ No newline at end of file
diff --git a/mxRAG/MainRepo/patches/whisper/README.md b/mxRAG/MainRepo/patches/whisper/README.md
deleted file mode 100644
index 63758d4e397d7eab49f64b6d7ac56cfa10b01522..0000000000000000000000000000000000000000
--- a/mxRAG/MainRepo/patches/whisper/README.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# openai-whisper patch installation guide
-
-## Environment setup
-
-Reference: https://gitee.com/ascend/ModelZoo-PyTorch/tree/master/MindIE/MindIE-Torch/built-in/audio/Whisper
-
-| Component | Required version |
-|---------|---------|
-| CANN | 8.0.RC2 |
-| MindIE | 1.0.RC2 |
-| Python | 3.10.X |
-| PyTorch | 2.1.0 |
-| ffmpeg | 4.2.7 |
-| onnx | 1.16.1 |
-
-
-1. Before installing MindIE, source the toolkit environment variables, then run the installer directly. Using the default install path /usr/local/Ascend as an example:
-```sh
-source /usr/local/Ascend/ascend-toolkit/set_env.sh
-bash Ascend-mindie_*.run --install
-```
-2. On Ubuntu, ffmpeg can be installed with the apt-get install ffmpeg command.
-
-## Applying the patch
-1. Enter the patches/whisper directory and download the zh.wav audio file.
-```sh
-wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
-```
-2. Apply the patch from the patches/whisper directory.
-```sh
-bash whisper_patch.sh
-```
-Notes:
-1. The reference design defaults to the tiny model, an Atlas 300I Duo inference card, and DEVICE 0. To change the model, device type, or device ID, edit the MODEL_NAME, SOC_VERSION, and DEVICE parameters in whisper_patch.sh before applying the patch.
-
-## Model inference
-1. Command-line usage
-```sh
-whisper zh.wav --model tiny
-```
-Here zh.wav is the audio file to transcribe; supported audio formats include M4A, MP3, MP4, MPEG, MPGA, WAV, and WEBM.
-tiny is the model to use; the tiny, base, small, medium, and large models are supported.
-
-2. Python usage
-
-```python
-from whisper import load_model
-from whisper.transcribe import transcribe
-# Load the model
-model = load_model('tiny')
-# Transcribe the audio
-result = transcribe(model, audio="zh.wav", verbose=False, beam_size=5, temperature=0)
-print(result['text'])
-```
-3. Expected output
-```commandline
-"我認爲跑步最重要的事就是給我帶來了身體健康"
-```
-## Parameter notes
-For whisper interface details, refer to the official openai-whisper API: https://github.com/openai/whisper
-
-Because the model is recompiled for the NPU adaptation, note the following two points:
-
-1. whisper.load_model loads the model; the name argument must be the name of the locally compiled model, and a different model must be recompiled before use. The download_root argument defaults to the export path of the compiled model and does not need to be set.
-
-2. whisper.transcribe.transcribe performs transcription; when temperature is the default value 0, beam_size=5 must be declared, and when temperature is any non-zero value, best_of=5 must be declared.
diff --git a/mxRAG/MainRepo/patches/whisper/compile.py b/mxRAG/MainRepo/patches/whisper/compile.py
deleted file mode 100644
index 6b3471efdff13a1e3064778115a90fec67f6814b..0000000000000000000000000000000000000000
--- a/mxRAG/MainRepo/patches/whisper/compile.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-
-import torch
-import mindietorch
-
-_FRAMES = 3000
-_HALF_FRAMES = 1500
-_MAX_TOKEN = 448
-_KV_NUM = 2
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="mindietorch model compilation")
-    parser.add_argument("--model_path", default="/tmp/models")
-    parser.add_argument("--beam_size", type=int, default=5)
-    parser.add_argument("--nblocks", type=int, default=4)
-    parser.add_argument("--hidden", type=int, default=384)
-    parser.add_argument("--n_mels", type=int, default=80)
-    parser.add_argument("--soc_version", default="Ascend310P3")
-    args = parser.parse_args()
-    return args
-
-def compile_and_save(ts_model, input_info, soc_version, save_path):
-    ts_model.eval()
-    mindie_model = mindietorch.compile(
-        ts_model,
-        inputs=input_info,
-        precision_policy=mindietorch.PrecisionPolicy.FP16,
-        truncate_long_and_double=True,
-        allow_tensor_replace_int=True,
-        soc_version=soc_version,
-        optimization_level=0
-    )
-    mindie_model.save(save_path)
-
-def encoder(args):
-    ts_model = torch.jit.load(f"{args.model_path}/encoder.ts")
-    input_mel_info = mindietorch.Input([1, args.n_mels, _FRAMES])
-    input_info = [input_mel_info]
-    save_path = f"{args.model_path}/encoder_compiled.ts"
-    compile_and_save(ts_model, input_info, args.soc_version, save_path)
-
-def language(args):
-    ts_model = torch.jit.load(f"{args.model_path}/decoder_prefill.ts")
-    input_tokens_info = mindietorch.Input([1, 1])
-    input_audio_features_info = mindietorch.Input([1, _HALF_FRAMES, args.hidden])
-    input_pos_embed_info = mindietorch.Input([1, args.hidden])
-    input_info = [
-        input_tokens_info,
-        input_audio_features_info,
-        input_pos_embed_info,
-    ]
-    save_path = f"{args.model_path}/language_detection_compiled.ts"
-    compile_and_save(ts_model, input_info, args.soc_version, save_path)
-
-def prefill(args):
-    ts_model = torch.jit.load(f"{args.model_path}/decoder_prefill.ts")
-
-    input_tokens_info = mindietorch.Input(
-        min_shape=[args.beam_size, 1],
-        max_shape=[args.beam_size, _MAX_TOKEN]
-    )
-    input_audio_features_info = mindietorch.Input(
-        min_shape=[1, _HALF_FRAMES, args.hidden],
-        max_shape=[1, _HALF_FRAMES, args.hidden]
-    )
-    input_pos_embed_info = mindietorch.Input(
-        min_shape=[1, args.hidden],
-        max_shape=[_MAX_TOKEN, args.hidden]
-    )
-    input_info = [
-        input_tokens_info,
-        input_audio_features_info,
-        input_pos_embed_info,
-    ]
-    save_path = f"{args.model_path}/decoder_prefill_compiled.ts"
-    compile_and_save(ts_model, input_info, args.soc_version, save_path)
-
-def decode(args):
-    ts_model = torch.jit.load(f"{args.model_path}/decoder_decode.ts")
-
-    input_tokens_info = mindietorch.Input(
-        min_shape=[args.beam_size, 1],
-        max_shape=[args.beam_size, 1]
-    )
-    input_audio_features_info = mindietorch.Input(
-        min_shape=[1, _HALF_FRAMES, args.hidden],
-        max_shape=[1, _HALF_FRAMES, args.hidden]
-    )
-    input_pos_embed_info = mindietorch.Input(
-        min_shape=[args.hidden],
-        max_shape=[args.hidden]
-    )
-    input_cache_dyn_info = mindietorch.Input(
-        min_shape=(args.nblocks, _KV_NUM, args.beam_size, 1, args.hidden),
-        max_shape=(args.nblocks, _KV_NUM, args.beam_size, _MAX_TOKEN, args.hidden)
-    )
-    input_cache_sta_info = mindietorch.Input(
-        min_shape=[args.nblocks, _KV_NUM, 1, _HALF_FRAMES, args.hidden],
-        max_shape=[args.nblocks, _KV_NUM, 1, _HALF_FRAMES, args.hidden]
-    )
-
-    input_info = [
-        input_tokens_info,
-        input_audio_features_info,
-        input_pos_embed_info,
-        input_cache_dyn_info,
-        input_cache_sta_info
-    ]
-
-    save_path = f"{args.model_path}/decoder_decode_compiled.ts"
-    compile_and_save(ts_model, input_info, args.soc_version, save_path)
-
-def main():
-    args = parse_args()
-    for func in encoder, language, prefill, decode:
-        func(args)
-
-if __name__ == '__main__':
-    main()
\ No newline at end of file
diff --git a/mxRAG/MainRepo/patches/whisper/mindietorch_infer.patch b/mxRAG/MainRepo/patches/whisper/mindietorch_infer.patch
deleted file mode 100644
index fc7f771847486b8542d41a2a54876304c481399e..0000000000000000000000000000000000000000
--- a/mxRAG/MainRepo/patches/whisper/mindietorch_infer.patch
+++ /dev/null
@@ -1,226 +0,0 @@
-diff --git a/whisper/decoding.py b/whisper/decoding.py
-index 49485d0..4dccc86 100644
---- a/whisper/decoding.py
-+++ b/whisper/decoding.py
-@@ -6,6 +6,7 @@ import torch
- import torch.nn.functional as F
- from torch import Tensor
- from torch.distributions import Categorical
-+import mindietorch
-
- from .audio import CHUNK_LENGTH
- from .tokenizer import Tokenizer, get_tokenizer
-@@ -14,6 +15,7 @@ from .utils import compression_ratio
- if TYPE_CHECKING:
-     from .model import Whisper
-
-+mindietorch.set_device(0)
-
- @torch.no_grad()
- def detect_language(
-@@ -54,7 +56,7 @@ def detect_language(
-     # forward pass using a single token, startoftranscript
-     n_audio = mel.shape[0]
-     x = torch.tensor([[tokenizer.sot]] * n_audio).to(mel.device)  # [n_audio, 1]
--    logits = model.logits(x, mel)[:, 0]
-+    logits = model.logits(x, mel)[0][:, 0]
-
-     # collect detected languages; suppress all non-language tokens
-     mask = torch.ones(logits.shape[-1], dtype=torch.bool)
-@@ -145,36 +147,35 @@ class PyTorchInference(Inference):
-     def __init__(self, model: "Whisper", initial_token_length: int):
-         self.model: "Whisper" = model
-         self.initial_token_length = initial_token_length
--        self.kv_cache = {}
--        self.hooks = []
--
--        key_modules = [block.attn.key for block in self.model.decoder.blocks]
--        value_modules = [block.attn.value for block in self.model.decoder.blocks]
--        self.kv_modules = key_modules + value_modules
-+        self.cache_dyn = None
-+        self.cache_sta = None
-
-     def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
--        if not self.kv_cache:
--            self.kv_cache, self.hooks = self.model.install_kv_cache_hooks()
--
-         if tokens.shape[-1] > self.initial_token_length:
-             # only need to use the last token except in the first forward pass
-             tokens = tokens[:, -1:]
-+            pos_embed = self.model.decoder.positional_embedding[self.cache_dyn.shape[3]]
-+            logits, cache_dyn, _ = self.model.decoder(
-+                tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta)
-+            self.cache_dyn = cache_dyn
-+        else:
-+            pos_embed = self.model.decoder.positional_embedding[:tokens.shape[-1]]
-+            logits, cache_dyn, cache_sta = self.model.decoder(tokens, audio_features, pos_embed)
-+            self.cache_dyn = cache_dyn
-+            self.cache_sta = cache_sta
-
--        return self.model.decoder(tokens, audio_features, kv_cache=self.kv_cache)
-+        return logits
-
-     def cleanup_caching(self):
--        for hook in self.hooks:
--            hook.remove()
--
--        self.kv_cache = {}
--        self.hooks = []
-+        self.cache_dyn = None
-+        self.cache_sta = None
-
-     def rearrange_kv_cache(self, source_indices):
-         if source_indices != list(range(len(source_indices))):
--            for module in self.kv_modules:
--                # update the key/value cache to contain the selected sequences
--                self.kv_cache[module] = self.kv_cache[module][source_indices].detach()
--
-+            blocks = self.cache_dyn.shape[0]
-+            for i in range(blocks):
-+                for j in range(2):  # k and v 2 items
-+                    self.cache_dyn[i][j] = self.cache_dyn[i][j][source_indices]
-
- class SequenceRanker:
-     def rank(
-diff --git a/whisper/model.py b/whisper/model.py
-index a678283..c94a024 100644
---- a/whisper/model.py
-+++ b/whisper/model.py
-@@ -1,12 +1,14 @@
- import base64
- import gzip
- from dataclasses import dataclass
-+import os
- from typing import Dict, Iterable, Optional
-
- import numpy as np
- import torch
- import torch.nn.functional as F
- from torch import Tensor, nn
-+import mindietorch
-
- from .decoding import decode as decode_function
- from .decoding import detect_language as detect_language_function
-@@ -153,24 +155,19 @@ class AudioEncoder(nn.Module):
-             [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
-         )
-         self.ln_post = LayerNorm(n_state)
-+        self.device = "npu:0"
-+        self.mindietorch_encoder_model = torch.jit.load(
-+            "/tmp/models/encoder_compiled.ts"
-+        ).eval().to(self.device)
-
-     def forward(self, x: Tensor):
-         """
-         x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
-             the mel spectrogram of the audio
-         """
--        x = F.gelu(self.conv1(x))
--        x = F.gelu(self.conv2(x))
--        x = x.permute(0, 2, 1)
--
--        assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
--        x = (x + self.positional_embedding).to(x.dtype)
--
--        for block in self.blocks:
--            x = block(x)
--
--        x = self.ln_post(x)
--        return x
-+        x = x.to(self.device)
-+        x = self.mindietorch_encoder_model(x)
-+        return x.cpu()
-
-
- class TextDecoder(nn.Module):
-@@ -193,29 +190,58 @@ class TextDecoder(nn.Module):
-         mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1)
-         self.register_buffer("mask", mask, persistent=False)
-
--    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
--        """
--        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
--            the text tokens
--        xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state)
--            the encoded audio features to be attended on
--        """
--        offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
--        x = (
--            self.token_embedding(x)
--            + self.positional_embedding[offset : offset + x.shape[-1]]
--        )
--        x = x.to(xa.dtype)
--
--        for block in self.blocks:
--            x = block(x, xa, mask=self.mask, kv_cache=kv_cache)
-+        self.device = "npu:0"
-+        self.mindietorch_language_detection_model = torch.jit.load(
-+            "/tmp/models/language_detection_compiled.ts"
-+        ).eval().to(self.device)
-+        self.mindietorch_prefill_model = torch.jit.load(
-+            "/tmp/models/decoder_prefill_compiled.ts"
-+        ).eval().to(self.device)
-+        self.mindietorch_decode_model = torch.jit.load(
-+            "/tmp/models/decoder_decode_compiled.ts"
-+        ).eval().to(self.device)
-
--        x = self.ln(x)
--        logits = (
--            x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)
--        ).float()
--
--        return logits
-+    def forward(
-+        self,
-+        x: Tensor,
-+        xa: Tensor,
-+        pos_embed: Tensor = None,
-+        cache_dyn: Tensor = None,
-+        cache_sta: Tensor = None,
-+    ):
-+        if cache_dyn is None:
-+            tokens_npu = x.float().to(self.device)
-+            audio_features_npu = xa.to(self.device)
-+            pos_embed_npu = pos_embed.to(self.device)
-+            if x.shape[0] != 1:
-+                logits, cache_dyn, cache_sta = self.mindietorch_prefill_model(
-+                    tokens_npu,
-+                    audio_features_npu,
-+                    pos_embed_npu
-+                )
-+            else:
-+                logits, cache_dyn, cache_sta = self.mindietorch_language_detection_model(
-+                    tokens_npu,
-+                    audio_features_npu,
-+                    pos_embed_npu
-+                )
-+            logits = logits.cpu()
-+            cache_dyn = cache_dyn.cpu()
-+        else:
-+            tokens_npu = x.float().to(self.device)
-+            audio_features_npu = xa.to(self.device)
-+            pos_embed_npu = pos_embed.to(self.device)
-+            cache_dyn_npu = cache_dyn.to(self.device)
-+            logits, cache_dyn, _ = self.mindietorch_decode_model(
-+                tokens_npu,
-+                audio_features_npu,
-+                pos_embed_npu,
-+                cache_dyn_npu,
-+                cache_sta
-+            )
-+            logits = logits.cpu()
-+            cache_dyn = cache_dyn.cpu()
-+        return logits, cache_dyn, cache_sta
-
-
- class Whisper(nn.Module):
-@@ -257,7 +283,8 @@ class Whisper(nn.Module):
-         return self.encoder(mel)
-
-     def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor):
--        return self.decoder(tokens, audio_features)
-+        pos_embed = self.decoder.positional_embedding[:tokens.shape[-1]]
-+        return self.decoder(tokens, audio_features, pos_embed)
-
-     def forward(
-         self, mel: torch.Tensor, tokens: torch.Tensor
diff --git a/mxRAG/MainRepo/patches/whisper/trace_model.patch b/mxRAG/MainRepo/patches/whisper/trace_model.patch
deleted file mode 100644
index 903444eddb385a3f8e8ed4e09e0c0eeeeea1f9e0..0000000000000000000000000000000000000000
--- a/mxRAG/MainRepo/patches/whisper/trace_model.patch
+++ /dev/null
@@ -1,343 +0,0 @@
-diff --git a/whisper/decoding.py b/whisper/decoding.py
-index 49485d0..495fe45 100644
---- a/whisper/decoding.py
-+++ b/whisper/decoding.py
-@@ -2,6 +2,7 @@ from dataclasses import dataclass, field, replace
- from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Sequence, Tuple, Union
-
- import numpy as np
-+import os
- import torch
- import torch.nn.functional as F
- from torch import Tensor
-@@ -49,12 +50,24 @@ def detect_language(
-
-     # skip encoder forward pass if already-encoded audio features were given
-     if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state):
-+        encoder_ts_model = torch.jit.trace(model.encoder, mel)
-+        encoder_ts_model.save(
-+            "/tmp/models/encoder.ts")
-+        torch.onnx.export(
-+            model.encoder,
-+            (mel),
-+            "/tmp/models/onnx/encode/encoder.onnx",
-+            opset_version=11,
-+            input_names=["mel"],
-+            output_names=["ret"]
-+        )
-+
-         mel = model.encoder(mel)
-
-     # forward pass using a single token, startoftranscript
-     n_audio = mel.shape[0]
-     x = torch.tensor([[tokenizer.sot]] * n_audio).to(mel.device)  # [n_audio, 1]
--    logits = model.logits(x, mel)[:, 0]
-+    logits = model.logits(x, mel)[0][:, 0]
-
-     # collect detected languages; suppress all non-language tokens
-     mask = torch.ones(logits.shape[-1], dtype=torch.bool)
-@@ -145,36 +158,74 @@ class PyTorchInference(Inference):
-     def __init__(self, model: "Whisper", initial_token_length: int):
-         self.model: "Whisper" = model
-         self.initial_token_length = initial_token_length
--        self.kv_cache = {}
--        self.hooks = []
--
--        key_modules = [block.attn.key for block in self.model.decoder.blocks]
--        value_modules = [block.attn.value for block in self.model.decoder.blocks]
--        self.kv_modules = key_modules + value_modules
-+        self.cache_dyn = None
-+        self.cache_sta = None
-
-     def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
--        if not self.kv_cache:
--            self.kv_cache, self.hooks = self.model.install_kv_cache_hooks()
--
-         if tokens.shape[-1] > self.initial_token_length:
-             # only need to use the last token except in the first forward pass
-             tokens = tokens[:, -1:]
-+            pos_embed = self.model.decoder.positional_embedding[self.cache_dyn.shape[3]]
-+            torch.onnx.export(
-+                self.model.decoder,
-+                (tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta),
-+                "/tmp/models/onnx/decode/decoder_decode.onnx",
-+                opset_version=11,
-+                input_names=["tokens", "audio_features", "pos_embed", "cache_dyn", "cache_sta"],
-+                output_names=["logits", "new_cache_dyn", "new_cache_sta"],
-+                dynamic_axes={
-+                    "cache_dyn": {3: "ntokens"},
-+                    "new_cache_dyn": {3: "ntokens"}
-+                }
-+            )
-+            decoder_decode_ts_model = torch.jit.trace(
-+                self.model.decoder,
-+                (tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta)
-+            )
-+            decoder_decode_ts_model.save(
-+                "/tmp/models/decoder_decode.ts")
-+            logits, cache_dyn, _ = self.model.decoder(
-+                tokens, audio_features, pos_embed, self.cache_dyn, self.cache_sta)
-+            os.sys.exit(0)
-+            self.cache_dyn = cache_dyn
-+        else:
-+            pos_embed = self.model.decoder.positional_embedding[:tokens.shape[-1]]
-+            torch.onnx.export(
-+                self.model.decoder,
-+                (tokens, audio_features, pos_embed),
-+                "/tmp/models/onnx/prefill/decoder_prefill.onnx",
-+                opset_version=11,
-+                input_names=["tokens", "audio_features", "pos_embed"],
-+                output_names=["logits", "cache_dyn", "cache_sta"],
-+                dynamic_axes={
-+                    "tokens": {1: "ntokens"},
-+                    "pos_embed": {0: "ntokens"},
-+                    "logits": {1: "ntokens"},
-+                    "cache_dyn": {3: "ntokens"}
-+                }
-+            )
-+            decoder_prefill_ts_model = torch.jit.trace(
-+                self.model.decoder,
-+                (tokens, audio_features, pos_embed)
-+            )
-+            decoder_prefill_ts_model.save(
-+                "/tmp/models/decoder_prefill.ts")
-+            logits, cache_dyn, cache_sta = self.model.decoder(tokens, audio_features, pos_embed)
-+            self.cache_dyn = cache_dyn
-+            self.cache_sta = cache_sta
-
--        return self.model.decoder(tokens, audio_features, kv_cache=self.kv_cache)
-+        return logits
-
-     def cleanup_caching(self):
--        for hook in self.hooks:
--            hook.remove()
--
--        self.kv_cache = {}
--        self.hooks = []
-+        self.cache_dyn = None
-+        self.cache_sta = None
-
-     def rearrange_kv_cache(self, source_indices):
-         if source_indices != list(range(len(source_indices))):
--            for module in self.kv_modules:
--                # update the key/value cache to contain the selected sequences
--                self.kv_cache[module] = self.kv_cache[module][source_indices].detach()
--
-+            blocks = self.cache_dyn.shape[0]
-+            for i in range(blocks):
-+                for j in range(2):  # k and v 2 items
-+                    self.cache_dyn[i][j] = self.cache_dyn[i][j][source_indices]
-
- class SequenceRanker:
-     def rank(
-diff --git a/whisper/model.py b/whisper/model.py
-index a678283..2a95e28 100644
---- a/whisper/model.py
-+++ b/whisper/model.py
-@@ -1,6 +1,7 @@
- import base64
- import gzip
- from dataclasses import dataclass
-+import os
- from typing import Dict, Iterable, Optional
-
- import numpy as np
-@@ -68,6 +69,63 @@ class MultiHeadAttention(nn.Module):
-         self.value = Linear(n_state, n_state)
-         self.out = Linear(n_state, n_state)
-
-+    def encoder_forward(self, x: Tensor):
-+        q = self.query(x)
-+        k = self.key(x)
-+        v = self.value(x)
-+        wv, qk = self.qkv_attention(q, k, v)
-+        return self.out(wv)
-+
-+    def prefill_self_attn_forward(
-+        self,
-+        x: Tensor,
-+        mask: Tensor,
-+    ):
-+        q = self.query(x)
-+        k = self.key(x)
-+        v = self.value(x)
-+        cache_dyn = torch.stack([k, v])
-+        wv, _ = self.qkv_attention(q, k, v, mask)
-+        return self.out(wv), cache_dyn
-+
-+    def prefill_cross_attn_forward(
-+        self,
-+        x: Tensor,
-+        xa: Tensor,
-+    ):
-+        q = self.query(x)
-+        k = self.key(xa)
-+        v = self.value(xa)
-+        cache_sta = torch.stack([k, v])
-+        wv, _ = self.qkv_attention(q, k, v)
-+        return self.out(wv), cache_sta
-+
-+    def decode_self_attn_forward(
-+        self,
-+        x: Tensor,
-+        mask: Tensor,
-+        cache_dyn: Tensor
-+    ):
-+        q = self.query(x)
-+        token_k = self.key(x)
-+        k = torch.cat([cache_dyn[0], token_k], dim=1).detach()
-+        token_v = self.value(x)
-+        v = torch.cat([cache_dyn[1], token_v], dim=1).detach()
-+        new_cache_dyn = torch.stack([k, v])
-+        wv, _ = self.qkv_attention(q, k, v, mask)
-+        return self.out(wv), new_cache_dyn
-+
-+    def decode_cross_attn_forward(
-+        self,
-+        x: Tensor,
-+        cache_sta: Tensor
-+    ):
-+        q = self.query(x)
-+        k = cache_sta[0]
-+        v = cache_sta[1]
-+        wv, _ = self.qkv_attention(q, k, v)
-+        return self.out(wv)
-+
-     def forward(
-         self,
-         x: Tensor,
-@@ -126,6 +184,39 @@ class ResidualAttentionBlock(nn.Module):
-         )
-         self.mlp_ln = LayerNorm(n_state)
-
-+    def encoder_forward(self, x: Tensor):
-+        x = x + self.attn.encoder_forward(self.attn_ln(x))
-+        x = x + self.mlp(self.mlp_ln(x))
-+        return x
-+
-+    def prefill_forward(
-+        self,
-+        x: Tensor,
-+        xa: Tensor,
-+        mask: Tensor,
-+    ):
-+        self_attn_out, new_cache_dyn = self.attn.prefill_self_attn_forward(self.attn_ln(x), mask)
-+        x = x + self_attn_out
-+        cross_attn_out, new_cache_sta = self.cross_attn.prefill_cross_attn_forward(self.cross_attn_ln(x), xa)
-+        x = x + cross_attn_out
-+        x = x + self.mlp(self.mlp_ln(x))
-+        return x, new_cache_dyn, new_cache_sta
-+
-+    def decode_forward(
-+        self,
-+        x: Tensor,
-+        xa: Tensor,
-+        mask: Tensor,
-+        cache_dyn: Tensor,
-+        cache_sta: Tensor
-+    ):
-+        self_attn_out, new_cache_dyn = self.attn.decode_self_attn_forward(self.attn_ln(x), mask, cache_dyn)
-+        x = x + self_attn_out
-+        cross_attn_out = self.cross_attn.decode_cross_attn_forward(self.cross_attn_ln(x), cache_sta)
-+        x = x + cross_attn_out
-+        x = x + self.mlp(self.mlp_ln(x))
-+        return x, new_cache_dyn
-+
-     def forward(
-         self,
-         x: Tensor,
-@@ -163,11 +254,10 @@ class AudioEncoder(nn.Module):
-         x = F.gelu(self.conv2(x))
-         x = x.permute(0, 2, 1)
-
--        assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
-         x = (x + self.positional_embedding).to(x.dtype)
-
-         for block in self.blocks:
--            x = block(x)
-+            x = block.encoder_forward(x)
-
-         x = self.ln_post(x)
-         return x
-@@ -193,29 +283,56 @@ class TextDecoder(nn.Module):
-         mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1)
-         self.register_buffer("mask", mask, persistent=False)
-
--    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
--        """
--        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
--            the text tokens
--        xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state)
--            the encoded audio features to be attended on
--        """
--        offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
--        x = (
--            self.token_embedding(x)
--            + self.positional_embedding[offset : offset + x.shape[-1]]
--        )
--        x = x.to(xa.dtype)
-+    def prefill(self, x: Tensor, xa: Tensor, pos_embed: Tensor):
-+        x = (self.token_embedding(x) + pos_embed).to(xa.dtype)
-
-+        cache_dyn_list = []
-+        cache_sta_list = []
-         for block in self.blocks:
--            x = block(x, xa, mask=self.mask, kv_cache=kv_cache)
-+            x, new_cache_dyn, new_cache_sta = block.prefill_forward(x, xa, self.mask)
-+            cache_dyn_list.append(new_cache_dyn)
-+            cache_sta_list.append(new_cache_sta)
-+
-+        cache_dyn = torch.stack(cache_dyn_list)
-+        cache_sta = torch.stack(cache_sta_list)
-
-         x = self.ln(x)
-         logits = (
-             x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)
-         ).float()
-
--        return logits
-+        return logits, cache_dyn, cache_sta
-+
-+    def decode(self, x: Tensor, xa: Tensor, pos_embed: Tensor, cache_dyn: Tensor, cache_sta: Tensor):
-+        x = (self.token_embedding(x) + pos_embed).to(xa.dtype)
-+
-+        cache_dyn_list = []
-+        for idx, block in enumerate(self.blocks):
-+            x, new_cache_dyn = block.decode_forward(x, xa, self.mask, cache_dyn[idx], cache_sta[idx])
-+            cache_dyn_list.append(new_cache_dyn)
-+
-+        new_cache_dyn = torch.stack(cache_dyn_list)
-+
-+        x = self.ln(x)
-+        logits = (
-+            x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)
-+        ).float()
-+
-+        return logits, new_cache_dyn
-+
-+    def forward(
-+        self,
-+        x: Tensor,
-+        xa: Tensor,
-+        pos_embed: Tensor = None,
-+        cache_dyn: Tensor = None,
-+        cache_sta: Tensor = None,
-+    ):
-+        if cache_dyn is None:
-+            logits, cache_dyn, cache_sta = self.prefill(x, xa, pos_embed)
-+        else:
-+            logits, cache_dyn = self.decode(x, xa, pos_embed, cache_dyn, cache_sta)
-+        return logits, cache_dyn, cache_sta
-
-
- class Whisper(nn.Module):
-@@ -257,7 +374,8 @@ class Whisper(nn.Module):
-         return self.encoder(mel)
-
-     def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor):
--        return self.decoder(tokens, audio_features)
-+        pos_embed = self.decoder.positional_embedding[:tokens.shape[-1]]
-+        return self.decoder(tokens, audio_features, pos_embed)
-
-     def forward(
-         self, mel: torch.Tensor, tokens: torch.Tensor
diff --git a/mxRAG/MainRepo/patches/whisper/whisper_patch.sh b/mxRAG/MainRepo/patches/whisper/whisper_patch.sh
deleted file mode 100644
index 5779dca9fd023297d8d18623c01337d10518bd52..0000000000000000000000000000000000000000
--- a/mxRAG/MainRepo/patches/whisper/whisper_patch.sh
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash
-# Run this one-shot patch script from the patches/whisper directory
-set -e
-
-MODEL_NAME='tiny'
-DEVICE=0
-SOC_VERSION="Ascend310P3"
-
-PATCH_DIR=$(dirname $(readlink -f $0))
-dist_packages=$(python3 -c "import site;print(site.getsitepackages()[0])")
-dist_packages_path=$(echo "$dist_packages" | sed "s/[',\[\]]//g")
-echo "dist-packages path is: $dist_packages_path"
-
-# Initialize the per-model parameters
-declare -A params
-params[tiny]="4 384 80"
-params[base]="6 512 80"
-params[small]="12 768 80"
-params[medium]="24 1024 80"
-params[large]="32 1280 128"
-
-# Extract the parameters for the selected model
-IFS=' ' read -r -a model_params <<< "${params[$MODEL_NAME]}"
-N_BLOCKS=${model_params[0]}
-HIDDEN=${model_params[1]}
-N_MEIS=${model_params[2]}
-
-# Write the DEVICE parameter into the patch
-sed -E -i "s/set_device\([0-9]+\)/set_device($DEVICE)/g" mindietorch_infer.patch
-sed -E -i "s/npu:[0-9]+/npu:$DEVICE/g" mindietorch_infer.patch
-echo "set device is: $DEVICE"
-
-function install_packages(){
-    pip3 install onnx==1.16.1
-    pip3 uninstall -y openai-whisper==20231117
-    pip3 install openai-whisper==20231117
-
-}
-
-function patch_trace_model(){
-    cd $dist_packages_path
-    patch -p1 < $PATCH_DIR/trace_model.patch
-    cd $PATCH_DIR
-    DIRS=("/tmp/models"
-          "/tmp/models/onnx"
-          "/tmp/models/onnx/encode"
-          "/tmp/models/onnx/decode"
-          "/tmp/models/onnx/prefill"
-    )
-    for dir in "${DIRS[@]}"; do
-        if [ ! -d "$dir" ]; then
-            mkdir -p "$dir"
-            echo "Directory $dir created."
-        else
-            echo "Directory $dir already exists."
-        fi
-    done
-    whisper zh.wav --model $MODEL_NAME
-}
-
-function compile_model(){
-    source /usr/local/Ascend/ascend-toolkit/set_env.sh
-    source /usr/local/Ascend/mindie/set_env.sh
-    source /usr/local/Ascend/ascend-toolkit/set_env.sh
-    python3 compile.py --nblocks $N_BLOCKS --hidden $HIDDEN --n_mels $N_MEIS --soc_version $SOC_VERSION
-}
-
-function patch_mindietorch(){
-    pip3 uninstall -y openai-whisper==20231117
-    pip3 install openai-whisper==20231117
-    cd $dist_packages_path
-    patch -p1 < $PATCH_DIR/mindietorch_infer.patch
-}
-
-function main(){
-    install_packages
-    patch_trace_model
-    compile_model
-    patch_mindietorch
-}
-
-main
\ No newline at end of file
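For reference, the deleted README's parameter notes describe two call patterns for the NPU-compiled model but only show one of them in code: `beam_size=5` is required when `temperature` is the default 0, and `best_of=5` is required for any non-zero `temperature`. A minimal sketch of both patterns, assuming the patch has already been applied, the `tiny` model has been compiled locally, and the `zh.wav` sample from the README is present in the working directory:

```python
from whisper import load_model
from whisper.transcribe import transcribe

# Assumes the NPU patch is applied and 'tiny' was compiled locally per the README.
model = load_model('tiny')

# Default temperature of 0: beam_size=5 must be declared.
beam_result = transcribe(model, audio="zh.wav", verbose=False, beam_size=5, temperature=0)

# Any non-zero temperature (0.2 here is an arbitrary example): best_of=5 must be declared instead.
sampled_result = transcribe(model, audio="zh.wav", verbose=False, best_of=5, temperature=0.2)

print(beam_result['text'])
print(sampled_result['text'])
```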