diff --git a/ACL_PyTorch/built-in/audio/CosyVoice/CosyVoice2/diff_CosyVoice.patch b/ACL_PyTorch/built-in/audio/CosyVoice/CosyVoice2/diff_CosyVoice.patch index f5f1b9016c39c51e958019a729c4f6f9a4822886..6bec7233c6f96591e27b498c811a4d4b30de91e0 100755 --- a/ACL_PyTorch/built-in/audio/CosyVoice/CosyVoice2/diff_CosyVoice.patch +++ b/ACL_PyTorch/built-in/audio/CosyVoice/CosyVoice2/diff_CosyVoice.patch @@ -99,10 +99,31 @@ index 6e10f00..25ad767 100644 speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device) return speech_token, speech_token_len diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py -index 9ebf8cb..dc8b4d2 100644 +index 9ebf8cb..a8775a1 100644 --- a/cosyvoice/cli/model.py +++ b/cosyvoice/cli/model.py -@@ -314,6 +314,14 @@ class CosyVoice2Model(CosyVoiceModel): +@@ -99,7 +99,7 @@ class CosyVoiceModel: + self.flow.decoder.estimator = self.flow.decoder.estimator_engine.create_execution_context() + + def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid): +- with self.llm_context: ++ with self.llm_context(): + if isinstance(text, Generator): + assert isinstance(self, CosyVoice2Model), 'streaming input text is only implemented for CosyVoice2!' + for i in self.llm.inference_bistream(text=text, +@@ -307,13 +307,25 @@ class CosyVoice2Model(CosyVoiceModel): + self.speech_window = np.hamming(2 * self.source_cache_len) + # rtf and decoding related + self.stream_scale_factor = 1 +- self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext() ++ if torch.cuda.is_available(): ++ stream = torch.cuda.Stream(device=self.device) ++ self.llm_context = lambda: torch.cuda.stream(stream) ++ else: ++ self.llm_context = lambda: contextlib.nullcontext() + self.lock = threading.Lock() + # dict used to store session related variable + self.tts_speech_token_dict = {} self.llm_end_dict = {} self.hift_cache_dict = {} @@ -117,7 +138,7 @@ index 9ebf8cb..dc8b4d2 100644 def load_jit(self, flow_encoder_model): flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device) self.flow.encoder = flow_encoder -@@ -362,12 +370,17 @@ class CosyVoice2Model(CosyVoiceModel): +@@ -362,12 +374,17 @@ class CosyVoice2Model(CosyVoiceModel): with self.lock: self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False self.hift_cache_dict[this_uuid] = None @@ -139,7 +160,7 @@ index 9ebf8cb..dc8b4d2 100644 if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= self.token_hop_len + self.flow.pre_lookahead_len: this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + self.token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0) this_tts_speech = self.token2wav(token=this_tts_speech_token, -@@ -379,10 +392,6 @@ class CosyVoice2Model(CosyVoiceModel): +@@ -379,10 +396,6 @@ class CosyVoice2Model(CosyVoiceModel): finalize=False) token_offset += self.token_hop_len yield {'tts_speech': this_tts_speech.cpu()} @@ -150,7 +171,7 @@ index 9ebf8cb..dc8b4d2 100644 this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0) this_tts_speech = self.token2wav(token=this_tts_speech_token, prompt_token=flow_prompt_speech_token, -@@ -393,6 +402,8 @@ class CosyVoice2Model(CosyVoiceModel): +@@ -393,6 +406,8 @@ class CosyVoice2Model(CosyVoiceModel): finalize=True) yield {'tts_speech': this_tts_speech.cpu()} else: @@ -159,7 +180,7 @@ index 9ebf8cb..dc8b4d2 100644 # deal with all tokens p.join() this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0) -@@ -409,3 +420,83 @@ class CosyVoice2Model(CosyVoiceModel): +@@ -409,3 +424,83 @@ class CosyVoice2Model(CosyVoiceModel): self.tts_speech_token_dict.pop(this_uuid) self.llm_end_dict.pop(this_uuid) torch.cuda.empty_cache() @@ -243,7 +264,6 @@ index 9ebf8cb..dc8b4d2 100644 + self.prompt_speech_token_dict.pop(this_uuid) + self.speech_feat_dict.pop(this_uuid) + self.embedding_dict.pop(this_uuid) -\ No newline at end of file diff --git a/cosyvoice/flow/flow_matching.py b/cosyvoice/flow/flow_matching.py index 6a60f6d..fbe7545 100644 --- a/cosyvoice/flow/flow_matching.py