diff --git a/README.md b/README.md index 2596d50b8e416917486c1496b6d8d843ebf771a2..eeb2f6ce717045a1753a50e117ebbf140c950c7d 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ | Baichuan2-7B | `vLLM` | [✅](models/nlp/llm/baichuan2-7b/vllm) | 4.3.0 | | ChatGLM-3-6B | `vLLM` | [✅](models/nlp/llm/chatglm3-6b/vllm) | 4.3.0 | | ChatGLM-3-6B-32K | `vLLM` | [✅](models/nlp/llm/chatglm3-6b-32k/vllm) | 4.3.0 | +| CosyVoice2-0.5B | `PyTorch` | [✅](models/speech/speech_synthesis/cosyvoice/pytorch) | 4.3.0 | | DeepSeek-R1-Distill-Llama-8B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-llama-8b/vllm) | 4.3.0 | | DeepSeek-R1-Distill-Llama-70B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-llama-70b/vllm) | 4.3.0 | | DeepSeek-R1-Distill-Qwen-1.5B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-1.5b/vllm) | 4.3.0 | diff --git a/README_en.md b/README_en.md index f6da160441a54d3e4a768942edbf7a6d5955a744..7586c21ac33c5fcea12bc45db50e13dd3fa1b1ff 100644 --- a/README_en.md +++ b/README_en.md @@ -41,6 +41,7 @@ inference to be expanded in the future. 
| Baichuan2-7B | `vLLM` | [✅](models/nlp/llm/baichuan2-7b/vllm) | 4.3.0 | | ChatGLM-3-6B | `vLLM` | [✅](models/nlp/llm/chatglm3-6b/vllm) | 4.3.0 | | ChatGLM-3-6B-32K | `vLLM` | [✅](models/nlp/llm/chatglm3-6b-32k/vllm) | 4.3.0 | +| CosyVoice2-0.5B | `PyTorch` | [✅](models/speech/speech_synthesis/cosyvoice/pytorch) | 4.3.0 | | DeepSeek-R1-Distill-Llama-8B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-llama-8b/vllm) | 4.3.0 | | DeepSeek-R1-Distill-Llama-70B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-llama-70b/vllm) | 4.3.0 | | DeepSeek-R1-Distill-Qwen-1.5B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-1.5b/vllm) | 4.3.0 | diff --git a/models/speech/speech_synthesis/cosyvoice/pytorch/README.md b/models/speech/speech_synthesis/cosyvoice/pytorch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7e559dd5c33770bb4b88014879ce6341b028f28d --- /dev/null +++ b/models/speech/speech_synthesis/cosyvoice/pytorch/README.md @@ -0,0 +1,49 @@ +# CosyVoice2 (pytorch) + +## Model Description + +CosyVoice2-0.5B is a small speech model designed to understand and generate human-like speech. It can be used for tasks like voice assistants, text-to-speech, or voice cloning. With 0.5 billion parameters, it is lightweight and works well on devices with limited computing power. It focuses on natural-sounding voices and easy customization. 
+ +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.3.0 | 25.09 | + +## Model Preparation + +### Prepare Resources + +Pretrained model: <https://huggingface.co/FunAudioLLM/CosyVoice2-0.5B> + +### Install Dependencies + +```bash +pip3 install -r requirements.txt +git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git +# If cloning the submodules fails due to network errors, rerun the following commands until they succeed +cd CosyVoice +git submodule update --init --recursive + +mkdir -p pretrained_models +# Download the CosyVoice2-0.5B model into the pretrained_models directory + +# If you encounter sox compatibility issues, install the system packages: +# ubuntu +sudo apt-get install sox libsox-dev +# centos +sudo yum install sox sox-devel +``` + +## Model Inference + +```bash +cp ../inference_test.py ./ +python3 inference_test.py +``` + +## Model Results + +## References + +- [CosyVoice](https://github.com/FunAudioLLM/CosyVoice/commit/0a496c18f78ca993c63f6d880fcc60778bfc85c1) \ No newline at end of file diff --git a/models/speech/speech_synthesis/cosyvoice/pytorch/ci/prepare.sh b/models/speech/speech_synthesis/cosyvoice/pytorch/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..9ab95280bd97e5d705d163d465b45916cd9239e1 --- /dev/null +++ b/models/speech/speech_synthesis/cosyvoice/pytorch/ci/prepare.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +apt-get update +apt-get install -y sox libsox-dev # -y: answer the confirmation prompt automatically so an unattended run cannot stall or abort + +cp -r /mnt/deepspark/data/repos/CosyVoice ./ +cd CosyVoice +mkdir -p pretrained_models +ln -s /mnt/deepspark/data/checkpoints/CosyVoice2-0.5B pretrained_models/ +pip3 install -r ../requirements.txt # the curated list one directory up (the one the README installs), not the cloned repo's own requirements.txt + +cp ../inference_test.py ./ \ No newline at end of file diff --git a/models/speech/speech_synthesis/cosyvoice/pytorch/inference_test.py b/models/speech/speech_synthesis/cosyvoice/pytorch/inference_test.py new file mode 100644 index 0000000000000000000000000000000000000000..5dff40a055714853e1da648abce1720c1b9c5768 --- /dev/null +++ b/models/speech/speech_synthesis/cosyvoice/pytorch/inference_test.py @@ -0,0 +1,35 @@ +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +sys.path.append('third_party/Matcha-TTS') +from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2 +from cosyvoice.utils.file_utils import load_wav +import torchaudio +cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False) + +# NOTE: to reproduce the results on https://funaudiollm.github.io/cosyvoice2, pass text_frontend=False to the inference calls +# Zero-shot usage: load the prompt audio once (it is reused by all three demos below) and save each yielded result as zero_shot_<i>.wav +prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000) +for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)): + torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) + +# Fine-grained control via inline tags such as [laughter] in the input text; for the supported tags, see cosyvoice/tokenizer/tokenizer.py#L248 +for i, j in enumerate(cosyvoice.inference_cross_lingual('在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。', prompt_speech_16k, stream=False)): + torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) + +# Instruct usage: the second argument is a natural-language instruction (here: "say this sentence in Sichuanese") +for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话', prompt_speech_16k, stream=False)): + torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) \ No newline at end of file diff --git a/models/speech/speech_synthesis/cosyvoice/pytorch/requirements.txt b/models/speech/speech_synthesis/cosyvoice/pytorch/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2bc0604ee8811262e2952ba2b9f954e333b5c675 --- /dev/null +++ b/models/speech/speech_synthesis/cosyvoice/pytorch/requirements.txt @@ -0,0 +1,40 @@ +--extra-index-url https://download.pytorch.org/whl/cu121 +--extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # https://github.com/microsoft/onnxruntime/issues/21684 
+conformer==0.3.2 +#deepspeed==0.15.1; sys_platform == 'linux' +diffusers==0.29.0 +fastapi==0.115.6 +fastapi-cli==0.0.4 +gdown==5.1.0 +gradio==5.4.0 +grpcio==1.57.0 +grpcio-tools==1.57.0 +hydra-core==1.3.2 +HyperPyYAML==1.2.2 +inflect==7.3.1 +librosa==0.10.2 +lightning==2.2.4 +matplotlib==3.7.5 +modelscope==1.20.0 +networkx==3.1 +omegaconf==2.3.0 +#onnx==1.16.0 +#onnxruntime-gpu==1.18.0; sys_platform == 'linux' +onnxruntime==1.18.0; sys_platform == 'darwin' or sys_platform == 'win32' +openai-whisper +protobuf==4.25 +pyarrow==18.1.0 +pydantic==2.7.0 +pyworld==0.3.4 +rich==13.7.1 +soundfile==0.12.1 +#tensorboard==2.14.0 +#tensorrt-cu12==10.0.1; sys_platform == 'linux' +#tensorrt-cu12-bindings==10.0.1; sys_platform == 'linux' +#tensorrt-cu12-libs==10.0.1; sys_platform == 'linux' +#torch==2.3.1 +#torchaudio==2.3.1 +#transformers==4.40.1 +uvicorn==0.30.0 +wetext==0.0.4 +wget==3.2 \ No newline at end of file diff --git a/tests/model_info.json b/tests/model_info.json index 2ad04e4347d20aabcc7070ba918b63af6e06d7b7..731f0339bfdadc87f38e39d583343e39649684ff 100644 --- a/tests/model_info.json +++ b/tests/model_info.json @@ -8072,7 +8072,7 @@ "framework": "fastdeploy", "release_version": "25.09", "release_sdk": "4.3.0", - "release_gpgpu": "BI-V150", + "release_gpgpu": "MR-V100", "latest_sdk": "", "latest_gpgpu": "", "category": "nlp/llm", @@ -8105,7 +8105,7 @@ "framework": "fastdeploy", "release_version": "25.09", "release_sdk": "4.3.0", - "release_gpgpu": "BI-V150", + "release_gpgpu": "MR-V100", "latest_sdk": "", "latest_gpgpu": "", "category": "nlp/llm", @@ -8131,6 +8131,39 @@ "type": "inference", "hasDemo": false, "demoType": "" + }, + { + "display_name": "CosyVoice2-0.5B", + "model_name": "cosyvoice", + "framework": "pytorch", + "release_version": "25.09", + "release_sdk": "4.3.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", + "category": "speech/speech_synthesis", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": 
"models/speech/speech_synthesis/cosyvoice/pytorch", + "readme_file": "models/speech/speech_synthesis/cosyvoice/pytorch/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://huggingface.co/FunAudioLLM/CosyVoice2-0.5B", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" } ] } \ No newline at end of file diff --git a/tests/run_vllm.py b/tests/run_vllm.py index 39b3c2227568ccfd30da953b81954266e90961fd..eab1387958318d8c3263edb3d7725ff0f74438bb 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -56,7 +56,7 @@ def main(): result = {} # NLP模型 - if model["category"] in ["nlp/llm", "multimodal/vision_language_model", "speech/asr"]: + if model["category"] in ["nlp/llm", "multimodal/vision_language_model", "speech/asr", "speech/speech_synthesis"]: logging.info(f"Start running {model['model_name']} test case:\n{json.dumps(model, indent=4)}") d_url = model["download_url"] if d_url is not None: @@ -72,7 +72,7 @@ def get_model_config(mode_name): models = json.load(file) for model in models['models']: - if model["model_name"] == mode_name.lower() and (model["framework"] == "vllm" or model["framework"] == "lmdeploy"): + if model["model_name"] == mode_name.lower() and (model["framework"] == "vllm" or model["framework"] == "lmdeploy" or model["framework"] == "pytorch"): return model return @@ -341,6 +341,12 @@ def run_nlp_testcase(model): cd ../{model['model_path']} python3 offline_inference_vision_language.py --model ./{model_name} --max-model-len 4096 --max-num-seqs 2 --trust-remote-code --temperature 0.0 --disable-mm-preprocessor-cache """ + elif model_name == "cosyvoice": + script = f""" + set -x + cd ../{model['model_path']}/CosyVoice + python3 inference_test.py + """ r, t = run_script(script) sout = r.stdout