From 8094f320ed884a03f33c7ad555771e0b0936b386 Mon Sep 17 00:00:00 2001 From: zhoufan2956 Date: Mon, 30 Sep 2024 19:13:26 +0800 Subject: [PATCH 01/13] add stable-audio-tools infer way --- .../stable-audio-open-1.0/diffusers/README.md | 10 +- .../precision_brownian_interval.patch | 1 + .../diffusers/prompt.txt | 3 + .../stable-audio-tools/README.md | 148 ++++++++++++++++++ .../stable-audio-tools/conditioners.patch | 24 +++ .../stable-audio-tools/conditioners_patch.py | 14 ++ .../stable-audio-tools/pretrained.patch | 20 +++ .../stable-audio-tools/pretrained_patch.py | 14 ++ .../stable-audio-tools/prompt.txt | 3 + .../stable-audio-tools/requirements.txt | 5 + .../stable_audio_open_tools_pipeline.py | 121 ++++++++++++++ 11 files changed, 358 insertions(+), 5 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/prompt.txt create mode 100644 MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md create mode 100644 MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners.patch create mode 100644 MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners_patch.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/pretrained.patch create mode 100644 MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/pretrained_patch.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/prompt.txt create mode 100644 MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/requirements.txt create mode 100644 MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/stable_audio_open_tools_pipeline.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/README.md b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/README.md index 61a1864ca2..2176c04d80 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/README.md @@ -1,4 +1,4 @@ -# stable-audio-open-1.0模型-推理指导 +# stable-audio-open-1.0模型-diffusers方式推理指导 - [概述](#ZH-CN_TOPIC_0000001172161501) @@ -164,8 +164,8 @@ --audio_end_in_s 10 10 47 \ --num_waveforms_per_prompt 1 \ --guidance_scale 7 \ - --device 0 \ - --save_dir ./result + --save_dir ./result \ + --device 0 ``` 参数说明: @@ -173,11 +173,11 @@ - --output_dir:存放导出模型的目录。 - --prompt_file:提示词文件。 - --num_inference_steps: 语音生成迭代次数。 - - --save_dir:生成语音的存放目录。 - - --device:推理设备ID。 - --audio_end_in_s:生成语音的时长,如不输入则默认生成10s。 - --num_waveforms_per_prompt:一个提示词生成的语音数量。 - --guidance_scale:音频生成质量与准确度系数。 + - --save_dir:生成语音的存放目录。 + - --device:推理设备ID。 执行完成后在`./results`目录下生成推理语音,语音生成顺序与文本中prompt顺序保持一致,并在终端显示推理时间。 diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/precision_brownian_interval.patch b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/precision_brownian_interval.patch index fcaca7605d..d9d94e5801 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/precision_brownian_interval.patch +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/precision_brownian_interval.patch @@ -8,3 +8,4 @@ - return torch.randn(size, dtype=dtype, device=device, generator=generator) + torch.manual_seed(int(seed)) + return torch.randn(size, dtype=dtype, device="cpu").to(device) + diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/prompt.txt b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/prompt.txt new file mode 100644 index 0000000000..e1c7734ef9 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/prompt.txt @@ -0,0 +1,3 @@ +Berlin techno, rave, drum machine, kick, ARP synthesizer, dark, moody, hypnotic, evolving, 135BPM. LOOP. +Uplifting acoustic loop. 120 BPM. +Disco, Driving Drum Machine, Synthesizer, Bass, Piano, Guitars, Instrumental, Clubby, Euphoric, Chicago, New York, 115 BPM. \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md new file mode 100644 index 0000000000..d9bcef1b15 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md @@ -0,0 +1,148 @@ +# stable-audio-open-1.0模型-stable-audio-tools方式推理指导 + +- [概述](#ZH-CN_TOPIC_0000001172161501) + + +- [推理环境准备](#ZH-CN_TOPIC_0000001126281702) + +- [快速上手](#ZH-CN_TOPIC_0000001126281700) + + - [获取源码](#section4622531142816) + - [模型推理](#section741711594517) + +- [模型推理性能&精度](#ZH-CN_TOPIC_0000001172201573) + +# 概述 + + [此处获得](https://huggingface.co/stabilityai/stable-audio-open-1.0) + +- 参考实现: + ```bash + # StableAudioOpen1.0 + https://huggingface.co/stabilityai/stable-audio-open-1.0 + ``` + +# 推理环境准备 + +- 该模型需要以下插件与驱动 + + **表 1** 版本配套表 + + | 配套 | 版本 | 环境准备指导 | + | ----- | ----- |-----| + | Python | 3.10.2 | - | + | torch | 2.1.0 | - | + +该模型性能受CPU规格影响,建议使用64核CPU(arm)以复现性能 + +# 快速上手 +## 获取源码 +1. 安装依赖。 + ```bash + pip3 install -r requirements.txt + ``` + +2. 安装mindie包 + + ```bash + # 安装mindie + source /usr/local/Ascend/ascend-toolkit/set_env.sh + chmod +x ./Ascend-mindie_xxx.run + ./Ascend-mindie_xxx.run --install + source /usr/local/Ascend/mindie/set_env.sh + ``` + +3. 代码修改 + +- 执行命令: + ```bash + python3 conditioners_patch.py + python3 pretrained_patch.py + ``` + +## 模型推理 + +1. 模型准备。 + 1. 获取模型权重 + ```bash + # 需要使用 git-lfs (https://git-lfs.com) + git lfs install + + # 下载stable-audio-open-1.0权重 + git clone https://huggingface.co/stabilityai/stable-audio-open-1.0 + ``` + + 2. 设置模型权重的路径。 + ```bash + # stable-audio-open-1.0 (执行时下载权重) + model_base="stabilityai/stable-audio-open-1.0" + + # stable-audio-open-1.0 (使用上一步下载的权重) + model_base="./stable-audio-open-1.0" + ``` + + 3. 获取T5模型权重(可选) + + 推理过程中会自动从huggingface下载T5-base的模型权重,若希望以加载本地T5-base模型权重方式进行推理,请将`model_base`路径下的`tokenizer`和`text_encoder`文件夹复制到代码执行路径中。 + + +2. 开始推理验证。 + + 1. 开启cpu高性能模式 + ```bash + echo performance |tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor + sysctl -w vm.swappiness=0 + sysctl -w kernel.numa_balancing=0 + ``` + + 2. 安装绑核工具 + ```bash + apt-get update + apt-get install numactl + ``` + 查询卡的NUMA node + ```shell + lspci -vs bus-id + ``` + bus-id可通过npu-smi info获得,查询到NUMA node,在推理命令前加上对应的数字 + + 可通过lscpu获得NUMA node对应的CPU核数 + ```shell + NUMA node0: 0-23 + NUMA node1: 24-47 + NUMA node2: 48-71 + NUMA node3: 72-95 + ``` + 当前查到NUMA node是0,对应0-23,推荐绑定其中单核以获得更好的性能。 + + 3. 执行推理脚本。 + ```bash + numactl -C 0-23 python3 stable_audio_open_tools_pipeline.py \ + --model ${model_base} \ + --prompt_file ./prompts.txt \ + --num_inference_steps 100 \ + --seconds_total 10 10 47 \ + --save_dir ./result \ + --device 0 + ``` + + 参数说明: + - --model:模型权重路径。 + - --prompt_file:提示词文件。 + - --num_inference_steps: 语音生成迭代次数。 + - --seconds_total:生成语音的时长,如不输入则默认生成10s。 + - --save_dir:生成语音的存放目录。 + - --device:推理设备ID。 + + 执行完成后在`./results`目录下生成推理语音,语音生成顺序与文本中prompt顺序保持一致,并在终端显示推理时间。 + + + +# 模型推理性能&精度 +性能参考下列数据。 + +### Stable-Audio-Open-1.0 + +| 硬件形态 | 迭代次数 | 平均耗时| +| :------: |:----:|:----:| +| A2 | 100 | 15.675s | \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners.patch b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners.patch new file mode 100644 index 0000000000..f635d7a20d --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners.patch @@ -0,0 +1,24 @@ +--- conditioners.py 2024-09-30 15:31:32.480360700 +0800 ++++ conditioners_patch.py 2024-09-30 18:20:43.344830200 +0800 +@@ -280,10 +280,17 @@ + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + try: +- # self.tokenizer = T5Tokenizer.from_pretrained(t5_model_name, model_max_length = max_length) +- # model = T5EncoderModel.from_pretrained(t5_model_name, max_length=max_length).train(enable_grad).requires_grad_(enable_grad) +- self.tokenizer = AutoTokenizer.from_pretrained(t5_model_name) +- model = T5EncoderModel.from_pretrained(t5_model_name).train(enable_grad).requires_grad_(enable_grad).to(torch.float16) ++ import os ++ tokenizer_path = os.path.join(os.getcwd() + "tokenizer") ++ text_encoder_path = os.path.join(os.getcwd() + "text_encoder") ++ if os.path.exists(tokenizer_path) and os.path.exists(text_encoder_path): ++ print("From loacl import T5-base . . .") ++ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) ++ model = T5EncoderModel.from_pretrained(text_encoder_path).train(enable_grad).requires_grad_(enable_grad).to(torch.float16) ++ else: ++ print("From HuggingFace download T5-base . . .") ++ self.tokenizer = AutoTokenizer.from_pretrained(t5_model_name) ++ model = T5EncoderModel.from_pretrained(t5_model_name).train(enable_grad).requires_grad_(enable_grad).to(torch.float16) + finally: + logging.disable(previous_level) + diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners_patch.py b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners_patch.py new file mode 100644 index 0000000000..7c20ccaa61 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners_patch.py @@ -0,0 +1,14 @@ +import os +import stable_audio_tools + + +def main(): + stable_audio_tools_path = stable_audio_tools.__path__ + stable_audio_tools_version = stable_audio_tools.__version__ + + assert stable_audio_tools_version is not '0.0.16', "expectation stable_audio_tools_version==0.0.16" + os.system(f'patch -p0 {stable_audio_tools_path[0]}/models/conditioners.py conditioners.patch') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/pretrained.patch b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/pretrained.patch new file mode 100644 index 0000000000..ca200e51c7 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/pretrained.patch @@ -0,0 +1,20 @@ +--- pretrained.py 2024-09-30 15:31:40.672485200 +0800 ++++ pretrained_patch.py 2024-09-30 18:15:59.061846100 +0800 +@@ -1,4 +1,5 @@ + import json ++import os + + from .factory import create_model_from_config + from .utils import load_ckpt_state_dict +@@ -15,10 +16,7 @@ + model = create_model_from_config(model_config) + + # Try to download the model.safetensors file first, if it doesn't exist, download the model.ckpt file +- try: +- model_ckpt_path = hf_hub_download(name, filename="model.safetensors", repo_type='model') +- except Exception as e: +- model_ckpt_path = hf_hub_download(name, filename="model.ckpt", repo_type='model') ++ model_ckpt_path = os.path.join(name, "model.safetensors") + + model.load_state_dict(load_ckpt_state_dict(model_ckpt_path)) + diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/pretrained_patch.py b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/pretrained_patch.py new file mode 100644 index 0000000000..5725b47300 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/pretrained_patch.py @@ -0,0 +1,14 @@ +import os +import stable_audio_tools + + +def main(): + stable_audio_tools_path = stable_audio_tools.__path__ + stable_audio_tools_version = stable_audio_tools.__version__ + + assert stable_audio_tools_version is not '0.0.16', "expectation stable_audio_tools_version==0.0.16" + os.system(f'patch -p0 {stable_audio_tools_path[0]}/models/pretrained.py pretrained.patch') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/prompt.txt b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/prompt.txt new file mode 100644 index 0000000000..e1c7734ef9 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/prompt.txt @@ -0,0 +1,3 @@ +Berlin techno, rave, drum machine, kick, ARP synthesizer, dark, moody, hypnotic, evolving, 135BPM. LOOP. +Uplifting acoustic loop. 120 BPM. +Disco, Driving Drum Machine, Synthesizer, Bass, Piano, Guitars, Instrumental, Clubby, Euphoric, Chicago, New York, 115 BPM. \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/requirements.txt b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/requirements.txt new file mode 100644 index 0000000000..df8e40939a --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/requirements.txt @@ -0,0 +1,5 @@ +torch==2.1.0 +torchaudio==2.1.0 +stable_audio_tools==0.0.16 +transformers==4.40.0 +torch_npu==2.1.0.post6 \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/stable_audio_open_tools_pipeline.py b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/stable_audio_open_tools_pipeline.py new file mode 100644 index 0000000000..8766a6d3cc --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/stable_audio_open_tools_pipeline.py @@ -0,0 +1,121 @@ +import torch +import torch_npu +import sys +import time +import os +import argparse +from safetensors.torch import load_file +import torchaudio +from einops import rearrange +from stable_audio_tools import get_pretrained_model +from stable_audio_tools.inference.generation import generate_diffusion_cond + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--prompt_file", + type=str, + default="./prompts.txt", + help="The prompts file to guide audio generation.", + ) + parser.add_argument( + "--num_inference_steps", + type=int, + default=100, + help="The number of denoising steps. More denoising steps usually lead to a higher quality audio at the expense of slower inference.", + ) + parser.add_argument( + "--model", + type=str, + default="./stable-audio-open-1.0", + help="The path of stable-audio-open-1.0.", + ) + parser.add_argument( + "--seconds_total", + nargs='+', + default=[10], + help="Audio end index in seconds.", + ) + parser.add_argument( + "--device", + type=int, + default=0, + help="NPU device id.", + ) + parser.add_argument( + "--save_dir", + type=str, + default="./results", + help="Path to save result audio files.", + ) + return parser.parse_args() + +def main(): + args = parse_arguments() + save_dir = args.save_dir + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + torch_npu.npu.set_device(args.device) + npu_stream = torch_npu.npu.Stream() + + model, model_config = get_pretrained_model(args.model) + sample_rate = model_config["sample_rate"] + sample_size = model_config["sample_size"] + + model = model.to("npu").to(torch.float16).eval() + + conditioning = [{ + "prompt":"", + "seconds_start": 0, + "seconds_total": 0, + }] + total_time = 0 + prompts_num = 0 + average_time = 0 + skip = 2 + with os.fdopen(os.open(args.prompt_file, os.O_RDONLY), "r") as f: + for i, prompt in enumerate(f): + with torch.no_grad(): + conditioning[0]["prompt"] = prompt + conditioning[0]["seconds_total"] = float(args.audio_end_in_s[i]) if (len(args.audio_end_in_s) > i) else 10.0 + + npu_stream.synchronize() + begin = time.time() + output = generate_diffusion_cond( + model, + steps=args.num_inference_steps, + cfg_scale=7, + conditioning=conditioning, + sample_size=sample_size, + sigma_min=0.3, + sigma_max=500, + sampler_type="dpmpp-3m-3sde", + device="npu" + ) + npu_stream.synchronize() + end = time.time() + if i > skip-1: + total_time += end - begin + prompts_num = i+1 + wavefrom_start = int(conditioning[0]["seconds_start"] * sample_rate) + wavefrom_end = int(conditioning[0]["seconds_total"] * sample_rate) + output = output[:, :, wavefrom_start:wavefrom_end] + output = rearrange(output, "b d n -> d (b n)") + output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1,1).mul(32767).to(torch.int16).cpu() + torchaudio.save(args.save_dir + "/audio_by_prompt" + str(prompts_num) + ".wav", output, sample_rate) + if prompts_num>skip: + average_time = total_time/(prompts_num-skip) + else: + print("Infer average time skip first two prompts, make sure prompts.txt has three more prompts") + print(f"Infer average time: {average_time:.3f}s\n") + +if __name__ == "__main__": + main() + + + + + + + -- Gitee From 2400e0c936534bc231cc52b7e32bd379f699a57f Mon Sep 17 00:00:00 2001 From: zhoufan2956 Date: Mon, 7 Oct 2024 14:59:14 +0800 Subject: [PATCH 02/13] modify patch err --- .../stable-audio-tools/README.md | 3 +-- .../stable-audio-tools/conditioners.patch | 2 +- .../stable-audio-tools/conditioners_patch.py | 2 -- .../stable-audio-tools/pretrained.patch | 11 ++++++++++- .../stable-audio-tools/pretrained_patch.py | 2 -- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md index d9bcef1b15..fde348d621 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md @@ -53,7 +53,6 @@ ``` 3. 代码修改 - - 执行命令: ```bash python3 conditioners_patch.py @@ -83,7 +82,7 @@ 3. 获取T5模型权重(可选) - 推理过程中会自动从huggingface下载T5-base的模型权重,若希望以加载本地T5-base模型权重方式进行推理,请将`model_base`路径下的`tokenizer`和`text_encoder`文件夹复制到代码执行路径中。 + 推理过程中会自动从huggingface下载T5-base的模型权重,若希望以加载本地T5-base模型权重方式进行推理,请将`model_base`路径下的`tokenizer`和`text_encoder`文件夹复制到推理代码的执行路径中。 2. 开始推理验证。 diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners.patch b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners.patch index f635d7a20d..061c0e0e6f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners.patch +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners.patch @@ -12,7 +12,7 @@ + tokenizer_path = os.path.join(os.getcwd() + "tokenizer") + text_encoder_path = os.path.join(os.getcwd() + "text_encoder") + if os.path.exists(tokenizer_path) and os.path.exists(text_encoder_path): -+ print("From loacl import T5-base . . .") ++ print("From local import T5-base . . .") + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + model = T5EncoderModel.from_pretrained(text_encoder_path).train(enable_grad).requires_grad_(enable_grad).to(torch.float16) + else: diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners_patch.py b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners_patch.py index 7c20ccaa61..71db741779 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners_patch.py +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners_patch.py @@ -4,9 +4,7 @@ import stable_audio_tools def main(): stable_audio_tools_path = stable_audio_tools.__path__ - stable_audio_tools_version = stable_audio_tools.__version__ - assert stable_audio_tools_version is not '0.0.16', "expectation stable_audio_tools_version==0.0.16" os.system(f'patch -p0 {stable_audio_tools_path[0]}/models/conditioners.py conditioners.patch') diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/pretrained.patch b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/pretrained.patch index ca200e51c7..f51e6a1d90 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/pretrained.patch +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/pretrained.patch @@ -1,11 +1,20 @@ --- pretrained.py 2024-09-30 15:31:40.672485200 +0800 -+++ pretrained_patch.py 2024-09-30 18:15:59.061846100 +0800 ++++ pretrained_patch.py 2024-10-07 14:54:18.756960100 +0800 @@ -1,4 +1,5 @@ import json +import os from .factory import create_model_from_config from .utils import load_ckpt_state_dict +@@ -7,7 +8,7 @@ + + def get_pretrained_model(name: str): + +- model_config_path = hf_hub_download(name, filename="model_config.json", repo_type='model') ++ model_config_path = os.path.join(name, "model_config.json") + + with open(model_config_path) as f: + model_config = json.load(f) @@ -15,10 +16,7 @@ model = create_model_from_config(model_config) diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/pretrained_patch.py b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/pretrained_patch.py index 5725b47300..4abdad47b0 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/pretrained_patch.py +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/pretrained_patch.py @@ -4,9 +4,7 @@ import stable_audio_tools def main(): stable_audio_tools_path = stable_audio_tools.__path__ - stable_audio_tools_version = stable_audio_tools.__version__ - assert stable_audio_tools_version is not '0.0.16', "expectation stable_audio_tools_version==0.0.16" os.system(f'patch -p0 {stable_audio_tools_path[0]}/models/pretrained.py pretrained.patch') -- Gitee From 7f897e14848bf4584950b39dc3f7a3d00d0b280d Mon Sep 17 00:00:00 2001 From: zhoufan2956 Date: Mon, 7 Oct 2024 15:37:14 +0800 Subject: [PATCH 03/13] modify patch err --- .../stable-audio-tools/conditioners.patch | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners.patch b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners.patch index 061c0e0e6f..c61a74932a 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners.patch +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/conditioners.patch @@ -9,8 +9,8 @@ - self.tokenizer = AutoTokenizer.from_pretrained(t5_model_name) - model = T5EncoderModel.from_pretrained(t5_model_name).train(enable_grad).requires_grad_(enable_grad).to(torch.float16) + import os -+ tokenizer_path = os.path.join(os.getcwd() + "tokenizer") -+ text_encoder_path = os.path.join(os.getcwd() + "text_encoder") ++ tokenizer_path = os.path.join(os.getcwd(), "tokenizer") ++ text_encoder_path = os.path.join(os.getcwd(), "text_encoder") + if os.path.exists(tokenizer_path) and os.path.exists(text_encoder_path): + print("From local import T5-base . . .") + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) -- Gitee From 459e1389d4b767706c5d6b4e844802f9d0500916 Mon Sep 17 00:00:00 2001 From: zhoufan2956 Date: Mon, 7 Oct 2024 15:39:41 +0800 Subject: [PATCH 04/13] modify prompts err --- .../stable-audio-open-1.0/diffusers/{prompt.txt => prompts.txt} | 0 .../stable-audio-tools/{prompt.txt => prompts.txt} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/{prompt.txt => prompts.txt} (100%) rename MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/{prompt.txt => prompts.txt} (100%) diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/prompt.txt b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/prompts.txt similarity index 100% rename from MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/prompt.txt rename to MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/prompts.txt diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/prompt.txt b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/prompts.txt similarity index 100% rename from MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/prompt.txt rename to MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/prompts.txt -- Gitee From 2076ba600ae006c8cf29f237d4dd3946f88fe7bc Mon Sep 17 00:00:00 2001 From: zhoufan2956 Date: Mon, 7 Oct 2024 15:48:55 +0800 Subject: [PATCH 05/13] modify pipeline err --- .../stable_audio_open_tools_pipeline.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/stable_audio_open_tools_pipeline.py b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/stable_audio_open_tools_pipeline.py index 8766a6d3cc..192879312f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/stable_audio_open_tools_pipeline.py +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/stable_audio_open_tools_pipeline.py @@ -78,7 +78,7 @@ def main(): for i, prompt in enumerate(f): with torch.no_grad(): conditioning[0]["prompt"] = prompt - conditioning[0]["seconds_total"] = float(args.audio_end_in_s[i]) if (len(args.audio_end_in_s) > i) else 10.0 + conditioning[0]["seconds_total"] = float(args.seconds_total[i]) if (len(args.seconds_total) > i) else 10.0 npu_stream.synchronize() begin = time.time() @@ -90,7 +90,7 @@ def main(): sample_size=sample_size, sigma_min=0.3, sigma_max=500, - sampler_type="dpmpp-3m-3sde", + sampler_type="dpmpp-3m-sde", device="npu" ) npu_stream.synchronize() @@ -98,14 +98,14 @@ def main(): if i > skip-1: total_time += end - begin prompts_num = i+1 - wavefrom_start = int(conditioning[0]["seconds_start"] * sample_rate) - wavefrom_end = int(conditioning[0]["seconds_total"] * sample_rate) - output = output[:, :, wavefrom_start:wavefrom_end] + waveform_start = int(conditioning[0]["seconds_start"] * sample_rate) + waveform_end = int(conditioning[0]["seconds_total"] * sample_rate) + output = output[:, :, waveform_start:waveform_end] output = rearrange(output, "b d n -> d (b n)") output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1,1).mul(32767).to(torch.int16).cpu() torchaudio.save(args.save_dir + "/audio_by_prompt" + str(prompts_num) + ".wav", output, sample_rate) - if prompts_num>skip: - average_time = total_time/(prompts_num-skip) + if prompts_num > skip: + average_time = total_time / (prompts_num-skip) else: print("Infer average time skip first two prompts, make sure prompts.txt has three more prompts") print(f"Infer average time: {average_time:.3f}s\n") -- Gitee From 9c9924235f00524b51ce0244f1495388d51366a4 Mon Sep 17 00:00:00 2001 From: zhoufan2956 Date: Mon, 7 Oct 2024 16:18:10 +0800 Subject: [PATCH 06/13] modify readme err --- .../foundation/stable-audio-open-1.0/diffusers/README.md | 2 +- .../stable-audio-open-1.0/stable-audio-tools/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/README.md b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/README.md index 2176c04d80..2ddb2adbf7 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/diffusers/README.md @@ -164,7 +164,7 @@ --audio_end_in_s 10 10 47 \ --num_waveforms_per_prompt 1 \ --guidance_scale 7 \ - --save_dir ./result \ + --save_dir ./results \ --device 0 ``` diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md index fde348d621..df5f7f7840 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md @@ -121,7 +121,7 @@ --prompt_file ./prompts.txt \ --num_inference_steps 100 \ --seconds_total 10 10 47 \ - --save_dir ./result \ + --save_dir ./results \ --device 0 ``` -- Gitee From bc21edc6ea721a4a9a522e857301b3f3ba2c871d Mon Sep 17 00:00:00 2001 From: zhoufan2956 Date: Mon, 7 Oct 2024 16:23:23 +0800 Subject: [PATCH 07/13] modify readme infer time --- .../stable-audio-open-1.0/stable-audio-tools/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md index df5f7f7840..cc3e289a79 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md @@ -144,4 +144,4 @@ | 硬件形态 | 迭代次数 | 平均耗时| | :------: |:----:|:----:| -| A2 | 100 | 15.675s | \ No newline at end of file +| A2 | 100 | 14.711s | \ No newline at end of file -- Gitee From 9f9e7e2a539c0069a0d68f5ecfe178c8543d6e4b Mon Sep 17 00:00:00 2001 From: zhoufan2956 Date: Mon, 7 Oct 2024 16:26:47 +0800 Subject: [PATCH 08/13] modify readme download model-base --- .../stable-audio-open-1.0/stable-audio-tools/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md index cc3e289a79..cb2816b2ff 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md @@ -63,6 +63,7 @@ 1. 模型准备。 1. 获取模型权重 + 可提前下载权重,以避免执行后面步骤时可能会出现下载失败 ```bash # 需要使用 git-lfs (https://git-lfs.com) git lfs install -- Gitee From 0f8a344758c6fbf5133b656b8bd47bf2cd686c97 Mon Sep 17 00:00:00 2001 From: zhoufan2956 Date: Mon, 7 Oct 2024 16:28:41 +0800 Subject: [PATCH 09/13] modify readme download model-base --- .../stable-audio-open-1.0/stable-audio-tools/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md index cb2816b2ff..0d455aa3a6 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md @@ -62,8 +62,10 @@ ## 模型推理 1. 模型准备。 - 1. 获取模型权重 + 1. 获取模型权重 + 可提前下载权重,以避免执行后面步骤时可能会出现下载失败 + ```bash # 需要使用 git-lfs (https://git-lfs.com) git lfs install -- Gitee From 663aaff652409ef4bf7b57b7389b547a00520305 Mon Sep 17 00:00:00 2001 From: zhoufan2956 Date: Mon, 7 Oct 2024 16:35:24 +0800 Subject: [PATCH 10/13] modify readme download model-base --- .../stable-audio-open-1.0/stable-audio-tools/README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md index 0d455aa3a6..3570f226ce 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md @@ -62,10 +62,9 @@ ## 模型推理 1. 模型准备。 - 1. 获取模型权重 - + 1. 获取模型权重 可提前下载权重,以避免执行后面步骤时可能会出现下载失败 - + ```bash # 需要使用 git-lfs (https://git-lfs.com) git lfs install @@ -74,7 +73,7 @@ git clone https://huggingface.co/stabilityai/stable-audio-open-1.0 ``` - 2. 设置模型权重的路径。 + 2. 设置模型权重的路径。 ```bash # stable-audio-open-1.0 (执行时下载权重) model_base="stabilityai/stable-audio-open-1.0" @@ -83,7 +82,7 @@ model_base="./stable-audio-open-1.0" ``` - 3. 获取T5模型权重(可选) + 3. 获取T5模型权重(可选) 推理过程中会自动从huggingface下载T5-base的模型权重,若希望以加载本地T5-base模型权重方式进行推理,请将`model_base`路径下的`tokenizer`和`text_encoder`文件夹复制到推理代码的执行路径中。 -- Gitee From 2c8d70808b9b58a2323f83140da440565eab9a28 Mon Sep 17 00:00:00 2001 From: zhoufan2956 Date: Mon, 7 Oct 2024 16:37:49 +0800 Subject: [PATCH 11/13] modify readme download model-base --- .../stable-audio-open-1.0/stable-audio-tools/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md index 3570f226ce..e98f35145c 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md @@ -63,6 +63,7 @@ 1. 模型准备。 1. 获取模型权重 + 可提前下载权重,以避免执行后面步骤时可能会出现下载失败 ```bash -- Gitee From ede1b9d0fc8df9e4dc12610d310d221a12f8c746 Mon Sep 17 00:00:00 2001 From: zhoufan2956 Date: Mon, 7 Oct 2024 16:40:06 +0800 Subject: [PATCH 12/13] modify readme download model-base --- .../stable-audio-open-1.0/stable-audio-tools/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md index e98f35145c..b0cc51741f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md @@ -53,7 +53,7 @@ ``` 3. 代码修改 -- 执行命令: + 执行命令: ```bash python3 conditioners_patch.py python3 pretrained_patch.py @@ -63,7 +63,7 @@ 1. 模型准备。 1. 获取模型权重 - + 可提前下载权重,以避免执行后面步骤时可能会出现下载失败 ```bash @@ -132,7 +132,7 @@ - --model:模型权重路径。 - --prompt_file:提示词文件。 - --num_inference_steps: 语音生成迭代次数。 - - --seconds_total:生成语音的时长,如不输入则默认生成10s。 + - --seconds_total:生成语音的时长,与prompts.txt中的prompt一一对应,如不输入则默认生成10s。 - --save_dir:生成语音的存放目录。 - --device:推理设备ID。 -- Gitee From c47fe263b2989f7f59f65235b4e7f3472eca4f4a Mon Sep 17 00:00:00 2001 From: zhoufan2956 Date: Mon, 7 Oct 2024 16:43:51 +0800 Subject: [PATCH 13/13] modify readme download model-base --- .../stable-audio-open-1.0/stable-audio-tools/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md index b0cc51741f..99af1a0d3c 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/stable-audio-open-1.0/stable-audio-tools/README.md @@ -53,6 +53,7 @@ ``` 3. 代码修改 + 执行命令: ```bash python3 conditioners_patch.py @@ -64,7 +65,7 @@ 1. 模型准备。 1. 获取模型权重 - 可提前下载权重,以避免执行后面步骤时可能会出现下载失败 + 可提前下载权重,以避免执行后面步骤时可能会出现下载失败。 ```bash # 需要使用 git-lfs (https://git-lfs.com) @@ -132,7 +133,7 @@ - --model:模型权重路径。 - --prompt_file:提示词文件。 - --num_inference_steps: 语音生成迭代次数。 - - --seconds_total:生成语音的时长,与prompts.txt中的prompt一一对应,如不输入则默认生成10s。 + - --seconds_total:生成语音的时长,与prompts.txt中的prompt对应,如不输入则默认生成10s。 - --save_dir:生成语音的存放目录。 - --device:推理设备ID。 -- Gitee