From 1d9c066a63d7c6775701b136a09bb44e3e23ef56 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan"
Date: Fri, 5 Dec 2025 16:05:59 +0800
Subject: [PATCH] add sd-3 inference

---
 README.md                                    | 30 ++++----
 README_en.md                                 | 31 ++++----
 .../diffusers/README.md                      |  0
 .../diffusers/ci/prepare.sh                  |  0
 .../diffusers/demo.py                        |  0
 .../diffusers/requirements.txt               |  0
 .../stable-diffusion-3/diffusers/README.md   | 35 ++++++++++
 .../stable-diffusion-3/diffusers/demo_sd3.py | 70 +++++++++++++++++++
 tests/model_info.json                        | 39 ++++++++++-
 9 files changed, 173 insertions(+), 32 deletions(-)
 rename models/multimodal/diffusion_model/{stable-diffusion => stable-diffusion-1.5}/diffusers/README.md (100%)
 rename models/multimodal/diffusion_model/{stable-diffusion => stable-diffusion-1.5}/diffusers/ci/prepare.sh (100%)
 rename models/multimodal/diffusion_model/{stable-diffusion => stable-diffusion-1.5}/diffusers/demo.py (100%)
 rename models/multimodal/diffusion_model/{stable-diffusion => stable-diffusion-1.5}/diffusers/requirements.txt (100%)
 create mode 100644 models/multimodal/diffusion_model/stable-diffusion-3/diffusers/README.md
 create mode 100644 models/multimodal/diffusion_model/stable-diffusion-3/diffusers/demo_sd3.py

diff --git a/README.md b/README.md
index 3c24aabe..cdd42ca9 100644
--- a/README.md
+++ b/README.md
@@ -279,20 +279,22 @@

 ### 多模态

-| Model | vLLM | IxFormer | IXUCA SDK |
-|---------------------|-----------------------------------------------------------------------|------------------------------------------------------------|-----------|
-| Aria | [✅](models/multimodal/vision_language_model/aria/vllm) | | 4.3.0 |
-| Chameleon-7B | [✅](models/multimodal/vision_language_model/chameleon_7b/vllm) | | 4.3.0 |
-| CLIP | | [✅](models/multimodal/vision_language_model/clip/ixformer) | 4.3.0 |
-| Fuyu-8B | [✅](models/multimodal/vision_language_model/fuyu_8b/vllm) | | 4.3.0 |
-| H2OVL Mississippi | [✅](models/multimodal/vision_language_model/h2vol/vllm) | | 4.3.0 |
-| Idefics3 | [✅](models/multimodal/vision_language_model/idefics3/vllm) | | 4.3.0 |
-| InternVL2-4B | [✅](models/multimodal/vision_language_model/intern_vl/vllm) | | 4.3.0 |
-| LLaVA | [✅](models/multimodal/vision_language_model/llava/vllm) | | 4.3.0 |
-| LLaVA-Next-Video-7B | [✅](models/multimodal/vision_language_model/llava_next_video_7b/vllm) | | 4.3.0 |
-| Llama-3.2 | [✅](models/multimodal/vision_language_model/llama-3.2/vllm) | | 4.3.0 |
-| MiniCPM-V 2 | [✅](models/multimodal/vision_language_model/minicpm_v/vllm) | | 4.3.0 |
-| Pixtral | [✅](models/multimodal/vision_language_model/pixtral/vllm) | | 4.3.0 |
+| Model | Engine | Supported | IXUCA SDK |
+|---------------------|----------|------------------------------------------------------------------------|-----------|
+| Aria | vLLM | [✅](models/multimodal/vision_language_model/aria/vllm) | 4.3.0 |
+| Chameleon-7B | vLLM | [✅](models/multimodal/vision_language_model/chameleon_7b/vllm) | 4.3.0 |
+| CLIP | IxFormer | [✅](models/multimodal/vision_language_model/clip/ixformer) | 4.3.0 |
+| Fuyu-8B | vLLM | [✅](models/multimodal/vision_language_model/fuyu_8b/vllm) | 4.3.0 |
+| H2OVL Mississippi | vLLM | [✅](models/multimodal/vision_language_model/h2vol/vllm) | 4.3.0 |
+| Idefics3 | vLLM | [✅](models/multimodal/vision_language_model/idefics3/vllm) | 4.3.0 |
+| InternVL2-4B | vLLM | [✅](models/multimodal/vision_language_model/intern_vl/vllm) | 4.3.0 |
+| LLaVA | vLLM | [✅](models/multimodal/vision_language_model/llava/vllm) | 4.3.0 |
+| LLaVA-Next-Video-7B | vLLM | [✅](models/multimodal/vision_language_model/llava_next_video_7b/vllm) | 4.3.0 |
+| Llama-3.2 | vLLM | [✅](models/multimodal/vision_language_model/llama-3.2/vllm) | 4.3.0 |
+| MiniCPM-V 2 | vLLM | [✅](models/multimodal/vision_language_model/minicpm_v/vllm) | 4.3.0 |
+| Pixtral | vLLM | [✅](models/multimodal/vision_language_model/pixtral/vllm) | 4.3.0 |
+| Stable Diffusion 1.5 | Diffusers | [✅](models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers) | 4.3.0 |
+| Stable Diffusion 3 | Diffusers | [✅](models/multimodal/diffusion_model/stable-diffusion-3/diffusers) | 4.3.0 |

 ### 自然语言处理(NLP)

diff --git a/README_en.md b/README_en.md
index 55f0eaf0..c3f238bd 100644
--- a/README_en.md
+++ b/README_en.md
@@ -289,21 +289,22 @@
 inference to be expanded in the future.

 ### Multimodal

-| Model | vLLM | IxFormer | IXUCA SDK |
-|---------------------|-----------------------------------------------------------------------|------------------------------------------------------------|-----------|
-| Aria | [✅](models/multimodal/vision_language_model/aria/vllm) | | 4.3.0 |
-| Chameleon-7B | [✅](models/multimodal/vision_language_model/chameleon_7b/vllm) | | 4.3.0 |
-| CLIP | | [✅](models/multimodal/vision_language_model/clip/ixformer) | 4.3.0 |
-| Fuyu-8B | [✅](models/multimodal/vision_language_model/fuyu_8b/vllm) | | 4.3.0 |
-| H2OVL Mississippi | [✅](models/multimodal/vision_language_model/h2vol/vllm) | | 4.3.0 |
-| Idefics3 | [✅](models/multimodal/vision_language_model/idefics3/vllm) | | 4.3.0 |
-| InternVL2-4B | [✅](models/multimodal/vision_language_model/intern_vl/vllm) | | 4.3.0 |
-| LLaVA | [✅](models/multimodal/vision_language_model/llava/vllm) | | 4.3.0 |
-| LLaVA-Next-Video-7B | [✅](models/multimodal/vision_language_model/llava_next_video_7b/vllm) | | 4.3.0 |
-| Llama-3.2 | [✅](models/multimodal/vision_language_model/llama-3.2/vllm) | | 4.3.0 |
-| MiniCPM-V 2 | [✅](models/multimodal/vision_language_model/minicpm_v/vllm) | | 4.3.0 |
-| Pixtral | [✅](models/multimodal/vision_language_model/pixtral/vllm) | | 4.3.0 |
-
+| Model | Engine | Supported | IXUCA SDK |
+|---------------------|----------|------------------------------------------------------------------------|-----------|
+| Aria | vLLM | [✅](models/multimodal/vision_language_model/aria/vllm) | 4.3.0 |
+| Chameleon-7B | vLLM | [✅](models/multimodal/vision_language_model/chameleon_7b/vllm) | 4.3.0 |
+| CLIP | IxFormer | [✅](models/multimodal/vision_language_model/clip/ixformer) | 4.3.0 |
+| Fuyu-8B | vLLM | [✅](models/multimodal/vision_language_model/fuyu_8b/vllm) | 4.3.0 |
+| H2OVL Mississippi | vLLM | [✅](models/multimodal/vision_language_model/h2vol/vllm) | 4.3.0 |
+| Idefics3 | vLLM | [✅](models/multimodal/vision_language_model/idefics3/vllm) | 4.3.0 |
+| InternVL2-4B | vLLM | [✅](models/multimodal/vision_language_model/intern_vl/vllm) | 4.3.0 |
+| LLaVA | vLLM | [✅](models/multimodal/vision_language_model/llava/vllm) | 4.3.0 |
+| LLaVA-Next-Video-7B | vLLM | [✅](models/multimodal/vision_language_model/llava_next_video_7b/vllm) | 4.3.0 |
+| Llama-3.2 | vLLM | [✅](models/multimodal/vision_language_model/llama-3.2/vllm) | 4.3.0 |
+| MiniCPM-V 2 | vLLM | [✅](models/multimodal/vision_language_model/minicpm_v/vllm) | 4.3.0 |
+| Pixtral | vLLM | [✅](models/multimodal/vision_language_model/pixtral/vllm) | 4.3.0 |
+| Stable Diffusion 1.5 | Diffusers | [✅](models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers) | 4.3.0 |
+| Stable Diffusion 3 | Diffusers | [✅](models/multimodal/diffusion_model/stable-diffusion-3/diffusers) | 4.3.0 |
 ### NLP

 #### PLM (Pre-trained Language Model)

diff --git a/models/multimodal/diffusion_model/stable-diffusion/diffusers/README.md b/models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers/README.md
similarity index 100%
rename from models/multimodal/diffusion_model/stable-diffusion/diffusers/README.md
rename to models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers/README.md
diff --git a/models/multimodal/diffusion_model/stable-diffusion/diffusers/ci/prepare.sh b/models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers/ci/prepare.sh
similarity index 100%
rename from models/multimodal/diffusion_model/stable-diffusion/diffusers/ci/prepare.sh
rename to models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers/ci/prepare.sh
diff --git a/models/multimodal/diffusion_model/stable-diffusion/diffusers/demo.py b/models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers/demo.py
similarity index 100%
rename from models/multimodal/diffusion_model/stable-diffusion/diffusers/demo.py
rename to models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers/demo.py
diff --git a/models/multimodal/diffusion_model/stable-diffusion/diffusers/requirements.txt b/models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers/requirements.txt
similarity index 100%
rename from models/multimodal/diffusion_model/stable-diffusion/diffusers/requirements.txt
rename to models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers/requirements.txt
diff --git a/models/multimodal/diffusion_model/stable-diffusion-3/diffusers/README.md b/models/multimodal/diffusion_model/stable-diffusion-3/diffusers/README.md
new file mode 100644
index 00000000..9fc5a40a
--- /dev/null
+++ b/models/multimodal/diffusion_model/stable-diffusion-3/diffusers/README.md
@@ -0,0 +1,35 @@
+# Stable Diffusion 3
+
+## Model description
+
+Stable Diffusion 3 Medium is a Multimodal Diffusion Transformer (MMDiT) text-to-image model that generates photo-realistic images from text prompts, with improved typography and complex-prompt understanding compared with earlier Stable Diffusion releases.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.3.0 | 25.12 |
+
+## Model Preparation
+
+### Prepare Resources
+
+Download the stable-diffusion-3-medium-diffusers weights from the [Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers).
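+
+One way to fetch the weights is with `huggingface_hub`; a minimal sketch, assuming the package is installed and you are logged in with a token that has accepted the gated model license:
+
+```python
+from huggingface_hub import snapshot_download
+
+# Downloads into the local Hugging Face cache, where the
+# StableDiffusion3Pipeline.from_pretrained() call in demo_sd3.py will find it.
+snapshot_download("stabilityai/stable-diffusion-3-medium-diffusers")
+```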
+
+### Install Dependencies
+
+```bash
+pip3 install transformers==4.39.3 accelerate==0.29.0 scipy safetensors
+pip3 uninstall apex
+pip3 install http://files.deepspark.org.cn:880/deepspark/add-ons/diffusers-0.31.0+corex.4.3.0-py3-none-any.whl
+```
+
+## Model Inference
+
+```bash
+python3 demo_sd3.py
+```
+
+## References
+
+- [diffusers](https://github.com/huggingface/diffusers)
diff --git a/models/multimodal/diffusion_model/stable-diffusion-3/diffusers/demo_sd3.py b/models/multimodal/diffusion_model/stable-diffusion-3/diffusers/demo_sd3.py
new file mode 100644
index 00000000..c4c33c11
--- /dev/null
+++ b/models/multimodal/diffusion_model/stable-diffusion-3/diffusers/demo_sd3.py
@@ -0,0 +1,70 @@
+import os
+import random
+import time
+
+import numpy as np
+import torch
+from diffusers import StableDiffusion3Pipeline
+
+dtype = torch.float16
+
+# Enable IxFormer inference kernels and the NHWC group-norm path.
+os.environ["ENABLE_IXFORMER_INFERENCE"] = "1"
+os.environ["USE_NHWC_GN"] = "1"
+
+
+def ixformer_accelerate(pipe):
+    # Fuse the attention QKV projections in the MMDiT transformer.
+    pipe.transformer.fuse_qkv_projections()
+    if int(os.environ.get("USE_NHWC_GN", 0)):
+        # Channels-last layout so the VAE uses the NHWC group norm.
+        pipe.vae.to(memory_format=torch.channels_last)
+    pipe.text_encoder = torch.compile(pipe.text_encoder)
+    pipe.text_encoder_2 = torch.compile(pipe.text_encoder_2)
+    pipe.text_encoder_3 = torch.compile(pipe.text_encoder_3)
+
+
+def setup_seed(seed):
+    # Seed all RNGs involved so runs are reproducible.
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.backends.cudnn.deterministic = True
+
+
+pipe = StableDiffusion3Pipeline.from_pretrained(
+    "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=dtype
+)
+pipe = pipe.to("cuda")
+ixformer_accelerate(pipe)
+
+prompt = "A cat holding a sign that says hello world"
+resolutions = [512, 1024]
+num_inference_steps = 20
+guidance_scale = 7.0
+
+for item in resolutions:
+    # Set the random seed.
+    setup_seed(20)
+    width = height = item
+
+    # Warm up and save one sample image.
+    image = pipe(
+        prompt,
+        negative_prompt="",
+        num_inference_steps=num_inference_steps,
+        guidance_scale=guidance_scale,
+        width=width,
+        height=height,
+    ).images[0]
+    image.save(f"sd3_{item}x{item}.png")
+
+    # Performance: average latency over a few timed runs.
+    num_iters = 2
+    torch.cuda.synchronize()
+    start_time = time.time()
+    torch.cuda.profiler.start()
+    for _ in range(num_iters):
+        pipe(
+            prompt,
+            negative_prompt="",
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            width=width,
+            height=height,
+        )
+    torch.cuda.profiler.stop()
+    torch.cuda.synchronize()
+    use_time = time.time() - start_time
+    print(
+        f"resolution: {item}x{item}, num_inference_steps: {num_inference_steps}, "
+        f"guidance_scale: {guidance_scale}, time: {use_time / num_iters:.2f} seconds"
+    )
diff --git a/tests/model_info.json b/tests/model_info.json
index d34cb772..6b4dfc3d 100644
--- a/tests/model_info.json
+++ b/tests/model_info.json
@@ -4836,7 +4836,7 @@
         },
         {
             "display_name": "Stable Diffusion 1.5",
-            "model_name": "stable-diffusion",
+            "model_name": "stable-diffusion-1.5",
             "framework": "diffusers",
             "release_version": "25.03",
             "release_sdk": "CoreX 4.2.0",
@@ -4848,8 +4848,8 @@
             "mdims": "",
             "dataset": "",
             "license": "",
-            "model_path": "models/multimodal/diffusion_model/stable-diffusion/diffusers",
-            "readme_file": "models/multimodal/diffusion_model/stable-diffusion/diffusers/README.md",
+            "model_path": "models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers",
+            "readme_file": "models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers/README.md",
             "bitbucket_repo": "",
             "bitbucket_branch": "",
             "bitbucket_path": "",
@@ -8733,6 +8733,39 @@
             "type": "inference",
             "hasDemo": false,
             "demoType": ""
+        },
+        {
+            "display_name": "Stable Diffusion 3",
+            "model_name": "stable-diffusion-3",
+            "framework": "diffusers",
+            "release_version": "25.12",
+            "release_sdk": "CoreX 4.3.0",
+            "release_gpgpu": "MR-V100",
+            "latest_sdk": "4.3.0",
+            "latest_gpgpu": "",
+            "category": "multimodal/diffusion_model",
+            "toolbox": "",
+            "mdims": "",
+            "dataset": "",
+            "license": "",
+            "model_path": "models/multimodal/diffusion_model/stable-diffusion-3/diffusers",
+            "readme_file": "models/multimodal/diffusion_model/stable-diffusion-3/diffusers/README.md",
+            "bitbucket_repo": "",
+            "bitbucket_branch": "",
+            "bitbucket_path": "",
+            "develop_owner": "",
+            "github_repo": "",
+            "github_branch": "",
+            "github_path": "",
+            "datasets": "",
+            "download_url": "",
+            "need_third_part": "",
+            "precisions": [
+                "fp16"
+            ],
+            "type": "inference",
+            "hasDemo": false,
+            "demoType": ""
         }
     ]
 }
\ No newline at end of file
-- 
Gitee