From 6f41c623b4191cb6814f2189d16ab4349dac525e Mon Sep 17 00:00:00 2001 From: lyu-xingjia Date: Wed, 15 Jan 2025 19:11:37 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90MindIE-SD=E3=80=91Cogvideo=E4=BD=BF?= =?UTF-8?q?=E8=83=BD=E9=87=87=E6=A0=B7=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipelines/pipeline_cogvideox.py | 30 ++++++++++++++----- .../foundation/CogVideoX-5b/inference.py | 5 ++++ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/CogVideoX-5b/cogvideox_5b/pipelines/pipeline_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/CogVideoX-5b/cogvideox_5b/pipelines/pipeline_cogvideox.py index 761f3049eb..afac04d276 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/CogVideoX-5b/cogvideox_5b/pipelines/pipeline_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/CogVideoX-5b/cogvideox_5b/pipelines/pipeline_cogvideox.py @@ -685,14 +685,25 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin): timestep = t.expand(latent_model_input.shape[0]) # predict noise model_output - noise_pred = self.transformer( - hidden_states=latent_model_input, - encoder_hidden_states=prompt_embeds, - timestep=timestep, - image_rotary_emb=image_rotary_emb, - attention_kwargs=attention_kwargs, - return_dict=False, - )[0] + if hasattr(self, "skip_strategy"): + noise_pred = self.skip_strategy( + self.transformer, + hidden_states=latent_model_input, + encoder_hidden_states=prompt_embeds, + timestep=timestep, + image_rotary_emb=image_rotary_emb, + attention_kwargs=attention_kwargs, + return_dict=False, + )[0] + else: + noise_pred = self.transformer( + hidden_states=latent_model_input, + encoder_hidden_states=prompt_embeds, + timestep=timestep, + image_rotary_emb=image_rotary_emb, + attention_kwargs=attention_kwargs, + return_dict=False, + )[0] noise_pred = noise_pred.float() @@ -720,6 +731,9 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin): ) latents = latents.to(prompt_embeds.dtype) + if hasattr(self, "skip_strategy"): + self.skip_strategy.update_strategy(latents) + # call the callback, if provided if callback_on_step_end is not None: callback_kwargs = {} diff --git a/MindIE/MindIE-Torch/built-in/foundation/CogVideoX-5b/inference.py b/MindIE/MindIE-Torch/built-in/foundation/CogVideoX-5b/inference.py index 6b2d8bd1a9..7e72f78fca 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/CogVideoX-5b/inference.py +++ b/MindIE/MindIE-Torch/built-in/foundation/CogVideoX-5b/inference.py @@ -14,6 +14,7 @@ from typing import List, Optional, Tuple, Union, Literal from cogvideox_5b import CogVideoXPipeline, CogVideoXTransformer3DModel, get_rank, get_world_size, all_gather +from mindiesd.pipeline.sampling_optm import AdaStep def parallelize_transformer(pipe): transformer = pipe.transformer @@ -122,6 +123,10 @@ def generate_video( pipe.vae.enable_slicing() pipe.vae.enable_tiling() pipe.transformer.switch_to_qkvLinear() + # sampling optm + skip_strategy = AdaStep(skip_thr=0.006, max_skip_steps=1, decay_ratio=0.99, device="npu") + pipe.skip_strategy = skip_strategy + if get_world_size() > 1: parallelize_transformer(pipe) -- Gitee