From 6f41c623b4191cb6814f2189d16ab4349dac525e Mon Sep 17 00:00:00 2001
From: lyu-xingjia <lyuxingjia@huawei.com>
Date: Wed, 15 Jan 2025 19:11:37 +0800
Subject: [PATCH] =?UTF-8?q?=E3=80=90MindIE-SD=E3=80=91Cogvideo=E4=BD=BF?=
 =?UTF-8?q?=E8=83=BD=E9=87=87=E6=A0=B7=E4=BC=98=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../pipelines/pipeline_cogvideox.py           | 30 ++++++++++++++-----
 .../foundation/CogVideoX-5b/inference.py      |  5 ++++
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/MindIE/MindIE-Torch/built-in/foundation/CogVideoX-5b/cogvideox_5b/pipelines/pipeline_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/CogVideoX-5b/cogvideox_5b/pipelines/pipeline_cogvideox.py
index 761f3049eb..afac04d276 100644
--- a/MindIE/MindIE-Torch/built-in/foundation/CogVideoX-5b/cogvideox_5b/pipelines/pipeline_cogvideox.py
+++ b/MindIE/MindIE-Torch/built-in/foundation/CogVideoX-5b/cogvideox_5b/pipelines/pipeline_cogvideox.py
@@ -685,14 +685,25 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
                 timestep = t.expand(latent_model_input.shape[0])
 
                 # predict noise model_output
-                noise_pred = self.transformer(
-                    hidden_states=latent_model_input,
-                    encoder_hidden_states=prompt_embeds,
-                    timestep=timestep,
-                    image_rotary_emb=image_rotary_emb,
-                    attention_kwargs=attention_kwargs,
-                    return_dict=False,
-                )[0]
+                if hasattr(self, "skip_strategy"):
+                    noise_pred = self.skip_strategy(
+                        self.transformer,
+                        hidden_states=latent_model_input,
+                        encoder_hidden_states=prompt_embeds,
+                        timestep=timestep,
+                        image_rotary_emb=image_rotary_emb,
+                        attention_kwargs=attention_kwargs,
+                        return_dict=False,
+                    )[0]
+                else:
+                    noise_pred = self.transformer(
+                        hidden_states=latent_model_input,
+                        encoder_hidden_states=prompt_embeds,
+                        timestep=timestep,
+                        image_rotary_emb=image_rotary_emb,
+                        attention_kwargs=attention_kwargs,
+                        return_dict=False,
+                    )[0]
 
                 noise_pred = noise_pred.float()
 
@@ -720,6 +731,9 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
                     )
                 latents = latents.to(prompt_embeds.dtype)
 
+                if hasattr(self, "skip_strategy"):
+                    self.skip_strategy.update_strategy(latents)
+
                 # call the callback, if provided
                 if callback_on_step_end is not None:
                     callback_kwargs = {}
diff --git a/MindIE/MindIE-Torch/built-in/foundation/CogVideoX-5b/inference.py b/MindIE/MindIE-Torch/built-in/foundation/CogVideoX-5b/inference.py
index 6b2d8bd1a9..7e72f78fca 100644
--- a/MindIE/MindIE-Torch/built-in/foundation/CogVideoX-5b/inference.py
+++ b/MindIE/MindIE-Torch/built-in/foundation/CogVideoX-5b/inference.py
@@ -14,6 +14,7 @@ from typing import List, Optional, Tuple, Union, Literal
 
 from cogvideox_5b import CogVideoXPipeline, CogVideoXTransformer3DModel, get_rank, get_world_size, all_gather
 
+from mindiesd.pipeline.sampling_optm import AdaStep
 
 def parallelize_transformer(pipe):
     transformer = pipe.transformer
@@ -122,6 +123,10 @@ def generate_video(
     pipe.vae.enable_slicing()
     pipe.vae.enable_tiling()
     pipe.transformer.switch_to_qkvLinear()
+    # Sampling optimization: the pipeline routes transformer calls through this AdaStep skip strategy.
+    skip_strategy = AdaStep(skip_thr=0.006, max_skip_steps=1, decay_ratio=0.99, device="npu")
+    pipe.skip_strategy = skip_strategy
+
     if get_world_size() > 1:
         parallelize_transformer(pipe)
 
-- 
Gitee